Files
smart-speaker/cleaner.py
2026-01-07 17:31:22 +03:00

267 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Response cleaner module.
Removes markdown formatting and special characters from AI responses.
Handles complex number-to-text conversion for Russian language.
"""
import re
import pymorphy3
from num2words import num2words
# Module-level morphological analyzer, constructed once when the module is
# imported; numbers_to_words() uses it to read the grammatical case of a word.
morph = pymorphy3.MorphAnalyzer()
# Preposition -> pymorphy case tag applied to the number that follows it.
# These are simplified heuristics: several prepositions can govern more than
# one case, and only a single default is recorded for each. Groups are listed
# in the order the resulting dict should iterate.
_PREPOSITION_GROUPS = (
    # 'в году' -> loct; the accusative reading is not modelled.
    ('loct', ('в', 'во')),
    # Dates: 'на 5 мая' -> accusative ('на пятое').
    ('accs', ('на',)),
    ('loct', ('о', 'об', 'обо', 'при')),
    # 'с' / 'со' may also govern the instrumental (ablt).
    ('gent', ('у', 'от', 'до', 'из', 'с', 'со', 'без', 'для', 'вокруг', 'после')),
    # 'по' may take accs for date limits; datv is the usual default.
    ('datv', ('к', 'ко', 'по')),
    # 'за' may also govern the accusative.
    ('ablt', ('над', 'под', 'перед', 'за', 'между')),
)
PREPOSITION_CASES = {
    preposition: case_tag
    for case_tag, prepositions in _PREPOSITION_GROUPS
    for preposition in prepositions
}
# Translation table: pymorphy case tag -> case name passed to num2words.
# The secondary tags (gen2, acc2, loc2) map to the same name as their
# primary counterpart.
PYMORPHY_TO_NUM2WORDS = dict(
    nomn='nominative',
    gent='genitive',
    datv='dative',
    accs='accusative',
    ablt='instrumental',
    loct='prepositional',
    voct='nominative',  # fallback
    gen2='genitive',
    acc2='accusative',
    loc2='prepositional',
)
# Genitive month names, January through December - the form months take
# when they follow a day number in a date.
MONTHS_GENITIVE = (
    'января февраля марта апреля мая июня '
    'июля августа сентября октября ноября декабря'
).split()
def get_case_from_preposition(prep_token):
    """Look up the pymorphy case tag associated with a preposition.

    The token is lower-cased before the lookup in PREPOSITION_CASES.

    Returns:
        The case tag, or None when the token is empty/None or is not a key
        of PREPOSITION_CASES.
    """
    return PREPOSITION_CASES.get(prep_token.lower()) if prep_token else None
def convert_number(number_str, context_type='cardinal', case='nominative', gender='m'):
    """Render a numeric string as Russian words via num2words.

    Args:
        number_str: Digits, optionally with a '.' or ',' decimal separator.
        context_type: Forwarded to num2words as ``to``.
        case: Forwarded to num2words as ``case``.
        gender: Forwarded to num2words as ``gender``.

    Returns:
        The words produced by num2words. If parsing or conversion raises,
        the error is printed and ``number_str`` is returned unchanged, so a
        failed conversion never breaks the surrounding text.
    """
    try:
        # Either decimal separator means the value is parsed as a float;
        # a comma is normalised to a dot first.
        is_decimal = ',' in number_str or '.' in number_str
        value = float(number_str.replace(',', '.')) if is_decimal else int(number_str)
        return num2words(value, lang='ru', to=context_type, case=case, gender=gender)
    except Exception as e:
        print(f"Error converting number {number_str}: {e}")
        return number_str
# Case of the day ordinal after each preposition the date pattern accepts.
# A date without a preposition ("25 июня") uses the genitive.
_DATE_PREPOSITION_CASES = {
    'на': 'accusative',  # на 5 мая -> на пятое
    'по': 'accusative',  # по 5 мая -> по пятое (limit)
    'к': 'dative',       # к 5 мая -> к пятому
    'с': 'genitive',     # с 5 мая -> с пятого
    'до': 'genitive',
    'от': 'genitive',
}


def _replace_year_match(match):
    """Regex callback: spell a year as a masculine ordinal agreeing with the 'год' form."""
    # Group 1 (the optional preposition) already carries its trailing
    # whitespace, so it is emitted as-is; appending another space would
    # produce a double space in the output.
    prep = match.group(1) or ""
    year_str = match.group(2)
    year_word = match.group(3)  # год, году, года...
    # The ordinal takes the case of the word that follows it, read from the
    # first parse pymorphy returns for that word.
    case_tag = morph.parse(year_word)[0].tag.case
    nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, 'nominative')
    words = convert_number(year_str, context_type='ordinal', case=nw_case, gender='m')
    return f"{prep}{words} {year_word}"


def _replace_date_match(match):
    """Regex callback: spell the day of a 'day month' date as a neuter ordinal."""
    prep = match.group(1) or ""  # includes its trailing whitespace
    day_str = match.group(2)
    month_word = match.group(3)
    # Default to genitive ("25 июня" -> "двадцать пятого июня").
    case = _DATE_PREPOSITION_CASES.get(prep.strip().lower(), 'genitive')
    # Neuter, because the implied noun is "число" (пятое, пятого, пятому).
    words = convert_number(day_str, context_type='ordinal', case=case, gender='n')
    return f"{prep}{words} {month_word}"


def _replace_cardinal_match(match):
    """Regex callback: spell a remaining number as a cardinal."""
    prep = match.group(1) or ""  # includes its trailing whitespace
    num_str = match.group(2)
    # Case comes from the general preposition table; nominative when there
    # is no preposition or it maps to nothing.
    morph_case = get_case_from_preposition(prep.strip()) if prep else None
    case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'nominative')
    words = convert_number(num_str, context_type='cardinal', case=case)
    return f"{prep}{words}"


def numbers_to_words(text: str) -> str:
    """Spell out the digits in *text* as Russian words.

    Three substitution passes run in this order:

    1. Years - "1999 год", "в 2024 году": ordinal, masculine, in the
       grammatical case of the word following the number.
    2. Dates - "25 июня", "с 1 мая": ordinal, neuter, case chosen from the
       preposition (genitive when there is none).
    3. Remaining numbers: cardinal, case chosen from a preceding
       preposition (nominative when there is none).

    An optional preposition in front of the number is kept verbatim,
    including the whitespace that followed it in the input.

    Args:
        text: Text that may contain digit sequences.

    Returns:
        The text with recognised numbers replaced by words, or "" when
        *text* is empty or None.
    """
    if not text:
        return ""

    # 1. Years: 3-4 digits followed by a word starting with "год".
    # NOTE(review): "год[а-я]*" also matches longer words such as
    # "годовщина"; confirm whether that is intended.
    text = re.sub(
        r'(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b',
        _replace_year_match,
        text,
    )

    # 2. Dates: 1-2 digits followed by a genitive month name.
    month_regex = '|'.join(MONTHS_GENITIVE)
    text = re.sub(
        r'(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(' + month_regex + r')\b',
        _replace_date_match,
        text,
    )

    # 3. Everything that is still a number (integers and decimals).
    text = re.sub(
        r'(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b',
        _replace_cardinal_match,
        text,
    )
    return text
def clean_response(text: str) -> str:
    """Strip markdown and other non-speech markup from an AI response.

    The substitutions are order-sensitive:

    * fenced code blocks are removed before inline code, otherwise the
      inline rule eats one backtick from each fence and the block survives;
    * images are removed before links, otherwise the link rule rewrites
      ``![alt](url)`` to ``!alt``;
    * line-level markers (horizontal rules, blockquotes, list markers,
      headers) are removed before emphasis, otherwise ``***`` / ``___``
      rules and ``* item`` bullets are partly consumed as italics.

    After the markup is gone, filler openers are dropped, digits are spelled
    out with numbers_to_words(), and whitespace is normalised.

    Args:
        text: Raw AI response with possible markdown.

    Returns:
        Clean text suitable for TTS, or "" when *text* is empty or None.
    """
    if not text:
        return ""

    # Citation references: [1], [citation needed], [source].
    # Hex escapes (\x5B, \x5D) stand in for the square brackets.
    text = re.sub(r'\x5B\d+\x5D', '', text)
    text = re.sub(r'\x5Bcitation\s*needed\x5D', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\x5Bsource\x5D', '', text, flags=re.IGNORECASE)

    # Fenced code blocks ```code``` are dropped entirely; this must precede
    # the inline-code rule.
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Inline code `code` -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Images ![alt](url) are dropped; this must precede the link rule.
    text = re.sub(r'!\x5B([^\x5D]*)\x5D\([^)]+\)', '', text)
    # Links [text](url) -> text
    text = re.sub(r'\x5B([^\x5D]+)\x5D\([^)]+\)', r'\1', text)

    # Line-level markers, handled before emphasis so their '*' and '_'
    # characters are not mistaken for italic delimiters.
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)  # horizontal rules
    text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE)        # blockquotes
    text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)    # bullet markers
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)    # numbered markers
    text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)      # headers

    # Bold **text** and __text__ (before italics, which share delimiters).
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)
    # Italic *text* and _text_ (underscores inside words are left alone).
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    # Strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # HTML tags, if any.
    text = re.sub(r'<[^>]+>', '', text)

    # Informal filler at the start of a line. The \b keeps the rule from
    # biting into longer words ("Нужно" must not become "жно", nor "Также"
    # become "же").
    text = re.sub(
        r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)\b[,!?:]?\s*',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    # Convert numbers to words (Russian).
    text = numbers_to_words(text)

    # Collapse runs of blank lines and of spaces, then trim the ends.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()