""" Response cleaner module. Removes markdown formatting and special characters from AI responses. Handles complex number-to-text conversion for Russian language. """ import re import pymorphy3 from num2words import num2words # Initialize morphological analyzer morph = pymorphy3.MorphAnalyzer() # Preposition to case mapping (simplified heuristics) PREPOSITION_CASES = { 'в': 'loct', # Prepositional (Locative 2) or Accusative. 'v godu' -> loct 'во': 'loct', 'на': 'accs', # Dates: 'na 5 maya' -> Accusative (na pyatoe) 'о': 'loct', 'об': 'loct', 'обо': 'loct', 'при': 'loct', 'у': 'gent', 'от': 'gent', 'до': 'gent', 'из': 'gent', 'с': 'gent', # or ablt (instrumental) 'со': 'gent', 'без': 'gent', 'для': 'gent', 'вокруг': 'gent', 'после': 'gent', 'к': 'datv', 'ко': 'datv', 'по': 'datv', # or accs for dates (limit). Heuristic: datv defaults usually. 'над': 'ablt', 'под': 'ablt', 'перед': 'ablt', 'за': 'ablt', # or acc 'между': 'ablt', } # Mapping pymorphy cases to num2words cases PYMORPHY_TO_NUM2WORDS = { 'nomn': 'nominative', 'gent': 'genitive', 'datv': 'dative', 'accs': 'accusative', 'ablt': 'instrumental', 'loct': 'prepositional', 'voct': 'nominative', # Fallback 'gen2': 'genitive', 'acc2': 'accusative', 'loc2': 'prepositional', } # Month names in Genitive case (as they appear in dates) MONTHS_GENITIVE = [ 'января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря' ] def get_case_from_preposition(prep_token): """Return pymorphy case based on preposition.""" if not prep_token: return None return PREPOSITION_CASES.get(prep_token.lower()) def convert_number(number_str, context_type='cardinal', case='nominative', gender='m'): """Convert a number string to words with specific parameters.""" try: # Handle floats if '.' in number_str or ',' in number_str: num_val = float(number_str.replace(',', '.')) else: num_val = int(number_str) return num2words( num_val, lang='ru', to=context_type, case=case, gender=gender ) except Exception as e: print(f"Error converting number {number_str}: {e}") return number_str def numbers_to_words(text: str) -> str: """ Intelligent conversion of digits in text to Russian words. Handles years, dates, and basic case agreement. """ if not text: return "" # 1. Identify "Year" patterns: "1999 год", "в 2024 году" def replace_year_match(match): full_str = match.group(0) prep = match.group(1) # Could be None year_str = match.group(2) year_word = match.group(3) # год, году, года... parsed = morph.parse(year_word)[0] case_tag = parsed.tag.case if prep and prep.strip().lower() in ['в', 'во'] and case_tag in ['accs', 'nomn']: pass nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, 'nominative') words = convert_number(year_str, context_type='ordinal', case=nw_case, gender='m') prefix = f"{prep} " if prep else "" return f"{prefix}{words} {year_word}" text = re.sub( r'(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b', replace_year_match, text ) # 2. Identify "Date" patterns: "25 июня", "с 1 мая" # Matches: (Preposition)? (Day) (Month_Genitive) # Day is usually 1-31. month_regex = '|'.join(MONTHS_GENITIVE) def replace_date_match(match): prep = match.group(1) day_str = match.group(2) month_word = match.group(3) # Determine case # Default to Genitive ("25 июня" -> "двадцать пятого июня") case = 'genitive' if prep: prep_clean = prep.strip().lower() # Specific overrides for dates if prep_clean == 'на': case = 'accusative' # на 5 мая -> на пятое elif prep_clean == 'по': case = 'accusative' # по 5 мая -> по пятое (limit) elif prep_clean == 'к': case = 'dative' # к 5 мая -> к пятому elif prep_clean in ['с', 'до', 'от']: case = 'genitive' # с 5 мая -> с пятого else: # Fallback to general preposition map morph_case = get_case_from_preposition(prep_clean) if morph_case: case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'genitive') # Convert to Ordinal # Dates are neuter ("число" implies neuter: "пятое", "пятого") # However, num2words for genitive ordinal: # 5, ordinal, genitive -> "пятого" (masc/neut are same) # 5, ordinal, accusative -> "пятое" (neuter) vs "пятый" (masc inanimate?) # Let's specify gender='n' (neuter) for dates to be safe (пятое, пятого, пятому). words = convert_number(day_str, context_type='ordinal', case=case, gender='n') prefix = f"{prep} " if prep else "" return f"{prefix}{words} {month_word}" text = re.sub( r'(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(' + month_regex + r')\b', replace_date_match, text ) # 3. Handle remaining numbers (Cardinals) def replace_cardinal_match(match): prep = match.group(1) num_str = match.group(2) case = 'nominative' if prep: morph_case = get_case_from_preposition(prep.strip()) if morph_case: case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'nominative') words = convert_number(num_str, context_type='cardinal', case=case) prefix = f"{prep} " if prep else "" return f"{prefix}{words}" text = re.sub( r'(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b', replace_cardinal_match, text ) return text def clean_response(text: str) -> str: """ Clean AI response from markdown formatting and special characters. Args: text: Raw AI response with possible markdown Returns: Clean text suitable for TTS """ if not text: return "" # Remove citation references like [1], [2], [citation], etc. # Using hex escapes for brackets to avoid escaping issues text = re.sub(r'\x5B\d+\x5D', '', text) text = re.sub(r'\x5Bcitation\s*needed\x5D', '', text, flags=re.IGNORECASE) text = re.sub(r'\x5Bsource\x5D', '', text, flags=re.IGNORECASE) # Remove markdown bold **text** and __text__ text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) text = re.sub(r'__(.+?)__', r'\1', text) # Remove markdown italic *text* and _text_ text = re.sub(r'\*(.+?)\*', r'\1', text) text = re.sub(r'(? text text = re.sub(r'\x5B([^\x5D]+)\x5D\([^)]+\)', r'\1', text) # Remove markdown images ![alt](url) text = re.sub(r'!\x5B([^\x5D]*)\x5D\([^)]+\)', '', text) # Remove inline code `code` text = re.sub(r'`([^`]+)`', r'\1', text) # Remove code blocks ```code``` text = re.sub(r'```[\s\S]*?```', '', text) # Remove markdown list markers (-, *, +, numbered) text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE) text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE) # Remove blockquotes text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE) # Remove horizontal rules text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE) # Remove HTML tags if any text = re.sub(r'<[^>]+>', '', text) # Convert numbers to words (Russian) text = numbers_to_words(text) # Remove extra whitespace text = re.sub(r'\n{3,}', '\n\n', text) text = re.sub(r' +', ' ', text) # Clean up and return text = text.strip() return text