""" Response cleaner module. Removes markdown formatting and special characters from AI responses. Handles complex number-to-text conversion for Russian language. """ import re import pymorphy3 from num2words import num2words # Initialize morphological analyzer morph = pymorphy3.MorphAnalyzer() # Preposition to case mapping (simplified heuristics) PREPOSITION_CASES = { "в": "loct", # Prepositional (Locative 2) or Accusative. 'v godu' -> loct "во": "loct", "на": "accs", # Dates: 'na 5 maya' -> Accusative (na pyatoe) "о": "loct", "об": "loct", "обо": "loct", "при": "loct", "у": "gent", "от": "gent", "до": "gent", "из": "gent", "с": "gent", # or ablt (instrumental) "со": "gent", "без": "gent", "для": "gent", "вокруг": "gent", "после": "gent", "к": "datv", "ко": "datv", "по": "datv", # or accs for dates (limit). Heuristic: datv defaults usually. "над": "ablt", "под": "ablt", "перед": "ablt", "за": "ablt", # or acc "между": "ablt", } # Mapping pymorphy cases to num2words cases PYMORPHY_TO_NUM2WORDS = { "nomn": "nominative", "gent": "genitive", "datv": "dative", "accs": "accusative", "ablt": "instrumental", "loct": "prepositional", "voct": "nominative", # Fallback "gen2": "genitive", "acc2": "accusative", "loc2": "prepositional", } # Month names in Genitive case (as they appear in dates) MONTHS_GENITIVE = [ "января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря", ] def get_case_from_preposition(prep_token): """Return pymorphy case based on preposition.""" if not prep_token: return None return PREPOSITION_CASES.get(prep_token.lower()) def convert_number(number_str, context_type="cardinal", case="nominative", gender="m"): """Convert a number string to words with specific parameters.""" try: # Handle floats if "." in number_str or "," in number_str: num_val = float(number_str.replace(",", ".")) else: num_val = int(number_str) return num2words(num_val, lang="ru", to=context_type, case=case, gender=gender) except Exception as e: print(f"Error converting number {number_str}: {e}") return number_str def numbers_to_words(text: str) -> str: """ Intelligent conversion of digits in text to Russian words. Handles years, dates, and basic case agreement. """ if not text: return "" # 1. Identify "Year" patterns: "1999 год", "в 2024 году" def replace_year_match(match): full_str = match.group(0) prep = match.group(1) # Could be None year_str = match.group(2) year_word = match.group(3) # год, году, года... parsed = morph.parse(year_word)[0] case_tag = parsed.tag.case if ( prep and prep.strip().lower() in ["в", "во"] and case_tag in ["accs", "nomn"] ): pass nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, "nominative") words = convert_number( year_str, context_type="ordinal", case=nw_case, gender="m" ) prefix = f"{prep} " if prep else "" return f"{prefix}{words} {year_word}" text = re.sub( r"(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b", replace_year_match, text, ) # 2. Identify "Date" patterns: "25 июня", "с 1 мая" # Matches: (Preposition)? (Day) (Month_Genitive) # Day is usually 1-31. month_regex = "|".join(MONTHS_GENITIVE) def replace_date_match(match): prep = match.group(1) day_str = match.group(2) month_word = match.group(3) # Determine case # Default to Genitive ("25 июня" -> "двадцать пятого июня") case = "genitive" if prep: prep_clean = prep.strip().lower() # Specific overrides for dates if prep_clean == "на": case = "accusative" # на 5 мая -> на пятое elif prep_clean == "по": case = "accusative" # по 5 мая -> по пятое (limit) elif prep_clean == "к": case = "dative" # к 5 мая -> к пятому elif prep_clean in ["с", "до", "от"]: case = "genitive" # с 5 мая -> с пятого else: # Fallback to general preposition map morph_case = get_case_from_preposition(prep_clean) if morph_case: case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "genitive") # Convert to Ordinal # Dates are neuter ("число" implies neuter: "пятое", "пятого") # However, num2words for genitive ordinal: # 5, ordinal, genitive -> "пятого" (masc/neut are same) # 5, ordinal, accusative -> "пятое" (neuter) vs "пятый" (masc inanimate?) # Let's specify gender='n' (neuter) for dates to be safe (пятое, пятого, пятому). words = convert_number(day_str, context_type="ordinal", case=case, gender="n") prefix = f"{prep} " if prep else "" return f"{prefix}{words} {month_word}" text = re.sub( r"(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(" + month_regex + r")\b", replace_date_match, text, ) # 3. Handle remaining numbers (Cardinals) def replace_cardinal_match(match): prep = match.group(1) num_str = match.group(2) case = "nominative" if prep: morph_case = get_case_from_preposition(prep.strip()) if morph_case: case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "nominative") words = convert_number(num_str, context_type="cardinal", case=case) prefix = f"{prep} " if prep else "" return f"{prefix}{words}" text = re.sub( r"(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b", replace_cardinal_match, text, ) return text def clean_response(text: str, language: str = "ru") -> str: """ Clean AI response from markdown formatting and special characters. Args: text: Raw AI response with possible markdown language: Target language for output (affects post-processing) Returns: Clean text suitable for TTS """ if not text: return "" # Remove citation references like [1], [2], [citation], etc. # Using hex escapes for brackets to avoid escaping issues text = re.sub(r"\x5B\d+\x5D", "", text) text = re.sub(r"\x5Bcitation\s*needed\x5D", "", text, flags=re.IGNORECASE) text = re.sub(r"\x5Bsource\x5D", "", text, flags=re.IGNORECASE) # Remove markdown bold **text** and __text__ text = re.sub(r"\*\*(.+?)\*\*", r"\1", text) text = re.sub(r"__(.+?)__", r"\1", text) # Remove markdown italic *text* and _text_ text = re.sub(r"\*(.+?)\*", r"\1", text) text = re.sub(r"(? text text = re.sub(r"\x5B([^\x5D]+)\x5D\([^)]+\)", r"\1", text) # Remove markdown images ![alt](url) text = re.sub(r"!\x5B([^\x5D]*)\x5D\([^)]+\)", "", text) # Remove inline code `code` text = re.sub(r"`([^`]+)`", r"\1", text) # Remove code blocks ```code``` text = re.sub(r"```[\s\S]*?```", "", text) # Remove markdown list markers (-, *, +, numbered) text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE) text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE) # Remove blockquotes text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE) # Remove horizontal rules text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE) # Remove HTML tags if any text = re.sub(r"<[^>]+>", "", text) # Remove informal slang greetings at the beginning of sentences/responses text = re.sub( r"^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*", "", text, flags=re.IGNORECASE | re.MULTILINE, ) # Convert numbers to words only for Russian, and only if digits exist if language == "ru" and re.search(r"\d", text): text = numbers_to_words(text) # Remove extra whitespace text = re.sub(r"\n{3,}", "\n\n", text) text = re.sub(r" +", " ", text) # Clean up and return text = text.strip() return text