translator но без озвучивания слов на английском

2026-01-09 01:01:27 +03:00
parent 53809c03f4
commit 242ead5355
11 changed files with 845 additions and 238 deletions
--- a/cleaner.py
+++ b/cleaner.py
@@ -3,6 +3,7 @@ Response cleaner module.
 Removes markdown formatting and special characters from AI responses.
 Handles complex number-to-text conversion for Russian language.
 """
+
 import re
 import pymorphy3
 from num2words import num2words
@@ -12,79 +13,86 @@ morph = pymorphy3.MorphAnalyzer()

 # Preposition to case mapping (simplified heuristics)
 PREPOSITION_CASES = {
-    'в': 'loct',  # Prepositional (Locative 2) or Accusative. 'v godu' -> loct
-    'во': 'loct',
-    'на': 'accs', # Dates: 'na 5 maya' -> Accusative (na pyatoe)
-    'о': 'loct',
-    'об': 'loct',
-    'обо': 'loct',
-    'при': 'loct',
-    'у': 'gent',
-    'от': 'gent',
-    'до': 'gent',
-    'из': 'gent',
-    'с': 'gent',  # or ablt (instrumental)
-    'со': 'gent',
-    'без': 'gent',
-    'для': 'gent',
-    'вокруг': 'gent',
-    'после': 'gent',
-    'к': 'datv',
-    'ко': 'datv',
-    'по': 'datv', # or accs for dates (limit). Heuristic: datv defaults usually.
-    'над': 'ablt',
-    'под': 'ablt',
-    'перед': 'ablt',
-    'за': 'ablt', # or acc
-    'между': 'ablt',
+    "в": "loct",  # Prepositional (Locative 2) or Accusative. 'v godu' -> loct
+    "во": "loct",
+    "на": "accs",  # Dates: 'na 5 maya' -> Accusative (na pyatoe)
+    "о": "loct",
+    "об": "loct",
+    "обо": "loct",
+    "при": "loct",
+    "у": "gent",
+    "от": "gent",
+    "до": "gent",
+    "из": "gent",
+    "с": "gent",  # or ablt (instrumental)
+    "со": "gent",
+    "без": "gent",
+    "для": "gent",
+    "вокруг": "gent",
+    "после": "gent",
+    "к": "datv",
+    "ко": "datv",
+    "по": "datv",  # or accs for dates (limit). Heuristic: datv defaults usually.
+    "над": "ablt",
+    "под": "ablt",
+    "перед": "ablt",
+    "за": "ablt",  # or acc
+    "между": "ablt",
 }

 # Mapping pymorphy cases to num2words cases
 PYMORPHY_TO_NUM2WORDS = {
-    'nomn': 'nominative',
-    'gent': 'genitive',
-    'datv': 'dative',
-    'accs': 'accusative',
-    'ablt': 'instrumental',
-    'loct': 'prepositional',
-    'voct': 'nominative', # Fallback
-    'gen2': 'genitive',
-    'acc2': 'accusative',
-    'loc2': 'prepositional',
+    "nomn": "nominative",
+    "gent": "genitive",
+    "datv": "dative",
+    "accs": "accusative",
+    "ablt": "instrumental",
+    "loct": "prepositional",
+    "voct": "nominative",  # Fallback
+    "gen2": "genitive",
+    "acc2": "accusative",
+    "loc2": "prepositional",
 }

 # Month names in Genitive case (as they appear in dates)
 MONTHS_GENITIVE = [
-    'января', 'февраля', 'марта', 'апреля', 'мая', 'июня',
-    'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря'
+    "января",
+    "февраля",
+    "марта",
+    "апреля",
+    "мая",
+    "июня",
+    "июля",
+    "августа",
+    "сентября",
+    "октября",
+    "ноября",
+    "декабря",
 ]

+
 def get_case_from_preposition(prep_token):
    """Return pymorphy case based on preposition."""
    if not prep_token:
        return None
    return PREPOSITION_CASES.get(prep_token.lower())

-def convert_number(number_str, context_type='cardinal', case='nominative', gender='m'):
+
+def convert_number(number_str, context_type="cardinal", case="nominative", gender="m"):
    """Convert a number string to words with specific parameters."""
    try:
        # Handle floats
-        if '.' in number_str or ',' in number_str:
-            num_val = float(number_str.replace(',', '.'))
+        if "." in number_str or "," in number_str:
+            num_val = float(number_str.replace(",", "."))
        else:
            num_val = int(number_str)
-        
-        return num2words(
-            num_val, 
-            lang='ru', 
-            to=context_type, 
-            case=case, 
-            gender=gender
-        )
+
+        return num2words(num_val, lang="ru", to=context_type, case=case, gender=gender)
    except Exception as e:
        print(f"Error converting number {number_str}: {e}")
        return number_str

+
 def numbers_to_words(text: str) -> str:
    """
    Intelligent conversion of digits in text to Russian words.
@@ -96,59 +104,65 @@ def numbers_to_words(text: str) -> str:
    # 1. Identify "Year" patterns: "1999 год", "в 2024 году"
    def replace_year_match(match):
        full_str = match.group(0)
-        prep = match.group(1) # Could be None
+        prep = match.group(1)  # Could be None
        year_str = match.group(2)
-        year_word = match.group(3) # год, году, года...
-        
+        year_word = match.group(3)  # год, году, года...
+
        parsed = morph.parse(year_word)[0]
        case_tag = parsed.tag.case
-        
-        if prep and prep.strip().lower() in ['в', 'во'] and case_tag in ['accs', 'nomn']:
-             pass

-        nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, 'nominative')
-        
-        words = convert_number(year_str, context_type='ordinal', case=nw_case, gender='m')
-        
+        if (
+            prep
+            and prep.strip().lower() in ["в", "во"]
+            and case_tag in ["accs", "nomn"]
+        ):
+            pass
+
+        nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, "nominative")
+
+        words = convert_number(
+            year_str, context_type="ordinal", case=nw_case, gender="m"
+        )
+
        prefix = f"{prep} " if prep else ""
        return f"{prefix}{words} {year_word}"

    text = re.sub(
-        r'(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b',
+        r"(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b",
        replace_year_match,
-        text
+        text,
    )

    # 2. Identify "Date" patterns: "25 июня", "с 1 мая"
    # Matches: (Preposition)? (Day) (Month_Genitive)
    # Day is usually 1-31.
-    month_regex = '|'.join(MONTHS_GENITIVE)
-    
+    month_regex = "|".join(MONTHS_GENITIVE)
+
    def replace_date_match(match):
        prep = match.group(1)
        day_str = match.group(2)
        month_word = match.group(3)
-        
+
        # Determine case
        # Default to Genitive ("25 июня" -> "двадцать пятого июня")
-        case = 'genitive' 
-        
+        case = "genitive"
+
        if prep:
            prep_clean = prep.strip().lower()
            # Specific overrides for dates
-            if prep_clean == 'на':
-                case = 'accusative' # на 5 мая -> на пятое
-            elif prep_clean == 'по':
-                case = 'accusative' # по 5 мая -> по пятое (limit)
-            elif prep_clean == 'к':
-                case = 'dative' # к 5 мая -> к пятому
-            elif prep_clean in ['с', 'до', 'от']:
-                case = 'genitive' # с 5 мая -> с пятого
+            if prep_clean == "на":
+                case = "accusative"  # на 5 мая -> на пятое
+            elif prep_clean == "по":
+                case = "accusative"  # по 5 мая -> по пятое (limit)
+            elif prep_clean == "к":
+                case = "dative"  # к 5 мая -> к пятому
+            elif prep_clean in ["с", "до", "от"]:
+                case = "genitive"  # с 5 мая -> с пятого
            else:
                # Fallback to general preposition map
                morph_case = get_case_from_preposition(prep_clean)
                if morph_case:
-                    case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'genitive')
+                    case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "genitive")

        # Convert to Ordinal
        # Dates are neuter ("число" implies neuter: "пятое", "пятого")
@@ -156,112 +170,119 @@ def numbers_to_words(text: str) -> str:
        # 5, ordinal, genitive -> "пятого" (masc/neut are same)
        # 5, ordinal, accusative -> "пятое" (neuter) vs "пятый" (masc inanimate?)
        # Let's specify gender='n' (neuter) for dates to be safe (пятое, пятого, пятому).
-        
-        words = convert_number(day_str, context_type='ordinal', case=case, gender='n')
-        
+
+        words = convert_number(day_str, context_type="ordinal", case=case, gender="n")
+
        prefix = f"{prep} " if prep else ""
        return f"{prefix}{words} {month_word}"

    text = re.sub(
-        r'(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(' + month_regex + r')\b',
+        r"(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(" + month_regex + r")\b",
        replace_date_match,
-        text
+        text,
    )

    # 3. Handle remaining numbers (Cardinals)
    def replace_cardinal_match(match):
        prep = match.group(1)
        num_str = match.group(2)
-        
-        case = 'nominative'
+
+        case = "nominative"
        if prep:
            morph_case = get_case_from_preposition(prep.strip())
            if morph_case:
-                case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'nominative')
-        
-        words = convert_number(num_str, context_type='cardinal', case=case)
-        
+                case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "nominative")
+
+        words = convert_number(num_str, context_type="cardinal", case=case)
+
        prefix = f"{prep} " if prep else ""
        return f"{prefix}{words}"

    text = re.sub(
-        r'(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b',
+        r"(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b",
        replace_cardinal_match,
-        text
+        text,
    )
-    
+
    return text


-def clean_response(text: str) -> str:
+def clean_response(text: str, language: str = "ru") -> str:
    """
    Clean AI response from markdown formatting and special characters.
-    
+
    Args:
        text: Raw AI response with possible markdown
-        
+        language: Target language for output (affects post-processing)
+
    Returns:
        Clean text suitable for TTS
    """
    if not text:
        return ""
-    
+
    # Remove citation references like [1], [2], [citation], etc.
    # Using hex escapes for brackets to avoid escaping issues
-    text = re.sub(r'\x5B\d+\x5D', '', text)
-    text = re.sub(r'\x5Bcitation\s*needed\x5D', '', text, flags=re.IGNORECASE)
-    text = re.sub(r'\x5Bsource\x5D', '', text, flags=re.IGNORECASE)
-    
+    text = re.sub(r"\x5B\d+\x5D", "", text)
+    text = re.sub(r"\x5Bcitation\s*needed\x5D", "", text, flags=re.IGNORECASE)
+    text = re.sub(r"\x5Bsource\x5D", "", text, flags=re.IGNORECASE)
+
    # Remove markdown bold **text** and __text__
-    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
-    text = re.sub(r'__(.+?)__', r'\1', text)
-    
+    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
+    text = re.sub(r"__(.+?)__", r"\1", text)
+
    # Remove markdown italic *text* and _text_
-    text = re.sub(r'\*(.+?)\*', r'\1', text)
-    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
-    
+    text = re.sub(r"\*(.+?)\*", r"\1", text)
+    text = re.sub(r"(?<!\w)_(.+?)_(?!\w)", r"\1", text)
+
    # Remove markdown strikethrough ~~text~~
-    text = re.sub(r'~~(.+?)~~', r'\1', text)
-    
+    text = re.sub(r"~~(.+?)~~", r"\1", text)
+
    # Remove markdown headers # ## ### etc.
-    text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)
-    
+    text = re.sub(r"^#{1,6}\s*", "", text, flags=re.MULTILINE)
+
    # Remove markdown links [text](url) -> text
-    text = re.sub(r'\x5B([^\x5D]+)\x5D\([^)]+\)', r'\1', text)
-    
+    text = re.sub(r"\x5B([^\x5D]+)\x5D\([^)]+\)", r"\1", text)
+
    # Remove markdown images ![alt](url)
-    text = re.sub(r'!\x5B([^\x5D]*)\x5D\([^)]+\)', '', text)
-    
+    text = re.sub(r"!\x5B([^\x5D]*)\x5D\([^)]+\)", "", text)
+
    # Remove inline code `code`
-    text = re.sub(r'`([^`]+)`', r'\1', text)
-    
+    text = re.sub(r"`([^`]+)`", r"\1", text)
+
    # Remove code blocks ```code```
-    text = re.sub(r'```[\s\S]*?```', '', text)
-    
+    text = re.sub(r"```[\s\S]*?```", "", text)
+
    # Remove markdown list markers (-, *, +, numbered)
-    text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)
-    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
-    
+    text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
+    text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE)
+
    # Remove blockquotes
-    text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE)
-    
+    text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE)
+
    # Remove horizontal rules
-    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
-    
+    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
+
    # Remove HTML tags if any
-    text = re.sub(r'<[^>]+>', '', text)
+    text = re.sub(r"<[^>]+>", "", text)

    # Remove informal slang greetings at the beginning of sentences/responses
-    text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
-    
-    # Convert numbers to words (Russian)
-    text = numbers_to_words(text)
-    
+    text = re.sub(
+        r"^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*",
+        "",
+        text,
+        flags=re.IGNORECASE | re.MULTILINE,
+    )
+
+    # Convert numbers to words only for Russian, and only if digits exist
+    if language == "ru" and re.search(r"\d", text):
+        text = numbers_to_words(text)
+
    # Remove extra whitespace
-    text = re.sub(r'\n{3,}', '\n\n', text)
-    text = re.sub(r'  +', ' ', text)
-    
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"  +", " ", text)
+
    # Clean up and return
    text = text.strip()
-    
-    return text
+
+    return text