translator но без озвучивания слов на английском

This commit is contained in:
2026-01-09 01:01:27 +03:00
parent 53809c03f4
commit 242ead5355
11 changed files with 845 additions and 238 deletions

View File

@@ -3,6 +3,7 @@ Response cleaner module.
Removes markdown formatting and special characters from AI responses.
Handles complex number-to-text conversion for Russian language.
"""
import re
import pymorphy3
from num2words import num2words
@@ -12,79 +13,86 @@ morph = pymorphy3.MorphAnalyzer()
# Heuristic map from a Russian preposition to the pymorphy case tag that a
# following numeral is put into. Several prepositions govern more than one
# case; each entry picks a single default and the comment notes the alternative.
PREPOSITION_CASES = {
    # Prepositional; could also be accusative. 'v godu' -> loct
    "в": "loct",
    "во": "loct",
    # Dates: 'na 5 maya' -> accusative (na pyatoe)
    "на": "accs",
    "о": "loct",
    "об": "loct",
    "обо": "loct",
    "при": "loct",
    "у": "gent",
    "от": "gent",
    "до": "gent",
    "из": "gent",
    # Could also be ablt (instrumental)
    "с": "gent",
    "со": "gent",
    "без": "gent",
    "для": "gent",
    "вокруг": "gent",
    "после": "gent",
    "к": "datv",
    "ко": "datv",
    # Could also be accs for date limits; dative is the default heuristic
    "по": "datv",
    "над": "ablt",
    "под": "ablt",
    "перед": "ablt",
    # Could also be accusative
    "за": "ablt",
    "между": "ablt",
}
# Translates pymorphy case tags into the case names passed to num2words.
# The secondary tags (gen2 / acc2 / loc2) collapse onto their primary case.
PYMORPHY_TO_NUM2WORDS = {
    "nomn": "nominative",
    "gent": "genitive",
    "datv": "dative",
    "accs": "accusative",
    "ablt": "instrumental",
    "loct": "prepositional",
    "voct": "nominative",  # Fallback: vocative is treated as nominative
    "gen2": "genitive",
    "acc2": "accusative",
    "loc2": "prepositional",
}
# Month names in the genitive case, the form they take inside a date
# ("25 июня"). Joined with "|" to build the date-matching pattern.
MONTHS_GENITIVE = [
    "января",
    "февраля",
    "марта",
    "апреля",
    "мая",
    "июня",
    "июля",
    "августа",
    "сентября",
    "октября",
    "ноября",
    "декабря",
]
def get_case_from_preposition(prep_token):
    """Return the pymorphy case tag associated with a preposition.

    Args:
        prep_token: Preposition text, in any letter case. Surrounding
            whitespace is ignored, so a raw regex capture such as "в " can
            be passed directly.

    Returns:
        The case tag from PREPOSITION_CASES, or None when the token is
        empty, blank, or not a known preposition.
    """
    if not prep_token:
        return None
    token = prep_token.strip().lower()
    if not token:
        return None
    return PREPOSITION_CASES.get(token)
def convert_number(number_str, context_type="cardinal", case="nominative", gender="m"):
    """Convert a number string to Russian words with specific parameters.

    Args:
        number_str: Digits, optionally with a "." or "," decimal separator.
        context_type: Forwarded to num2words as ``to``; callers in this
            module pass "cardinal" or "ordinal".
        case: Forwarded to num2words as ``case``.
        gender: Forwarded to num2words as ``gender``.

    Returns:
        The spelled-out number. If parsing or conversion fails for any
        reason, the error is printed and ``number_str`` is returned unchanged.
    """
    # Deliberately best-effort: a number that cannot be spelled out is left
    # as digits rather than aborting the whole text conversion.
    try:
        # A decimal separator (either style) means the value is a float
        if "." in number_str or "," in number_str:
            num_val = float(number_str.replace(",", "."))
        else:
            num_val = int(number_str)

        return num2words(num_val, lang="ru", to=context_type, case=case, gender=gender)
    except Exception as e:
        print(f"Error converting number {number_str}: {e}")
        return number_str
def numbers_to_words(text: str) -> str:
"""
Intelligent conversion of digits in text to Russian words.
@@ -96,59 +104,65 @@ def numbers_to_words(text: str) -> str:
# 1. Identify "Year" patterns: "1999 год", "в 2024 году"
def replace_year_match(match):
    """Rewrite a "<preposition?> <year> год…" match with the year spelled out.

    The year becomes a masculine ordinal whose case is taken from the
    inflected form of the word "год" that follows it.

    Args:
        match: Regex match with groups (1) optional preposition including
            its trailing whitespace, (2) the year digits, (3) the form of
            the word "год".

    Returns:
        The replacement text: preposition (if any), spelled-out year, and
        the original year word.
    """
    prep = match.group(1)  # Could be None
    year_str = match.group(2)
    year_word = match.group(3)  # год, году, года...

    # The case of the year word decides the case of the ordinal
    case_tag = morph.parse(year_word)[0].tag.case
    nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, "nominative")
    words = convert_number(
        year_str, context_type="ordinal", case=nw_case, gender="m"
    )

    # prep already ends with the whitespace it was captured with, so it is
    # reused as-is instead of appending a second space
    prefix = prep or ""
    return f"{prefix}{words} {year_word}"
text = re.sub(
r'(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b',
r"(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b",
replace_year_match,
text
text,
)
# 2. Identify "Date" patterns: "25 июня", "с 1 мая"
# Matches: (Preposition)? (Day) (Month_Genitive)
# Day is usually 1-31.
month_regex = '|'.join(MONTHS_GENITIVE)
month_regex = "|".join(MONTHS_GENITIVE)
def replace_date_match(match):
prep = match.group(1)
day_str = match.group(2)
month_word = match.group(3)
# Determine case
# Default to Genitive ("25 июня" -> "двадцать пятого июня")
case = 'genitive'
case = "genitive"
if prep:
prep_clean = prep.strip().lower()
# Specific overrides for dates
if prep_clean == 'на':
case = 'accusative' # на 5 мая -> на пятое
elif prep_clean == 'по':
case = 'accusative' # по 5 мая -> по пятое (limit)
elif prep_clean == 'к':
case = 'dative' # к 5 мая -> к пятому
elif prep_clean in ['с', 'до', 'от']:
case = 'genitive' # с 5 мая -> с пятого
if prep_clean == "на":
case = "accusative" # на 5 мая -> на пятое
elif prep_clean == "по":
case = "accusative" # по 5 мая -> по пятое (limit)
elif prep_clean == "к":
case = "dative" # к 5 мая -> к пятому
elif prep_clean in ["с", "до", "от"]:
case = "genitive" # с 5 мая -> с пятого
else:
# Fallback to general preposition map
morph_case = get_case_from_preposition(prep_clean)
if morph_case:
case = PYMORPHY_TO_NUM2WORDS.get(morph_case, 'genitive')
case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "genitive")
# Convert to Ordinal
# Dates are neuter ("число" implies neuter: "пятое", "пятого")
@@ -156,112 +170,119 @@ def numbers_to_words(text: str) -> str:
# 5, ordinal, genitive -> "пятого" (masc/neut are same)
# 5, ordinal, accusative -> "пятое" (neuter) vs "пятый" (masc inanimate?)
# Let's specify gender='n' (neuter) for dates to be safe (пятое, пятого, пятому).
words = convert_number(day_str, context_type='ordinal', case=case, gender='n')
words = convert_number(day_str, context_type="ordinal", case=case, gender="n")
prefix = f"{prep} " if prep else ""
return f"{prefix}{words} {month_word}"
text = re.sub(
r'(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(' + month_regex + r')\b',
r"(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{1,2})\s+(" + month_regex + r")\b",
replace_date_match,
text
text,
)
# 3. Handle remaining numbers (Cardinals)
def replace_cardinal_match(match):
    """Rewrite a "<preposition?> <number>" match with the number spelled out.

    Args:
        match: Regex match with groups (1) optional preposition including
            its trailing whitespace, (2) the number, which may contain a
            "." or "," decimal separator.

    Returns:
        The replacement text: preposition (if any) followed by the cardinal
        number in words.
    """
    prep = match.group(1)
    num_str = match.group(2)

    # Nominative unless a recognised preposition dictates another case
    case = "nominative"
    if prep:
        morph_case = get_case_from_preposition(prep.strip())
        if morph_case:
            case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "nominative")

    words = convert_number(num_str, context_type="cardinal", case=case)

    # prep already ends with the whitespace it was captured with, so it is
    # reused as-is instead of appending a second space
    prefix = prep or ""
    return f"{prefix}{words}"
text = re.sub(
r'(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b',
r"(?i)\b((?:в|на|о|об|обо|при|у|от|до|из|с|со|без|для|вокруг|после|к|ко|по|над|под|перед|за|между)\s+)?(\d+(?:[.,]\d+)?)\b",
replace_cardinal_match,
text
text,
)
return text
def clean_response(text: str, language: str = "ru") -> str:
    """
    Clean AI response from markdown formatting and special characters.

    Block-level markdown (fenced code, rules, quotes, headers, list markers)
    is stripped before inline markdown so that inline patterns cannot
    consume part of a block-level marker.

    Args:
        text: Raw AI response with possible markdown
        language: Target language for output; digits are spelled out in
            words only when this is "ru"

    Returns:
        Clean text suitable for TTS ("" for empty input)
    """
    if not text:
        return ""

    # Remove citation references like [1], [citation needed], [source].
    # Hex escapes stand in for the square brackets.
    text = re.sub(r"\x5B\d+\x5D", "", text)
    text = re.sub(r"\x5Bcitation\s*needed\x5D", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\x5Bsource\x5D", "", text, flags=re.IGNORECASE)

    # --- Block-level markdown ---

    # Remove code blocks ```code``` first: the inline-code pattern below
    # would otherwise eat one backtick from each fence and leave the block
    text = re.sub(r"```[\s\S]*?```", "", text)

    # Remove horizontal rules before emphasis handling, which would
    # otherwise reduce "***" or "___" to a stray character
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)

    # Remove blockquotes before headers/lists so "> - item" is fully cleaned
    text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE)

    # Remove markdown headers # ## ### etc.
    text = re.sub(r"^#{1,6}\s*", "", text, flags=re.MULTILINE)

    # Remove markdown list markers (-, *, +, numbered) before italics, so a
    # leading "* " is not paired with a later "*" as emphasis
    text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE)

    # --- Inline markdown ---

    # Remove inline code `code`, keeping the content
    text = re.sub(r"`([^`]+)`", r"\1", text)

    # Remove markdown images ![alt](url) before links: the link pattern
    # matches the "[alt](url)" part and would leave "!alt" behind
    text = re.sub(r"!\x5B([^\x5D]*)\x5D\([^)]+\)", "", text)

    # Remove markdown links [text](url) -> text
    text = re.sub(r"\x5B([^\x5D]+)\x5D\([^)]+\)", r"\1", text)

    # Remove markdown bold **text** and __text__
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"__(.+?)__", r"\1", text)

    # Remove markdown italic *text* and _text_
    text = re.sub(r"\*(.+?)\*", r"\1", text)
    text = re.sub(r"(?<!\w)_(.+?)_(?!\w)", r"\1", text)

    # Remove markdown strikethrough ~~text~~
    text = re.sub(r"~~(.+?)~~", r"\1", text)

    # Remove HTML tags if any
    text = re.sub(r"<[^>]+>", "", text)

    # Remove informal slang openers at the start of a line. The \b keeps
    # the pattern from chopping the front off longer words ("Нужно", "Также").
    text = re.sub(
        r"^(?:Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)\b[,!?:]?\s*",
        "",
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    # Convert numbers to words only for Russian, and only if digits exist
    if language == "ru" and re.search(r"\d", text):
        text = numbers_to_words(text)

    # Collapse extra blank lines and runs of spaces
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" +", " ", text)

    return text.strip()