"""Text cleaner for TTS."""

import logging
import re

import pymorphy3
from num2words import num2words

from .config import WAKE_WORD, WAKE_WORD_ALIASES
from .roman import roman_to_int

# Module-wide morphological analyzer, created once at import time and reused by
# the number and Roman-numeral passes below.
morph = pymorphy3.MorphAnalyzer()

# Preposition -> pymorphy case tag that a number following the preposition is
# put into. Several prepositions govern more than one case; the value is the
# one chosen for numbers. Key order is preserved because the keys are joined
# into a regex alternation in numbers_to_words().
PREPOSITION_CASES = {
    "в": "loct",  # prepositional or accusative; prepositional is usual for years
    "во": "loct",
    "на": "accs",  # "on which day?" - accusative, used for dates
    "о": "loct",
    "об": "loct",
    "обо": "loct",
    "при": "loct",
    "у": "gent",  # genitive
    "от": "gent",
    "до": "gent",
    "из": "gent",
    "с": "gent",  # also instrumental, but genitive is more common ("с 5 числа")
    "со": "gent",
    "без": "gent",
    "для": "gent",
    "вокруг": "gent",
    "после": "gent",
    "к": "datv",  # dative
    "ко": "datv",
    "по": "datv",
    "над": "ablt",  # instrumental
    "под": "ablt",
    "перед": "ablt",
    "за": "ablt",
    "между": "ablt",
    "около": "gent",
    "против": "gent",
    "вместо": "gent",
    "кроме": "gent",
    "из-за": "gent",
    "сквозь": "accs",
    "через": "accs",
    "про": "accs",
}

# pymorphy case tag -> value passed as num2words' ``case=`` argument.
# The secondary tags (voct, gen2, acc2, loc2) are folded into the closest
# primary case.
PYMORPHY_TO_NUM2WORDS = {
    "nomn": "nominative",
    "gent": "genitive",
    "datv": "dative",
    "accs": "accusative",
    "ablt": "instrumental",
    "loct": "prepositional",
    "voct": "nominative",
    "gen2": "genitive",
    "acc2": "accusative",
    "loc2": "prepositional",
}

# pymorphy gender tag -> value passed as num2words' ``gender=`` argument.
PYMORPHY_TO_GENDER = {
    "masc": "m",
    "femn": "f",
    "neut": "n",
}

# Month names in the genitive case, as they appear after a day number
# ("5 мая"). Joined into a regex alternation by numbers_to_words().
MONTHS_GENITIVE = [
    "января",
    "февраля",
    "марта",
    "апреля",
    "мая",
    "июня",
    "июля",
    "августа",
    "сентября",
    "октября",
    "ноября",
    "декабря",
]

# Lemmas of time units; used for the "на <number> <unit>" special case in
# numbers_to_words().
TIME_UNIT_LEMMAS = {"час", "минута", "секунда"}
# One whole-word, case-insensitive pattern per spelling of the wake word
# (configured aliases plus the lower-cased wake word itself).
# Empty spellings are dropped: they would compile to r"\b\b", which matches at
# every word boundary. The spellings are sorted longest-first (ties broken
# alphabetically) so that the substitution order does not depend on set
# iteration order and a longer alias is replaced before a shorter one it
# contains.
WAKE_WORD_BLOCKED_PATTERNS = [
    re.compile(rf"\b{re.escape(alias)}\b", flags=re.IGNORECASE)
    for alias in sorted(
        (a for a in set(WAKE_WORD_ALIASES) | {WAKE_WORD.lower()} if a),
        key=lambda a: (-len(a), a),
    )
]

# Суффиксы порядковых
|
||
_ORDINAL_SUFFIX_MAP = {
|
||
# Masculine
|
||
"ого": ("genitive", "m"),
|
||
"его": ("genitive", "m"),
|
||
"ому": ("dative", "m"),
|
||
"ему": ("dative", "m"),
|
||
"ым": ("instrumental", "m"),
|
||
"им": ("instrumental", "m"),
|
||
"ом": ("prepositional", "m"),
|
||
"ем": ("prepositional", "m"),
|
||
"ый": ("nominative", "m"),
|
||
"ий": ("nominative", "m"),
|
||
"й": ("nominative", "m"),
|
||
"го": ("genitive", "m"),
|
||
"му": ("dative", "m"),
|
||
"м": ("prepositional", "m"),
|
||
# Feminine
|
||
"ая": ("nominative", "f"),
|
||
"яя": ("nominative", "f"),
|
||
"ую": ("accusative", "f"),
|
||
"юю": ("accusative", "f"),
|
||
"ой": ("genitive", "f"),
|
||
"ей": ("genitive", "f"),
|
||
# Neuter
|
||
"ое": ("nominative", "n"),
|
||
"ее": ("nominative", "n"),
|
||
}
|
||
|
||
|
||
def get_case_from_preposition(prep_token):
    """Return the pymorphy case tag governed by a preposition.

    Args:
        prep_token: The preposition as found in the text. Letter case and
            surrounding whitespace are ignored.

    Returns:
        The case tag from ``PREPOSITION_CASES`` (for example ``"gent"``), or
        ``None`` when the token is empty or is not a known preposition.
    """
    if not prep_token:
        return None
    # Regex groups that capture a preposition often include its trailing
    # whitespace, so normalise here instead of relying on every caller.
    return PREPOSITION_CASES.get(prep_token.strip().lower())


def convert_number(number_str, context_type="cardinal", case="nominative", gender="m"):
    """Spell out a numeric string in Russian using ``num2words``.

    Args:
        number_str: The number as text. A value containing "." or "," is
            parsed as a float (the comma is treated as a decimal separator);
            anything else is parsed as an int.
        context_type: Forwarded to num2words as ``to=``; callers in this
            module pass ``"cardinal"`` or ``"ordinal"``.
        case: Forwarded to num2words as ``case=``.
        gender: Forwarded to num2words as ``gender=``.

    Returns:
        The spelled-out number, or ``number_str`` unchanged when it cannot be
        parsed or num2words rejects the request.
    """
    try:
        if "." in number_str or "," in number_str:
            num_val = float(number_str.replace(",", "."))
        else:
            num_val = int(number_str)

        return num2words(num_val, lang="ru", to=context_type, case=case, gender=gender)
    except Exception:
        # Deliberately best-effort: a number that cannot be spelled out is
        # left as digits. The failure is reported through logging rather than
        # printed to stdout.
        logging.getLogger(__name__).warning(
            "Error converting number %r", number_str, exc_info=True
        )
        return number_str


def numbers_to_words(text: str) -> str:
    """Replace the digits in *text* with Russian words.

    The passes run from most to least specific, so a later pass only sees
    digits that the earlier ones left untouched:

    1. clock times, with or without a leading "в"/"во"/"к"/"ко";
    2. 3-4 digit ordinals written with a hyphenated ending ("1990-го"),
       optionally followed by a form of "год";
    3. 3-4 digit numbers followed by a form of "год";
    4. a 1-2 digit day followed by a genitive month name;
    5. every remaining number, spelled as a cardinal.

    Args:
        text: The text to process.

    Returns:
        The text with numbers spelled out, or ``""`` for empty input.
    """
    if not text:
        return ""

    preps_list = "|".join(map(re.escape, PREPOSITION_CASES.keys()))

    # --- 1. Clock times -----------------------------------------------------

    def _minute_words(minute_val: int) -> str:
        # ":00" is read as "ровно"; ":05" keeps its leading zero ("ноль пять").
        if minute_val == 0:
            return "ровно"
        spoken = convert_number(
            str(minute_val), context_type="cardinal", case="nominative", gender="m"
        )
        return f"ноль {spoken}" if minute_val < 10 else spoken

    def _spell_time(hour_str, minute_str, hour_case):
        # Returns None when the digits are not a valid 24-hour clock time.
        hour_val, minute_val = int(hour_str), int(minute_str)
        if not (0 <= hour_val <= 23 and 0 <= minute_val <= 59):
            return None
        hour_words = convert_number(
            str(hour_val), context_type="cardinal", case=hour_case, gender="m"
        )
        return f"{hour_words} {_minute_words(minute_val)}"

    def replace_time_match(match):
        prep, hour_str, minute_str = match.groups()
        # "к семи" needs the dative. After "в"/"во" the hour is accusative,
        # which for an inanimate masculine numeral is identical to the
        # nominative ("в семь", not "в семи"); asking for the nominative
        # avoids depending on how num2words resolves animacy for the
        # accusative (same reason as the time-unit special case in the
        # cardinal pass below).
        hour_case = "dative" if prep.lower() in {"к", "ко"} else "nominative"
        spoken = _spell_time(hour_str, minute_str, hour_case)
        if spoken is None:
            return match.group(0)
        return f"{prep} {spoken}"

    def replace_time_no_prep_match(match):
        spoken = _spell_time(match.group(1), match.group(2), "nominative")
        return match.group(0) if spoken is None else spoken

    text = re.sub(
        r"(?i)\b(в|во|к|ко)\s+(\d{1,2})\s*:\s*(\d{2})\b",
        replace_time_match,
        text,
    )
    text = re.sub(
        r"\b(\d{1,2})\s*:\s*(\d{2})\b",
        replace_time_no_prep_match,
        text,
    )

    # NOTE: in the passes below the optional preposition group captures its
    # own trailing whitespace, so it is re-emitted as matched; appending
    # another space would double it.

    # --- 2. Ordinals with a hyphenated ending ("1990-го [года]") ------------

    def replace_year_suffix_match(match):
        prep, year_str, suffix, year_word = match.groups()
        suffix_case, suffix_gender = _ORDINAL_SUFFIX_MAP.get(
            suffix.lower(), (None, None)
        )

        # Case priority: preposition, then the ending, then nominative.
        case = None
        if prep:
            morph_case = get_case_from_preposition(prep.strip())
            if morph_case:
                case = PYMORPHY_TO_NUM2WORDS.get(morph_case)
        case = case or suffix_case or "nominative"

        # An explicit "год" forces masculine; otherwise trust the ending.
        gender = "m" if year_word else (suffix_gender or "m")

        words = convert_number(
            year_str, context_type="ordinal", case=case, gender=gender
        )
        spoken = f"{prep or ''}{words}"
        return f"{spoken} {year_word}" if year_word else spoken

    text = re.sub(
        rf"(?i)\b((?:{preps_list})\s+)?(\d{{3,4}})[-‑–—]"
        r"(ого|его|ому|ему|ым|им|ом|ем|ый|ий|ая|яя|ую|юю|ой|ей|ое|ее|й|го|му|м)\b"
        r"(?:\s+(год[а-я]*))?",
        replace_year_suffix_match,
        text,
    )

    # --- 3. Years ("в 1990 году", "1990 года") ------------------------------

    def replace_year_match(match):
        prep, year_str, year_word = match.groups()

        if not prep and year_word.lower() == "год":
            # A bare "год" is ambiguous between nominative and accusative;
            # without a preposition read it as nominative.
            nw_case = "nominative"
        else:
            # Inflected forms ("года", "году", "годом") carry their own case,
            # with or without a preposition ("5 мая 1990 года").
            case_tag = morph.parse(year_word)[0].tag.case
            nw_case = PYMORPHY_TO_NUM2WORDS.get(case_tag, "nominative")

        words = convert_number(
            year_str, context_type="ordinal", case=nw_case, gender="m"
        )
        return f"{prep or ''}{words} {year_word}"

    text = re.sub(
        r"(?i)\b((?:в|с|к|до|от)\s+)?(\d{3,4})\s+(год[а-я]*)\b",
        replace_year_match,
        text,
    )

    # --- 4. Dates ("5 мая", "к 5 мая") --------------------------------------

    month_regex = "|".join(MONTHS_GENITIVE)

    # Date-specific government; it differs from PREPOSITION_CASES for "по".
    date_prep_cases = {
        "на": "accusative",  # "на пятое мая"
        "по": "accusative",  # "по пятое"
        "к": "dative",  # "к пятому"
        "с": "genitive",  # "с пятого"
        "до": "genitive",
        "от": "genitive",
    }

    def replace_date_match(match):
        prep, day_str, month_word = match.groups()

        # Genitive by default ("пятого мая").
        case = "genitive"
        if prep:
            prep_clean = prep.strip().lower()
            case = date_prep_cases.get(prep_clean)
            if case is None:
                morph_case = get_case_from_preposition(prep_clean)
                case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "genitive")

        # The day agrees with the implied neuter noun "число".
        words = convert_number(day_str, context_type="ordinal", case=case, gender="n")
        return f"{prep or ''}{words} {month_word}"

    text = re.sub(
        rf"(?i)\b((?:с|к|до|от|на|по)\s+)?(\d{{1,2}})\s+({month_regex})\b",
        replace_date_match,
        text,
    )

    # --- 5. Remaining numbers, as cardinals ---------------------------------

    def replace_cardinal_match(match):
        prep, num_str, next_word = match.groups()
        prep_clean = prep.strip().lower() if prep else None

        case = "nominative"
        if prep_clean:
            morph_case = get_case_from_preposition(prep_clean)
            if morph_case:
                case = PYMORPHY_TO_NUM2WORDS.get(morph_case, "nominative")

        # The gender comes from the following word when it is a noun.
        gender = "m"
        parsed = None
        if next_word:
            parsed = morph.parse(next_word.strip())[0]
            if "NOUN" in parsed.tag:
                gender = PYMORPHY_TO_GENDER.get(parsed.tag.gender, "m")

        # Special case "на 1 час": a masculine/neuter time unit after "на"
        # takes the nominative form of the numeral.
        if (
            prep_clean == "на"
            and parsed is not None
            and parsed.normal_form in TIME_UNIT_LEMMAS
            and parsed.tag.gender in ("masc", "neut")
        ):
            case = "nominative"

        words = convert_number(
            num_str, context_type="cardinal", case=case, gender=gender
        )
        # Keep the digits if the conversion produced nothing.
        return f"{prep or ''}{words or num_str}"

    # The following word is captured inside a lookahead: it is available for
    # gender detection but is not consumed by the match.
    text = re.sub(
        rf"(?i)(?<!\w)((?:{preps_list})\s+)?([+-]?\d+(?:[.,]\d+)?)(?=(\s+[а-яА-ЯёЁ]+))?\b",
        replace_cardinal_match,
        text,
    )

    return text


def roman_numerals_to_words(text: str) -> str:
    """Spell out Roman numerals that follow a Cyrillic word as ordinals.

    A numeral is only considered when it directly follows a Cyrillic word
    (for example a name or a noun). The ordinal takes its case and gender
    from the first pymorphy parse of that word, defaulting to nominative
    masculine.

    Args:
        text: The text to process.

    Returns:
        The text with recognised numerals replaced, or ``""`` for empty
        input. Matches that ``roman_to_int`` rejects are left untouched.
    """
    if not text:
        return ""

    def replace_roman_match(match):
        prev_word, roman = match.groups()

        number = roman_to_int(roman)
        if number is None:
            return match.group(0)

        case = "nominative"
        gender = "m"
        try:
            tag = morph.parse(prev_word)[0].tag
            if tag.case:
                case = PYMORPHY_TO_NUM2WORDS.get(tag.case, "nominative")
            if tag.gender:
                gender = PYMORPHY_TO_GENDER.get(tag.gender, "m")
        except Exception:
            # Best-effort: fall back to whatever case/gender is set so far,
            # but leave a trace instead of swallowing the error silently.
            logging.getLogger(__name__).debug(
                "Morphological analysis failed for %r", prev_word, exc_info=True
            )

        ordinal = convert_number(
            str(number), context_type="ordinal", case=case, gender=gender
        )
        return f"{prev_word} {ordinal}"

    # NOTE(review): the pattern is case-insensitive and accepts single
    # letters, so Latin tokens such as "C" after a Cyrillic word are passed
    # to roman_to_int - confirm that this is intended.
    return re.sub(
        r"(?i)\b([А-Яа-яЁё]+)\s+([IVXLCDM]+)\b",
        replace_roman_match,
        text,
    )


def clean_response(text: str, language: str = "ru") -> str:
    """Strip markup from an assistant reply so it can be sent to TTS.

    Steps, in order: remove fenced code blocks and citation markers, strip
    line-level Markdown (rules, headings, list markers, quotes), strip inline
    Markdown and HTML tags, apply phrasing corrections, drop leading filler
    words, replace the wake word with "ассистент", spell out numbers (Russian
    only) and normalise whitespace.

    Args:
        text: The raw reply.
        language: Language code; Roman numerals and digits are spelled out
            only when it is ``"ru"``.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace, or ``""``
        for empty input.
    """
    if not text:
        return ""

    # Fenced code blocks go first: otherwise the inline-code rule below eats
    # one backtick from each fence and the block is never removed.
    text = re.sub(r"```[\s\S]*?```", "", text)

    # Citation markers
    text = re.sub(r"\x5B\d+\x5D", "", text)
    text = re.sub(r"\x5Bcitation\s*needed\x5D", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\x5Bsource\x5D", "", text, flags=re.IGNORECASE)

    # Line-level Markdown is handled before emphasis, so that a "***"/"___"
    # rule or a "* item" bullet is not mistaken for an emphasis delimiter.
    # Horizontal rules
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
    # Headings
    text = re.sub(r"^#{1,6}\s*", "", text, flags=re.MULTILINE)
    # Bulleted and numbered lists
    text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE)
    # Block quotes
    text = re.sub(r"^\s*>\s*", "", text, flags=re.MULTILINE)

    # Bold
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"__(.+?)__", r"\1", text)

    # Italic
    text = re.sub(r"\*(.+?)\*", r"\1", text)
    text = re.sub(r"(?<!\w)_(.+?)_(?!\w)", r"\1", text)

    # Strikethrough
    text = re.sub(r"~~(.+?)~~", r"\1", text)

    # Images are dropped entirely; links keep their text.
    text = re.sub(r"!\x5B([^\x5D]*)\x5D\([^)]+\)", "", text)
    text = re.sub(r"\x5B([^\x5D]+)\x5D\([^)]+\)", r"\1", text)

    # Inline code keeps its content.
    text = re.sub(r"`([^`]+)`", r"\1", text)

    # HTML tags
    text = re.sub(r"<[^>]+>", "", text)

    # Phrasing corrections: drop "это, скорее всего" after a dash, and a dash
    # left dangling in front of sentence-final punctuation.
    text = re.sub(
        r"([—-])\s*это,\s*скорее\s*всего\b\s*,?\s*",
        r"\1 ",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r"[—-]\s*([.!?])", r"\1", text)

    # Leading filler words. The \b keeps the rule from cutting the start off
    # longer words ("Нужно", "Также", "Такой").
    text = re.sub(
        r"^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)\b[,!?:]?\s*",
        "",
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    # The assistant must never say its own wake word.
    for pattern in WAKE_WORD_BLOCKED_PATTERNS:
        text = pattern.sub("ассистент", text)

    # Numbers to words
    if language == "ru":
        text = roman_numerals_to_words(text)
        if re.search(r"\d", text):
            text = numbers_to_words(text)

    # Whitespace normalisation
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" +", " ", text)

    return text.strip()