Normalize year suffixes for TTS

2026-02-05 17:07:04 +03:00
parent a342b05875
commit 875ff7d2c4
1 changed files with 31 additions and 0 deletions
--- a/app/core/cleaner.py
+++ b/app/core/cleaner.py
@@ -128,6 +128,28 @@ def numbers_to_words(text: str) -> str:
    if not text:
        return ""

+    # 0. Handle short year notation with a suffix: "1968-м", "в 1968-м году"
+    def replace_year_suffix_match(match):
+        prep = match.group(1)  # Предлог (в, во, о...)
+        year_str = match.group(2)  # Само число
+        year_word = match.group(3)  # Слово "год", "году" и т.д. (опционально)
+
+        # Суффикс "-м/-ом" обычно соответствует предложному падежу
+        words = convert_number(
+            year_str, context_type="ordinal", case="prepositional", gender="m"
+        )
+
+        prefix = f"{prep} " if prep else ""
+        if year_word:
+            return f"{prefix}{words} {year_word}"
+        return f"{prefix}{words}"
+
+    text = re.sub(
+        r"(?i)\b((?:в|во|о|об|обо|при)\s+)?(\d{3,4})[-‑–—](?:м|ом)\b(?:\s+(год[а-я]*))?",
+        replace_year_suffix_match,
+        text,
+    )
+
    # 1. Обработка годов: "в 1999 году", "2024 год"
    def replace_year_match(match):
        prep = match.group(1)  # Предлог (в, с, к...)
@@ -315,6 +337,15 @@ def clean_response(text: str, language: str = "ru") -> str:
    # Удаление HTML тегов
    text = re.sub(r"<[^>]+>", "", text)

+    # Remove the phrase "— это, скорее всего" in pronunciation corrections
+    text = re.sub(
+        r"([—-])\s*это,\s*скорее\s*всего\b\s*,?\s*",
+        r"\1 ",
+        text,
+        flags=re.IGNORECASE,
+    )
+    text = re.sub(r"[—-]\s*([.!?])", r"\1", text)
+
    # Remove informal slang greetings at the beginning of sentences/responses
    text = re.sub(
        r"^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*",