Update assistant features and docs

2026-02-12 14:12:37 +03:00
parent bb3133a1c0
commit ca8ebd6657
19 changed files with 814 additions and 180 deletions
--- a/app/audio/stt.py
+++ b/app/audio/stt.py
@@ -8,6 +8,7 @@ Supports Russian (default) and English.
 # Использует Deepgram API через веб-сокеты для потокового распознавания в реальном времени.

 import asyncio
+import re
 import time
 import pyaudio
 import logging
@@ -24,16 +25,19 @@ import websockets.sync.client
 from ..core.audio_manager import get_audio_manager

 # --- Патч (исправление) для библиотеки websockets ---
-# По умолчанию Deepgram SDK использует слишком короткий таймаут подключения.
-# Это часто вызывает ошибки при медленном SSL рукопожатии.
-# Мы подменяем функцию connect, чтобы увеличить таймаут до 30 секунд.
+# Явно задаём таймауты подключения, чтобы не зависать на долгом handshake.
 _original_connect = websockets.sync.client.connect

+DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
+DEEPGRAM_CONNECT_WAIT_SECONDS = 1.5
+DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
+

 def _patched_connect(*args, **kwargs):
-    kwargs.setdefault("open_timeout", 30)
-    kwargs.setdefault("ping_timeout", 30)
-    kwargs.setdefault("close_timeout", 30)
+    # Принудительно задаём короткие таймауты, даже если SDK передал свои (например, 30с).
+    kwargs["open_timeout"] = DEEPGRAM_CONNECT_TIMEOUT_SECONDS
+    kwargs["ping_timeout"] = DEEPGRAM_CONNECT_TIMEOUT_SECONDS
+    kwargs["close_timeout"] = DEEPGRAM_CONNECT_TIMEOUT_SECONDS
    print(f"DEBUG: Connecting to Deepgram with timeout={kwargs.get('open_timeout')}s")
    return _original_connect(*args, **kwargs)

@@ -44,6 +48,34 @@ sdk_ws.connect = _patched_connect
 # Отключаем лишний мусор в логах
 logging.getLogger("deepgram").setLevel(logging.WARNING)

+# Базовые пороги для остановки STT
+INITIAL_SILENCE_TIMEOUT_SECONDS = 5.0
+POST_SPEECH_SILENCE_TIMEOUT_SECONDS = 3.0
+# Длинный защитный предел, чтобы не обрывать обычную длинную фразу.
+# Фактическое завершение происходит по 3 сек тишины после речи.
+MAX_ACTIVE_SPEECH_SECONDS = 300.0
+
+_FAST_STOP_UTTERANCE_RE = re.compile(
+    r"^(?:(?:александр|алесандр|alexander|alexandr)\s+)?"
+    r"(?:стоп|хватит|перестань|прекрати|замолчи|тихо|пауза)"
+    r"(?:\s+(?:пожалуйста|please))?$",
+    flags=re.IGNORECASE,
+)
+
+
+def _normalize_command_text(text: str) -> str:
+    normalized = text.lower().replace("ё", "е")
+    normalized = re.sub(r"[^\w\s]+", " ", normalized, flags=re.UNICODE)
+    normalized = re.sub(r"\s+", " ", normalized, flags=re.UNICODE).strip()
+    return normalized
+
+
+def _is_fast_stop_utterance(text: str) -> bool:
+    normalized = _normalize_command_text(text)
+    if not normalized:
+        return False
+    return _FAST_STOP_UTTERANCE_RE.fullmatch(normalized) is not None
+

 class SpeechRecognizer:
    """Класс распознавания речи через Deepgram."""
@@ -105,24 +137,42 @@ class SpeechRecognizer:
            )
        return self.stream

-    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
+    async def _process_audio(
+        self, dg_connection, timeout_seconds, detection_timeout, fast_stop
+    ):
        """
        Асинхронная функция для отправки аудио и получения текста.

        Args:
            dg_connection: Активное соединение с Deepgram.
-            timeout_seconds: Общее время прослушивания.
+            timeout_seconds: Аварийный лимит длительности активной речи.
            detection_timeout: Время ожидания начала речи.
+            fast_stop: Если True, короткая стоп-фраза завершает STT после 1с тишины.
        """
        self.transcript = ""
        transcript_parts = []

        loop = asyncio.get_running_loop()
        stream = self._get_stream()
+        effective_detection_timeout = (
+            detection_timeout
+            if detection_timeout is not None
+            else INITIAL_SILENCE_TIMEOUT_SECONDS
+        )

        # События для синхронизации
        stop_event = asyncio.Event()  # Пора останавливаться
        speech_started_event = asyncio.Event()  # Речь обнаружена (VAD)
+        last_speech_activity = time.monotonic()
+        first_speech_activity_at = None
+
+        def mark_speech_activity():
+            nonlocal last_speech_activity, first_speech_activity_at
+            now = time.monotonic()
+            last_speech_activity = now
+            if first_speech_activity_at is None:
+                first_speech_activity_at = now
+            speech_started_event.set()

        # --- Обработчики событий Deepgram ---
        def on_transcript(unused_self, result, **kwargs):
@@ -130,6 +180,20 @@ class SpeechRecognizer:
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
+            try:
+                loop.call_soon_threadsafe(mark_speech_activity)
+            except RuntimeError:
+                pass
+
+            if fast_stop:
+                if _is_fast_stop_utterance(sentence):
+                    self.transcript = sentence.strip()
+                    try:
+                        loop.call_soon_threadsafe(stop_event.set)
+                    except RuntimeError:
+                        pass
+                    return
+
            if result.is_final:
                # Собираем только финальные (подтвержденные) фразы
                transcript_parts.append(sentence)
@@ -138,18 +202,16 @@ class SpeechRecognizer:
        def on_speech_started(unused_self, speech_started, **kwargs):
            """Вызывается, когда VAD (Voice Activity Detection) слышит голос."""
            try:
-                loop.call_soon_threadsafe(speech_started_event.set)
+                loop.call_soon_threadsafe(mark_speech_activity)
            except RuntimeError:
                # Event loop might be closed, ignore
                pass

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            """Вызывается, когда Deepgram решает, что фраза закончилась (пауза)."""
-            try:
-                loop.call_soon_threadsafe(stop_event.set)
-            except RuntimeError:
-                # Event loop might be closed, ignore
-                pass
+            # Не останавливаемся мгновенно на событии Deepgram.
+            # Остановка управляется локальным порогом тишины POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
+            return

        def on_error(unused_self, error, **kwargs):
            print(f"Deepgram Error: {error}")
@@ -174,10 +236,10 @@ class SpeechRecognizer:
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
-            utterance_end_ms=1000,  # Пауза 1.0с считается концом фразы (было 1.2)
+            utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
            vad_events=True,
-            # Добавляем параметры таймаута для долгой работы
-            endpointing=300,  # Таймаут в миллисекундах для автоматического завершения
+            # Сглаженный порог endpointing, чтобы не резать речь на коротких паузах.
+            endpointing=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
        )

        # --- Задача отправки аудио с буферизацией ---
@@ -198,24 +260,29 @@ class SpeechRecognizer:
                    None, lambda: dg_connection.start(options)
                )

-                # Пока подключаемся, копим данные
-                timeout_count = 0
-                max_timeout = 5000  # Максимальное количество итераций ожидания (около 2.5 секунд при 0.0005 задержке)
-
-                while not connect_future.done() and timeout_count < max_timeout:
+                # Пока подключаемся, копим данные.
+                # Ждём коротко: если сеть подвисла, быстрее перезапускаем попытку.
+                connect_deadline = time.monotonic() + DEEPGRAM_CONNECT_WAIT_SECONDS
+                while (
+                    not connect_future.done()
+                    and time.monotonic() < connect_deadline
+                ):
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        audio_buffer.append(data)
-                    await asyncio.sleep(0.0005)  # Уменьшаем задержку для более быстрой обработки
-                    timeout_count += 1
+                    await asyncio.sleep(DEEPGRAM_CONNECT_POLL_SECONDS)

-                if timeout_count >= max_timeout:
-                    print("⏰ Timeout connecting to Deepgram")
+                if not connect_future.done():
+                    print(
+                        f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
+                    )
+                    stop_event.set()
                    return

                # Проверяем результат подключения
                if connect_future.result() is False:
                    print("Failed to start Deepgram connection")
+                    stop_event.set()
                    return

                print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")
@@ -227,11 +294,8 @@ class SpeechRecognizer:

                audio_buffer = None  # Освобождаем память

-                # 4. Продолжаем стримить в реальном времени
-                stream_timeout = 0
-                max_stream_timeout = int(timeout_seconds / 0.002)  # Примерный таймаут в зависимости от timeout_seconds
-
-                while not stop_event.is_set() and stream_timeout < max_stream_timeout:
+                # 4. Продолжаем стримить в реальном времени до события остановки.
+                while not stop_event.is_set():
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        dg_connection.send(data)
@@ -239,7 +303,6 @@ class SpeechRecognizer:
                        if chunks_sent % 50 == 0:
                            print(".", end="", flush=True)
                    await asyncio.sleep(0.002)  # Уменьшаем задержку для более быстрого реагирования
-                    stream_timeout += 1

            except Exception as e:
                print(f"Audio send error: {e}")
@@ -255,19 +318,60 @@ class SpeechRecognizer:

        try:
            # 1. Ждем начала речи (если задан detection_timeout)
-            if detection_timeout:
+            if (
+                effective_detection_timeout
+                and effective_detection_timeout > 0
+                and not stop_event.is_set()
+            ):
+                speech_wait_task = asyncio.create_task(speech_started_event.wait())
+                stop_wait_task = asyncio.create_task(stop_event.wait())
                try:
-                    await asyncio.wait_for(
-                        speech_started_event.wait(), timeout=detection_timeout
+                    done, pending = await asyncio.wait(
+                        {speech_wait_task, stop_wait_task},
+                        timeout=effective_detection_timeout,
+                        return_when=asyncio.FIRST_COMPLETED,
                    )
-                except asyncio.TimeoutError:
+                finally:
+                    for task in (speech_wait_task, stop_wait_task):
+                        if not task.done():
+                            task.cancel()
+                    await asyncio.gather(
+                        speech_wait_task, stop_wait_task, return_exceptions=True
+                    )
+
+                if not done:
                    # Если за detection_timeout никто не начал говорить, выходим
                    stop_event.set()

-            # 2. Если речь началась (или таймаута нет), ждем завершения (stop_event)
-            # stop_event сработает либо по UtteranceEnd (пауза), либо по общему таймауту
+            # 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
+            # Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
            if not stop_event.is_set():
-                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
+                max_active_speech_seconds = max(
+                    timeout_seconds if timeout_seconds else 0.0,
+                    MAX_ACTIVE_SPEECH_SECONDS,
+                )
+
+                while not stop_event.is_set():
+                    now = time.monotonic()
+
+                    if speech_started_event.is_set():
+                        if (
+                            now - last_speech_activity
+                            >= POST_SPEECH_SILENCE_TIMEOUT_SECONDS
+                        ):
+                            stop_event.set()
+                            break
+
+                        if (
+                            first_speech_activity_at is not None
+                            and now - first_speech_activity_at
+                            >= max_active_speech_seconds
+                        ):
+                            print("⏱️ Достигнут защитный лимит активного прослушивания.")
+                            stop_event.set()
+                            break
+
+                    await asyncio.sleep(0.05)

        except asyncio.TimeoutError:
            pass  # Общий таймаут вышел
@@ -291,16 +395,18 @@ class SpeechRecognizer:
    def listen(
        self,
        timeout_seconds: float = 7.0,
-        detection_timeout: float = None,
+        detection_timeout: float = INITIAL_SILENCE_TIMEOUT_SECONDS,
        lang: str = "ru",
+        fast_stop: bool = False,
    ) -> str:
        """
        Основной метод: слушает микрофон и возвращает текст.

        Args:
-            timeout_seconds: Максимальная длительность фразы.
+            timeout_seconds: Защитный лимит длительности активной речи.
            detection_timeout: Сколько ждать начала речи перед тем как сдаться.
            lang: Язык ("ru" или "en").
+            fast_stop: Быстрое завершение для коротких stop-команд.
        """
        if not self.dg_client:
            self.initialize()
@@ -323,7 +429,7 @@ class SpeechRecognizer:
                # Запускаем асинхронный процесс обработки
                transcript = asyncio.run(
                    self._process_audio(
-                        dg_connection, timeout_seconds, detection_timeout
+                        dg_connection, timeout_seconds, detection_timeout, fast_stop
                    )
                )
                final_text = transcript.strip() if transcript else ""
@@ -389,10 +495,13 @@ def get_recognizer() -> SpeechRecognizer:


 def listen(
-    timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru"
+    timeout_seconds: float = 7.0,
+    detection_timeout: float = INITIAL_SILENCE_TIMEOUT_SECONDS,
+    lang: str = "ru",
+    fast_stop: bool = False,
 ) -> str:
    """Внешняя функция для прослушивания."""
-    return get_recognizer().listen(timeout_seconds, detection_timeout, lang)
+    return get_recognizer().listen(timeout_seconds, detection_timeout, lang, fast_stop)


 def cleanup():