feat: refine assistant logic and update docs

2026-04-09 21:03:02 +03:00
parent ebe79c3692
commit 42c064a274
19 changed files with 1958 additions and 492 deletions
--- a/app/audio/stt.py
+++ b/app/audio/stt.py
@@ -8,14 +8,13 @@ Supports Russian (default) and English.
 # Использует Deepgram API через веб-сокеты для потокового распознавания в реальном времени.

 import asyncio
-import re
 import time
 import pyaudio
 import logging
 import contextlib
 import threading
 from datetime import datetime, timedelta
-from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE, WAKE_WORD_ALIASES
+from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE
 from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
@@ -25,13 +24,14 @@ from deepgram import (
 import deepgram.clients.common.v1.abstract_sync_websocket as sdk_ws
 import websockets.sync.client
 from ..core.audio_manager import get_audio_manager
+from ..core.commands import is_fast_command

 # --- Патч (исправление) для библиотеки websockets ---
 # Явно задаём таймауты подключения, чтобы не зависать на долгом handshake.
 _original_connect = websockets.sync.client.connect

-DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
-DEEPGRAM_CONNECT_WAIT_SECONDS = 4.0
+DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 5.0
+DEEPGRAM_CONNECT_WAIT_SECONDS = 6.5
 DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
 SENDER_STOP_WAIT_SECONDS = 2.5
 SENDER_FORCE_RELEASE_WAIT_SECONDS = 2.5
@@ -62,28 +62,6 @@ POST_SPEECH_SILENCE_TIMEOUT_SECONDS = 2.0
 # Фактическое завершение происходит примерно после 2.0 сек тишины после речи.
 MAX_ACTIVE_SPEECH_SECONDS = 300.0

-_FAST_STOP_UTTERANCE_RE = re.compile(
-    r"^(?:(?:" + "|".join(re.escape(alias) for alias in WAKE_WORD_ALIASES) + r")\s+)?"
-    r"(?:стоп|хватит|перестань|прекрати|замолчи|тихо|пауза)"
-    r"(?:\s+(?:пожалуйста|please))?$",
-    flags=re.IGNORECASE,
-)
-
-
-def _normalize_command_text(text: str) -> str:
-    normalized = text.lower().replace("ё", "е")
-    normalized = re.sub(r"[^\w\s]+", " ", normalized, flags=re.UNICODE)
-    normalized = re.sub(r"\s+", " ", normalized, flags=re.UNICODE).strip()
-    return normalized
-
-
-def _is_fast_stop_utterance(text: str) -> bool:
-    normalized = _normalize_command_text(text)
-    if not normalized:
-        return False
-    return _FAST_STOP_UTTERANCE_RE.fullmatch(normalized) is not None
-
-
 class SpeechRecognizer:
    """Класс распознавания речи через Deepgram."""

@@ -280,7 +258,7 @@ class SpeechRecognizer:
            dg_connection: Активное соединение с Deepgram.
            timeout_seconds: Аварийный лимит длительности активной речи.
            detection_timeout: Время ожидания начала речи.
-            fast_stop: Если True, короткая стоп-фраза завершает STT после 1с тишины.
+            fast_stop: Если True, короткие системные команды завершают STT раньше.
        """
        self.transcript = ""
        transcript_parts = []
@@ -296,6 +274,8 @@ class SpeechRecognizer:
        # События для синхронизации
        stop_event = asyncio.Event()  # Пора останавливаться
        speech_started_event = asyncio.Event()  # Речь обнаружена (VAD)
+        connection_ready_event = threading.Event()  # WS с Deepgram готов
+        connection_failed_event = threading.Event()  # WS с Deepgram завершился ошибкой
        last_speech_activity = time.monotonic()
        first_speech_activity_at = None
        session_error = {"message": None}
@@ -338,14 +318,13 @@ class SpeechRecognizer:
            except RuntimeError:
                pass

-            if fast_stop:
-                if _is_fast_stop_utterance(sentence):
-                    self.transcript = sentence
-                    try:
-                        loop.call_soon_threadsafe(request_stop)
-                    except RuntimeError:
-                        pass
-                    return
+            if fast_stop and is_fast_command(sentence):
+                self.transcript = sentence
+                try:
+                    loop.call_soon_threadsafe(request_stop)
+                except RuntimeError:
+                    pass
+                return

            if result.is_final:
                # Собираем только финальные (подтвержденные) фразы
@@ -470,6 +449,7 @@ class SpeechRecognizer:
                    print(
                        f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
                    )
+                    connection_failed_event.set()
                    loop.call_soon_threadsafe(request_stop)
                    return

@@ -479,15 +459,18 @@ class SpeechRecognizer:
                        f"Failed to start Deepgram connection: {connect_result['error']}"
                    )
                    print(f"Failed to start Deepgram connection: {connect_result['error']}")
+                    connection_failed_event.set()
                    loop.call_soon_threadsafe(request_stop)
                    return

                if connect_result["ok"] is False:
                    mark_session_error("Failed to start Deepgram connection")
                    print("Failed to start Deepgram connection")
+                    connection_failed_event.set()
                    loop.call_soon_threadsafe(request_stop)
                    return

+                connection_ready_event.set()
                print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")

                # 3. Отправляем накопленный буфер
@@ -522,6 +505,7 @@ class SpeechRecognizer:
            except Exception as e:
                mark_session_error(f"Audio send error: {e}")
                print(f"Audio send error: {e}")
+                connection_failed_event.set()
                with contextlib.suppress(RuntimeError):
                    loop.call_soon_threadsafe(request_stop)
            finally:
@@ -551,26 +535,56 @@ class SpeechRecognizer:
                and effective_detection_timeout > 0
                and not stop_event.is_set()
            ):
-                speech_wait_task = asyncio.create_task(speech_started_event.wait())
-                stop_wait_task = asyncio.create_task(stop_event.wait())
-                try:
-                    done, pending = await asyncio.wait(
-                        {speech_wait_task, stop_wait_task},
-                        timeout=effective_detection_timeout,
-                        return_when=asyncio.FIRST_COMPLETED,
-                    )
-                finally:
-                    for task in (speech_wait_task, stop_wait_task):
-                        if not task.done():
-                            task.cancel()
-                    await asyncio.gather(
-                        speech_wait_task, stop_wait_task, return_exceptions=True
-                    )
+                # Important: do not treat the user as "silent" while the WS connection
+                # to Deepgram has not come up yet.
+                connect_ready_deadline = time.monotonic() + max(
+                    effective_detection_timeout + 0.25,  # detection window plus a small margin
+                    DEEPGRAM_CONNECT_WAIT_SECONDS + 0.75,  # presumably outlasts the sender's own connect wait — confirm
+                )
+                while (
+                    not stop_event.is_set()
+                    and not connection_ready_event.is_set()
+                    and time.monotonic() < connect_ready_deadline
+                ):
+                    if connection_failed_event.is_set():
+                        break  # the sender reported a connection failure; stop waiting
+                    await asyncio.sleep(0.05)  # poll: the ready/failed flags are threading.Event, not awaitable

-                if not done:
-                    # Если за detection_timeout никто не начал говорить, выходим
+                if (
+                    not stop_event.is_set()
+                    and not connection_ready_event.is_set()
+                    and not connection_failed_event.is_set()
+                ):  # none of stop / ready / failed was signalled: record it as a session error
+                    mark_session_error("Deepgram connection was not ready before speech timeout.")
                     request_stop()

+                if (
+                    stop_event.is_set()
+                    or connection_failed_event.is_set()
+                    or not connection_ready_event.is_set()
+                ):  # no usable connection, or a stop was already requested: skip the speech wait
+                    request_stop()
+                else:
+                    speech_wait_task = asyncio.create_task(speech_started_event.wait())
+                    stop_wait_task = asyncio.create_task(stop_event.wait())
+                    try:
+                        done, pending = await asyncio.wait(
+                            {speech_wait_task, stop_wait_task},
+                            timeout=effective_detection_timeout,  # window starts here, after the connection is ready
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+                    finally:  # cancel whichever waiter is still pending, then await both
+                        for task in (speech_wait_task, stop_wait_task):
+                            if not task.done():
+                                task.cancel()
+                        await asyncio.gather(
+                            speech_wait_task, stop_wait_task, return_exceptions=True
+                        )
+
+                    if not done:  # empty set: the wait timed out with neither event set
+                        # If nobody started speaking within detection_timeout after the WS came up, exit.
+                        request_stop()
+
            # 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
            # Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
            if not stop_event.is_set():
@@ -687,7 +701,7 @@ class SpeechRecognizer:
            timeout_seconds: Защитный лимит длительности активной речи.
            detection_timeout: Сколько ждать начала речи перед тем как сдаться.
            lang: Язык ("ru" или "en").
-            fast_stop: Быстрое завершение для коротких stop-команд.
+            fast_stop: Быстрое завершение для коротких системных команд.
        """
        if not self.dg_client:
            self.initialize()