feat: refine assistant logic and update docs
This commit is contained in:
120
app/audio/stt.py
120
app/audio/stt.py
@@ -8,14 +8,13 @@ Supports Russian (default) and English.
|
||||
# Использует Deepgram API через веб-сокеты для потокового распознавания в реальном времени.
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
import pyaudio
|
||||
import logging
|
||||
import contextlib
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE, WAKE_WORD_ALIASES
|
||||
from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE
|
||||
from deepgram import (
|
||||
DeepgramClient,
|
||||
DeepgramClientOptions,
|
||||
@@ -25,13 +24,14 @@ from deepgram import (
|
||||
import deepgram.clients.common.v1.abstract_sync_websocket as sdk_ws
|
||||
import websockets.sync.client
|
||||
from ..core.audio_manager import get_audio_manager
|
||||
from ..core.commands import is_fast_command
|
||||
|
||||
# --- Патч (исправление) для библиотеки websockets ---
|
||||
# Явно задаём таймауты подключения, чтобы не зависать на долгом handshake.
|
||||
_original_connect = websockets.sync.client.connect
|
||||
|
||||
DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 4.0
|
||||
DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 5.0
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 6.5
|
||||
DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
|
||||
SENDER_STOP_WAIT_SECONDS = 2.5
|
||||
SENDER_FORCE_RELEASE_WAIT_SECONDS = 2.5
|
||||
@@ -62,28 +62,6 @@ POST_SPEECH_SILENCE_TIMEOUT_SECONDS = 2.0
|
||||
# Фактическое завершение происходит примерно после 2.0 сек тишины после речи.
|
||||
MAX_ACTIVE_SPEECH_SECONDS = 300.0
|
||||
|
||||
# Pattern for a whole utterance that is just a stop command: an optional
# wake-word alias followed by whitespace, then one stop word, then an
# optional trailing "пожалуйста"/"please". Matching is case-insensitive.
_FAST_STOP_UTTERANCE_RE = re.compile(
    "".join(
        (
            r"^(?:(?:",
            # Aliases are escaped so they are matched literally.
            "|".join(map(re.escape, WAKE_WORD_ALIASES)),
            r")\s+)?",
            r"(?:стоп|хватит|перестань|прекрати|замолчи|тихо|пауза)",
            r"(?:\s+(?:пожалуйста|please))?$",
        )
    ),
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _normalize_command_text(text: str) -> str:
|
||||
normalized = text.lower().replace("ё", "е")
|
||||
normalized = re.sub(r"[^\w\s]+", " ", normalized, flags=re.UNICODE)
|
||||
normalized = re.sub(r"\s+", " ", normalized, flags=re.UNICODE).strip()
|
||||
return normalized
|
||||
|
||||
|
||||
def _is_fast_stop_utterance(text: str) -> bool:
    """Tell whether *text* consists solely of a short stop command.

    The text is normalized with ``_normalize_command_text`` and must then
    match ``_FAST_STOP_UTTERANCE_RE`` in full. Text that normalizes to an
    empty string is never a stop command.

    Args:
        text: Raw utterance text.

    Returns:
        True if the whole normalized utterance matches the stop pattern,
        otherwise False.
    """
    candidate = _normalize_command_text(text)
    return bool(candidate) and _FAST_STOP_UTTERANCE_RE.fullmatch(candidate) is not None
|
||||
|
||||
|
||||
class SpeechRecognizer:
|
||||
"""Класс распознавания речи через Deepgram."""
|
||||
|
||||
@@ -280,7 +258,7 @@ class SpeechRecognizer:
|
||||
dg_connection: Активное соединение с Deepgram.
|
||||
timeout_seconds: Аварийный лимит длительности активной речи.
|
||||
detection_timeout: Время ожидания начала речи.
|
||||
fast_stop: Если True, короткая стоп-фраза завершает STT после 1с тишины.
|
||||
fast_stop: Если True, короткие системные команды завершают STT раньше.
|
||||
"""
|
||||
self.transcript = ""
|
||||
transcript_parts = []
|
||||
@@ -296,6 +274,8 @@ class SpeechRecognizer:
|
||||
# События для синхронизации
|
||||
stop_event = asyncio.Event() # Пора останавливаться
|
||||
speech_started_event = asyncio.Event() # Речь обнаружена (VAD)
|
||||
connection_ready_event = threading.Event() # WS с Deepgram готов
|
||||
connection_failed_event = threading.Event() # WS с Deepgram завершился ошибкой
|
||||
last_speech_activity = time.monotonic()
|
||||
first_speech_activity_at = None
|
||||
session_error = {"message": None}
|
||||
@@ -338,14 +318,13 @@ class SpeechRecognizer:
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
if fast_stop:
|
||||
if _is_fast_stop_utterance(sentence):
|
||||
self.transcript = sentence
|
||||
try:
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
return
|
||||
if fast_stop and is_fast_command(sentence):
|
||||
self.transcript = sentence
|
||||
try:
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
return
|
||||
|
||||
if result.is_final:
|
||||
# Собираем только финальные (подтвержденные) фразы
|
||||
@@ -470,6 +449,7 @@ class SpeechRecognizer:
|
||||
print(
|
||||
f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
|
||||
)
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
@@ -479,15 +459,18 @@ class SpeechRecognizer:
|
||||
f"Failed to start Deepgram connection: {connect_result['error']}"
|
||||
)
|
||||
print(f"Failed to start Deepgram connection: {connect_result['error']}")
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
if connect_result["ok"] is False:
|
||||
mark_session_error("Failed to start Deepgram connection")
|
||||
print("Failed to start Deepgram connection")
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
connection_ready_event.set()
|
||||
print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")
|
||||
|
||||
# 3. Отправляем накопленный буфер
|
||||
@@ -522,6 +505,7 @@ class SpeechRecognizer:
|
||||
except Exception as e:
|
||||
mark_session_error(f"Audio send error: {e}")
|
||||
print(f"Audio send error: {e}")
|
||||
connection_failed_event.set()
|
||||
with contextlib.suppress(RuntimeError):
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
finally:
|
||||
@@ -551,26 +535,56 @@ class SpeechRecognizer:
|
||||
and effective_detection_timeout > 0
|
||||
and not stop_event.is_set()
|
||||
):
|
||||
speech_wait_task = asyncio.create_task(speech_started_event.wait())
|
||||
stop_wait_task = asyncio.create_task(stop_event.wait())
|
||||
try:
|
||||
done, pending = await asyncio.wait(
|
||||
{speech_wait_task, stop_wait_task},
|
||||
timeout=effective_detection_timeout,
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
finally:
|
||||
for task in (speech_wait_task, stop_wait_task):
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
await asyncio.gather(
|
||||
speech_wait_task, stop_wait_task, return_exceptions=True
|
||||
)
|
||||
# Важно: не считаем пользователя "молчаливым", пока WS-соединение
|
||||
# с Deepgram еще не поднялось.
|
||||
connect_ready_deadline = time.monotonic() + max(
|
||||
effective_detection_timeout + 0.25,
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS + 0.75,
|
||||
)
|
||||
while (
|
||||
not stop_event.is_set()
|
||||
and not connection_ready_event.is_set()
|
||||
and time.monotonic() < connect_ready_deadline
|
||||
):
|
||||
if connection_failed_event.is_set():
|
||||
break
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
if not done:
|
||||
# Если за detection_timeout никто не начал говорить, выходим
|
||||
if (
|
||||
not stop_event.is_set()
|
||||
and not connection_ready_event.is_set()
|
||||
and not connection_failed_event.is_set()
|
||||
):
|
||||
mark_session_error("Deepgram connection was not ready before speech timeout.")
|
||||
request_stop()
|
||||
|
||||
if (
|
||||
stop_event.is_set()
|
||||
or connection_failed_event.is_set()
|
||||
or not connection_ready_event.is_set()
|
||||
):
|
||||
request_stop()
|
||||
else:
|
||||
speech_wait_task = asyncio.create_task(speech_started_event.wait())
|
||||
stop_wait_task = asyncio.create_task(stop_event.wait())
|
||||
try:
|
||||
done, pending = await asyncio.wait(
|
||||
{speech_wait_task, stop_wait_task},
|
||||
timeout=effective_detection_timeout,
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
finally:
|
||||
for task in (speech_wait_task, stop_wait_task):
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
await asyncio.gather(
|
||||
speech_wait_task, stop_wait_task, return_exceptions=True
|
||||
)
|
||||
|
||||
if not done:
|
||||
# Если за detection_timeout после поднятия WS никто не начал говорить, выходим.
|
||||
request_stop()
|
||||
|
||||
# 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
|
||||
# Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
|
||||
if not stop_event.is_set():
|
||||
@@ -687,7 +701,7 @@ class SpeechRecognizer:
|
||||
timeout_seconds: Защитный лимит длительности активной речи.
|
||||
detection_timeout: Сколько ждать начала речи перед тем как сдаться.
|
||||
lang: Язык ("ru" или "en").
|
||||
fast_stop: Быстрое завершение для коротких stop-команд.
|
||||
fast_stop: Быстрое завершение для коротких системных команд.
|
||||
"""
|
||||
if not self.dg_client:
|
||||
self.initialize()
|
||||
|
||||
Reference in New Issue
Block a user