feat: improve semantic voice control and music playback
This commit is contained in:
262
app/audio/stt.py
262
app/audio/stt.py
@@ -12,6 +12,8 @@ import re
|
||||
import time
|
||||
import pyaudio
|
||||
import logging
|
||||
import contextlib
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE, WAKE_WORD_ALIASES
|
||||
from deepgram import (
|
||||
@@ -29,8 +31,12 @@ from ..core.audio_manager import get_audio_manager
|
||||
_original_connect = websockets.sync.client.connect
|
||||
|
||||
DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 1.5
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 4.0
|
||||
DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
|
||||
SENDER_STOP_WAIT_SECONDS = 2.5
|
||||
SENDER_FORCE_RELEASE_WAIT_SECONDS = 2.5
|
||||
DEEPGRAM_FINALIZATION_GRACE_SECONDS = 0.35
|
||||
DEEPGRAM_FINISH_TIMEOUT_SECONDS = 4.0
|
||||
|
||||
|
||||
def _patched_connect(*args, **kwargs):
|
||||
@@ -152,6 +158,82 @@ class SpeechRecognizer:
|
||||
)
|
||||
return self.stream
|
||||
|
||||
def _open_stream_for_session(self):
|
||||
"""Открывает отдельный входной поток для одной STT-сессии."""
|
||||
if self.audio_manager is None:
|
||||
self.audio_manager = get_audio_manager()
|
||||
stream, self._input_device_index, sample_rate = self.audio_manager.open_input_stream(
|
||||
rate=SAMPLE_RATE,
|
||||
channels=1,
|
||||
format=pyaudio.paInt16,
|
||||
frames_per_buffer=4096,
|
||||
preferred_index=self._input_device_index,
|
||||
fallback_rates=[48000, 44100, 32000, 22050, 16000, 8000],
|
||||
)
|
||||
if sample_rate != SAMPLE_RATE:
|
||||
print(
|
||||
f"⚠️ STT mic stream uses fallback rate={sample_rate} "
|
||||
f"(requested {SAMPLE_RATE})"
|
||||
)
|
||||
return stream, int(sample_rate)
|
||||
|
||||
def _stop_stream_quietly(self):
|
||||
if not self.stream:
|
||||
return
|
||||
try:
|
||||
if self.stream.is_active():
|
||||
self.stream.stop_stream()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _release_stream(self):
|
||||
if not self.stream:
|
||||
return
|
||||
self._stop_stream_quietly()
|
||||
try:
|
||||
self.stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
self.stream = None
|
||||
|
||||
async def _wait_for_thread(self, thread, timeout_seconds: float) -> bool:
|
||||
"""Асинхронно ждет завершения daemon-thread без блокировки event loop."""
|
||||
deadline = time.monotonic() + timeout_seconds
|
||||
while thread.is_alive() and time.monotonic() < deadline:
|
||||
await asyncio.sleep(0.05)
|
||||
return not thread.is_alive()
|
||||
|
||||
async def _run_blocking_cleanup(self, func, timeout_seconds: float, label: str) -> bool:
|
||||
"""Запускает потенциально подвисающий cleanup в daemon-thread и ждет ограниченное время."""
|
||||
done_event = threading.Event()
|
||||
error_holder = {}
|
||||
|
||||
def runner():
|
||||
try:
|
||||
func()
|
||||
except Exception as exc:
|
||||
error_holder["error"] = exc
|
||||
finally:
|
||||
done_event.set()
|
||||
|
||||
thread = threading.Thread(target=runner, daemon=True, name=label)
|
||||
thread.start()
|
||||
|
||||
deadline = time.monotonic() + timeout_seconds
|
||||
while not done_event.is_set() and time.monotonic() < deadline:
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
if not done_event.is_set():
|
||||
print(f"⚠️ {label} timed out; continuing cleanup.")
|
||||
return False
|
||||
|
||||
error = error_holder.get("error")
|
||||
if error is not None:
|
||||
print(f"⚠️ {label} failed: {error}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
async def _process_audio(
|
||||
self, dg_connection, timeout_seconds, detection_timeout, fast_stop
|
||||
):
|
||||
@@ -166,9 +248,9 @@ class SpeechRecognizer:
|
||||
"""
|
||||
self.transcript = ""
|
||||
transcript_parts = []
|
||||
latest_interim = ""
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
stream = self._get_stream()
|
||||
effective_detection_timeout = (
|
||||
detection_timeout
|
||||
if detection_timeout is not None
|
||||
@@ -192,9 +274,13 @@ class SpeechRecognizer:
|
||||
# --- Обработчики событий Deepgram ---
|
||||
def on_transcript(unused_self, result, **kwargs):
|
||||
"""Вызывается, когда приходит часть текста."""
|
||||
nonlocal latest_interim
|
||||
sentence = result.channel.alternatives[0].transcript
|
||||
if len(sentence) == 0:
|
||||
return
|
||||
sentence = sentence.strip()
|
||||
if not sentence:
|
||||
return
|
||||
try:
|
||||
loop.call_soon_threadsafe(mark_speech_activity)
|
||||
except RuntimeError:
|
||||
@@ -202,9 +288,9 @@ class SpeechRecognizer:
|
||||
|
||||
if fast_stop:
|
||||
if _is_fast_stop_utterance(sentence):
|
||||
self.transcript = sentence.strip()
|
||||
self.transcript = sentence
|
||||
try:
|
||||
loop.call_soon_threadsafe(stop_event.set)
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
return
|
||||
@@ -213,6 +299,10 @@ class SpeechRecognizer:
|
||||
# Собираем только финальные (подтвержденные) фразы
|
||||
transcript_parts.append(sentence)
|
||||
self.transcript = " ".join(transcript_parts).strip()
|
||||
latest_interim = ""
|
||||
else:
|
||||
# Fallback: некоторые сессии завершаются без is_final.
|
||||
latest_interim = sentence
|
||||
|
||||
def on_speech_started(unused_self, speech_started, **kwargs):
|
||||
"""Вызывается, когда VAD (Voice Activity Detection) слышит голос."""
|
||||
@@ -231,7 +321,7 @@ class SpeechRecognizer:
|
||||
def on_error(unused_self, error, **kwargs):
|
||||
print(f"Deepgram Error: {error}")
|
||||
try:
|
||||
loop.call_soon_threadsafe(stop_event.set)
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
# Event loop might be closed, ignore
|
||||
pass
|
||||
@@ -242,27 +332,34 @@ class SpeechRecognizer:
|
||||
dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
|
||||
dg_connection.on(LiveTranscriptionEvents.Error, on_error)
|
||||
|
||||
# Параметры распознавания
|
||||
options = LiveOptions(
|
||||
model="nova-2", # Самая быстрая и точная модель
|
||||
language=self.current_lang,
|
||||
smart_format=True, # Расстановка знаков препинания
|
||||
encoding="linear16",
|
||||
channels=1,
|
||||
sample_rate=self._stream_sample_rate,
|
||||
interim_results=True,
|
||||
utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
|
||||
vad_events=True,
|
||||
# Сглаженный порог endpointing, чтобы не резать речь на коротких паузах.
|
||||
endpointing=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
|
||||
)
|
||||
|
||||
# --- Задача отправки аудио с буферизацией ---
|
||||
async def send_audio():
|
||||
sender_stop_event = threading.Event()
|
||||
|
||||
def request_stop():
|
||||
stop_event.set()
|
||||
sender_stop_event.set()
|
||||
|
||||
def send_audio():
|
||||
chunks_sent = 0
|
||||
audio_buffer = [] # Буфер для накопления звука во время подключения
|
||||
stream = None
|
||||
|
||||
try:
|
||||
stream, stream_sample_rate = self._open_stream_for_session()
|
||||
options = LiveOptions(
|
||||
model="nova-2", # Самая быстрая и точная модель
|
||||
language=self.current_lang,
|
||||
smart_format=True, # Расстановка знаков препинания
|
||||
encoding="linear16",
|
||||
channels=1,
|
||||
sample_rate=stream_sample_rate,
|
||||
interim_results=True,
|
||||
utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
|
||||
vad_events=True,
|
||||
# Сглаженный порог endpointing, чтобы не резать речь на коротких паузах.
|
||||
endpointing=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
|
||||
)
|
||||
|
||||
# 1. Сразу начинаем захват звука, не дожидаясь сети!
|
||||
stream.start_stream()
|
||||
print("🎤 Stream started (buffering)...")
|
||||
@@ -270,34 +367,61 @@ class SpeechRecognizer:
|
||||
# 2. Запускаем подключение к Deepgram в фоне (через ThreadPool, т.к. start() блокирующий)
|
||||
# Но в данном SDK start() возвращает bool, он может быть блокирующим.
|
||||
# Deepgram Python SDK v3+ start() делает handshake.
|
||||
connect_result = {"done": False, "ok": None, "error": None}
|
||||
|
||||
connect_future = loop.run_in_executor(
|
||||
None, lambda: dg_connection.start(options)
|
||||
def start_connection():
|
||||
try:
|
||||
connect_result["ok"] = dg_connection.start(options)
|
||||
except Exception as exc:
|
||||
connect_result["error"] = exc
|
||||
finally:
|
||||
connect_result["done"] = True
|
||||
|
||||
connect_thread = threading.Thread(
|
||||
target=start_connection, daemon=True
|
||||
)
|
||||
connect_thread.start()
|
||||
|
||||
# Пока подключаемся, копим данные.
|
||||
# Ждём коротко: если сеть подвисла, быстрее перезапускаем попытку.
|
||||
connect_deadline = time.monotonic() + DEEPGRAM_CONNECT_WAIT_SECONDS
|
||||
while (
|
||||
not connect_future.done()
|
||||
not connect_result["done"]
|
||||
and time.monotonic() < connect_deadline
|
||||
and not sender_stop_event.is_set()
|
||||
):
|
||||
if stream.is_active():
|
||||
data = stream.read(4096, exception_on_overflow=False)
|
||||
try:
|
||||
data = stream.read(4096, exception_on_overflow=False)
|
||||
except Exception as read_error:
|
||||
if sender_stop_event.is_set():
|
||||
return
|
||||
print(f"Audio read error during connect: {read_error}")
|
||||
with contextlib.suppress(RuntimeError):
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
audio_buffer.append(data)
|
||||
await asyncio.sleep(DEEPGRAM_CONNECT_POLL_SECONDS)
|
||||
time.sleep(DEEPGRAM_CONNECT_POLL_SECONDS)
|
||||
|
||||
if not connect_future.done():
|
||||
if sender_stop_event.is_set():
|
||||
return
|
||||
|
||||
if not connect_result["done"]:
|
||||
print(
|
||||
f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
|
||||
)
|
||||
stop_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
# Проверяем результат подключения
|
||||
if connect_future.result() is False:
|
||||
if connect_result["error"] is not None:
|
||||
print(f"Failed to start Deepgram connection: {connect_result['error']}")
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
if connect_result["ok"] is False:
|
||||
print("Failed to start Deepgram connection")
|
||||
stop_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")
|
||||
@@ -310,23 +434,45 @@ class SpeechRecognizer:
|
||||
audio_buffer = None # Освобождаем память
|
||||
|
||||
# 4. Продолжаем стримить в реальном времени до события остановки.
|
||||
while not stop_event.is_set():
|
||||
if stream.is_active():
|
||||
while not sender_stop_event.is_set():
|
||||
if not stream.is_active():
|
||||
break
|
||||
try:
|
||||
data = stream.read(4096, exception_on_overflow=False)
|
||||
dg_connection.send(data)
|
||||
chunks_sent += 1
|
||||
if chunks_sent % 50 == 0:
|
||||
print(".", end="", flush=True)
|
||||
await asyncio.sleep(0.002) # Уменьшаем задержку для более быстрого реагирования
|
||||
except Exception as read_error:
|
||||
if sender_stop_event.is_set():
|
||||
break
|
||||
print(f"Audio read error: {read_error}")
|
||||
with contextlib.suppress(RuntimeError):
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
break
|
||||
if sender_stop_event.is_set():
|
||||
break
|
||||
dg_connection.send(data)
|
||||
chunks_sent += 1
|
||||
if chunks_sent % 50 == 0:
|
||||
print(".", end="", flush=True)
|
||||
time.sleep(0.002) # Уменьшаем задержку для более быстрого реагирования
|
||||
|
||||
except Exception as e:
|
||||
print(f"Audio send error: {e}")
|
||||
with contextlib.suppress(RuntimeError):
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
finally:
|
||||
if stream.is_active():
|
||||
stream.stop_stream()
|
||||
with contextlib.suppress(Exception):
|
||||
if stream and stream.is_active():
|
||||
stream.stop_stream()
|
||||
with contextlib.suppress(Exception):
|
||||
if stream:
|
||||
stream.close()
|
||||
print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")
|
||||
|
||||
sender_task = asyncio.create_task(send_audio())
|
||||
sender_thread = threading.Thread(
|
||||
target=send_audio,
|
||||
daemon=True,
|
||||
name="deepgram-audio-sender",
|
||||
)
|
||||
sender_thread.start()
|
||||
|
||||
if False: # dg_connection.start(options) перенесен внутрь send_audio
|
||||
pass
|
||||
@@ -356,7 +502,7 @@ class SpeechRecognizer:
|
||||
|
||||
if not done:
|
||||
# Если за detection_timeout никто не начал говорить, выходим
|
||||
stop_event.set()
|
||||
request_stop()
|
||||
|
||||
# 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
|
||||
# Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
|
||||
@@ -374,7 +520,7 @@ class SpeechRecognizer:
|
||||
now - last_speech_activity
|
||||
>= POST_SPEECH_SILENCE_TIMEOUT_SECONDS
|
||||
):
|
||||
stop_event.set()
|
||||
request_stop()
|
||||
break
|
||||
|
||||
if (
|
||||
@@ -383,7 +529,7 @@ class SpeechRecognizer:
|
||||
>= max_active_speech_seconds
|
||||
):
|
||||
print("⏱️ Достигнут защитный лимит активного прослушивания.")
|
||||
stop_event.set()
|
||||
request_stop()
|
||||
break
|
||||
|
||||
await asyncio.sleep(0.05)
|
||||
@@ -393,19 +539,29 @@ class SpeechRecognizer:
|
||||
except Exception as e:
|
||||
print(f"Error in waiting for events: {e}")
|
||||
|
||||
stop_event.set()
|
||||
try:
|
||||
await sender_task
|
||||
except Exception as e:
|
||||
print(f"Error waiting for sender task: {e}")
|
||||
request_stop()
|
||||
sender_stopped = await self._wait_for_thread(
|
||||
sender_thread,
|
||||
timeout_seconds=max(SENDER_STOP_WAIT_SECONDS, SENDER_FORCE_RELEASE_WAIT_SECONDS),
|
||||
)
|
||||
if not sender_stopped:
|
||||
print("⚠️ Audio sender shutdown timed out; continuing cleanup.")
|
||||
|
||||
# Небольшая пауза, чтобы получить последние transcript-события перед finish().
|
||||
await asyncio.sleep(DEEPGRAM_FINALIZATION_GRACE_SECONDS)
|
||||
|
||||
# Завершаем соединение и ждем последние результаты
|
||||
try:
|
||||
dg_connection.finish()
|
||||
except Exception as e:
|
||||
print(f"Error finishing connection: {e}")
|
||||
await self._run_blocking_cleanup(
|
||||
dg_connection.finish,
|
||||
timeout_seconds=DEEPGRAM_FINISH_TIMEOUT_SECONDS,
|
||||
label="Deepgram finish",
|
||||
)
|
||||
|
||||
return self.transcript
|
||||
final_text = self.transcript.strip()
|
||||
if not final_text:
|
||||
final_text = latest_interim.strip()
|
||||
self.transcript = final_text
|
||||
return final_text
|
||||
|
||||
def listen(
|
||||
self,
|
||||
|
||||
@@ -286,6 +286,9 @@ class TextToSpeech:
|
||||
if not text.strip():
|
||||
return True
|
||||
|
||||
if check_interrupt is None:
|
||||
check_interrupt = self._default_interrupt_checker()
|
||||
|
||||
if language == "ru":
|
||||
text = self._preprocess_text(text)
|
||||
segments = self._split_mixed_language(text)
|
||||
@@ -296,6 +299,14 @@ class TextToSpeech:
|
||||
text, check_interrupt=check_interrupt, language=language
|
||||
)
|
||||
|
||||
def _default_interrupt_checker(self):
|
||||
try:
|
||||
from .wakeword import check_wakeword_once
|
||||
|
||||
return check_wakeword_once
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _resample_audio(self, audio_np: np.ndarray, src_rate: int, dst_rate: int):
|
||||
if src_rate == dst_rate:
|
||||
return audio_np.astype(np.float32, copy=False)
|
||||
|
||||
Reference in New Issue
Block a user