Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling

This commit is contained in:
2026-01-07 17:59:18 +03:00
parent 7b79593cad
commit 53809c03f4
5 changed files with 233 additions and 89 deletions

256
stt.py
View File

@@ -1,109 +1,197 @@
"""
Speech-to-Text module using Vosk.
Recognizes Russian speech from microphone.
Speech-to-Text module using Deepgram API.
Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""
import json
import os
import asyncio
import threading
import pyaudio
from vosk import Model, KaldiRecognizer
from config import VOSK_MODEL_PATH, SAMPLE_RATE
import logging
from config import DEEPGRAM_API_KEY, SAMPLE_RATE
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
# Configure logging to suppress debug noise
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
"""Speech recognizer using Vosk."""
"""Speech recognizer using Deepgram streaming."""
def __init__(self):
self.model = None
self.recognizer = None
self.dg_client = None
self.pa = None
self.stream = None
self.transcript = ""
self.lock = threading.Lock()
def initialize(self):
    """Create the Deepgram client and the PyAudio instance.

    The microphone stream itself is not opened here; ``_get_stream()``
    opens it on first use.

    Raises:
        ValueError: if DEEPGRAM_API_KEY is empty or unset.
    """
    if not DEEPGRAM_API_KEY:
        raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")

    print("📦 Инициализация Deepgram STT...")
    config = DeepgramClientOptions(
        verbose=logging.WARNING,
    )
    self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)

    # Keep an already created PyAudio instance: replacing it on a repeated
    # initialize() would drop the old one without terminating it.
    if getattr(self, "pa", None) is None:
        self.pa = pyaudio.PyAudio()
    print("✅ Deepgram клиент готов")
def _get_stream(self):
"""Open audio stream if not open."""
if self.stream is None:
self.stream = self.pa.open(
rate=SAMPLE_RATE,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=4096,
)
return self.stream
async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
    """Stream microphone audio to Deepgram and collect the transcript.

    Args:
        dg_connection: live connection object; its ``on``/``start``/``send``/
            ``finish`` methods are called synchronously (no await).
        timeout_seconds: maximum time to wait for the utterance to end once
            the detection phase (if any) has passed.
        detection_timeout: if truthy, maximum time to wait for a
            SpeechStarted event; on expiry the session stops immediately.
            If falsy, this phase is skipped.

    Returns:
        All finalized transcript segments joined with spaces, or "" when
        nothing was recognized or the connection could not be started.
    """
    loop = asyncio.get_running_loop()
    stream = self._get_stream()
    stop_event = asyncio.Event()
    speech_started_event = asyncio.Event()

    # Finalized segments of the current utterance, in arrival order.
    final_segments = []
    with self.lock:
        self.transcript = ""

    # The handlers receive the connection object as their first argument,
    # so the recognizer instance is reached through the closure's `self`.
    def on_transcript(unused_self, result, **kwargs):
        sentence = result.channel.alternatives[0].transcript
        if not sentence or not result.is_final:
            return
        print(f"📝 Частичный результат: {sentence}")
        with self.lock:
            # One utterance can be delivered as several final segments;
            # keep all of them instead of only the most recent one.
            final_segments.append(sentence)
            self.transcript = " ".join(final_segments)

    def on_speech_started(unused_self, speech_started, **kwargs):
        loop.call_soon_threadsafe(speech_started_event.set)

    def on_utterance_end(unused_self, utterance_end, **kwargs):
        loop.call_soon_threadsafe(stop_event.set)

    def on_error(unused_self, error, **kwargs):
        print(f"Error: {error}")
        loop.call_soon_threadsafe(stop_event.set)

    dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
    dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
    dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
    dg_connection.on(LiveTranscriptionEvents.Error, on_error)

    options = LiveOptions(
        model="nova-2",
        # Falls back to Russian if listen() has not set a language yet.
        language=getattr(self, "current_lang", "ru"),
        smart_format=True,
        encoding="linear16",
        channels=1,
        sample_rate=SAMPLE_RATE,
        interim_results=True,
        utterance_end_ms="1200",
        vad_events=True,
    )

    # Start connection (synchronous call, no await).
    if dg_connection.start(options) is False:
        print("Failed to start Deepgram connection")
        return ""

    def read_chunk():
        return stream.read(4096, exception_on_overflow=False)

    async def send_audio():
        chunks_sent = 0
        try:
            stream.start_stream()
            print("🎤 Stream started, sending audio...")
            while not stop_event.is_set():
                if not stream.is_active():
                    # Nothing to read yet; yield instead of spinning.
                    await asyncio.sleep(0.005)
                    continue
                # The microphone read blocks until a full buffer is ready;
                # run it in the default executor so timeouts and event
                # callbacks scheduled on the loop are not stalled.
                data = await loop.run_in_executor(None, read_chunk)
                # Send is synchronous in the sync client, no await.
                dg_connection.send(data)
                chunks_sent += 1
                if chunks_sent % 50 == 0:
                    print(".", end="", flush=True)
        except Exception as e:
            print(f"Audio send error: {e}")
        finally:
            stream.stop_stream()
            print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

    sender_task = asyncio.create_task(send_audio())
    try:
        # 1. Wait for speech to start (detection_timeout).
        if detection_timeout:
            try:
                await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
            except asyncio.TimeoutError:
                stop_event.set()

        # 2. If started (or no detection timeout), wait for completion.
        if not stop_event.is_set():
            try:
                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
            except asyncio.TimeoutError:
                pass
    finally:
        # Always stop the sender and close the connection, even when the
        # wait above is interrupted by an unexpected exception.
        stop_event.set()
        try:
            await sender_task
        finally:
            # Finish is synchronous.
            dg_connection.finish()

    with self.lock:
        return self.transcript
def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
"""
Listen to microphone and transcribe speech.
Args:
timeout_seconds: Maximum time to listen for speech
detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds.
Returns:
Transcribed text from speech
"""
if not self.model:
if not self.dg_client:
self.initialize()
print("🎙️ Слушаю... (говорите)")
# Reset recognizer for new recognition
self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096)
detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read
silence_frames = 0
max_silence_frames = 10 # About 2.5 seconds of silence
speech_started = False
for i in range(frames_to_read):
data = self.stream.read(4096, exception_on_overflow=False)
if self.recognizer.AcceptWaveform(data):
result = json.loads(self.recognizer.Result())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
return text
silence_frames += 1
self.current_lang = lang
print(f"🎙️ Слушаю ({lang})...")
# Create a new connection for each listen session
dg_connection = self.dg_client.listen.live.v("1")
try:
transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
final_text = transcript.strip() if transcript else ""
if final_text:
print(f"📝 Распознано: {final_text}")
else:
# Check partial result
partial = json.loads(self.recognizer.PartialResult())
if partial.get("partial", ""):
silence_frames = 0
speech_started = True
else:
silence_frames += 1
# Check detection timeout
if not speech_started and i > detection_frames:
break
# Stop if too much silence after speech
if silence_frames > max_silence_frames:
break
# Get final result
result = json.loads(self.recognizer.FinalResult())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
else:
# Only print if we weren't just checking for presence of speech
if not detection_timeout or speech_started:
print("⚠️ Речь не распознана")
return text
return final_text
except Exception as e:
print(f"❌ Ошибка STT: {e}")
return ""
def cleanup(self):
"""Release resources."""
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
if self.pa:
self.pa.terminate()
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
return _recognizer
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to the microphone and return the transcribed text.

    Thin wrapper that forwards all arguments to the ``listen`` method of
    the recognizer returned by ``get_recognizer()``.
    """
    recognizer = get_recognizer()
    return recognizer.listen(timeout_seconds, detection_timeout, lang)
def cleanup():
@@ -130,4 +218,4 @@ def cleanup():
global _recognizer
if _recognizer:
_recognizer.cleanup()
_recognizer = None
_recognizer = None