Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling
stt.py | 256
@@ -1,109 +1,197 @@
 """
-Speech-to-Text module using Vosk.
-Recognizes Russian speech from microphone.
+Speech-to-Text module using Deepgram API.
+Recognizes speech from microphone using streaming WebSocket.
+Supports Russian (default) and English.
 """
-import json
-import os
+import asyncio
+import threading
 import pyaudio
-from vosk import Model, KaldiRecognizer
-from config import VOSK_MODEL_PATH, SAMPLE_RATE
+import logging
+from config import DEEPGRAM_API_KEY, SAMPLE_RATE
+from deepgram import (
+    DeepgramClient,
+    DeepgramClientOptions,
+    LiveTranscriptionEvents,
+    LiveOptions,
+    Microphone,
+)
+
+# Configure logging to suppress debug noise
+logging.getLogger("deepgram").setLevel(logging.WARNING)


 class SpeechRecognizer:
-    """Speech recognizer using Vosk."""
+    """Speech recognizer using Deepgram streaming."""

     def __init__(self):
-        self.model = None
-        self.recognizer = None
+        self.dg_client = None
         self.pa = None
         self.stream = None
+        self.transcript = ""
+        self.lock = threading.Lock()

     def initialize(self):
-        """Initialize Vosk model and audio stream."""
-        print("📦 Loading Vosk model...")
-        self.model = Model(str(VOSK_MODEL_PATH))
-        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
-        self.recognizer.SetWords(True)
+        """Initialize Deepgram client and PyAudio."""
+        if not DEEPGRAM_API_KEY:
+            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
+
+        print("📦 Initializing Deepgram STT...")
+        config = DeepgramClientOptions(
+            verbose=logging.WARNING,
+        )
+        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)

         self.pa = pyaudio.PyAudio()
-        self.stream = self.pa.open(
-            rate=SAMPLE_RATE,
-            channels=1,
-            format=pyaudio.paInt16,
-            input=True,
-            frames_per_buffer=4096
-        )
-        print("✅ Vosk model loaded")
+        print("✅ Deepgram client ready")
+
+    def _get_stream(self):
+        """Open audio stream if not open."""
+        if self.stream is None:
+            self.stream = self.pa.open(
+                rate=SAMPLE_RATE,
+                channels=1,
+                format=pyaudio.paInt16,
+                input=True,
+                frames_per_buffer=4096,
+            )
+        return self.stream
+
+    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
+        """Async loop to send audio and wait for results."""
+        self.transcript = ""
+
+        loop = asyncio.get_running_loop()
+        stream = self._get_stream()
+
+        stop_event = asyncio.Event()
+        speech_started_event = asyncio.Event()
+
+        # We need access to the outer 'self' (SpeechRecognizer instance)
+        speech_recognizer_self = self
+
+        def on_transcript(unused_self, result, **kwargs):
+            sentence = result.channel.alternatives[0].transcript
+            if len(sentence) == 0:
+                return
+            if result.is_final:
+                print(f"📝 Partial result: {sentence}")
+                with speech_recognizer_self.lock:
+                    speech_recognizer_self.transcript = sentence
+
+        def on_speech_started(unused_self, speech_started, **kwargs):
+            loop.call_soon_threadsafe(speech_started_event.set)
+
+        def on_utterance_end(unused_self, utterance_end, **kwargs):
+            loop.call_soon_threadsafe(stop_event.set)
+
+        def on_error(unused_self, error, **kwargs):
+            print(f"Error: {error}")
+            loop.call_soon_threadsafe(stop_event.set)
+
+        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
+        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
+        dg_connection.on(LiveTranscriptionEvents.Error, on_error)
+
+        # Start connection (synchronous call, NO await)
+        options = LiveOptions(
+            model="nova-2",
+            language=self.current_lang,
+            smart_format=True,
+            encoding="linear16",
+            channels=1,
+            sample_rate=SAMPLE_RATE,
+            interim_results=True,
+            utterance_end_ms="1200",
+            vad_events=True,
+        )
+
+        if dg_connection.start(options) is False:
+            print("Failed to start Deepgram connection")
+            return
+
+        # Audio sending loop
+        async def send_audio():
+            chunks_sent = 0
+            try:
+                stream.start_stream()
+                print("🎤 Stream started, sending audio...")
+                while not stop_event.is_set():
+                    if stream.is_active():
+                        data = stream.read(4096, exception_on_overflow=False)
+                        # Send is synchronous in the sync client, NO await
+                        dg_connection.send(data)
+                        chunks_sent += 1
+                        if chunks_sent % 50 == 0:
+                            print(".", end="", flush=True)
+                    # Yield to allow the event loop to process events (timeouts etc.)
+                    await asyncio.sleep(0.005)
+            except Exception as e:
+                print(f"Audio send error: {e}")
+            finally:
+                stream.stop_stream()
+                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")
+
+        sender_task = asyncio.create_task(send_audio())
+
+        try:
+            # 1. Wait for speech to start (detection_timeout)
+            if detection_timeout:
+                try:
+                    await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
+                except asyncio.TimeoutError:
+                    # print("Detection timeout - no speech")
+                    stop_event.set()
+
+            # 2. If started (or no detection timeout), wait for completion
+            if not stop_event.is_set():
+                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
+
+        except asyncio.TimeoutError:
+            # print("Global timeout")
+            pass
+
+        stop_event.set()
+        await sender_task
+        # Finish is synchronous
+        dg_connection.finish()
+
+        return self.transcript

-    def listen(self, timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
+    def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
         """
         Listen to microphone and transcribe speech.

         Args:
             timeout_seconds: Maximum time to listen for speech
             detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds.

         Returns:
             Transcribed text from speech
         """
-        if not self.model:
+        if not self.dg_client:
             self.initialize()

-        print("🎙️ Listening... (speak now)")
-
-        # Reset recognizer for new recognition
-        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
-
-        frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096)
-        detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read
-
-        silence_frames = 0
-        max_silence_frames = 10  # About 2.5 seconds of silence
-        speech_started = False
-
-        for i in range(frames_to_read):
-            data = self.stream.read(4096, exception_on_overflow=False)
-
-            if self.recognizer.AcceptWaveform(data):
-                result = json.loads(self.recognizer.Result())
-                text = result.get("text", "").strip()
-                if text:
-                    print(f"📝 Recognized: {text}")
-                    return text
-                silence_frames += 1
-            else:
-                # Check partial result
-                partial = json.loads(self.recognizer.PartialResult())
-                if partial.get("partial", ""):
-                    silence_frames = 0
-                    speech_started = True
-                else:
-                    silence_frames += 1
-
-            # Check detection timeout
-            if not speech_started and i > detection_frames:
-                break
-
-            # Stop if too much silence after speech
-            if silence_frames > max_silence_frames:
-                break
-
-        # Get final result
-        result = json.loads(self.recognizer.FinalResult())
-        text = result.get("text", "").strip()
-
-        if text:
-            print(f"📝 Recognized: {text}")
-        else:
-            # Only print if we weren't just checking for presence of speech
-            if not detection_timeout or speech_started:
-                print("⚠️ No speech recognized")
-
-        return text
+        self.current_lang = lang
+        print(f"🎙️ Listening ({lang})...")
+
+        # Create a new connection for each listen session
+        dg_connection = self.dg_client.listen.live.v("1")
+
+        try:
+            transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
+
+            final_text = transcript.strip() if transcript else ""
+            if final_text:
+                print(f"📝 Recognized: {final_text}")
+            return final_text
+
+        except Exception as e:
+            print(f"❌ STT error: {e}")
+            return ""

     def cleanup(self):
         """Release resources."""
         if self.stream:
             self.stream.stop_stream()
             self.stream.close()
+            self.stream = None
         if self.pa:
             self.pa.terminate()
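Note on the event handlers above: the synchronous Deepgram live client invokes them on its own worker thread, not on the asyncio loop, which is why they signal the loop through loop.call_soon_threadsafe instead of calling event.set() directly (asyncio primitives are not thread-safe). A minimal standalone sketch of that bridge pattern, with no Deepgram dependency and purely illustrative names:

import asyncio
import threading
import time

async def main():
    loop = asyncio.get_running_loop()
    utterance_end = asyncio.Event()

    def sdk_callback():
        # Runs on a foreign thread (stand-in for the SDK's callback thread);
        # schedule the set() on the loop rather than calling it directly.
        time.sleep(0.5)
        loop.call_soon_threadsafe(utterance_end.set)

    threading.Thread(target=sdk_callback, daemon=True).start()

    try:
        await asyncio.wait_for(utterance_end.wait(), timeout=5.0)
        print("utterance ended")
    except asyncio.TimeoutError:
        print("timed out waiting for utterance")

asyncio.run(main())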
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
     return _recognizer


-def listen(timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
+def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
     """Listen to microphone and return transcribed text."""
-    return get_recognizer().listen(timeout_seconds, detection_timeout)
+    return get_recognizer().listen(timeout_seconds, detection_timeout, lang)


 def cleanup():
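The module-level wrapper keeps the old call shape and threads the new lang parameter through to the singleton. A hypothetical caller, assuming the module is importable as stt (only stt.py is shown in this commit), using the two-stage timeout the wake-word flow relies on:

import stt

# Give the user 3 s to start speaking, then up to 7 s for the whole utterance.
text = stt.listen(timeout_seconds=7.0, detection_timeout=3.0, lang="ru")
if text:
    print(f"Command: {text}")

stt.cleanup()  # release the PyAudio stream and device when done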
@@ -130,4 +218,4 @@ def cleanup():
     global _recognizer
     if _recognizer:
         _recognizer.cleanup()
-        _recognizer = None
+    _recognizer = None
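Endpointing also moves server-side with this change: the old loop counted silent 4096-sample reads locally, while the new code waits for Deepgram's UtteranceEnd event after utterance_end_ms of silence. A quick comparison of the two silence windows, assuming SAMPLE_RATE is 16000 (the config value is not shown in this diff):

# Old Vosk loop: silence measured in whole 4096-sample reads.
old_silence_s = 10 * 4096 / 16000   # ≈ 2.56 s, matching the old "about 2.5 seconds" comment
# New Deepgram path: server-side endpointing via utterance_end_ms.
new_silence_s = 1200 / 1000         # 1.2 s
print(old_silence_s, new_silence_s)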