Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling

This commit is contained in:
2026-01-07 17:59:18 +03:00
parent 7b79593cad
commit 53809c03f4
5 changed files with 233 additions and 89 deletions

View File

@@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
# Deepgram configuration
# API key for Deepgram streaming STT; read from the environment, no default —
# STT initialization is expected to fail fast when this is unset.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
# Porcupine configuration
PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
# Custom wake-word model file; presumably shipped next to this config module
# (BASE_DIR is defined above this hunk) — TODO confirm the file is bundled.
PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"

View File

@@ -1,15 +1,32 @@
aenum==3.1.16
aiofiles==25.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
antlr4-python3-runtime==4.9.3
anyio==4.12.1
attrs==25.4.0
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
dataclasses-json==0.6.7
DAWG2-Python==0.9.0
deepgram-sdk==3.11.0
deprecation==2.1.0
docopt==0.6.2
filelock==3.20.1
frozenlist==1.8.0
fsspec==2025.12.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11
Jinja2==3.1.6
MarkupSafe==3.0.3
marshmallow==3.26.2
mpmath==1.3.0
multidict==6.7.0
mypy_extensions==1.1.0
networkx==3.6.1
num2words==0.5.14
numpy==2.4.0
@@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
packaging==25.0
propcache==0.4.1
pvporcupine==4.0.1
PyAudio==0.2.14
pycparser==2.23
@@ -46,7 +65,9 @@ torch==2.9.1
torchaudio==2.9.1
tqdm==4.67.1
triton==3.5.1
typing-inspect==0.9.0
typing_extensions==4.15.0
urllib3==2.6.2
vosk==0.3.45
websockets==15.0.1
yarl==1.22.0

242
stt.py
View File

@@ -1,109 +1,197 @@
"""
Speech-to-Text module using Vosk.
Recognizes Russian speech from microphone.
Speech-to-Text module using Deepgram API.
Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""
import json
import os
import asyncio
import threading
import pyaudio
from vosk import Model, KaldiRecognizer
from config import VOSK_MODEL_PATH, SAMPLE_RATE
import logging
from config import DEEPGRAM_API_KEY, SAMPLE_RATE
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
# Configure logging to suppress debug noise
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
"""Speech recognizer using Vosk."""
"""Speech recognizer using Deepgram streaming."""
def __init__(self):
self.model = None
self.recognizer = None
self.dg_client = None
self.pa = None
self.stream = None
self.transcript = ""
self.lock = threading.Lock()
def initialize(self):
"""Initialize Vosk model and audio stream."""
print("📦 Загрузка модели Vosk...")
self.model = Model(str(VOSK_MODEL_PATH))
self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
self.recognizer.SetWords(True)
"""Initialize Deepgram client and PyAudio."""
if not DEEPGRAM_API_KEY:
raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
print("📦 Инициализация Deepgram STT...")
config = DeepgramClientOptions(
verbose=logging.WARNING,
)
self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
self.pa = pyaudio.PyAudio()
self.stream = self.pa.open(
rate=SAMPLE_RATE,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=4096
)
print("✅ Модель Vosk загружена")
print("✅ Deepgram клиент готов")
def listen(self, timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
def _get_stream(self):
    """Return the shared microphone input stream, opening it lazily."""
    stream = self.stream
    if stream is None:
        # 16-bit mono PCM at the project sample rate, matching what the
        # Deepgram LiveOptions advertise (linear16, 1 channel).
        stream = self.pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=4096,
        )
        self.stream = stream
    return stream
async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
    """Stream microphone audio to Deepgram and collect the transcript.

    Args:
        dg_connection: Deepgram live-transcription connection (sync client,
            so ``start``/``send``/``finish`` are plain calls, not awaited).
        timeout_seconds: Maximum time to wait for the utterance to complete.
        detection_timeout: Optional time to wait for speech to begin; when it
            elapses with no VAD "speech started" event, listening stops early.

    Returns:
        The accumulated transcript ("" when nothing was recognized).
    """
    self.transcript = ""
    # Collect every final segment instead of keeping only the latest one,
    # otherwise multi-sentence utterances lose all but the last sentence.
    segments = []
    loop = asyncio.get_running_loop()
    stream = self._get_stream()
    stop_event = asyncio.Event()
    speech_started_event = asyncio.Event()
    recognizer = self  # captured by the SDK callbacks below

    def on_transcript(unused_self, result, **kwargs):
        sentence = result.channel.alternatives[0].transcript
        if not sentence:
            return
        if result.is_final:
            print(f"📝 Частичный результат: {sentence}")
            with recognizer.lock:
                segments.append(sentence)
                recognizer.transcript = " ".join(segments)

    def on_speech_started(unused_self, speech_started, **kwargs):
        # SDK callbacks fire on worker threads — hop back onto the loop.
        loop.call_soon_threadsafe(speech_started_event.set)

    def on_utterance_end(unused_self, utterance_end, **kwargs):
        loop.call_soon_threadsafe(stop_event.set)

    def on_error(unused_self, error, **kwargs):
        print(f"Error: {error}")
        loop.call_soon_threadsafe(stop_event.set)

    dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
    dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
    dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
    dg_connection.on(LiveTranscriptionEvents.Error, on_error)

    options = LiveOptions(
        model="nova-2",
        language=self.current_lang,
        smart_format=True,
        encoding="linear16",
        channels=1,
        sample_rate=SAMPLE_RATE,
        interim_results=True,
        utterance_end_ms="1200",
        vad_events=True,
    )
    # start() is synchronous on the sync live client — NO await.
    if dg_connection.start(options) is False:
        print("Failed to start Deepgram connection")
        # Keep the return type consistent with the success path (str, not None).
        return self.transcript

    async def send_audio():
        """Pump raw PCM chunks from PyAudio into the Deepgram socket."""
        chunks_sent = 0
        started = False
        try:
            stream.start_stream()
            started = True
            print("🎤 Stream started, sending audio...")
            while not stop_event.is_set():
                if stream.is_active():
                    data = stream.read(4096, exception_on_overflow=False)
                    # send() is synchronous on the sync client — NO await.
                    dg_connection.send(data)
                    chunks_sent += 1
                    if chunks_sent % 50 == 0:
                        print(".", end="", flush=True)
                # Yield so the event loop can process timeouts and callbacks.
                await asyncio.sleep(0.005)
        except Exception as e:
            print(f"Audio send error: {e}")
        finally:
            # Only stop a stream that actually started; stop_stream() on a
            # never-started stream would raise from this cleanup path.
            if started:
                stream.stop_stream()
            print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

    sender_task = asyncio.create_task(send_audio())
    try:
        # 1. Wait for speech to start (detection_timeout), if requested.
        if detection_timeout:
            try:
                await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
            except asyncio.TimeoutError:
                # No speech within the detection window — abort quietly.
                stop_event.set()
        # 2. If started (or no detection timeout), wait for completion.
        if not stop_event.is_set():
            await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        # Global timeout — fall through and return whatever was recognized.
        pass
    stop_event.set()
    await sender_task
    # finish() is synchronous on the sync client.
    dg_connection.finish()
    return self.transcript
def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
"""
Listen to microphone and transcribe speech.
Args:
timeout_seconds: Maximum time to listen for speech
detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds.
Returns:
Transcribed text from speech
"""
if not self.model:
if not self.dg_client:
self.initialize()
print("🎙️ Слушаю... (говорите)")
self.current_lang = lang
print(f"🎙️ Слушаю ({lang})...")
# Reset recognizer for new recognition
self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
# Create a new connection for each listen session
dg_connection = self.dg_client.listen.live.v("1")
frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096)
detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read
try:
transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
silence_frames = 0
max_silence_frames = 10 # About 2.5 seconds of silence
speech_started = False
for i in range(frames_to_read):
data = self.stream.read(4096, exception_on_overflow=False)
if self.recognizer.AcceptWaveform(data):
result = json.loads(self.recognizer.Result())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
return text
silence_frames += 1
final_text = transcript.strip() if transcript else ""
if final_text:
print(f"📝 Распознано: {final_text}")
else:
# Check partial result
partial = json.loads(self.recognizer.PartialResult())
if partial.get("partial", ""):
silence_frames = 0
speech_started = True
else:
silence_frames += 1
# Check detection timeout
if not speech_started and i > detection_frames:
break
# Stop if too much silence after speech
if silence_frames > max_silence_frames:
break
# Get final result
result = json.loads(self.recognizer.FinalResult())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
else:
# Only print if we weren't just checking for presence of speech
if not detection_timeout or speech_started:
print("⚠️ Речь не распознана")
return text
return final_text
except Exception as e:
print(f"❌ Ошибка STT: {e}")
return ""
def cleanup(self):
    """Release microphone and PyAudio resources.

    Idempotent: both handles are dropped after release so a second call
    (e.g. from an atexit hook AND an explicit shutdown path) is a no-op
    instead of calling terminate() on an already-terminated PyAudio.
    """
    if self.stream:
        self.stream.stop_stream()
        self.stream.close()
        self.stream = None
    if self.pa:
        self.pa.terminate()
        self.pa = None
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
return _recognizer
def listen(timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
"""Listen to microphone and return transcribed text."""
return get_recognizer().listen(timeout_seconds, detection_timeout)
return get_recognizer().listen(timeout_seconds, detection_timeout, lang)
def cleanup():

6
tts.py
View File

@@ -1,6 +1,6 @@
"""
Text-to-Speech module using Silero TTS.
Generates natural Russian speech with Xenia voice.
Generates natural Russian speech.
Supports interruption via wake word detection using threading.
"""
@@ -140,10 +140,6 @@ class TextToSpeech:
except Exception as e:
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
success = False
# Continue with next chunk? or break?
# Usually if one fails, we might want to try others, but for "too long" error
# splitting should solve it. If it fails for other reasons, maybe better to stop.
# But let's keep trying subsequent chunks in case it's a specific symbol issue.
if success and not self._interrupted:
print("✅ Воспроизведение завершено")

View File

@@ -41,6 +41,24 @@ class WakeWordDetector:
if not self.porcupine:
self.initialize()
# Ensure stream is open and active
if self.audio_stream is None or not self.audio_stream.is_active():
# If closed or None, we might need to recreate it.
# PyAudio streams once closed cannot be reopened usually?
# We should probably recreate it.
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
while True:
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
@@ -48,6 +66,9 @@ class WakeWordDetector:
keyword_index = self.porcupine.process(pcm)
if keyword_index >= 0:
print("✅ Wake word обнаружен!")
# Stop and CLOSE stream to release mic for STT
self.audio_stream.stop_stream()
self.audio_stream.close()
return True
def check_wakeword_once(self) -> bool:
@@ -59,6 +80,21 @@ class WakeWordDetector:
self.initialize()
try:
# Ensure stream is open/active
if self.audio_stream is None or not self.audio_stream.is_active():
# Re-open if needed (similar to wait_for_wakeword logic)
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)