Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling

2026-01-07 17:59:18 +03:00
parent 7b79593cad
commit 53809c03f4
5 changed files with 233 additions and 89 deletions
--- a/config.py
+++ b/config.py
@@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
 PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
 PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
 # Deepgram configuration
 DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
 # Porcupine configuration
 PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
 PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,32 @@
 aenum==3.1.16
 aiofiles==25.1.0
 aiohappyeyeballs==2.6.1
 aiohttp==3.13.3
 aiosignal==1.4.0
 antlr4-python3-runtime==4.9.3
 anyio==4.12.1
 attrs==25.4.0
 certifi==2025.11.12
 cffi==2.0.0
 charset-normalizer==3.4.4
 dataclasses-json==0.6.7
 DAWG2-Python==0.9.0
 deepgram-sdk==3.11.0
 deprecation==2.1.0
 docopt==0.6.2
 filelock==3.20.1
 frozenlist==1.8.0
 fsspec==2025.12.0
 h11==0.16.0
 httpcore==1.0.9
 httpx==0.28.1
 idna==3.11
 Jinja2==3.1.6
 MarkupSafe==3.0.3
 marshmallow==3.26.2
 mpmath==1.3.0
 multidict==6.7.0
 mypy_extensions==1.1.0
 networkx==3.6.1
 num2words==0.5.14
 numpy==2.4.0
@@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93
 nvidia-nvshmem-cu12==3.3.20
 nvidia-nvtx-cu12==12.8.90
 omegaconf==2.3.0
 packaging==25.0
 propcache==0.4.1
 pvporcupine==4.0.1
 PyAudio==0.2.14
 pycparser==2.23
@@ -46,7 +65,9 @@ torch==2.9.1
 torchaudio==2.9.1
 tqdm==4.67.1
 triton==3.5.1
 typing-inspect==0.9.0
 typing_extensions==4.15.0
 urllib3==2.6.2
 vosk==0.3.45
 websockets==15.0.1
 yarl==1.22.0
--- a/stt.py
+++ b/stt.py
@@ -1,109 +1,197 @@
 """
-Speech-to-Text module using Vosk.
+Speech-to-Text module using Deepgram API.
-Recognizes Russian speech from microphone.
+Recognizes speech from microphone using streaming WebSocket.
 Supports Russian (default) and English.
 """
-import json
+import os
 import asyncio
 import threading
 import pyaudio
-from vosk import Model, KaldiRecognizer
+import logging
-from config import VOSK_MODEL_PATH, SAMPLE_RATE
+from config import DEEPGRAM_API_KEY, SAMPLE_RATE
 from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
 )
 # Configure logging to suppress debug noise
 logging.getLogger("deepgram").setLevel(logging.WARNING)
 class SpeechRecognizer:
-    """Speech recognizer using Vosk."""
+    """Speech recognizer using Deepgram streaming."""
-    
+
    def __init__(self):
-        self.model = None
+        self.dg_client = None
        self.recognizer = None
        self.pa = None
        self.stream = None
-    
+        self.transcript = ""
        self.lock = threading.Lock()
    def initialize(self):
-        """Initialize Vosk model and audio stream."""
+        """Initialize Deepgram client and PyAudio."""
-        print("📦 Загрузка модели Vosk...")
+        if not DEEPGRAM_API_KEY:
-        self.model = Model(str(VOSK_MODEL_PATH))
+            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
-        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
+            
-        self.recognizer.SetWords(True)
+        print("📦 Инициализация Deepgram STT...")
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
        self.pa = pyaudio.PyAudio()
-        self.stream = self.pa.open(
+        print("✅ Deepgram клиент готов")
-            rate=SAMPLE_RATE,
+
    def _get_stream(self):
        """Return the microphone input stream, opening it on first use.

        The stream is created through ``self.pa.open`` only when
        ``self.stream`` is ``None`` and is then cached on the instance, so
        subsequent calls return the same stream object.

        Returns:
            The stream stored in ``self.stream``.
        """
        if self.stream is None:
            # Mono, 16-bit samples at SAMPLE_RATE, buffered 4096 frames at a time.
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream
    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Async loop to send audio and wait for results."""
        self.transcript = ""
        loop = asyncio.get_running_loop()
        stream = self._get_stream()
        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()
        # We need access to the outer 'self' (SpeechRecognizer instance)
        speech_recognizer_self = self
        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                with speech_recognizer_self.lock:
                    speech_recognizer_self.transcript = sentence
        def on_speech_started(unused_self, speech_started, **kwargs):
            loop.call_soon_threadsafe(speech_started_event.set)
        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)
        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)
        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)
        # Start connection (Synchronous call, NO await)
        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
-            format=pyaudio.paInt16,
+            sample_rate=SAMPLE_RATE,
-            input=True,
+            interim_results=True,
-            frames_per_buffer=4096
+            utterance_end_ms="1200", 
            vad_events=True,
        )
-        print("✅ Модель Vosk загружена")
+
-    
+        if dg_connection.start(options) is False:
-    def listen(self, timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
+            print("Failed to start Deepgram connection")
            return
        # Audio sending loop
        async def send_audio():
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        # Send is synchronous in Sync client, NO await
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            print(f".", end="", flush=True)
                    # Yield to allow event loop to process events (timeouts etc)
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")
        sender_task = asyncio.create_task(send_audio())
        try:
            # 1. Wait for speech to start (detection_timeout)
            if detection_timeout:
                try:
                    await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
                except asyncio.TimeoutError:
                    # print("Detection timeout - no speech")
                    stop_event.set()
            # 2. If started (or no detection timeout), wait for completion
            if not stop_event.is_set():
                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
        except asyncio.TimeoutError:
            # print("Global timeout")
            pass
        stop_event.set()
        await sender_task
        # Finish is synchronous
        dg_connection.finish()
        return self.transcript
    def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
        """
        Listen to microphone and transcribe speech.
        Args:
            timeout_seconds: Maximum time to listen for speech
            detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds.
            lang: Language code used for recognition (default "ru").
        Returns:
            Transcribed text from speech
        """
-        if not self.model:
+        if not self.dg_client:
            self.initialize()
        print("🎙️ Слушаю... (говорите)")
        # Reset recognizer for new recognition
        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
        frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096)
        detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read
        silence_frames = 0
        max_silence_frames = 10  # About 2.5 seconds of silence
        speech_started = False
        for i in range(frames_to_read):
            data = self.stream.read(4096, exception_on_overflow=False)
-            if self.recognizer.AcceptWaveform(data):
+        self.current_lang = lang
-                result = json.loads(self.recognizer.Result())
+        print(f"🎙️ Слушаю ({lang})...")
-                text = result.get("text", "").strip()
+
-                if text:
+        # Create a new connection for each listen session
-                    print(f"📝 Распознано: {text}")
+        dg_connection = self.dg_client.listen.live.v("1")
-                    return text
+
-                silence_frames += 1
+        try:
            transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                # Check partial result
                partial = json.loads(self.recognizer.PartialResult())
                if partial.get("partial", ""):
                    silence_frames = 0
                    speech_started = True
                else:
                    silence_frames += 1
            # Check detection timeout
            if not speech_started and i > detection_frames:
                break
            # Stop if too much silence after speech
            if silence_frames > max_silence_frames:
                break
        # Get final result
        result = json.loads(self.recognizer.FinalResult())
        text = result.get("text", "").strip()
        if text:
            print(f"📝 Распознано: {text}")
        else:
            # Only print if we weren't just checking for presence of speech
            if not detection_timeout or speech_started:
                print("⚠️ Речь не распознана")
-        
+                
-        return text
+            return final_text
-    
+            
        except Exception as e:
            print(f"❌ Ошибка STT: {e}")
            return ""
    def cleanup(self):
        """Release resources."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
    return _recognizer
-def listen(timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
+def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to microphone and return transcribed text."""
-    return get_recognizer().listen(timeout_seconds, detection_timeout)
+    return get_recognizer().listen(timeout_seconds, detection_timeout, lang)
 def cleanup():
@@ -130,4 +218,4 @@ def cleanup():
    global _recognizer
    if _recognizer:
        _recognizer.cleanup()
-        _recognizer = None
+        _recognizer = None
--- a/tts.py
+++ b/tts.py
@@ -1,6 +1,6 @@
 """
 Text-to-Speech module using Silero TTS.
-Generates natural Russian speech with Xenia voice.
+Generates natural Russian speech.
 Supports interruption via wake word detection using threading.
 """
@@ -140,10 +140,6 @@ class TextToSpeech:
            except Exception as e:
                print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
                success = False
                # Continue with next chunk? or break?
                # Usually if one fails, we might want to try others, but for "too long" error
                # splitting should solve it. If it fails for other reasons, maybe better to stop.
                # But let's keep trying subsequent chunks in case it's a specific symbol issue.
        if success and not self._interrupted:
            print("✅ Воспроизведение завершено")
--- a/wakeword.py
+++ b/wakeword.py
@@ -40,6 +40,24 @@ class WakeWordDetector:
        """
        if not self.porcupine:
            self.initialize()
        # Ensure stream is open and active
        if self.audio_stream is None or not self.audio_stream.is_active():
             # The stream is missing or no longer active. A closed PyAudio
             # stream cannot be restarted, so release the old handle (if any)
             # and open a new one.
             if self.audio_stream:
                 try:
                     self.audio_stream.close()
                 except: pass
             self.audio_stream = self.pa.open(
                rate=self.porcupine.sample_rate,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=self.porcupine.frame_length
             )
        while True:
            pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
@@ -48,6 +66,9 @@ class WakeWordDetector:
            keyword_index = self.porcupine.process(pcm)
            if keyword_index >= 0:
                print("✅ Wake word обнаружен!")
                # Stop and CLOSE stream to release mic for STT
                self.audio_stream.stop_stream()
                self.audio_stream.close()
                return True
    def check_wakeword_once(self) -> bool:
@@ -59,6 +80,21 @@ class WakeWordDetector:
            self.initialize()
        try:
            # Ensure stream is open/active
            if self.audio_stream is None or not self.audio_stream.is_active():
                # Re-open if needed (similar to wait_for_wakeword logic)
                 if self.audio_stream:
                     try:
                         self.audio_stream.close()
                     except: pass
                 self.audio_stream = self.pa.open(
                    rate=self.porcupine.sample_rate,
                    channels=1,
                    format=pyaudio.paInt16,
                    input=True,
                    frames_per_buffer=self.porcupine.frame_length
                 )
            pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
            pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)