diff --git a/config.py b/config.py index 1fc3b80..8c2c978 100644 --- a/config.py +++ b/config.py @@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY") PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat") PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions" +# Deepgram configuration +DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") + # Porcupine configuration PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY") PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn" diff --git a/requirements.txt b/requirements.txt index 83af425..9283cff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,32 @@ +aenum==3.1.16 +aiofiles==25.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiosignal==1.4.0 antlr4-python3-runtime==4.9.3 +anyio==4.12.1 +attrs==25.4.0 certifi==2025.11.12 cffi==2.0.0 charset-normalizer==3.4.4 +dataclasses-json==0.6.7 DAWG2-Python==0.9.0 +deepgram-sdk==3.11.0 +deprecation==2.1.0 docopt==0.6.2 filelock==3.20.1 +frozenlist==1.8.0 fsspec==2025.12.0 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 idna==3.11 Jinja2==3.1.6 MarkupSafe==3.0.3 +marshmallow==3.26.2 mpmath==1.3.0 +multidict==6.7.0 +mypy_extensions==1.1.0 networkx==3.6.1 num2words==0.5.14 numpy==2.4.0 @@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93 nvidia-nvshmem-cu12==3.3.20 nvidia-nvtx-cu12==12.8.90 omegaconf==2.3.0 +packaging==25.0 +propcache==0.4.1 pvporcupine==4.0.1 PyAudio==0.2.14 pycparser==2.23 @@ -46,7 +65,9 @@ torch==2.9.1 torchaudio==2.9.1 tqdm==4.67.1 triton==3.5.1 +typing-inspect==0.9.0 typing_extensions==4.15.0 urllib3==2.6.2 vosk==0.3.45 websockets==15.0.1 +yarl==1.22.0 diff --git a/stt.py b/stt.py index 554a490..b4a632a 100644 --- a/stt.py +++ b/stt.py @@ -1,109 +1,197 @@ """ -Speech-to-Text module using Vosk. -Recognizes Russian speech from microphone. +Speech-to-Text module using Deepgram API. +Recognizes speech from microphone using streaming WebSocket. +Supports Russian (default) and English. """ -import json +import os +import asyncio +import threading import pyaudio -from vosk import Model, KaldiRecognizer -from config import VOSK_MODEL_PATH, SAMPLE_RATE +import logging +from config import DEEPGRAM_API_KEY, SAMPLE_RATE +from deepgram import ( + DeepgramClient, + DeepgramClientOptions, + LiveTranscriptionEvents, + LiveOptions, + Microphone, +) +# Configure logging to suppress debug noise +logging.getLogger("deepgram").setLevel(logging.WARNING) class SpeechRecognizer: - """Speech recognizer using Vosk.""" - + """Speech recognizer using Deepgram streaming.""" + def __init__(self): - self.model = None - self.recognizer = None + self.dg_client = None self.pa = None self.stream = None - + self.transcript = "" + self.lock = threading.Lock() + def initialize(self): - """Initialize Vosk model and audio stream.""" - print("📦 Загрузка модели Vosk...") - self.model = Model(str(VOSK_MODEL_PATH)) - self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE) - self.recognizer.SetWords(True) + """Initialize Deepgram client and PyAudio.""" + if not DEEPGRAM_API_KEY: + raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.") + + print("📦 Инициализация Deepgram STT...") + config = DeepgramClientOptions( + verbose=logging.WARNING, + ) + self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config) self.pa = pyaudio.PyAudio() - self.stream = self.pa.open( - rate=SAMPLE_RATE, + print("✅ Deepgram клиент готов") + + def _get_stream(self): + """Open audio stream if not open.""" + if self.stream is None: + self.stream = self.pa.open( + rate=SAMPLE_RATE, + channels=1, + format=pyaudio.paInt16, + input=True, + frames_per_buffer=4096, + ) + return self.stream + + async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout): + """Async loop to send audio and wait for results.""" + self.transcript = "" + + loop = asyncio.get_running_loop() + stream = self._get_stream() + + stop_event = asyncio.Event() + speech_started_event = asyncio.Event() + + # We need access to the outer 'self' (SpeechRecognizer instance) + speech_recognizer_self = self + + def on_transcript(unused_self, result, **kwargs): + sentence = result.channel.alternatives[0].transcript + if len(sentence) == 0: + return + if result.is_final: + print(f"📝 Частичный результат: {sentence}") + with speech_recognizer_self.lock: + speech_recognizer_self.transcript = sentence + + def on_speech_started(unused_self, speech_started, **kwargs): + loop.call_soon_threadsafe(speech_started_event.set) + + def on_utterance_end(unused_self, utterance_end, **kwargs): + loop.call_soon_threadsafe(stop_event.set) + + def on_error(unused_self, error, **kwargs): + print(f"Error: {error}") + loop.call_soon_threadsafe(stop_event.set) + + dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript) + dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started) + dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end) + dg_connection.on(LiveTranscriptionEvents.Error, on_error) + + # Start connection (Synchronous call, NO await) + options = LiveOptions( + model="nova-2", + language=self.current_lang, + smart_format=True, + encoding="linear16", channels=1, - format=pyaudio.paInt16, - input=True, - frames_per_buffer=4096 + sample_rate=SAMPLE_RATE, + interim_results=True, + utterance_end_ms="1200", + vad_events=True, ) - print("✅ Модель Vosk загружена") - - def listen(self, timeout_seconds: float = 5.0, detection_timeout: float = None) -> str: + + if dg_connection.start(options) is False: + print("Failed to start Deepgram connection") + return + + # Audio sending loop + async def send_audio(): + chunks_sent = 0 + try: + stream.start_stream() + print("🎤 Stream started, sending audio...") + while not stop_event.is_set(): + if stream.is_active(): + data = stream.read(4096, exception_on_overflow=False) + # Send is synchronous in Sync client, NO await + dg_connection.send(data) + chunks_sent += 1 + if chunks_sent % 50 == 0: + print(f".", end="", flush=True) + # Yield to allow event loop to process events (timeouts etc) + await asyncio.sleep(0.005) + except Exception as e: + print(f"Audio send error: {e}") + finally: + stream.stop_stream() + print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}") + + sender_task = asyncio.create_task(send_audio()) + + try: + # 1. Wait for speech to start (detection_timeout) + if detection_timeout: + try: + await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout) + except asyncio.TimeoutError: + # print("Detection timeout - no speech") + stop_event.set() + + # 2. If started (or no detection timeout), wait for completion + if not stop_event.is_set(): + await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds) + + except asyncio.TimeoutError: + # print("Global timeout") + pass + + stop_event.set() + await sender_task + # Finish is synchronous + dg_connection.finish() + + return self.transcript + + def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str: """ Listen to microphone and transcribe speech. - - Args: - timeout_seconds: Maximum time to listen for speech - detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds. - - Returns: - Transcribed text from speech """ - if not self.model: + if not self.dg_client: self.initialize() - - print("🎙️ Слушаю... (говорите)") - - # Reset recognizer for new recognition - self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE) - - frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096) - detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read - - silence_frames = 0 - max_silence_frames = 10 # About 2.5 seconds of silence - speech_started = False - - for i in range(frames_to_read): - data = self.stream.read(4096, exception_on_overflow=False) - if self.recognizer.AcceptWaveform(data): - result = json.loads(self.recognizer.Result()) - text = result.get("text", "").strip() - if text: - print(f"📝 Распознано: {text}") - return text - silence_frames += 1 + self.current_lang = lang + print(f"🎙️ Слушаю ({lang})...") + + # Create a new connection for each listen session + dg_connection = self.dg_client.listen.live.v("1") + + try: + transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout)) + + final_text = transcript.strip() if transcript else "" + if final_text: + print(f"📝 Распознано: {final_text}") else: - # Check partial result - partial = json.loads(self.recognizer.PartialResult()) - if partial.get("partial", ""): - silence_frames = 0 - speech_started = True - else: - silence_frames += 1 - - # Check detection timeout - if not speech_started and i > detection_frames: - break - - # Stop if too much silence after speech - if silence_frames > max_silence_frames: - break - - # Get final result - result = json.loads(self.recognizer.FinalResult()) - text = result.get("text", "").strip() - - if text: - print(f"📝 Распознано: {text}") - else: - # Only print if we weren't just checking for presence of speech - if not detection_timeout or speech_started: print("⚠️ Речь не распознана") - - return text - + + return final_text + + except Exception as e: + print(f"❌ Ошибка STT: {e}") + return "" + def cleanup(self): """Release resources.""" if self.stream: + self.stream.stop_stream() self.stream.close() + self.stream = None if self.pa: self.pa.terminate() @@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer: return _recognizer -def listen(timeout_seconds: float = 5.0, detection_timeout: float = None) -> str: +def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str: """Listen to microphone and return transcribed text.""" - return get_recognizer().listen(timeout_seconds, detection_timeout) + return get_recognizer().listen(timeout_seconds, detection_timeout, lang) def cleanup(): @@ -130,4 +218,4 @@ def cleanup(): global _recognizer if _recognizer: _recognizer.cleanup() - _recognizer = None + _recognizer = None \ No newline at end of file diff --git a/tts.py b/tts.py index 7a9e97c..3e37a2e 100644 --- a/tts.py +++ b/tts.py @@ -1,6 +1,6 @@ """ Text-to-Speech module using Silero TTS. -Generates natural Russian speech with Xenia voice. +Generates natural Russian speech. Supports interruption via wake word detection using threading. """ @@ -140,10 +140,6 @@ class TextToSpeech: except Exception as e: print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}") success = False - # Continue with next chunk? or break? - # Usually if one fails, we might want to try others, but for "too long" error - # splitting should solve it. If it fails for other reasons, maybe better to stop. - # But let's keep trying subsequent chunks in case it's a specific symbol issue. if success and not self._interrupted: print("✅ Воспроизведение завершено") diff --git a/wakeword.py b/wakeword.py index 7e49ce1..1c7f8be 100644 --- a/wakeword.py +++ b/wakeword.py @@ -40,6 +40,24 @@ class WakeWordDetector: """ if not self.porcupine: self.initialize() + + # Ensure stream is open and active + if self.audio_stream is None or not self.audio_stream.is_active(): + # If closed or None, we might need to recreate it. + # PyAudio streams once closed cannot be reopened usually? + # We should probably recreate it. + if self.audio_stream: + try: + self.audio_stream.close() + except: pass + + self.audio_stream = self.pa.open( + rate=self.porcupine.sample_rate, + channels=1, + format=pyaudio.paInt16, + input=True, + frames_per_buffer=self.porcupine.frame_length + ) while True: pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False) @@ -48,6 +66,9 @@ class WakeWordDetector: keyword_index = self.porcupine.process(pcm) if keyword_index >= 0: print("✅ Wake word обнаружен!") + # Stop and CLOSE stream to release mic for STT + self.audio_stream.stop_stream() + self.audio_stream.close() return True def check_wakeword_once(self) -> bool: @@ -59,6 +80,21 @@ class WakeWordDetector: self.initialize() try: + # Ensure stream is open/active + if self.audio_stream is None or not self.audio_stream.is_active(): + # Re-open if needed (similar to wait_for_wakeword logic) + if self.audio_stream: + try: + self.audio_stream.close() + except: pass + self.audio_stream = self.pa.open( + rate=self.porcupine.sample_rate, + channels=1, + format=pyaudio.paInt16, + input=True, + frames_per_buffer=self.porcupine.frame_length + ) + pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False) pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)