"""
Speech-to-Text module using Deepgram API.

Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""

import asyncio
import logging
import os
import threading
from typing import Optional

import pyaudio
from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
)

from config import DEEPGRAM_API_KEY, SAMPLE_RATE

# Configure logging to suppress debug noise
logging.getLogger("deepgram").setLevel(logging.WARNING)

# Frames per microphone read; also used as the PyAudio buffer size.
_CHUNK_FRAMES = 4096


class SpeechRecognizer:
    """Speech recognizer using Deepgram streaming.

    The Deepgram client and the PyAudio instance are created lazily by
    ``initialize()``; the microphone stream is opened on first use and reused
    between ``listen()`` calls until ``cleanup()`` is called.
    """

    def __init__(self):
        self.dg_client = None
        self.pa = None
        self.stream = None
        self.transcript = ""
        # Language passed to Deepgram; listen() overwrites it on every call.
        # Initialised here so _process_audio never hits a missing attribute.
        self.current_lang = "ru"
        # Guards self.transcript, which is written from the transcript handler.
        self.lock = threading.Lock()

    def initialize(self):
        """Initialize Deepgram client and PyAudio.

        Raises:
            ValueError: if ``DEEPGRAM_API_KEY`` is empty or unset.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")

        print("📦 Инициализация Deepgram STT...")

        client_options = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, client_options)
        # Reuse an existing PyAudio instance so a repeated initialize() does
        # not leak the previous one.
        if self.pa is None:
            self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Open the microphone input stream if it is not open yet and return it."""
        if self.pa is None:
            self.pa = pyaudio.PyAudio()
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=_CHUNK_FRAMES,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Stream microphone audio to Deepgram and collect the final transcript.

        Args:
            dg_connection: live-transcription connection created by the caller;
                it is started here and always finished before returning once
                it has been started.
            timeout_seconds: maximum time to wait for the session to end
                (utterance end or error) after the detection phase.
            detection_timeout: if truthy, maximum time to wait for the
                "speech started" event; when it expires the session stops.

        Returns:
            The final transcript segments joined with spaces, or an empty
            string if nothing was recognized or the connection failed to start.
        """
        with self.lock:
            self.transcript = ""
        transcript_parts = []
        loop = asyncio.get_running_loop()
        stream = self._get_stream()

        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()

        # The handlers receive the connection object as their first argument,
        # so keep an explicit reference to this recognizer for them.
        recognizer = self

        # NOTE: handler keyword names (result, speech_started, utterance_end,
        # error) are kept exactly as in the original registration.
        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if not sentence or not result.is_final:
                return
            with recognizer.lock:
                transcript_parts.append(sentence)
                recognizer.transcript = " ".join(transcript_parts).strip()

        # Events are set through call_soon_threadsafe because the handlers are
        # presumably invoked from the SDK's own threads, not the event loop.
        def on_speech_started(unused_self, speech_started, **kwargs):
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        # Start connection (Synchronous call, NO await)
        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms=1200,
            vad_events=True,
        )

        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            return ""

        def read_chunk():
            return stream.read(_CHUNK_FRAMES, exception_on_overflow=False)

        # Audio sending loop
        async def send_audio():
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")

                while not stop_event.is_set():
                    if stream.is_active():
                        # The read blocks until a full chunk is captured; run
                        # it in a worker thread so timeouts and handler
                        # callbacks are processed while we wait.
                        data = await loop.run_in_executor(None, read_chunk)
                        # Send is synchronous in Sync client, NO await
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            print(".", end="", flush=True)

                    # Yield so the loop is never starved when the stream is
                    # inactive and no read is awaited.
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())

        try:
            # 1. Wait for speech to start (detection_timeout). Also watch
            #    stop_event so an error during this phase ends the session
            #    immediately instead of after the full detection timeout.
            if detection_timeout:
                waiters = [
                    asyncio.create_task(speech_started_event.wait()),
                    asyncio.create_task(stop_event.wait()),
                ]
                try:
                    done, _ = await asyncio.wait(
                        waiters,
                        timeout=detection_timeout,
                        return_when=asyncio.FIRST_COMPLETED,
                    )
                finally:
                    for waiter in waiters:
                        waiter.cancel()
                if not done:
                    # Detection timeout - no speech
                    stop_event.set()

            # 2. If started (or no detection timeout), wait for completion
            if not stop_event.is_set():
                try:
                    await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
                except asyncio.TimeoutError:
                    # Global timeout: return whatever has been recognized.
                    pass
        finally:
            # Always stop the sender and close the connection, even when an
            # unexpected exception is propagating.
            stop_event.set()
            try:
                await sender_task
            finally:
                # Finish is synchronous
                dg_connection.finish()

        with self.lock:
            return self.transcript

    def listen(
        self,
        timeout_seconds: float = 7.0,
        detection_timeout: Optional[float] = None,
        lang: str = "ru",
    ) -> str:
        """Listen to microphone and transcribe speech.

        Args:
            timeout_seconds: maximum time to wait for the utterance to finish.
            detection_timeout: optional maximum time to wait for speech to
                start; ``None`` disables the detection phase.
            lang: language code passed to Deepgram.

        Returns:
            The recognized text, or an empty string if nothing was recognized
            or the session failed.

        Raises:
            ValueError: from ``initialize()`` when the API key is missing.
        """
        if not self.dg_client:
            self.initialize()

        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")

        # Create a new connection for each listen session
        dg_connection = self.dg_client.listen.live.v("1")

        try:
            transcript = asyncio.run(
                self._process_audio(dg_connection, timeout_seconds, detection_timeout)
            )
        except Exception as e:
            print(f"❌ Ошибка STT: {e}")
            return ""

        final_text = transcript.strip() if transcript else ""
        if final_text:
            print(f"📝 Распознано: {final_text}")
        else:
            print("⚠️ Речь не распознана")
        return final_text

    def cleanup(self):
        """Release resources.

        Closes the microphone stream, terminates PyAudio and drops the
        Deepgram client so that a later ``listen()`` re-initializes everything
        instead of using a terminated PyAudio instance.
        """
        stream, self.stream = self.stream, None
        pa, self.pa = self.pa, None
        self.dg_client = None

        try:
            if stream:
                try:
                    stream.stop_stream()
                finally:
                    stream.close()
        finally:
            # Terminate PyAudio even if stopping/closing the stream failed.
            if pa:
                pa.terminate()


# Global instance
_recognizer = None


def get_recognizer() -> SpeechRecognizer:
    """Get or create speech recognizer instance."""
    global _recognizer
    if _recognizer is None:
        _recognizer = SpeechRecognizer()
    return _recognizer


def listen(
    timeout_seconds: float = 7.0,
    detection_timeout: Optional[float] = None,
    lang: str = "ru",
) -> str:
    """Listen to microphone and return transcribed text."""
    return get_recognizer().listen(timeout_seconds, detection_timeout, lang)


def cleanup():
    """Cleanup recognizer resources."""
    global _recognizer
    if _recognizer:
        _recognizer.cleanup()
        _recognizer = None