Files
smart-speaker/stt.py

221 lines
7.4 KiB
Python

"""
Speech-to-Text module using Deepgram API.
Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""
import asyncio
import logging
import os
import threading
from typing import Optional

import pyaudio

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
)

from config import DEEPGRAM_API_KEY, SAMPLE_RATE
# Suppress the Deepgram SDK's debug/info chatter; warnings and errors still show.
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
    """Speech recognizer using Deepgram live (WebSocket) streaming.

    A single PyAudio input stream is opened lazily and reused across
    sessions; a fresh Deepgram live connection is created for every
    listen() call. The final transcript is accumulated from Deepgram's
    ``is_final`` segments, which arrive on the SDK's callback thread
    (hence the lock around ``self.transcript``).
    """

    def __init__(self):
        self.dg_client = None         # DeepgramClient; created lazily by initialize()
        self.pa = None                # pyaudio.PyAudio; created lazily by initialize()
        self.stream = None            # reusable microphone input stream
        self.transcript = ""          # final transcript of the current session
        self.current_lang = "ru"      # session language (fix: was unset until listen())
        self.lock = threading.Lock()  # guards self.transcript (written from SDK callback thread)

    def initialize(self):
        """Create the Deepgram client and the PyAudio instance.

        Raises:
            ValueError: if DEEPGRAM_API_KEY is not configured.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
        print("📦 Инициализация Deepgram STT...")
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
        self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Open the microphone input stream on first use and return it."""
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Stream microphone audio to Deepgram and collect the transcript.

        Args:
            dg_connection: a not-yet-started Deepgram live connection.
            timeout_seconds: overall cap on listening once speech may have started.
            detection_timeout: if set, max time to wait for speech to begin.

        Returns:
            The accumulated transcript ("" if nothing was recognized or
            the connection failed to start).
        """
        self.transcript = ""
        loop = asyncio.get_running_loop()
        stream = self._get_stream()
        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()
        # SDK callbacks receive the connection object as their first argument;
        # capture the recognizer explicitly so callbacks can update its state.
        speech_recognizer_self = self

        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if not sentence:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                with speech_recognizer_self.lock:
                    # FIX: accumulate final segments instead of overwriting, so
                    # long utterances that produce several is_final results keep
                    # their earlier parts.
                    if speech_recognizer_self.transcript:
                        speech_recognizer_self.transcript += " " + sentence
                    else:
                        speech_recognizer_self.transcript = sentence

        def on_speech_started(unused_self, speech_started, **kwargs):
            # Callbacks run on an SDK thread; hand the set() to the loop thread.
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms="1200",
            vad_events=True,
        )
        # start() is synchronous in the sync live client — no await.
        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            return ""

        async def send_audio():
            """Pump raw PCM chunks from the mic to Deepgram until stopped."""
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        # send() is synchronous in the sync client — no await.
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            print(f".", end="", flush=True)
                    # Yield so the event loop can process events/timeouts.
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())
        try:
            # 1. Optionally wait for speech to start (detection_timeout).
            if detection_timeout:
                try:
                    await asyncio.wait_for(speech_started_event.wait(),
                                           timeout=detection_timeout)
                except asyncio.TimeoutError:
                    # No speech detected within the window — end the session.
                    stop_event.set()
            # 2. Wait for the utterance to complete (or the global timeout).
            if not stop_event.is_set():
                try:
                    await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
                except asyncio.TimeoutError:
                    pass
        finally:
            # FIX: always stop the sender and close the websocket, even if an
            # unexpected exception escapes the waits above.
            stop_event.set()
            await sender_task
            # finish() is synchronous in the sync client.
            dg_connection.finish()
        return self.transcript

    def listen(self, timeout_seconds: float = 7.0,
               detection_timeout: Optional[float] = None, lang: str = "ru") -> str:
        """Listen to the microphone and transcribe one utterance.

        Args:
            timeout_seconds: overall listening cap once the session starts.
            detection_timeout: optional cap on waiting for speech to begin.
            lang: Deepgram language code, e.g. "ru" or "en".

        Returns:
            The recognized text, or "" if nothing was recognized or an
            error occurred.
        """
        if not self.dg_client:
            self.initialize()
        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")
        # Create a new connection for each listen session.
        dg_connection = self.dg_client.listen.live.v("1")
        try:
            transcript = asyncio.run(
                self._process_audio(dg_connection, timeout_seconds, detection_timeout)
            )
            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                print("⚠️ Речь не распознана")
            return final_text
        except Exception as e:
            # Boundary handler: an STT failure must not crash the caller's loop.
            print(f"❌ Ошибка STT: {e}")
            return ""

    def cleanup(self):
        """Release audio resources. Safe to call more than once."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
            # FIX: drop the reference so a repeated cleanup() does not call
            # terminate() on an already-terminated PyAudio instance.
            self.pa = None
# Module-wide singleton; created on first use by get_recognizer().
_recognizer = None


def get_recognizer() -> SpeechRecognizer:
    """Return the shared SpeechRecognizer, creating it on the first call."""
    global _recognizer
    if _recognizer is not None:
        return _recognizer
    _recognizer = SpeechRecognizer()
    return _recognizer
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to microphone and return transcribed text.

    Thin module-level convenience wrapper over the shared recognizer.
    """
    recognizer = get_recognizer()
    return recognizer.listen(timeout_seconds, detection_timeout, lang)
def cleanup():
    """Tear down the module-level recognizer, if one was ever created."""
    global _recognizer
    recognizer, _recognizer = _recognizer, None
    if recognizer:
        recognizer.cleanup()