Files
smart-speaker/stt.py

221 lines
7.4 KiB
Python

"""
Speech-to-Text module using Deepgram API.
Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""
import asyncio
import logging
import os
import threading
from typing import Optional

import pyaudio

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
)

from config import DEEPGRAM_API_KEY, SAMPLE_RATE
# Suppress the Deepgram SDK's debug/info chatter; warnings and errors still show.
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
    """Speech recognizer using Deepgram live (WebSocket) streaming.

    A single PyAudio input stream is opened lazily and reused across
    sessions; a fresh Deepgram live connection is created for every
    listen() call. The final transcript is accumulated from Deepgram's
    ``is_final`` segments, which arrive on the SDK's callback thread
    (hence the lock around ``self.transcript``).
    """

    def __init__(self):
        self.dg_client = None         # DeepgramClient; created lazily by initialize()
        self.pa = None                # pyaudio.PyAudio; created lazily by initialize()
        self.stream = None            # reusable microphone input stream
        self.transcript = ""          # final transcript of the current session
        self.current_lang = "ru"      # session language (fix: was unset until listen())
        self.lock = threading.Lock()  # guards self.transcript (written from SDK callback thread)

    def initialize(self):
        """Create the Deepgram client and the PyAudio instance.

        Raises:
            ValueError: if DEEPGRAM_API_KEY is not configured.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
        print("📦 Инициализация Deepgram STT...")
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
        self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Open the microphone input stream on first use and return it."""
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Stream microphone audio to Deepgram and collect the transcript.

        Args:
            dg_connection: a not-yet-started Deepgram live connection.
            timeout_seconds: overall cap on listening once speech may have started.
            detection_timeout: if set, max time to wait for speech to begin.

        Returns:
            The accumulated transcript ("" if nothing was recognized or
            the connection failed to start).
        """
        self.transcript = ""
        loop = asyncio.get_running_loop()
        stream = self._get_stream()
        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()
        # SDK callbacks receive the connection object as their first argument;
        # capture the recognizer explicitly so callbacks can update its state.
        speech_recognizer_self = self

        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if not sentence:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                with speech_recognizer_self.lock:
                    # FIX: accumulate final segments instead of overwriting, so
                    # long utterances that produce several is_final results keep
                    # their earlier parts.
                    if speech_recognizer_self.transcript:
                        speech_recognizer_self.transcript += " " + sentence
                    else:
                        speech_recognizer_self.transcript = sentence

        def on_speech_started(unused_self, speech_started, **kwargs):
            # Callbacks run on an SDK thread; hand the set() to the loop thread.
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms="1200",
            vad_events=True,
        )
        # start() is synchronous in the sync live client — no await.
        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            return ""

        async def send_audio():
            """Pump raw PCM chunks from the mic to Deepgram until stopped."""
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        # send() is synchronous in the sync client — no await.
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            print(f".", end="", flush=True)
                    # Yield so the event loop can process events/timeouts.
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())
        try:
            # 1. Optionally wait for speech to start (detection_timeout).
            if detection_timeout:
                try:
                    await asyncio.wait_for(speech_started_event.wait(),
                                           timeout=detection_timeout)
                except asyncio.TimeoutError:
                    # No speech detected within the window — end the session.
                    stop_event.set()
            # 2. Wait for the utterance to complete (or the global timeout).
            if not stop_event.is_set():
                try:
                    await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
                except asyncio.TimeoutError:
                    pass
        finally:
            # FIX: always stop the sender and close the websocket, even if an
            # unexpected exception escapes the waits above.
            stop_event.set()
            await sender_task
            # finish() is synchronous in the sync client.
            dg_connection.finish()
        return self.transcript

    def listen(self, timeout_seconds: float = 7.0,
               detection_timeout: Optional[float] = None, lang: str = "ru") -> str:
        """Listen to the microphone and transcribe one utterance.

        Args:
            timeout_seconds: overall listening cap once the session starts.
            detection_timeout: optional cap on waiting for speech to begin.
            lang: Deepgram language code, e.g. "ru" or "en".

        Returns:
            The recognized text, or "" if nothing was recognized or an
            error occurred.
        """
        if not self.dg_client:
            self.initialize()
        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")
        # Create a new connection for each listen session.
        dg_connection = self.dg_client.listen.live.v("1")
        try:
            transcript = asyncio.run(
                self._process_audio(dg_connection, timeout_seconds, detection_timeout)
            )
            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                print("⚠️ Речь не распознана")
            return final_text
        except Exception as e:
            # Boundary handler: an STT failure must not crash the caller's loop.
            print(f"❌ Ошибка STT: {e}")
            return ""

    def cleanup(self):
        """Release audio resources. Safe to call more than once."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
            # FIX: drop the reference so a repeated cleanup() does not call
            # terminate() on an already-terminated PyAudio instance.
            self.pa = None
# Module-wide singleton; created on first use by get_recognizer().
_recognizer = None


def get_recognizer() -> SpeechRecognizer:
    """Return the shared SpeechRecognizer, creating it on the first call."""
    global _recognizer
    if _recognizer is not None:
        return _recognizer
    _recognizer = SpeechRecognizer()
    return _recognizer
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to microphone and return transcribed text.

    Thin module-level convenience wrapper over the shared recognizer.
    """
    recognizer = get_recognizer()
    return recognizer.listen(timeout_seconds, detection_timeout, lang)
def cleanup():
    """Tear down the module-level recognizer, if one was ever created."""
    global _recognizer
    recognizer, _recognizer = _recognizer, None
    if recognizer:
        recognizer.cleanup()