"""
|
|
Speech-to-Text module using Deepgram API.
|
|
Recognizes speech from microphone using streaming WebSocket.
|
|
Supports Russian (default) and English.
|
|
"""
|
|
import os
|
|
import asyncio
|
|
import threading
|
|
import pyaudio
|
|
import logging
|
|
from config import DEEPGRAM_API_KEY, SAMPLE_RATE
|
|
from deepgram import (
|
|
DeepgramClient,
|
|
DeepgramClientOptions,
|
|
LiveTranscriptionEvents,
|
|
LiveOptions,
|
|
Microphone,
|
|
)
|
|
|
|
# Quiet the Deepgram SDK logger so per-chunk debug output is suppressed.
_deepgram_logger = logging.getLogger("deepgram")
_deepgram_logger.setLevel(logging.WARNING)
|
|
|
|
class SpeechRecognizer:
    """Speech recognizer using Deepgram live (streaming) transcription.

    Wraps a Deepgram WebSocket connection plus a PyAudio microphone stream.
    Each listen() call opens a fresh live connection and drives a private
    asyncio loop (via asyncio.run) until the utterance ends or a timeout
    fires.
    """

    def __init__(self):
        # All handles are created lazily in initialize() / _get_stream().
        self.dg_client = None
        self.pa = None
        self.stream = None
        # Most recent *final* transcript pushed by the Deepgram callback.
        self.transcript = ""
        # Guards self.transcript against concurrent access from SDK
        # callbacks (the code marshals other events into the loop via
        # loop.call_soon_threadsafe, suggesting callbacks may arrive on
        # another thread — TODO confirm against the SDK version in use).
        self.lock = threading.Lock()

    def initialize(self):
        """Initialize Deepgram client and PyAudio.

        Raises:
            ValueError: if DEEPGRAM_API_KEY is not configured.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")

        print("📦 Инициализация Deepgram STT...")
        # Keep the SDK's own logging at WARNING so debug output does not
        # flood the console.
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)

        self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Return the microphone input stream, opening it on first use.

        The stream is mono 16-bit PCM at SAMPLE_RATE, matching the
        LiveOptions (encoding="linear16", channels=1) sent to Deepgram.
        """
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Send microphone audio to Deepgram and wait for a transcript.

        Args:
            dg_connection: freshly created Deepgram live connection
                (listen() builds one per session).
            timeout_seconds: overall cap on waiting for the utterance to
                complete once listening has started.
            detection_timeout: optional cap on waiting for the first
                SpeechStarted event; falsy values skip that phase.

        Returns:
            The last final transcript received (may be ""), or None if the
            connection fails to start.

        Expects self.current_lang to have been set by listen() before this
        coroutine runs.
        """
        self.transcript = ""

        loop = asyncio.get_running_loop()
        stream = self._get_stream()

        # stop_event ends the session; speech_started_event gates phase 1.
        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()

        # We need access to the outer 'self' (SpeechRecognizer instance);
        # the SDK passes its own object as the first callback argument.
        speech_recognizer_self = self

        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                # NOTE(review): each final segment *overwrites* the stored
                # transcript rather than being appended, so an utterance
                # split into several finals keeps only the last one —
                # confirm this is intended.
                with speech_recognizer_self.lock:
                    speech_recognizer_self.transcript = sentence

        def on_speech_started(unused_self, speech_started, **kwargs):
            # Marshal the signal into the asyncio loop thread-safely.
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        # Start connection (Synchronous call, NO await)
        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",  # must match the paInt16 PyAudio stream
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms="1200",  # silence gap that triggers UtteranceEnd
            vad_events=True,  # required for the SpeechStarted event
        )

        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            return

        # Audio sending loop
        async def send_audio():
            # Pumps raw PCM chunks from the mic to Deepgram until stopped.
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if stream.is_active():
                        # NOTE(review): stream.read() is a blocking call
                        # inside the coroutine; it can stall the event loop
                        # for the duration of a 4096-frame read — consider
                        # loop.run_in_executor if latency matters.
                        data = stream.read(4096, exception_on_overflow=False)
                        # Send is synchronous in Sync client, NO await
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            # Lightweight console progress indicator.
                            print(f".", end="", flush=True)
                    # Yield to allow event loop to process events (timeouts etc)
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())

        try:
            # 1. Wait for speech to start (detection_timeout)
            if detection_timeout:
                try:
                    await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
                except asyncio.TimeoutError:
                    # No speech within the detection window: abort session.
                    # print("Detection timeout - no speech")
                    stop_event.set()

            # 2. If started (or no detection timeout), wait for completion
            if not stop_event.is_set():
                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)

        except asyncio.TimeoutError:
            # Hard cap reached; fall through and return what we have so far.
            # print("Global timeout")
            pass

        stop_event.set()
        await sender_task
        # Finish is synchronous
        dg_connection.finish()

        return self.transcript

    def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
        """
        Listen to microphone and transcribe speech.

        Args:
            timeout_seconds: maximum time to wait for the utterance to end.
            detection_timeout: optional maximum time to wait for speech to
                start; None disables the detection phase.
            lang: language code passed to Deepgram (e.g. "ru", "en").

        Returns:
            The recognized text, stripped, or "" on silence or error.
        """
        if not self.dg_client:
            self.initialize()

        # _process_audio reads current_lang when building LiveOptions.
        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")

        # Create a new connection for each listen session
        dg_connection = self.dg_client.listen.live.v("1")

        try:
            transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))

            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                print("⚠️ Речь не распознана")

            return final_text

        except Exception as e:
            # Broad catch: any session failure is reported and converted
            # into an empty result instead of propagating to the caller.
            print(f"❌ Ошибка STT: {e}")
            return ""

    def cleanup(self):
        """Release resources."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            # NOTE(review): self.pa is terminated but not reset to None, so
            # a later listen() on this same instance would reuse a
            # terminated PyAudio handle — confirm cleanup is meant to be
            # final for the instance.
            self.pa.terminate()
|
|
|
|
|
|
# Lazily-created module-level singleton.
_recognizer = None


def get_recognizer() -> SpeechRecognizer:
    """Return the shared SpeechRecognizer, creating it on first use."""
    global _recognizer
    instance = _recognizer
    if instance is None:
        instance = SpeechRecognizer()
        _recognizer = instance
    return instance
|
|
|
|
|
|
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to microphone and return transcribed text."""
    recognizer = get_recognizer()
    return recognizer.listen(
        timeout_seconds,
        detection_timeout,
        lang,
    )
|
|
|
|
|
|
def cleanup():
    """Cleanup recognizer resources."""
    global _recognizer
    # Equivalent to the truthiness check: SpeechRecognizer instances are
    # always truthy, so only the None sentinel is excluded.
    if _recognizer is not None:
        _recognizer.cleanup()
        _recognizer = None