Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling

This commit is contained in:
2026-01-07 17:59:18 +03:00
parent 7b79593cad
commit 53809c03f4
5 changed files with 233 additions and 89 deletions

View File

@@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
# Deepgram configuration
# API key for Deepgram streaming STT; read from the environment, no default —
# STT initialization is expected to fail fast when this is unset.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
# Porcupine configuration
PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
# Custom wake-word model file; presumably shipped next to this config module
# (BASE_DIR is defined above this hunk) — TODO confirm the file is bundled.
PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"

View File

@@ -1,15 +1,32 @@
aenum==3.1.16
aiofiles==25.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
antlr4-python3-runtime==4.9.3
anyio==4.12.1
attrs==25.4.0
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
dataclasses-json==0.6.7
DAWG2-Python==0.9.0
deepgram-sdk==3.11.0
deprecation==2.1.0
docopt==0.6.2
filelock==3.20.1
frozenlist==1.8.0
fsspec==2025.12.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11
Jinja2==3.1.6
MarkupSafe==3.0.3
marshmallow==3.26.2
mpmath==1.3.0
multidict==6.7.0
mypy_extensions==1.1.0
networkx==3.6.1
num2words==0.5.14
numpy==2.4.0
@@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
packaging==25.0
propcache==0.4.1
pvporcupine==4.0.1
PyAudio==0.2.14
pycparser==2.23
@@ -46,7 +65,9 @@ torch==2.9.1
torchaudio==2.9.1
tqdm==4.67.1
triton==3.5.1
typing-inspect==0.9.0
typing_extensions==4.15.0
urllib3==2.6.2
vosk==0.3.45
websockets==15.0.1
yarl==1.22.0

242
stt.py
View File

@@ -1,109 +1,197 @@
"""
Speech-to-Text module using Vosk.
Recognizes Russian speech from microphone.
Speech-to-Text module using Deepgram API.
Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
"""
import json
import os
import asyncio
import threading
import pyaudio
from vosk import Model, KaldiRecognizer
from config import VOSK_MODEL_PATH, SAMPLE_RATE
import logging
from config import DEEPGRAM_API_KEY, SAMPLE_RATE
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
# Configure logging to suppress debug noise
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
"""Speech recognizer using Vosk."""
"""Speech recognizer using Deepgram streaming."""
def __init__(self):
self.model = None
self.recognizer = None
self.dg_client = None
self.pa = None
self.stream = None
self.transcript = ""
self.lock = threading.Lock()
def initialize(self):
"""Initialize Vosk model and audio stream."""
print("📦 Загрузка модели Vosk...")
self.model = Model(str(VOSK_MODEL_PATH))
self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
self.recognizer.SetWords(True)
"""Initialize Deepgram client and PyAudio."""
if not DEEPGRAM_API_KEY:
raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
print("📦 Инициализация Deepgram STT...")
config = DeepgramClientOptions(
verbose=logging.WARNING,
)
self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
self.pa = pyaudio.PyAudio()
self.stream = self.pa.open(
rate=SAMPLE_RATE,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=4096
)
print("✅ Модель Vosk загружена")
print("✅ Deepgram клиент готов")
def listen(self, timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
def _get_stream(self):
    """Return the shared microphone input stream, opening it lazily."""
    stream = self.stream
    if stream is None:
        # 16-bit mono PCM at the project sample rate, matching what the
        # Deepgram LiveOptions advertise (linear16, 1 channel).
        stream = self.pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=4096,
        )
        self.stream = stream
    return stream
async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
    """Stream microphone audio to Deepgram and collect the transcript.

    Args:
        dg_connection: Deepgram live-transcription connection (sync client,
            so ``start``/``send``/``finish`` are plain calls, not awaited).
        timeout_seconds: Maximum time to wait for the utterance to complete.
        detection_timeout: Optional time to wait for speech to begin; when it
            elapses with no VAD "speech started" event, listening stops early.

    Returns:
        The accumulated transcript ("" when nothing was recognized).
    """
    self.transcript = ""
    # Collect every final segment instead of keeping only the latest one,
    # otherwise multi-sentence utterances lose all but the last sentence.
    segments = []
    loop = asyncio.get_running_loop()
    stream = self._get_stream()
    stop_event = asyncio.Event()
    speech_started_event = asyncio.Event()
    recognizer = self  # captured by the SDK callbacks below

    def on_transcript(unused_self, result, **kwargs):
        sentence = result.channel.alternatives[0].transcript
        if not sentence:
            return
        if result.is_final:
            print(f"📝 Частичный результат: {sentence}")
            with recognizer.lock:
                segments.append(sentence)
                recognizer.transcript = " ".join(segments)

    def on_speech_started(unused_self, speech_started, **kwargs):
        # SDK callbacks fire on worker threads — hop back onto the loop.
        loop.call_soon_threadsafe(speech_started_event.set)

    def on_utterance_end(unused_self, utterance_end, **kwargs):
        loop.call_soon_threadsafe(stop_event.set)

    def on_error(unused_self, error, **kwargs):
        print(f"Error: {error}")
        loop.call_soon_threadsafe(stop_event.set)

    dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
    dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
    dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
    dg_connection.on(LiveTranscriptionEvents.Error, on_error)

    options = LiveOptions(
        model="nova-2",
        language=self.current_lang,
        smart_format=True,
        encoding="linear16",
        channels=1,
        sample_rate=SAMPLE_RATE,
        interim_results=True,
        utterance_end_ms="1200",
        vad_events=True,
    )
    # start() is synchronous on the sync live client — NO await.
    if dg_connection.start(options) is False:
        print("Failed to start Deepgram connection")
        # Keep the return type consistent with the success path (str, not None).
        return self.transcript

    async def send_audio():
        """Pump raw PCM chunks from PyAudio into the Deepgram socket."""
        chunks_sent = 0
        started = False
        try:
            stream.start_stream()
            started = True
            print("🎤 Stream started, sending audio...")
            while not stop_event.is_set():
                if stream.is_active():
                    data = stream.read(4096, exception_on_overflow=False)
                    # send() is synchronous on the sync client — NO await.
                    dg_connection.send(data)
                    chunks_sent += 1
                    if chunks_sent % 50 == 0:
                        print(".", end="", flush=True)
                # Yield so the event loop can process timeouts and callbacks.
                await asyncio.sleep(0.005)
        except Exception as e:
            print(f"Audio send error: {e}")
        finally:
            # Only stop a stream that actually started; stop_stream() on a
            # never-started stream would raise from this cleanup path.
            if started:
                stream.stop_stream()
            print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

    sender_task = asyncio.create_task(send_audio())
    try:
        # 1. Wait for speech to start (detection_timeout), if requested.
        if detection_timeout:
            try:
                await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
            except asyncio.TimeoutError:
                # No speech within the detection window — abort quietly.
                stop_event.set()
        # 2. If started (or no detection timeout), wait for completion.
        if not stop_event.is_set():
            await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
    except asyncio.TimeoutError:
        # Global timeout — fall through and return whatever was recognized.
        pass
    stop_event.set()
    await sender_task
    # finish() is synchronous on the sync client.
    dg_connection.finish()
    return self.transcript
def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
"""
Listen to microphone and transcribe speech.
Args:
timeout_seconds: Maximum time to listen for speech
detection_timeout: Time to wait for speech to start. If None, uses timeout_seconds.
Returns:
Transcribed text from speech
"""
if not self.model:
if not self.dg_client:
self.initialize()
print("🎙️ Слушаю... (говорите)")
self.current_lang = lang
print(f"🎙️ Слушаю ({lang})...")
# Reset recognizer for new recognition
self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
# Create a new connection for each listen session
dg_connection = self.dg_client.listen.live.v("1")
frames_to_read = int(SAMPLE_RATE * timeout_seconds / 4096)
detection_frames = int(SAMPLE_RATE * detection_timeout / 4096) if detection_timeout else frames_to_read
try:
transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
silence_frames = 0
max_silence_frames = 10 # About 2.5 seconds of silence
speech_started = False
for i in range(frames_to_read):
data = self.stream.read(4096, exception_on_overflow=False)
if self.recognizer.AcceptWaveform(data):
result = json.loads(self.recognizer.Result())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
return text
silence_frames += 1
final_text = transcript.strip() if transcript else ""
if final_text:
print(f"📝 Распознано: {final_text}")
else:
# Check partial result
partial = json.loads(self.recognizer.PartialResult())
if partial.get("partial", ""):
silence_frames = 0
speech_started = True
else:
silence_frames += 1
# Check detection timeout
if not speech_started and i > detection_frames:
break
# Stop if too much silence after speech
if silence_frames > max_silence_frames:
break
# Get final result
result = json.loads(self.recognizer.FinalResult())
text = result.get("text", "").strip()
if text:
print(f"📝 Распознано: {text}")
else:
# Only print if we weren't just checking for presence of speech
if not detection_timeout or speech_started:
print("⚠️ Речь не распознана")
return text
return final_text
except Exception as e:
print(f"❌ Ошибка STT: {e}")
return ""
def cleanup(self):
    """Release microphone and PyAudio resources.

    Idempotent: both handles are dropped after release so a second call
    (e.g. from an atexit hook AND an explicit shutdown path) is a no-op
    instead of calling terminate() on an already-terminated PyAudio.
    """
    if self.stream:
        self.stream.stop_stream()
        self.stream.close()
        self.stream = None
    if self.pa:
        self.pa.terminate()
        self.pa = None
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
return _recognizer
def listen(timeout_seconds: float = 5.0, detection_timeout: float = None) -> str:
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
"""Listen to microphone and return transcribed text."""
return get_recognizer().listen(timeout_seconds, detection_timeout)
return get_recognizer().listen(timeout_seconds, detection_timeout, lang)
def cleanup():

6
tts.py
View File

@@ -1,6 +1,6 @@
"""
Text-to-Speech module using Silero TTS.
Generates natural Russian speech with Xenia voice.
Generates natural Russian speech.
Supports interruption via wake word detection using threading.
"""
@@ -140,10 +140,6 @@ class TextToSpeech:
except Exception as e:
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
success = False
# Continue with next chunk? or break?
# Usually if one fails, we might want to try others, but for "too long" error
# splitting should solve it. If it fails for other reasons, maybe better to stop.
# But let's keep trying subsequent chunks in case it's a specific symbol issue.
if success and not self._interrupted:
print("✅ Воспроизведение завершено")

View File

@@ -41,6 +41,24 @@ class WakeWordDetector:
if not self.porcupine:
self.initialize()
# Ensure stream is open and active
if self.audio_stream is None or not self.audio_stream.is_active():
# If closed or None, we might need to recreate it.
# PyAudio streams once closed cannot be reopened usually?
# We should probably recreate it.
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
while True:
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
@@ -48,6 +66,9 @@ class WakeWordDetector:
keyword_index = self.porcupine.process(pcm)
if keyword_index >= 0:
print("✅ Wake word обнаружен!")
# Stop and CLOSE stream to release mic for STT
self.audio_stream.stop_stream()
self.audio_stream.close()
return True
def check_wakeword_once(self) -> bool:
@@ -59,6 +80,21 @@ class WakeWordDetector:
self.initialize()
try:
# Ensure stream is open/active
if self.audio_stream is None or not self.audio_stream.is_active():
# Re-open if needed (similar to wait_for_wakeword logic)
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)