Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling

This commit is contained in:
2026-01-07 17:59:18 +03:00
parent 7b79593cad
commit 53809c03f4
5 changed files with 233 additions and 89 deletions

View File

@@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat") PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions" PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
# Deepgram configuration
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
# Porcupine configuration # Porcupine configuration
PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY") PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn" PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"

View File

@@ -1,15 +1,32 @@
aenum==3.1.16
aiofiles==25.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
antlr4-python3-runtime==4.9.3 antlr4-python3-runtime==4.9.3
anyio==4.12.1
attrs==25.4.0
certifi==2025.11.12 certifi==2025.11.12
cffi==2.0.0 cffi==2.0.0
charset-normalizer==3.4.4 charset-normalizer==3.4.4
dataclasses-json==0.6.7
DAWG2-Python==0.9.0 DAWG2-Python==0.9.0
deepgram-sdk==3.11.0
deprecation==2.1.0
docopt==0.6.2 docopt==0.6.2
filelock==3.20.1 filelock==3.20.1
frozenlist==1.8.0
fsspec==2025.12.0 fsspec==2025.12.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11 idna==3.11
Jinja2==3.1.6 Jinja2==3.1.6
MarkupSafe==3.0.3 MarkupSafe==3.0.3
marshmallow==3.26.2
mpmath==1.3.0 mpmath==1.3.0
multidict==6.7.0
mypy_extensions==1.1.0
networkx==3.6.1 networkx==3.6.1
num2words==0.5.14 num2words==0.5.14
numpy==2.4.0 numpy==2.4.0
@@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20 nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90 nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0 omegaconf==2.3.0
packaging==25.0
propcache==0.4.1
pvporcupine==4.0.1 pvporcupine==4.0.1
PyAudio==0.2.14 PyAudio==0.2.14
pycparser==2.23 pycparser==2.23
@@ -46,7 +65,9 @@ torch==2.9.1
torchaudio==2.9.1 torchaudio==2.9.1
tqdm==4.67.1 tqdm==4.67.1
triton==3.5.1 triton==3.5.1
typing-inspect==0.9.0
typing_extensions==4.15.0 typing_extensions==4.15.0
urllib3==2.6.2 urllib3==2.6.2
vosk==0.3.45 vosk==0.3.45
websockets==15.0.1 websockets==15.0.1
yarl==1.22.0

230
stt.py
View File

@@ -1,109 +1,197 @@
""" """
Speech-to-Text module using Vosk. Speech-to-Text module using Deepgram API.
Recognizes Russian speech from microphone. Recognizes speech from microphone using streaming WebSocket.
Supports Russian (default) and English.
""" """
import json import os
import asyncio
import threading
import pyaudio import pyaudio
from vosk import Model, KaldiRecognizer import logging
from config import VOSK_MODEL_PATH, SAMPLE_RATE from config import DEEPGRAM_API_KEY, SAMPLE_RATE
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
# Configure logging to suppress debug noise
logging.getLogger("deepgram").setLevel(logging.WARNING)
class SpeechRecognizer:
    """Speech recognizer using Deepgram live (streaming WebSocket) transcription.

    Audio is captured from the default microphone via PyAudio and pushed to a
    Deepgram live connection; final transcript segments are accumulated and
    returned from ``listen``.
    """

    def __init__(self):
        # Deepgram client and PyAudio handles are created lazily in initialize().
        self.dg_client = None
        self.pa = None
        self.stream = None
        # Accumulated transcript for the current listen session.
        self.transcript = ""
        # Deepgram delivers events on its own thread; guard transcript updates.
        self.lock = threading.Lock()
        # FIX: default language so _process_audio() never reads an undefined
        # attribute if it is driven without going through listen().
        self.current_lang = "ru"

    def initialize(self):
        """Initialize Deepgram client and PyAudio.

        Raises:
            ValueError: if DEEPGRAM_API_KEY is missing from the environment/config.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")
        print("📦 Инициализация Deepgram STT...")
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)
        self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Open the microphone input stream if it is not already open."""
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Stream microphone audio to Deepgram and wait for transcription results.

        Args:
            dg_connection: a freshly created Deepgram live connection.
            timeout_seconds: overall cap on how long to wait for the utterance.
            detection_timeout: how long to wait for speech to start; falsy means
                no separate detection phase.

        Returns:
            The accumulated transcript text ("" if nothing was recognized).
        """
        self.transcript = ""
        loop = asyncio.get_running_loop()
        stream = self._get_stream()
        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()
        # Deepgram callbacks receive the connection as their first argument;
        # capture the recognizer instance explicitly for use inside them.
        speech_recognizer_self = self

        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                with speech_recognizer_self.lock:
                    # FIX: accumulate final segments instead of overwriting, so
                    # multi-sentence utterances keep every finalized segment.
                    combined = (speech_recognizer_self.transcript + " " + sentence).strip()
                    speech_recognizer_self.transcript = combined

        def on_speech_started(unused_self, speech_started, **kwargs):
            # Callbacks fire on the SDK thread; hop to the event loop safely.
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        # start()/send()/finish() are synchronous on the sync live client — no await.
        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms="1200",
            vad_events=True,
        )
        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            # Return the empty transcript rather than None so listen() always
            # gets a string-compatible result.
            return self.transcript

        async def send_audio():
            """Pump raw PCM chunks from the mic to Deepgram until stopped."""
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if stream.is_active():
                        data = stream.read(4096, exception_on_overflow=False)
                        dg_connection.send(data)
                        chunks_sent += 1
                        if chunks_sent % 50 == 0:
                            print(f".", end="", flush=True)
                    # Yield so the loop can process timeouts and callbacks.
                    await asyncio.sleep(0.005)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())
        try:
            try:
                # Phase 1: wait for speech to begin (if a detection window is set).
                if detection_timeout:
                    try:
                        await asyncio.wait_for(speech_started_event.wait(), timeout=detection_timeout)
                    except asyncio.TimeoutError:
                        # No speech detected within the window — stop listening.
                        stop_event.set()
                # Phase 2: wait for the utterance to finish (UtteranceEnd/Error).
                if not stop_event.is_set():
                    await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
            except asyncio.TimeoutError:
                # Global timeout: fall through and shut down cleanly.
                pass
        finally:
            # FIX: always stop the sender and close the connection, even if the
            # wait logic raised something unexpected (e.g. cancellation).
            stop_event.set()
            await sender_task
            dg_connection.finish()
        return self.transcript

    def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
        """Listen to the microphone and transcribe one utterance.

        Args:
            timeout_seconds: maximum time to wait for the utterance to complete.
            detection_timeout: time to wait for speech to start; None disables
                the separate detection phase.
            lang: Deepgram language code ("ru" by default).

        Returns:
            The transcribed text, or "" on timeout/error.
        """
        if not self.dg_client:
            self.initialize()
        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")
        # A fresh connection per listen session — Deepgram live connections are
        # single-use once finished.
        dg_connection = self.dg_client.listen.live.v("1")
        try:
            transcript = asyncio.run(self._process_audio(dg_connection, timeout_seconds, detection_timeout))
            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                print("⚠️ Речь не распознана")
            return final_text
        except Exception as e:
            print(f"❌ Ошибка STT: {e}")
            return ""

    def cleanup(self):
        """Release the audio stream and PyAudio resources."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
return _recognizer return _recognizer
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Listen to microphone and return transcribed text."""
    recognizer = get_recognizer()
    return recognizer.listen(timeout_seconds, detection_timeout, lang)
def cleanup(): def cleanup():

6
tts.py
View File

@@ -1,6 +1,6 @@
""" """
Text-to-Speech module using Silero TTS. Text-to-Speech module using Silero TTS.
Generates natural Russian speech with Xenia voice. Generates natural Russian speech.
Supports interruption via wake word detection using threading. Supports interruption via wake word detection using threading.
""" """
@@ -140,10 +140,6 @@ class TextToSpeech:
except Exception as e: except Exception as e:
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}") print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
success = False success = False
# Continue with next chunk? or break?
# Usually if one fails, we might want to try others, but for "too long" error
# splitting should solve it. If it fails for other reasons, maybe better to stop.
# But let's keep trying subsequent chunks in case it's a specific symbol issue.
if success and not self._interrupted: if success and not self._interrupted:
print("✅ Воспроизведение завершено") print("✅ Воспроизведение завершено")

View File

@@ -41,6 +41,24 @@ class WakeWordDetector:
if not self.porcupine: if not self.porcupine:
self.initialize() self.initialize()
# Ensure stream is open and active
if self.audio_stream is None or not self.audio_stream.is_active():
# If closed or None, we might need to recreate it.
# PyAudio streams once closed cannot be reopened usually?
# We should probably recreate it.
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
while True: while True:
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False) pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm) pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
@@ -48,6 +66,9 @@ class WakeWordDetector:
keyword_index = self.porcupine.process(pcm) keyword_index = self.porcupine.process(pcm)
if keyword_index >= 0: if keyword_index >= 0:
print("✅ Wake word обнаружен!") print("✅ Wake word обнаружен!")
# Stop and CLOSE stream to release mic for STT
self.audio_stream.stop_stream()
self.audio_stream.close()
return True return True
def check_wakeword_once(self) -> bool: def check_wakeword_once(self) -> bool:
@@ -59,6 +80,21 @@ class WakeWordDetector:
self.initialize() self.initialize()
try: try:
# Ensure stream is open/active
if self.audio_stream is None or not self.audio_stream.is_active():
# Re-open if needed (similar to wait_for_wakeword logic)
if self.audio_stream:
try:
self.audio_stream.close()
except: pass
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length
)
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False) pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm) pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)