Migrate to Deepgram STT, Silero v5 TTS, and fix wake word mic handling
This commit is contained in:
@@ -17,6 +17,9 @@ PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
|
|||||||
PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
|
PERPLEXITY_MODEL = os.getenv("PERPLEXITY_MODEL", "llama-3.1-sonar-small-128k-chat")
|
||||||
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
|
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
|
||||||
|
|
||||||
|
# Deepgram configuration
|
||||||
|
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
|
||||||
|
|
||||||
# Porcupine configuration
|
# Porcupine configuration
|
||||||
PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
|
PORCUPINE_ACCESS_KEY = os.getenv("PORCUPINE_ACCESS_KEY")
|
||||||
PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"
|
PORCUPINE_KEYWORD_PATH = BASE_DIR / "Alexandr_en_linux_v4_0_0.ppn"
|
||||||
|
|||||||
@@ -1,15 +1,32 @@
|
|||||||
|
aenum==3.1.16
|
||||||
|
aiofiles==25.1.0
|
||||||
|
aiohappyeyeballs==2.6.1
|
||||||
|
aiohttp==3.13.3
|
||||||
|
aiosignal==1.4.0
|
||||||
antlr4-python3-runtime==4.9.3
|
antlr4-python3-runtime==4.9.3
|
||||||
|
anyio==4.12.1
|
||||||
|
attrs==25.4.0
|
||||||
certifi==2025.11.12
|
certifi==2025.11.12
|
||||||
cffi==2.0.0
|
cffi==2.0.0
|
||||||
charset-normalizer==3.4.4
|
charset-normalizer==3.4.4
|
||||||
|
dataclasses-json==0.6.7
|
||||||
DAWG2-Python==0.9.0
|
DAWG2-Python==0.9.0
|
||||||
|
deepgram-sdk==3.11.0
|
||||||
|
deprecation==2.1.0
|
||||||
docopt==0.6.2
|
docopt==0.6.2
|
||||||
filelock==3.20.1
|
filelock==3.20.1
|
||||||
|
frozenlist==1.8.0
|
||||||
fsspec==2025.12.0
|
fsspec==2025.12.0
|
||||||
|
h11==0.16.0
|
||||||
|
httpcore==1.0.9
|
||||||
|
httpx==0.28.1
|
||||||
idna==3.11
|
idna==3.11
|
||||||
Jinja2==3.1.6
|
Jinja2==3.1.6
|
||||||
MarkupSafe==3.0.3
|
MarkupSafe==3.0.3
|
||||||
|
marshmallow==3.26.2
|
||||||
mpmath==1.3.0
|
mpmath==1.3.0
|
||||||
|
multidict==6.7.0
|
||||||
|
mypy_extensions==1.1.0
|
||||||
networkx==3.6.1
|
networkx==3.6.1
|
||||||
num2words==0.5.14
|
num2words==0.5.14
|
||||||
numpy==2.4.0
|
numpy==2.4.0
|
||||||
@@ -29,6 +46,8 @@ nvidia-nvjitlink-cu12==12.8.93
|
|||||||
nvidia-nvshmem-cu12==3.3.20
|
nvidia-nvshmem-cu12==3.3.20
|
||||||
nvidia-nvtx-cu12==12.8.90
|
nvidia-nvtx-cu12==12.8.90
|
||||||
omegaconf==2.3.0
|
omegaconf==2.3.0
|
||||||
|
packaging==25.0
|
||||||
|
propcache==0.4.1
|
||||||
pvporcupine==4.0.1
|
pvporcupine==4.0.1
|
||||||
PyAudio==0.2.14
|
PyAudio==0.2.14
|
||||||
pycparser==2.23
|
pycparser==2.23
|
||||||
@@ -46,7 +65,9 @@ torch==2.9.1
|
|||||||
torchaudio==2.9.1
|
torchaudio==2.9.1
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
triton==3.5.1
|
triton==3.5.1
|
||||||
|
typing-inspect==0.9.0
|
||||||
typing_extensions==4.15.0
|
typing_extensions==4.15.0
|
||||||
urllib3==2.6.2
|
urllib3==2.6.2
|
||||||
vosk==0.3.45
|
vosk==0.3.45
|
||||||
websockets==15.0.1
|
websockets==15.0.1
|
||||||
|
yarl==1.22.0
|
||||||
|
|||||||
256
stt.py
256
stt.py
@@ -1,109 +1,197 @@
|
|||||||
"""
|
"""
|
||||||
Speech-to-Text module using Vosk.
|
Speech-to-Text module using Deepgram API.
|
||||||
Recognizes Russian speech from microphone.
|
Recognizes speech from microphone using streaming WebSocket.
|
||||||
|
Supports Russian (default) and English.
|
||||||
"""
|
"""
|
||||||
import json
|
import os
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
import pyaudio
|
import pyaudio
|
||||||
from vosk import Model, KaldiRecognizer
|
import logging
|
||||||
from config import VOSK_MODEL_PATH, SAMPLE_RATE
|
from config import DEEPGRAM_API_KEY, SAMPLE_RATE
|
||||||
|
from deepgram import (
|
||||||
|
DeepgramClient,
|
||||||
|
DeepgramClientOptions,
|
||||||
|
LiveTranscriptionEvents,
|
||||||
|
LiveOptions,
|
||||||
|
Microphone,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure logging to suppress debug noise
|
||||||
|
logging.getLogger("deepgram").setLevel(logging.WARNING)
|
||||||
|
|
||||||
class SpeechRecognizer:
    """Speech recognizer that streams microphone audio to Deepgram.

    Each call to :meth:`listen` opens a fresh Deepgram live connection,
    feeds it 16-bit mono PCM read from a PyAudio input stream and returns
    the text of the finalized transcript segments.
    """

    def __init__(self):
        # Deepgram client; created lazily by initialize().
        self.dg_client = None
        # PyAudio instance and its input stream (opened on first use).
        self.pa = None
        self.stream = None
        # Text collected during the current / most recent listen session.
        self.transcript = ""
        # Language passed to Deepgram; listen() overwrites it on every call.
        # Initialised here so _process_audio never hits a missing attribute.
        self.current_lang = "ru"
        # Guards self.transcript, which is written from Deepgram's
        # callback thread and read from the asyncio thread.
        self.lock = threading.Lock()

    def initialize(self):
        """Create the Deepgram client and the PyAudio instance.

        Raises:
            ValueError: if DEEPGRAM_API_KEY is empty or unset.
        """
        if not DEEPGRAM_API_KEY:
            raise ValueError("DEEPGRAM_API_KEY is not set in environment or config.")

        print("📦 Инициализация Deepgram STT...")
        config = DeepgramClientOptions(
            verbose=logging.WARNING,
        )
        self.dg_client = DeepgramClient(DEEPGRAM_API_KEY, config)

        self.pa = pyaudio.PyAudio()
        print("✅ Deepgram клиент готов")

    def _get_stream(self):
        """Return the microphone input stream, opening it on first use."""
        if self.stream is None:
            self.stream = self.pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=4096,
            )
        return self.stream

    async def _process_audio(self, dg_connection, timeout_seconds, detection_timeout):
        """Stream microphone audio to Deepgram until the utterance ends.

        Args:
            dg_connection: a Deepgram live connection that has not been
                started yet; it is started and finished here.
            timeout_seconds: maximum time to wait for the end of the
                utterance once the wait for completion begins.
            detection_timeout: maximum time to wait for a SpeechStarted
                event. If falsy, this phase is skipped.

        Returns:
            The finalized transcript segments joined with spaces, or an
            empty string if nothing was recognized or the connection could
            not be started.
        """
        with self.lock:
            self.transcript = ""

        loop = asyncio.get_running_loop()
        stream = self._get_stream()

        stop_event = asyncio.Event()
        speech_started_event = asyncio.Event()

        # Deepgram finalizes an utterance in one or more segments; keep all
        # of them instead of only the most recent one.
        final_segments = []

        # The handlers below are invoked by the Deepgram SDK with the
        # connection object as first argument, hence `unused_self`.
        # asyncio.Event is not thread-safe, so events are set through
        # loop.call_soon_threadsafe.
        def on_transcript(unused_self, result, **kwargs):
            sentence = result.channel.alternatives[0].transcript
            if not sentence:
                return
            if result.is_final:
                print(f"📝 Частичный результат: {sentence}")
                with self.lock:
                    final_segments.append(sentence)
                    self.transcript = " ".join(final_segments)

        def on_speech_started(unused_self, speech_started, **kwargs):
            loop.call_soon_threadsafe(speech_started_event.set)

        def on_utterance_end(unused_self, utterance_end, **kwargs):
            loop.call_soon_threadsafe(stop_event.set)

        def on_error(unused_self, error, **kwargs):
            print(f"Error: {error}")
            loop.call_soon_threadsafe(stop_event.set)

        dg_connection.on(LiveTranscriptionEvents.Transcript, on_transcript)
        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)

        options = LiveOptions(
            model="nova-2",
            language=self.current_lang,
            smart_format=True,
            encoding="linear16",
            channels=1,
            sample_rate=SAMPLE_RATE,
            interim_results=True,
            utterance_end_ms="1200",
            vad_events=True,
        )

        # start() is a synchronous call on the sync client (no await).
        if dg_connection.start(options) is False:
            print("Failed to start Deepgram connection")
            return ""

        def read_chunk():
            return stream.read(4096, exception_on_overflow=False)

        async def send_audio():
            chunks_sent = 0
            try:
                stream.start_stream()
                print("🎤 Stream started, sending audio...")
                while not stop_event.is_set():
                    if not stream.is_active():
                        # Nothing to read yet; yield instead of spinning.
                        await asyncio.sleep(0.005)
                        continue
                    # The read blocks until a full buffer is captured, so
                    # run it in a worker thread to keep timeouts and
                    # thread-safe callbacks responsive.
                    data = await loop.run_in_executor(None, read_chunk)
                    # send() is synchronous on the sync client (no await).
                    dg_connection.send(data)
                    chunks_sent += 1
                    if chunks_sent % 50 == 0:
                        print(".", end="", flush=True)
            except Exception as e:
                print(f"Audio send error: {e}")
            finally:
                # If sending failed there is nothing left to wait for, so
                # end the session instead of idling until the timeout.
                stop_event.set()
                stream.stop_stream()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")

        sender_task = asyncio.create_task(send_audio())

        try:
            # 1. Wait for speech to start (detection_timeout).
            if detection_timeout:
                try:
                    await asyncio.wait_for(
                        speech_started_event.wait(), timeout=detection_timeout
                    )
                except asyncio.TimeoutError:
                    stop_event.set()

            # 2. If started (or no detection timeout), wait for completion.
            if not stop_event.is_set():
                await asyncio.wait_for(stop_event.wait(), timeout=timeout_seconds)
        except asyncio.TimeoutError:
            pass
        finally:
            # Always stop the sender and close the connection, even if the
            # wait above is interrupted.
            stop_event.set()
            try:
                await sender_task
            finally:
                # finish() is synchronous on the sync client.
                dg_connection.finish()

        with self.lock:
            return self.transcript

    def listen(self, timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
        """Listen to the microphone and transcribe one utterance.

        Args:
            timeout_seconds: maximum time to wait for the utterance to end.
            detection_timeout: maximum time to wait for speech to start.
                If None (or 0), no separate detection phase is used.
            lang: language code passed to Deepgram.

        Returns:
            The recognized text with surrounding whitespace removed, or an
            empty string if nothing was recognized or an error occurred
            while processing audio.
        """
        if not self.dg_client:
            self.initialize()

        self.current_lang = lang
        print(f"🎙️ Слушаю ({lang})...")

        # Create a new connection for each listen session.
        dg_connection = self.dg_client.listen.live.v("1")

        try:
            transcript = asyncio.run(
                self._process_audio(dg_connection, timeout_seconds, detection_timeout)
            )

            final_text = transcript.strip() if transcript else ""
            if final_text:
                print(f"📝 Распознано: {final_text}")
            else:
                print("⚠️ Речь не распознана")

            return final_text

        except Exception as e:
            print(f"❌ Ошибка STT: {e}")
            return ""

    def cleanup(self):
        """Release the audio stream, PyAudio and the Deepgram client.

        All handles are reset to None so that a later listen() call
        re-initializes the recognizer instead of reusing a terminated
        PyAudio instance.
        """
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
            self.pa = None
        self.dg_client = None
|
||||||
|
|
||||||
@@ -120,9 +208,9 @@ def get_recognizer() -> SpeechRecognizer:
|
|||||||
return _recognizer
|
return _recognizer
|
||||||
|
|
||||||
|
|
||||||
def listen(timeout_seconds: float = 7.0, detection_timeout: float = None, lang: str = "ru") -> str:
    """Transcribe one utterance using the recognizer from get_recognizer().

    The three arguments are forwarded positionally to that recognizer's
    ``listen`` method, and its result is returned unchanged.
    """
    recognizer = get_recognizer()
    return recognizer.listen(timeout_seconds, detection_timeout, lang)
|
||||||
|
|
||||||
|
|
||||||
def cleanup():
|
def cleanup():
|
||||||
@@ -130,4 +218,4 @@ def cleanup():
|
|||||||
global _recognizer
|
global _recognizer
|
||||||
if _recognizer:
|
if _recognizer:
|
||||||
_recognizer.cleanup()
|
_recognizer.cleanup()
|
||||||
_recognizer = None
|
_recognizer = None
|
||||||
6
tts.py
6
tts.py
@@ -1,6 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
Text-to-Speech module using Silero TTS.
|
Text-to-Speech module using Silero TTS.
|
||||||
Generates natural Russian speech with Xenia voice.
|
Generates natural Russian speech.
|
||||||
Supports interruption via wake word detection using threading.
|
Supports interruption via wake word detection using threading.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -140,10 +140,6 @@ class TextToSpeech:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
|
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
|
||||||
success = False
|
success = False
|
||||||
# Continue with next chunk? or break?
|
|
||||||
# Usually if one fails, we might want to try others, but for "too long" error
|
|
||||||
# splitting should solve it. If it fails for other reasons, maybe better to stop.
|
|
||||||
# But let's keep trying subsequent chunks in case it's a specific symbol issue.
|
|
||||||
|
|
||||||
if success and not self._interrupted:
|
if success and not self._interrupted:
|
||||||
print("✅ Воспроизведение завершено")
|
print("✅ Воспроизведение завершено")
|
||||||
|
|||||||
36
wakeword.py
36
wakeword.py
@@ -40,6 +40,24 @@ class WakeWordDetector:
|
|||||||
"""
|
"""
|
||||||
if not self.porcupine:
|
if not self.porcupine:
|
||||||
self.initialize()
|
self.initialize()
|
||||||
|
|
||||||
|
# Ensure stream is open and active
|
||||||
|
if self.audio_stream is None or not self.audio_stream.is_active():
|
||||||
|
# If closed or None, we might need to recreate it.
|
||||||
|
# PyAudio streams once closed cannot be reopened usually?
|
||||||
|
# We should probably recreate it.
|
||||||
|
if self.audio_stream:
|
||||||
|
try:
|
||||||
|
self.audio_stream.close()
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.audio_stream = self.pa.open(
|
||||||
|
rate=self.porcupine.sample_rate,
|
||||||
|
channels=1,
|
||||||
|
format=pyaudio.paInt16,
|
||||||
|
input=True,
|
||||||
|
frames_per_buffer=self.porcupine.frame_length
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
|
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
|
||||||
@@ -48,6 +66,9 @@ class WakeWordDetector:
|
|||||||
keyword_index = self.porcupine.process(pcm)
|
keyword_index = self.porcupine.process(pcm)
|
||||||
if keyword_index >= 0:
|
if keyword_index >= 0:
|
||||||
print("✅ Wake word обнаружен!")
|
print("✅ Wake word обнаружен!")
|
||||||
|
# Stop and CLOSE stream to release mic for STT
|
||||||
|
self.audio_stream.stop_stream()
|
||||||
|
self.audio_stream.close()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def check_wakeword_once(self) -> bool:
|
def check_wakeword_once(self) -> bool:
|
||||||
@@ -59,6 +80,21 @@ class WakeWordDetector:
|
|||||||
self.initialize()
|
self.initialize()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Ensure stream is open/active
|
||||||
|
if self.audio_stream is None or not self.audio_stream.is_active():
|
||||||
|
# Re-open if needed (similar to wait_for_wakeword logic)
|
||||||
|
if self.audio_stream:
|
||||||
|
try:
|
||||||
|
self.audio_stream.close()
|
||||||
|
except: pass
|
||||||
|
self.audio_stream = self.pa.open(
|
||||||
|
rate=self.porcupine.sample_rate,
|
||||||
|
channels=1,
|
||||||
|
format=pyaudio.paInt16,
|
||||||
|
input=True,
|
||||||
|
frames_per_buffer=self.porcupine.frame_length
|
||||||
|
)
|
||||||
|
|
||||||
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
|
pcm = self.audio_stream.read(self.porcupine.frame_length, exception_on_overflow=False)
|
||||||
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
|
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user