Files
smart-speaker/local_stt.py

117 lines
3.8 KiB
Python

"""
Local offline Speech-to-Text module using Vosk.
Used for simple command detection (like "stop") without internet.
"""
import os
import sys
import json
import pyaudio
from vosk import Model, KaldiRecognizer
from config import VOSK_MODEL_PATH, SAMPLE_RATE
class LocalRecognizer:
def __init__(self):
self.model = None
self.rec = None
self.pa = None
self.stream = None
def initialize(self):
if not os.path.exists(VOSK_MODEL_PATH):
print(f"❌ Ошибка: Vosk модель не найдена по пути {VOSK_MODEL_PATH}")
return False
print("📦 Инициализация локального STT (Vosk)...")
# Redirect stderr to suppress Vosk logs
try:
null_fd = os.open(os.devnull, os.O_WRONLY)
old_stderr = os.dup(2)
sys.stderr.flush()
os.dup2(null_fd, 2)
os.close(null_fd)
self.model = Model(str(VOSK_MODEL_PATH))
# Restore stderr
os.dup2(old_stderr, 2)
os.close(old_stderr)
except Exception as e:
print(f"Error initializing Vosk: {e}")
return False
self.rec = KaldiRecognizer(self.model, SAMPLE_RATE)
self.pa = pyaudio.PyAudio()
return True
def listen_for_keywords(self, keywords: list, timeout: float = 10.0) -> str:
"""
Listen for specific keywords locally.
Returns the recognized keyword if found, or empty string.
"""
if not self.model:
if not self.initialize():
return ""
# Open stream
try:
stream = self.pa.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE, input=True, frames_per_buffer=4096)
stream.start_stream()
except Exception as e:
print(f"❌ Ошибка микрофона: {e}")
return ""
import time
start_time = time.time()
print(f"👂 Локальное слушание ожидает: {keywords}")
detected_text = ""
try:
while time.time() - start_time < timeout:
data = stream.read(4096, exception_on_overflow=False)
if self.rec.AcceptWaveform(data):
res = json.loads(self.rec.Result())
text = res.get("text", "")
if text:
print(f"📝 Локально: {text}")
# Check against keywords
for kw in keywords:
if kw in text:
detected_text = text
break
else:
# Partial result
res = json.loads(self.rec.PartialResult())
partial = res.get("partial", "")
if partial:
for kw in keywords:
if kw in partial:
detected_text = partial
break
if detected_text:
break
finally:
stream.stop_stream()
stream.close()
return detected_text
def cleanup(self):
if self.pa:
self.pa.terminate()
# Global instance
_local_recognizer = None
def get_local_recognizer():
global _local_recognizer
if _local_recognizer is None:
_local_recognizer = LocalRecognizer()
return _local_recognizer
def listen_for_keywords(keywords: list, timeout: float = 5.0) -> str:
"""Listen for keywords using Vosk."""
return get_local_recognizer().listen_for_keywords(keywords, timeout)