Files
smart-speaker/stt.py
2026-01-02 20:26:44 +03:00

123 lines
3.6 KiB
Python

"""
Speech-to-Text module using Vosk.
Recognizes Russian speech from microphone.
"""
import json
import pyaudio
from vosk import Model, KaldiRecognizer
from config import VOSK_MODEL_PATH, SAMPLE_RATE
class SpeechRecognizer:
    """Microphone speech recognizer built on a Vosk model and a PyAudio stream.

    The model and the input stream are created lazily on the first
    ``listen()`` call (or explicitly via ``initialize()``) and released
    with ``cleanup()``. After ``cleanup()`` the instance can be reused:
    the next ``listen()`` reopens the microphone without reloading the model.
    """

    # Number of audio frames requested from the microphone per read; also
    # used as the stream's frames_per_buffer.
    CHUNK_FRAMES = 4096

    # Consecutive chunks without recognized speech that end listening once
    # speech has started. Roughly 2.5 s if SAMPLE_RATE is 16 kHz -- the
    # rate comes from config, TODO confirm.
    MAX_SILENCE_CHUNKS = 10

    def __init__(self):
        """Create an empty recognizer; no model or audio device is touched yet."""
        self.model = None
        self.recognizer = None
        self.pa = None
        self.stream = None

    def initialize(self):
        """Load the Vosk model and open the microphone input stream.

        Raises:
            Whatever ``Model``, ``KaldiRecognizer`` or PyAudio raise when the
            model cannot be loaded or the input device cannot be opened.
        """
        print("📦 Загрузка модели Vosk...")
        self.model = Model(str(VOSK_MODEL_PATH))
        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
        self.recognizer.SetWords(True)
        self._open_stream()
        print("✅ Модель Vosk загружена")

    def _open_stream(self):
        """Open a mono 16-bit input stream, releasing any previous one first."""
        # Release an earlier stream/PyAudio pair so that repeated
        # initialization does not leak audio handles.
        self.cleanup()

        pa = pyaudio.PyAudio()
        try:
            stream = pa.open(
                rate=SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=self.CHUNK_FRAMES,
            )
        except Exception:
            # Do not leave a half-initialized PyAudio instance behind.
            pa.terminate()
            raise

        self.pa = pa
        self.stream = stream

    def listen(self, timeout_seconds: float = 5.0) -> str:
        """Listen to the microphone and transcribe speech.

        Listening ends as soon as the recognizer finalizes a non-empty
        utterance, when the time budget is used up, or when more than
        ``MAX_SILENCE_CHUNKS`` consecutive silent chunks follow detected
        speech. Silence *before* any speech does not end listening early,
        so ``timeout_seconds`` really is the time the user has to start
        talking.

        Args:
            timeout_seconds: Maximum time to listen for speech.

        Returns:
            The transcribed text, or an empty string if nothing was
            recognized.
        """
        if self.model is None:
            self.initialize()
        elif self.stream is None:
            # cleanup() was called earlier: the model is still loaded,
            # only the microphone has to be reopened.
            self._open_stream()

        print("🎙️ Слушаю... (говорите)")

        # Start every call with a fresh recognizer so that audio from a
        # previous call cannot leak into this result.
        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)

        chunk_frames = self.CHUNK_FRAMES
        chunks_to_read = int(SAMPLE_RATE * timeout_seconds / chunk_frames)
        silent_chunks = 0
        heard_speech = False

        for _ in range(chunks_to_read):
            # exception_on_overflow=False: a dropped input buffer should
            # not abort recognition.
            data = self.stream.read(chunk_frames, exception_on_overflow=False)

            if self.recognizer.AcceptWaveform(data):
                # The recognizer finalized an utterance.
                result = json.loads(self.recognizer.Result())
                text = result.get("text", "").strip()
                if text:
                    print(f"📝 Распознано: {text}")
                    return text
                silent_chunks += 1
            elif json.loads(self.recognizer.PartialResult()).get("partial", ""):
                # A non-empty partial hypothesis means the user is speaking.
                heard_speech = True
                silent_chunks = 0
            else:
                silent_chunks += 1

            # Stop only on silence that follows speech; leading silence is
            # bounded by the timeout alone.
            if heard_speech and silent_chunks > self.MAX_SILENCE_CHUNKS:
                break

        # Flush whatever the recognizer still holds.
        result = json.loads(self.recognizer.FinalResult())
        text = result.get("text", "").strip()
        if text:
            print(f"📝 Распознано: {text}")
        else:
            print("⚠️ Речь не распознана")
        return text

    def cleanup(self):
        """Release the audio stream and the PyAudio instance.

        Safe to call repeatedly and on a never-initialized instance. The
        loaded model is kept so a later ``listen()`` can resume cheaply.
        """
        stream, pa = self.stream, self.pa
        # Clear the handles first so a failure below cannot leave the
        # instance pointing at already-released objects.
        self.stream = None
        self.pa = None
        try:
            if stream is not None:
                stream.close()
        finally:
            if pa is not None:
                pa.terminate()
# Module-level shared SpeechRecognizer instance. Stays None until
# get_recognizer() creates it, and is set back to None by cleanup().
_recognizer = None
def get_recognizer() -> SpeechRecognizer:
    """Return the shared SpeechRecognizer, creating it on first use."""
    global _recognizer
    instance = _recognizer
    if instance is not None:
        return instance
    # First call (or first call after cleanup): build and remember a new one.
    instance = SpeechRecognizer()
    _recognizer = instance
    return instance
def listen(timeout_seconds: float = 5.0) -> str:
    """Transcribe speech from the microphone using the shared recognizer.

    Args:
        timeout_seconds: Passed through to ``SpeechRecognizer.listen``.

    Returns:
        The text returned by the shared recognizer's ``listen`` method.
    """
    shared = get_recognizer()
    return shared.listen(timeout_seconds)
def cleanup():
    """Release the shared recognizer's resources and forget the instance."""
    global _recognizer
    # Nothing to release if no recognizer is currently held.
    if not _recognizer:
        return
    _recognizer.cleanup()
    _recognizer = None