first commit
This commit is contained in:
122
stt.py
Normal file
122
stt.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Speech-to-Text module using Vosk.
|
||||
Recognizes Russian speech from microphone.
|
||||
"""
|
||||
import json
|
||||
import pyaudio
|
||||
from vosk import Model, KaldiRecognizer
|
||||
from config import VOSK_MODEL_PATH, SAMPLE_RATE
|
||||
|
||||
|
||||
class SpeechRecognizer:
    """Microphone speech recognizer built on a Vosk model and a PyAudio stream.

    The constructor only creates empty placeholders. ``initialize()`` (called
    automatically by ``listen()`` when needed) loads the model and opens the
    microphone; ``cleanup()`` releases everything again and returns the
    object to its uninitialized state, so it can be reused afterwards.
    """

    # Number of audio frames requested from the input stream per read.
    CHUNK_FRAMES = 4096

    # Consecutive reads without any recognized speech that listen() tolerates
    # before giving up early. NOTE(review): the original comment called this
    # "about 2.5 seconds", which matches a 16 kHz SAMPLE_RATE -- confirm
    # against config.
    MAX_SILENT_CHUNKS = 10

    def __init__(self):
        self.model = None
        self.recognizer = None
        self.pa = None
        self.stream = None

    def _new_recognizer(self):
        """Create a recognizer for the loaded model with word output enabled."""
        recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
        recognizer.SetWords(True)
        return recognizer

    def initialize(self):
        """Load the Vosk model and open the microphone input stream.

        Components that are already available are reused, so calling this
        again does not reload the model or open a second stream.

        Raises:
            Whatever ``Model``, ``KaldiRecognizer`` or ``PyAudio.open`` raise.
            If opening the stream fails, the PyAudio instance created here is
            terminated before the error propagates.
        """
        print("📦 Загрузка модели Vosk...")
        if self.model is None:
            self.model = Model(str(VOSK_MODEL_PATH))
        self.recognizer = self._new_recognizer()

        if self.stream is None:
            if self.pa is None:
                self.pa = pyaudio.PyAudio()
            try:
                self.stream = self.pa.open(
                    rate=SAMPLE_RATE,
                    channels=1,
                    format=pyaudio.paInt16,
                    input=True,
                    frames_per_buffer=self.CHUNK_FRAMES,
                )
            except Exception:
                # Do not leak the PortAudio handle when the device cannot be
                # opened; the caller still sees the original error.
                self.pa.terminate()
                self.pa = None
                raise
        print("✅ Модель Vosk загружена")

    def listen(self, timeout_seconds: float = 5.0) -> str:
        """Listen to the microphone and transcribe speech.

        Reads audio in chunks of ``CHUNK_FRAMES`` frames for at most
        ``timeout_seconds``. Returns as soon as the recognizer reports a
        non-empty final result, and stops early once more than
        ``MAX_SILENT_CHUNKS`` consecutive chunks produced no recognized
        speech (this also applies before any speech has been heard).

        Args:
            timeout_seconds: Maximum time to listen for speech.

        Returns:
            Transcribed text, or an empty string if nothing was recognized.
        """
        # Check the stream as well as the model: after cleanup() or a failed
        # initialize() the model alone does not mean the microphone is open.
        if self.model is None or self.stream is None:
            self.initialize()

        print("🎙️ Слушаю... (говорите)")

        # Start every utterance with a fresh recognizer so no audio from a
        # previous call leaks into this result.
        self.recognizer = self._new_recognizer()

        chunk_frames = self.CHUNK_FRAMES
        chunks_to_read = int(SAMPLE_RATE * timeout_seconds / chunk_frames)
        silent_chunks = 0

        for _ in range(chunks_to_read):
            data = self.stream.read(chunk_frames, exception_on_overflow=False)

            if self.recognizer.AcceptWaveform(data):
                # The recognizer finalized a segment.
                result = json.loads(self.recognizer.Result())
                text = result.get("text", "").strip()
                if text:
                    print(f"📝 Распознано: {text}")
                    return text
                silent_chunks += 1
            else:
                # Segment still in progress: a non-empty partial result means
                # speech is currently being recognized.
                partial = json.loads(self.recognizer.PartialResult())
                if partial.get("partial", ""):
                    silent_chunks = 0
                else:
                    silent_chunks += 1

            if silent_chunks > self.MAX_SILENT_CHUNKS:
                break

        # Flush whatever the recognizer has buffered so far.
        result = json.loads(self.recognizer.FinalResult())
        text = result.get("text", "").strip()

        if text:
            print(f"📝 Распознано: {text}")
        else:
            print("⚠️ Речь не распознана")

        return text

    def cleanup(self):
        """Release the audio stream, PyAudio and the model.

        Safe to call repeatedly. All attributes are reset to ``None`` before
        anything is closed, so the object never keeps a reference to a
        released resource and a later ``listen()`` re-initializes cleanly.
        """
        stream, pa = self.stream, self.pa
        self.stream = None
        self.pa = None
        self.recognizer = None
        self.model = None
        try:
            if stream is not None:
                stream.close()
        finally:
            # Terminate PyAudio even if closing the stream failed.
            if pa is not None:
                pa.terminate()
|
||||
|
||||
|
||||
# Module-level shared recognizer instance. It starts as None, is created on
# first use by get_recognizer(), and is reset to None by cleanup().
_recognizer = None
|
||||
|
||||
|
||||
def get_recognizer() -> SpeechRecognizer:
    """Return the shared SpeechRecognizer, creating it on first use."""
    global _recognizer
    instance = _recognizer
    if instance is not None:
        return instance
    # First call (or first call after cleanup): build and remember it.
    instance = SpeechRecognizer()
    _recognizer = instance
    return instance
|
||||
|
||||
|
||||
def listen(timeout_seconds: float = 5.0) -> str:
    """Transcribe microphone speech using the shared recognizer.

    Args:
        timeout_seconds: Maximum time to listen, passed through unchanged.

    Returns:
        The text returned by the shared recognizer's ``listen`` method.
    """
    recognizer = get_recognizer()
    transcript = recognizer.listen(timeout_seconds)
    return transcript
|
||||
|
||||
|
||||
def cleanup():
    """Release the shared recognizer's resources and forget the instance.

    Does nothing when no shared recognizer exists. The module-level
    reference is cleared even if the recognizer's own ``cleanup()`` raises,
    so the next ``get_recognizer()`` call builds a fresh instance instead of
    handing out a half-released one; the error still propagates.
    """
    global _recognizer
    if _recognizer is None:
        return
    instance = _recognizer
    try:
        instance.cleanup()
    finally:
        _recognizer = None
|
||||
Reference in New Issue
Block a user