first commit

2026-01-02 20:26:44 +03:00
commit 51ed78078b
14 changed files with 841 additions and 0 deletions
--- a/stt.py
+++ b/stt.py
@@ -0,0 +1,122 @@
+"""
+Speech-to-Text module using Vosk.
+Recognizes Russian speech from microphone.
+"""
+import json
+import pyaudio
+from vosk import Model, KaldiRecognizer
+from config import VOSK_MODEL_PATH, SAMPLE_RATE
+
+
+class SpeechRecognizer:
+    """Microphone speech recognizer built on a Vosk model.
+
+    The Vosk model and the PyAudio input stream are created by
+    ``initialize()`` (called from ``listen()`` when needed) and the audio
+    handles are released by ``cleanup()``.
+    """
+
+    # Audio frames pulled from the microphone per read; also passed to
+    # PyAudio as ``frames_per_buffer``.
+    CHUNK_FRAMES = 4096
+    # Consecutive chunks without recognized speech tolerated before
+    # ``listen()`` stops early (about 2.5 s if SAMPLE_RATE is 16000).
+    MAX_SILENCE_CHUNKS = 10
+
+    def __init__(self):
+        """Create an uninitialized recognizer; no model or device is opened."""
+        self.model = None
+        self.recognizer = None
+        self.pa = None
+        self.stream = None
+
+    def initialize(self):
+        """Load the Vosk model and open a mono 16-bit microphone stream.
+
+        Audio handles left over from an earlier call are released before new
+        ones are opened, and the PyAudio instance is terminated again if
+        opening the stream fails, so no half-open device is left behind.
+
+        Raises:
+            Exception: Errors from ``Model``, ``KaldiRecognizer`` or PyAudio
+                are propagated unchanged.
+        """
+        print("📦 Загрузка модели Vosk...")
+        self.model = Model(str(VOSK_MODEL_PATH))
+        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
+        self.recognizer.SetWords(True)
+
+        # A repeated initialize() must not leak the previous stream/PyAudio.
+        self.cleanup()
+        self.pa = pyaudio.PyAudio()
+        try:
+            self.stream = self.pa.open(
+                rate=SAMPLE_RATE,
+                channels=1,
+                format=pyaudio.paInt16,
+                input=True,
+                frames_per_buffer=self.CHUNK_FRAMES,
+            )
+        except Exception:
+            # Opening the device failed: release PyAudio and leave the
+            # instance in the "no stream" state so listen() retries later.
+            self.cleanup()
+            raise
+        print("✅ Модель Vosk загружена")
+
+    @staticmethod
+    def _text_of(raw_json: str) -> str:
+        """Return the stripped ``"text"`` field of a Vosk JSON result string."""
+        return json.loads(raw_json).get("text", "").strip()
+
+    def listen(self, timeout_seconds: float = 5.0) -> str:
+        """
+        Listen to the microphone and transcribe speech.
+
+        Reading stops at the first non-empty recognized utterance, after more
+        than ``MAX_SILENCE_CHUNKS`` consecutive chunks without recognized
+        speech, or once the timeout is used up.
+
+        Args:
+            timeout_seconds: Maximum time to listen for speech; rounded down
+                to a whole number of ``CHUNK_FRAMES``-sized reads.
+
+        Returns:
+            Transcribed text, or an empty string when nothing was recognized.
+        """
+        # Re-initialize not only when the model is missing but also when the
+        # audio stream is gone (after cleanup() or a failed initialize());
+        # otherwise the read below would hit a closed or missing stream.
+        if self.model is None or self.stream is None:
+            self.initialize()
+
+        print("🎙️ Слушаю... (говорите)")
+
+        # Start every call from a fresh recognizer; this replaces the one
+        # built in initialize().
+        self.recognizer = KaldiRecognizer(self.model, SAMPLE_RATE)
+
+        chunk = self.CHUNK_FRAMES
+        chunks_to_read = int(SAMPLE_RATE * timeout_seconds / chunk)
+        silent_chunks = 0
+
+        for _ in range(chunks_to_read):
+            data = self.stream.read(chunk, exception_on_overflow=False)
+
+            if self.recognizer.AcceptWaveform(data):
+                text = self._text_of(self.recognizer.Result())
+                if text:
+                    print(f"📝 Распознано: {text}")
+                    return text
+                # A completed utterance with empty text counts as silence.
+                silent_chunks += 1
+            elif json.loads(self.recognizer.PartialResult()).get("partial", ""):
+                # Partial hypothesis present: speech is ongoing.
+                silent_chunks = 0
+            else:
+                silent_chunks += 1
+
+            # The counter also runs before any speech was heard, so this ends
+            # the wait early when nobody starts talking.
+            if silent_chunks > self.MAX_SILENCE_CHUNKS:
+                break
+
+        # Flush whatever the recognizer still holds.
+        text = self._text_of(self.recognizer.FinalResult())
+
+        if text:
+            print(f"📝 Распознано: {text}")
+        else:
+            print("⚠️ Речь не распознана")
+
+        return text
+
+    def cleanup(self):
+        """Release the audio stream and the PyAudio instance.
+
+        Both attributes are reset to ``None`` so a later ``listen()`` opens
+        the device again instead of reading from a closed stream. PyAudio is
+        terminated even if closing the stream raises. The loaded model is
+        left untouched.
+        """
+        stream, pa = self.stream, self.pa
+        self.stream = None
+        self.pa = None
+        try:
+            if stream is not None:
+                stream.close()
+        finally:
+            if pa is not None:
+                pa.terminate()
+
+
+# Module-wide shared SpeechRecognizer: None until get_recognizer() creates it,
+# and set back to None by the module-level cleanup().
+_recognizer = None
+
+
+def get_recognizer() -> SpeechRecognizer:
+    """Return the shared speech recognizer, creating it on first use."""
+    global _recognizer
+    if _recognizer is not None:
+        return _recognizer
+    # First request: build the instance and remember it for later callers.
+    _recognizer = SpeechRecognizer()
+    return _recognizer
+
+
+def listen(timeout_seconds: float = 5.0) -> str:
+    """Transcribe microphone speech using the shared recognizer.
+
+    Args:
+        timeout_seconds: Passed through to ``SpeechRecognizer.listen``.
+
+    Returns:
+        The text returned by the shared recognizer's ``listen`` method.
+    """
+    shared = get_recognizer()
+    return shared.listen(timeout_seconds)
+
+
+def cleanup():
+    """Release the shared recognizer's resources and forget the instance.
+
+    Does nothing when no shared recognizer exists. The global is reset to
+    ``None`` even if releasing the resources raises, so the next
+    ``get_recognizer()`` call builds a fresh instance instead of handing out
+    a partially released one; the exception itself still propagates.
+    """
+    global _recognizer
+    if _recognizer is None:
+        return
+    try:
+        _recognizer.cleanup()
+    finally:
+        _recognizer = None