feat: refine assistant logic and update docs

2026-04-09 21:03:02 +03:00
parent ebe79c3692
commit 42c064a274
19 changed files with 1958 additions and 492 deletions
--- a/app/audio/wakeword.py
+++ b/app/audio/wakeword.py
@@ -9,12 +9,26 @@ Listens for the configured wake word.
 import pvporcupine
 import pyaudio
 import struct
+import io
+import wave
+import time
 import numpy as np
+import httpx
+from collections import deque
+from deepgram import DeepgramClient
+from deepgram.clients.listen.v1.rest.options import PrerecordedOptions
 from ..core.config import (
+    DEEPGRAM_API_KEY,
    PORCUPINE_ACCESS_KEY,
    PORCUPINE_KEYWORD_PATH,
    PORCUPINE_SENSITIVITY,
+    WAKEWORD_HIT_COOLDOWN_SECONDS,
+    WAKEWORD_ENABLE_FALLBACK_STT,
+    WAKEWORD_MIN_RMS,
+    WAKEWORD_REOPEN_GRACE_SECONDS,
+    WAKEWORD_RMS_MULTIPLIER,
    WAKE_WORD,
+    WAKE_WORD_ALIASES,
 )
 from ..core.audio_manager import get_audio_manager

@@ -33,6 +47,19 @@ class WakeWordDetector:
        self._resampled_pcm_buffer = np.array([], dtype=np.int16)
        self._stream_closed = True  # Флаг состояния потока (закрыт/открыт)
        self._last_hit_ts = 0.0
+        self._fallback_dg_client = None
+        self._fallback_pre_roll = deque(maxlen=4)
+        self._fallback_frames = []
+        self._fallback_active = False
+        self._fallback_silence_frames = 0
+        self._fallback_last_attempt_ts = 0.0
+        self._fallback_last_error_ts = 0.0
+        self._stream_opened_ts = 0.0
+        self._rms_history = deque(maxlen=220)
+        self._wakeword_aliases_compact = {
+            self._compact_text(WAKE_WORD),
+            *(self._compact_text(alias) for alias in WAKE_WORD_ALIASES),
+        }

    def initialize(self):
        """Инициализация Porcupine и PyAudio."""
@@ -87,6 +114,211 @@ class WakeWordDetector:
        )
        self._resampled_pcm_buffer = np.array([], dtype=np.int16)
        self._stream_closed = False
+        self._stream_opened_ts = time.time()
+        self._reset_fallback_state()
+
+    @staticmethod
+    def _compute_rms(pcm: np.ndarray) -> float:
+        """Return the root-mean-square amplitude of a PCM frame.
+
+        Args:
+            pcm: Array of audio samples.
+
+        Returns:
+            The RMS of the samples as a Python float, or ``0.0`` when the
+            array is empty.
+        """
+        if not pcm.size:
+            return 0.0
+        # Convert first so the squaring happens in float32 rather than in the
+        # array's own dtype.
+        samples = pcm.astype(np.float32)
+        mean_square = np.mean(np.square(samples))
+        return float(np.sqrt(mean_square))
+
+    @staticmethod
+    def _compact_text(text: str) -> str:
+        """Normalize text into a compact form for wake-word comparison.
+
+        Falsy input is treated as an empty string. The text is lowercased,
+        "ё" is replaced with "е", and every non-alphanumeric character
+        (spaces, punctuation) is dropped.
+
+        Args:
+            text: Text to normalize; any falsy value yields ``""``.
+
+        Returns:
+            The normalized string containing only alphanumeric characters.
+        """
+        normalized = str(text or "").lower().replace("ё", "е")
+        return "".join(filter(str.isalnum, normalized))
+
+    def _remember_rms(self, rms: float):
+        """Append an RMS reading to the rolling history.
+
+        Readings that are zero or negative are ignored.
+
+        Args:
+            rms: RMS value of the most recent frame.
+        """
+        if rms <= 0:
+            return None
+        history = self._rms_history
+        history.append(float(rms))
+
+    def _noise_floor_rms(self) -> float:
+        """Estimate the ambient noise level from the RMS history.
+
+        Returns:
+            The 20th percentile of the recorded RMS values, or ``0.0`` when
+            no values have been recorded yet.
+        """
+        history = self._rms_history
+        if len(history) == 0:
+            return 0.0
+        # A low percentile is robust to occasional spikes and speech.
+        values = np.asarray(history, dtype=np.float32)
+        return float(np.percentile(values, 20))
+
+    def _wakeword_rms_threshold(self) -> float:
+        """Compute the adaptive RMS threshold for wake-word acceptance.
+
+        The noise floor is scaled by ``WAKEWORD_RMS_MULTIPLIER`` and the
+        result is clamped to the range
+        ``[WAKEWORD_MIN_RMS, 4 * WAKEWORD_MIN_RMS]``.
+
+        Returns:
+            The threshold as a float.
+        """
+        scaled_floor = self._noise_floor_rms() * float(WAKEWORD_RMS_MULTIPLIER)
+        min_rms = float(WAKEWORD_MIN_RMS)
+        # Upper cap so that a very noisy environment cannot disable detection
+        # entirely.
+        capped = min(scaled_floor, min_rms * 4.0)
+        return max(min_rms, capped)
+
+    def _is_hit_in_guard_window(
+        self, now_ts: float, *, ignore_hit_cooldown: bool = False
+    ) -> bool:
+        """Tell whether a detection at ``now_ts`` falls inside a guard window.
+
+        Two windows are checked: the cooldown after the previous accepted hit
+        (``WAKEWORD_HIT_COOLDOWN_SECONDS``) and the grace period after the
+        stream was opened (``WAKEWORD_REOPEN_GRACE_SECONDS``).
+
+        Args:
+            now_ts: Timestamp of the candidate detection, in seconds.
+            ignore_hit_cooldown: When true, skip the post-hit cooldown check;
+                the reopen grace period still applies.
+
+        Returns:
+            ``True`` if the detection should be suppressed, else ``False``.
+        """
+        if not ignore_hit_cooldown:
+            since_last_hit = now_ts - self._last_hit_ts
+            if since_last_hit < float(WAKEWORD_HIT_COOLDOWN_SECONDS):
+                return True
+
+        opened_ts = self._stream_opened_ts
+        # A non-positive open timestamp means there is no grace period to apply.
+        if opened_ts <= 0:
+            return False
+        return bool(now_ts - opened_ts < float(WAKEWORD_REOPEN_GRACE_SECONDS))
+
+    def _accept_porcupine_hit(
+        self,
+        pcm: np.ndarray,
+        now_ts: float,
+        *,
+        ignore_hit_cooldown: bool = False,
+        during_tts: bool = False,
+    ) -> bool:
+        """Decide whether a Porcupine detection should be accepted.
+
+        A detection is rejected when it falls inside a guard window or when
+        the RMS of the triggering frame is below the amplitude threshold.
+        On acceptance the last-hit timestamp is updated.
+
+        Args:
+            pcm: The audio frame on which Porcupine reported the keyword.
+            now_ts: Timestamp of the detection, in seconds.
+            ignore_hit_cooldown: Forwarded to the guard-window check.
+            during_tts: Whether the assistant is currently speaking; selects
+                the stricter amplitude factor.
+
+        Returns:
+            ``True`` if the hit is accepted, ``False`` otherwise.
+        """
+        guarded = self._is_hit_in_guard_window(
+            now_ts, ignore_hit_cooldown=ignore_hit_cooldown
+        )
+        if guarded:
+            return False
+
+        frame_rms = self._compute_rms(pcm)
+        # For a "pure" Porcupine hit we keep a soft amplitude filter: it weeds
+        # out silence/clicks and false triggers from background noise.
+        # During TTS the filter is stricter so that the device's own speaker
+        # does not wake the assistant.
+        if during_tts:
+            factor = 0.95
+        else:
+            factor = 0.75
+        min_accept_rms = max(80.0, self._wakeword_rms_threshold() * factor)
+        if frame_rms < min_accept_rms:
+            return False
+
+        self._last_hit_ts = now_ts
+        return True
+
+    def _reset_fallback_state(self):
+        """Discard any in-progress fallback capture.
+
+        Clears the pre-roll buffer and the collected frames, marks the
+        capture as inactive and zeroes the silence-frame counter.
+        """
+        self._fallback_active = False
+        self._fallback_silence_frames = 0
+        self._fallback_frames = list()
+        self._fallback_pre_roll.clear()
+
+    def _get_fallback_client(self):
+        """Return the Deepgram client used for the STT fallback.
+
+        The client is created on first use and cached on the instance.
+
+        Returns:
+            The cached ``DeepgramClient``, or ``None`` when the fallback is
+            disabled (``WAKEWORD_ENABLE_FALLBACK_STT`` is falsy) or no
+            ``DEEPGRAM_API_KEY`` is configured.
+        """
+        if not WAKEWORD_ENABLE_FALLBACK_STT or not DEEPGRAM_API_KEY:
+            return None
+        client = self._fallback_dg_client
+        if client is None:
+            client = DeepgramClient(DEEPGRAM_API_KEY)
+            self._fallback_dg_client = client
+        return client
+
+    def _pcm_to_wav_bytes(self, pcm: np.ndarray) -> bytes:
+        """Serialize PCM samples into an in-memory WAV file.
+
+        The output is mono, 2 bytes per sample, at the sample rate reported
+        by ``self.porcupine``.
+
+        Args:
+            pcm: Audio samples; converted to ``int16`` before writing.
+
+        Returns:
+            The complete WAV file contents as bytes.
+        """
+        sample_rate = int(self.porcupine.sample_rate)
+        sink = io.BytesIO()
+        with wave.open(sink, "wb") as writer:
+            writer.setframerate(sample_rate)
+            writer.setsampwidth(2)
+            writer.setnchannels(1)
+            raw_samples = np.asarray(pcm, dtype=np.int16).tobytes()
+            writer.writeframes(raw_samples)
+        return sink.getvalue()
+
+    def _transcribe_wakeword_candidate(self, pcm: np.ndarray) -> bool:
+        """Transcribe an audio segment and check whether it is the wake word.
+
+        The segment is sent to Deepgram as a WAV file. The first alternative
+        of the first channel is normalized with ``_compact_text`` and compared
+        for exact equality against the precomputed set of wake-word aliases.
+
+        NOTE(review): this is a blocking network call (timeouts of 2.2 s);
+        if it runs on the audio-reading thread, frames are presumably not
+        read while it is in flight -- confirm this is acceptable.
+
+        Args:
+            pcm: Candidate audio samples.
+
+        Returns:
+            ``True`` only when a non-empty transcript matches a wake-word
+            alias and its confidence, if reported, is at least 0.62.
+            ``False`` when the fallback is unavailable, the segment is empty,
+            the request fails, or nothing matches.
+        """
+        client = self._get_fallback_client()
+        if client is None or pcm.size == 0:
+            return False
+
+        try:
+            response = client.listen.rest.v("1").transcribe_file(
+                {"buffer": self._pcm_to_wav_bytes(pcm)},
+                PrerecordedOptions(
+                    model="nova-2",
+                    language="ru",
+                    smart_format=False,
+                    punctuate=False,
+                    utterances=False,
+                    numerals=False,
+                ),
+                timeout=httpx.Timeout(2.2, connect=2.2, read=2.2, write=2.2),
+            )
+        except Exception as exc:
+            # The fallback is best-effort: any failure counts as "no wake
+            # word". The message is printed at most once every 30 seconds so
+            # a persistent outage does not flood the console.
+            now = time.time()
+            if now - self._fallback_last_error_ts >= 30.0:
+                print(f"⚠️ Wake word fallback STT failed: {exc}")
+                self._fallback_last_error_ts = now
+            return False
+
+        transcript = ""
+        confidence = None
+        try:
+            channels = response.results.channels or []
+            if channels and channels[0].alternatives:
+                first_alt = channels[0].alternatives[0]
+                transcript = str(first_alt.transcript or "").strip()
+                try:
+                    confidence = float(first_alt.confidence)
+                except Exception:
+                    confidence = None
+        except Exception:
+            # An unexpected response shape is treated as an empty result.
+            transcript = ""
+            confidence = None
+
+        compact = self._compact_text(transcript)
+        if not compact:
+            # An empty transcript must never count as a match. Without this
+            # guard, an empty or punctuation-only alias (which compacts to "")
+            # would make every silent or unparsable result a wake-word hit.
+            return False
+        if confidence is not None and confidence < 0.62:
+            return False
+        if compact in self._wakeword_aliases_compact:
+            print(f"✅ Wake word обнаружен fallback STT: {transcript}")
+            return True
+        return False
+
+    def _check_fallback_wakeword(
+        self,
+        pcm: np.ndarray,
+        *,
+        during_tts: bool = False,
+        ignore_hit_cooldown: bool = False,
+    ) -> bool:
+        """Feed one frame to the energy-gated speech-to-text fallback.
+
+        Capture starts (seeded with the pre-roll frames) on the first frame
+        whose RMS reaches the speech threshold and ends after enough
+        consecutive quiet frames. The captured segment is transcribed only
+        when it is long enough, loud enough, outside the guard windows and
+        the minimum interval since the previous attempt has elapsed.
+        Segments longer than ``max_frames`` are discarded.
+
+        Args:
+            pcm: The current audio frame.
+            during_tts: Whether the assistant is currently speaking; selects
+                stricter thresholds and a longer interval between attempts.
+            ignore_hit_cooldown: Forwarded to the guard-window check.
+
+        Returns:
+            ``True`` if the transcription matched the wake word (the last-hit
+            timestamp is updated in that case), otherwise ``False``.
+        """
+        if not WAKEWORD_ENABLE_FALLBACK_STT:
+            return False
+        if self.porcupine is None:
+            return False
+
+        frame = np.asarray(pcm, dtype=np.int16)
+        rms = self._compute_rms(pcm)
+        base_threshold = self._wakeword_rms_threshold()
+        speech_factor = 1.1 if during_tts else 0.85
+        speech_threshold = max(170.0, base_threshold * speech_factor)
+        silence_threshold = max(95.0, speech_threshold * 0.55)
+        silence_frames_to_finalize = 10 if during_tts else 8
+        min_frames = 10 if during_tts else 7
+        max_frames = 40
+        min_attempt_interval = 2.5 if during_tts else 1.0
+
+        if rms >= speech_threshold:
+            if not self._fallback_active:
+                self._fallback_active = True
+                self._fallback_frames = list(self._fallback_pre_roll)
+            # A loud frame interrupts any run of quiet frames, so the
+            # end-of-speech counter only ever counts consecutive silence.
+            self._fallback_silence_frames = 0
+            # Stop buffering once the segment is already over the limit:
+            # it will be discarded on the next quieter frame anyway, and
+            # sustained loud audio must not grow the buffer without bound.
+            if len(self._fallback_frames) <= max_frames:
+                self._fallback_frames.append(frame)
+        elif self._fallback_active:
+            self._fallback_frames.append(frame)
+            if rms <= silence_threshold:
+                self._fallback_silence_frames += 1
+            else:
+                self._fallback_silence_frames = 0
+
+            if len(self._fallback_frames) > max_frames:
+                # Too long to be just the wake word.
+                self._reset_fallback_state()
+            elif self._fallback_silence_frames >= silence_frames_to_finalize:
+                # The buffer is never empty here: a frame was appended above.
+                candidate = np.concatenate(self._fallback_frames)
+                self._reset_fallback_state()
+                if len(candidate) >= min_frames * int(self.porcupine.frame_length):
+                    now = time.time()
+                    candidate_rms = self._compute_rms(candidate)
+                    candidate_threshold = self._wakeword_rms_threshold() * (
+                        0.95 if during_tts else 0.75
+                    )
+                    candidate_threshold = max(
+                        float(WAKEWORD_MIN_RMS), candidate_threshold
+                    )
+                    if (
+                        now - self._fallback_last_attempt_ts >= min_attempt_interval
+                        and not self._is_hit_in_guard_window(
+                            now, ignore_hit_cooldown=ignore_hit_cooldown
+                        )
+                        and candidate_rms >= candidate_threshold
+                    ):
+                        self._fallback_last_attempt_ts = now
+                        if self._transcribe_wakeword_candidate(candidate):
+                            self._last_hit_ts = now
+                            return True
+
+        # Keep the most recent frames so the next capture includes the onset
+        # of speech that preceded the first loud frame.
+        self._fallback_pre_roll.append(frame)
+        return False

    def stop_monitoring(self):
        """Явная остановка и закрытие потока (чтобы освободить микрофон для других задач)."""
@@ -97,6 +329,8 @@ class WakeWordDetector:
            except Exception:
                pass
            self._stream_closed = True
+            self._stream_opened_ts = 0.0
+        self._reset_fallback_state()

    def _resample_to_target_rate(self, pcm: np.ndarray) -> np.ndarray:
        target_rate = int(self.porcupine.sample_rate)
@@ -160,14 +394,20 @@ class WakeWordDetector:

            # Читаем небольшой кусочек аудио (frame)
            pcm = self._read_porcupine_frame()
+            self._remember_rms(self._compute_rms(pcm))

            # Обрабатываем фрейм через Porcupine
            keyword_index = self.porcupine.process(pcm.tolist())

            # Если keyword_index >= 0, значит ключевое слово обнаружено
            if keyword_index >= 0:
-                print("✅ Wake word обнаружен!")
-                # Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
+                now = time.time()
+                if self._accept_porcupine_hit(pcm, now, during_tts=False):
+                    print("✅ Wake word обнаружен!")
+                    # Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
+                    self.stop_monitoring()
+                    return True
+            if self._check_fallback_wakeword(pcm):
                self.stop_monitoring()
                return True

@@ -189,15 +429,25 @@ class WakeWordDetector:
            self._open_stream()

            pcm = self._read_porcupine_frame()
+            self._remember_rms(self._compute_rms(pcm))

            keyword_index = self.porcupine.process(pcm.tolist())
            if keyword_index >= 0:
                now = time.time()
-                if now - self._last_hit_ts < 0.2:  # Уменьшаем интервал для более быстрой реакции
+                if not self._accept_porcupine_hit(
+                    pcm,
+                    now,
+                    ignore_hit_cooldown=True,
+                    during_tts=True,
+                ):
                    return False
-                self._last_hit_ts = now
                print("🛑 Wake word обнаружен во время ответа!")
                return True
+            if self._check_fallback_wakeword(
+                pcm, during_tts=True, ignore_hit_cooldown=True
+            ):
+                print("🛑 Wake word обнаружен fallback STT во время ответа!")
+                return True
            return False
        except Exception:
            return False