feat: refine assistant logic and update docs
This commit is contained in:
@@ -9,12 +9,26 @@ Listens for the configured wake word.
|
||||
import pvporcupine
|
||||
import pyaudio
|
||||
import struct
|
||||
import io
|
||||
import wave
|
||||
import time
|
||||
import numpy as np
|
||||
import httpx
|
||||
from collections import deque
|
||||
from deepgram import DeepgramClient
|
||||
from deepgram.clients.listen.v1.rest.options import PrerecordedOptions
|
||||
from ..core.config import (
|
||||
DEEPGRAM_API_KEY,
|
||||
PORCUPINE_ACCESS_KEY,
|
||||
PORCUPINE_KEYWORD_PATH,
|
||||
PORCUPINE_SENSITIVITY,
|
||||
WAKEWORD_HIT_COOLDOWN_SECONDS,
|
||||
WAKEWORD_ENABLE_FALLBACK_STT,
|
||||
WAKEWORD_MIN_RMS,
|
||||
WAKEWORD_REOPEN_GRACE_SECONDS,
|
||||
WAKEWORD_RMS_MULTIPLIER,
|
||||
WAKE_WORD,
|
||||
WAKE_WORD_ALIASES,
|
||||
)
|
||||
from ..core.audio_manager import get_audio_manager
|
||||
|
||||
@@ -33,6 +47,19 @@ class WakeWordDetector:
|
||||
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
|
||||
self._stream_closed = True # Флаг состояния потока (закрыт/открыт)
|
||||
self._last_hit_ts = 0.0
|
||||
self._fallback_dg_client = None
|
||||
self._fallback_pre_roll = deque(maxlen=4)
|
||||
self._fallback_frames = []
|
||||
self._fallback_active = False
|
||||
self._fallback_silence_frames = 0
|
||||
self._fallback_last_attempt_ts = 0.0
|
||||
self._fallback_last_error_ts = 0.0
|
||||
self._stream_opened_ts = 0.0
|
||||
self._rms_history = deque(maxlen=220)
|
||||
self._wakeword_aliases_compact = {
|
||||
self._compact_text(WAKE_WORD),
|
||||
*(self._compact_text(alias) for alias in WAKE_WORD_ALIASES),
|
||||
}
|
||||
|
||||
def initialize(self):
|
||||
"""Инициализация Porcupine и PyAudio."""
|
||||
@@ -87,6 +114,211 @@ class WakeWordDetector:
|
||||
)
|
||||
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
|
||||
self._stream_closed = False
|
||||
self._stream_opened_ts = time.time()
|
||||
self._reset_fallback_state()
|
||||
|
||||
@staticmethod
|
||||
def _compute_rms(pcm: np.ndarray) -> float:
|
||||
if pcm.size == 0:
|
||||
return 0.0
|
||||
as_float = pcm.astype(np.float32)
|
||||
return float(np.sqrt(np.mean(as_float * as_float)))
|
||||
|
||||
@staticmethod
|
||||
def _compact_text(text: str) -> str:
|
||||
text = str(text or "").lower().replace("ё", "е")
|
||||
return "".join(ch for ch in text if ch.isalnum())
|
||||
|
||||
def _remember_rms(self, rms: float):
|
||||
if rms <= 0:
|
||||
return
|
||||
self._rms_history.append(float(rms))
|
||||
|
||||
def _noise_floor_rms(self) -> float:
|
||||
if not self._rms_history:
|
||||
return 0.0
|
||||
# Низкий процентиль устойчив к редким всплескам/голосу.
|
||||
return float(np.percentile(np.asarray(self._rms_history, dtype=np.float32), 20))
|
||||
|
||||
def _wakeword_rms_threshold(self) -> float:
    """Return the RMS threshold adapted to the current noise floor.

    The noise floor is scaled by ``WAKEWORD_RMS_MULTIPLIER`` and the result
    is kept between ``WAKEWORD_MIN_RMS`` and four times that value.
    """
    minimum = float(WAKEWORD_MIN_RMS)
    dynamic = self._noise_floor_rms() * float(WAKEWORD_RMS_MULTIPLIER)
    # Upper cap so that a very noisy environment cannot disable detection
    # altogether.
    capped = min(dynamic, minimum * 4.0)
    return max(minimum, capped)
|
||||
|
||||
def _is_hit_in_guard_window(
    self, now_ts: float, *, ignore_hit_cooldown: bool = False
) -> bool:
    """Tell whether a hit at *now_ts* falls inside a suppression window.

    Two windows are checked:

    * the cooldown of ``WAKEWORD_HIT_COOLDOWN_SECONDS`` after the last
      accepted hit (skipped when *ignore_hit_cooldown* is true);
    * the grace period of ``WAKEWORD_REOPEN_GRACE_SECONDS`` after the
      stream was opened (only when an open timestamp is recorded).
    """
    if not ignore_hit_cooldown:
        since_last_hit = now_ts - self._last_hit_ts
        if since_last_hit < float(WAKEWORD_HIT_COOLDOWN_SECONDS):
            return True

    opened_ts = self._stream_opened_ts
    if opened_ts > 0:
        if now_ts - opened_ts < float(WAKEWORD_REOPEN_GRACE_SECONDS):
            return True

    return False
|
||||
|
||||
def _accept_porcupine_hit(
    self,
    pcm: np.ndarray,
    now_ts: float,
    *,
    ignore_hit_cooldown: bool = False,
    during_tts: bool = False,
) -> bool:
    """Decide whether a Porcupine detection should be accepted.

    A hit is rejected when it falls inside a guard window or when the
    frame's RMS is below the amplitude threshold. On acceptance the time
    of the last hit is updated to *now_ts*.
    """
    in_guard_window = self._is_hit_in_guard_window(
        now_ts, ignore_hit_cooldown=ignore_hit_cooldown
    )
    if in_guard_window:
        return False

    frame_rms = self._compute_rms(pcm)
    # Plain Porcupine hits only get a mild amplitude filter: it weeds out
    # silence/clicks and false triggers caused by background noise.
    # While TTS is playing the filter is stricter so that the assistant's
    # own speaker does not wake it up.
    if during_tts:
        strictness = 0.95
    else:
        strictness = 0.75
    min_rms = max(80.0, self._wakeword_rms_threshold() * strictness)
    if frame_rms < min_rms:
        return False

    self._last_hit_ts = now_ts
    return True
|
||||
|
||||
def _reset_fallback_state(self):
|
||||
self._fallback_pre_roll.clear()
|
||||
self._fallback_frames = []
|
||||
self._fallback_active = False
|
||||
self._fallback_silence_frames = 0
|
||||
|
||||
def _get_fallback_client(self):
    """Return the lazily created Deepgram client used by the fallback STT.

    Returns ``None`` when the fallback is disabled via
    ``WAKEWORD_ENABLE_FALLBACK_STT`` or when ``DEEPGRAM_API_KEY`` is not
    set. The client is created on first use and reused afterwards.
    """
    if not (WAKEWORD_ENABLE_FALLBACK_STT and DEEPGRAM_API_KEY):
        return None

    client = self._fallback_dg_client
    if client is None:
        client = DeepgramClient(DEEPGRAM_API_KEY)
        self._fallback_dg_client = client
    return client
|
||||
|
||||
def _pcm_to_wav_bytes(self, pcm: np.ndarray) -> bytes:
|
||||
buffer = io.BytesIO()
|
||||
with wave.open(buffer, "wb") as wav_file:
|
||||
wav_file.setnchannels(1)
|
||||
wav_file.setsampwidth(2)
|
||||
wav_file.setframerate(int(self.porcupine.sample_rate))
|
||||
wav_file.writeframes(np.asarray(pcm, dtype=np.int16).tobytes())
|
||||
return buffer.getvalue()
|
||||
|
||||
def _transcribe_wakeword_candidate(self, pcm: np.ndarray) -> bool:
    """Transcribe an audio candidate and check it against the wake word.

    The samples are sent to Deepgram as a WAV buffer. The candidate counts
    as a wake word only when the compacted transcript is non-empty, equals
    one of the compacted wake-word aliases, and the reported confidence
    (if any) is at least 0.62.

    Args:
        pcm: Candidate audio samples.

    Returns:
        ``True`` when the transcript matches the wake word, ``False``
        otherwise, including when no client is available, the audio is
        empty or the request fails.
    """
    client = self._get_fallback_client()
    if client is None or pcm.size == 0:
        return False

    try:
        response = client.listen.rest.v("1").transcribe_file(
            {"buffer": self._pcm_to_wav_bytes(pcm)},
            PrerecordedOptions(
                model="nova-2",
                language="ru",
                smart_format=False,
                punctuate=False,
                utterances=False,
                numerals=False,
            ),
            timeout=httpx.Timeout(2.2, connect=2.2, read=2.2, write=2.2),
        )
    except Exception as exc:
        # Best effort: a failed request just means "no wake word". The
        # warning is printed at most once every 30 seconds.
        now = time.time()
        if now - self._fallback_last_error_ts >= 30.0:
            print(f"⚠️ Wake word fallback STT failed: {exc}")
            self._fallback_last_error_ts = now
        return False

    transcript = ""
    confidence = None
    try:
        channels = response.results.channels or []
        if channels and channels[0].alternatives:
            first_alt = channels[0].alternatives[0]
            transcript = str(first_alt.transcript or "").strip()
            try:
                confidence = float(first_alt.confidence)
            except Exception:
                confidence = None
    except Exception:
        # Unexpected response shape: treat it as an empty transcript.
        transcript = ""
        confidence = None

    compact = self._compact_text(transcript)
    if not compact:
        # An empty transcript must never count as a hit, even if an alias
        # happens to compact to an empty string.
        return False
    if confidence is not None and confidence < 0.62:
        return False
    if compact in self._wakeword_aliases_compact:
        print(f"✅ Wake word обнаружен fallback STT: {transcript}")
        return True
    return False
|
||||
|
||||
def _check_fallback_wakeword(
    self,
    pcm: np.ndarray,
    *,
    during_tts: bool = False,
    ignore_hit_cooldown: bool = False,
) -> bool:
    """Feed one audio frame to the STT-based fallback wake-word detector.

    Frames are collected from the first loud frame (plus the pre-roll)
    until enough consecutive quiet frames are seen. The collected audio
    is then transcribed, subject to a rate limit, the guard windows and
    an RMS check on the whole candidate.

    Args:
        pcm: The audio frame that was just read.
        during_tts: Use the stricter thresholds and timings.
        ignore_hit_cooldown: Skip the hit cooldown when checking the
            guard window.

    Returns:
        ``True`` when the transcription matched the wake word on this
        call, otherwise ``False``.
    """
    if not WAKEWORD_ENABLE_FALLBACK_STT:
        return False
    if self.porcupine is None:
        return False

    # Convert the frame once; it may be stored in both the capture buffer
    # and the pre-roll.
    frame = np.asarray(pcm, dtype=np.int16)

    rms = self._compute_rms(pcm)
    base_threshold = self._wakeword_rms_threshold()
    speech_factor = 1.1 if during_tts else 0.85
    speech_threshold = max(170.0, base_threshold * speech_factor)
    silence_threshold = max(95.0, speech_threshold * 0.55)
    silence_frames_to_finalize = 10 if during_tts else 8
    min_frames = 10 if during_tts else 7
    max_frames = 40
    min_attempt_interval = 2.5 if during_tts else 1.0

    if rms >= speech_threshold:
        if not self._fallback_active:
            # Start a capture, seeded with the frames just before it.
            self._fallback_active = True
            self._fallback_frames = list(self._fallback_pre_roll)
        # Any loud frame restarts the trailing-silence count, so a short
        # pause in the middle of the phrase does not finalise it early.
        self._fallback_silence_frames = 0
        self._fallback_frames.append(frame)
    elif self._fallback_active:
        self._fallback_frames.append(frame)
        if rms <= silence_threshold:
            self._fallback_silence_frames += 1
        else:
            self._fallback_silence_frames = 0

    if len(self._fallback_frames) > max_frames:
        # Too long to be a wake word: drop the capture.
        self._reset_fallback_state()
    elif self._fallback_silence_frames >= silence_frames_to_finalize:
        if self._fallback_frames:
            candidate = np.concatenate(self._fallback_frames)
        else:
            candidate = np.asarray([], dtype=np.int16)
        self._reset_fallback_state()
        # NOTE(review): finalising already requires the trailing quiet
        # frames, so this length check looks like it always passes when
        # each frame holds frame_length samples - confirm the intent.
        if len(candidate) >= min_frames * int(self.porcupine.frame_length):
            now = time.time()
            candidate_rms = self._compute_rms(candidate)
            candidate_threshold = self._wakeword_rms_threshold() * (
                0.95 if during_tts else 0.75
            )
            candidate_threshold = max(
                float(WAKEWORD_MIN_RMS), candidate_threshold
            )
            if (
                now - self._fallback_last_attempt_ts >= min_attempt_interval
                and not self._is_hit_in_guard_window(
                    now, ignore_hit_cooldown=ignore_hit_cooldown
                )
                and candidate_rms >= candidate_threshold
            ):
                self._fallback_last_attempt_ts = now
                if self._transcribe_wakeword_candidate(candidate):
                    self._last_hit_ts = now
                    return True

    self._fallback_pre_roll.append(frame)
    return False
|
||||
|
||||
def stop_monitoring(self):
|
||||
"""Явная остановка и закрытие потока (чтобы освободить микрофон для других задач)."""
|
||||
@@ -97,6 +329,8 @@ class WakeWordDetector:
|
||||
except Exception:
|
||||
pass
|
||||
self._stream_closed = True
|
||||
self._stream_opened_ts = 0.0
|
||||
self._reset_fallback_state()
|
||||
|
||||
def _resample_to_target_rate(self, pcm: np.ndarray) -> np.ndarray:
|
||||
target_rate = int(self.porcupine.sample_rate)
|
||||
@@ -160,14 +394,20 @@ class WakeWordDetector:
|
||||
|
||||
# Читаем небольшой кусочек аудио (frame)
|
||||
pcm = self._read_porcupine_frame()
|
||||
self._remember_rms(self._compute_rms(pcm))
|
||||
|
||||
# Обрабатываем фрейм через Porcupine
|
||||
keyword_index = self.porcupine.process(pcm.tolist())
|
||||
|
||||
# Если keyword_index >= 0, значит ключевое слово обнаружено
|
||||
if keyword_index >= 0:
|
||||
print("✅ Wake word обнаружен!")
|
||||
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
|
||||
now = time.time()
|
||||
if self._accept_porcupine_hit(pcm, now, during_tts=False):
|
||||
print("✅ Wake word обнаружен!")
|
||||
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
|
||||
self.stop_monitoring()
|
||||
return True
|
||||
if self._check_fallback_wakeword(pcm):
|
||||
self.stop_monitoring()
|
||||
return True
|
||||
|
||||
@@ -189,15 +429,25 @@ class WakeWordDetector:
|
||||
self._open_stream()
|
||||
|
||||
pcm = self._read_porcupine_frame()
|
||||
self._remember_rms(self._compute_rms(pcm))
|
||||
|
||||
keyword_index = self.porcupine.process(pcm.tolist())
|
||||
if keyword_index >= 0:
|
||||
now = time.time()
|
||||
if now - self._last_hit_ts < 0.2: # Уменьшаем интервал для более быстрой реакции
|
||||
if not self._accept_porcupine_hit(
|
||||
pcm,
|
||||
now,
|
||||
ignore_hit_cooldown=True,
|
||||
during_tts=True,
|
||||
):
|
||||
return False
|
||||
self._last_hit_ts = now
|
||||
print("🛑 Wake word обнаружен во время ответа!")
|
||||
return True
|
||||
if self._check_fallback_wakeword(
|
||||
pcm, during_tts=True, ignore_hit_cooldown=True
|
||||
):
|
||||
print("🛑 Wake word обнаружен fallback STT во время ответа!")
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user