feat: refine assistant logic and update docs

This commit is contained in:
future
2026-04-09 21:03:02 +03:00
parent ebe79c3692
commit 42c064a274
19 changed files with 1958 additions and 492 deletions

View File

@@ -9,12 +9,26 @@ Listens for the configured wake word.
import pvporcupine
import pyaudio
import struct
import io
import wave
import time
import numpy as np
import httpx
from collections import deque
from deepgram import DeepgramClient
from deepgram.clients.listen.v1.rest.options import PrerecordedOptions
from ..core.config import (
DEEPGRAM_API_KEY,
PORCUPINE_ACCESS_KEY,
PORCUPINE_KEYWORD_PATH,
PORCUPINE_SENSITIVITY,
WAKEWORD_HIT_COOLDOWN_SECONDS,
WAKEWORD_ENABLE_FALLBACK_STT,
WAKEWORD_MIN_RMS,
WAKEWORD_REOPEN_GRACE_SECONDS,
WAKEWORD_RMS_MULTIPLIER,
WAKE_WORD,
WAKE_WORD_ALIASES,
)
from ..core.audio_manager import get_audio_manager
@@ -33,6 +47,19 @@ class WakeWordDetector:
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
self._stream_closed = True # Флаг состояния потока (закрыт/открыт)
self._last_hit_ts = 0.0
self._fallback_dg_client = None
self._fallback_pre_roll = deque(maxlen=4)
self._fallback_frames = []
self._fallback_active = False
self._fallback_silence_frames = 0
self._fallback_last_attempt_ts = 0.0
self._fallback_last_error_ts = 0.0
self._stream_opened_ts = 0.0
self._rms_history = deque(maxlen=220)
self._wakeword_aliases_compact = {
self._compact_text(WAKE_WORD),
*(self._compact_text(alias) for alias in WAKE_WORD_ALIASES),
}
def initialize(self):
"""Инициализация Porcupine и PyAudio."""
@@ -87,6 +114,211 @@ class WakeWordDetector:
)
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
self._stream_closed = False
self._stream_opened_ts = time.time()
self._reset_fallback_state()
@staticmethod
def _compute_rms(pcm: np.ndarray) -> float:
if pcm.size == 0:
return 0.0
as_float = pcm.astype(np.float32)
return float(np.sqrt(np.mean(as_float * as_float)))
@staticmethod
def _compact_text(text: str) -> str:
text = str(text or "").lower().replace("ё", "е")
return "".join(ch for ch in text if ch.isalnum())
def _remember_rms(self, rms: float):
if rms <= 0:
return
self._rms_history.append(float(rms))
def _noise_floor_rms(self) -> float:
if not self._rms_history:
return 0.0
# Низкий процентиль устойчив к редким всплескам/голосу.
return float(np.percentile(np.asarray(self._rms_history, dtype=np.float32), 20))
def _wakeword_rms_threshold(self) -> float:
    """Return the RMS threshold used to gate wake-word candidates.

    The noise floor is scaled by WAKEWORD_RMS_MULTIPLIER, then clamped to
    the range [WAKEWORD_MIN_RMS, 4 * WAKEWORD_MIN_RMS].
    """
    scaled = self._noise_floor_rms() * float(WAKEWORD_RMS_MULTIPLIER)
    # Protective cap so that a very noisy environment does not "kill"
    # detection entirely.
    capped = min(scaled, float(WAKEWORD_MIN_RMS) * 4.0)
    lower_bound = float(WAKEWORD_MIN_RMS)
    return max(lower_bound, capped)
def _is_hit_in_guard_window(
    self, now_ts: float, *, ignore_hit_cooldown: bool = False
) -> bool:
    """Tell whether a hit at ``now_ts`` falls inside a suppression window.

    Two windows are checked:

    * the cooldown of WAKEWORD_HIT_COOLDOWN_SECONDS after the last accepted
      hit (skipped when ``ignore_hit_cooldown`` is true);
    * the grace period of WAKEWORD_REOPEN_GRACE_SECONDS after the stream
      was opened (only when an open timestamp is recorded, i.e. > 0).

    Returns:
        True if the hit must be suppressed, False otherwise.
    """
    if not ignore_hit_cooldown:
        since_last_hit = now_ts - self._last_hit_ts
        if since_last_hit < float(WAKEWORD_HIT_COOLDOWN_SECONDS):
            return True

    opened_ts = self._stream_opened_ts
    if opened_ts > 0:
        since_open = now_ts - opened_ts
        if since_open < float(WAKEWORD_REOPEN_GRACE_SECONDS):
            return True

    return False
def _accept_porcupine_hit(
    self,
    pcm: np.ndarray,
    now_ts: float,
    *,
    ignore_hit_cooldown: bool = False,
    during_tts: bool = False,
) -> bool:
    """Decide whether a Porcupine detection should be accepted.

    Args:
        pcm: The frame on which Porcupine reported the keyword.
        now_ts: Timestamp of the detection.
        ignore_hit_cooldown: Forwarded to the guard-window check to skip
            the post-hit cooldown.
        during_tts: Use the stricter amplitude factor (0.95 instead of
            0.75).

    Returns:
        True when the hit is accepted; ``_last_hit_ts`` is then set to
        ``now_ts``. False when the hit is inside a guard window or the
        frame is quieter than the amplitude threshold.
    """
    suppressed = self._is_hit_in_guard_window(
        now_ts, ignore_hit_cooldown=ignore_hit_cooldown
    )
    if suppressed:
        return False

    level = self._compute_rms(pcm)
    # For "pure" Porcupine a soft amplitude filter is kept: it weeds out
    # silence/clicks and false triggers caused by background noise.
    # During TTS the filter is stricter so that the assistant's own
    # speaker does not wake it up.
    strictness = 0.95 if during_tts else 0.75
    min_level = max(80.0, self._wakeword_rms_threshold() * strictness)

    too_quiet = level < min_level
    if not too_quiet:
        self._last_hit_ts = now_ts
    return not too_quiet
def _reset_fallback_state(self):
self._fallback_pre_roll.clear()
self._fallback_frames = []
self._fallback_active = False
self._fallback_silence_frames = 0
def _get_fallback_client(self):
    """Return the Deepgram client used for fallback STT, creating it lazily.

    Returns:
        None when WAKEWORD_ENABLE_FALLBACK_STT is falsy or DEEPGRAM_API_KEY
        is empty; otherwise the cached ``DeepgramClient``, constructed on
        first use.
    """
    fallback_available = WAKEWORD_ENABLE_FALLBACK_STT and DEEPGRAM_API_KEY
    if not fallback_available:
        return None

    client = self._fallback_dg_client
    if client is None:
        client = DeepgramClient(DEEPGRAM_API_KEY)
        self._fallback_dg_client = client
    return client
def _pcm_to_wav_bytes(self, pcm: np.ndarray) -> bytes:
buffer = io.BytesIO()
with wave.open(buffer, "wb") as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2)
wav_file.setframerate(int(self.porcupine.sample_rate))
wav_file.writeframes(np.asarray(pcm, dtype=np.int16).tobytes())
return buffer.getvalue()
def _transcribe_wakeword_candidate(self, pcm: np.ndarray) -> bool:
    """Ask the fallback STT service whether ``pcm`` contains the wake word.

    The audio is wrapped into WAV, sent to Deepgram's prerecorded endpoint
    (model "nova-2", language "ru", 2.2 s timeouts), and the first
    alternative of the first channel is compared against the known
    wake-word aliases after compaction.

    Args:
        pcm: Captured candidate audio.

    Returns:
        True only when a non-empty transcript, with confidence either
        unavailable or >= 0.62, matches one of the compacted aliases.
        False when no client is available, the audio is empty, the request
        fails, or the transcript is empty / does not match.
    """
    client = self._get_fallback_client()
    if client is None or pcm.size == 0:
        return False

    try:
        response = client.listen.rest.v("1").transcribe_file(
            {"buffer": self._pcm_to_wav_bytes(pcm)},
            PrerecordedOptions(
                model="nova-2",
                language="ru",
                smart_format=False,
                punctuate=False,
                utterances=False,
                numerals=False,
            ),
            timeout=httpx.Timeout(2.2, connect=2.2, read=2.2, write=2.2),
        )
    except Exception as exc:
        # The fallback is best-effort: a failed request must never break
        # the listening loop. The timestamp only advances when a message
        # is printed, so persistent failures are reported once per 30 s.
        now = time.time()
        if now - self._fallback_last_error_ts >= 30.0:
            print(f"⚠️ Wake word fallback STT failed: {exc}")
            self._fallback_last_error_ts = now
        return False

    transcript = ""
    confidence = None
    try:
        channels = response.results.channels or []
        if channels and channels[0].alternatives:
            first_alt = channels[0].alternatives[0]
            transcript = str(first_alt.transcript or "").strip()
            try:
                confidence = float(first_alt.confidence)
            except Exception:
                # Missing or non-numeric confidence: treat as unknown.
                confidence = None
    except Exception:
        # Unexpected response shape: treat it as "nothing recognised".
        transcript = ""
        confidence = None

    compact = self._compact_text(transcript)
    if not compact:
        # Silence or an unparseable response must never count as a hit,
        # even if a misconfigured alias compacts to an empty string.
        return False
    if confidence is not None and confidence < 0.62:
        return False
    if compact in self._wakeword_aliases_compact:
        print(f"✅ Wake word обнаружен fallback STT: {transcript}")
        return True
    return False
def _check_fallback_wakeword(
self,
pcm: np.ndarray,
*,
during_tts: bool = False,
ignore_hit_cooldown: bool = False,
) -> bool:
"""Feed one audio frame to the STT-based fallback wake-word detector.

Frames whose RMS reaches the speech threshold start (or continue) a
capture seeded from the pre-roll buffer. Once enough trailing quiet
frames have been seen, the captured audio is concatenated and, if it
passes the length, rate-limit, guard-window and loudness checks, sent to
``_transcribe_wakeword_candidate``.

Args:
    pcm: The current audio frame.
    during_tts: Selects the stricter set of thresholds and intervals.
    ignore_hit_cooldown: Forwarded to ``_is_hit_in_guard_window``.

Returns:
    True when the transcription confirms the wake word (``_last_hit_ts``
    is then updated); False otherwise, including when the fallback is
    disabled or Porcupine is not initialised.

NOTE(review): indentation is lost in this view, so the exact nesting of
the statements below cannot be confirmed from here.
"""
if not WAKEWORD_ENABLE_FALLBACK_STT:
return False
if self.porcupine is None:
return False
rms = self._compute_rms(pcm)
base_threshold = self._wakeword_rms_threshold()
# Tunables; every value is stricter (or slower) while TTS is playing.
speech_factor = 1.1 if during_tts else 0.85
speech_threshold = max(170.0, base_threshold * speech_factor)
silence_threshold = max(95.0, speech_threshold * 0.55)
silence_frames_to_finalize = 10 if during_tts else 8
min_frames = 10 if during_tts else 7
max_frames = 40
min_attempt_interval = 2.5 if during_tts else 1.0
# Loud frame: start a capture (seeded with the pre-roll) or extend the
# current one.
if rms >= speech_threshold:
if not self._fallback_active:
self._fallback_active = True
self._fallback_frames = list(self._fallback_pre_roll)
# NOTE(review): unclear whether this reset belongs to the "capture just
# started" branch or runs for every loud frame - confirm in the real file.
self._fallback_silence_frames = 0
self._fallback_frames.append(np.asarray(pcm, dtype=np.int16))
# Frame below the speech threshold while a capture is in progress: keep it
# and track the run of frames at or below the silence threshold.
elif self._fallback_active:
self._fallback_frames.append(np.asarray(pcm, dtype=np.int16))
if rms <= silence_threshold:
self._fallback_silence_frames += 1
else:
self._fallback_silence_frames = 0
# NOTE(review): presumably nested under the `elif` above; if so, the length
# cap is not checked on loud frames - verify against the original file.
# Over-long capture: discard it.
if len(self._fallback_frames) > max_frames:
self._reset_fallback_state()
# Enough trailing quiet frames: finalise the capture into one candidate.
elif self._fallback_silence_frames >= silence_frames_to_finalize:
candidate = np.concatenate(self._fallback_frames) if self._fallback_frames else np.asarray([], dtype=np.int16)
self._reset_fallback_state()
# Candidates shorter than min_frames Porcupine frames are dropped.
if len(candidate) >= min_frames * int(self.porcupine.frame_length):
now = time.time()
candidate_rms = self._compute_rms(candidate)
candidate_threshold = self._wakeword_rms_threshold() * (
0.95 if during_tts else 0.75
)
candidate_threshold = max(float(WAKEWORD_MIN_RMS), candidate_threshold)
# Transcribe only if the attempt rate limit has elapsed, the hit is
# outside the guard windows and the whole candidate is loud enough.
if (
now - self._fallback_last_attempt_ts >= min_attempt_interval
and not self._is_hit_in_guard_window(
now, ignore_hit_cooldown=ignore_hit_cooldown
)
and candidate_rms >= candidate_threshold
):
# The attempt timestamp is recorded before the request, so failed or
# non-matching attempts also count toward the rate limit.
self._fallback_last_attempt_ts = now
if self._transcribe_wakeword_candidate(candidate):
self._last_hit_ts = now
return True
# Remember this frame as pre-roll for a future capture.
self._fallback_pre_roll.append(np.asarray(pcm, dtype=np.int16))
return False
def stop_monitoring(self):
"""Явная остановка и закрытие потока (чтобы освободить микрофон для других задач)."""
@@ -97,6 +329,8 @@ class WakeWordDetector:
except Exception:
pass
self._stream_closed = True
self._stream_opened_ts = 0.0
self._reset_fallback_state()
def _resample_to_target_rate(self, pcm: np.ndarray) -> np.ndarray:
target_rate = int(self.porcupine.sample_rate)
@@ -160,14 +394,20 @@ class WakeWordDetector:
# Читаем небольшой кусочек аудио (frame)
pcm = self._read_porcupine_frame()
self._remember_rms(self._compute_rms(pcm))
# Обрабатываем фрейм через Porcupine
keyword_index = self.porcupine.process(pcm.tolist())
# Если keyword_index >= 0, значит ключевое слово обнаружено
if keyword_index >= 0:
print("✅ Wake word обнаружен!")
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
now = time.time()
if self._accept_porcupine_hit(pcm, now, during_tts=False):
print("✅ Wake word обнаружен!")
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
self.stop_monitoring()
return True
if self._check_fallback_wakeword(pcm):
self.stop_monitoring()
return True
@@ -189,15 +429,25 @@ class WakeWordDetector:
self._open_stream()
pcm = self._read_porcupine_frame()
self._remember_rms(self._compute_rms(pcm))
keyword_index = self.porcupine.process(pcm.tolist())
if keyword_index >= 0:
now = time.time()
if now - self._last_hit_ts < 0.2: # Уменьшаем интервал для более быстрой реакции
if not self._accept_porcupine_hit(
pcm,
now,
ignore_hit_cooldown=True,
during_tts=True,
):
return False
self._last_hit_ts = now
print("🛑 Wake word обнаружен во время ответа!")
return True
if self._check_fallback_wakeword(
pcm, during_tts=True, ignore_hit_cooldown=True
):
print("🛑 Wake word обнаружен fallback STT во время ответа!")
return True
return False
except Exception:
return False