feat: refine assistant logic and update docs
This commit is contained in:
@@ -11,20 +11,57 @@ import re
|
||||
import platform
|
||||
from ..core.roman import replace_roman_numerals
|
||||
|
||||
try:
|
||||
import pymorphy3
|
||||
|
||||
_MORPH = pymorphy3.MorphAnalyzer()
|
||||
except Exception:
|
||||
_MORPH = None
|
||||
|
||||
# Карта для перевода слов в цифры ("пять" -> 5)
|
||||
NUMBER_MAP = {
|
||||
"ноль": 0,
|
||||
"один": 1,
|
||||
"одна": 1,
|
||||
"раз": 1,
|
||||
"единица": 1,
|
||||
"единичка": 1,
|
||||
"два": 2,
|
||||
"две": 2,
|
||||
"двойка": 2,
|
||||
"двоечка": 2,
|
||||
"три": 3,
|
||||
"тройка": 3,
|
||||
"троечка": 3,
|
||||
"четыре": 4,
|
||||
"четверка": 4,
|
||||
"четверочка": 4,
|
||||
"пять": 5,
|
||||
"пятерка": 5,
|
||||
"пятерочка": 5,
|
||||
"шесть": 6,
|
||||
"шестерка": 6,
|
||||
"шестерочка": 6,
|
||||
"семь": 7,
|
||||
"семерка": 7,
|
||||
"семерочка": 7,
|
||||
"восемь": 8,
|
||||
"восьмерка": 8,
|
||||
"восьмерочка": 8,
|
||||
"девять": 9,
|
||||
"девятка": 9,
|
||||
"девяточка": 9,
|
||||
"десять": 10,
|
||||
"десятка": 10,
|
||||
"десяточка": 10,
|
||||
}
|
||||
_VOLUME_COMMAND_RE = re.compile(r"\b(громкост\w*|звук\w*|volume)\b")
|
||||
|
||||
|
||||
def _lemmatize(token: str) -> str:
|
||||
if _MORPH is None:
|
||||
return token
|
||||
return _MORPH.parse(token)[0].normal_form.replace("ё", "е")
|
||||
|
||||
|
||||
def _get_volume_command(level: int):
|
||||
@@ -149,16 +186,25 @@ def parse_volume_text(text: str) -> int | None:
|
||||
Пытается найти число громкости в тексте.
|
||||
Понимает и цифры ("5"), и слова ("пять").
|
||||
"""
|
||||
text = replace_roman_numerals(text.lower())
|
||||
text = replace_roman_numerals(text.lower().replace("ё", "е"))
|
||||
|
||||
# 1. Ищем цифры (1-10)
|
||||
num_match = re.search(r"\b(10|[1-9])\b", text)
|
||||
if num_match:
|
||||
return int(num_match.group())
|
||||
# 1. Ищем цифры в любом месте фразы.
|
||||
for match in re.finditer(r"\d+", text):
|
||||
value = int(match.group())
|
||||
if 1 <= value <= 10:
|
||||
return value
|
||||
|
||||
# 2. Ищем слова из словаря
|
||||
for word, value in NUMBER_MAP.items():
|
||||
if word in text:
|
||||
# 2. Ищем числительные и разговорные формы по леммам:
|
||||
# "семерку", "десяточку", "на двух" -> 7, 10, 2.
|
||||
for token in re.findall(r"[a-zA-Zа-яА-ЯёЁ]+", text):
|
||||
value = NUMBER_MAP.get(_lemmatize(token))
|
||||
if value is not None and 1 <= value <= 10:
|
||||
return value
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def is_volume_command(text: str) -> bool:
|
||||
if not text:
|
||||
return False
|
||||
return bool(_VOLUME_COMMAND_RE.search(text.lower().replace("ё", "е")))
|
||||
|
||||
120
app/audio/stt.py
120
app/audio/stt.py
@@ -8,14 +8,13 @@ Supports Russian (default) and English.
|
||||
# Использует Deepgram API через веб-сокеты для потокового распознавания в реальном времени.
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
import pyaudio
|
||||
import logging
|
||||
import contextlib
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE, WAKE_WORD_ALIASES
|
||||
from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE
|
||||
from deepgram import (
|
||||
DeepgramClient,
|
||||
DeepgramClientOptions,
|
||||
@@ -25,13 +24,14 @@ from deepgram import (
|
||||
import deepgram.clients.common.v1.abstract_sync_websocket as sdk_ws
|
||||
import websockets.sync.client
|
||||
from ..core.audio_manager import get_audio_manager
|
||||
from ..core.commands import is_fast_command
|
||||
|
||||
# --- Патч (исправление) для библиотеки websockets ---
|
||||
# Явно задаём таймауты подключения, чтобы не зависать на долгом handshake.
|
||||
_original_connect = websockets.sync.client.connect
|
||||
|
||||
DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 4.0
|
||||
DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 5.0
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS = 6.5
|
||||
DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
|
||||
SENDER_STOP_WAIT_SECONDS = 2.5
|
||||
SENDER_FORCE_RELEASE_WAIT_SECONDS = 2.5
|
||||
@@ -62,28 +62,6 @@ POST_SPEECH_SILENCE_TIMEOUT_SECONDS = 2.0
|
||||
# Фактическое завершение происходит примерно после 2.0 сек тишины после речи.
|
||||
MAX_ACTIVE_SPEECH_SECONDS = 300.0
|
||||
|
||||
_FAST_STOP_UTTERANCE_RE = re.compile(
|
||||
r"^(?:(?:" + "|".join(re.escape(alias) for alias in WAKE_WORD_ALIASES) + r")\s+)?"
|
||||
r"(?:стоп|хватит|перестань|прекрати|замолчи|тихо|пауза)"
|
||||
r"(?:\s+(?:пожалуйста|please))?$",
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _normalize_command_text(text: str) -> str:
|
||||
normalized = text.lower().replace("ё", "е")
|
||||
normalized = re.sub(r"[^\w\s]+", " ", normalized, flags=re.UNICODE)
|
||||
normalized = re.sub(r"\s+", " ", normalized, flags=re.UNICODE).strip()
|
||||
return normalized
|
||||
|
||||
|
||||
def _is_fast_stop_utterance(text: str) -> bool:
|
||||
normalized = _normalize_command_text(text)
|
||||
if not normalized:
|
||||
return False
|
||||
return _FAST_STOP_UTTERANCE_RE.fullmatch(normalized) is not None
|
||||
|
||||
|
||||
class SpeechRecognizer:
|
||||
"""Класс распознавания речи через Deepgram."""
|
||||
|
||||
@@ -280,7 +258,7 @@ class SpeechRecognizer:
|
||||
dg_connection: Активное соединение с Deepgram.
|
||||
timeout_seconds: Аварийный лимит длительности активной речи.
|
||||
detection_timeout: Время ожидания начала речи.
|
||||
fast_stop: Если True, короткая стоп-фраза завершает STT после 1с тишины.
|
||||
fast_stop: Если True, короткие системные команды завершают STT раньше.
|
||||
"""
|
||||
self.transcript = ""
|
||||
transcript_parts = []
|
||||
@@ -296,6 +274,8 @@ class SpeechRecognizer:
|
||||
# События для синхронизации
|
||||
stop_event = asyncio.Event() # Пора останавливаться
|
||||
speech_started_event = asyncio.Event() # Речь обнаружена (VAD)
|
||||
connection_ready_event = threading.Event() # WS с Deepgram готов
|
||||
connection_failed_event = threading.Event() # WS с Deepgram завершился ошибкой
|
||||
last_speech_activity = time.monotonic()
|
||||
first_speech_activity_at = None
|
||||
session_error = {"message": None}
|
||||
@@ -338,14 +318,13 @@ class SpeechRecognizer:
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
if fast_stop:
|
||||
if _is_fast_stop_utterance(sentence):
|
||||
self.transcript = sentence
|
||||
try:
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
return
|
||||
if fast_stop and is_fast_command(sentence):
|
||||
self.transcript = sentence
|
||||
try:
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
except RuntimeError:
|
||||
pass
|
||||
return
|
||||
|
||||
if result.is_final:
|
||||
# Собираем только финальные (подтвержденные) фразы
|
||||
@@ -470,6 +449,7 @@ class SpeechRecognizer:
|
||||
print(
|
||||
f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
|
||||
)
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
@@ -479,15 +459,18 @@ class SpeechRecognizer:
|
||||
f"Failed to start Deepgram connection: {connect_result['error']}"
|
||||
)
|
||||
print(f"Failed to start Deepgram connection: {connect_result['error']}")
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
if connect_result["ok"] is False:
|
||||
mark_session_error("Failed to start Deepgram connection")
|
||||
print("Failed to start Deepgram connection")
|
||||
connection_failed_event.set()
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
return
|
||||
|
||||
connection_ready_event.set()
|
||||
print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")
|
||||
|
||||
# 3. Отправляем накопленный буфер
|
||||
@@ -522,6 +505,7 @@ class SpeechRecognizer:
|
||||
except Exception as e:
|
||||
mark_session_error(f"Audio send error: {e}")
|
||||
print(f"Audio send error: {e}")
|
||||
connection_failed_event.set()
|
||||
with contextlib.suppress(RuntimeError):
|
||||
loop.call_soon_threadsafe(request_stop)
|
||||
finally:
|
||||
@@ -551,26 +535,56 @@ class SpeechRecognizer:
|
||||
and effective_detection_timeout > 0
|
||||
and not stop_event.is_set()
|
||||
):
|
||||
speech_wait_task = asyncio.create_task(speech_started_event.wait())
|
||||
stop_wait_task = asyncio.create_task(stop_event.wait())
|
||||
try:
|
||||
done, pending = await asyncio.wait(
|
||||
{speech_wait_task, stop_wait_task},
|
||||
timeout=effective_detection_timeout,
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
finally:
|
||||
for task in (speech_wait_task, stop_wait_task):
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
await asyncio.gather(
|
||||
speech_wait_task, stop_wait_task, return_exceptions=True
|
||||
)
|
||||
# Важно: не считаем пользователя "молчаливым", пока WS-соединение
|
||||
# с Deepgram еще не поднялось.
|
||||
connect_ready_deadline = time.monotonic() + max(
|
||||
effective_detection_timeout + 0.25,
|
||||
DEEPGRAM_CONNECT_WAIT_SECONDS + 0.75,
|
||||
)
|
||||
while (
|
||||
not stop_event.is_set()
|
||||
and not connection_ready_event.is_set()
|
||||
and time.monotonic() < connect_ready_deadline
|
||||
):
|
||||
if connection_failed_event.is_set():
|
||||
break
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
if not done:
|
||||
# Если за detection_timeout никто не начал говорить, выходим
|
||||
if (
|
||||
not stop_event.is_set()
|
||||
and not connection_ready_event.is_set()
|
||||
and not connection_failed_event.is_set()
|
||||
):
|
||||
mark_session_error("Deepgram connection was not ready before speech timeout.")
|
||||
request_stop()
|
||||
|
||||
if (
|
||||
stop_event.is_set()
|
||||
or connection_failed_event.is_set()
|
||||
or not connection_ready_event.is_set()
|
||||
):
|
||||
request_stop()
|
||||
else:
|
||||
speech_wait_task = asyncio.create_task(speech_started_event.wait())
|
||||
stop_wait_task = asyncio.create_task(stop_event.wait())
|
||||
try:
|
||||
done, pending = await asyncio.wait(
|
||||
{speech_wait_task, stop_wait_task},
|
||||
timeout=effective_detection_timeout,
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
finally:
|
||||
for task in (speech_wait_task, stop_wait_task):
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
await asyncio.gather(
|
||||
speech_wait_task, stop_wait_task, return_exceptions=True
|
||||
)
|
||||
|
||||
if not done:
|
||||
# Если за detection_timeout после поднятия WS никто не начал говорить, выходим.
|
||||
request_stop()
|
||||
|
||||
# 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
|
||||
# Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
|
||||
if not stop_event.is_set():
|
||||
@@ -687,7 +701,7 @@ class SpeechRecognizer:
|
||||
timeout_seconds: Защитный лимит длительности активной речи.
|
||||
detection_timeout: Сколько ждать начала речи перед тем как сдаться.
|
||||
lang: Язык ("ru" или "en").
|
||||
fast_stop: Быстрое завершение для коротких stop-команд.
|
||||
fast_stop: Быстрое завершение для коротких системных команд.
|
||||
"""
|
||||
if not self.dg_client:
|
||||
self.initialize()
|
||||
|
||||
176
app/audio/tts.py
176
app/audio/tts.py
@@ -19,12 +19,14 @@ import sounddevice as sd
|
||||
import torch
|
||||
|
||||
from ..core.audio_manager import get_audio_manager
|
||||
from ..core.config import TTS_EN_SPEAKER, TTS_SAMPLE_RATE, TTS_SPEAKER
|
||||
from ..core.config import TTS_EN_SPEAKER, TTS_SAMPLE_RATE, TTS_SPEAKER, TTS_SPEED
|
||||
|
||||
# Подавляем предупреждения Silero о длинном тексте (мы сами его режем)
|
||||
warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
|
||||
|
||||
_EN_WORD_RE = re.compile(r"[A-Za-z][A-Za-z0-9'-]*")
|
||||
_MIXED_TTS_BUFFERED_SWITCHES = 3
|
||||
_INTERRUPT_POLL_SECONDS = 0.01
|
||||
|
||||
|
||||
class TextToSpeech:
|
||||
@@ -34,6 +36,7 @@ class TextToSpeech:
|
||||
self.model_ru = None
|
||||
self.model_en = None
|
||||
self.sample_rate = TTS_SAMPLE_RATE
|
||||
self.speed_factor = float(TTS_SPEED)
|
||||
self.speaker_ru = TTS_SPEAKER
|
||||
self.speaker_en = TTS_EN_SPEAKER
|
||||
self._interrupted = False
|
||||
@@ -41,6 +44,23 @@ class TextToSpeech:
|
||||
self._audio_manager = None
|
||||
self._output_device_index = None
|
||||
|
||||
def _apply_speed(self, audio_np: np.ndarray) -> np.ndarray:
|
||||
"""Применяет небольшой time-stretch без изменения остальной логики TTS."""
|
||||
audio = np.asarray(audio_np, dtype=np.float32)
|
||||
if audio.size == 0:
|
||||
return audio
|
||||
|
||||
speed = max(0.85, min(1.15, float(self.speed_factor)))
|
||||
if abs(speed - 1.0) < 0.01:
|
||||
return audio
|
||||
|
||||
# speed < 1.0 -> медленнее (длина массива больше), speed > 1.0 -> быстрее.
|
||||
target_length = max(1, int(round(audio.size / speed)))
|
||||
x_old = np.arange(audio.size, dtype=np.float32)
|
||||
x_new = np.linspace(0.0, float(max(0, audio.size - 1)), target_length)
|
||||
stretched = np.interp(x_new, x_old, audio)
|
||||
return np.asarray(stretched, dtype=np.float32)
|
||||
|
||||
def _load_model(self, language: str):
|
||||
"""
|
||||
Загрузка и кэширование модели Silero TTS.
|
||||
@@ -52,21 +72,12 @@ class TextToSpeech:
|
||||
if self.model_en:
|
||||
return self.model_en
|
||||
print("📦 Загрузка модели Silero TTS (en)...")
|
||||
try:
|
||||
model, _ = torch.hub.load(
|
||||
repo_or_dir="snakers4/silero-models",
|
||||
model="silero_tts",
|
||||
language="en",
|
||||
speaker="v5_en",
|
||||
)
|
||||
except Exception as exc:
|
||||
print(f"⚠️ Не удалось загрузить v5_en, пробую v3_en: {exc}")
|
||||
model, _ = torch.hub.load(
|
||||
repo_or_dir="snakers4/silero-models",
|
||||
model="silero_tts",
|
||||
language="en",
|
||||
speaker="v3_en",
|
||||
)
|
||||
model, _ = torch.hub.load(
|
||||
repo_or_dir="snakers4/silero-models",
|
||||
model="silero_tts",
|
||||
language="en",
|
||||
speaker="v3_en",
|
||||
)
|
||||
model.to(device)
|
||||
self.model_en = model
|
||||
return model
|
||||
@@ -185,28 +196,7 @@ class TextToSpeech:
|
||||
if not text.strip():
|
||||
return True
|
||||
|
||||
# Выбор модели
|
||||
if language == "en":
|
||||
model = self._load_model("en")
|
||||
speaker = self.speaker_en
|
||||
else:
|
||||
model = self._load_model("ru")
|
||||
speaker = self.speaker_ru
|
||||
|
||||
# Проверка наличия спикера в модели (защита от ошибок конфига).
|
||||
# Для русского языка сохраняем мужской голос по умолчанию.
|
||||
if hasattr(model, "speakers") and model.speakers:
|
||||
if language == "ru":
|
||||
male_speakers = ("eugene", "aidar")
|
||||
if speaker not in model.speakers or speaker not in male_speakers:
|
||||
for candidate in male_speakers:
|
||||
if candidate in model.speakers:
|
||||
speaker = candidate
|
||||
break
|
||||
else:
|
||||
speaker = model.speakers[0]
|
||||
elif speaker not in model.speakers:
|
||||
speaker = model.speakers[0]
|
||||
model, speaker = self._get_model_and_speaker(language)
|
||||
|
||||
# Разбиваем текст на куски
|
||||
chunks = self._split_text(text)
|
||||
@@ -233,7 +223,7 @@ class TextToSpeech:
|
||||
)
|
||||
|
||||
# Конвертация в numpy массив для sounddevice
|
||||
audio_np = audio.numpy()
|
||||
audio_np = self._apply_speed(audio.numpy())
|
||||
|
||||
if check_interrupt:
|
||||
if not self._play_audio_with_interrupt(audio_np, check_interrupt):
|
||||
@@ -256,10 +246,104 @@ class TextToSpeech:
|
||||
else:
|
||||
return False
|
||||
|
||||
def _get_model_and_speaker(self, language: str):
|
||||
"""Возвращает модель и подходящий голос для языка."""
|
||||
# Выбор модели
|
||||
if language == "en":
|
||||
model = self._load_model("en")
|
||||
speaker = self.speaker_en
|
||||
else:
|
||||
model = self._load_model("ru")
|
||||
speaker = self.speaker_ru
|
||||
|
||||
# Проверка наличия спикера в модели (защита от ошибок конфига).
|
||||
# Для русского языка сохраняем мужской голос по умолчанию.
|
||||
if hasattr(model, "speakers") and model.speakers:
|
||||
if language == "ru":
|
||||
male_speakers = ("eugene", "aidar")
|
||||
if speaker not in model.speakers or speaker not in male_speakers:
|
||||
for candidate in male_speakers:
|
||||
if candidate in model.speakers:
|
||||
speaker = candidate
|
||||
break
|
||||
else:
|
||||
speaker = model.speakers[0]
|
||||
elif speaker not in model.speakers:
|
||||
speaker = model.speakers[0]
|
||||
|
||||
return model, speaker
|
||||
|
||||
def _synthesize_language_audio(self, text: str, language: str) -> np.ndarray | None:
|
||||
"""Собирает аудио для одного языка без промежуточного воспроизведения."""
|
||||
if not text.strip():
|
||||
return np.asarray([], dtype=np.float32)
|
||||
|
||||
model, speaker = self._get_model_and_speaker(language)
|
||||
chunks = self._split_text(text)
|
||||
audio_parts = []
|
||||
|
||||
for chunk in chunks:
|
||||
if self._interrupted:
|
||||
return None
|
||||
audio = model.apply_tts(text=chunk, speaker=speaker, sample_rate=self.sample_rate)
|
||||
audio_parts.append(self._apply_speed(audio.numpy()))
|
||||
|
||||
if not audio_parts:
|
||||
return np.asarray([], dtype=np.float32)
|
||||
|
||||
return np.concatenate(audio_parts)
|
||||
|
||||
def _count_language_switches(self, segments: list[tuple[str, str]]) -> int:
|
||||
if len(segments) < 2:
|
||||
return 0
|
||||
return sum(
|
||||
1
|
||||
for idx in range(1, len(segments))
|
||||
if segments[idx - 1][1] != segments[idx][1]
|
||||
)
|
||||
|
||||
def _speak_mixed_buffered(
|
||||
self, segments: list[tuple[str, str]], check_interrupt=None
|
||||
) -> bool:
|
||||
"""Сначала собирает mixed RU/EN аудио, затем проигрывает единым потоком."""
|
||||
print(f"🔊 Mixed TTS: буферизация сегментов ({len(segments)} шт.)")
|
||||
self._interrupted = False
|
||||
self._stop_flag.clear()
|
||||
|
||||
audio_parts = []
|
||||
for idx, (segment, lang) in enumerate(segments, start=1):
|
||||
if not segment.strip():
|
||||
continue
|
||||
if check_interrupt and check_interrupt():
|
||||
self._interrupted = True
|
||||
return False
|
||||
try:
|
||||
audio_np = self._synthesize_language_audio(segment, language=lang)
|
||||
except Exception as exc:
|
||||
print(f"❌ Ошибка mixed TTS (сегмент {idx}/{len(segments)}): {exc}")
|
||||
return False
|
||||
if audio_np is None:
|
||||
return False
|
||||
if audio_np.size:
|
||||
audio_parts.append(audio_np)
|
||||
|
||||
if not audio_parts:
|
||||
return True
|
||||
|
||||
full_audio = np.concatenate(audio_parts)
|
||||
if check_interrupt:
|
||||
return self._play_audio_with_interrupt(full_audio, check_interrupt)
|
||||
return self._play_audio_blocking(full_audio)
|
||||
|
||||
def _speak_mixed(
|
||||
self, segments: list[tuple[str, str]], check_interrupt=None
|
||||
) -> bool:
|
||||
"""Озвучивание текста с переключением RU/EN по сегментам."""
|
||||
if self._count_language_switches(segments) >= _MIXED_TTS_BUFFERED_SWITCHES:
|
||||
return self._speak_mixed_buffered(
|
||||
segments, check_interrupt=check_interrupt
|
||||
)
|
||||
|
||||
for segment, lang in segments:
|
||||
if not segment.strip():
|
||||
continue
|
||||
@@ -390,6 +474,7 @@ class TextToSpeech:
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
time.sleep(_INTERRUPT_POLL_SECONDS)
|
||||
|
||||
def _play_with_interrupt_sounddevice(
|
||||
self, audio_np: np.ndarray, check_interrupt
|
||||
@@ -407,11 +492,18 @@ class TextToSpeech:
|
||||
# Запускаем воспроизведение (неблокирующее)
|
||||
sd.play(audio_np, self.sample_rate)
|
||||
|
||||
# Ждем окончания воспроизведения в цикле
|
||||
while sd.get_stream().active:
|
||||
# Ждем окончания воспроизведения в цикле.
|
||||
while True:
|
||||
if self._interrupted:
|
||||
break
|
||||
time.sleep(0.02) # Уменьшаем задержку для более быстрого реагирования
|
||||
stream = sd.get_stream()
|
||||
if stream is None or not stream.active:
|
||||
break
|
||||
time.sleep(0.02)
|
||||
|
||||
if not self._interrupted:
|
||||
# Добираем хвост буфера даже если stream.active мигнул в False чуть раньше.
|
||||
sd.wait()
|
||||
|
||||
finally:
|
||||
# Сообщаем потоку-наблюдателю, что пора завершаться
|
||||
|
||||
@@ -9,12 +9,26 @@ Listens for the configured wake word.
|
||||
import pvporcupine
|
||||
import pyaudio
|
||||
import struct
|
||||
import io
|
||||
import wave
|
||||
import time
|
||||
import numpy as np
|
||||
import httpx
|
||||
from collections import deque
|
||||
from deepgram import DeepgramClient
|
||||
from deepgram.clients.listen.v1.rest.options import PrerecordedOptions
|
||||
from ..core.config import (
|
||||
DEEPGRAM_API_KEY,
|
||||
PORCUPINE_ACCESS_KEY,
|
||||
PORCUPINE_KEYWORD_PATH,
|
||||
PORCUPINE_SENSITIVITY,
|
||||
WAKEWORD_HIT_COOLDOWN_SECONDS,
|
||||
WAKEWORD_ENABLE_FALLBACK_STT,
|
||||
WAKEWORD_MIN_RMS,
|
||||
WAKEWORD_REOPEN_GRACE_SECONDS,
|
||||
WAKEWORD_RMS_MULTIPLIER,
|
||||
WAKE_WORD,
|
||||
WAKE_WORD_ALIASES,
|
||||
)
|
||||
from ..core.audio_manager import get_audio_manager
|
||||
|
||||
@@ -33,6 +47,19 @@ class WakeWordDetector:
|
||||
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
|
||||
self._stream_closed = True # Флаг состояния потока (закрыт/открыт)
|
||||
self._last_hit_ts = 0.0
|
||||
self._fallback_dg_client = None
|
||||
self._fallback_pre_roll = deque(maxlen=4)
|
||||
self._fallback_frames = []
|
||||
self._fallback_active = False
|
||||
self._fallback_silence_frames = 0
|
||||
self._fallback_last_attempt_ts = 0.0
|
||||
self._fallback_last_error_ts = 0.0
|
||||
self._stream_opened_ts = 0.0
|
||||
self._rms_history = deque(maxlen=220)
|
||||
self._wakeword_aliases_compact = {
|
||||
self._compact_text(WAKE_WORD),
|
||||
*(self._compact_text(alias) for alias in WAKE_WORD_ALIASES),
|
||||
}
|
||||
|
||||
def initialize(self):
|
||||
"""Инициализация Porcupine и PyAudio."""
|
||||
@@ -87,6 +114,211 @@ class WakeWordDetector:
|
||||
)
|
||||
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
|
||||
self._stream_closed = False
|
||||
self._stream_opened_ts = time.time()
|
||||
self._reset_fallback_state()
|
||||
|
||||
@staticmethod
|
||||
def _compute_rms(pcm: np.ndarray) -> float:
|
||||
if pcm.size == 0:
|
||||
return 0.0
|
||||
as_float = pcm.astype(np.float32)
|
||||
return float(np.sqrt(np.mean(as_float * as_float)))
|
||||
|
||||
@staticmethod
|
||||
def _compact_text(text: str) -> str:
|
||||
text = str(text or "").lower().replace("ё", "е")
|
||||
return "".join(ch for ch in text if ch.isalnum())
|
||||
|
||||
def _remember_rms(self, rms: float):
|
||||
if rms <= 0:
|
||||
return
|
||||
self._rms_history.append(float(rms))
|
||||
|
||||
def _noise_floor_rms(self) -> float:
|
||||
if not self._rms_history:
|
||||
return 0.0
|
||||
# Низкий процентиль устойчив к редким всплескам/голосу.
|
||||
return float(np.percentile(np.asarray(self._rms_history, dtype=np.float32), 20))
|
||||
|
||||
def _wakeword_rms_threshold(self) -> float:
|
||||
floor = self._noise_floor_rms()
|
||||
dynamic = floor * float(WAKEWORD_RMS_MULTIPLIER)
|
||||
# Защитный максимум, чтобы в очень шумном окружении не "убить" детект полностью.
|
||||
dynamic = min(dynamic, float(WAKEWORD_MIN_RMS) * 4.0)
|
||||
return max(float(WAKEWORD_MIN_RMS), dynamic)
|
||||
|
||||
def _is_hit_in_guard_window(
|
||||
self, now_ts: float, *, ignore_hit_cooldown: bool = False
|
||||
) -> bool:
|
||||
if (
|
||||
not ignore_hit_cooldown
|
||||
and now_ts - self._last_hit_ts < float(WAKEWORD_HIT_COOLDOWN_SECONDS)
|
||||
):
|
||||
return True
|
||||
if (
|
||||
self._stream_opened_ts > 0
|
||||
and now_ts - self._stream_opened_ts < float(WAKEWORD_REOPEN_GRACE_SECONDS)
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _accept_porcupine_hit(
|
||||
self,
|
||||
pcm: np.ndarray,
|
||||
now_ts: float,
|
||||
*,
|
||||
ignore_hit_cooldown: bool = False,
|
||||
during_tts: bool = False,
|
||||
) -> bool:
|
||||
if self._is_hit_in_guard_window(
|
||||
now_ts, ignore_hit_cooldown=ignore_hit_cooldown
|
||||
):
|
||||
return False
|
||||
rms = self._compute_rms(pcm)
|
||||
# Для "чистого" Porcupine оставляем мягкий амплитудный фильтр:
|
||||
# он отсеивает тишину/щелчки и ложные фаны от фонового шума.
|
||||
# Во время TTS делаем фильтр строже, чтобы собственная колонка
|
||||
# не "будила" ассистента.
|
||||
factor = 0.95 if during_tts else 0.75
|
||||
threshold = max(80.0, self._wakeword_rms_threshold() * factor)
|
||||
if rms < threshold:
|
||||
return False
|
||||
self._last_hit_ts = now_ts
|
||||
return True
|
||||
|
||||
def _reset_fallback_state(self):
|
||||
self._fallback_pre_roll.clear()
|
||||
self._fallback_frames = []
|
||||
self._fallback_active = False
|
||||
self._fallback_silence_frames = 0
|
||||
|
||||
def _get_fallback_client(self):
|
||||
if not WAKEWORD_ENABLE_FALLBACK_STT:
|
||||
return None
|
||||
if not DEEPGRAM_API_KEY:
|
||||
return None
|
||||
if self._fallback_dg_client is None:
|
||||
self._fallback_dg_client = DeepgramClient(DEEPGRAM_API_KEY)
|
||||
return self._fallback_dg_client
|
||||
|
||||
def _pcm_to_wav_bytes(self, pcm: np.ndarray) -> bytes:
|
||||
buffer = io.BytesIO()
|
||||
with wave.open(buffer, "wb") as wav_file:
|
||||
wav_file.setnchannels(1)
|
||||
wav_file.setsampwidth(2)
|
||||
wav_file.setframerate(int(self.porcupine.sample_rate))
|
||||
wav_file.writeframes(np.asarray(pcm, dtype=np.int16).tobytes())
|
||||
return buffer.getvalue()
|
||||
|
||||
def _transcribe_wakeword_candidate(self, pcm: np.ndarray) -> bool:
|
||||
client = self._get_fallback_client()
|
||||
if client is None or pcm.size == 0:
|
||||
return False
|
||||
|
||||
try:
|
||||
response = client.listen.rest.v("1").transcribe_file(
|
||||
{"buffer": self._pcm_to_wav_bytes(pcm)},
|
||||
PrerecordedOptions(
|
||||
model="nova-2",
|
||||
language="ru",
|
||||
smart_format=False,
|
||||
punctuate=False,
|
||||
utterances=False,
|
||||
numerals=False,
|
||||
),
|
||||
timeout=httpx.Timeout(2.2, connect=2.2, read=2.2, write=2.2),
|
||||
)
|
||||
except Exception as exc:
|
||||
now = time.time()
|
||||
if now - self._fallback_last_error_ts >= 30.0:
|
||||
print(f"⚠️ Wake word fallback STT failed: {exc}")
|
||||
self._fallback_last_error_ts = now
|
||||
return False
|
||||
|
||||
transcript = ""
|
||||
confidence = None
|
||||
try:
|
||||
channels = response.results.channels or []
|
||||
if channels and channels[0].alternatives:
|
||||
first_alt = channels[0].alternatives[0]
|
||||
transcript = str(first_alt.transcript or "").strip()
|
||||
try:
|
||||
confidence = float(first_alt.confidence)
|
||||
except Exception:
|
||||
confidence = None
|
||||
except Exception:
|
||||
transcript = ""
|
||||
confidence = None
|
||||
|
||||
compact = self._compact_text(transcript)
|
||||
if confidence is not None and confidence < 0.62:
|
||||
return False
|
||||
if compact in self._wakeword_aliases_compact:
|
||||
print(f"✅ Wake word обнаружен fallback STT: {transcript}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def _check_fallback_wakeword(
|
||||
self,
|
||||
pcm: np.ndarray,
|
||||
*,
|
||||
during_tts: bool = False,
|
||||
ignore_hit_cooldown: bool = False,
|
||||
) -> bool:
|
||||
if not WAKEWORD_ENABLE_FALLBACK_STT:
|
||||
return False
|
||||
if self.porcupine is None:
|
||||
return False
|
||||
|
||||
rms = self._compute_rms(pcm)
|
||||
base_threshold = self._wakeword_rms_threshold()
|
||||
speech_factor = 1.1 if during_tts else 0.85
|
||||
speech_threshold = max(170.0, base_threshold * speech_factor)
|
||||
silence_threshold = max(95.0, speech_threshold * 0.55)
|
||||
silence_frames_to_finalize = 10 if during_tts else 8
|
||||
min_frames = 10 if during_tts else 7
|
||||
max_frames = 40
|
||||
min_attempt_interval = 2.5 if during_tts else 1.0
|
||||
|
||||
if rms >= speech_threshold:
|
||||
if not self._fallback_active:
|
||||
self._fallback_active = True
|
||||
self._fallback_frames = list(self._fallback_pre_roll)
|
||||
self._fallback_silence_frames = 0
|
||||
self._fallback_frames.append(np.asarray(pcm, dtype=np.int16))
|
||||
elif self._fallback_active:
|
||||
self._fallback_frames.append(np.asarray(pcm, dtype=np.int16))
|
||||
if rms <= silence_threshold:
|
||||
self._fallback_silence_frames += 1
|
||||
else:
|
||||
self._fallback_silence_frames = 0
|
||||
|
||||
if len(self._fallback_frames) > max_frames:
|
||||
self._reset_fallback_state()
|
||||
elif self._fallback_silence_frames >= silence_frames_to_finalize:
|
||||
candidate = np.concatenate(self._fallback_frames) if self._fallback_frames else np.asarray([], dtype=np.int16)
|
||||
self._reset_fallback_state()
|
||||
if len(candidate) >= min_frames * int(self.porcupine.frame_length):
|
||||
now = time.time()
|
||||
candidate_rms = self._compute_rms(candidate)
|
||||
candidate_threshold = self._wakeword_rms_threshold() * (
|
||||
0.95 if during_tts else 0.75
|
||||
)
|
||||
candidate_threshold = max(float(WAKEWORD_MIN_RMS), candidate_threshold)
|
||||
if (
|
||||
now - self._fallback_last_attempt_ts >= min_attempt_interval
|
||||
and not self._is_hit_in_guard_window(
|
||||
now, ignore_hit_cooldown=ignore_hit_cooldown
|
||||
)
|
||||
and candidate_rms >= candidate_threshold
|
||||
):
|
||||
self._fallback_last_attempt_ts = now
|
||||
if self._transcribe_wakeword_candidate(candidate):
|
||||
self._last_hit_ts = now
|
||||
return True
|
||||
|
||||
self._fallback_pre_roll.append(np.asarray(pcm, dtype=np.int16))
|
||||
return False
|
||||
|
||||
def stop_monitoring(self):
|
||||
"""Явная остановка и закрытие потока (чтобы освободить микрофон для других задач)."""
|
||||
@@ -97,6 +329,8 @@ class WakeWordDetector:
|
||||
except Exception:
|
||||
pass
|
||||
self._stream_closed = True
|
||||
self._stream_opened_ts = 0.0
|
||||
self._reset_fallback_state()
|
||||
|
||||
def _resample_to_target_rate(self, pcm: np.ndarray) -> np.ndarray:
|
||||
target_rate = int(self.porcupine.sample_rate)
|
||||
@@ -160,14 +394,20 @@ class WakeWordDetector:
|
||||
|
||||
# Читаем небольшой кусочек аудио (frame)
|
||||
pcm = self._read_porcupine_frame()
|
||||
self._remember_rms(self._compute_rms(pcm))
|
||||
|
||||
# Обрабатываем фрейм через Porcupine
|
||||
keyword_index = self.porcupine.process(pcm.tolist())
|
||||
|
||||
# Если keyword_index >= 0, значит ключевое слово обнаружено
|
||||
if keyword_index >= 0:
|
||||
print("✅ Wake word обнаружен!")
|
||||
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
|
||||
now = time.time()
|
||||
if self._accept_porcupine_hit(pcm, now, during_tts=False):
|
||||
print("✅ Wake word обнаружен!")
|
||||
# Важно: закрываем поток, чтобы освободить микрофон для STT (Deepgram)
|
||||
self.stop_monitoring()
|
||||
return True
|
||||
if self._check_fallback_wakeword(pcm):
|
||||
self.stop_monitoring()
|
||||
return True
|
||||
|
||||
@@ -189,15 +429,25 @@ class WakeWordDetector:
|
||||
self._open_stream()
|
||||
|
||||
pcm = self._read_porcupine_frame()
|
||||
self._remember_rms(self._compute_rms(pcm))
|
||||
|
||||
keyword_index = self.porcupine.process(pcm.tolist())
|
||||
if keyword_index >= 0:
|
||||
now = time.time()
|
||||
if now - self._last_hit_ts < 0.2: # Уменьшаем интервал для более быстрой реакции
|
||||
if not self._accept_porcupine_hit(
|
||||
pcm,
|
||||
now,
|
||||
ignore_hit_cooldown=True,
|
||||
during_tts=True,
|
||||
):
|
||||
return False
|
||||
self._last_hit_ts = now
|
||||
print("🛑 Wake word обнаружен во время ответа!")
|
||||
return True
|
||||
if self._check_fallback_wakeword(
|
||||
pcm, during_tts=True, ignore_hit_cooldown=True
|
||||
):
|
||||
print("🛑 Wake word обнаружен fallback STT во время ответа!")
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user