feat: harden audio device compatibility across machines

This commit is contained in:
2026-03-12 14:08:20 +03:00
parent e9f26f8050
commit 6c2702d5e3
7 changed files with 480 additions and 74 deletions

View File

@@ -83,10 +83,12 @@ class SpeechRecognizer:
def __init__(self):
    """Set up empty recognizer state; real resources are created in initialize()."""
    self.dg_client = None  # Deepgram client, created in initialize()
    self.pa = None  # shared PyAudio instance obtained from the audio manager
    self.audio_manager = None  # project audio manager (get_audio_manager())
    self.stream = None  # PyAudio input stream, opened lazily in _get_stream()
    self.transcript = ""  # accumulated recognized text
    self.last_successful_operation = datetime.now()  # timestamp of last successful op
    self._input_device_index = None  # preferred mic device index; None = default
    self._stream_sample_rate = SAMPLE_RATE  # actual capture rate (may be a fallback rate)
def initialize(self):
"""Инициализация клиента Deepgram и PyAudio."""
@@ -103,9 +105,9 @@ class SpeechRecognizer:
print(f"❌ Ошибка при создании клиента Deepgram: {e}")
raise
audio_manager = get_audio_manager()
self.pa = audio_manager.get_pyaudio()
self._input_device_index = audio_manager.get_input_device_index()
self.audio_manager = get_audio_manager()
self.pa = self.audio_manager.get_pyaudio()
self._input_device_index = self.audio_manager.get_input_device_index()
print("✅ Deepgram клиент готов")
# Обновляем время последней успешной операции
self.last_successful_operation = datetime.now()
@@ -131,18 +133,23 @@ class SpeechRecognizer:
def _get_stream(self):
"""Открывает аудиопоток PyAudio, если он еще не открыт."""
if self.stream is None:
kwargs = {}
if self._input_device_index is not None:
kwargs["input_device_index"] = self._input_device_index
self.stream = self.pa.open(
rate=SAMPLE_RATE,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=4096,
**kwargs,
if self.audio_manager is None:
self.audio_manager = get_audio_manager()
self.stream, self._input_device_index, self._stream_sample_rate = (
self.audio_manager.open_input_stream(
rate=SAMPLE_RATE,
channels=1,
format=pyaudio.paInt16,
frames_per_buffer=4096,
preferred_index=self._input_device_index,
fallback_rates=[48000, 44100, 32000, 22050, 16000, 8000],
)
)
if self._stream_sample_rate != SAMPLE_RATE:
print(
f"⚠️ STT mic stream uses fallback rate={self._stream_sample_rate} "
f"(requested {SAMPLE_RATE})"
)
return self.stream
async def _process_audio(
@@ -242,7 +249,7 @@ class SpeechRecognizer:
smart_format=True, # Расстановка знаков препинания
encoding="linear16",
channels=1,
sample_rate=SAMPLE_RATE,
sample_rate=self._stream_sample_rate,
interim_results=True,
utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
vad_events=True,

View File

@@ -14,9 +14,11 @@ import time
import warnings
import numpy as np
import pyaudio
import sounddevice as sd
import torch
from ..core.audio_manager import get_audio_manager
from ..core.config import TTS_EN_SPEAKER, TTS_SAMPLE_RATE, TTS_SPEAKER
# Подавляем предупреждения Silero о длинном тексте (мы сами его режем)
@@ -36,6 +38,8 @@ class TextToSpeech:
self.speaker_en = TTS_EN_SPEAKER
self._interrupted = False
self._stop_flag = threading.Event()
self._audio_manager = None
self._output_device_index = None
def _load_model(self, language: str):
"""
@@ -232,14 +236,13 @@ class TextToSpeech:
audio_np = audio.numpy()
if check_interrupt:
# Воспроизведение с проверкой прерывания (сложная логика)
if not self._play_with_interrupt(audio_np, check_interrupt):
if not self._play_audio_with_interrupt(audio_np, check_interrupt):
success = False
break
else:
# Обычное воспроизведение (блокирующее)
sd.play(audio_np, self.sample_rate)
sd.wait()
if not self._play_audio_blocking(audio_np):
success = False
break
except Exception as e:
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
@@ -293,6 +296,75 @@ class TextToSpeech:
text, check_interrupt=check_interrupt, language=language
)
def _resample_audio(self, audio_np: np.ndarray, src_rate: int, dst_rate: int):
if src_rate == dst_rate:
return audio_np.astype(np.float32, copy=False)
if audio_np.size == 0:
return np.asarray([], dtype=np.float32)
target_length = max(1, int(round(audio_np.size * dst_rate / src_rate)))
x_old = np.arange(audio_np.size, dtype=np.float32)
x_new = np.linspace(0.0, float(max(0, audio_np.size - 1)), target_length)
resampled = np.interp(x_new, x_old, audio_np.astype(np.float32))
return np.asarray(resampled, dtype=np.float32)
def _play_audio_blocking(self, audio_np: np.ndarray) -> bool:
    """Play one audio chunk synchronously through sounddevice.

    On any sounddevice failure the chunk is retried through the PyAudio
    fallback path. Returns True on success, False if the fallback also fails.
    """
    try:
        sd.play(audio_np, self.sample_rate)
        sd.wait()
    except Exception as exc:
        print(f"⚠️ sounddevice playback failed, fallback to PyAudio: {exc}")
        return self._play_with_pyaudio(audio_np, check_interrupt=None)
    return True
def _play_audio_with_interrupt(self, audio_np: np.ndarray, check_interrupt) -> bool:
    """Play audio while polling check_interrupt, with a PyAudio fallback.

    Returns False when playback was interrupted or both backends failed.
    """
    try:
        return self._play_with_interrupt_sounddevice(audio_np, check_interrupt)
    except Exception as exc:
        message = (
            "⚠️ sounddevice playback-with-interrupt failed, fallback to PyAudio: "
            f"{exc}"
        )
        print(message)
        return self._play_with_pyaudio(audio_np, check_interrupt=check_interrupt)
def _play_with_pyaudio(self, audio_np: np.ndarray, check_interrupt=None) -> bool:
    """Fallback playback through the shared PyAudio output stream.

    Opens an output stream via the audio manager (caching the chosen
    device index), resamples the chunk to the stream's actual rate, and
    writes it in ~30 ms slices so check_interrupt can stop playback
    promptly. Returns True on full playback, False on interrupt/failure.
    The stream is always stopped and closed, even on error.
    """
    if self._audio_manager is None:
        self._audio_manager = get_audio_manager()
    stream = None
    try:
        stream, self._output_device_index, device_rate = (
            self._audio_manager.open_output_stream(
                rate=self.sample_rate,
                channels=1,
                format=pyaudio.paFloat32,
                preferred_index=self._output_device_index,
                fallback_rates=[48000, 44100, 32000, 22050],
            )
        )
        samples = self._resample_audio(audio_np, self.sample_rate, device_rate)
        # ~30 ms of samples per write, but never a tiny buffer.
        step = max(256, int(device_rate * 0.03))
        offset = 0
        total = len(samples)
        while offset < total:
            if check_interrupt and check_interrupt():
                self._interrupted = True
                return False
            stream.write(samples[offset : offset + step].tobytes())
            offset += step
        return True
    except Exception as exc:
        print(f"❌ PyAudio playback failed: {exc}")
        return False
    finally:
        if stream is not None:
            try:
                stream.stop_stream()
            except Exception:
                pass
            try:
                stream.close()
            except Exception:
                pass
def _check_interrupt_worker(self, check_interrupt):
"""
Фоновая функция для потока: постоянно опрашивает check_interrupt.
@@ -308,7 +380,9 @@ class TextToSpeech:
except Exception:
pass
def _play_with_interrupt(self, audio_np: np.ndarray, check_interrupt) -> bool:
def _play_with_interrupt_sounddevice(
self, audio_np: np.ndarray, check_interrupt
) -> bool:
"""
Воспроизводит аудио, параллельно проверяя условие прерывания в отдельном потоке.
"""

View File

@@ -9,6 +9,7 @@ Listens for the "Alexandr" wake word.
import pvporcupine
import pyaudio
import struct
import numpy as np
from ..core.config import (
PORCUPINE_ACCESS_KEY,
PORCUPINE_KEYWORD_PATH,
@@ -24,6 +25,11 @@ class WakeWordDetector:
self.porcupine = None
self.audio_stream = None
self.pa = None
self._audio_manager = None
self._input_device_index = None
self._capture_sample_rate = None
self._capture_frame_length = None
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
self._stream_closed = True # Флаг состояния потока (закрыт/открыт)
self._last_hit_ts = 0.0
@@ -37,11 +43,13 @@ class WakeWordDetector:
)
# Используем общий экземпляр PyAudio
audio_manager = get_audio_manager()
self.pa = audio_manager.get_pyaudio()
self._input_device_index = audio_manager.get_input_device_index()
self._audio_manager = get_audio_manager()
self.pa = self._audio_manager.get_pyaudio()
self._open_stream()
print(f"🎤 Ожидание wake word 'Alexandr' (sens={PORCUPINE_SENSITIVITY:.2f})...")
print(
"🎤 Ожидание wake word 'Alexandr' "
f"(sens={PORCUPINE_SENSITIVITY:.2f}, mic_rate={self._capture_sample_rate})..."
)
def _open_stream(self):
"""Открытие аудиопотока с микрофона."""
@@ -55,19 +63,28 @@ class WakeWordDetector:
except Exception:
pass
# Открываем поток с параметрами, которые требует Porcupine
kwargs = {}
if getattr(self, "_input_device_index", None) is not None:
kwargs["input_device_index"] = self._input_device_index
self.audio_stream = self.pa.open(
rate=self.porcupine.sample_rate,
target_rate = int(self.porcupine.sample_rate)
fallback_rates = [48000, 44100, 32000, 22050, 16000]
self.audio_stream, self._input_device_index, actual_rate = self._audio_manager.open_input_stream(
rate=target_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=self.porcupine.frame_length,
**kwargs,
preferred_index=self._input_device_index,
fallback_rates=fallback_rates,
)
self._capture_sample_rate = int(actual_rate)
self._capture_frame_length = max(
64,
int(
round(
self.porcupine.frame_length
* self._capture_sample_rate
/ target_rate
)
),
)
self._resampled_pcm_buffer = np.array([], dtype=np.int16)
self._stream_closed = False
def stop_monitoring(self):
@@ -80,6 +97,40 @@ class WakeWordDetector:
pass
self._stream_closed = True
def _resample_to_target_rate(self, pcm: np.ndarray) -> np.ndarray:
    """Convert captured int16 samples to Porcupine's required sample rate.

    Uses linear interpolation; when the microphone already runs at the
    target rate the buffer is returned untouched.
    """
    dst_rate = int(self.porcupine.sample_rate)
    src_rate = int(self._capture_sample_rate or dst_rate)
    if src_rate == dst_rate:
        return pcm
    if pcm.size == 0:
        return np.array([], dtype=np.int16)
    out_len = max(1, int(round(pcm.size * dst_rate / src_rate)))
    grid_src = np.arange(pcm.size, dtype=np.float32)
    grid_dst = np.linspace(0.0, float(max(0, pcm.size - 1)), out_len)
    interpolated = np.interp(grid_dst, grid_src, pcm.astype(np.float32))
    return np.asarray(interpolated, dtype=np.int16)
def _read_porcupine_frame(self):
    """Return one Porcupine-sized frame of int16 samples from the mic stream.

    When the capture rate equals Porcupine's expected rate the frame is
    read directly from the stream; otherwise larger device-rate chunks are
    captured, resampled to the target rate, and buffered until a full
    frame is available.
    """
    target_length = int(self.porcupine.frame_length)
    if self._capture_sample_rate == self.porcupine.sample_rate:
        # Fast path: device already delivers the rate Porcupine expects.
        pcm = self.audio_stream.read(target_length, exception_on_overflow=False)
        return np.asarray(struct.unpack_from("h" * target_length, pcm), dtype=np.int16)
    # Slow path: accumulate resampled audio until a full frame is buffered.
    # NOTE(review): each captured chunk is resampled independently, so small
    # discontinuities at chunk boundaries are possible — presumably acceptable
    # for wake-word detection; confirm if detection accuracy degrades.
    while self._resampled_pcm_buffer.size < target_length:
        raw = self.audio_stream.read(
            self._capture_frame_length, exception_on_overflow=False
        )
        captured = np.frombuffer(raw, dtype=np.int16)
        converted = self._resample_to_target_rate(captured)
        if converted.size:
            self._resampled_pcm_buffer = np.concatenate(
                (self._resampled_pcm_buffer, converted)
            )
    # Pop exactly one frame off the front of the buffer; the remainder is
    # kept for the next call.
    frame = self._resampled_pcm_buffer[:target_length]
    self._resampled_pcm_buffer = self._resampled_pcm_buffer[target_length:]
    return frame
def wait_for_wakeword(self, timeout: float = None) -> bool:
"""
Блокирующая функция: ждет, пока не будет услышана фраза "Alexandr"
@@ -107,14 +158,10 @@ class WakeWordDetector:
return False
# Читаем небольшой кусочек аудио (frame)
pcm = self.audio_stream.read(
self.porcupine.frame_length, exception_on_overflow=False
)
# Конвертируем байты в кортеж чисел (требование Porcupine)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
pcm = self._read_porcupine_frame()
# Обрабатываем фрейм через Porcupine
keyword_index = self.porcupine.process(pcm)
keyword_index = self.porcupine.process(pcm.tolist())
# Если keyword_index >= 0, значит ключевое слово обнаружено
if keyword_index >= 0:
@@ -140,12 +187,9 @@ class WakeWordDetector:
try:
self._open_stream()
pcm = self.audio_stream.read(
self.porcupine.frame_length, exception_on_overflow=False
)
pcm = struct.unpack_from("h" * self.porcupine.frame_length, pcm)
pcm = self._read_porcupine_frame()
keyword_index = self.porcupine.process(pcm)
keyword_index = self.porcupine.process(pcm.tolist())
if keyword_index >= 0:
now = time.time()
if now - self._last_hit_ts < 0.2: # Уменьшаем интервал для более быстрой реакции