feat: harden audio device compatibility across machines

This commit is contained in:
2026-03-12 14:08:20 +03:00
parent e9f26f8050
commit 6c2702d5e3
7 changed files with 480 additions and 74 deletions

View File

@@ -83,10 +83,12 @@ class SpeechRecognizer:
def __init__(self):
    """Create an idle recognizer; no client or audio resource is acquired here.

    Every handle starts as ``None``. ``initialize()`` assigns the audio
    manager, the PyAudio instance and the input device index, and
    ``_get_stream()`` opens the microphone stream on first use.
    """
    # Recognition state.
    self.dg_client = None
    self.transcript = ""
    # Refreshed by initialize() after a successful setup.
    self.last_successful_operation = datetime.now()

    # Audio handles, all populated later.
    self.audio_manager = None
    self.pa = None
    self.stream = None

    # Capture parameters: the device index stays None until initialize()
    # asks the audio manager for one; the sample rate starts at the
    # requested SAMPLE_RATE and is overwritten with the rate reported
    # when the stream is actually opened.
    self._input_device_index = None
    self._stream_sample_rate = SAMPLE_RATE
def initialize(self):
"""Инициализация клиента Deepgram и PyAudio."""
@@ -103,9 +105,9 @@ class SpeechRecognizer:
print(f"❌ Ошибка при создании клиента Deepgram: {e}")
raise
audio_manager = get_audio_manager()
self.pa = audio_manager.get_pyaudio()
self._input_device_index = audio_manager.get_input_device_index()
self.audio_manager = get_audio_manager()
self.pa = self.audio_manager.get_pyaudio()
self._input_device_index = self.audio_manager.get_input_device_index()
print("✅ Deepgram клиент готов")
# Обновляем время последней успешной операции
self.last_successful_operation = datetime.now()
@@ -131,18 +133,23 @@ class SpeechRecognizer:
def _get_stream(self):
    """Return the microphone input stream, opening it on first use.

    The stream is opened through the audio manager rather than by calling
    ``self.pa.open`` directly. The manager is asked for ``SAMPLE_RATE`` and
    given a list of fallback rates; it returns the stream together with the
    device index and sample rate that were actually used, and all three are
    stored on the instance. ``_stream_sample_rate`` is also passed as
    ``sample_rate`` elsewhere in this class, so it must reflect the rate
    reported here rather than the requested one.

    Returns:
        The open input stream (the cached one on every call after the
        first).
    """
    # Already open: reuse it, never open a second stream.
    if self.stream is not None:
        return self.stream

    # _get_stream() may run before initialize() has set the manager.
    if self.audio_manager is None:
        self.audio_manager = get_audio_manager()

    self.stream, self._input_device_index, self._stream_sample_rate = (
        self.audio_manager.open_input_stream(
            rate=SAMPLE_RATE,
            channels=1,
            format=pyaudio.paInt16,
            frames_per_buffer=4096,
            preferred_index=self._input_device_index,
            fallback_rates=[48000, 44100, 32000, 22050, 16000, 8000],
        )
    )

    # Make a rate change visible in the console output.
    if self._stream_sample_rate != SAMPLE_RATE:
        print(
            f"⚠️ STT mic stream uses fallback rate={self._stream_sample_rate} "
            f"(requested {SAMPLE_RATE})"
        )
    return self.stream
async def _process_audio(
@@ -242,7 +249,7 @@ class SpeechRecognizer:
smart_format=True, # Расстановка знаков препинания
encoding="linear16",
channels=1,
sample_rate=SAMPLE_RATE,
sample_rate=self._stream_sample_rate,
interim_results=True,
utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
vad_events=True,