feat: improve semantic voice control and music playback

2026-03-15 14:40:33 +03:00
parent e1a94c68db
commit cb54a9ee75
8 changed files with 1656 additions and 276 deletions
--- a/.env.example
+++ b/.env.example
@@ -39,6 +39,12 @@ TTS_EN_SPEAKER=en_0
 WEATHER_LAT=63.56
 WEATHER_LON=53.69
 WEATHER_CITY=Ухта
 # Navidrome (приоритетный источник музыки; при ошибке — fallback на Spotify)
 NAVIDROME_URL=https://navidrome.example.com
 NAVIDROME_USERNAME=your_navidrome_username
 NAVIDROME_PASSWORD=your_navidrome_password
 SPOTIFY_CLIENT_ID=your_spotify_client_id
 SPOTIFY_CLIENT_SECRET=your_spotify_client_secret
 SPOTIFY_REDIRECT_URI=http://localhost:8888/callback
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@
 - Погода: текущий прогноз по городу по умолчанию или по названию города.
 - Таймеры, будильники (включая будни/выходные), секундомеры.
 - Управление громкостью системы (через `pactl`/`amixer`).
- Управление Spotify (play/pause/next/what's playing).
+- Управление музыкой через Navidrome (приоритет) с fallback на Spotify.
 - Persistent resume: `пауза`/`продолжи` продолжают с сохранённой позиции даже после перезапуска колонки.
 - Мини-игра "Города".
 ## Как это работает
@@ -60,7 +61,7 @@ flowchart TD
 ```bash
 sudo apt-get update
-sudo apt-get install -y portaudio19-dev libasound2-dev mpg123 pulseaudio-utils alsa-utils
+sudo apt-get install -y portaudio19-dev libasound2-dev mpg123 mpv pulseaudio-utils alsa-utils
 ```
 ### 2) Установка Python-зависимостей
@@ -156,6 +157,9 @@ python run.py
 | `WEATHER_LAT` | Нет | - | Широта города по умолчанию |
 | `WEATHER_LON` | Нет | - | Долгота города по умолчанию |
 | `WEATHER_CITY` | Нет | `Ухта` | Город по умолчанию для погоды |
 | `NAVIDROME_URL` | Нет | - | URL Navidrome (например `https://navidrome.example.com`) |
 | `NAVIDROME_USERNAME` | Нет | - | Логин Navidrome |
 | `NAVIDROME_PASSWORD` | Нет | - | Пароль Navidrome |
 | `SPOTIFY_CLIENT_ID` | Нет | - | Spotify OAuth Client ID |
 | `SPOTIFY_CLIENT_SECRET` | Нет | - | Spotify OAuth Client Secret |
 | `SPOTIFY_REDIRECT_URI` | Нет | `http://localhost:8888/callback` | Redirect URI для Spotify |
@@ -172,7 +176,7 @@ python run.py
 | Будильник | `Поставь будильник на 7:30`, `Будильник по будням в 8:00` |
 | Секундомер | `Запусти секундомер`, `Покажи активные секундомеры` |
 | Громкость | `Громкость 7` |
-| Spotify | `Включи музыку`, `Пауза`, `Что сейчас играет` |
+| Музыка (Navidrome first) | `Включи музыку`, `Пауза`, `Продолжи`, `Следующий`, `Предыдущий`, `Что играет`, `Включи жанр electronic`, `Включи папку crystal castles` |
 | Игра | `Давай сыграем в города` |
 | Управление диалогом | `Повтори`, `Стоп`, `Хватит` |
@@ -222,6 +226,8 @@ alexander_smart-speaker/
 | `Audio input/output initialization failed` | проверить, что звук-сервер запущен (PipeWire/PulseAudio), и при необходимости задать `AUDIO_INPUT_DEVICE_NAME`/`AUDIO_OUTPUT_DEVICE_NAME` |
 | Будильник/таймер не звонит | наличие `mpg123` в системе |
 | Ошибка про несколько AI API | в `.env` должен остаться только один незакомментированный AI ключ |
 | Navidrome не воспроизводит | заполнены `NAVIDROME_*`, доступен `NAVIDROME_URL`, установлен `mpv` |
 | Fallback ушёл в Spotify | проверить доступность Navidrome, SSL и корректность `NAVIDROME_USERNAME`/`NAVIDROME_PASSWORD` |
 | Spotify не управляется | заполнены `SPOTIFY_*`, есть активное устройство, Premium-аккаунт |
 ## Лицензия
--- a/app/audio/stt.py
+++ b/app/audio/stt.py
@@ -12,6 +12,8 @@ import re
 import time
 import pyaudio
 import logging
 import contextlib
 import threading
 from datetime import datetime, timedelta
 from ..core.config import DEEPGRAM_API_KEY, SAMPLE_RATE, WAKE_WORD_ALIASES
 from deepgram import (
@@ -29,8 +31,12 @@ from ..core.audio_manager import get_audio_manager
 _original_connect = websockets.sync.client.connect
 DEEPGRAM_CONNECT_TIMEOUT_SECONDS = 3.0
-DEEPGRAM_CONNECT_WAIT_SECONDS = 1.5
+DEEPGRAM_CONNECT_WAIT_SECONDS = 4.0
 DEEPGRAM_CONNECT_POLL_SECONDS = 0.001
 SENDER_STOP_WAIT_SECONDS = 2.5
 SENDER_FORCE_RELEASE_WAIT_SECONDS = 2.5
 DEEPGRAM_FINALIZATION_GRACE_SECONDS = 0.35
 DEEPGRAM_FINISH_TIMEOUT_SECONDS = 4.0
 def _patched_connect(*args, **kwargs):
@@ -152,6 +158,82 @@ class SpeechRecognizer:
                )
        return self.stream
    def _open_stream_for_session(self):
        """Open a dedicated microphone input stream for a single STT session.

        The audio manager is obtained lazily on first use. It is asked for a
        mono 16-bit input stream at ``SAMPLE_RATE`` together with a list of
        fallback rates, and the device index it reports is stored in
        ``self._input_device_index`` for the next call.

        Returns:
            tuple: ``(stream, sample_rate)`` where ``sample_rate`` is the rate
            reported by the audio manager, converted to ``int``.
        """
        if self.audio_manager is None:
            self.audio_manager = get_audio_manager()

        opened = self.audio_manager.open_input_stream(
            rate=SAMPLE_RATE,
            channels=1,
            format=pyaudio.paInt16,
            frames_per_buffer=4096,
            preferred_index=self._input_device_index,
            fallback_rates=[48000, 44100, 32000, 22050, 16000, 8000],
        )
        mic_stream, device_index, actual_rate = opened
        self._input_device_index = device_index

        # Surface the fact that the stream is not running at the requested rate.
        if actual_rate != SAMPLE_RATE:
            warning = (
                f"⚠️ STT mic stream uses fallback rate={actual_rate} "
                f"(requested {SAMPLE_RATE})"
            )
            print(warning)

        return mic_stream, int(actual_rate)
    def _stop_stream_quietly(self):
        if not self.stream:
            return
        try:
            if self.stream.is_active():
                self.stream.stop_stream()
        except Exception:
            pass
    def _release_stream(self):
        if not self.stream:
            return
        self._stop_stream_quietly()
        try:
            self.stream.close()
        except Exception:
            pass
        self.stream = None
    async def _wait_for_thread(self, thread, timeout_seconds: float) -> bool:
        """Асинхронно ждет завершения daemon-thread без блокировки event loop."""
        deadline = time.monotonic() + timeout_seconds
        while thread.is_alive() and time.monotonic() < deadline:
            await asyncio.sleep(0.05)
        return not thread.is_alive()
    async def _run_blocking_cleanup(self, func, timeout_seconds: float, label: str) -> bool:
        """Запускает потенциально подвисающий cleanup в daemon-thread и ждет ограниченное время."""
        done_event = threading.Event()
        error_holder = {}
        def runner():
            try:
                func()
            except Exception as exc:
                error_holder["error"] = exc
            finally:
                done_event.set()
        thread = threading.Thread(target=runner, daemon=True, name=label)
        thread.start()
        deadline = time.monotonic() + timeout_seconds
        while not done_event.is_set() and time.monotonic() < deadline:
            await asyncio.sleep(0.05)
        if not done_event.is_set():
            print(f"⚠️ {label} timed out; continuing cleanup.")
            return False
        error = error_holder.get("error")
        if error is not None:
            print(f"⚠️ {label} failed: {error}")
            return False
        return True
    async def _process_audio(
        self, dg_connection, timeout_seconds, detection_timeout, fast_stop
    ):
@@ -166,9 +248,9 @@ class SpeechRecognizer:
        """
        self.transcript = ""
        transcript_parts = []
        latest_interim = ""
        loop = asyncio.get_running_loop()
        stream = self._get_stream()
        effective_detection_timeout = (
            detection_timeout
            if detection_timeout is not None
@@ -192,9 +274,13 @@ class SpeechRecognizer:
        # --- Обработчики событий Deepgram ---
        def on_transcript(unused_self, result, **kwargs):
            """Вызывается, когда приходит часть текста."""
            nonlocal latest_interim
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
            sentence = sentence.strip()
            if not sentence:
                return
            try:
                loop.call_soon_threadsafe(mark_speech_activity)
            except RuntimeError:
@@ -202,9 +288,9 @@ class SpeechRecognizer:
            if fast_stop:
                if _is_fast_stop_utterance(sentence):
-                    self.transcript = sentence.strip()
+                    self.transcript = sentence
                    try:
-                        loop.call_soon_threadsafe(stop_event.set)
+                        loop.call_soon_threadsafe(request_stop)
                    except RuntimeError:
                        pass
                    return
@@ -213,6 +299,10 @@ class SpeechRecognizer:
                # Собираем только финальные (подтвержденные) фразы
                transcript_parts.append(sentence)
                self.transcript = " ".join(transcript_parts).strip()
                latest_interim = ""
            else:
                # Fallback: некоторые сессии завершаются без is_final.
                latest_interim = sentence
        def on_speech_started(unused_self, speech_started, **kwargs):
            """Вызывается, когда VAD (Voice Activity Detection) слышит голос."""
@@ -231,7 +321,7 @@ class SpeechRecognizer:
        def on_error(unused_self, error, **kwargs):
            print(f"Deepgram Error: {error}")
            try:
-                loop.call_soon_threadsafe(stop_event.set)
+                loop.call_soon_threadsafe(request_stop)
            except RuntimeError:
                # Event loop might be closed, ignore
                pass
@@ -242,27 +332,34 @@ class SpeechRecognizer:
        dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)
        # Параметры распознавания
        options = LiveOptions(
            model="nova-2",  # Самая быстрая и точная модель
            language=self.current_lang,
            smart_format=True,  # Расстановка знаков препинания
            encoding="linear16",
            channels=1,
            sample_rate=self._stream_sample_rate,
            interim_results=True,
            utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
            vad_events=True,
            # Сглаженный порог endpointing, чтобы не резать речь на коротких паузах.
            endpointing=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
        )
        # --- Задача отправки аудио с буферизацией ---
-        async def send_audio():
+        sender_stop_event = threading.Event()
        def request_stop():
            stop_event.set()
            sender_stop_event.set()
        def send_audio():
            chunks_sent = 0
            audio_buffer = []  # Буфер для накопления звука во время подключения
            stream = None
            try:
                stream, stream_sample_rate = self._open_stream_for_session()
                options = LiveOptions(
                    model="nova-2",  # Самая быстрая и точная модель
                    language=self.current_lang,
                    smart_format=True,  # Расстановка знаков препинания
                    encoding="linear16",
                    channels=1,
                    sample_rate=stream_sample_rate,
                    interim_results=True,
                    utterance_end_ms=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
                    vad_events=True,
                    # Сглаженный порог endpointing, чтобы не резать речь на коротких паузах.
                    endpointing=int(POST_SPEECH_SILENCE_TIMEOUT_SECONDS * 1000),
                )
                # 1. Сразу начинаем захват звука, не дожидаясь сети!
                stream.start_stream()
                print("🎤 Stream started (buffering)...")
@@ -270,34 +367,61 @@ class SpeechRecognizer:
                # 2. Запускаем подключение к Deepgram в фоне (через ThreadPool, т.к. start() блокирующий)
                # Но в данном SDK start() возвращает bool, он может быть блокирующим.
                # Deepgram Python SDK v3+ start() делает handshake.
                connect_result = {"done": False, "ok": None, "error": None}
-                connect_future = loop.run_in_executor(
+                def start_connection():
-                    None, lambda: dg_connection.start(options)
+                    try:
                        connect_result["ok"] = dg_connection.start(options)
                    except Exception as exc:
                        connect_result["error"] = exc
                    finally:
                        connect_result["done"] = True
                connect_thread = threading.Thread(
                    target=start_connection, daemon=True
                )
                connect_thread.start()
                # Пока подключаемся, копим данные.
                # Ждём коротко: если сеть подвисла, быстрее перезапускаем попытку.
                connect_deadline = time.monotonic() + DEEPGRAM_CONNECT_WAIT_SECONDS
                while (
-                    not connect_future.done()
+                    not connect_result["done"]
                    and time.monotonic() < connect_deadline
                    and not sender_stop_event.is_set()
                ):
                    if stream.is_active():
-                        data = stream.read(4096, exception_on_overflow=False)
+                        try:
                            data = stream.read(4096, exception_on_overflow=False)
                        except Exception as read_error:
                            if sender_stop_event.is_set():
                                return
                            print(f"Audio read error during connect: {read_error}")
                            with contextlib.suppress(RuntimeError):
                                loop.call_soon_threadsafe(request_stop)
                            return
                        audio_buffer.append(data)
-                    await asyncio.sleep(DEEPGRAM_CONNECT_POLL_SECONDS)
+                    time.sleep(DEEPGRAM_CONNECT_POLL_SECONDS)
-                if not connect_future.done():
+                if sender_stop_event.is_set():
                    return
                if not connect_result["done"]:
                    print(
                        f"⏰ Timeout connecting to Deepgram ({DEEPGRAM_CONNECT_WAIT_SECONDS:.1f}s)"
                    )
-                    stop_event.set()
+                    loop.call_soon_threadsafe(request_stop)
                    return
                # Проверяем результат подключения
-                if connect_future.result() is False:
+                if connect_result["error"] is not None:
                    print(f"Failed to start Deepgram connection: {connect_result['error']}")
                    loop.call_soon_threadsafe(request_stop)
                    return
                if connect_result["ok"] is False:
                    print("Failed to start Deepgram connection")
-                    stop_event.set()
+                    loop.call_soon_threadsafe(request_stop)
                    return
                print(f"🚀 Connected! Sending buffer ({len(audio_buffer)} chunks)...")
@@ -310,23 +434,45 @@ class SpeechRecognizer:
                audio_buffer = None  # Освобождаем память
                # 4. Продолжаем стримить в реальном времени до события остановки.
-                while not stop_event.is_set():
+                while not sender_stop_event.is_set():
-                    if stream.is_active():
+                    if not stream.is_active():
                        break
                    try:
                        data = stream.read(4096, exception_on_overflow=False)
-                        dg_connection.send(data)
+                    except Exception as read_error:
-                        chunks_sent += 1
+                        if sender_stop_event.is_set():
-                        if chunks_sent % 50 == 0:
+                            break
-                            print(".", end="", flush=True)
+                        print(f"Audio read error: {read_error}")
-                    await asyncio.sleep(0.002)  # Уменьшаем задержку для более быстрого реагирования
+                        with contextlib.suppress(RuntimeError):
                            loop.call_soon_threadsafe(request_stop)
                        break
                    if sender_stop_event.is_set():
                        break
                    dg_connection.send(data)
                    chunks_sent += 1
                    if chunks_sent % 50 == 0:
                        print(".", end="", flush=True)
                    time.sleep(0.002)  # Уменьшаем задержку для более быстрого реагирования
            except Exception as e:
                print(f"Audio send error: {e}")
                with contextlib.suppress(RuntimeError):
                    loop.call_soon_threadsafe(request_stop)
            finally:
-                if stream.is_active():
+                with contextlib.suppress(Exception):
-                    stream.stop_stream()
+                    if stream and stream.is_active():
                        stream.stop_stream()
                with contextlib.suppress(Exception):
                    if stream:
                        stream.close()
                print(f"\n🛑 Stream stopped. Chunks sent: {chunks_sent}")
-        sender_task = asyncio.create_task(send_audio())
+        sender_thread = threading.Thread(
            target=send_audio,
            daemon=True,
            name="deepgram-audio-sender",
        )
        sender_thread.start()
        if False:  # dg_connection.start(options) перенесен внутрь send_audio
            pass
@@ -356,7 +502,7 @@ class SpeechRecognizer:
                if not done:
                    # Если за detection_timeout никто не начал говорить, выходим
-                    stop_event.set()
+                    request_stop()
            # 2. После старта речи завершаем только по тишине POST_SPEECH_SILENCE_TIMEOUT_SECONDS.
            # Добавляем длинный защитный лимит, чтобы сессия не зависла навсегда.
@@ -374,7 +520,7 @@ class SpeechRecognizer:
                            now - last_speech_activity
                            >= POST_SPEECH_SILENCE_TIMEOUT_SECONDS
                        ):
-                            stop_event.set()
+                            request_stop()
                            break
                        if (
@@ -383,7 +529,7 @@ class SpeechRecognizer:
                            >= max_active_speech_seconds
                        ):
                            print("⏱️ Достигнут защитный лимит активного прослушивания.")
-                            stop_event.set()
+                            request_stop()
                            break
                    await asyncio.sleep(0.05)
@@ -393,19 +539,29 @@ class SpeechRecognizer:
        except Exception as e:
            print(f"Error in waiting for events: {e}")
-        stop_event.set()
+        request_stop()
-        try:
+        sender_stopped = await self._wait_for_thread(
-            await sender_task
+            sender_thread,
-        except Exception as e:
+            timeout_seconds=max(SENDER_STOP_WAIT_SECONDS, SENDER_FORCE_RELEASE_WAIT_SECONDS),
-            print(f"Error waiting for sender task: {e}")
+        )
        if not sender_stopped:
            print("⚠️ Audio sender shutdown timed out; continuing cleanup.")
        # Небольшая пауза, чтобы получить последние transcript-события перед finish().
        await asyncio.sleep(DEEPGRAM_FINALIZATION_GRACE_SECONDS)
        # Завершаем соединение и ждем последние результаты
-        try:
+        await self._run_blocking_cleanup(
-            dg_connection.finish()
+            dg_connection.finish,
-        except Exception as e:
+            timeout_seconds=DEEPGRAM_FINISH_TIMEOUT_SECONDS,
-            print(f"Error finishing connection: {e}")
+            label="Deepgram finish",
        )
-        return self.transcript
+        final_text = self.transcript.strip()
        if not final_text:
            final_text = latest_interim.strip()
        self.transcript = final_text
        return final_text
    def listen(
        self,
--- a/app/audio/tts.py
+++ b/app/audio/tts.py
@@ -286,6 +286,9 @@ class TextToSpeech:
        if not text.strip():
            return True
        if check_interrupt is None:
            check_interrupt = self._default_interrupt_checker()
        if language == "ru":
            text = self._preprocess_text(text)
            segments = self._split_mixed_language(text)
@@ -296,6 +299,14 @@ class TextToSpeech:
            text, check_interrupt=check_interrupt, language=language
        )
    def _default_interrupt_checker(self):
        try:
            from .wakeword import check_wakeword_once
            return check_wakeword_once
        except Exception:
            return None
    def _resample_audio(self, audio_np: np.ndarray, src_rate: int, dst_rate: int):
        if src_rate == dst_rate:
            return audio_np.astype(np.float32, copy=False)
--- a/app/core/ai.py
+++ b/app/core/ai.py
@@ -54,6 +54,26 @@ No explanations, no quotes, no comments.
 Separate variants with " / " (space slash space).
 Keep the translation максимально кратким и естественным, без лишних слов."""
 INTENT_SYSTEM_PROMPT = """Ты NLU-модуль голосовой колонки.
 Твоя задача: распознать намерение пользователя и вернуть СТРОГО JSON без markdown и пояснений.
 Всегда возвращай объект c ключами:
 {
  "intent": "none|music|timer|alarm|weather|volume|translation|cities|repeat|stop|smalltalk|chat",
  "normalized_command": "<краткая нормализованная команда на русском или пусто>",
  "music_action": "none|play|pause|resume|next|previous|current|play_genre|play_folder|play_query",
  "music_query": "<запрос для музыки/жанра/папки или пусто>",
  "confidence": 0.0
 }
 Правила:
 - Если это музыка, ставь intent=music и выбирай music_action.
 - "Включи музыку" и любые эквиваленты = music_action=play.
 - Для "пауза/останови музыку/выключи музыку" = music_action=pause.
 - Для "что играет" = music_action=current.
 - Для "включи жанр X" = music_action=play_genre, music_query=X.
 - Для "включи папку X" = music_action=play_folder, music_query=X.
 - normalized_command должен быть пригоден для командного парсера (без лишних слов).
 - Если уверенность низкая, ставь intent=none, music_action=none, confidence <= 0.4."""
 _PROVIDER_ALIASES = {
    "": "openrouter",
    "anthropic": "anthropic",
@@ -381,6 +401,32 @@ def _log_request_exception(cfg, error: Exception):
    print(f"❌ Ошибка API ({cfg['name']}): {error}{details}")
 def _extract_json_object(raw_text: str) -> Optional[dict]:
    text = str(raw_text or "").strip()
    if not text:
        return None
    try:
        payload = json.loads(text)
        if isinstance(payload, dict):
            return payload
    except json.JSONDecodeError:
        pass
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if not match:
        return None
    candidate = match.group(0).strip()
    try:
        payload = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if isinstance(payload, dict):
        return payload
    return None
 def _send_request(messages, max_tokens, temperature, error_text):
    """
    Внутренняя функция для отправки HTTP-запроса к выбранному AI-провайдеру.
@@ -422,6 +468,98 @@ def _send_request(messages, max_tokens, temperature, error_text):
        return "Не удалось обработать ответ от AI."
 def interpret_assistant_intent(text: str) -> dict:
    """
    Interpret the semantics of a voice command for downstream command routers.

    Sends the utterance to the configured AI provider with
    ``INTENT_SYSTEM_PROMPT`` and normalizes the JSON answer.

    Args:
        text: Recognized user utterance. Falsy or whitespace-only input
            short-circuits without contacting the provider.

    Returns:
        A dict that always has the keys ``intent``, ``normalized_command``,
        ``music_action``, ``music_query`` and ``confidence``. The defaults
        (``"none"``, ``""``, ``"none"``, ``""``, ``0.0``) are returned when
        the input is empty, the provider is not selected or configured, or
        the response contains no JSON object. Unknown ``intent`` and
        ``music_action`` values fall back to ``"none"``; ``confidence`` is
        clamped to the range ``[0.0, 1.0]``.
    """
    result = {
        "intent": "none",
        "normalized_command": "",
        "music_action": "none",
        "music_query": "",
        "confidence": 0.0,
    }
    cleaned_text = str(text or "").strip()
    if not cleaned_text:
        return result

    cfg, selection_error = _get_provider_settings()
    if selection_error or _get_provider_config_error(cfg):
        return result

    messages = [
        {"role": "system", "content": INTENT_SYSTEM_PROMPT},
        {"role": "user", "content": cleaned_text},
    ]
    response = _send_request(
        messages,
        max_tokens=220,
        temperature=0.0,
        error_text="",
    )
    payload = _extract_json_object(response)
    if not payload:
        return result

    def _text_field(key: str) -> str:
        # A JSON null must become "" - str(None) would leak the literal
        # "None" into commands and music queries.
        value = payload.get(key)
        return "" if value is None else str(value).strip()

    allowed_intents = {
        "none",
        "music",
        "timer",
        "alarm",
        "weather",
        "volume",
        "translation",
        "cities",
        "repeat",
        "stop",
        "smalltalk",
        "chat",
    }
    allowed_music_actions = {
        "none",
        "play",
        "pause",
        "resume",
        "next",
        "previous",
        "current",
        "play_genre",
        "play_folder",
        "play_query",
    }

    intent = _text_field("intent").lower()
    if intent not in allowed_intents:
        intent = "none"

    music_action = _text_field("music_action").lower()
    if music_action not in allowed_music_actions:
        music_action = "none"

    try:
        confidence = float(payload.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        confidence = 0.0
    # NaN fails every comparison, so it lands in the else branch together
    # with negative values instead of being clamped up to 1.0.
    confidence = min(1.0, confidence) if confidence >= 0.0 else 0.0

    result.update(
        {
            "intent": intent,
            "normalized_command": _text_field("normalized_command"),
            "music_action": music_action,
            "music_query": _text_field("music_query"),
            "confidence": confidence,
        }
    )
    return result
 def ask_ai(messages_history: list) -> str:
    """
    Запрос к AI в режиме чата.
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -129,3 +129,8 @@ TTS_SAMPLE_RATE = 48000
 WEATHER_LAT = os.getenv("WEATHER_LAT")
 WEATHER_LON = os.getenv("WEATHER_LON")
 WEATHER_CITY = os.getenv("WEATHER_CITY", "Ухта")
 # --- Navidrome settings (music) ---
 # The URL is stripped of surrounding whitespace and of any trailing "/".
 NAVIDROME_URL = os.getenv("NAVIDROME_URL", "").strip().rstrip("/")
 NAVIDROME_USERNAME = os.getenv("NAVIDROME_USERNAME", "").strip()
 # The password is taken verbatim (it is not stripped).
 NAVIDROME_PASSWORD = os.getenv("NAVIDROME_PASSWORD", "")
--- a/app/features/music.py
+++ b/app/features/music.py
--- a/app/main.py
+++ b/app/main.py
@@ -33,7 +33,7 @@ from .audio.wakeword import (
 from .audio.wakeword import (
    stop_monitoring as stop_wakeword_monitoring,
 )
-from .core.ai import ask_ai_stream, translate_text
+from .core.ai import ask_ai_stream, interpret_assistant_intent, translate_text
 from .core.config import BASE_DIR, WAKE_WORD
 from .core.cleaner import clean_response
 from .core.commands import is_stop_command
@@ -163,6 +163,10 @@ _CITY_PATTERNS = [
    ),
 ]
 # Minimum confidence before the model's normalized command replaces the raw
 # utterance for the rule-based command parsers.
 _SEMANTIC_INTENT_MIN_CONFIDENCE = 0.55
 # Minimum confidence for passing a "music" intent straight to the music
 # controller's semantic-action handler.
 _SEMANTIC_MUSIC_MIN_CONFIDENCE = 0.45
 # Stricter minimum shared by the semantic "stop" and "repeat" intents.
 _SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE = 0.72
 def signal_handler(sig, frame):
    """Обработчик Ctrl+C."""
@@ -311,7 +315,7 @@ def main():
                    continue  # Продолжаем цикл
            else:
                # Follow-up режим — без wake word
-                print(f"👂 Слушаю ({followup_idle_timeout_seconds:.0f} сек)...")
+                print(f"👂 Слушаю ({followup_idle_timeout_seconds:.1f} сек)...")
                try:
                    user_text = listen(
                        timeout_seconds=7.0,
@@ -341,6 +345,11 @@ def main():
            # Проверка на команду "Стоп"
            if is_stop_command(user_text):
                music_controller = get_music_controller()
                music_stop_response = music_controller.pause_for_stop_word()
                if music_stop_response:
                    print(f"🎵 {music_stop_response}")
                if stopwatch_manager.has_running_stopwatches():
                    stopwatch_stop_response = stopwatch_manager.pause_stopwatches()
                    clean_stopwatch_stop_response = clean_response(
@@ -369,8 +378,93 @@ def main():
                skip_wakeword = True
                continue
            effective_text = user_text
            semantic_intent = interpret_assistant_intent(user_text)
            semantic_type = str(semantic_intent.get("intent", "none")).strip().lower()
            try:
                semantic_confidence = float(
                    semantic_intent.get("confidence", 0.0) or 0.0
                )
            except (TypeError, ValueError):
                semantic_confidence = 0.0
            semantic_command = str(semantic_intent.get("normalized_command", "")).strip()
            semantic_music_action = (
                str(semantic_intent.get("music_action", "none")).strip().lower()
            )
            semantic_music_query = str(semantic_intent.get("music_query", "")).strip()
            if (
                semantic_type == "stop"
                and semantic_confidence >= _SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE
            ):
                music_controller = get_music_controller()
                music_stop_response = music_controller.pause_for_stop_word()
                if music_stop_response:
                    print(f"🎵 {music_stop_response}")
                if stopwatch_manager.has_running_stopwatches():
                    stopwatch_stop_response = stopwatch_manager.pause_stopwatches()
                    clean_stopwatch_stop_response = clean_response(
                        stopwatch_stop_response, language="ru"
                    )
                    speak(clean_stopwatch_stop_response)
                    last_response = clean_stopwatch_stop_response
                    skip_wakeword = False
                    continue
                print("_" * 50)
                print(f"💤 Жду '{WAKE_WORD}'...")
                skip_wakeword = False
                continue
            if (
                semantic_type == "repeat"
                and semantic_confidence >= _SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE
            ):
                if last_response:
                    print(f"🔁 Повторяю: {last_response}")
                    speak(last_response)
                else:
                    speak("Я еще ничего не говорил.")
                skip_wakeword = True
                continue
            if (
                semantic_type == "music"
                and semantic_confidence >= _SEMANTIC_MUSIC_MIN_CONFIDENCE
            ):
                music_controller = get_music_controller()
                semantic_music_response = music_controller.handle_semantic_action(
                    semantic_music_action,
                    semantic_music_query,
                )
                if semantic_music_response:
                    clean_music_response = clean_response(
                        semantic_music_response, language="ru"
                    )
                    speak(clean_music_response)
                    last_response = clean_music_response
                    skip_wakeword = True
                    continue
            if (
                semantic_command
                and semantic_confidence >= _SEMANTIC_INTENT_MIN_CONFIDENCE
                and semantic_type
                in {
                    "music",
                    "timer",
                    "alarm",
                    "weather",
                    "volume",
                    "translation",
                    "cities",
                }
            ):
                effective_text = semantic_command
                print(f"🧠 Команда: '{user_text}' -> '{effective_text}'")
            # Small-talk
-            smalltalk_response = get_smalltalk_response(user_text)
+            smalltalk_response = get_smalltalk_response(effective_text)
            if smalltalk_response:
                clean_smalltalk = clean_response(smalltalk_response, language="ru")
                speak(clean_smalltalk)
@@ -378,7 +472,7 @@ def main():
                skip_wakeword = True
                continue
-            command_text = user_text
+            command_text = effective_text
            command_text_lower = command_text.lower()
            if pending_time_target == "timer" and "таймер" not in command_text_lower:
                command_text = f"таймер {command_text}"
@@ -427,9 +521,9 @@ def main():
                continue
            # Громкость
-            if user_text.lower().startswith("громкость"):
+            if command_text.lower().startswith("громкость"):
                try:
-                    vol_str = user_text.lower().replace("громкость", "", 1).strip()
+                    vol_str = command_text.lower().replace("громкость", "", 1).strip()
                    level = parse_volume_text(vol_str)
                    if level is not None:
@@ -455,7 +549,7 @@ def main():
            # Погода
            requested_city = None
-            user_text_lower = user_text.lower()
+            user_text_lower = command_text.lower()
            for pattern in _CITY_PATTERNS:
                match = pattern.search(user_text_lower)
@@ -487,7 +581,7 @@ def main():
            # Музыка
            music_controller = get_music_controller()
-            music_response = music_controller.parse_command(user_text)
+            music_response = music_controller.parse_command(command_text)
            if music_response:
                clean_music_response = clean_response(music_response, language="ru")
                speak(clean_music_response)
@@ -496,7 +590,7 @@ def main():
                continue
            # Перевод
-            translation_request = parse_translation_request(user_text)
+            translation_request = parse_translation_request(command_text)
            if translation_request:
                source_lang = translation_request["source_lang"]
                target_lang = translation_request["target_lang"]
@@ -553,8 +647,7 @@ def main():
                continue
            # Игра "Города"
-            cities_response = cities_game.handle(user_text)
+            cities_response = cities_game.handle(command_text)
            cities_response = cities_game.handle(user_text)
            if cities_response:
                clean_cities_response = clean_response(cities_response, language="ru")
                speak(clean_cities_response)