feat: improve semantic voice control and music playback

2026-03-15 14:40:33 +03:00
parent e1a94c68db
commit cb54a9ee75
8 changed files with 1656 additions and 276 deletions
--- a/app/core/ai.py
+++ b/app/core/ai.py
@@ -54,6 +54,26 @@ No explanations, no quotes, no comments.
 Separate variants with " / " (space slash space).
 Keep the translation максимально кратким и естественным, без лишних слов."""

+# System prompt for the intent-recognition (NLU) request; it is sent as the
+# "system" message by interpret_assistant_intent(). It tells the model to
+# answer with a bare JSON object (no markdown, no explanations) holding the
+# keys intent, normalized_command, music_action, music_query and confidence.
+# The intent / music_action value lists below mirror the allow-lists that
+# interpret_assistant_intent() validates against, so keep them in sync.
+INTENT_SYSTEM_PROMPT = """Ты NLU-модуль голосовой колонки.
+Твоя задача: распознать намерение пользователя и вернуть СТРОГО JSON без markdown и пояснений.
+Всегда возвращай объект c ключами:
+{
+  "intent": "none|music|timer|alarm|weather|volume|translation|cities|repeat|stop|smalltalk|chat",
+  "normalized_command": "<краткая нормализованная команда на русском или пусто>",
+  "music_action": "none|play|pause|resume|next|previous|current|play_genre|play_folder|play_query",
+  "music_query": "<запрос для музыки/жанра/папки или пусто>",
+  "confidence": 0.0
+}
+Правила:
+- Если это музыка, ставь intent=music и выбирай music_action.
+- "Включи музыку" и любые эквиваленты = music_action=play.
+- Для "пауза/останови музыку/выключи музыку" = music_action=pause.
+- Для "что играет" = music_action=current.
+- Для "включи жанр X" = music_action=play_genre, music_query=X.
+- Для "включи папку X" = music_action=play_folder, music_query=X.
+- normalized_command должен быть пригоден для командного парсера (без лишних слов).
+- Если уверенность низкая, ставь intent=none, music_action=none, confidence <= 0.4."""
+
 _PROVIDER_ALIASES = {
    "": "openrouter",
    "anthropic": "anthropic",
@@ -381,6 +401,32 @@ def _log_request_exception(cfg, error: Exception):
    print(f"❌ Ошибка API ({cfg['name']}): {error}{details}")


+def _extract_json_object(raw_text: str) -> Optional[dict]:
+    """Return the first JSON object found in a model response.
+
+    The whole text is tried as a JSON document first. If that fails, or the
+    document is not an object, the text is scanned left to right and decoding
+    is attempted at every ``{``. This tolerates markdown fences, leading or
+    trailing prose and stray braces around the object.
+
+    Args:
+        raw_text: Raw response text; ``None`` and other falsy values are
+            treated as an empty string.
+
+    Returns:
+        The decoded ``dict``, or ``None`` when no JSON object can be decoded.
+    """
+    text = str(raw_text or "").strip()
+    if not text:
+        return None
+
+    # Fast path: the response is exactly one JSON document.
+    try:
+        payload = json.loads(text)
+    except json.JSONDecodeError:
+        payload = None
+    if isinstance(payload, dict):
+        return payload
+
+    # Slow path: decode from each "{" in turn. A greedy "first { to last }"
+    # match would fail as soon as any brace follows the real object.
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            payload, _ = decoder.raw_decode(text[start:])
+        except json.JSONDecodeError:
+            payload = None
+        if isinstance(payload, dict):
+            return payload
+        start = text.find("{", start + 1)
+    return None
+
+
 def _send_request(messages, max_tokens, temperature, error_text):
    """
    Внутренняя функция для отправки HTTP-запроса к выбранному AI-провайдеру.
@@ -422,6 +468,98 @@ def _send_request(messages, max_tokens, temperature, error_text):
        return "Не удалось обработать ответ от AI."


+def interpret_assistant_intent(text: str) -> dict:
+    """Classify a voice command into a normalized intent description.
+
+    Sends ``text`` to the configured AI provider together with
+    ``INTENT_SYSTEM_PROMPT`` and validates the JSON object found in the reply.
+
+    Args:
+        text: Recognized user utterance; ``None`` or blank input skips the
+            AI request.
+
+    Returns:
+        A dict that always has these keys:
+
+        * ``intent`` - one of the allowed intents, ``"none"`` by default.
+        * ``normalized_command`` - stripped command text, possibly empty.
+        * ``music_action`` - one of the allowed actions, ``"none"`` by default.
+        * ``music_query`` - stripped query text, possibly empty.
+        * ``confidence`` - float clamped to ``[0.0, 1.0]``.
+
+        The default values are returned unchanged when the input is empty,
+        the provider is not selected or is misconfigured, or the reply
+        contains no non-empty JSON object.
+    """
+    result = {
+        "intent": "none",
+        "normalized_command": "",
+        "music_action": "none",
+        "music_query": "",
+        "confidence": 0.0,
+    }
+    cleaned_text = str(text or "").strip()
+    if not cleaned_text:
+        return result
+
+    cfg, selection_error = _get_provider_settings()
+    if selection_error or _get_provider_config_error(cfg):
+        return result
+
+    messages = [
+        {"role": "system", "content": INTENT_SYSTEM_PROMPT},
+        {"role": "user", "content": cleaned_text},
+    ]
+    response = _send_request(
+        messages,
+        max_tokens=220,
+        temperature=0.0,
+        error_text="",
+    )
+    payload = _extract_json_object(response)
+    if not payload:
+        return result
+
+    allowed_intents = {
+        "none",
+        "music",
+        "timer",
+        "alarm",
+        "weather",
+        "volume",
+        "translation",
+        "cities",
+        "repeat",
+        "stop",
+        "smalltalk",
+        "chat",
+    }
+    allowed_music_actions = {
+        "none",
+        "play",
+        "pause",
+        "resume",
+        "next",
+        "previous",
+        "current",
+        "play_genre",
+        "play_folder",
+        "play_query",
+    }
+
+    def _text_field(key: str) -> str:
+        # A JSON null must become "", not the literal string "None" that
+        # str(None) would produce and hand over to the command router.
+        value = payload.get(key)
+        return "" if value is None else str(value).strip()
+
+    intent = _text_field("intent").lower()
+    if intent not in allowed_intents:
+        intent = "none"
+
+    music_action = _text_field("music_action").lower()
+    if music_action not in allowed_music_actions:
+        music_action = "none"
+
+    try:
+        confidence = float(payload.get("confidence", 0.0))
+    except (TypeError, ValueError):
+        confidence = 0.0
+    if confidence != confidence:
+        # NaN (accepted by json.loads) never equals itself; without this
+        # guard min(1.0, nan) would turn it into full confidence.
+        confidence = 0.0
+    confidence = max(0.0, min(1.0, confidence))
+
+    result.update(
+        {
+            "intent": intent,
+            "normalized_command": _text_field("normalized_command"),
+            "music_action": music_action,
+            "music_query": _text_field("music_query"),
+            "confidence": confidence,
+        }
+    )
+    return result
+
+
 def ask_ai(messages_history: list) -> str:
    """
    Запрос к AI в режиме чата.
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -129,3 +129,8 @@ TTS_SAMPLE_RATE = 48000
 WEATHER_LAT = os.getenv("WEATHER_LAT")
 WEATHER_LON = os.getenv("WEATHER_LON")
 WEATHER_CITY = os.getenv("WEATHER_CITY", "Ухта")
+
+# --- Navidrome settings (music) ---
+# Each value is read from the environment and defaults to an empty string
+# when the variable is unset.
+# Server URL: surrounding whitespace and trailing slashes are removed.
+NAVIDROME_URL = os.getenv("NAVIDROME_URL", "").strip().rstrip("/")
+# User name: surrounding whitespace is removed.
+NAVIDROME_USERNAME = os.getenv("NAVIDROME_USERNAME", "").strip()
+# Password: taken verbatim, no stripping is applied.
+NAVIDROME_PASSWORD = os.getenv("NAVIDROME_PASSWORD", "")