feat: improve semantic voice control and music playback

2026-03-15 14:40:33 +03:00
parent e1a94c68db
commit cb54a9ee75
8 changed files with 1656 additions and 276 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -33,7 +33,7 @@ from .audio.wakeword import (
 from .audio.wakeword import (
    stop_monitoring as stop_wakeword_monitoring,
 )
-from .core.ai import ask_ai_stream, translate_text
+from .core.ai import ask_ai_stream, interpret_assistant_intent, translate_text
 from .core.config import BASE_DIR, WAKE_WORD
 from .core.cleaner import clean_response
 from .core.commands import is_stop_command
@@ -163,6 +163,10 @@ _CITY_PATTERNS = [
    ),
 ]

+_SEMANTIC_INTENT_MIN_CONFIDENCE = 0.55  # floor for swapping the utterance for its normalized command
+_SEMANTIC_MUSIC_MIN_CONFIDENCE = 0.45  # floor for dispatching a semantic music action directly
+_SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE = 0.72  # floor for acting on a semantic "stop" or "repeat"
+

 def signal_handler(sig, frame):
    """Обработчик Ctrl+C."""
@@ -311,7 +315,7 @@ def main():
                    continue  # Продолжаем цикл
            else:
                # Follow-up режим — без wake word
-                print(f"👂 Слушаю ({followup_idle_timeout_seconds:.0f} сек)...")
+                print(f"👂 Слушаю ({followup_idle_timeout_seconds:.1f} сек)...")
                try:
                    user_text = listen(
                        timeout_seconds=7.0,
@@ -341,6 +345,11 @@ def main():

            # Проверка на команду "Стоп"
            if is_stop_command(user_text):
+                music_controller = get_music_controller()
+                music_stop_response = music_controller.pause_for_stop_word()
+                if music_stop_response:
+                    print(f"🎵 {music_stop_response}")
+
                if stopwatch_manager.has_running_stopwatches():
                    stopwatch_stop_response = stopwatch_manager.pause_stopwatches()
                    clean_stopwatch_stop_response = clean_response(
@@ -369,8 +378,93 @@ def main():
                skip_wakeword = True
                continue

+            effective_text = user_text
+            # Use `or` fallbacks instead of .get() defaults: a key present with a
+            # None value (JSON null) would otherwise be stringified to "None".
+            semantic_intent = interpret_assistant_intent(user_text) or {}
+            semantic_type = str(semantic_intent.get("intent") or "none").strip().lower()
+            try:
+                semantic_confidence = float(semantic_intent.get("confidence") or 0.0)
+            except (TypeError, ValueError):
+                semantic_confidence = 0.0
+            semantic_command = str(semantic_intent.get("normalized_command") or "").strip()
+            semantic_music_action = (
+                str(semantic_intent.get("music_action") or "none").strip().lower()
+            )
+            semantic_music_query = str(semantic_intent.get("music_query") or "").strip()
+
+            if (
+                semantic_type == "stop"
+                and semantic_confidence >= _SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE
+            ):
+                music_controller = get_music_controller()
+                music_stop_response = music_controller.pause_for_stop_word()
+                if music_stop_response:
+                    print(f"🎵 {music_stop_response}")
+
+                if stopwatch_manager.has_running_stopwatches():
+                    stopwatch_stop_response = stopwatch_manager.pause_stopwatches()
+                    clean_stopwatch_stop_response = clean_response(
+                        stopwatch_stop_response, language="ru"
+                    )
+                    speak(clean_stopwatch_stop_response)
+                    last_response = clean_stopwatch_stop_response
+                    skip_wakeword = False
+                    continue
+                print("_" * 50)
+                print(f"💤 Жду '{WAKE_WORD}'...")
+                skip_wakeword = False
+                continue
+
+            if (
+                semantic_type == "repeat"
+                and semantic_confidence >= _SEMANTIC_REPEAT_STOP_MIN_CONFIDENCE
+            ):
+                if last_response:
+                    print(f"🔁 Повторяю: {last_response}")
+                    speak(last_response)
+                else:
+                    speak("Я еще ничего не говорил.")
+                skip_wakeword = True
+                continue
+
+            if (
+                semantic_type == "music"
+                and semantic_confidence >= _SEMANTIC_MUSIC_MIN_CONFIDENCE
+            ):
+                music_controller = get_music_controller()
+                semantic_music_response = music_controller.handle_semantic_action(
+                    semantic_music_action,
+                    semantic_music_query,
+                )
+                if semantic_music_response:
+                    clean_music_response = clean_response(
+                        semantic_music_response, language="ru"
+                    )
+                    speak(clean_music_response)
+                    last_response = clean_music_response
+                    skip_wakeword = True
+                    continue
+
+            if (
+                semantic_command
+                and semantic_confidence >= _SEMANTIC_INTENT_MIN_CONFIDENCE
+                and semantic_type
+                in {
+                    "music",
+                    "timer",
+                    "alarm",
+                    "weather",
+                    "volume",
+                    "translation",
+                    "cities",
+                }
+            ):
+                effective_text = semantic_command
+                print(f"🧠 Команда: '{user_text}' -> '{effective_text}'")
+
            # Small-talk
-            smalltalk_response = get_smalltalk_response(user_text)
+            smalltalk_response = get_smalltalk_response(effective_text)
            if smalltalk_response:
                clean_smalltalk = clean_response(smalltalk_response, language="ru")
                speak(clean_smalltalk)
@@ -378,7 +472,7 @@ def main():
                skip_wakeword = True
                continue

-            command_text = user_text
+            command_text = effective_text
            command_text_lower = command_text.lower()
            if pending_time_target == "timer" and "таймер" not in command_text_lower:
                command_text = f"таймер {command_text}"
@@ -427,9 +521,9 @@ def main():
                continue

            # Громкость
-            if user_text.lower().startswith("громкость"):
+            if command_text.lower().startswith("громкость"):
                try:
-                    vol_str = user_text.lower().replace("громкость", "", 1).strip()
+                    vol_str = command_text.lower().replace("громкость", "", 1).strip()
                    level = parse_volume_text(vol_str)

                    if level is not None:
@@ -455,7 +549,7 @@ def main():

            # Погода
            requested_city = None
-            user_text_lower = user_text.lower()
+            user_text_lower = command_text.lower()

            for pattern in _CITY_PATTERNS:
                match = pattern.search(user_text_lower)
@@ -487,7 +581,7 @@ def main():

            # Музыка
            music_controller = get_music_controller()
-            music_response = music_controller.parse_command(user_text)
+            music_response = music_controller.parse_command(command_text)
            if music_response:
                clean_music_response = clean_response(music_response, language="ru")
                speak(clean_music_response)
@@ -496,7 +590,7 @@ def main():
                continue

            # Перевод
-            translation_request = parse_translation_request(user_text)
+            translation_request = parse_translation_request(command_text)
            if translation_request:
                source_lang = translation_request["source_lang"]
                target_lang = translation_request["target_lang"]
@@ -553,8 +647,7 @@ def main():
                continue

            # Игра "Города"
-            cities_response = cities_game.handle(user_text)
-            cities_response = cities_game.handle(user_text)
+            cities_response = cities_game.handle(command_text)
            if cities_response:
                clean_cities_response = clean_response(cities_response, language="ru")
                speak(clean_cities_response)