silero v5

2026-01-07 17:31:22 +03:00
parent ebaed3fbbe
commit 7b79593cad
5 changed files with 183 additions and 87 deletions
--- a/ai.py
+++ b/ai.py
@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
 Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
 Отвечай кратко и по существу, на русском языке.
 Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
-Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов."""
+Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
+ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""


 def ask_ai(messages_history: list) -> str:
--- a/cleaner.py
+++ b/cleaner.py
@@ -251,6 +251,9 @@ def clean_response(text: str) -> str:
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)

+    # Remove informal greetings and filler words at the start of each line (re.MULTILINE); NOTE: no \b after the group, so "Нужно" is cut to "жно"
+    text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
+    
    # Convert numbers to words (Russian)
    text = numbers_to_words(text)
    
--- a/config.py
+++ b/config.py
@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
 CHANNELS = 1

 # TTS configuration
-TTS_SPEAKER = "xenia"  # Available: aidar, baya, kseniya, xenia, eugene
+TTS_SPEAKER = "eugene"  # Available: aidar, baya, kseniya, xenia, eugene
 TTS_SAMPLE_RATE = 48000
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,28 +1,52 @@
-# Smart Speaker Dependencies
-# Python 3.12.8
-
-# Wake word detection
-pvporcupine>=3.0.0
-
-# Speech-to-Text
-vosk>=0.3.45
-
-# Audio
-pyaudio>=0.2.14
-sounddevice>=0.4.6
-
-# AI API
-requests>=2.31.0
-
-# Environment
-python-dotenv>=1.0.0
-
-# TTS (Silero)
-torch>=2.0.0
-torchaudio>=2.0.0
-omegaconf>=2.3.0
-
-# Utils
-numpy>=1.24.0
-num2words
-pymorphy3
+antlr4-python3-runtime==4.9.3
+certifi==2025.11.12
+cffi==2.0.0
+charset-normalizer==3.4.4
+DAWG2-Python==0.9.0
+docopt==0.6.2
+filelock==3.20.1
+fsspec==2025.12.0
+idna==3.11
+Jinja2==3.1.6
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.6.1
+num2words==0.5.14
+numpy==2.4.0
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.5
+nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvshmem-cu12==3.3.20
+nvidia-nvtx-cu12==12.8.90
+omegaconf==2.3.0
+pvporcupine==4.0.1
+PyAudio==0.2.14
+pycparser==2.23
+pymorphy3==2.0.6
+pymorphy3-dicts-ru==2.4.417150.4580142
+python-dotenv==1.2.1
+PyYAML==6.0.3
+requests==2.32.5
+scipy==1.16.3
+setuptools==80.9.0
+sounddevice==0.5.3
+srt==3.5.3
+sympy==1.14.0
+torch==2.9.1
+torchaudio==2.9.1
+tqdm==4.67.1
+triton==3.5.1
+typing_extensions==4.15.0
+urllib3==2.6.2
+vosk==0.3.45
+websockets==15.0.1
--- a/tts.py
+++ b/tts.py
@@ -3,13 +3,19 @@ Text-to-Speech module using Silero TTS.
 Generates natural Russian speech with Xenia voice.
 Supports interruption via wake word detection using threading.
 """
+
 import torch
 import sounddevice as sd
 import numpy as np
 import threading
 import time
+import warnings
+import re
 from config import TTS_SPEAKER, TTS_SAMPLE_RATE

+# Suppress Silero TTS warning about text length
+warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
+

 class TextToSpeech:
    """Text-to-Speech using Silero TTS with wake word interruption support."""
@@ -23,17 +29,59 @@ class TextToSpeech:

    def initialize(self):
        """Initialize Silero TTS model."""
-        print("📦 Загрузка модели Silero TTS...")
+        print("📦 Загрузка модели Silero TTS v5...")

        # Load Silero TTS model
+        device = torch.device('cpu')
        self.model, _ = torch.hub.load(
-            repo_or_dir='snakers4/silero-models',
-            model='silero_tts',
-            language='ru',
-            speaker='v4_ru'
+            repo_or_dir="snakers4/silero-models",
+            model="silero_tts",
+            language="ru",
+            speaker="v5_ru",
        )
+        self.model.to(device)

-        print(f"✅ Модель TTS загружена (голос: {self.speaker})")
+        print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
+
+    def _split_text(self, text: str, max_length: int = 900) -> list[str]:
+        """Split text into chunks of at most max_length characters."""
+        if len(text) <= max_length:
+            return [text]
+
+        chunks = []
+        # Split by sentence endings, keeping the punctuation
+        # pattern matches one or more of [.!?] followed by optional whitespace
+        parts = re.split(r"([.!?]+\s*)", text)
+
+        current_chunk = ""
+        # re.split with a capturing group returns [text, delimiter, text, delimiter, ...].
+        # Parts are appended to current_chunk in order, so a delimiter normally stays attached to the text before it.
+
+        for part in parts:
+            # If the part combined with current_chunk exceeds max_length, save current_chunk
+            if len(current_chunk) + len(part) > max_length:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = ""
+
+            current_chunk += part
+
+            # If even a single part is too big (very long sentence without punctuation), force split
+            while len(current_chunk) > max_length:
+                # Try to split by space
+                split_idx = current_chunk.rfind(" ", 0, max_length)
+                if split_idx == -1:
+                    # No space found, hard cut
+                    split_idx = max_length
+
+                chunks.append(current_chunk[:split_idx].strip())
+                current_chunk = current_chunk[split_idx:].lstrip()
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        # Filter empty chunks
+        return [c for c in chunks if c]

    def speak(self, text: str, check_interrupt=None) -> bool:
        """
@@ -52,17 +100,28 @@ class TextToSpeech:
        if not self.model:
            self.initialize()

+        # Split text into manageable chunks
+        chunks = self._split_text(text)
+        total_chunks = len(chunks)
+
+        if total_chunks > 1:
+            print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
+        else:
            print(f"🔊 Озвучивание: {text[:50]}...")

        self._interrupted = False
        self._stop_flag.clear()

+        success = True
+
+        for i, chunk in enumerate(chunks):
+            if self._interrupted:
+                break
+
            try:
-            # Generate audio
+                # Generate audio for chunk
                audio = self.model.apply_tts(
-                text=text,
-                speaker=self.speaker,
-                sample_rate=self.sample_rate
+                    text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
                )

                # Convert to numpy array
@@ -70,16 +129,28 @@ class TextToSpeech:

                if check_interrupt:
                    # Play with interrupt checking in parallel thread
-                return self._play_with_interrupt(audio_np, check_interrupt)
+                    if not self._play_with_interrupt(audio_np, check_interrupt):
+                        success = False
+                        break
                else:
                    # Standard playback
                    sd.play(audio_np, self.sample_rate)
                    sd.wait()
-                print("✅ Воспроизведение завершено")
-                return True

            except Exception as e:
-            print(f"❌ Ошибка TTS: {e}")
+                print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
+                success = False
+                # Deliberately no break here: the loop continues with the next chunk.
+                # Splitting should already prevent the "text too long" error, so a failure
+                # is more likely caused by something specific to this chunk (e.g. an
+                # unsupported symbol), and the remaining chunks may still be spoken.
+
+        if success and not self._interrupted:
+            print("✅ Воспроизведение завершено")
+            return True
+        elif self._interrupted:
+            return False
+        else:
            return False

    def _check_interrupt_worker(self, check_interrupt):
@@ -109,9 +180,7 @@ class TextToSpeech:
        """
        # Start interrupt checker thread
        checker_thread = threading.Thread(
-            target=self._check_interrupt_worker, 
-            args=(check_interrupt,),
-            daemon=True
+            target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
        )
        checker_thread.start()

@@ -133,7 +202,6 @@ class TextToSpeech:
        if self._interrupted:
            return False

-        print("✅ Воспроизведение завершено")
        return True

    @property