silero v5

2026-01-07 17:31:22 +03:00
parent ebaed3fbbe
commit 7b79593cad
5 changed files with 183 additions and 87 deletions
--- a/ai.py
+++ b/ai.py
@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
 Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
 Отвечай кратко и по существу, на русском языке.
 Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
-Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов."""
+Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
 ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""
 def ask_ai(messages_history: list) -> str:
--- a/cleaner.py
+++ b/cleaner.py
@@ -251,6 +251,9 @@ def clean_response(text: str) -> str:
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)
    # Remove informal slang greetings at the beginning of sentences/responses
    text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
    # Convert numbers to words (Russian)
    text = numbers_to_words(text)
--- a/config.py
+++ b/config.py
@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
 CHANNELS = 1
 # TTS configuration
-TTS_SPEAKER = "xenia"  # Available: aidar, baya, kseniya, xenia, eugene
+TTS_SPEAKER = "eugene"  # Available: aidar, baya, kseniya, xenia, eugene
 TTS_SAMPLE_RATE = 48000
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,28 +1,52 @@
-# Smart Speaker Dependencies
+antlr4-python3-runtime==4.9.3
-# Python 3.12.8
+certifi==2025.11.12
-
+cffi==2.0.0
-# Wake word detection
+charset-normalizer==3.4.4
-pvporcupine>=3.0.0
+DAWG2-Python==0.9.0
-
+docopt==0.6.2
-# Speech-to-Text
+filelock==3.20.1
-vosk>=0.3.45
+fsspec==2025.12.0
-
+idna==3.11
-# Audio
+Jinja2==3.1.6
-pyaudio>=0.2.14
+MarkupSafe==3.0.3
-sounddevice>=0.4.6
+mpmath==1.3.0
-
+networkx==3.6.1
-# AI API
+num2words==0.5.14
-requests>=2.31.0
+numpy==2.4.0
-
+nvidia-cublas-cu12==12.8.4.1
-# Environment
+nvidia-cuda-cupti-cu12==12.8.90
-python-dotenv>=1.0.0
+nvidia-cuda-nvrtc-cu12==12.8.93
-
+nvidia-cuda-runtime-cu12==12.8.90
-# TTS (Silero)
+nvidia-cudnn-cu12==9.10.2.21
-torch>=2.0.0
+nvidia-cufft-cu12==11.3.3.83
-torchaudio>=2.0.0
+nvidia-cufile-cu12==1.13.1.3
-omegaconf>=2.3.0
+nvidia-curand-cu12==10.3.9.90
-
+nvidia-cusolver-cu12==11.7.3.90
-# Utils
+nvidia-cusparse-cu12==12.5.8.93
-numpy>=1.24.0
+nvidia-cusparselt-cu12==0.7.1
-num2words
+nvidia-nccl-cu12==2.27.5
-pymorphy3
+nvidia-nvjitlink-cu12==12.8.93
 nvidia-nvshmem-cu12==3.3.20
 nvidia-nvtx-cu12==12.8.90
 omegaconf==2.3.0
 pvporcupine==4.0.1
 PyAudio==0.2.14
 pycparser==2.23
 pymorphy3==2.0.6
 pymorphy3-dicts-ru==2.4.417150.4580142
 python-dotenv==1.2.1
 PyYAML==6.0.3
 requests==2.32.5
 scipy==1.16.3
 setuptools==80.9.0
 sounddevice==0.5.3
 srt==3.5.3
 sympy==1.14.0
 torch==2.9.1
 torchaudio==2.9.1
 tqdm==4.67.1
 triton==3.5.1
 typing_extensions==4.15.0
 urllib3==2.6.2
 vosk==0.3.45
 websockets==15.0.1
--- a/tts.py
+++ b/tts.py
@@ -3,13 +3,19 @@ Text-to-Speech module using Silero TTS.
 Generates natural Russian speech using the voice selected by TTS_SPEAKER in config.
 Supports interruption via wake word detection using threading.
 """
 import torch
 import sounddevice as sd
 import numpy as np
 import threading
 import time
 import warnings
 import re
 from config import TTS_SPEAKER, TTS_SAMPLE_RATE
 # Suppress Silero TTS warning about text length
 warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
 class TextToSpeech:
    """Text-to-Speech using Silero TTS with wake word interruption support."""
@@ -23,17 +29,59 @@ class TextToSpeech:
    def initialize(self):
        """Initialize Silero TTS model."""
-        print("📦 Загрузка модели Silero TTS...")
+        print("📦 Загрузка модели Silero TTS v5...")
        # Load Silero TTS model
        device = torch.device('cpu')
        self.model, _ = torch.hub.load(
-            repo_or_dir='snakers4/silero-models',
+            repo_or_dir="snakers4/silero-models",
-            model='silero_tts',
+            model="silero_tts",
-            language='ru',
+            language="ru",
-            speaker='v4_ru'
+            speaker="v5_ru",
        )
        self.model.to(device)
-        print(f"✅ Модель TTS загружена (голос: {self.speaker})")
+        print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
    def _split_text(self, text: str, max_length: int = 900) -> list[str]:
        """Split text into chunks smaller than max_length."""
        if len(text) <= max_length:
            return [text]
        chunks = []
        # Split by sentence endings, keeping the punctuation
        # pattern matches [.!?] followed by optional newlines
        parts = re.split(r"([.!?]+\s*)", text)
        current_chunk = ""
        # Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...]
        # We iterate through parts. If part is a delimiter (matches pattern), we append to previous text.
        for part in parts:
            # If the part combined with current_chunk exceeds max_length, save current_chunk
            if len(current_chunk) + len(part) > max_length:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
            current_chunk += part
            # If even a single part is too big (very long sentence without punctuation), force split
            while len(current_chunk) > max_length:
                # Try to split by space
                split_idx = current_chunk.rfind(" ", 0, max_length)
                if split_idx == -1:
                    # No space found, hard cut
                    split_idx = max_length
                chunks.append(current_chunk[:split_idx].strip())
                current_chunk = current_chunk[split_idx:].lstrip()
        if current_chunk:
            chunks.append(current_chunk.strip())
        # Filter empty chunks
        return [c for c in chunks if c]
    def speak(self, text: str, check_interrupt=None) -> bool:
        """
@@ -52,34 +100,57 @@ class TextToSpeech:
        if not self.model:
            self.initialize()
-        print(f"🔊 Озвучивание: {text[:50]}...")
+        # Split text into manageable chunks
        chunks = self._split_text(text)
        total_chunks = len(chunks)
        if total_chunks > 1:
            print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
        else:
            print(f"🔊 Озвучивание: {text[:50]}...")
        self._interrupted = False
        self._stop_flag.clear()
-        try:
+        success = True
            # Generate audio
            audio = self.model.apply_tts(
                text=text,
                speaker=self.speaker,
                sample_rate=self.sample_rate
            )
-            # Convert to numpy array
+        for i, chunk in enumerate(chunks):
-            audio_np = audio.numpy()
+            if self._interrupted:
                break
-            if check_interrupt:
+            try:
-                # Play with interrupt checking in parallel thread
+                # Generate audio for chunk
-                return self._play_with_interrupt(audio_np, check_interrupt)
+                audio = self.model.apply_tts(
-            else:
+                    text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
-                # Standard playback
+                )
                sd.play(audio_np, self.sample_rate)
                sd.wait()
                print("✅ Воспроизведение завершено")
                return True
-        except Exception as e:
+                # Convert to numpy array
-            print(f"❌ Ошибка TTS: {e}")
+                audio_np = audio.numpy()
                if check_interrupt:
                    # Play with interrupt checking in parallel thread
                    if not self._play_with_interrupt(audio_np, check_interrupt):
                        success = False
                        break
                else:
                    # Standard playback
                    sd.play(audio_np, self.sample_rate)
                    sd.wait()
            except Exception as e:
                print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
                success = False
                # Continue with next chunk? or break?
                # Usually if one fails, we might want to try others, but for "too long" error
                # splitting should solve it. If it fails for other reasons, maybe better to stop.
                # But let's keep trying subsequent chunks in case it's a specific symbol issue.
        if success and not self._interrupted:
            print("✅ Воспроизведение завершено")
            return True
        elif self._interrupted:
            return False
        else:
            return False
    def _check_interrupt_worker(self, check_interrupt):
@@ -109,9 +180,7 @@ class TextToSpeech:
        """
        # Start interrupt checker thread
        checker_thread = threading.Thread(
-            target=self._check_interrupt_worker, 
+            target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
            args=(check_interrupt,),
            daemon=True
        )
        checker_thread.start()
@@ -133,7 +202,6 @@ class TextToSpeech:
        if self._interrupted:
            return False
        print("✅ Воспроизведение завершено")
        return True
    @property