silero v5

2026-01-07 17:31:22 +03:00
parent ebaed3fbbe
commit 7b79593cad
5 changed files with 183 additions and 87 deletions
--- a/ai.py
+++ b/ai.py
@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
 Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
 Отвечай кратко и по существу, на русском языке.
 Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
-Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов."""
+Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
 ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""
 def ask_ai(messages_history: list) -> str:
--- a/cleaner.py
+++ b/cleaner.py
@@ -250,6 +250,9 @@ def clean_response(text: str) -> str:
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)
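    # Remove informal filler words/greetings at the start of every line of the response
    # (the pattern is anchored with ^ under re.MULTILINE, so it fires per line, not per sentence).
    # NOTE(review): the alternation has no trailing word boundary, so ordinary words that merely
    # begin with these letters (e.g. "Также", "Нужно", "Слушайте") are truncated as well — confirm
    # whether a `\b` after the group is wanted.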
    text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
    # Convert numbers to words (Russian)
    text = numbers_to_words(text)
--- a/config.py
+++ b/config.py
@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
 CHANNELS = 1
 # TTS configuration
-TTS_SPEAKER = "xenia"  # Available: aidar, baya, kseniya, xenia, eugene
+TTS_SPEAKER = "eugene"  # Available: aidar, baya, kseniya, xenia, eugene
 TTS_SAMPLE_RATE = 48000
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,28 +1,52 @@
-# Smart Speaker Dependencies
+antlr4-python3-runtime==4.9.3
-# Python 3.12.8
+certifi==2025.11.12
-
+cffi==2.0.0
-# Wake word detection
+charset-normalizer==3.4.4
-pvporcupine>=3.0.0
+DAWG2-Python==0.9.0
-
+docopt==0.6.2
-# Speech-to-Text
+filelock==3.20.1
-vosk>=0.3.45
+fsspec==2025.12.0
-
+idna==3.11
-# Audio
+Jinja2==3.1.6
-pyaudio>=0.2.14
+MarkupSafe==3.0.3
-sounddevice>=0.4.6
+mpmath==1.3.0
-
+networkx==3.6.1
-# AI API
+num2words==0.5.14
-requests>=2.31.0
+numpy==2.4.0
-
+nvidia-cublas-cu12==12.8.4.1
-# Environment
+nvidia-cuda-cupti-cu12==12.8.90
-python-dotenv>=1.0.0
+nvidia-cuda-nvrtc-cu12==12.8.93
-
+nvidia-cuda-runtime-cu12==12.8.90
-# TTS (Silero)
+nvidia-cudnn-cu12==9.10.2.21
-torch>=2.0.0
+nvidia-cufft-cu12==11.3.3.83
-torchaudio>=2.0.0
+nvidia-cufile-cu12==1.13.1.3
-omegaconf>=2.3.0
+nvidia-curand-cu12==10.3.9.90
-
+nvidia-cusolver-cu12==11.7.3.90
-# Utils
+nvidia-cusparse-cu12==12.5.8.93
-numpy>=1.24.0
+nvidia-cusparselt-cu12==0.7.1
-num2words
+nvidia-nccl-cu12==2.27.5
-pymorphy3
+nvidia-nvjitlink-cu12==12.8.93
 nvidia-nvshmem-cu12==3.3.20
 nvidia-nvtx-cu12==12.8.90
 omegaconf==2.3.0
 pvporcupine==4.0.1
 PyAudio==0.2.14
 pycparser==2.23
 pymorphy3==2.0.6
 pymorphy3-dicts-ru==2.4.417150.4580142
 python-dotenv==1.2.1
 PyYAML==6.0.3
 requests==2.32.5
 scipy==1.16.3
 setuptools==80.9.0
 sounddevice==0.5.3
 srt==3.5.3
 sympy==1.14.0
 torch==2.9.1
 torchaudio==2.9.1
 tqdm==4.67.1
 triton==3.5.1
 typing_extensions==4.15.0
 urllib3==2.6.2
 vosk==0.3.45
 websockets==15.0.1
--- a/tts.py
+++ b/tts.py
@@ -3,85 +3,156 @@ Text-to-Speech module using Silero TTS.
 Generates natural Russian speech with the voice selected by TTS_SPEAKER in config.
 Supports interruption via wake word detection using threading.
 """
 import torch
 import sounddevice as sd
 import numpy as np
 import threading
 import time
 import warnings
 import re
 from config import TTS_SPEAKER, TTS_SAMPLE_RATE
 # Suppress Silero TTS warning about text length
 warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
 class TextToSpeech:
    """Text-to-Speech using Silero TTS with wake word interruption support."""
-    
+
    def __init__(self):
        """Prepare playback state; the model is loaded later by initialize()."""
        # Voice name and output sample rate are taken from config.
        self.speaker = TTS_SPEAKER
        self.sample_rate = TTS_SAMPLE_RATE

        # No model is loaded at construction time.
        self.model = None

        # Interruption bookkeeping: the event tells the checker thread to stop,
        # the boolean records whether playback was cut short.
        self._stop_flag = threading.Event()
        self._interrupted = False
-    
+
    def initialize(self):
        """Initialize Silero TTS model."""
-        print("📦 Загрузка модели Silero TTS...")
+        print("📦 Загрузка модели Silero TTS v5...")
-        
+
        # Load Silero TTS model
        device = torch.device('cpu')
        self.model, _ = torch.hub.load(
-            repo_or_dir='snakers4/silero-models',
+            repo_or_dir="snakers4/silero-models",
-            model='silero_tts',
+            model="silero_tts",
-            language='ru',
+            language="ru",
-            speaker='v4_ru'
+            speaker="v5_ru",
        )
-        
+        self.model.to(device)
-        print(f"✅ Модель TTS загружена (голос: {self.speaker})")
+
-    
+        print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
    def _split_text(self, text: str, max_length: int = 900) -> list[str]:
        """Split text into chunks smaller than max_length."""
        if len(text) <= max_length:
            return [text]
        chunks = []
        # Split by sentence endings, keeping the punctuation
        # pattern matches [.!?] followed by optional newlines
        parts = re.split(r"([.!?]+\s*)", text)
        current_chunk = ""
        # Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...]
        # We iterate through parts. If part is a delimiter (matches pattern), we append to previous text.
        for part in parts:
            # If the part combined with current_chunk exceeds max_length, save current_chunk
            if len(current_chunk) + len(part) > max_length:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
            current_chunk += part
            # If even a single part is too big (very long sentence without punctuation), force split
            while len(current_chunk) > max_length:
                # Try to split by space
                split_idx = current_chunk.rfind(" ", 0, max_length)
                if split_idx == -1:
                    # No space found, hard cut
                    split_idx = max_length
                chunks.append(current_chunk[:split_idx].strip())
                current_chunk = current_chunk[split_idx:].lstrip()
        if current_chunk:
            chunks.append(current_chunk.strip())
        # Filter empty chunks
        return [c for c in chunks if c]
    def speak(self, text: str, check_interrupt=None) -> bool:
        """
        Convert text to speech and play it.
-        
+
        Args:
            text: Text to synthesize and speak
            check_interrupt: Optional callback function that returns True if playback should stop
-            
+
        Returns:
            True if playback completed normally, False if interrupted
        """
        if not text.strip():
            return True
-        
+
        if not self.model:
            self.initialize()
-        
+
-        print(f"🔊 Озвучивание: {text[:50]}...")
+        # Split text into manageable chunks
-        
+        chunks = self._split_text(text)
        total_chunks = len(chunks)
        if total_chunks > 1:
            print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
        else:
            print(f"🔊 Озвучивание: {text[:50]}...")
        self._interrupted = False
        self._stop_flag.clear()
-        
+
-        try:
+        success = True
-            # Generate audio
+
-            audio = self.model.apply_tts(
+        for i, chunk in enumerate(chunks):
-                text=text,
+            if self._interrupted:
-                speaker=self.speaker,
+                break
-                sample_rate=self.sample_rate
+
-            )
+            try:
-            
+                # Generate audio for chunk
-            # Convert to numpy array
+                audio = self.model.apply_tts(
-            audio_np = audio.numpy()
+                    text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
-            
+                )
-            if check_interrupt:
+
-                # Play with interrupt checking in parallel thread
+                # Convert to numpy array
-                return self._play_with_interrupt(audio_np, check_interrupt)
+                audio_np = audio.numpy()
-            else:
+
-                # Standard playback
+                if check_interrupt:
-                sd.play(audio_np, self.sample_rate)
+                    # Play with interrupt checking in parallel thread
-                sd.wait()
+                    if not self._play_with_interrupt(audio_np, check_interrupt):
-                print("✅ Воспроизведение завершено")
+                        success = False
-                return True
+                        break
-            
+                else:
-        except Exception as e:
+                    # Standard playback
-            print(f"❌ Ошибка TTS: {e}")
+                    sd.play(audio_np, self.sample_rate)
                    sd.wait()
            except Exception as e:
                print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
                success = False
                # Deliberately do not break here: the failure may be caused by a
                # specific symbol in this chunk only, so the remaining chunks are
                # still attempted. `success` stays False, so the method reports
                # that playback did not complete normally.
        if success and not self._interrupted:
            print("✅ Воспроизведение завершено")
            return True
        elif self._interrupted:
            return False
-    
+        else:
            return False
    def _check_interrupt_worker(self, check_interrupt):
        """
        Worker thread that continuously checks for interrupt signal.
@@ -95,47 +166,44 @@ class TextToSpeech:
                    return
            except Exception:
                pass
-    
+
    def _play_with_interrupt(self, audio_np: np.ndarray, check_interrupt) -> bool:
        """
        Play audio with interrupt checking in parallel thread.
-        
+
        Args:
            audio_np: Audio data as numpy array
            check_interrupt: Callback that returns True if should interrupt
-            
+
        Returns:
            True if completed normally, False if interrupted
        """
        # Start interrupt checker thread
        checker_thread = threading.Thread(
-            target=self._check_interrupt_worker, 
+            target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
            args=(check_interrupt,),
            daemon=True
        )
        checker_thread.start()
-        
+
        try:
            # Play audio (non-blocking start)
            sd.play(audio_np, self.sample_rate)
-            
+
            # Wait for playback to finish or interrupt
            while sd.get_stream().active:
                if self._interrupted:
                    break
                time.sleep(0.05)
-            
+
        finally:
            # Signal checker thread to stop
            self._stop_flag.set()
            checker_thread.join(timeout=0.5)
-        
+
        if self._interrupted:
            return False
-        
+
        print("✅ Воспроизведение завершено")
        return True
-    
+
    @property
    def was_interrupted(self) -> bool:
        """Check if the last playback was interrupted."""
@@ -157,11 +225,11 @@ def get_tts() -> TextToSpeech:
 def speak(text: str, check_interrupt=None) -> bool:
    """
    Synthesize and speak the given text.
-    
+
    Args:
        text: Text to speak
        check_interrupt: Optional callback for interrupt checking
-        
+
    Returns:
        True if completed normally, False if interrupted
    """