diff --git a/ai.py b/ai.py index cfbbb70..9deeaac 100644 --- a/ai.py +++ b/ai.py @@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а Твоя главная цель — помогать пользователю и поддерживать интересный диалог. Отвечай кратко и по существу, на русском языке. Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом. -Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.""" +Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов. +ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные.""" def ask_ai(messages_history: list) -> str: diff --git a/cleaner.py b/cleaner.py index 1bbcb12..dfc2f29 100644 --- a/cleaner.py +++ b/cleaner.py @@ -250,6 +250,9 @@ def clean_response(text: str) -> str: # Remove HTML tags if any text = re.sub(r'<[^>]+>', '', text) + + # Remove informal slang greetings at the beginning of sentences/responses + text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE) # Convert numbers to words (Russian) text = numbers_to_words(text) diff --git a/config.py b/config.py index 7847ec0..1fc3b80 100644 --- a/config.py +++ b/config.py @@ -29,5 +29,5 @@ SAMPLE_RATE = 16000 CHANNELS = 1 # TTS configuration -TTS_SPEAKER = "xenia" # Available: aidar, baya, kseniya, xenia, eugene +TTS_SPEAKER = "eugene" # Available: aidar, baya, kseniya, xenia, eugene TTS_SAMPLE_RATE = 48000 diff --git a/requirements.txt b/requirements.txt index 18ed72d..83af425 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,52 @@ -# Smart Speaker Dependencies -# Python 3.12.8 - -# Wake word detection -pvporcupine>=3.0.0 - -# Speech-to-Text -vosk>=0.3.45 - -# Audio -pyaudio>=0.2.14 -sounddevice>=0.4.6 - -# AI API -requests>=2.31.0 - -# Environment -python-dotenv>=1.0.0 - -# TTS (Silero) -torch>=2.0.0 -torchaudio>=2.0.0 -omegaconf>=2.3.0 - -# Utils -numpy>=1.24.0 -num2words -pymorphy3 +antlr4-python3-runtime==4.9.3 +certifi==2025.11.12 +cffi==2.0.0 +charset-normalizer==3.4.4 +DAWG2-Python==0.9.0 +docopt==0.6.2 +filelock==3.20.1 +fsspec==2025.12.0 +idna==3.11 +Jinja2==3.1.6 +MarkupSafe==3.0.3 +mpmath==1.3.0 +networkx==3.6.1 +num2words==0.5.14 +numpy==2.4.0 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.8.90 +omegaconf==2.3.0 +pvporcupine==4.0.1 +PyAudio==0.2.14 +pycparser==2.23 +pymorphy3==2.0.6 +pymorphy3-dicts-ru==2.4.417150.4580142 +python-dotenv==1.2.1 +PyYAML==6.0.3 +requests==2.32.5 +scipy==1.16.3 +setuptools==80.9.0 +sounddevice==0.5.3 +srt==3.5.3 +sympy==1.14.0 +torch==2.9.1 +torchaudio==2.9.1 +tqdm==4.67.1 +triton==3.5.1 +typing_extensions==4.15.0 +urllib3==2.6.2 +vosk==0.3.45 +websockets==15.0.1 diff --git a/tts.py b/tts.py index 8af11fb..7a9e97c 100644 --- a/tts.py +++ b/tts.py @@ -3,85 +3,156 @@ Text-to-Speech module using Silero TTS. Generates natural Russian speech with Xenia voice. Supports interruption via wake word detection using threading. """ + import torch import sounddevice as sd import numpy as np import threading import time +import warnings +import re from config import TTS_SPEAKER, TTS_SAMPLE_RATE +# Suppress Silero TTS warning about text length +warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols") + class TextToSpeech: """Text-to-Speech using Silero TTS with wake word interruption support.""" - + def __init__(self): self.model = None self.sample_rate = TTS_SAMPLE_RATE self.speaker = TTS_SPEAKER self._interrupted = False self._stop_flag = threading.Event() - + def initialize(self): """Initialize Silero TTS model.""" - print("📦 Загрузка модели Silero TTS...") - + print("📦 Загрузка модели Silero TTS v5...") + # Load Silero TTS model + device = torch.device('cpu') self.model, _ = torch.hub.load( - repo_or_dir='snakers4/silero-models', - model='silero_tts', - language='ru', - speaker='v4_ru' + repo_or_dir="snakers4/silero-models", + model="silero_tts", + language="ru", + speaker="v5_ru", ) - - print(f"✅ Модель TTS загружена (голос: {self.speaker})") - + self.model.to(device) + + print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})") + + def _split_text(self, text: str, max_length: int = 900) -> list[str]: + """Split text into chunks smaller than max_length.""" + if len(text) <= max_length: + return [text] + + chunks = [] + # Split by sentence endings, keeping the punctuation + # pattern matches [.!?] followed by optional newlines + parts = re.split(r"([.!?]+\s*)", text) + + current_chunk = "" + # Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...] + # We iterate through parts. If part is a delimiter (matches pattern), we append to previous text. + + for part in parts: + # If the part combined with current_chunk exceeds max_length, save current_chunk + if len(current_chunk) + len(part) > max_length: + if current_chunk: + chunks.append(current_chunk.strip()) + current_chunk = "" + + current_chunk += part + + # If even a single part is too big (very long sentence without punctuation), force split + while len(current_chunk) > max_length: + # Try to split by space + split_idx = current_chunk.rfind(" ", 0, max_length) + if split_idx == -1: + # No space found, hard cut + split_idx = max_length + + chunks.append(current_chunk[:split_idx].strip()) + current_chunk = current_chunk[split_idx:].lstrip() + + if current_chunk: + chunks.append(current_chunk.strip()) + + # Filter empty chunks + return [c for c in chunks if c] + def speak(self, text: str, check_interrupt=None) -> bool: """ Convert text to speech and play it. - + Args: text: Text to synthesize and speak check_interrupt: Optional callback function that returns True if playback should stop - + Returns: True if playback completed normally, False if interrupted """ if not text.strip(): return True - + if not self.model: self.initialize() - - print(f"🔊 Озвучивание: {text[:50]}...") - + + # Split text into manageable chunks + chunks = self._split_text(text) + total_chunks = len(chunks) + + if total_chunks > 1: + print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...") + else: + print(f"🔊 Озвучивание: {text[:50]}...") + self._interrupted = False self._stop_flag.clear() - - try: - # Generate audio - audio = self.model.apply_tts( - text=text, - speaker=self.speaker, - sample_rate=self.sample_rate - ) - - # Convert to numpy array - audio_np = audio.numpy() - - if check_interrupt: - # Play with interrupt checking in parallel thread - return self._play_with_interrupt(audio_np, check_interrupt) - else: - # Standard playback - sd.play(audio_np, self.sample_rate) - sd.wait() - print("✅ Воспроизведение завершено") - return True - - except Exception as e: - print(f"❌ Ошибка TTS: {e}") + + success = True + + for i, chunk in enumerate(chunks): + if self._interrupted: + break + + try: + # Generate audio for chunk + audio = self.model.apply_tts( + text=chunk, speaker=self.speaker, sample_rate=self.sample_rate + ) + + # Convert to numpy array + audio_np = audio.numpy() + + if check_interrupt: + # Play with interrupt checking in parallel thread + if not self._play_with_interrupt(audio_np, check_interrupt): + success = False + break + else: + # Standard playback + sd.play(audio_np, self.sample_rate) + sd.wait() + + except Exception as e: + print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}") + success = False + # Continue with next chunk? or break? + # Usually if one fails, we might want to try others, but for "too long" error + # splitting should solve it. If it fails for other reasons, maybe better to stop. + # But let's keep trying subsequent chunks in case it's a specific symbol issue. + + if success and not self._interrupted: + print("✅ Воспроизведение завершено") + return True + elif self._interrupted: return False - + else: + return False + def _check_interrupt_worker(self, check_interrupt): """ Worker thread that continuously checks for interrupt signal. @@ -95,47 +166,44 @@ class TextToSpeech: return except Exception: pass - + def _play_with_interrupt(self, audio_np: np.ndarray, check_interrupt) -> bool: """ Play audio with interrupt checking in parallel thread. - + Args: audio_np: Audio data as numpy array check_interrupt: Callback that returns True if should interrupt - + Returns: True if completed normally, False if interrupted """ # Start interrupt checker thread checker_thread = threading.Thread( - target=self._check_interrupt_worker, - args=(check_interrupt,), - daemon=True + target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True ) checker_thread.start() - + try: # Play audio (non-blocking start) sd.play(audio_np, self.sample_rate) - + # Wait for playback to finish or interrupt while sd.get_stream().active: if self._interrupted: break time.sleep(0.05) - + finally: # Signal checker thread to stop self._stop_flag.set() checker_thread.join(timeout=0.5) - + if self._interrupted: return False - - print("✅ Воспроизведение завершено") + return True - + @property def was_interrupted(self) -> bool: """Check if the last playback was interrupted.""" @@ -157,11 +225,11 @@ def get_tts() -> TextToSpeech: def speak(text: str, check_interrupt=None) -> bool: """ Synthesize and speak the given text. - + Args: text: Text to speak check_interrupt: Optional callback for interrupt checking - + Returns: True if completed normally, False if interrupted """