silero v5

This commit is contained in:
2026-01-07 17:31:22 +03:00
parent ebaed3fbbe
commit 7b79593cad
5 changed files with 183 additions and 87 deletions

3
ai.py
View File

@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
Твоя главная цель — помогать пользователю и поддерживать интересный диалог. Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
Отвечай кратко и по существу, на русском языке. Отвечай кратко и по существу, на русском языке.
Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом. Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.""" Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""
def ask_ai(messages_history: list) -> str: def ask_ai(messages_history: list) -> str:

View File

@@ -251,6 +251,9 @@ def clean_response(text: str) -> str:
# Remove HTML tags if any # Remove HTML tags if any
text = re.sub(r'<[^>]+>', '', text) text = re.sub(r'<[^>]+>', '', text)
# Remove informal slang greetings at the beginning of sentences/responses
text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
# Convert numbers to words (Russian) # Convert numbers to words (Russian)
text = numbers_to_words(text) text = numbers_to_words(text)

View File

@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
CHANNELS = 1 CHANNELS = 1
# TTS configuration # TTS configuration
TTS_SPEAKER = "xenia" # Available: aidar, baya, kseniya, xenia, eugene TTS_SPEAKER = "eugene" # Available: aidar, baya, kseniya, xenia, eugene
TTS_SAMPLE_RATE = 48000 TTS_SAMPLE_RATE = 48000

View File

@@ -1,28 +1,52 @@
# Smart Speaker Dependencies antlr4-python3-runtime==4.9.3
# Python 3.12.8 certifi==2025.11.12
cffi==2.0.0
# Wake word detection charset-normalizer==3.4.4
pvporcupine>=3.0.0 DAWG2-Python==0.9.0
docopt==0.6.2
# Speech-to-Text filelock==3.20.1
vosk>=0.3.45 fsspec==2025.12.0
idna==3.11
# Audio Jinja2==3.1.6
pyaudio>=0.2.14 MarkupSafe==3.0.3
sounddevice>=0.4.6 mpmath==1.3.0
networkx==3.6.1
# AI API num2words==0.5.14
requests>=2.31.0 numpy==2.4.0
nvidia-cublas-cu12==12.8.4.1
# Environment nvidia-cuda-cupti-cu12==12.8.90
python-dotenv>=1.0.0 nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
# TTS (Silero) nvidia-cudnn-cu12==9.10.2.21
torch>=2.0.0 nvidia-cufft-cu12==11.3.3.83
torchaudio>=2.0.0 nvidia-cufile-cu12==1.13.1.3
omegaconf>=2.3.0 nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
# Utils nvidia-cusparse-cu12==12.5.8.93
numpy>=1.24.0 nvidia-cusparselt-cu12==0.7.1
num2words nvidia-nccl-cu12==2.27.5
pymorphy3 nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
pvporcupine==4.0.1
PyAudio==0.2.14
pycparser==2.23
pymorphy3==2.0.6
pymorphy3-dicts-ru==2.4.417150.4580142
python-dotenv==1.2.1
PyYAML==6.0.3
requests==2.32.5
scipy==1.16.3
setuptools==80.9.0
sounddevice==0.5.3
srt==3.5.3
sympy==1.14.0
torch==2.9.1
torchaudio==2.9.1
tqdm==4.67.1
triton==3.5.1
typing_extensions==4.15.0
urllib3==2.6.2
vosk==0.3.45
websockets==15.0.1

130
tts.py
View File

@@ -3,13 +3,19 @@ Text-to-Speech module using Silero TTS.
Generates natural Russian speech with Xenia voice. Generates natural Russian speech with Xenia voice.
Supports interruption via wake word detection using threading. Supports interruption via wake word detection using threading.
""" """
import torch import torch
import sounddevice as sd import sounddevice as sd
import numpy as np import numpy as np
import threading import threading
import time import time
import warnings
import re
from config import TTS_SPEAKER, TTS_SAMPLE_RATE from config import TTS_SPEAKER, TTS_SAMPLE_RATE
# Suppress Silero TTS warning about text length
warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
class TextToSpeech: class TextToSpeech:
"""Text-to-Speech using Silero TTS with wake word interruption support.""" """Text-to-Speech using Silero TTS with wake word interruption support."""
@@ -23,17 +29,59 @@ class TextToSpeech:
def initialize(self): def initialize(self):
"""Initialize Silero TTS model.""" """Initialize Silero TTS model."""
print("📦 Загрузка модели Silero TTS...") print("📦 Загрузка модели Silero TTS v5...")
# Load Silero TTS model # Load Silero TTS model
device = torch.device('cpu')
self.model, _ = torch.hub.load( self.model, _ = torch.hub.load(
repo_or_dir='snakers4/silero-models', repo_or_dir="snakers4/silero-models",
model='silero_tts', model="silero_tts",
language='ru', language="ru",
speaker='v4_ru' speaker="v5_ru",
) )
self.model.to(device)
print(f"✅ Модель TTS загружена (голос: {self.speaker})") print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
def _split_text(self, text: str, max_length: int = 900) -> list[str]:
    """Split text into chunks of at most ``max_length`` characters.

    Text that already fits is returned unchanged as a single-element list.
    Longer text is cut on sentence boundaries (runs of ``.``, ``!`` or ``?``
    plus any whitespace that follows); the terminating punctuation always
    stays attached to its own sentence. A single sentence that is still too
    long is cut at the last space before the limit, or hard-cut at
    ``max_length`` when it contains no space.

    Args:
        text: The text to split.
        max_length: Maximum length of each returned chunk, in characters.

    Returns:
        The non-empty chunks, stripped of surrounding whitespace, in their
        original order.

    Raises:
        ValueError: If ``text`` has to be split and ``max_length`` is
            smaller than 1 (the split could never make progress).
    """
    if len(text) <= max_length:
        return [text]
    if max_length < 1:
        # Without this guard the forced-split loop below never terminates.
        raise ValueError("max_length must be a positive integer")

    # Each item is one sentence together with its terminating punctuation and
    # trailing whitespace; the second alternative picks up a final fragment
    # that has no terminator. Keeping the punctuation inside the item (rather
    # than as a separate part) guarantees a chunk boundary never falls between
    # a sentence and its ".", "!" or "?".
    sentences = re.findall(r"[^.!?]*[.!?]+\s*|[^.!?]+", text)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Trailing whitespace is stripped when a chunk is stored, so it is
        # not counted against the limit.
        if current_chunk and len(current_chunk) + len(sentence.rstrip()) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += sentence

        # A single sentence longer than the limit has to be force-split.
        while len(current_chunk.rstrip()) > max_length:
            # Prefer cutting at the last space that keeps the piece in limit.
            split_idx = current_chunk.rfind(" ", 0, max_length)
            if split_idx == -1:
                # No space found, hard cut.
                split_idx = max_length
            chunks.append(current_chunk[:split_idx].strip())
            current_chunk = current_chunk[split_idx:].lstrip()

    chunks.append(current_chunk.strip())

    # Forced splits and whitespace-only remainders can produce empty strings.
    return [chunk for chunk in chunks if chunk]
def speak(self, text: str, check_interrupt=None) -> bool: def speak(self, text: str, check_interrupt=None) -> bool:
""" """
@@ -52,34 +100,57 @@ class TextToSpeech:
if not self.model: if not self.model:
self.initialize() self.initialize()
print(f"🔊 Озвучивание: {text[:50]}...") # Split text into manageable chunks
chunks = self._split_text(text)
total_chunks = len(chunks)
if total_chunks > 1:
print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
else:
print(f"🔊 Озвучивание: {text[:50]}...")
self._interrupted = False self._interrupted = False
self._stop_flag.clear() self._stop_flag.clear()
try: success = True
# Generate audio
audio = self.model.apply_tts(
text=text,
speaker=self.speaker,
sample_rate=self.sample_rate
)
# Convert to numpy array for i, chunk in enumerate(chunks):
audio_np = audio.numpy() if self._interrupted:
break
if check_interrupt: try:
# Play with interrupt checking in parallel thread # Generate audio for chunk
return self._play_with_interrupt(audio_np, check_interrupt) audio = self.model.apply_tts(
else: text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
# Standard playback )
sd.play(audio_np, self.sample_rate)
sd.wait()
print("✅ Воспроизведение завершено")
return True
except Exception as e: # Convert to numpy array
print(f"❌ Ошибка TTS: {e}") audio_np = audio.numpy()
if check_interrupt:
# Play with interrupt checking in parallel thread
if not self._play_with_interrupt(audio_np, check_interrupt):
success = False
break
else:
# Standard playback
sd.play(audio_np, self.sample_rate)
sd.wait()
except Exception as e:
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
success = False
# Do not break here: keep trying the remaining chunks.
# Splitting should already prevent the "text too long" error, so a failure
# is more likely caused by a specific symbol in this chunk, and later chunks
# may still be synthesized. The utterance is still reported as failed.
if success and not self._interrupted:
print("✅ Воспроизведение завершено")
return True
elif self._interrupted:
return False
else:
return False return False
def _check_interrupt_worker(self, check_interrupt): def _check_interrupt_worker(self, check_interrupt):
@@ -109,9 +180,7 @@ class TextToSpeech:
""" """
# Start interrupt checker thread # Start interrupt checker thread
checker_thread = threading.Thread( checker_thread = threading.Thread(
target=self._check_interrupt_worker, target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
args=(check_interrupt,),
daemon=True
) )
checker_thread.start() checker_thread.start()
@@ -133,7 +202,6 @@ class TextToSpeech:
if self._interrupted: if self._interrupted:
return False return False
print("✅ Воспроизведение завершено")
return True return True
@property @property