silero v5
This commit is contained in:
3
ai.py
3
ai.py
@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
|
||||
Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
|
||||
Отвечай кратко и по существу, на русском языке.
|
||||
Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
|
||||
Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов."""
|
||||
Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
|
||||
ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""
|
||||
|
||||
|
||||
def ask_ai(messages_history: list) -> str:
|
||||
|
||||
@@ -251,6 +251,9 @@ def clean_response(text: str) -> str:
|
||||
# Remove HTML tags if any
|
||||
text = re.sub(r'<[^>]+>', '', text)
|
||||
|
||||
# Remove informal slang greetings at the beginning of sentences/responses
|
||||
# \b requires the filler to be a whole word, so ordinary words that merely start with one ("Нужно", "Также", "Эйфелева") are left intact.
text = re.sub(r'^(?:Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)\b[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
|
||||
|
||||
# Convert numbers to words (Russian)
|
||||
text = numbers_to_words(text)
|
||||
|
||||
|
||||
@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
|
||||
CHANNELS = 1
|
||||
|
||||
# TTS configuration
|
||||
TTS_SPEAKER = "xenia" # Available: aidar, baya, kseniya, xenia, eugene
|
||||
TTS_SPEAKER = "eugene" # Available: aidar, baya, kseniya, xenia, eugene
|
||||
TTS_SAMPLE_RATE = 48000
|
||||
|
||||
@@ -1,28 +1,52 @@
|
||||
# Smart Speaker Dependencies
|
||||
# Python 3.12.8
|
||||
|
||||
# Wake word detection
|
||||
pvporcupine>=3.0.0
|
||||
|
||||
# Speech-to-Text
|
||||
vosk>=0.3.45
|
||||
|
||||
# Audio
|
||||
pyaudio>=0.2.14
|
||||
sounddevice>=0.4.6
|
||||
|
||||
# AI API
|
||||
requests>=2.31.0
|
||||
|
||||
# Environment
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# TTS (Silero)
|
||||
torch>=2.0.0
|
||||
torchaudio>=2.0.0
|
||||
omegaconf>=2.3.0
|
||||
|
||||
# Utils
|
||||
numpy>=1.24.0
|
||||
num2words
|
||||
pymorphy3
|
||||
antlr4-python3-runtime==4.9.3
|
||||
certifi==2025.11.12
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.4
|
||||
DAWG2-Python==0.9.0
|
||||
docopt==0.6.2
|
||||
filelock==3.20.1
|
||||
fsspec==2025.12.0
|
||||
idna==3.11
|
||||
Jinja2==3.1.6
|
||||
MarkupSafe==3.0.3
|
||||
mpmath==1.3.0
|
||||
networkx==3.6.1
|
||||
num2words==0.5.14
|
||||
numpy==2.4.0
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
nvidia-nccl-cu12==2.27.5
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
nvidia-nvshmem-cu12==3.3.20
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
omegaconf==2.3.0
|
||||
pvporcupine==4.0.1
|
||||
PyAudio==0.2.14
|
||||
pycparser==2.23
|
||||
pymorphy3==2.0.6
|
||||
pymorphy3-dicts-ru==2.4.417150.4580142
|
||||
python-dotenv==1.2.1
|
||||
PyYAML==6.0.3
|
||||
requests==2.32.5
|
||||
scipy==1.16.3
|
||||
setuptools==80.9.0
|
||||
sounddevice==0.5.3
|
||||
srt==3.5.3
|
||||
sympy==1.14.0
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
tqdm==4.67.1
|
||||
triton==3.5.1
|
||||
typing_extensions==4.15.0
|
||||
urllib3==2.6.2
|
||||
vosk==0.3.45
|
||||
websockets==15.0.1
|
||||
|
||||
104
tts.py
104
tts.py
@@ -3,13 +3,19 @@ Text-to-Speech module using Silero TTS.
|
||||
Generates natural Russian speech with the Silero voice selected by TTS_SPEAKER.
|
||||
Supports interruption via wake word detection using threading.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import threading
|
||||
import time
|
||||
import warnings
|
||||
import re
|
||||
from config import TTS_SPEAKER, TTS_SAMPLE_RATE
|
||||
|
||||
# Suppress Silero TTS warning about text length
|
||||
warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
|
||||
|
||||
|
||||
class TextToSpeech:
|
||||
"""Text-to-Speech using Silero TTS with wake word interruption support."""
|
||||
@@ -23,17 +29,59 @@ class TextToSpeech:
|
||||
|
||||
def initialize(self):
|
||||
"""Initialize Silero TTS model."""
|
||||
print("📦 Загрузка модели Silero TTS...")
|
||||
print("📦 Загрузка модели Silero TTS v5...")
|
||||
|
||||
# Load Silero TTS model
|
||||
device = torch.device('cpu')
|
||||
self.model, _ = torch.hub.load(
|
||||
repo_or_dir='snakers4/silero-models',
|
||||
model='silero_tts',
|
||||
language='ru',
|
||||
speaker='v4_ru'
|
||||
repo_or_dir="snakers4/silero-models",
|
||||
model="silero_tts",
|
||||
language="ru",
|
||||
speaker="v5_ru",
|
||||
)
|
||||
self.model.to(device)
|
||||
|
||||
print(f"✅ Модель TTS загружена (голос: {self.speaker})")
|
||||
print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
|
||||
|
||||
def _split_text(self, text: str, max_length: int = 900) -> list[str]:
|
||||
"""Split text into chunks smaller than max_length."""
|
||||
if len(text) <= max_length:
|
||||
return [text]
|
||||
|
||||
chunks = []
|
||||
# Split by sentence endings, keeping the punctuation
|
||||
# pattern matches [.!?] followed by optional newlines
|
||||
parts = re.split(r"([.!?]+\s*)", text)
|
||||
|
||||
current_chunk = ""
|
||||
# Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...]
|
||||
# We iterate through parts. If part is a delimiter (matches pattern), we append to previous text.
|
||||
|
||||
for part in parts:
|
||||
# If the part combined with current_chunk exceeds max_length, save current_chunk
|
||||
if len(current_chunk) + len(part) > max_length:
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
current_chunk = ""
|
||||
|
||||
current_chunk += part
|
||||
|
||||
# If even a single part is too big (very long sentence without punctuation), force split
|
||||
while len(current_chunk) > max_length:
|
||||
# Try to split by space
|
||||
split_idx = current_chunk.rfind(" ", 0, max_length)
|
||||
if split_idx == -1:
|
||||
# No space found, hard cut
|
||||
split_idx = max_length
|
||||
|
||||
chunks.append(current_chunk[:split_idx].strip())
|
||||
current_chunk = current_chunk[split_idx:].lstrip()
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
|
||||
# Filter empty chunks
|
||||
return [c for c in chunks if c]
|
||||
|
||||
def speak(self, text: str, check_interrupt=None) -> bool:
|
||||
"""
|
||||
@@ -52,17 +100,28 @@ class TextToSpeech:
|
||||
if not self.model:
|
||||
self.initialize()
|
||||
|
||||
# Split text into manageable chunks
|
||||
chunks = self._split_text(text)
|
||||
total_chunks = len(chunks)
|
||||
|
||||
if total_chunks > 1:
|
||||
print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
|
||||
else:
|
||||
print(f"🔊 Озвучивание: {text[:50]}...")
|
||||
|
||||
self._interrupted = False
|
||||
self._stop_flag.clear()
|
||||
|
||||
success = True
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
if self._interrupted:
|
||||
break
|
||||
|
||||
try:
|
||||
# Generate audio
|
||||
# Generate audio for chunk
|
||||
audio = self.model.apply_tts(
|
||||
text=text,
|
||||
speaker=self.speaker,
|
||||
sample_rate=self.sample_rate
|
||||
text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
|
||||
)
|
||||
|
||||
# Convert to numpy array
|
||||
@@ -70,16 +129,28 @@ class TextToSpeech:
|
||||
|
||||
if check_interrupt:
|
||||
# Play with interrupt checking in parallel thread
|
||||
return self._play_with_interrupt(audio_np, check_interrupt)
|
||||
if not self._play_with_interrupt(audio_np, check_interrupt):
|
||||
success = False
|
||||
break
|
||||
else:
|
||||
# Standard playback
|
||||
sd.play(audio_np, self.sample_rate)
|
||||
sd.wait()
|
||||
print("✅ Воспроизведение завершено")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Ошибка TTS: {e}")
|
||||
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
|
||||
success = False
|
||||
# A failed chunk marks the whole utterance as failed but does not stop playback:
|
||||
# splitting should already prevent "text too long" errors, so a failure here is
|
||||
# most likely caused by a specific symbol in this chunk, and the remaining chunks
|
||||
# are still worth speaking.
|
||||
|
||||
if success and not self._interrupted:
|
||||
print("✅ Воспроизведение завершено")
|
||||
return True
|
||||
elif self._interrupted:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
def _check_interrupt_worker(self, check_interrupt):
|
||||
@@ -109,9 +180,7 @@ class TextToSpeech:
|
||||
"""
|
||||
# Start interrupt checker thread
|
||||
checker_thread = threading.Thread(
|
||||
target=self._check_interrupt_worker,
|
||||
args=(check_interrupt,),
|
||||
daemon=True
|
||||
target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
|
||||
)
|
||||
checker_thread.start()
|
||||
|
||||
@@ -133,7 +202,6 @@ class TextToSpeech:
|
||||
if self._interrupted:
|
||||
return False
|
||||
|
||||
print("✅ Воспроизведение завершено")
|
||||
return True
|
||||
|
||||
@property
|
||||
|
||||
Reference in New Issue
Block a user