silero v5
This commit is contained in:
3
ai.py
3
ai.py
@@ -13,7 +13,8 @@ SYSTEM_PROMPT = """Ты — Александр, умный голосовой а
|
|||||||
Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
|
Твоя главная цель — помогать пользователю и поддерживать интересный диалог.
|
||||||
Отвечай кратко и по существу, на русском языке.
|
Отвечай кратко и по существу, на русском языке.
|
||||||
Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
|
Избегай длинных списков, сложного форматирования и спецсимволов, так как твои ответы озвучиваются голосом.
|
||||||
Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов."""
|
Пиши в разговорном стиле, как при живом общении, но не забывай о вежливости и правильности твоих ответов.
|
||||||
|
ВАЖНО: Не используй в ответах панибратские или сленговые приветствия и обращения, такие как "Эй", "Хэй", "Слушай" в начале фразы и подобные."""
|
||||||
|
|
||||||
|
|
||||||
def ask_ai(messages_history: list) -> str:
|
def ask_ai(messages_history: list) -> str:
|
||||||
|
|||||||
@@ -251,6 +251,9 @@ def clean_response(text: str) -> str:
|
|||||||
# Remove HTML tags if any
|
# Remove HTML tags if any
|
||||||
text = re.sub(r'<[^>]+>', '', text)
|
text = re.sub(r'<[^>]+>', '', text)
|
||||||
|
|
||||||
|
# Remove informal slang greetings at the beginning of sentences/responses
|
||||||
|
text = re.sub(r'^(Эй|Хэй|Слушай|Так|Ну|Короче|В\s+общем)[,!?:]?\s*', '', text, flags=re.IGNORECASE | re.MULTILINE)
|
||||||
|
|
||||||
# Convert numbers to words (Russian)
|
# Convert numbers to words (Russian)
|
||||||
text = numbers_to_words(text)
|
text = numbers_to_words(text)
|
||||||
|
|
||||||
|
|||||||
@@ -29,5 +29,5 @@ SAMPLE_RATE = 16000
|
|||||||
CHANNELS = 1
|
CHANNELS = 1
|
||||||
|
|
||||||
# TTS configuration
|
# TTS configuration
|
||||||
TTS_SPEAKER = "xenia" # Available: aidar, baya, kseniya, xenia, eugene
|
TTS_SPEAKER = "eugene" # Available: aidar, baya, kseniya, xenia, eugene
|
||||||
TTS_SAMPLE_RATE = 48000
|
TTS_SAMPLE_RATE = 48000
|
||||||
|
|||||||
@@ -1,28 +1,52 @@
|
|||||||
# Smart Speaker Dependencies
|
antlr4-python3-runtime==4.9.3
|
||||||
# Python 3.12.8
|
certifi==2025.11.12
|
||||||
|
cffi==2.0.0
|
||||||
# Wake word detection
|
charset-normalizer==3.4.4
|
||||||
pvporcupine>=3.0.0
|
DAWG2-Python==0.9.0
|
||||||
|
docopt==0.6.2
|
||||||
# Speech-to-Text
|
filelock==3.20.1
|
||||||
vosk>=0.3.45
|
fsspec==2025.12.0
|
||||||
|
idna==3.11
|
||||||
# Audio
|
Jinja2==3.1.6
|
||||||
pyaudio>=0.2.14
|
MarkupSafe==3.0.3
|
||||||
sounddevice>=0.4.6
|
mpmath==1.3.0
|
||||||
|
networkx==3.6.1
|
||||||
# AI API
|
num2words==0.5.14
|
||||||
requests>=2.31.0
|
numpy==2.4.0
|
||||||
|
nvidia-cublas-cu12==12.8.4.1
|
||||||
# Environment
|
nvidia-cuda-cupti-cu12==12.8.90
|
||||||
python-dotenv>=1.0.0
|
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||||
|
nvidia-cuda-runtime-cu12==12.8.90
|
||||||
# TTS (Silero)
|
nvidia-cudnn-cu12==9.10.2.21
|
||||||
torch>=2.0.0
|
nvidia-cufft-cu12==11.3.3.83
|
||||||
torchaudio>=2.0.0
|
nvidia-cufile-cu12==1.13.1.3
|
||||||
omegaconf>=2.3.0
|
nvidia-curand-cu12==10.3.9.90
|
||||||
|
nvidia-cusolver-cu12==11.7.3.90
|
||||||
# Utils
|
nvidia-cusparse-cu12==12.5.8.93
|
||||||
numpy>=1.24.0
|
nvidia-cusparselt-cu12==0.7.1
|
||||||
num2words
|
nvidia-nccl-cu12==2.27.5
|
||||||
pymorphy3
|
nvidia-nvjitlink-cu12==12.8.93
|
||||||
|
nvidia-nvshmem-cu12==3.3.20
|
||||||
|
nvidia-nvtx-cu12==12.8.90
|
||||||
|
omegaconf==2.3.0
|
||||||
|
pvporcupine==4.0.1
|
||||||
|
PyAudio==0.2.14
|
||||||
|
pycparser==2.23
|
||||||
|
pymorphy3==2.0.6
|
||||||
|
pymorphy3-dicts-ru==2.4.417150.4580142
|
||||||
|
python-dotenv==1.2.1
|
||||||
|
PyYAML==6.0.3
|
||||||
|
requests==2.32.5
|
||||||
|
scipy==1.16.3
|
||||||
|
setuptools==80.9.0
|
||||||
|
sounddevice==0.5.3
|
||||||
|
srt==3.5.3
|
||||||
|
sympy==1.14.0
|
||||||
|
torch==2.9.1
|
||||||
|
torchaudio==2.9.1
|
||||||
|
tqdm==4.67.1
|
||||||
|
triton==3.5.1
|
||||||
|
typing_extensions==4.15.0
|
||||||
|
urllib3==2.6.2
|
||||||
|
vosk==0.3.45
|
||||||
|
websockets==15.0.1
|
||||||
|
|||||||
130
tts.py
130
tts.py
@@ -3,13 +3,19 @@ Text-to-Speech module using Silero TTS.
|
|||||||
Generates natural Russian speech with Xenia voice.
|
Generates natural Russian speech with Xenia voice.
|
||||||
Supports interruption via wake word detection using threading.
|
Supports interruption via wake word detection using threading.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import warnings
|
||||||
|
import re
|
||||||
from config import TTS_SPEAKER, TTS_SAMPLE_RATE
|
from config import TTS_SPEAKER, TTS_SAMPLE_RATE
|
||||||
|
|
||||||
|
# Suppress Silero TTS warning about text length
|
||||||
|
warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
|
||||||
|
|
||||||
|
|
||||||
class TextToSpeech:
|
class TextToSpeech:
|
||||||
"""Text-to-Speech using Silero TTS with wake word interruption support."""
|
"""Text-to-Speech using Silero TTS with wake word interruption support."""
|
||||||
@@ -23,17 +29,59 @@ class TextToSpeech:
|
|||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
"""Initialize Silero TTS model."""
|
"""Initialize Silero TTS model."""
|
||||||
print("📦 Загрузка модели Silero TTS...")
|
print("📦 Загрузка модели Silero TTS v5...")
|
||||||
|
|
||||||
# Load Silero TTS model
|
# Load Silero TTS model
|
||||||
|
device = torch.device('cpu')
|
||||||
self.model, _ = torch.hub.load(
|
self.model, _ = torch.hub.load(
|
||||||
repo_or_dir='snakers4/silero-models',
|
repo_or_dir="snakers4/silero-models",
|
||||||
model='silero_tts',
|
model="silero_tts",
|
||||||
language='ru',
|
language="ru",
|
||||||
speaker='v4_ru'
|
speaker="v5_ru",
|
||||||
)
|
)
|
||||||
|
self.model.to(device)
|
||||||
|
|
||||||
print(f"✅ Модель TTS загружена (голос: {self.speaker})")
|
print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
|
||||||
|
|
||||||
|
def _split_text(self, text: str, max_length: int = 900) -> list[str]:
|
||||||
|
"""Split text into chunks smaller than max_length."""
|
||||||
|
if len(text) <= max_length:
|
||||||
|
return [text]
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
# Split by sentence endings, keeping the punctuation
|
||||||
|
# pattern matches [.!?] followed by optional newlines
|
||||||
|
parts = re.split(r"([.!?]+\s*)", text)
|
||||||
|
|
||||||
|
current_chunk = ""
|
||||||
|
# Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...]
|
||||||
|
# We iterate through parts. If part is a delimiter (matches pattern), we append to previous text.
|
||||||
|
|
||||||
|
for part in parts:
|
||||||
|
# If the part combined with current_chunk exceeds max_length, save current_chunk
|
||||||
|
if len(current_chunk) + len(part) > max_length:
|
||||||
|
if current_chunk:
|
||||||
|
chunks.append(current_chunk.strip())
|
||||||
|
current_chunk = ""
|
||||||
|
|
||||||
|
current_chunk += part
|
||||||
|
|
||||||
|
# If even a single part is too big (very long sentence without punctuation), force split
|
||||||
|
while len(current_chunk) > max_length:
|
||||||
|
# Try to split by space
|
||||||
|
split_idx = current_chunk.rfind(" ", 0, max_length)
|
||||||
|
if split_idx == -1:
|
||||||
|
# No space found, hard cut
|
||||||
|
split_idx = max_length
|
||||||
|
|
||||||
|
chunks.append(current_chunk[:split_idx].strip())
|
||||||
|
current_chunk = current_chunk[split_idx:].lstrip()
|
||||||
|
|
||||||
|
if current_chunk:
|
||||||
|
chunks.append(current_chunk.strip())
|
||||||
|
|
||||||
|
# Filter empty chunks
|
||||||
|
return [c for c in chunks if c]
|
||||||
|
|
||||||
def speak(self, text: str, check_interrupt=None) -> bool:
|
def speak(self, text: str, check_interrupt=None) -> bool:
|
||||||
"""
|
"""
|
||||||
@@ -52,34 +100,57 @@ class TextToSpeech:
|
|||||||
if not self.model:
|
if not self.model:
|
||||||
self.initialize()
|
self.initialize()
|
||||||
|
|
||||||
print(f"🔊 Озвучивание: {text[:50]}...")
|
# Split text into manageable chunks
|
||||||
|
chunks = self._split_text(text)
|
||||||
|
total_chunks = len(chunks)
|
||||||
|
|
||||||
|
if total_chunks > 1:
|
||||||
|
print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
|
||||||
|
else:
|
||||||
|
print(f"🔊 Озвучивание: {text[:50]}...")
|
||||||
|
|
||||||
self._interrupted = False
|
self._interrupted = False
|
||||||
self._stop_flag.clear()
|
self._stop_flag.clear()
|
||||||
|
|
||||||
try:
|
success = True
|
||||||
# Generate audio
|
|
||||||
audio = self.model.apply_tts(
|
|
||||||
text=text,
|
|
||||||
speaker=self.speaker,
|
|
||||||
sample_rate=self.sample_rate
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert to numpy array
|
for i, chunk in enumerate(chunks):
|
||||||
audio_np = audio.numpy()
|
if self._interrupted:
|
||||||
|
break
|
||||||
|
|
||||||
if check_interrupt:
|
try:
|
||||||
# Play with interrupt checking in parallel thread
|
# Generate audio for chunk
|
||||||
return self._play_with_interrupt(audio_np, check_interrupt)
|
audio = self.model.apply_tts(
|
||||||
else:
|
text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
|
||||||
# Standard playback
|
)
|
||||||
sd.play(audio_np, self.sample_rate)
|
|
||||||
sd.wait()
|
|
||||||
print("✅ Воспроизведение завершено")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
# Convert to numpy array
|
||||||
print(f"❌ Ошибка TTS: {e}")
|
audio_np = audio.numpy()
|
||||||
|
|
||||||
|
if check_interrupt:
|
||||||
|
# Play with interrupt checking in parallel thread
|
||||||
|
if not self._play_with_interrupt(audio_np, check_interrupt):
|
||||||
|
success = False
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# Standard playback
|
||||||
|
sd.play(audio_np, self.sample_rate)
|
||||||
|
sd.wait()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
|
||||||
|
success = False
|
||||||
|
# Continue with next chunk? or break?
|
||||||
|
# Usually if one fails, we might want to try others, but for "too long" error
|
||||||
|
# splitting should solve it. If it fails for other reasons, maybe better to stop.
|
||||||
|
# But let's keep trying subsequent chunks in case it's a specific symbol issue.
|
||||||
|
|
||||||
|
if success and not self._interrupted:
|
||||||
|
print("✅ Воспроизведение завершено")
|
||||||
|
return True
|
||||||
|
elif self._interrupted:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _check_interrupt_worker(self, check_interrupt):
|
def _check_interrupt_worker(self, check_interrupt):
|
||||||
@@ -109,9 +180,7 @@ class TextToSpeech:
|
|||||||
"""
|
"""
|
||||||
# Start interrupt checker thread
|
# Start interrupt checker thread
|
||||||
checker_thread = threading.Thread(
|
checker_thread = threading.Thread(
|
||||||
target=self._check_interrupt_worker,
|
target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
|
||||||
args=(check_interrupt,),
|
|
||||||
daemon=True
|
|
||||||
)
|
)
|
||||||
checker_thread.start()
|
checker_thread.start()
|
||||||
|
|
||||||
@@ -133,7 +202,6 @@ class TextToSpeech:
|
|||||||
if self._interrupted:
|
if self._interrupted:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
print("✅ Воспроизведение завершено")
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
Reference in New Issue
Block a user