silero v5
This commit is contained in:
182
tts.py
182
tts.py
@@ -3,85 +3,156 @@ Text-to-Speech module using Silero TTS.
|
||||
Generates natural Russian speech with Xenia voice.
|
||||
Supports interruption via wake word detection using threading.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import threading
|
||||
import time
|
||||
import warnings
|
||||
import re
|
||||
from config import TTS_SPEAKER, TTS_SAMPLE_RATE
|
||||
|
||||
# Suppress Silero TTS warning about text length
|
||||
warnings.filterwarnings("ignore", message="Text string is longer than 1000 symbols")
|
||||
|
||||
|
||||
class TextToSpeech:
|
||||
"""Text-to-Speech using Silero TTS with wake word interruption support."""
|
||||
|
||||
|
||||
def __init__(self):
    """Create an engine with no model loaded; initialize() sets self.model."""
    # Voice name and output sample rate are taken from the project config.
    self.speaker = TTS_SPEAKER
    self.sample_rate = TTS_SAMPLE_RATE

    # Playback coordination: the event is set to tell the interrupt checker
    # thread to stop; the boolean records whether playback was interrupted.
    self._stop_flag = threading.Event()
    self._interrupted = False

    # Silero model handle; stays None until the model is loaded.
    self.model = None
|
||||
|
||||
|
||||
def initialize(self):
    """Load the Silero TTS v5 Russian model and store it in ``self.model``.

    The model is obtained with ``torch.hub.load`` from the
    ``snakers4/silero-models`` hub repository (entry point ``silero_tts``,
    language ``ru``, model package ``v5_ru``) and moved to the CPU device.
    The second value returned by the hub entry point is not used.
    """
    print("📦 Загрузка модели Silero TTS v5...")

    # Load Silero TTS model
    device = torch.device("cpu")
    self.model, _ = torch.hub.load(
        repo_or_dir="snakers4/silero-models",
        model="silero_tts",
        language="ru",
        speaker="v5_ru",
    )

    self.model.to(device)

    print(f"✅ Модель TTS v5 загружена (голос: {self.speaker})")
|
||||
|
||||
def _split_text(self, text: str, max_length: int = 900) -> list[str]:
|
||||
"""Split text into chunks smaller than max_length."""
|
||||
if len(text) <= max_length:
|
||||
return [text]
|
||||
|
||||
chunks = []
|
||||
# Split by sentence endings, keeping the punctuation
|
||||
# pattern matches [.!?] followed by optional newlines
|
||||
parts = re.split(r"([.!?]+\s*)", text)
|
||||
|
||||
current_chunk = ""
|
||||
# Reconstruct sentences. re.split with groups returns [text, delimiter, text, delimiter...]
|
||||
# We iterate through parts. If part is a delimiter (matches pattern), we append to previous text.
|
||||
|
||||
for part in parts:
|
||||
# If the part combined with current_chunk exceeds max_length, save current_chunk
|
||||
if len(current_chunk) + len(part) > max_length:
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
current_chunk = ""
|
||||
|
||||
current_chunk += part
|
||||
|
||||
# If even a single part is too big (very long sentence without punctuation), force split
|
||||
while len(current_chunk) > max_length:
|
||||
# Try to split by space
|
||||
split_idx = current_chunk.rfind(" ", 0, max_length)
|
||||
if split_idx == -1:
|
||||
# No space found, hard cut
|
||||
split_idx = max_length
|
||||
|
||||
chunks.append(current_chunk[:split_idx].strip())
|
||||
current_chunk = current_chunk[split_idx:].lstrip()
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
|
||||
# Filter empty chunks
|
||||
return [c for c in chunks if c]
|
||||
|
||||
def speak(self, text: str, check_interrupt=None) -> bool:
    """
    Convert text to speech and play it, one chunk at a time.

    The text is split with ``_split_text`` and every chunk is synthesized
    and played in order. The model is loaded on first use.

    Args:
        text: Text to synthesize and speak
        check_interrupt: Optional callback function that returns True if playback should stop

    Returns:
        True if every chunk was played to the end (or the text was blank),
        False if playback was interrupted or any chunk failed
    """
    if not text.strip():
        return True

    if self.model is None:
        self.initialize()

    # Split text into manageable chunks
    chunks = self._split_text(text)
    total_chunks = len(chunks)

    if total_chunks > 1:
        print(f"🔊 Озвучивание (частей: {total_chunks}): {text[:50]}...")
    else:
        print(f"🔊 Озвучивание: {text[:50]}...")

    self._interrupted = False
    self._stop_flag.clear()

    success = True

    for i, chunk in enumerate(chunks):
        if self._interrupted:
            break

        try:
            # Generate audio for chunk
            audio = self.model.apply_tts(
                text=chunk, speaker=self.speaker, sample_rate=self.sample_rate
            )

            # Convert to numpy array
            audio_np = audio.numpy()

            if check_interrupt:
                # Play with interrupt checking in parallel thread
                if not self._play_with_interrupt(audio_np, check_interrupt):
                    success = False
                    break
            else:
                # Standard playback
                sd.play(audio_np, self.sample_rate)
                sd.wait()

        except Exception as e:
            print(f"❌ Ошибка TTS (часть {i + 1}/{total_chunks}): {e}")
            success = False
            # Deliberately keep going: the failure may be specific to this
            # chunk (e.g. an unsupported symbol), so later chunks can still
            # be spoken. The overall result is still reported as False.

    if success and not self._interrupted:
        print("✅ Воспроизведение завершено")
        return True

    # Interrupted, or at least one chunk failed.
    return False
|
||||
|
||||
def _check_interrupt_worker(self, check_interrupt):
|
||||
"""
|
||||
Worker thread that continuously checks for interrupt signal.
|
||||
@@ -95,47 +166,44 @@ class TextToSpeech:
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _play_with_interrupt(self, audio_np: np.ndarray, check_interrupt) -> bool:
    """
    Play audio with interrupt checking in parallel thread.

    Args:
        audio_np: Audio data as numpy array
        check_interrupt: Callback that returns True if should interrupt

    Returns:
        True if completed normally, False if interrupted
    """
    # Re-arm the stop signal. The finally block below leaves it set, and the
    # caller clears it only once per speak() call, so without this the checker
    # for every chunk after the first would start with the flag already set.
    # NOTE(review): assumes _check_interrupt_worker exits when _stop_flag is
    # set (its body is outside this view) - confirm.
    self._stop_flag.clear()

    # Start interrupt checker thread
    checker_thread = threading.Thread(
        target=self._check_interrupt_worker, args=(check_interrupt,), daemon=True
    )
    checker_thread.start()

    try:
        # Play audio (non-blocking start)
        sd.play(audio_np, self.sample_rate)

        # Wait for playback to finish or interrupt
        while sd.get_stream().active:
            if self._interrupted:
                break
            time.sleep(0.05)

    finally:
        # Signal checker thread to stop; bounded join so a slow callback
        # cannot stall the caller for more than half a second.
        self._stop_flag.set()
        checker_thread.join(timeout=0.5)

    if self._interrupted:
        return False

    # Completion is reported once by the caller, not per played buffer.
    return True
|
||||
|
||||
|
||||
@property
|
||||
def was_interrupted(self) -> bool:
|
||||
"""Check if the last playback was interrupted."""
|
||||
@@ -157,11 +225,11 @@ def get_tts() -> TextToSpeech:
|
||||
def speak(text: str, check_interrupt=None) -> bool:
|
||||
"""
|
||||
Synthesize and speak the given text.
|
||||
|
||||
|
||||
Args:
|
||||
text: Text to speak
|
||||
check_interrupt: Optional callback for interrupt checking
|
||||
|
||||
|
||||
Returns:
|
||||
True if completed normally, False if interrupted
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user