Files
smart-speaker/cleaner.py
2026-01-02 20:26:44 +03:00

73 lines
2.1 KiB
Python

"""
Response cleaner module.
Removes markdown formatting and special characters from AI responses.
"""
import re
def clean_response(text: str) -> str:
    """
    Clean AI response from markdown formatting and special characters.

    The substitutions are order-sensitive. Constructs whose syntax contains
    another construct's syntax are handled first (fenced code before inline
    code, images before links, links before bare citations), and line-level
    markers (rules, bullets) are removed before emphasis so that their
    ``*`` / ``_`` characters are not mistaken for italic delimiters.

    Args:
        text: Raw AI response with possible markdown

    Returns:
        Clean text suitable for TTS
    """
    if not text:
        return ""

    # --- Code -------------------------------------------------------------
    # Fenced blocks must go before inline code: otherwise the inline pattern
    # consumes one backtick of each fence and the block is never removed.
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Inline code `code` -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # --- Images, links, citations ------------------------------------------
    # Images must go before links: "![alt](url)" contains "[alt](url)", so
    # the link pattern would otherwise leave "!alt" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
    # Links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Citation references like [1], [2], [citation needed], [source].
    # Run after links so a link whose text is numeric keeps its text.
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[citation\s*needed\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[source\]', '', text, flags=re.IGNORECASE)

    # --- Line-level markers (before emphasis) ------------------------------
    # Blockquotes
    text = re.sub(r'^\s*>\s*', '', text, flags=re.MULTILINE)
    # Horizontal rules; done before emphasis, which would otherwise shrink
    # "***" / "___" to a single stray character.
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    # Headers # ## ### etc.
    text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)
    # List markers (-, *, +, numbered); done before emphasis so a leading
    # "* " bullet is not taken as the opening delimiter of an italic span.
    text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # --- Inline emphasis ---------------------------------------------------
    # Bold **text** and __text__ (before italic, which uses single markers)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)
    # Italic *text* and _text_ (underscores inside words are left alone)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    # Strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # --- HTML --------------------------------------------------------------
    text = re.sub(r'<[^>]+>', '', text)

    # --- Whitespace --------------------------------------------------------
    # Collapse runs of blank lines and repeated spaces left by the removals.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' +', ' ', text)

    return text.strip()