"""
Response cleaner module.

Removes markdown formatting and special characters from AI responses.
"""
import re
|
|
|
|
|
|
def clean_response(text: str) -> str:
    """
    Clean AI response from markdown formatting and special characters.

    The passes run in a deliberate order so that one pass cannot destroy
    the syntax a later pass needs to recognise:

    1. fenced code blocks, then inline code;
    2. images, then links, then bracketed citation markers;
    3. line-level markers (horizontal rules, headers, blockquotes, lists);
    4. inline emphasis (bold, italic, strikethrough);
    5. HTML tags and whitespace normalisation.

    Args:
        text: Raw AI response with possible markdown.

    Returns:
        Clean text suitable for TTS. An empty string is returned when
        ``text`` is empty or otherwise falsy.
    """
    if not text:
        return ""

    # Fenced code blocks must go before inline code: otherwise the inline
    # pattern consumes one backtick pair of the fence and the block (with
    # its code) survives into the spoken text.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Images must go before links: an image is a link preceded by "!", so
    # unwrapping links first would leave "!alt" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Citation references like [1], [citation needed], [source]. Done after
    # links so "[1](url)" does not degrade into a dangling "(url)".
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[citation\s*needed\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[source\]', '', text, flags=re.IGNORECASE)

    # Line-level markers are handled before inline emphasis because "*" and
    # "_" double as rule/list characters. The patterns use [ \t] rather than
    # \s so they never swallow the blank line separating paragraphs.

    # Horizontal rules (---, ***, ___)
    text = re.sub(r'^[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)

    # Headers (# through ######)
    text = re.sub(r'^#{1,6}[ \t]*', '', text, flags=re.MULTILINE)

    # Blockquotes, before list markers so "> - item" is fully unwrapped
    text = re.sub(r'^[ \t]*>[ \t]*', '', text, flags=re.MULTILINE)

    # List markers (-, *, +, numbered)
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Bold **text** and __text__ (before italic, which shares delimiters)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)

    # Italic *text* and _text_; the lookarounds keep snake_case words intact
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # HTML tags, if any
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse runs of blank lines and runs of spaces left by the removals
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' +', ' ', text)

    return text.strip()