first commit
This commit is contained in:
72
cleaner.py
Normal file
72
cleaner.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Response cleaner module.
|
||||
Removes markdown formatting and special characters from AI responses.
|
||||
"""
|
||||
import re
|
||||
|
||||
|
||||
def clean_response(text: str) -> str:
    """
    Clean AI response from markdown formatting and special characters.

    The substitutions are order-sensitive: constructs whose delimiters are a
    superset of another construct's delimiters (fenced code vs. inline code,
    images vs. links, list bullets / horizontal rules vs. ``*`` emphasis) are
    handled first so the narrower pattern cannot partially consume them.

    Args:
        text: Raw AI response with possible markdown

    Returns:
        Clean text suitable for TTS. An empty (or otherwise falsy) input
        yields an empty string.
    """
    if not text:
        return ""

    # Remove citation references like [1], [2], [citation needed], [source].
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[citation\s*needed\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[source\]', '', text, flags=re.IGNORECASE)

    # Drop fenced code blocks BEFORE unwrapping inline code; otherwise the
    # inline pattern eats one backtick of each fence and the block survives.
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Drop images BEFORE unwrapping links; otherwise ![alt](url) is treated
    # as a link and leaves "!alt" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Line-level markers are stripped before emphasis so a leading "* " bullet
    # or a "***" rule is not mistaken for italic delimiters. [ \t] is used
    # instead of \s so these patterns never swallow neighbouring newlines.
    text = re.sub(r'^[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^#{1,6}[ \t]*', '', text, flags=re.MULTILINE)
    # Blockquote markers (including nested "> >") go before list markers so
    # that "> - item" is fully unwrapped.
    text = re.sub(r'^(?:[ \t]*>)+[ \t]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Remove markdown bold **text** and __text__
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)

    # Remove markdown italic *text* and _text_. The asterisk form requires
    # non-space characters just inside the delimiters so that arithmetic such
    # as "2 * 3 * 4" is left intact.
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Remove markdown strikethrough ~~text~~
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Remove HTML comments and tags. A tag must start with a letter (after an
    # optional "/") so comparisons like "a < b and c > d" are not deleted.
    text = re.sub(r'<!--[\s\S]*?-->|</?[A-Za-z][^<>]*>', '', text)

    # Collapse excess blank lines and runs of spaces.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' +', ' ', text)

    return text.strip()
|
||||
Reference in New Issue
Block a user