idb_channel_bot/_textutils.py
2025-05-15 18:39:04 +03:00

63 lines
1.9 KiB
Python

import re
import xml.etree.ElementTree
from markdown import Markdown
from markdown.extensions import Extension
from markdown.inlinepatterns import InlineProcessor
from markdown.preprocessors import Preprocessor
from bs4 import BeautifulSoup
from html import unescape
class UnderlineInlineProcessor(InlineProcessor):
    """Inline processor that turns each pattern match into a ``<u>`` element."""

    def handleMatch(self, m, data):
        """Build the ``<u>`` element for a single regex match.

        Args:
            m: Match object; capture group 1 becomes the element's text.
            data: Not used by this implementation.

        Returns:
            A ``(element, start, end)`` tuple, where ``start`` and ``end``
            are the offsets of the whole match (group 0).
        """
        underline = xml.etree.ElementTree.Element('u')
        underline.text = m.group(1)
        span_start, span_end = m.span(0)
        return underline, span_start, span_end
class UnderlineExtension(Extension):
    """Markdown extension that adds ``___text___`` underline support."""

    def extendMarkdown(self, md):
        """Register the underline inline processor on *md*.

        The processor is registered under the name ``'underline'`` with
        priority 175.
        """
        underline_processor = UnderlineInlineProcessor(r'___(.*?)___', md)
        md.inlinePatterns.register(underline_processor, 'underline', 175)
class EscapeListPreprocessor(Preprocessor):
    """Preprocessor that puts a zero-width space in front of list markers.

    A line is rewritten when, after optional leading spaces/tabs, it starts
    with ``-``, ``*`` or ``+`` followed by whitespace, or with digits, a dot
    and whitespace. U+200B is inserted between the indentation and the
    marker; presumably this keeps Markdown from parsing the line as a list
    item.
    """

    def run(self, lines):
        """Return a new list of lines with leading list markers escaped.

        Args:
            lines: Iterable of source lines.

        Returns:
            A list with one (possibly rewritten) entry per input line.
        """
        marker = re.compile(r'^([ \t]*)([-*+]\s|\d+\.\s)')

        def escape(match):
            # Re-emit indentation and marker with U+200B wedged between them.
            indent, bullet = match.groups()
            return f'{indent}\u200B{bullet}'

        return [marker.sub(escape, line) for line in lines]
class EscapeListExtension(Extension):
    """Markdown extension that installs ``EscapeListPreprocessor``."""

    def extendMarkdown(self, md):
        """Register the list-escaping preprocessor on *md*.

        The preprocessor is registered under the name ``'escape_list'`` with
        priority 175.
        """
        list_escaper = EscapeListPreprocessor(md)
        md.preprocessors.register(list_escaper, 'escape_list', 175)
def markdown_to_html(text):
    """Render Markdown *text* to HTML with list escaping and underline support.

    A fresh ``Markdown`` converter is built with ``EscapeListExtension`` and
    ``UnderlineExtension``. After conversion, every U+200B character is
    removed from the output and each ``<blockquote>`` element is set off by
    a newline before its opening tag and after its closing tag.

    Args:
        text: Markdown source.

    Returns:
        The post-processed HTML string.
    """
    converter = Markdown(extensions=[EscapeListExtension(), UnderlineExtension()])
    rendered = converter.convert(text)

    # Applied in order: drop zero-width spaces, then pad blockquote tags.
    substitutions = (
        ('\u200B', ''),
        ('<blockquote>', '\n<blockquote>'),
        ('</blockquote>', '</blockquote>\n'),
    )
    for needle, replacement in substitutions:
        rendered = rendered.replace(needle, replacement)
    return rendered
def remove_paragraph_tags(text):
    """Strip ``<p>``/``</p>`` tags from *text*, keeping paragraph breaks.

    A closing tag followed (optionally after whitespace) by an opening tag
    becomes a blank line (``"\\n\\n"``); every remaining paragraph tag is
    deleted. Opening tags may carry attributes (``<p class="x">``), so a
    paragraph is never left with only its closing tag removed.

    Args:
        text: HTML fragment.

    Returns:
        The fragment without paragraph tags, stripped of leading and
        trailing whitespace.
    """
    # `(?:\s[^>]*)?` admits attributes; the mandatory whitespace before them
    # keeps other tags that merely start with "p" (<pre>, <param>) intact.
    text = re.sub(r"</p>\s*<p(?:\s[^>]*)?>", "\n\n", text)
    text = re.sub(r"</?p(?:\s[^>]*)?>", "", text)
    return text.strip()
def html_to_text(html):
    """Extract plain text from an HTML string.

    The markup is parsed with BeautifulSoup's ``'html.parser'``. If any
    ``<p>`` elements are found, only their texts are used, joined by a blank
    line; text outside ``<p>`` elements is not included in that case. If
    there are no ``<p>`` elements, the text of the whole document is used.
    The result is passed through ``html.unescape`` before being returned.

    Args:
        html: HTML source string.

    Returns:
        The extracted, unescaped text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    paragraph_texts = [p.get_text() for p in soup.find_all('p')]

    if not paragraph_texts:
        # No <p> elements at all: fall back to the whole document's text.
        return unescape(soup.get_text())

    return unescape("\n\n".join(paragraph_texts))