import re
import xml.etree.ElementTree
from html import unescape

from bs4 import BeautifulSoup
from markdown import Markdown
from markdown.extensions import Extension
from markdown.inlinepatterns import InlineProcessor
from markdown.preprocessors import Preprocessor

# NOTE(review): the '<p>' / '</p>' literals in markdown_to_html and
# remove_paragraph_tags were restored after markup stripping had removed them
# from this file; confirm them against version control.


class UnderlineInlineProcessor(InlineProcessor):
    """Inline processor that wraps the first capture group in a ``<u>`` element."""

    def handleMatch(self, m, data):
        """Build a ``<u>`` element from the match.

        Args:
            m: Regex match object; group 1 is the text to underline.
            data: The text block being scanned (unused here).

        Returns:
            A ``(element, start, end)`` tuple where ``start``/``end`` delimit
            the whole match (including the ``___`` delimiters).
        """
        el = xml.etree.ElementTree.Element('u')
        el.text = m.group(1)
        return el, m.start(0), m.end(0)


class UnderlineExtension(Extension):
    """Markdown extension that renders ``___text___`` as ``<u>text</u>``."""

    def extendMarkdown(self, md):
        """Register the underline inline pattern on ``md`` with priority 175."""
        # Presumably 175 is chosen so this pattern is tried before the
        # built-in emphasis patterns that also consume underscores -- confirm
        # against the Python-Markdown inline pattern registry.
        md.inlinePatterns.register(
            UnderlineInlineProcessor(r'___(.*?)___', md), 'underline', 175
        )


class EscapeListPreprocessor(Preprocessor):
    """Preprocessor that tags list markers at line start with U+200B."""

    # Optional leading spaces/tabs, then a bullet (-, *, +) or "<digits>."
    # followed by one whitespace character.
    _LIST_MARKER = re.compile(r'^([ \t]*)([-*+]\s|\d+\.\s)')

    def run(self, lines):
        """Insert a zero-width space between the indent and any list marker.

        Args:
            lines: The document's source lines.

        Returns:
            A new list of lines; lines without a leading list marker are
            returned unchanged.
        """
        # The zero-width space presumably stops the block parser from
        # recognising the line as a list item; markdown_to_html strips the
        # character again after conversion.
        return [
            self._LIST_MARKER.sub(
                lambda m: f'{m.group(1)}\u200B{m.group(2)}', line
            )
            for line in lines
        ]


class EscapeListExtension(Extension):
    """Markdown extension that installs :class:`EscapeListPreprocessor`."""

    def extendMarkdown(self, md):
        """Register the list-escaping preprocessor on ``md`` with priority 175."""
        md.preprocessors.register(EscapeListPreprocessor(md), 'escape_list', 175)


def markdown_to_html(text):
    """Convert Markdown ``text`` to HTML with paragraph tags flattened.

    The text is converted with the list-escaping and underline extensions
    enabled. Afterwards every U+200B character is removed (including any that
    were already present in ``text``), opening ``<p>`` tags are dropped and
    closing ``</p>`` tags are replaced by a blank line.

    Args:
        text: Markdown source.

    Returns:
        The resulting HTML string.
    """
    converter = Markdown(extensions=[EscapeListExtension(), UnderlineExtension()])
    html = converter.convert(text)
    html = html.replace('\u200B', '')
    html = html.replace('<p>', '')
    html = html.replace('</p>', '\n\n')
    return html


def remove_paragraph_tags(text):
    """Strip ``<p>``/``</p>`` tags, keeping paragraph breaks as blank lines.

    Args:
        text: HTML fragment.

    Returns:
        ``text`` with each ``</p>``...``<p>`` boundary turned into two
        newlines, all remaining paragraph tags removed, and leading/trailing
        whitespace stripped.
    """
    # Collapse paragraph boundaries first so the separator survives the
    # removal of the individual tags below.
    text = re.sub(r"</p>\s*<p>", r"\n\n", text)
    text = re.sub(r"</?p>", "", text)
    return text.strip()


def html_to_text(html):
    """Extract plain text from ``html``.

    Args:
        html: HTML markup.

    Returns:
        The text of all ``<p>`` elements joined by blank lines, or the text of
        the whole document when it contains no ``<p>`` element, passed through
        ``html.unescape``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p')
    if paragraphs:
        # join the text of all paragraphs with two newlines between them
        text = "\n\n".join(p.get_text() for p in paragraphs)
    else:
        # fallback if no <p> tags are found
        text = soup.get_text()
    return unescape(text)