61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
import re
|
|
import xml.etree.ElementTree
|
|
|
|
from markdown import Markdown
|
|
from markdown.extensions import Extension
|
|
from markdown.inlinepatterns import InlineProcessor
|
|
from markdown.preprocessors import Preprocessor
|
|
|
|
from bs4 import BeautifulSoup
|
|
from html import unescape
|
|
|
|
|
|
class UnderlineInlineProcessor(InlineProcessor):
    """Inline processor that wraps the captured text of a match in ``<u>``."""

    def handleMatch(self, m, data):
        """Create a ``<u>`` element for the regex match ``m``.

        Args:
            m: Match object; capture group 1 becomes the element's text.
            data: Not used by this processor.

        Returns:
            A ``(element, start, end)`` tuple where ``start`` and ``end``
            are the offsets of the whole match (group 0).
        """
        underline = xml.etree.ElementTree.Element('u')
        underline.text = m.group(1)
        span_start, span_end = m.span(0)
        return underline, span_start, span_end
|
|
|
|
|
|
class UnderlineExtension(Extension):
    """Markdown extension that enables triple-underscore underline syntax."""

    def extendMarkdown(self, md):
        """Register the underline inline processor on ``md``.

        The processor is registered under the name ``'underline'`` with
        priority 175 and matches text enclosed in three underscores on
        each side (non-greedy).
        """
        processor = UnderlineInlineProcessor(r'___(.*?)___', md)
        md.inlinePatterns.register(processor, 'underline', 175)
|
|
|
|
|
|
class EscapeListPreprocessor(Preprocessor):
    """Preprocessor that puts a zero-width space (U+200B) before list markers."""

    def run(self, lines):
        """Return new lines with U+200B inserted ahead of leading list markers.

        A line is rewritten when, after optional leading spaces or tabs, it
        starts with ``-``, ``*`` or ``+`` followed by a whitespace character,
        or with one or more digits, a dot and a whitespace character. The
        zero-width space goes between the indentation and the marker; all
        other lines pass through unchanged. Presumably this keeps the
        Markdown parser from treating such lines as list items.

        Args:
            lines: Iterable of source lines.

        Returns:
            A new list with one (possibly rewritten) entry per input line.
        """
        marker_re = re.compile(r'^([ \t]*)([-*+]\s|\d+\.\s)')

        def _shield(match):
            # Keep the indentation, then slip the zero-width space in
            # front of the marker itself.
            return f'{match.group(1)}\u200B{match.group(2)}'

        return [marker_re.sub(_shield, raw_line) for raw_line in lines]
|
|
|
|
|
|
class EscapeListExtension(Extension):
    """Markdown extension that installs ``EscapeListPreprocessor``."""

    def extendMarkdown(self, md):
        """Register the list-escaping preprocessor on ``md``.

        The preprocessor is registered under the name ``'escape_list'``
        with priority 175.
        """
        preprocessor = EscapeListPreprocessor(md)
        md.preprocessors.register(preprocessor, 'escape_list', 175)
|
|
|
|
|
|
def markdown_to_html(text):
    """Convert Markdown ``text`` to HTML with list escaping and underline support.

    A fresh ``Markdown`` converter is built on every call with
    ``EscapeListExtension`` and ``UnderlineExtension`` enabled. After the
    conversion every zero-width space (U+200B) is removed from the output;
    this also removes any such character that was already present in
    ``text``.

    Args:
        text: Markdown source.

    Returns:
        The rendered HTML string without any U+200B characters.
    """
    converter = Markdown(extensions=[EscapeListExtension(), UnderlineExtension()])
    rendered = converter.convert(text)
    return rendered.replace('\u200B', '')
|
|
|
|
|
|
def remove_paragraph_tags(text):
    """Strip paragraph tags from ``text``, keeping paragraph breaks.

    A closing paragraph tag followed (after optional whitespace) by an
    opening one is replaced with a blank line (two newlines); every
    remaining opening or closing paragraph tag is removed. Opening tags
    may carry attributes (``<p class="x">``) and tag names are matched
    case-insensitively, so ``<P>`` is handled too. Other tags that merely
    start with ``p`` (such as ``<pre>``) are left untouched.

    Args:
        text: HTML string.

    Returns:
        The text without paragraph tags, with leading and trailing
        whitespace stripped.
    """
    # Opening tag: "<p>" or "<p" + whitespace + attributes + ">". Requiring
    # whitespace after the name keeps <pre>, <param>, ... from matching.
    open_p = r"<p(?:\s[^>]*)?>"
    close_p = r"</p\s*>"

    # Boundaries between adjacent paragraphs become a blank line first, so
    # the paragraph structure survives the tag removal below.
    text = re.sub(rf"{close_p}\s*{open_p}", "\n\n", text, flags=re.IGNORECASE)
    text = re.sub(rf"{close_p}|{open_p}", "", text, flags=re.IGNORECASE)
    return text.strip()
|
|
|
|
|
|
def html_to_text(html):
    """Extract plain text from an HTML string.

    The HTML is parsed with BeautifulSoup's ``html.parser``. If the
    document contains ``<p>`` elements, the text of each one is joined
    with a blank line (two newlines) between paragraphs; otherwise the
    text of the whole document is returned.

    The parser already decodes character and entity references while
    extracting text, so the result is returned as-is. Decoding it a second
    time with ``html.unescape`` would corrupt literal text: HTML
    ``&amp;lt;`` represents the text ``&lt;`` and must not become ``<``.

    Args:
        html: HTML string.

    Returns:
        The extracted plain text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p')
    if not paragraphs:
        # Fallback if no <p> tags are found: take all text in the document.
        return soup.get_text()
    # NOTE(review): when at least one <p> exists, text outside <p> elements
    # (headings, list items, ...) is not included -- confirm this is intended.
    return "\n\n".join(p.get_text() for p in paragraphs)