# idb_channel_bot/_textutils.py

import re
import xml.etree.ElementTree

from markdown import Markdown
from markdown.extensions import Extension
from markdown.inlinepatterns import InlineProcessor
from markdown.preprocessors import Preprocessor

from bs4 import BeautifulSoup
from html import unescape


class UnderlineInlineProcessor(InlineProcessor):
    """Inline processor that turns a regex match into a ``<u>`` element."""

    def handleMatch(self, m, data):
        """Build a ``<u>`` element from the first capture group of *m*.

        Returns the new element together with the start and end offsets of
        the whole match, i.e. the span of text the element replaces.
        """
        span_start, span_end = m.span()
        underline = xml.etree.ElementTree.Element('u')
        underline.text = m[1]
        return underline, span_start, span_end


class UnderlineExtension(Extension):
    """Markdown extension that renders ``___text___`` as underlined text."""

    def extendMarkdown(self, md):
        """Register the underline inline pattern on *md*."""
        # Non-greedy capture so each ___..___ pair is matched on its own.
        processor = UnderlineInlineProcessor(r'___(.*?)___', md)
        # NOTE(review): priority 175 is presumably chosen so this runs before
        # the built-in emphasis patterns - confirm against Python-Markdown.
        md.inlinePatterns.register(processor, 'underline', 175)


class EscapeListPreprocessor(Preprocessor):
    """Preprocessor that inserts a zero-width space before list markers.

    A line whose first non-blank content is a bullet (``-``, ``*``, ``+``)
    or a numbered marker (``1.``) followed by whitespace gets U+200B placed
    between its leading indentation and the marker.
    """

    def run(self, lines):
        """Return a new list of lines with list markers prefixed by U+200B."""
        marker = re.compile(r'^([ \t]*)([-*+]\s|\d+\.\s)')

        def escape(line):
            found = marker.match(line)
            if found is None:
                return line
            # Split right after the leading indentation (group 1) and put
            # the zero-width space in front of the marker itself.
            cut = found.end(1)
            return f'{line[:cut]}\u200B{line[cut:]}'

        return [escape(line) for line in lines]


class EscapeListExtension(Extension):
    """Markdown extension that installs ``EscapeListPreprocessor``."""

    def extendMarkdown(self, md):
        """Register the list-escaping preprocessor on *md*."""
        escaper = EscapeListPreprocessor(md)
        md.preprocessors.register(escaper, 'escape_list', 175)


def markdown_to_html(text):
    """Render Markdown *text* to HTML.

    Conversion uses the list-escaping and underline extensions. Afterwards
    every U+200B character is removed from the output, and each
    ``<blockquote>`` element is set off by a newline before its opening tag
    and after its closing tag.
    """
    converter = Markdown(extensions=[EscapeListExtension(), UnderlineExtension()])
    rendered = converter.convert(text)
    # Applied in order: drop zero-width spaces first, then pad blockquotes.
    substitutions = (
        ('\u200B', ''),
        ('<blockquote>', '\n<blockquote>'),
        ('</blockquote>', '</blockquote>\n'),
    )
    for needle, replacement in substitutions:
        rendered = rendered.replace(needle, replacement)
    return rendered


def remove_paragraph_tags(text):
    """Strip bare ``<p>``/``</p>`` tags from *text*.

    A closing tag followed (optionally across whitespace) by an opening tag
    becomes a blank line; any remaining bare paragraph tags are deleted and
    the result is trimmed of surrounding whitespace. Tags carrying
    attributes (e.g. ``<p class="x">``) are not matched.
    """
    paragraph_break = re.compile(r"</p>\s*<p>")
    bare_tag = re.compile(r"</?p>")
    merged = paragraph_break.sub(r"\n\n", text)
    return bare_tag.sub("", merged).strip()


def html_to_text(html):
    """Extract the plain text of an HTML fragment.

    The fragment is parsed with BeautifulSoup's ``html.parser`` backend.
    When it contains at least one ``<p>`` element, only the text of the
    ``<p>`` elements is returned, joined by blank lines; otherwise the text
    of the whole document is returned.

    BeautifulSoup decodes HTML entities while parsing, so the returned text
    is already unescaped. It is deliberately not passed through
    ``html.unescape`` again: a second pass would corrupt text that merely
    looks like an entity (``&amp;lt;`` must yield the literal ``&lt;``,
    not ``<``).
    """
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p')
    if paragraphs:
        # Join the text of all paragraphs with two newlines between them.
        return "\n\n".join(p.get_text() for p in paragraphs)
    # Fallback if no <p> tags are found.
    return soup.get_text()