# idb_utils/idb/wordpress.py

import requests, re
from bs4 import BeautifulSoup
from html import unescape
from markdownify import MarkdownConverter
from markdown import markdown
from enum import Enum


class WordpressMarkdownConverter(MarkdownConverter):
    """MarkdownConverter subclass with special handling for WordPress markup.

    Headings, paragraphs and ``wp-caption`` divs that wrap an image are
    replaced by a bare Markdown image, and paragraphs whose opening tag is
    exactly ``<p style="text-align:center;">`` are emitted inside a
    ``<center>`` element. Everything else is delegated to the base class.
    """

    # An element whose markup starts with <h1>..<h6> or <p> and contains an
    # <img>; group 1 is the image URL without its query string. No DOTALL
    # flag, so the <img> has to appear before the first newline.
    _LEADING_IMAGE = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')

    # A WordPress caption block: group 1 is the image URL without its query
    # string, group 2 is the inner HTML of the caption paragraph.
    _CAPTIONED_IMAGE = re.compile(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', re.S)

    def idb_convert_image(self, el):
        """Return Markdown for an image-bearing element, or None.

        The decision is made on the serialized markup of ``el``. When a
        pattern matches, the whole element is replaced by the image alone;
        any other content of the element is not carried over.
        """
        markup = str(el)

        leading = self._LEADING_IMAGE.search(markup)
        if leading:
            return f'![]({leading.group(1)})\n\n'

        captioned = self._CAPTIONED_IMAGE.search(markup)
        if not captioned:
            return None

        src, caption = captioned.groups()
        # The caption is taken from raw markup, so decode HTML entities.
        return f'![{unescape(caption)}]({src})\n\n'

    def convert_p(self, el, text, convert_as_inline):
        """Convert a paragraph, special-casing images and centered text."""
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md

        if not str(el).startswith('<p style="text-align:center;">'):
            return super().convert_p(el, text, convert_as_inline)

        # Paragraph breaks inside a centered block become explicit <br>.
        centered = text.replace('\n\n', '<br>')
        return f'<center>{centered}</center>\n\n'

    def _convert_hn(self, n, el, text, convert_as_inline):
        """Convert a heading, unless it merely wraps an image."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super()._convert_hn(n, el, text, convert_as_inline)
        return image_md

    def convert_a(self, el, *args):
        """Convert a link, giving the image check the first say.

        NOTE(review): both image patterns are anchored on a heading,
        paragraph or caption-div opening tag, so for an <a> element this
        check looks like it can never match -- confirm.
        """
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super().convert_a(el, *args)
        return image_md

    def convert_div(self, el, *args):
        """Convert a div, turning WordPress caption blocks into images."""
        is_caption = str(el).startswith(
            '<div class="wp-caption aligncenter" data-shortcode="caption"'
        )
        if is_caption:
            image_md = self.idb_convert_image(el)
            if image_md is not None:
                return image_md
        # NOTE(review): falling back to convert_a for a <div> looks like a
        # copy/paste slip; presumably the base class's own div handling (if
        # the installed markdownify provides one) was intended. Left as-is
        # because switching would change the generated Markdown -- confirm.
        return super().convert_a(el, *args)


def _markdownify(html, **options):
    """Convert ``html`` to Markdown with a WordpressMarkdownConverter.

    ``options`` are passed unchanged to the converter's constructor.
    """
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)


def markdown_from_html(html):
    """Convert WordPress HTML to Markdown.

    The converted text is split on blank lines; each chunk is stripped, and a
    chunk that consists solely of one of the marker strings below is wrapped
    in ``<center>`` (presumably these are decorative separators that should
    not be read as Markdown headings or rules -- confirm). The chunks are
    then re-joined with blank lines.
    """
    centered_markers = ('#', '# #', '# # #', '##', '###', '___')

    def normalise(chunk):
        chunk = chunk.strip()
        return f'<center>{chunk}</center>' if chunk in centered_markers else chunk

    converted = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p'])
    chunks = converted.strip().split('\n\n')
    return '\n\n'.join([normalise(chunk) for chunk in chunks])


def html_from_markdown(s):
    """Render the Markdown text ``s`` to HTML.

    The 'extra' and 'tables' extensions of the ``markdown`` package are
    enabled for the conversion.
    """
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)


class Article:
    """An article kept as both HTML and Markdown, together with its title.

    Use the ``from_*`` constructors to build an instance from one
    representation; the other one is derived automatically.
    """

    title: str
    html: str
    md: str

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML; the Markdown is derived from it."""
        # Use cls (not Article) so subclasses get instances of themselves.
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from Markdown; the HTML is derived from it."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Build an article from a UTF-8 Markdown file.

        Args:
            filename: Path of the Markdown file to read.
            with_title: When true, the first line must be a ``#`` heading; it
                becomes the title and is removed from the body, along with
                one blank line directly after it. When false, the whole file
                is the body and the title is empty.

        Raises:
            ValueError: If ``with_title`` is true and the file is empty or
                its first line does not start with ``#``.
        """
        # Explicit encoding: do not depend on the platform default.
        with open(filename, encoding='utf-8') as f:
            lines = f.readlines()

        if not with_title:
            return cls.from_markdown('', ''.join(lines))

        first_line = lines[0].strip() if lines else ''
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()

        body = lines[1:]
        if body and not body[0].strip():
            body = body[1:]

        # readlines() keeps each line's terminator, so a plain concatenation
        # reproduces the original text; joining with extra blank lines would
        # split every line into its own paragraph and break lists and tables.
        return cls.from_markdown(title, ''.join(body))


class Language(Enum):
    """Languages, each identified by a two-letter code string.

    NOTE(review): nothing in the visible part of this module references this
    enum; presumably it is consumed by callers elsewhere -- confirm.
    """

    English = 'en'
    Russian = 'ru'


def fetch_article(url, timeout=30) -> Article:
    """Download a WordPress post and return it as an Article.

    The page's ``jp-post-flair`` element, when present, is removed before the
    ``entry-content`` div is taken as the article body and the
    ``entry-title`` element as the title.

    Args:
        url: Address of the post to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout the request could block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``entry-content`` div or no
            ``entry-title`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise become the literal article body "None".
        raise ValueError(f'no entry-content element found at {url}')

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f'no entry-title element found at {url}')

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))

    return Article.from_html(title, html)