idb_utils/idb/wordpress.py
2025-03-06 02:04:06 +03:00

132 lines
3.9 KiB
Python

import requests, re
from bs4 import BeautifulSoup
from html import unescape
from markdownify import MarkdownConverter
from markdown import markdown
from enum import Enum
class WordpressMarkdownConverter(MarkdownConverter):
    """MarkdownConverter subclass with special handling for WordPress markup.

    Headings and paragraphs that wrap an ``<img>`` are collapsed into a bare
    Markdown image, WordPress caption ``<div>`` blocks become an image whose
    alt text is the caption, and paragraphs whose opening tag is exactly
    ``<p style="text-align:center;">`` are emitted inside ``<center>`` tags.
    """

    # A heading or paragraph containing an <img>; group 1 is the image URL
    # with any query string dropped.
    _INLINE_IMAGE_RE = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')

    # A caption block: group 1 is the image URL (query string dropped),
    # group 2 is the caption markup.  re.S lets '.' span line breaks.
    _CAPTION_IMAGE_RE = re.compile(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', re.S)

    def convert_p(self, el, text, convert_as_inline):
        """Convert a ``<p>``: image first, then centered text, else default."""
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        if not str(el).startswith('<p style="text-align:center;">'):
            return super().convert_p(el, text, convert_as_inline)
        # Blank-line breaks inside a centered paragraph become <br> so the
        # whole paragraph stays within one <center> element.
        centered = text.replace('\n\n', '<br>')
        return f'<center>{centered}</center>\n\n'

    def _convert_hn(self, n, el, text, convert_as_inline):
        """Convert a heading; a heading wrapping an image becomes the image."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super()._convert_hn(n, el, text, convert_as_inline)
        return image_md

    def convert_a(self, el, *args):
        """Convert an ``<a>``, trying the image shortcut first.

        NOTE(review): both image patterns are anchored to an opening
        ``<h1-6>``/``<p>``/``<div>`` tag, so the shortcut presumably never
        matches when ``el`` itself is an ``<a>`` -- confirm.
        """
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super().convert_a(el, *args)
        return image_md

    def convert_div(self, el, *args):
        """Convert a ``<div>``; WordPress caption blocks become images."""
        is_caption = str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"')
        if is_caption:
            image_md = self.idb_convert_image(el)
            if image_md is not None:
                return image_md
        # NOTE(review): this delegates to the parent's convert_a, not a
        # div-specific converter -- possibly a copy-paste slip; confirm
        # against the installed markdownify version before changing.
        return super().convert_a(el, *args)

    def idb_convert_image(self, el):
        """Return Markdown image syntax for *el*, or None if it is not one.

        The element is serialized with ``str(el)`` and matched against the
        two class-level patterns.  A heading/paragraph image yields
        ``![](src)``; a caption block yields ``![caption](src)`` with HTML
        entities in the caption unescaped.  Both forms end with a blank line.
        """
        markup = str(el)

        inline = self._INLINE_IMAGE_RE.search(markup)
        if inline is not None:
            return f'![]({inline.group(1)})\n\n'

        captioned = self._CAPTION_IMAGE_RE.search(markup)
        if captioned is None:
            return None
        src, caption = captioned.groups()
        return f'![{unescape(caption)}]({src})\n\n'
def _markdownify(html, **options):
    """Convert *html* to Markdown with WordpressMarkdownConverter.

    Keyword *options* are passed unchanged to the converter's constructor.
    """
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
def markdown_from_html(html):
    """Convert article HTML to Markdown.

    The HTML is run through ``_markdownify`` (keeping inline images inside
    ``a``, ``h1``, ``div`` and ``p``), the result is split on blank lines,
    each chunk is stripped, and chunks that consist solely of one of a few
    marker strings (presumably section dividers such as ``# # #``) are
    wrapped in ``<center>`` tags.  Chunks are re-joined with blank lines.
    """
    markers = ('#', '# #', '# # #', '##', '###', '___')

    def center_if_marker(chunk):
        stripped = chunk.strip()
        if stripped not in markers:
            return stripped
        return f'<center>{stripped}</center>'

    converted = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p'])
    chunks = converted.strip().split('\n\n')
    return '\n\n'.join([center_if_marker(chunk) for chunk in chunks])
def html_from_markdown(s):
    """Render the Markdown text *s* to HTML.

    Uses ``markdown()`` with the ``extra`` and ``tables`` extensions enabled.
    """
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)
class Article:
    """An article held simultaneously as HTML and as Markdown.

    Instances are normally built through one of the alternate constructors,
    which derive the missing representation from the one supplied.
    """

    # Article title as plain text.
    title: str
    # Article body as HTML.
    html: str
    # Article body as Markdown.
    md: str

    def __init__(self, title, html, md):
        """Store the title and both body representations as given."""
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML, deriving Markdown from it."""
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from Markdown, deriving HTML from it."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Build an article from a Markdown file.

        Args:
            filename: Path of the Markdown file to read.
            with_title: When true, the first line must be a ``#`` heading;
                it becomes the title and is removed from the body together
                with one blank line directly after it, if present.  When
                false, the whole file is the body and the title is ``''``.

        Returns:
            The article produced by ``from_markdown``.

        Raises:
            ValueError: If *with_title* is true and the file is empty or its
                first line does not start with ``#``.
        """
        if not with_title:
            with open(filename) as f:
                return cls.from_markdown('', f.read())

        with open(filename) as f:
            lines = f.readlines()

        # An empty file has no heading line; report it the same way as any
        # other file whose first line is not a heading.
        first_line = lines[0].strip() if lines else ''
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()

        body = lines[1:]
        if body and not body[0].strip():
            body = body[1:]

        # readlines() keeps each line's trailing newline, so the body is
        # re-assembled by plain concatenation to leave it exactly as written.
        return cls.from_markdown(title, ''.join(body))
class Language(Enum):
    """Languages known to this module.

    Each member's value is a two-letter lowercase code (presumably
    ISO 639-1 -- confirm against callers).
    """

    English = 'en'
    Russian = 'ru'
def fetch_article(url, timeout=30) -> Article:
    """Download a WordPress post and return it as an Article.

    The page is fetched over HTTP and parsed with BeautifulSoup.  The
    element with id ``jp-post-flair`` (presumably the Jetpack sharing/likes
    footer -- confirm) is removed if present, the ``div.entry-content``
    element supplies the body HTML, and the element with class
    ``entry-title`` supplies the title.

    Args:
        url: Address of the post to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``
            so an unresponsive host cannot block the caller forever.

    Returns:
        An Article built with ``Article.from_html``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``entry-content`` div or no
            ``entry-title`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page as a post.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise yield the literal text "None" as HTML.
        raise ValueError(f'no "entry-content" div found at {url}')

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f'no "entry-title" element found at {url}')

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))
    return Article.from_html(title, html)