132 lines
3.9 KiB
Python
132 lines
3.9 KiB
Python
import requests, re
|
|
from bs4 import BeautifulSoup
|
|
from html import unescape
|
|
from markdownify import MarkdownConverter
|
|
from markdown import markdown
|
|
from enum import Enum
|
|
|
|
|
|
class WordpressMarkdownConverter(MarkdownConverter):
    """MarkdownConverter with special handling for WordPress post markup.

    On top of the parent converter this class:

    * turns a heading or paragraph whose markup contains an ``<img>`` into a
      stand-alone Markdown image (see :meth:`idb_convert_image`);
    * turns a WordPress caption ``<div>`` into a Markdown image whose alt
      text is the caption;
    * wraps paragraphs that start with ``<p style="text-align:center;">`` in
      a ``<center>`` element.

    Everything else is delegated to the parent class.
    """

    # Literal prefixes that the serialized element must start with.
    _CENTERED_P_PREFIX = '<p style="text-align:center;">'
    _CAPTION_DIV_PREFIX = (
        '<div class="wp-caption aligncenter" data-shortcode="caption"'
    )

    # <hN>/<p> that contains an <img>; group 1 is the src up to any '?'.
    _INLINE_IMAGE_RE = re.compile(
        r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"'
    )
    # WordPress caption block; group 1 is the src up to any '?', group 2 is
    # the inner markup of the caption paragraph.
    _CAPTION_IMAGE_RE = re.compile(
        r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+>'
        r'<a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment'
        r'.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a>'
        r'<p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>',
        re.S,
    )

    def convert_p(self, el, text, convert_as_inline):
        """Convert a ``<p>``: image paragraph, centred paragraph, or default."""
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        if str(el).startswith(self._CENTERED_P_PREFIX):
            # Paragraph breaks inside the centred text become <br> so the
            # whole paragraph stays inside one <center> element.
            text = text.replace('\n\n', '<br>')
            return f'<center>{text}</center>\n\n'
        return super().convert_p(el, text, convert_as_inline)

    def _convert_hn(self, n, el, text, convert_as_inline):
        """Convert a heading; a heading that wraps an image becomes the image."""
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        return super()._convert_hn(n, el, text, convert_as_inline)

    def convert_a(self, el, *args):
        """Convert an ``<a>``, consulting the image helper first.

        Both image patterns are anchored on ``<p``, ``<hN`` or the caption
        ``<div``, so for markup starting with ``<a`` the helper returns
        ``None`` and the parent implementation is used.
        """
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        return super().convert_a(el, *args)

    def convert_div(self, el, *args):
        """Convert a ``<div>``; WordPress caption blocks become images."""
        if str(el).startswith(self._CAPTION_DIV_PREFIX):
            image_md = self.idb_convert_image(el)
            if image_md is not None:
                return image_md
        # Prefer the parent's own <div> handler. If the installed parent
        # class does not define one, fall back to its <a> handler.
        # NOTE(review): convert_a presumably yields just the text for an
        # element without href - verify against the markdownify version in
        # use.
        parent_convert = getattr(super(), 'convert_div', None)
        if parent_convert is None:
            parent_convert = super().convert_a
        return parent_convert(el, *args)

    def idb_convert_image(self, el):
        """Return Markdown image syntax for *el*, or ``None`` if not an image.

        Two shapes of serialized markup are recognised:

        * a heading or paragraph containing an ``<img>``: emitted as an image
          with empty alt text;
        * a WordPress caption ``<div>``: emitted as an image whose alt text
          is the HTML-unescaped caption.

        In both cases the query string of the ``src`` URL is dropped and the
        result ends with a blank line.

        Args:
            el: the element being converted; only ``str(el)`` is inspected.

        Returns:
            The Markdown image string, or ``None`` when neither pattern
            matches.
        """
        html = str(el)

        match = self._INLINE_IMAGE_RE.search(html)
        if match:
            return f'![]({match.group(1)})\n\n'

        match = self._CAPTION_IMAGE_RE.search(html)
        if match:
            src, title = match.groups()
            title = unescape(title)
            return f'![{title}]({src})\n\n'

        return None
|
|
|
|
|
|
def _markdownify(html, **options):
    """Convert *html* to Markdown using :class:`WordpressMarkdownConverter`.

    All keyword arguments are forwarded unchanged to the converter's
    constructor.
    """
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
|
|
|
|
|
|
def markdown_from_html(html):
    """Convert an HTML fragment to Markdown.

    The converted text is split into blocks on blank lines. Each block is
    stripped, and a block that consists solely of one of a fixed set of
    marker strings is wrapped in ``<center>``. The blocks are then re-joined
    with blank lines.
    """
    # Blocks equal to one of these (after stripping) are centred.
    centered_markers = ('#', '# #', '# # #', '##', '###', '___')

    def normalize_block(block):
        stripped = block.strip()
        if stripped in centered_markers:
            return f'<center>{stripped}</center>'
        return stripped

    converted = _markdownify(
        html, keep_inline_images_in=['a', 'h1', 'div', 'p']
    ).strip()
    blocks = [normalize_block(block) for block in converted.split('\n\n')]
    return '\n\n'.join(blocks)
|
|
|
|
|
|
def html_from_markdown(s):
    """Render Markdown text *s* to HTML.

    The ``extra`` and ``tables`` extensions are enabled.
    """
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)
|
|
|
|
|
|
class Article:
    """An article kept in both HTML and Markdown form.

    Instances are normally built through one of the ``from_*`` constructors,
    which derive the missing representation from the one supplied.
    """

    title: str  # article title
    html: str  # article body as HTML
    md: str  # article body as Markdown

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML; the Markdown form is derived from it."""
        # Use cls (not the concrete class name) so subclasses get instances
        # of their own type.
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from Markdown; the HTML form is rendered from it."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Build an article from a Markdown file read as UTF-8.

        Args:
            filename: path of the Markdown file.
            with_title: when true, the first line must be a ``#`` heading; it
                becomes the title (leading ``#`` characters and surrounding
                whitespace removed) and is dropped from the body, together
                with one blank line directly after it. When false, the whole
                file is the body and the title is the empty string.

        Returns:
            The article produced by :meth:`from_markdown`.

        Raises:
            ValueError: if ``with_title`` is true and the file is empty or
                its first line does not start with ``#``.
        """
        if not with_title:
            with open(filename, encoding='utf-8') as f:
                md = f.read()
            return cls.from_markdown('', md)

        with open(filename, encoding='utf-8') as f:
            lines = f.readlines()

        # An empty file has no heading line; report it like a missing heading
        # instead of failing on lines[0].
        first_line = lines[0].strip() if lines else ''
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()

        body = lines[1:]
        if body and not body[0].strip():
            body = body[1:]

        # readlines() keeps each line's newline, so plain concatenation
        # reproduces the body exactly as it appears in the file.
        return cls.from_markdown(title, ''.join(body))
|
|
|
|
|
|
class Language(Enum):
    """Language identifiers; each member's value is a two-letter code."""

    # Member values are lowercase two-letter strings.
    English = 'en'
    Russian = 'ru'
|
|
|
|
|
|
def fetch_article(url, *, timeout=30) -> Article:
    """Download a post page and return it as an :class:`Article`.

    The page is expected to contain an element with class ``entry-title``
    and a ``<div class="entry-content">``. An element with id
    ``jp-post-flair`` is removed before the content is extracted.

    Args:
        url: address of the post page.
        timeout: seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        The article built with ``Article.from_html`` from the page's title
        and content markup.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        ValueError: if the page has no ``entry-content`` div or no
            ``entry-title`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise become the article body.
        raise ValueError(f'no "entry-content" div found at {url}')

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f'no "entry-title" element found at {url}')

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))

    return Article.from_html(title, html)
|