import requests, re
from bs4 import BeautifulSoup
from html import unescape
from markdownify import MarkdownConverter
from markdown import markdown
from enum import Enum
# MarkdownConverter subclass whose overridden hooks (convert_p, _convert_hn,
# convert_a) first offer the element to self.idb_convert_image(el) and fall
# back to the inherited conversion only when that returns None.
# NOTE(review): this class looks damaged in the checked-in text. Indentation
# is missing, and the string literals inside convert_p and convert_div are
# split across lines or truncated (it looks like HTML markup inside them was
# stripped). Restore those literals from the original source before relying
# on this code; the comments below describe only what is still visible.
class WordpressMarkdownConverter(MarkdownConverter):
# Paragraph hook: el is the element, text its already-converted content.
def convert_p(self, el, text, convert_as_inline):
md = self.idb_convert_image(el)
if md is not None:
return md
# Special case keyed on how the element's markup starts; '\n\n' inside the
# text is replaced and the result is returned followed by a blank line.
# NOTE(review): the prefix being tested, the replacement string and the
# f-string prefix all appear truncated here - confirm the intended values.
if str(el).startswith('
'):
text = text.replace('\n\n', '
')
return f'
{text}\n\n'
# Everything else gets the stock paragraph conversion.
return super().convert_p(el, text, convert_as_inline)
# Heading hook (n is passed through unchanged to the parent implementation).
def _convert_hn(self, n, el, text, convert_as_inline):
md = self.idb_convert_image(el)
if md is not None:
return md
return super()._convert_hn(n, el, text, convert_as_inline)
# Link hook: extra positional arguments are forwarded untouched to the parent.
def convert_a(self, el, *args):
md = self.idb_convert_image(el)
if md is not None:
return md
return super().convert_a(el, *args)
# NOTE(review): from here on the text is incomplete. The rest of convert_div
# and the `def` line of idb_convert_image (called above but not defined in
# the visible text) are presumably lost inside the truncated literal below;
# `pattern` and `html` are used without a visible assignment - confirm
# against the original file.
def convert_div(self, el, *args):
if str(el).startswith(']*>.*?
]+>]+>
]*>(.*?)', re.S)
# Search html with the pattern; on a match the two captured groups are
# unpacked as (src, title), the title is HTML-unescaped, and a string ending
# in a blank line is returned. With no match the code falls off the end,
# i.e. returns None, which is the "not handled" signal the hooks above test.
# NOTE(review): the returned f-string currently contains only '\n\n' and
# uses neither src nor title - its markup looks stripped as well.
match = pattern.search(html)
if match:
src, title = match.groups()
title = unescape(title)
return f'\n\n'
def _markdownify(html, **options):
    """Convert ``html`` to Markdown with ``WordpressMarkdownConverter``.

    Any keyword arguments are passed straight to the converter's constructor.
    """
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
def markdown_from_html(html):
    """Convert an HTML fragment to Markdown, one normalised paragraph at a time.

    The HTML is converted with ``_markdownify`` (inline images are kept inside
    ``a``, ``h1``, ``div`` and ``p`` elements), the result is stripped, split
    on blank lines, and every paragraph is stripped individually before the
    paragraphs are joined back together with blank lines.
    """
    # Paragraphs that consist solely of one of these marker sequences get
    # their own branch below.
    marker_only = ('#', '# #', '# # #', '##', '###', '___')

    def normalise(paragraph):
        stripped = paragraph.strip()
        if stripped not in marker_only:
            return stripped
        # NOTE(review): as written this returns the paragraph unchanged; the
        # f-string may originally have wrapped it in markup - confirm.
        return f'{stripped}'

    converted = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p'])
    paragraphs = converted.strip().split('\n\n')
    return '\n\n'.join(normalise(paragraph) for paragraph in paragraphs)
def html_from_markdown(s):
    """Render the Markdown source ``s`` to HTML.

    Delegates to ``markdown`` with the 'extra' and 'tables' extensions enabled.
    """
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)
class Article:
    """An article held in both representations: HTML and Markdown.

    Build instances through the alternate constructors so that the two
    representations are derived from one another.
    """

    title: str  # article title (plain text)
    html: str   # article body as HTML
    md: str     # article body as Markdown

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Create an article from HTML, deriving the Markdown body from it."""
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Create an article from Markdown, deriving the HTML body from it."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Create an article from a Markdown file.

        Args:
            filename: Path of the Markdown file to read.
            with_title: When true, the first line must be a ``#`` heading; it
                becomes the title and is removed from the body, together with
                one blank line directly after it. When false, the whole file
                is the body and the title is the empty string.

        Raises:
            ValueError: If ``with_title`` is true and the file is empty or its
                first line does not start with ``#``.
        """
        if not with_title:
            with open(filename) as f:
                return cls.from_markdown('', f.read())

        with open(filename) as f:
            lines = f.readlines()

        # An empty file has no first line; report it as a missing title
        # instead of failing with an IndexError.
        first_line = lines[0].strip() if lines else ''
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()

        body = lines[1:]
        if body and not body[0].strip():
            # Drop the single blank line that separates title and body.
            body = body[1:]

        # readlines() keeps each line's trailing newline, so a plain
        # concatenation reproduces the file's own line and paragraph layout.
        return cls.from_markdown(title, ''.join(body))
class Language(Enum):
    """Languages known to this module; each value is a two-letter code."""

    English = 'en'
    Russian = 'ru'
def fetch_article(url, timeout=30) -> Article:
    """Download a post and return it as an ``Article``.

    The page is parsed with BeautifulSoup; the element with id
    ``jp-post-flair`` is removed if present, the ``div`` with class
    ``entry-content`` supplies the HTML body and the element with class
    ``entry-title`` supplies the title.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded to
            ``requests.get``.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status.
        ValueError: If the page has no ``entry-content`` div or no
            ``entry-title`` element.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        # Remove the block from the tree so it does not end up in the body.
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise become the literal article body 'None'.
        raise ValueError(f'no "entry-content" div found at {url}')

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f'no "entry-title" element found at {url}')

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))
    return Article.from_html(title, html)