]*>.*?]+>]+>

]*>(.*?)

', re.S) match = pattern.search(html) if match: src, title = match.groups() title = unescape(title) return f'![{title}]({src})\n\n' def _markdownify(html, **options): return WordpressMarkdownConverter(**options).convert(html) def markdown_from_html(html): def mapper(s): s = s.strip() if s in ('#', '# #', '# # #', '##', '###', '___'): return f'{s}' return s md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip() return '\n\n'.join(map(mapper, md.split('\n\n'))) def html_from_markdown(s): return markdown(s, extensions=['extra', 'tables']) class Article: title: str html: str md: str def __init__(self, title, html, md): self.title = title self.html = html self.md = md @classmethod def from_html(cls, title, html): return Article(title, html, markdown_from_html(html)) @classmethod def from_markdown(cls, title, md): return Article(title, html_from_markdown(md), md) @classmethod def from_markdown_file(cls, filename, with_title=True): if with_title: with open(filename) as f: lines = f.readlines() first_line = lines[0].strip() if not first_line.startswith('#'): raise ValueError('first line must start with #') title = first_line.lstrip('#').strip() lines.pop(0) if lines and not lines[0].strip(): lines.pop(0) md = '\n\n'.join(lines) else: with open(filename) as f: md = f.read() title = '' return cls.from_markdown(title, md) class Language(Enum): English = 'en' Russian = 'ru' def fetch_article(url) -> Article: response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') flair_element = soup.find(id="jp-post-flair") if flair_element: flair_element.decompose() html = str(soup.find("div", class_="entry-content")).strip() title = unescape(soup.find(class_="entry-title").get_text(strip=True)) return Article.from_html(title, html)