idb_utils/idb/wordpress.py
2025-02-08 04:54:42 +03:00

23 lines
666 B
Python

import requests
from bs4 import BeautifulSoup
from html import unescape
from markdownify import markdownify
from collections import namedtuple
# Immutable record describing one scraped article: its title, the HTML of the
# content container, and that same HTML converted to Markdown.
ArticleContent = namedtuple("ArticleContent", "title html md")
def fetch(url, *, timeout=30) -> ArticleContent:
    """Download a post page and extract its title and body.

    The page is parsed with BeautifulSoup's ``html.parser``. The element with
    id ``jp-post-flair`` is removed when present so that it does not leak into
    the extracted body (presumably the Jetpack sharing/likes widget -- confirm).

    Args:
        url: Address of the post page to download.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Without it a stalled server would block forever.

    Returns:
        An ``ArticleContent`` whose ``title`` is the stripped, HTML-unescaped
        text of the first ``entry-title`` element, whose ``html`` is the
        serialized ``div.entry-content`` element, and whose ``md`` is that
        HTML converted with ``markdownify``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        ValueError: If the page has no ``div.entry-content`` or no
            ``entry-title`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise yield the literal text "None" as the body.
        raise ValueError(f"no 'entry-content' element found at {url!r}")

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f"no 'entry-title' element found at {url!r}")

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))
    md = markdownify(html).strip()
    return ArticleContent(title, html, md)