md -> odt converter, first commit
parent 2ebf5f18fa
commit 7f4b460c96

10  .gitignore  vendored
@@ -1,3 +1,11 @@
/.idea
/.venv
/.env
/.env
/tzo_save.py
/test*py
/*.html
/*.odt
/*.md
/*.txt
/*.zip
/.DS_Store

24  cartier_odt.py  Executable file
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file


if __name__ == '__main__':
    name = 'cartier3'
    orig_path = os.path.join(os.path.dirname(__file__), f'{name}_ru')
    trans_path = os.path.join(os.path.dirname(__file__), f'{name}_en')

    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    image_urls = extract_images_from_markdown(read_file(orig_path))
    for image_url in image_urls:
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(os.path.dirname(__file__), 'images', image_name)
        if not os.path.exists(output_file):
            download_file(image_url, output_file)
            print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt'))

24  idb/__init__.py
@@ -0,0 +1,24 @@
from .wordpress import Article, fetch_article
from .translator import translate_markdown
from .doc import DocumentCreator

tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)

424  idb/doc.py  Normal file
@@ -0,0 +1,424 @@
import os.path

from odf.opendocument import OpenDocumentText
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image

from PIL import Image as PILImage

from bs4 import BeautifulSoup
from idb import Article
from idb.util import image_url_to_filename

PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
ITALIC_STYLE_NAME = 'Italic'
BOLD_STYLE_NAME = 'Bold'
CAPTION_STYLE_NAME = 'Caption'
UNDERLINE_STYLE_NAME = 'Underline'
CENTER_STYLE_NAME = 'CenterAligned'
TITLE_STYLE_NAME = 'Title'


def add_child(parent, child):
    if hasattr(child, "qname"):
        parent.addElement(child)
    else:
        parent.addText(child)


def calc_frame_dimensions(image_path, desired_width_cm):
    with PILImage.open(image_path) as img:
        orig_width, orig_height = img.size

    dpi = 96.0
    orig_width_cm = (orig_width / dpi) * 2.54
    orig_height_cm = (orig_height / dpi) * 2.54

    scale = desired_width_cm / orig_width_cm
    new_height_cm = orig_height_cm * scale

    return f"{desired_width_cm}cm", f"{new_height_cm}cm"


class ImageWrap:
    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption

    def get_elements(self, doc):
        embedded_href = doc.addPicture(self.image_file)

        desired_width = 13.5
        width_str, height_str = calc_frame_dimensions(self.image_file, desired_width)

        frm = Frame(width=width_str, height=height_str)
        img = Image(href=embedded_href, type="simple", show="embed", actuate="onLoad")
        frm.addElement(img)

        elements = [frm]

        if self.caption:
            caption = P(stylename=CAPTION_STYLE_NAME)
            caption.addText(self.caption)
            elements.append(caption)

        return elements


class DocumentCreator:
    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    def set_styles(self):
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_props = PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        )
        landscape_layout.addElement(landscape_props)
        self.doc.automaticstyles.addElement(landscape_layout)

        masterpage = MasterPage(name="Standard", pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        self.doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))
        self.doc.automaticstyles.addElement(style)

        # caption
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777'
        }))
        style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'))
        self.doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        self.doc.automaticstyles.addElement(style)

        # blockquote
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        self.doc.styles.addElement(style)

        # title
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        self.doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        self.doc.automaticstyles.addElement(style)

    def process_inline(self, node):
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()

        simple_tags = (
            ('strong', 'b'),
            ('em', 'i'),
            ('ins', 'u')
        )
        simple_styles = (
            BOLD_STYLE_NAME,
            ITALIC_STYLE_NAME,
            UNDERLINE_STYLE_NAME
        )
        for i, tags_list in enumerate(simple_tags):
            if tag in tags_list:
                span = Span(stylename=simple_styles[i])
                for child in node.contents:
                    for inline in self.process_inline(child):
                        add_child(span, inline)
                return [span]

        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]

        elif tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]

        elif tag == "img":
            if node.name and node.name.lower() == "img":
                return [self.process_img(node)]

        else:
            result = []
            for child in node.contents:
                result.extend(self.process_inline(child))
            return result

    def process_block(self, elem):
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # try converting heading
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem

                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            for child in elem.contents:
                for inline in self.process_inline(child):
                    if has_image and isinstance(inline, ImageWrap):
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        return elems

                    add_child(p_elem, inline)

            return p_elem

        elif tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        elif tag in ("ul", "ol"):
            odf_list = List()
            li_elements = elem.find_all("li", recursive=False)
            for li in li_elements:
                li_item = ListItem()
                p = P()
                for child in li.contents:
                    # if nested list is found, we'll process it later
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        add_child(p, inline)
                li_item.addElement(p)

                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        nested_list = self.process_block(child)
                        li_item.addElement(nested_list)

                odf_list.addElement(li_item)
            return odf_list

        elif tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())

        elif tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        elif tag == "table":
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    cell_odf_elements = self.html_to_odf_elements(cell_html)
                    for el in cell_odf_elements:
                        cell_elem.addElement(el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        elif tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        elif tag == "br":
            return LineBreak()

        else:
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem

    def try_process_heading(self, elem):
        if not elem.name:
            return

        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())

    def process_img(self, elem) -> ImageWrap:
        href = elem.get("src")
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')

        alt = elem.get('alt')
        return ImageWrap(saved_file, alt)

    def html_to_odf_elements(self, html):
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        keeping = False
        stack = []
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                p = P()
                p.addText(node)
                elements.append(p)
            elif node.name:
                if node.name.lower() in ("h1", "h2", "h3", "h4") and node.get_text().strip().lower() in (end.lower() for end in ending_headings):
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
                if keeping:
                    stack.append(result)
                else:
                    elements.append(result)
        if stack:
            elements.append(stack)
        return elements

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        max_len = max(len(orig_elements), len(trans_elements))
        while len(orig_elements) < max_len:
            orig_elements.append(P())
        while len(trans_elements) < max_len:
            trans_elements.append(P())

        main_table = Table(name="ComparisonTable")

        col1 = TableColumn()
        col2 = TableColumn()

        main_table.addElement(col1)
        main_table.addElement(col2)

        if with_title:
            # headings
            header_row = TableRow()

            header_cell_left = TableCell()
            header_cell_right = TableCell()

            header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title))
            header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title))

            header_row.addElement(header_cell_left)
            header_row.addElement(header_cell_right)

            main_table.addElement(header_row)

        # content
        for i in range(max_len):
            row = TableRow()

            cell_orig = TableCell()
            cell_trans = TableCell()

            if isinstance(orig_elements[i], list):
                for elem in orig_elements[i]:
                    cell_orig.addElement(elem)
            else:
                cell_orig.addElement(orig_elements[i])

            if isinstance(trans_elements[i], list):
                for elem in trans_elements[i]:
                    cell_trans.addElement(elem)
            else:
                cell_trans.addElement(trans_elements[i])

            row.addElement(cell_orig)
            row.addElement(cell_trans)

            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
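
As a quick sanity check of the frame sizing in calc_frame_dimensions above, a hedged worked example; the 1280x720 px image size is an illustrative assumption, not something from the repository:

# assumed input: a 1280x720 px image, fitted to the 13.5 cm width used in ImageWrap.get_elements
orig_width_cm = (1280 / 96.0) * 2.54    # ~33.87 cm at the assumed 96 dpi
orig_height_cm = (720 / 96.0) * 2.54    # ~19.05 cm
scale = 13.5 / orig_width_cm            # ~0.399
new_height_cm = orig_height_cm * scale  # ~7.59 cm, so the frame ends up "13.5cm" wide and roughly "7.59cm" tall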

idb/translator.py
@@ -2,11 +2,18 @@ import tiktoken
from openai import OpenAI

model = "gpt-4o"
system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations."
input_token_limit = 2000
system_prompt = """
Translate the following text from Russian to English, strictly preserving the markup and keeping the following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt](url))

Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
input_token_limit = 3500


def translate(text):
def translate_markdown(text):
    buf = []
    bufsize = 0
    cl = OpenAI()

44  idb/util.py  Normal file
@@ -0,0 +1,44 @@
import re, os.path
import requests
from urllib.parse import urlparse


def save_file(file, content):
    with open(file, 'w') as f:
        f.write(content)


def read_file(filename):
    with open(filename) as f:
        return f.read()


def name_from_url(url):
    return os.path.basename(url[:-1])


def image_url_to_filename(url):
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    name, ext = os.path.splitext(filename)
    date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path)
    if not date_match:
        raise ValueError("no valid date found in URL")
    year = date_match.group(1)
    day = date_match.group(3) if date_match.group(3) else "01"
    return f"{year}{day}_{name}{ext}"


def extract_images_from_markdown(markdown_text):
    image_pattern = r"!\[.*?\]\((.*?)\)"
    images = re.findall(image_pattern, markdown_text)
    return images


def download_file(url, filename):
    response = requests.get(url, stream=True)
    response.raise_for_status()
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
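
A hedged usage sketch of the two image helpers above; the URL is illustrative, and the resulting name follows from the regex (the month is unused, and the day falls back to "01" when the path has no day segment):

from idb.util import extract_images_from_markdown, image_url_to_filename

md = 'intro text ![cloud picture](https://example.files.wordpress.com/2012/12/cloud.jpg)'
urls = extract_images_from_markdown(md)   # ['https://example.files.wordpress.com/2012/12/cloud.jpg']
print(image_url_to_filename(urls[0]))     # '201201_cloud.jpg'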

125  idb/wordpress.py
@@ -1,13 +1,123 @@
import requests
import requests, re
from bs4 import BeautifulSoup
from html import unescape
from markdownify import markdownify
from collections import namedtuple

ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
from markdownify import MarkdownConverter
from markdown import markdown
from enum import Enum


def fetch(url) -> ArticleContent:
class WordpressMarkdownConverter(MarkdownConverter):
    def convert_p(self, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        if str(el).startswith('<p style="text-align:center;">'):
            text = text.replace('\n\n', '<br>')
            return f'<center>{text}</center>\n\n'
        return super().convert_p(el, text, convert_as_inline)

    def _convert_hn(self, n, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super()._convert_hn(n, el, text, convert_as_inline)

    def convert_a(self, el, *args):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super().convert_a(el, *args)

    def convert_div(self, el, *args):
        if str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"'):
            md = self.idb_convert_image(el)
            if md is not None:
                return md
        return super().convert_a(el, *args)

    def idb_convert_image(self, el):
        html = str(el)

        pattern = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')
        match = pattern.search(html)
        if match:
            return f'![]({match.group(1)})\n\n'

        pattern = re.compile(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', re.S)
        match = pattern.search(html)
        if match:
            src, title = match.groups()
            title = unescape(title)
            return f'![{title}]({src})\n\n'


def _markdownify(html, **options):
    return WordpressMarkdownConverter(**options).convert(html)


def markdown_from_html(html):
    def mapper(s):
        s = s.strip()
        if s in ('#', '# #', '# # #', '##', '###', '___'):
            return f'<center>{s}</center>'
        return s

    md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip()
    return '\n\n'.join(map(mapper, md.split('\n\n')))


def html_from_markdown(s):
    return markdown(s, extensions=['extra', 'tables'])


class Article:
    title: str
    html: str
    md: str

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        return Article(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        return Article(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        if with_title:
            with open(filename) as f:
                lines = f.readlines()

            first_line = lines[0].strip()
            if not first_line.startswith('#'):
                raise ValueError('first line must start with #')
            title = first_line.lstrip('#').strip()

            lines.pop(0)
            if lines and not lines[0].strip():
                lines.pop(0)

            md = '\n\n'.join(lines)
        else:
            with open(filename) as f:
                md = f.read()
            title = ''

        return cls.from_markdown(title, md)


class Language(Enum):
    English = 'en'
    Russian = 'ru'


def fetch_article(url) -> Article:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

@@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent:
    html = str(soup.find("div", class_="entry-content")).strip()
    title = unescape(soup.find(class_="entry-title").get_text(strip=True))
    md = markdownify(html).strip()

    return ArticleContent(title, html, md)
    return Article.from_html(title, html)

requirements.txt
@@ -1,6 +1,9 @@
odfpy~=1.4.1
pillow~=11.1.0
beautifulsoup4~=4.13.3
markdownify~=0.14.1
requests~=2.32.3
markdownify~=0.14.1
Markdown~=3.7
tiktoken~=0.8.0
openai~=1.61.1
python-dotenv~=1.0.1
tiktoken~=0.8.0
python-dotenv~=1.0.1

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
from idb.wordpress import fetch
from idb.translator import translate
from idb import fetch_article, translate_markdown
from dotenv import load_dotenv

load_dotenv()

@@ -18,8 +17,8 @@ if __name__ == '__main__':
                        help="output files")
    args = parser.parse_args()

    article = fetch(args.url)
    translation = translate(article.md)
    a = fetch_article(args.url)
    translation = translate_markdown(a.md)

    save(args.output[0], article.md)
    save(args.output[0], a.md)
    save(args.output[1], translation)

19  tzo_images.py  Executable file
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
import os.path
from idb import tzo_urls
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
from dotenv import load_dotenv

load_dotenv()


if __name__ == '__main__':
    for url in tzo_urls:
        name = name_from_url(url)
        markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
        image_urls = extract_images_from_markdown(read_file(markdown_file))
        for image_url in image_urls:
            image_name = image_url_to_filename(image_url)
            output_file = os.path.join(os.path.dirname(__file__), 'images', image_name)
            download_file(image_url, output_file)
            print(f'{image_name} saved')

15  tzo_odt.py  Executable file
@@ -0,0 +1,15 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator, tzo_urls
from idb.util import name_from_url


if __name__ == '__main__':
    for url in tzo_urls:
        name = name_from_url(url)

        orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)
        trans = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-en.txt'), with_title=False)

        doc = DocumentCreator()
        doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt'))
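
Taken together, the new modules form a small fetch -> translate -> download-images -> ODT pipeline. A hedged end-to-end sketch (the local file names and output path are illustrative; translate_markdown assumes an OpenAI key is available, e.g. via the .env file loaded with python-dotenv):

from idb import fetch_article, translate_markdown, Article, DocumentCreator
from idb.util import save_file, extract_images_from_markdown, image_url_to_filename, download_file

a = fetch_article('https://kniganews.org/2012/12/20/beyond-clouds-1/')   # WordPress post -> Article (html + md)
save_file('beyond-clouds-1_ru', a.md)
save_file('beyond-clouds-1_en', translate_markdown(a.md))                # Russian markdown -> English markdown

for url in extract_images_from_markdown(a.md):                           # cache referenced images locally
    download_file(url, 'images/' + image_url_to_filename(url))

orig = Article.from_markdown_file('beyond-clouds-1_ru', with_title=False)
trans = Article.from_markdown_file('beyond-clouds-1_en', with_title=False)
DocumentCreator().create(orig, trans, 'beyond-clouds-1.odt')             # two-column original/translation ODT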