md -> odt converter, first commit
This commit is contained in:
parent
2ebf5f18fa
commit
7f4b460c96
10
.gitignore
vendored
10
.gitignore
vendored
@ -1,3 +1,11 @@
|
|||||||
/.idea
|
/.idea
|
||||||
/.venv
|
/.venv
|
||||||
/.env
|
/.env
|
||||||
|
/tzo_save.py
|
||||||
|
/test*py
|
||||||
|
/*.html
|
||||||
|
/*.odt
|
||||||
|
/*.md
|
||||||
|
/*.txt
|
||||||
|
/*.zip
|
||||||
|
/.DS_Store
|
24
cartier_odt.py
Executable file
24
cartier_odt.py
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env python3
"""Build a side-by-side RU/EN ODT for the 'cartier3' article.

Reads the two markdown sources next to this script, downloads any images
referenced by the Russian original that are not on disk yet, then renders
both articles into a two-column comparison document.
"""
import os.path

from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file


if __name__ == '__main__':
    name = 'cartier3'
    base_dir = os.path.dirname(__file__)

    orig_path = os.path.join(base_dir, f'{name}_ru')
    trans_path = os.path.join(base_dir, f'{name}_en')

    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    # Fetch any images referenced by the original that are missing locally;
    # DocumentCreator expects them to exist in ./images when embedding.
    for image_url in extract_images_from_markdown(read_file(orig_path)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(base_dir, 'images', image_name)
        if not os.path.exists(output_file):
            download_file(image_url, output_file)
            print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(base_dir, f'{name}.odt'))
|
@ -0,0 +1,24 @@
|
|||||||
|
# Package surface: re-export the main pipeline pieces so callers can write
# `from idb import Article, DocumentCreator, tzo_urls, ...`.
from .wordpress import Article, fetch_article
from .translator import translate_markdown
from .doc import DocumentCreator

# Source URLs of the "beyond clouds" (TZO) series on kniganews, in reading
# order; consumed by the tzo_*.py scripts for bulk download/convert runs.
tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)
|
424
idb/doc.py
Normal file
424
idb/doc.py
Normal file
@ -0,0 +1,424 @@
|
|||||||
|
import os.path
|
||||||
|
|
||||||
|
from odf.opendocument import OpenDocumentText
|
||||||
|
from odf.text import P, H, Span, A, LineBreak, List, ListItem
|
||||||
|
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
|
||||||
|
from odf.table import TableColumn, TableCell, TableRow, Table
|
||||||
|
from odf.draw import Frame, Image
|
||||||
|
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from idb import Article
|
||||||
|
from idb.util import image_url_to_filename
|
||||||
|
|
||||||
|
PAGE_LAYOUT_NAME = 'LandscapeLayout'
|
||||||
|
MASTER_PAGE_NAME = 'Standard'
|
||||||
|
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
|
||||||
|
ITALIC_STYLE_NAME = 'Italic'
|
||||||
|
BOLD_STYLE_NAME = 'Bold'
|
||||||
|
CAPTION_STYLE_NAME = 'Caption'
|
||||||
|
UNDERLINE_STYLE_NAME = 'Underline'
|
||||||
|
CENTER_STYLE_NAME = 'CenterAligned'
|
||||||
|
TITLE_STYLE_NAME = 'Title'
|
||||||
|
|
||||||
|
|
||||||
|
def add_child(parent, child):
    """Append *child* to *parent*, dispatching on the child's kind.

    odfpy element objects expose a ``qname`` attribute and must be attached
    via ``addElement``; anything else (plain text) goes through ``addText``.
    """
    appender = parent.addElement if hasattr(child, "qname") else parent.addText
    appender(child)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ODF frame dimensions ('Wcm', 'Hcm') for an image file, scaled
    to *desired_width_cm* while preserving the aspect ratio.

    Pixel sizes are converted to centimetres assuming 96 DPI, the common
    screen default (the images carry no physical size of their own).
    """
    with PILImage.open(image_path) as picture:
        px_width, px_height = picture.size

    dpi = 96.0
    cm_per_inch = 2.54
    orig_width_cm = (px_width / dpi) * cm_per_inch
    orig_height_cm = (px_height / dpi) * cm_per_inch

    new_height_cm = orig_height_cm * (desired_width_cm / orig_width_cm)

    return f"{desired_width_cm}cm", f"{new_height_cm}cm"
|
||||||
|
|
||||||
|
|
||||||
|
class ImageWrap:
    """Deferred wrapper around a locally stored image.

    Inline processing produces ImageWrap markers instead of ODF elements so
    the caller can decide where in the paragraph structure the image (and
    its optional caption) should land; ``get_elements`` does the actual
    embedding into the document.
    """

    def __init__(self, image_file, caption):
        # Path to an already-downloaded image file on disk.
        self.image_file = image_file
        # Caption text; falsy (None / '') means no caption paragraph.
        self.caption = caption

    def get_elements(self, doc):
        """Embed the image into *doc* and return its ODF elements.

        Returns ``[frame]`` or ``[frame, caption_paragraph]`` when a caption
        is present — callers rely on this one-or-two element shape.
        """
        embedded_href = doc.addPicture(self.image_file)

        # Fixed display width in cm; height follows the aspect ratio.
        desired_width = 13.5
        width_str, height_str = calc_frame_dimensions(self.image_file, desired_width)

        frm = Frame(width=width_str, height=height_str)
        img = Image(href=embedded_href, type="simple", show="embed", actuate="onLoad")
        frm.addElement(img)

        elements = [frm]

        if self.caption:
            caption = P(stylename=CAPTION_STYLE_NAME)
            caption.addText(self.caption)
            elements.append(caption)

        return elements
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentCreator:
|
||||||
|
    def __init__(self):
        # Fresh empty ODF text document; all named styles are registered
        # once up front so later element creation can reference them by name.
        self.doc = OpenDocumentText()
        self.set_styles()
|
||||||
|
|
||||||
|
    def set_styles(self):
        """Register the page layout and every named style the generated
        document uses: landscape A4 page, bold/italic/underline character
        styles, image caption, blockquote, title and centred paragraph.
        """
        # Landscape A4 page (29.7 x 21 cm) with 1 cm margins.
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_props = PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        )
        landscape_layout.addElement(landscape_props)
        self.doc.automaticstyles.addElement(landscape_layout)

        # Attach the layout to the default ("Standard") master page.
        masterpage = MasterPage(name="Standard", pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        self.doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))
        self.doc.automaticstyles.addElement(style)

        # caption: small grey italic text, centred under images
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777'
        }))
        style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'))
        self.doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        self.doc.automaticstyles.addElement(style)

        # blockquote: indented green paragraph; registered as a common style
        # (doc.styles) rather than an automatic one so it is user-visible.
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        self.doc.styles.addElement(style)

        # title: large bold centred paragraph for article headers
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        self.doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        self.doc.automaticstyles.addElement(style)
|
||||||
|
|
||||||
|
def process_inline(self, node):
|
||||||
|
if isinstance(node, str):
|
||||||
|
return [node]
|
||||||
|
|
||||||
|
tag = node.name.lower()
|
||||||
|
|
||||||
|
simple_tags = (
|
||||||
|
('strong', 'b'),
|
||||||
|
('em', 'i'),
|
||||||
|
('ins', 'u')
|
||||||
|
)
|
||||||
|
simple_styles = (
|
||||||
|
BOLD_STYLE_NAME,
|
||||||
|
ITALIC_STYLE_NAME,
|
||||||
|
UNDERLINE_STYLE_NAME
|
||||||
|
)
|
||||||
|
for i, tags_list in enumerate(simple_tags):
|
||||||
|
if tag in tags_list:
|
||||||
|
span = Span(stylename=simple_styles[i])
|
||||||
|
for child in node.contents:
|
||||||
|
for inline in self.process_inline(child):
|
||||||
|
add_child(span, inline)
|
||||||
|
return [span]
|
||||||
|
|
||||||
|
if tag == "code":
|
||||||
|
return [Span(stylename="Code", text=node.get_text())]
|
||||||
|
|
||||||
|
elif tag == "a":
|
||||||
|
return [A(href=node.get("href"), text=node.get_text())]
|
||||||
|
|
||||||
|
elif tag == "img":
|
||||||
|
if node.name and node.name.lower() == "img":
|
||||||
|
return [self.process_img(node)]
|
||||||
|
|
||||||
|
else:
|
||||||
|
result = []
|
||||||
|
for child in node.contents:
|
||||||
|
result.extend(self.process_inline(child))
|
||||||
|
return result
|
||||||
|
|
||||||
|
    def process_block(self, elem):
        """Convert one block-level HTML element into ODF element(s).

        May return a single ODF element or a list of elements (blockquotes,
        image+caption pairs); callers must handle both shapes.
        """
        # Headings win regardless of how they are wrapped.
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            # First pass over children: detect centring / embedded images,
            # and unwrap headings hidden inside <p> or <p><center>.
            is_centered = False
            has_image = False
            for child in elem.contents:
                # try converting heading
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem

                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            # Images are always rendered centred.
            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            # Second pass: build the paragraph's inline content.
            for child in elem.contents:
                for inline in self.process_inline(child):
                    if has_image and isinstance(inline, ImageWrap):
                        # Embed the image frame inside the paragraph and
                        # return immediately (frame + optional caption).
                        # NOTE(review): this early return drops any inline
                        # content that follows the image in the same <p> —
                        # presumably image paragraphs carry nothing else;
                        # confirm against the source articles.
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        return elems

                    add_child(p_elem, inline)

            return p_elem

        elif tag == "blockquote":
            # One styled paragraph per non-blank child; inline markup inside
            # the quote is flattened to plain text.
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        elif tag in ("ul", "ol"):
            odf_list = List()
            li_elements = elem.find_all("li", recursive=False)
            for li in li_elements:
                li_item = ListItem()
                p = P()
                for child in li.contents:
                    # if nested list is found, we'll process it later
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        add_child(p, inline)
                li_item.addElement(p)

                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        nested_list = self.process_block(child)
                        li_item.addElement(nested_list)

                odf_list.addElement(li_item)
            return odf_list

        elif tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())

        elif tag == "hr":
            # Horizontal rules become a centred '---' paragraph.
            return P(stylename=CENTER_STYLE_NAME, text='---')

        elif tag == "table":
            # Cell contents are re-parsed as an HTML fragment so nested
            # block structure inside cells round-trips through the same path.
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    cell_odf_elements = self.html_to_odf_elements(cell_html)
                    for el in cell_odf_elements:
                        cell_elem.addElement(el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        elif tag == "img":
            # Top-level image (not wrapped in <p>): embed directly.
            return self.process_img(elem).get_elements(self.doc)

        elif tag == "br":
            return LineBreak()

        else:
            # Fallback: unknown block tags are flattened to a plain paragraph.
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem
|
||||||
|
|
||||||
|
def try_process_heading(self, elem):
|
||||||
|
if not elem.name:
|
||||||
|
return
|
||||||
|
|
||||||
|
tag = elem.name.lower()
|
||||||
|
if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
|
||||||
|
return H(outlinelevel=tag[1], text=elem.get_text())
|
||||||
|
|
||||||
|
def process_img(self, elem) -> ImageWrap:
|
||||||
|
href = elem.get("src")
|
||||||
|
saved_file = os.path.join(
|
||||||
|
os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
|
||||||
|
)
|
||||||
|
if not os.path.exists(saved_file):
|
||||||
|
raise ValueError(f'image {saved_file} not found')
|
||||||
|
|
||||||
|
alt = elem.get('alt')
|
||||||
|
return ImageWrap(saved_file, alt)
|
||||||
|
|
||||||
|
    def html_to_odf_elements(self, html):
        """Convert an HTML fragment to a list of ODF elements.

        Everything from the first "sources" / "further reading" style
        heading onward is grouped into one nested list, so ``create()`` can
        place the whole reference tail into a single table row instead of
        one row per block.
        """
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        # Once a terminating heading is seen, subsequent blocks accumulate
        # in `stack` instead of being appended individually.
        keeping = False
        stack = []
        # Headings (RU + EN) that mark the reference/appendix tail.
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )

        for node in top_nodes:
            if isinstance(node, str):
                # Bare top-level text becomes its own paragraph.
                if not node.strip():
                    continue
                p = P()
                p.addText(node)
                elements.append(p)
            elif node.name:
                if node.name.lower() in ("h1", "h2", "h3", "h4") and node.get_text().strip().lower() in (end.lower() for end in ending_headings):
                    # Flush a previously collected tail before starting a new
                    # one (several terminating headings can appear in a row).
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
                if keeping:
                    stack.append(result)
                else:
                    elements.append(result)
        # Trailing tail section, if any, becomes the final grouped element.
        if stack:
            elements.append(stack)
        return elements
|
||||||
|
|
||||||
|
    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Write a two-column ODT comparing *orig* and *trans* side by side.

        Each block of the original article is placed next to the matching
        block of the translation, one table row per block pair.  With
        ``with_title=True`` an extra header row shows both article titles.
        The document is saved to *output_odt*.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # Pad the shorter side with empty paragraphs so the rows pair 1:1.
        max_len = max(len(orig_elements), len(trans_elements))
        while len(orig_elements) < max_len:
            orig_elements.append(P())
        while len(trans_elements) < max_len:
            trans_elements.append(P())

        main_table = Table(name="ComparisonTable")

        col1 = TableColumn()
        col2 = TableColumn()

        main_table.addElement(col1)
        main_table.addElement(col2)

        if with_title:
            # headings
            header_row = TableRow()

            header_cell_left = TableCell()
            header_cell_right = TableCell()

            header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title))
            header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title))

            header_row.addElement(header_cell_left)
            header_row.addElement(header_cell_right)

            main_table.addElement(header_row)

        # content
        for i in range(max_len):
            row = TableRow()

            cell_orig = TableCell()
            cell_trans = TableCell()

            # A slot may itself be a list (grouped tail section or an
            # image+caption pair); add each member separately.
            if isinstance(orig_elements[i], list):
                for elem in orig_elements[i]:
                    cell_orig.addElement(elem)
            else:
                cell_orig.addElement(orig_elements[i])

            if isinstance(trans_elements[i], list):
                for elem in trans_elements[i]:
                    cell_trans.addElement(elem)
            else:
                cell_trans.addElement(trans_elements[i])

            row.addElement(cell_orig)
            row.addElement(cell_trans)

            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
|
@ -2,11 +2,18 @@ import tiktoken
|
|||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
model = "gpt-4o"
|
model = "gpt-4o"
|
||||||
system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations."
|
system_prompt = """
|
||||||
input_token_limit = 2000
|
Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
|
||||||
|
- Quotes (e.g., > quoted text). Can be multi-line.
|
||||||
|
- Links (e.g., [text](url))
|
||||||
|
- Images (e.g., )
|
||||||
|
|
||||||
|
Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
|
||||||
|
"""
|
||||||
|
input_token_limit = 3500
|
||||||
|
|
||||||
|
|
||||||
def translate(text):
|
def translate_markdown(text):
|
||||||
buf = []
|
buf = []
|
||||||
bufsize = 0
|
bufsize = 0
|
||||||
cl = OpenAI()
|
cl = OpenAI()
|
||||||
|
44
idb/util.py
Normal file
44
idb/util.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
import re, os.path
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
def save_file(file, content):
    """Write *content* to *file*, replacing any existing contents.

    UTF-8 is specified explicitly: the project handles Russian text
    throughout, and the platform default (locale) encoding would break on
    e.g. Windows cp1252.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(filename):
    """Return the full text of *filename* decoded as UTF-8.

    Explicit encoding matches save_file() and avoids locale-dependent
    decoding failures on the Russian article sources.
    """
    with open(filename, encoding='utf-8') as f:
        return f.read()
|
||||||
|
|
||||||
|
|
||||||
|
def name_from_url(url):
    """Return the last path component of *url* as an article slug.

    The article URLs handled here always end with '/', so exactly one
    trailing character is dropped before taking the basename.
    """
    trimmed = url[:-1]
    return os.path.basename(trimmed)
|
||||||
|
|
||||||
|
|
||||||
|
def image_url_to_filename(url):
    """Derive a local filename ('YYYYDD_name.ext') from an image URL whose
    path contains a WordPress-style upload date (yyyy/mm[/dd]).

    When the day segment is absent it defaults to '01'.
    Raises ValueError if the path carries no yyyy/mm date at all.

    NOTE(review): the month (group 2) is matched but never used in the
    result — two same-named images uploaded the same day of different
    months would collide; confirm this naming scheme is intentional.
    """
    path = urlparse(url).path
    base, ext = os.path.splitext(os.path.basename(path))

    match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', path)
    if not match:
        raise ValueError("no valid date found in URL")

    year, _month, day = match.groups()
    return f"{year}{day or '01'}_{base}{ext}"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_images_from_markdown(markdown_text):
    """Return the URL of every markdown image (``![alt](url)``) found in
    *markdown_text*, in document order."""
    return re.findall(r"!\[.*?\]\((.*?)\)", markdown_text)
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url, filename):
    """Stream *url* into *filename* in 1 KiB chunks.

    Raises requests.HTTPError on a non-2xx response, before anything is
    written to disk.
    """
    # `with` releases the connection even if writing fails.  The original
    # also re-tested `status_code == 200`, which is redundant after
    # raise_for_status() (it raises for every non-2xx status).
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
|
125
idb/wordpress.py
125
idb/wordpress.py
@ -1,13 +1,123 @@
|
|||||||
import requests
|
import requests, re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from html import unescape
|
from html import unescape
|
||||||
from markdownify import markdownify
|
from markdownify import MarkdownConverter
|
||||||
from collections import namedtuple
|
from markdown import markdown
|
||||||
|
from enum import Enum
|
||||||
ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
|
|
||||||
|
|
||||||
|
|
||||||
def fetch(url) -> ArticleContent:
|
class WordpressMarkdownConverter(MarkdownConverter):
    """markdownify converter tuned for kniganews WordPress markup.

    WordPress wraps images in ``<p>``/``<h*>``/``<a>``/caption-``<div>``
    blocks; each ``convert_*`` hook first tries to collapse the element into
    a single markdown image via :meth:`idb_convert_image` and only then
    falls back to the stock conversion.
    """

    def convert_p(self, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        # WordPress centres paragraphs with an inline style; keep that as an
        # explicit <center> block (later recognised by markdown_from_html).
        if str(el).startswith('<p style="text-align:center;">'):
            text = text.replace('\n\n', '<br>')
            return f'<center>{text}</center>\n\n'
        return super().convert_p(el, text, convert_as_inline)

    def _convert_hn(self, n, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super()._convert_hn(n, el, text, convert_as_inline)

    def convert_a(self, el, *args):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super().convert_a(el, *args)

    def convert_div(self, el, *args):
        if str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"'):
            md = self.idb_convert_image(el)
            if md is not None:
                return md
        # Fixed: the original delegated to super().convert_a() here, which
        # treats a <div> as a (href-less) link.  A plain div is transparent
        # in markdown — keep just its already-converted children.
        return args[0] if args else ''

    def idb_convert_image(self, el):
        """Collapse *el* into a markdown image if it wraps an ``<img>``;
        return None when the element is not an image wrapper.

        Query strings are stripped from the image src in both cases.
        """
        html = str(el)

        # <p>/<h*> blocks directly wrapping an <img>: emit a bare image.
        # NOTE(review): the returned markdown was garbled in the scraped
        # source; reconstructed as standard image syntax — verify output.
        pattern = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')
        match = pattern.search(html)
        if match:
            return f'![]({match.group(1)})\n\n'

        # WordPress [caption] shortcode output: a linked image followed by a
        # caption paragraph; the caption becomes the image's alt text.
        pattern = re.compile(
            r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>',
            re.S)
        match = pattern.search(html)
        if match:
            src, title = match.groups()
            title = unescape(title)
            return f'![{title}]({src})\n\n'
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _markdownify(html, **options):
    """Convert *html* to markdown using the WordPress-aware converter."""
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
|
||||||
|
|
||||||
|
|
||||||
|
def markdown_from_html(html):
    """Convert article HTML to markdown.

    Separator paragraphs that come out of the conversion as bare '#'/'___'
    runs are wrapped in <center> so they survive as visual dividers when
    the markdown is rendered back to HTML.
    """
    separators = ('#', '# #', '# # #', '##', '###', '___')

    md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip()

    paragraphs = []
    for para in md.split('\n\n'):
        para = para.strip()
        if para in separators:
            para = f'<center>{para}</center>'
        paragraphs.append(para)
    return '\n\n'.join(paragraphs)
|
||||||
|
|
||||||
|
|
||||||
|
def html_from_markdown(s):
    """Render markdown *s* to HTML with the 'extra' and 'tables'
    extensions enabled (footnotes, definition lists, tables, ...)."""
    extensions = ['extra', 'tables']
    return markdown(s, extensions=extensions)
|
||||||
|
|
||||||
|
|
||||||
|
class Article:
    """An article held in two synchronised representations (HTML and
    markdown) plus its title.  Whichever form an article is constructed
    from, the other is derived automatically by the class methods."""

    title: str
    html: str
    md: str

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML, deriving the markdown form."""
        # Fixed: was hard-coded ``Article(...)``, which broke subclassing.
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from markdown, deriving the HTML form."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Load an article from a markdown file.

        With ``with_title=True`` the first line must be an ATX heading
        (``# Title``); the heading plus at most one following blank line
        are stripped from the body and the heading text becomes the title.
        With ``with_title=False`` the whole file is the body and the title
        is empty.

        Raises ValueError when a title is expected but the first line does
        not start with '#'.
        """
        with open(filename, encoding='utf-8') as f:
            md = f.read()
        title = ''

        if with_title:
            lines = md.split('\n')
            first_line = lines[0].strip()
            if not first_line.startswith('#'):
                raise ValueError('first line must start with #')
            title = first_line.lstrip('#').strip()

            lines.pop(0)
            if lines and not lines[0].strip():
                lines.pop(0)

            # Fixed: was ``'\n\n'.join(f.readlines())`` — readlines() keeps
            # each line's own '\n', so that join turned every single line
            # break into a blank-line paragraph break.
            md = '\n'.join(lines)

        return cls.from_markdown(title, md)
|
||||||
|
|
||||||
|
|
||||||
|
class Language(Enum):
    """Languages the translation pipeline works with (ISO 639-1 codes)."""
    English = 'en'
    Russian = 'ru'
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_article(url) -> Article:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent:
|
|||||||
|
|
||||||
html = str(soup.find("div", class_="entry-content")).strip()
|
html = str(soup.find("div", class_="entry-content")).strip()
|
||||||
title = unescape(soup.find(class_="entry-title").get_text(strip=True))
|
title = unescape(soup.find(class_="entry-title").get_text(strip=True))
|
||||||
md = markdownify(html).strip()
|
|
||||||
|
|
||||||
return ArticleContent(title, html, md)
|
return Article.from_html(title, html)
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
|
odfpy~=1.4.1
|
||||||
|
pillow~=11.1.0
|
||||||
beautifulsoup4~=4.13.3
|
beautifulsoup4~=4.13.3
|
||||||
markdownify~=0.14.1
|
|
||||||
requests~=2.32.3
|
requests~=2.32.3
|
||||||
|
markdownify~=0.14.1
|
||||||
|
Markdown~=3.7
|
||||||
|
tiktoken~=0.8.0
|
||||||
openai~=1.61.1
|
openai~=1.61.1
|
||||||
python-dotenv~=1.0.1
|
python-dotenv~=1.0.1
|
||||||
tiktoken~=0.8.0
|
|
@ -1,7 +1,6 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from idb.wordpress import fetch
|
from idb import fetch_article, translate_markdown
|
||||||
from idb.translator import translate
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
@ -18,8 +17,8 @@ if __name__ == '__main__':
|
|||||||
help="output files")
|
help="output files")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
article = fetch(args.url)
|
a = fetch_article(args.url)
|
||||||
translation = translate(article.md)
|
translation = translate_markdown(a.md)
|
||||||
|
|
||||||
save(args.output[0], article.md)
|
save(args.output[0], a.md)
|
||||||
save(args.output[1], translation)
|
save(args.output[1], translation)
|
||||||
|
19
tzo_images.py
Executable file
19
tzo_images.py
Executable file
@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env python3
"""Download every image referenced by the saved TZO markdown sources.

For each article in idb.tzo_urls, reads the local '<name>-ru.txt' markdown,
extracts its image URLs, and saves each image into ./images under the
date-prefixed filename the ODT builder expects.
"""
import os.path

from idb import tzo_urls
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown

from dotenv import load_dotenv

load_dotenv()


if __name__ == '__main__':
    base_dir = os.path.dirname(__file__)
    for url in tzo_urls:
        name = name_from_url(url)
        markdown_file = os.path.join(base_dir, 'tzo', f'{name}-ru.txt')
        image_urls = extract_images_from_markdown(read_file(markdown_file))
        for image_url in image_urls:
            image_name = image_url_to_filename(image_url)
            output_file = os.path.join(base_dir, 'images', image_name)
            # Fixed for consistency with cartier_odt.py: skip images that
            # are already on disk instead of unconditionally re-downloading.
            if not os.path.exists(output_file):
                download_file(image_url, output_file)
                print(f'{image_name} saved')
|
15
tzo_odt.py
Executable file
15
tzo_odt.py
Executable file
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python3
"""Render every TZO article pair (RU + EN markdown) into a side-by-side
comparison ODT, one output file per article."""
import os.path

from idb import Article, DocumentCreator, tzo_urls
from idb.util import name_from_url


if __name__ == '__main__':
    base_dir = os.path.dirname(__file__)
    for url in tzo_urls:
        name = name_from_url(url)

        ru_path = os.path.join(base_dir, 'tzo', f'{name}-ru.txt')
        en_path = os.path.join(base_dir, 'tzo', f'{name}-en.txt')
        orig = Article.from_markdown_file(ru_path, with_title=False)
        trans = Article.from_markdown_file(en_path, with_title=False)

        doc = DocumentCreator()
        doc.create(orig, trans, os.path.join(base_dir, f'{name}.odt'))
|
Loading…
x
Reference in New Issue
Block a user