md -> odt converter, first commit

This commit is contained in:
E. S 2025-03-06 01:58:58 +03:00
parent 2ebf5f18fa
commit 7f4b460c96
11 changed files with 696 additions and 20 deletions

10
.gitignore vendored
View File

@ -1,3 +1,11 @@
/.idea /.idea
/.venv /.venv
/.env /.env
/tzo_save.py
/test*py
/*.html
/*.odt
/*.md
/*.txt
/*.zip
/.DS_Store

24
cartier_odt.py Executable file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
if __name__ == '__main__':
    # Build the side-by-side ODT for a single article whose Russian original
    # and English translation are stored next to this script as
    # '<name>_ru' and '<name>_en'.
    name = 'cartier3'
    base_dir = os.path.dirname(__file__)
    images_dir = os.path.join(base_dir, 'images')

    orig_path = os.path.join(base_dir, f'{name}_ru')
    trans_path = os.path.join(base_dir, f'{name}_en')
    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    # download_file() opens its target for writing and does not create
    # parent folders, so make sure the images folder exists first.
    os.makedirs(images_dir, exist_ok=True)

    # Only the original text is scanned for image URLs.
    for image_url in extract_images_from_markdown(read_file(orig_path)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(images_dir, image_name)
        if os.path.exists(output_file):
            # already fetched on a previous run
            continue
        download_file(image_url, output_file)
        print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(base_dir, f'{name}.odt'))

View File

@ -0,0 +1,24 @@
from .wordpress import Article, fetch_article
from .translator import translate_markdown
from .doc import DocumentCreator
# Article URLs of the "beyond-clouds" series on kniganews.org.
# The tzo_* scripts iterate over this tuple and derive each article's local
# file name from the last path segment of its URL (see idb.util.name_from_url).
tzo_urls = (
'https://kniganews.org/2012/12/20/beyond-clouds-1/',
'https://kniganews.org/2012/12/21/beyond-clouds-21/',
'https://kniganews.org/2012/12/22/beyond-clouds-22/',
'https://kniganews.org/2012/12/23/beyond-clouds-31/',
'https://kniganews.org/2012/12/24/beyond-clouds-32/',
'https://kniganews.org/2012/12/25/beyond-clouds-33/',
'https://kniganews.org/2012/12/28/beyond-clouds-41/',
'https://kniganews.org/2012/12/29/beyond-clouds-42/',
'https://kniganews.org/2012/12/30/beyond-clouds-43/',
'https://kniganews.org/2013/01/01/beyond-clouds-44/',
'https://kniganews.org/2013/01/06/beyond-clouds-51/',
'https://kniganews.org/2013/01/07/beyond-clouds-52/',
'https://kniganews.org/2013/02/16/beyond-clouds-53/',
'https://kniganews.org/2013/03/25/beyond-clouds-61/',
'https://kniganews.org/2013/05/10/beyond-clouds-62/',
'https://kniganews.org/2013/06/17/beyond-clouds-731/',
'https://kniganews.org/2013/08/07/beyond-clouds-732/',
'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)

424
idb/doc.py Normal file
View File

@ -0,0 +1,424 @@
import os.path
from odf.opendocument import OpenDocumentText
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image
from PIL import Image as PILImage
from bs4 import BeautifulSoup
from idb import Article
from idb.util import image_url_to_filename
# Names of the page layout, master page and styles used in the generated
# document. The style names are shared between the code that registers the
# styles and the code that applies them to paragraphs and spans.
# Page layout holding the landscape page geometry.
PAGE_LAYOUT_NAME = 'LandscapeLayout'
# Master page name.
MASTER_PAGE_NAME = 'Standard'
# Paragraph style applied to each paragraph of a blockquote.
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
# Text styles applied to inline spans.
ITALIC_STYLE_NAME = 'Italic'
BOLD_STYLE_NAME = 'Bold'
# Paragraph style for the caption placed under an image.
CAPTION_STYLE_NAME = 'Caption'
UNDERLINE_STYLE_NAME = 'Underline'
# Paragraph style for centered paragraphs.
CENTER_STYLE_NAME = 'CenterAligned'
# Paragraph style for the optional title row.
TITLE_STYLE_NAME = 'Title'
def add_child(parent, child):
    """Attach ``child`` to ``parent`` either as an element or as text.

    A child that exposes a ``qname`` attribute is handed to
    ``parent.addElement``; anything else is handed to ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ``(width, height)`` strings in cm for an image scaled to a width.

    The image's pixel size is read from ``image_path`` and converted to
    centimetres at 96 pixels per inch; the height is then scaled by the same
    factor that brings the width to ``desired_width_cm``.
    """
    pixels_per_inch = 96.0
    with PILImage.open(image_path) as picture:
        px_width, px_height = picture.size

    natural_width_cm = (px_width / pixels_per_inch) * 2.54
    natural_height_cm = (px_height / pixels_per_inch) * 2.54

    factor = desired_width_cm / natural_width_cm
    scaled_height_cm = natural_height_cm * factor
    return f"{desired_width_cm}cm", f"{scaled_height_cm}cm"
class ImageWrap:
    """An image file plus optional caption, not yet embedded in a document."""

    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption

    def get_elements(self, doc):
        """Embed the image into ``doc`` and return the elements showing it.

        The result is a list whose first item is the frame holding the
        picture (13.5 cm wide, height scaled to match); when the caption is
        non-empty, a caption paragraph follows as the second item.
        """
        picture_href = doc.addPicture(self.image_file)
        frame_width, frame_height = calc_frame_dimensions(self.image_file, 13.5)

        frame = Frame(width=frame_width, height=frame_height)
        frame.addElement(
            Image(href=picture_href, type="simple", show="embed", actuate="onLoad")
        )

        if not self.caption:
            return [frame]

        caption_paragraph = P(stylename=CAPTION_STYLE_NAME)
        caption_paragraph.addText(self.caption)
        return [frame, caption_paragraph]
class DocumentCreator:
    """Render an article and its translation side by side into an ODT file.

    The HTML of both articles is converted to ODF elements, one entry per
    top-level HTML block, and the entries are paired row by row in a
    two-column table on landscape pages.
    """

    # Headings (compared lower-cased) that open the trailing reference
    # sections. From the first such heading on, blocks are grouped so that a
    # whole section lands in a single table row.
    ENDING_HEADINGS = frozenset((
        'дополнительное чтение',
        'основные источники',
        'источники',
        'дополнительные материалы',
        'additional reading',
        'further reading',
        'main sources',
        'additional sources',
    ))

    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    @staticmethod
    def _register_style(container, name, family, *properties):
        """Create a style holding ``properties`` (in order) and add it to ``container``."""
        style = Style(name=name, family=family)
        for props in properties:
            style.addElement(props)
        container.addElement(style)

    def set_styles(self):
        """Register the page layout, master page and all styles used later."""
        layout = PageLayout(name=PAGE_LAYOUT_NAME)
        layout.addElement(PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm",
        ))
        self.doc.automaticstyles.addElement(layout)
        self.doc.masterstyles.addElement(
            MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        )

        bold = {
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold",
        }
        italic = {
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
        }
        automatic = self.doc.automaticstyles
        common = self.doc.styles

        self._register_style(
            automatic, BOLD_STYLE_NAME, "text",
            TextProperties(attributes=bold),
        )
        self._register_style(
            automatic, ITALIC_STYLE_NAME, "text",
            TextProperties(attributes=italic),
        )
        self._register_style(
            automatic, CAPTION_STYLE_NAME, "paragraph",
            TextProperties(attributes={**italic, 'fontsize': '10pt', 'color': '#777777'}),
            ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'),
        )
        self._register_style(
            automatic, UNDERLINE_STYLE_NAME, "text",
            TextProperties(attributes={
                'textunderlinestyle': "solid",
                'textunderlinewidth': "auto",
            }),
        )
        self._register_style(
            common, BLOCKQUOTE_STYLE_NAME, "paragraph",
            ParagraphProperties(attributes={
                'marginleft': '0.6cm',
                'margintop': '0.15cm',
                'marginbottom': '0.15cm',
            }),
            TextProperties(attributes={'color': '#378A62'}),
        )
        self._register_style(
            common, TITLE_STYLE_NAME, "paragraph",
            TextProperties(attributes={'fontsize': '20pt', **bold}),
            ParagraphProperties(textalign='center'),
        )
        self._register_style(
            automatic, CENTER_STYLE_NAME, "paragraph",
            ParagraphProperties(textalign="center"),
        )

    def process_inline(self, node):
        """Convert an inline HTML node into a list of inline items.

        Items are plain strings, ODF elements, or :class:`ImageWrap` objects
        (for ``<img>``), which the caller has to expand itself.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()
        span_styles = {
            'strong': BOLD_STYLE_NAME,
            'b': BOLD_STYLE_NAME,
            'em': ITALIC_STYLE_NAME,
            'i': ITALIC_STYLE_NAME,
            'ins': UNDERLINE_STYLE_NAME,
            'u': UNDERLINE_STYLE_NAME,
        }
        if tag in span_styles:
            span = Span(stylename=span_styles[tag])
            for child in node.contents:
                for inline in self.process_inline(child):
                    add_child(span, inline)
            return [span]

        if tag == "code":
            # NOTE(review): no "Code" style is registered in set_styles().
            return [Span(stylename="Code", text=node.get_text())]
        if tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]
        if tag == "img":
            return [self.process_img(node)]

        # Any other tag is transparent: its children are flattened in place.
        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result

    def process_block(self, elem):
        """Convert a block-level HTML element.

        Returns a single ODF element, or a flat list of ODF elements when
        the block expands to several (blockquotes, images).
        """
        heading = self.try_process_heading(elem)
        if heading is not None:
            return heading

        tag = elem.name.lower()
        if tag == "p":
            return self._process_paragraph(elem)
        if tag == "blockquote":
            # one styled paragraph per non-blank child
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items
        if tag in ("ul", "ol"):
            return self._process_list(elem)
        if tag == "pre":
            # NOTE(review): no "Preformatted" style is registered in set_styles().
            return P(stylename="Preformatted", text=elem.get_text())
        if tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')
        if tag == "table":
            return self._process_table(elem)
        if tag == "img":
            return self.process_img(elem).get_elements(self.doc)
        if tag == "br":
            # Block results end up directly inside table cells, where a bare
            # line break is not an allowed child; wrap it in a paragraph.
            holder = P()
            holder.addElement(LineBreak())
            return holder

        # Unknown block: keep its text only.
        p_elem = P()
        p_elem.addText(elem.get_text())
        return p_elem

    def _process_paragraph(self, elem):
        """Convert a ``<p>``; may return a heading, a paragraph, or ``[p, caption]``."""
        is_centered = False
        has_image = False
        for child in elem.contents:
            # a heading wrapped in a paragraph replaces the whole paragraph
            heading = self.try_process_heading(child)
            if heading is not None:
                return heading
            if not child.name:
                continue
            child_tag = child.name.lower()
            if child_tag == "img":
                has_image = True
            elif child_tag == "center":
                for nested in child.contents:
                    heading = self.try_process_heading(nested)
                    if heading is not None:
                        return heading
                is_centered = True
                break

        p_elem = P(stylename=CENTER_STYLE_NAME) if (is_centered or has_image) else P()

        for child in elem.contents:
            for inline in self.process_inline(child):
                # Every ImageWrap is expanded here, including one nested in a
                # wrapper tag, because it cannot be added as text.
                if isinstance(inline, ImageWrap):
                    # The paragraph ends at its first image; content after
                    # the image is not emitted.
                    frame, *caption = inline.get_elements(self.doc)
                    p_elem.addElement(frame)
                    return [p_elem, *caption]
                add_child(p_elem, inline)
        return p_elem

    def _process_list(self, elem):
        """Convert a ``<ul>``/``<ol>`` including nested lists."""
        odf_list = List()
        for li in elem.find_all("li", recursive=False):
            item = ListItem()
            paragraph = P()
            nested_lists = []
            for child in li.contents:
                if getattr(child, "name", None) in ("ul", "ol"):
                    # nested lists go after the item's own paragraph
                    nested_lists.append(child)
                    continue
                for inline in self.process_inline(child):
                    add_child(paragraph, inline)
            item.addElement(paragraph)
            for nested in nested_lists:
                item.addElement(self.process_block(nested))
            odf_list.addElement(item)
        return odf_list

    def _process_table(self, elem):
        """Convert a ``<table>``; each cell's HTML is converted recursively."""
        odf_table = Table()
        for tr in elem.find_all("tr"):
            row = TableRow()
            for cell in tr.find_all(["th", "td"]):
                cell_elem = TableCell()
                cell_html = "".join(str(child) for child in cell.contents)
                for content in self.html_to_odf_elements(cell_html):
                    self._fill_cell(cell_elem, content)
                row.addElement(cell_elem)
            odf_table.addElement(row)
        return odf_table

    def try_process_heading(self, elem):
        """Return a heading element for ``<h1>``..``<h6>``, otherwise ``None``."""
        if not elem.name:
            return None
        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())
        return None

    def process_img(self, elem) -> ImageWrap:
        """Map an ``<img>`` to its previously downloaded file.

        Raises:
            ValueError: if the element has no ``src`` or the file derived
                from it is missing from the ``images`` folder.
        """
        href = elem.get("src")
        if not href:
            raise ValueError('image element has no src attribute')
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')
        return ImageWrap(saved_file, elem.get('alt'))

    def _is_ending_heading(self, node):
        """Tell whether ``node`` is an h1-h4 whose text opens a reference section."""
        if node.name.lower() not in ("h1", "h2", "h3", "h4"):
            return False
        return node.get_text().strip().lower() in self.ENDING_HEADINGS

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment to a list of table-row contents.

        Each item is either one ODF element or a flat list of ODF elements
        that belong in the same cell. Blocks before the first ending heading
        yield one item each; from an ending heading on, blocks are collected
        into one list per section.
        """
        soup = BeautifulSoup(html, "html.parser")
        top_nodes = soup.body.contents if soup.body else soup.contents

        elements = []
        section = []      # blocks gathered since the last ending heading
        keeping = False

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                result = P()
                result.addText(node)
            elif node.name:
                if self._is_ending_heading(node):
                    if section:
                        elements.append(section)
                        section = []
                    keeping = True
                result = self.process_block(node)
            else:
                continue

            if not keeping:
                elements.append(result)
            elif isinstance(result, list):
                # keep the section flat: its items are added to a cell one by one
                section.extend(result)
            else:
                section.append(result)

        if section:
            elements.append(section)
        return elements

    @staticmethod
    def _fill_cell(cell, content):
        """Add ``content`` (one element or a list of elements) to ``cell``."""
        items = content if isinstance(content, list) else [content]
        for item in items:
            cell.addElement(item)

    def _make_row(self, left, right):
        """Build a two-cell table row from the given cell contents."""
        row = TableRow()
        for content in (left, right):
            cell = TableCell()
            self._fill_cell(cell, content)
            row.addElement(cell)
        return row

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Write the comparison document for ``orig`` and ``trans``.

        Args:
            orig: article shown in the left column.
            trans: article shown in the right column.
            output_odt: path of the ODT file to write.
            with_title: when true, the first row holds both titles.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # pad the shorter side with empty paragraphs so rows pair up
        row_count = max(len(orig_elements), len(trans_elements))
        for side in (orig_elements, trans_elements):
            side.extend(P() for _ in range(row_count - len(side)))

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            main_table.addElement(self._make_row(
                P(stylename=TITLE_STYLE_NAME, text=orig.title),
                P(stylename=TITLE_STYLE_NAME, text=trans.title),
            ))

        for left, right in zip(orig_elements, trans_elements):
            main_table.addElement(self._make_row(left, right))

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)

View File

@ -2,11 +2,18 @@ import tiktoken
from openai import OpenAI from openai import OpenAI
model = "gpt-4o" model = "gpt-4o"
system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations." system_prompt = """
input_token_limit = 2000 Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt text](image_url))
Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
input_token_limit = 3500
def translate(text): def translate_markdown(text):
buf = [] buf = []
bufsize = 0 bufsize = 0
cl = OpenAI() cl = OpenAI()

44
idb/util.py Normal file
View File

@ -0,0 +1,44 @@
import re, os.path
import requests
from urllib.parse import urlparse
def save_file(file, content):
    """Write the string ``content`` to ``file``, replacing existing content.

    The file is written as UTF-8 regardless of the platform's default
    encoding, so non-ASCII text is stored the same way everywhere.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)
def read_file(filename):
    """Return the whole content of the text file ``filename`` as a string.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding.
    """
    with open(filename, encoding='utf-8') as f:
        return f.read()
def name_from_url(url):
    """Return the last path segment of ``url``.

    Trailing slashes are ignored, so ``.../beyond-clouds-1/`` and
    ``.../beyond-clouds-1`` both give ``beyond-clouds-1``.
    """
    # Strip slashes rather than blindly dropping the last character, which
    # would cut into the name when the URL has no trailing slash.
    return os.path.basename(url.rstrip('/'))
def image_url_to_filename(url):
    """Derive the local file name used for the image at ``url``.

    The name is ``<year><day>_<basename>``, where year and day are taken
    from the first ``YYYY/MM/`` or ``YYYY/MM/DD`` run in the URL path and the
    day falls back to ``01`` when the path has none.

    Raises:
        ValueError: if the path contains no such date.
    """
    path = urlparse(url).path
    stem, suffix = os.path.splitext(os.path.basename(path))

    found = re.search(r'(\d{4})/(\d{2})/(\d{2})?', path)
    if found is None:
        raise ValueError("no valid date found in URL")

    # NOTE(review): the month is captured but never used in the name —
    # confirm this is intended before changing it, since files already on
    # disk are named with this scheme.
    year, _month, day = found.groups()
    return f"{year}{day or '01'}_{stem}{suffix}"
def extract_images_from_markdown(markdown_text):
    """Return the targets of all ``![alt](target)`` images, in order of appearance."""
    return [
        found.group(1)
        for found in re.finditer(r"!\[.*?\]\((.*?)\)", markdown_text)
    ]
def download_file(url, filename, timeout=30):
    """Download ``url`` and store the response body in ``filename``.

    Args:
        url: address to fetch.
        filename: path of the file to write; its folder must exist.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with an error status.

    The body is written only for a plain ``200`` answer; any other
    non-error status leaves ``filename`` untouched.
    """
    # The context manager releases the connection of the streamed response
    # even when writing fails half-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        if response.status_code != 200:
            return
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)

View File

@ -1,13 +1,123 @@
import requests import requests, re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from html import unescape from html import unescape
from markdownify import markdownify from markdownify import MarkdownConverter
from collections import namedtuple from markdown import markdown
from enum import Enum
ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
class WordpressMarkdownConverter(MarkdownConverter):
    """Markdown converter with special handling of image and centered markup.

    Paragraphs, headings, links and caption ``div`` blocks that wrap an image
    are reduced to a bare Markdown image; paragraphs styled with
    ``text-align:center`` are emitted as ``<center>`` blocks.
    """

    # <p>/<hN> element containing an <img>; captures the src without query string.
    _IDB_WRAPPED_IMAGE_RE = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')
    # caption div: captures the image src (without query string) and the caption HTML.
    _IDB_CAPTIONED_IMAGE_RE = re.compile(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', re.S)

    def convert_p(self, el, text, convert_as_inline):
        """Convert a paragraph, preferring the image and centered forms."""
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        if not str(el).startswith('<p style="text-align:center;">'):
            return super().convert_p(el, text, convert_as_inline)
        # keep paragraph breaks inside the single <center> block
        text = text.replace('\n\n', '<br>')
        return f'<center>{text}</center>\n\n'

    def _convert_hn(self, n, el, text, convert_as_inline):
        """Convert a heading, unless it only wraps an image."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super()._convert_hn(n, el, text, convert_as_inline)
        return image_md

    def convert_a(self, el, *args):
        """Convert a link, unless the image patterns match its markup."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super().convert_a(el, *args)
        return image_md

    def convert_div(self, el, *args):
        """Convert a ``div``; caption blocks become a captioned image."""
        if str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"'):
            image_md = self.idb_convert_image(el)
            if image_md is not None:
                return image_md
        # NOTE(review): this delegates to the base class's link conversion,
        # not to a div conversion — looks like a copy-paste; confirm intent.
        return super().convert_a(el, *args)

    def idb_convert_image(self, el):
        """Return Markdown for an image wrapped by ``el``, or ``None``.

        Two shapes are recognised: a ``<p>``/heading containing an ``<img>``
        (image without alt text) and a caption ``div`` (image whose alt text
        is the unescaped caption).
        """
        html = str(el)

        wrapped = self._IDB_WRAPPED_IMAGE_RE.search(html)
        if wrapped:
            return f'![]({wrapped.group(1)})\n\n'

        captioned = self._IDB_CAPTIONED_IMAGE_RE.search(html)
        if captioned:
            src, title = captioned.groups()
            title = unescape(title)
            return f'![{title}]({src})\n\n'

        return None
def _markdownify(html, **options):
    """Convert ``html`` with a ``WordpressMarkdownConverter`` built from ``options``."""
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
def markdown_from_html(html):
    """Convert article HTML to Markdown.

    The converted text is split on blank lines; every chunk is stripped, and
    chunks consisting only of a separator mark (``#``, ``# #``, ``___`` ...)
    are wrapped in ``<center>`` tags.
    """
    separator_marks = ('#', '# #', '# # #', '##', '###', '___')

    md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip()

    chunks = []
    for chunk in md.split('\n\n'):
        chunk = chunk.strip()
        if chunk in separator_marks:
            chunk = f'<center>{chunk}</center>'
        chunks.append(chunk)
    return '\n\n'.join(chunks)
def html_from_markdown(s):
    """Render the Markdown string ``s`` to HTML with the 'extra' and 'tables' extensions."""
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)
class Article:
    """An article held in both HTML and Markdown form."""

    title: str
    html: str
    md: str

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML; the Markdown form is derived from it."""
        return cls(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from Markdown; the HTML form is derived from it."""
        return cls(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Build an article from a Markdown file.

        Args:
            filename: path of the UTF-8 encoded Markdown file.
            with_title: when true, the first line must be a ``#`` heading; it
                becomes the title and is removed from the body together with
                one blank line following it. When false, the whole file is
                the body and the title is empty.

        Raises:
            ValueError: if ``with_title`` is true and the file is empty or
                its first line does not start with ``#``.
        """
        with open(filename, encoding='utf-8') as f:
            if not with_title:
                return cls.from_markdown('', f.read())
            lines = f.readlines()

        first_line = lines[0].strip() if lines else ''
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()

        body = lines[1:]
        if body and not body[0].strip():
            body = body[1:]
        # readlines() keeps the line endings, so the lines are joined as-is;
        # adding separators would insert blank lines between all of them.
        return cls.from_markdown(title, ''.join(body))
# Languages identified by their two-letter code.
class Language(Enum):
English = 'en'
Russian = 'ru'
def fetch_article(url) -> Article:
response = requests.get(url) response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser') soup = BeautifulSoup(response.text, 'html.parser')
@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent:
html = str(soup.find("div", class_="entry-content")).strip() html = str(soup.find("div", class_="entry-content")).strip()
title = unescape(soup.find(class_="entry-title").get_text(strip=True)) title = unescape(soup.find(class_="entry-title").get_text(strip=True))
md = markdownify(html).strip()
return ArticleContent(title, html, md) return Article.from_html(title, html)

View File

@ -1,6 +1,9 @@
odfpy~=1.4.1
pillow~=11.1.0
beautifulsoup4~=4.13.3 beautifulsoup4~=4.13.3
markdownify~=0.14.1
requests~=2.32.3 requests~=2.32.3
markdownify~=0.14.1
Markdown~=3.7
tiktoken~=0.8.0
openai~=1.61.1 openai~=1.61.1
python-dotenv~=1.0.1 python-dotenv~=1.0.1
tiktoken~=0.8.0

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from argparse import ArgumentParser from argparse import ArgumentParser
from idb.wordpress import fetch from idb import fetch_article, translate_markdown
from idb.translator import translate
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
@ -18,8 +17,8 @@ if __name__ == '__main__':
help="output files") help="output files")
args = parser.parse_args() args = parser.parse_args()
article = fetch(args.url) a = fetch_article(args.url)
translation = translate(article.md) translation = translate_markdown(a.md)
save(args.output[0], article.md) save(args.output[0], a.md)
save(args.output[1], translation) save(args.output[1], translation)

19
tzo_images.py Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python3
import os.path
from idb import tzo_urls
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
from dotenv import load_dotenv
load_dotenv()
if __name__ == '__main__':
    # Download every image referenced by the Russian text of each article in
    # tzo_urls into the 'images' folder next to this script.
    base_dir = os.path.dirname(__file__)
    images_dir = os.path.join(base_dir, 'images')

    # download_file() opens its target for writing and does not create
    # parent folders, so make sure the images folder exists first.
    os.makedirs(images_dir, exist_ok=True)

    for url in tzo_urls:
        name = name_from_url(url)
        markdown_file = os.path.join(base_dir, 'tzo', f'{name}-ru.txt')
        for image_url in extract_images_from_markdown(read_file(markdown_file)):
            image_name = image_url_to_filename(image_url)
            output_file = os.path.join(images_dir, image_name)
            if os.path.exists(output_file):
                # already fetched on a previous run; delete the file to
                # force a fresh download
                continue
            download_file(image_url, output_file)
            print(f'{image_name} saved')

15
tzo_odt.py Executable file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator, tzo_urls
from idb.util import name_from_url
if __name__ == '__main__':
    # Produce one side-by-side ODT per article in tzo_urls from the
    # '<name>-ru.txt' / '<name>-en.txt' pair stored in the 'tzo' folder.
    for url in tzo_urls:
        name = name_from_url(url)

        orig_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
        orig = Article.from_markdown_file(orig_file, with_title=False)

        trans_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-en.txt')
        trans = Article.from_markdown_file(trans_file, with_title=False)

        output_file = os.path.join(os.path.dirname(__file__), f'{name}.odt')
        DocumentCreator().create(orig, trans, output_file)