From 7f4b460c964a8cb4764b967616843cfd9d1b6259 Mon Sep 17 00:00:00 2001 From: "E. S" Date: Thu, 6 Mar 2025 01:58:58 +0300 Subject: [PATCH] md -> odt converter, first commit --- .gitignore | 10 +- cartier_odt.py | 24 +++ idb/__init__.py | 24 +++ idb/doc.py | 424 ++++++++++++++++++++++++++++++++++++++++++++++ idb/translator.py | 13 +- idb/util.py | 44 +++++ idb/wordpress.py | 125 +++++++++++++- requirements.txt | 9 +- single_article.py | 9 +- tzo_images.py | 19 +++ tzo_odt.py | 15 ++ 11 files changed, 696 insertions(+), 20 deletions(-) create mode 100755 cartier_odt.py create mode 100644 idb/doc.py create mode 100644 idb/util.py create mode 100755 tzo_images.py create mode 100755 tzo_odt.py diff --git a/.gitignore b/.gitignore index 342783d..d1d609d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,11 @@ /.idea /.venv -/.env \ No newline at end of file +/.env +/tzo_save.py +/test*py +/*.html +/*.odt +/*.md +/*.txt +/*.zip +/.DS_Store \ No newline at end of file diff --git a/cartier_odt.py b/cartier_odt.py new file mode 100755 index 0000000..07a9619 --- /dev/null +++ b/cartier_odt.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import os.path +from idb import Article, DocumentCreator +from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file + + +if __name__ == '__main__': + name = 'cartier3' + orig_path = os.path.join(os.path.dirname(__file__), f'{name}_ru') + trans_path = os.path.join(os.path.dirname(__file__), f'{name}_en') + + orig = Article.from_markdown_file(orig_path, with_title=False) + trans = Article.from_markdown_file(trans_path, with_title=False) + + image_urls = extract_images_from_markdown(read_file(orig_path)) + for image_url in image_urls: + image_name = image_url_to_filename(image_url) + output_file = os.path.join(os.path.dirname(__file__), 'images', image_name) + if not os.path.exists(output_file): + download_file(image_url, output_file) + print(f'{image_name} saved') + + doc = DocumentCreator() + doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt')) \ No newline at end of file diff --git a/idb/__init__.py b/idb/__init__.py index e69de29..a2fd00b 100644 --- a/idb/__init__.py +++ b/idb/__init__.py @@ -0,0 +1,24 @@ +from .wordpress import Article, fetch_article +from .translator import translate_markdown +from .doc import DocumentCreator + +tzo_urls = ( + 'https://kniganews.org/2012/12/20/beyond-clouds-1/', + 'https://kniganews.org/2012/12/21/beyond-clouds-21/', + 'https://kniganews.org/2012/12/22/beyond-clouds-22/', + 'https://kniganews.org/2012/12/23/beyond-clouds-31/', + 'https://kniganews.org/2012/12/24/beyond-clouds-32/', + 'https://kniganews.org/2012/12/25/beyond-clouds-33/', + 'https://kniganews.org/2012/12/28/beyond-clouds-41/', + 'https://kniganews.org/2012/12/29/beyond-clouds-42/', + 'https://kniganews.org/2012/12/30/beyond-clouds-43/', + 'https://kniganews.org/2013/01/01/beyond-clouds-44/', + 'https://kniganews.org/2013/01/06/beyond-clouds-51/', + 'https://kniganews.org/2013/01/07/beyond-clouds-52/', + 'https://kniganews.org/2013/02/16/beyond-clouds-53/', + 'https://kniganews.org/2013/03/25/beyond-clouds-61/', + 'https://kniganews.org/2013/05/10/beyond-clouds-62/', + 'https://kniganews.org/2013/06/17/beyond-clouds-731/', + 'https://kniganews.org/2013/08/07/beyond-clouds-732/', + 'https://kniganews.org/2013/09/17/beyond-clouds-73/' +) \ No newline at end of file diff --git a/idb/doc.py b/idb/doc.py new file mode 100644 index 0000000..6977bf7 --- /dev/null +++ b/idb/doc.py @@ -0,0 +1,424 @@ +import os.path + +from odf.opendocument import OpenDocumentText +from odf.text import P, H, Span, A, LineBreak, List, ListItem +from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage +from odf.table import TableColumn, TableCell, TableRow, Table +from odf.draw import Frame, Image + +from PIL import Image as PILImage + +from bs4 import BeautifulSoup +from idb import Article +from idb.util import image_url_to_filename + +PAGE_LAYOUT_NAME = 'LandscapeLayout' +MASTER_PAGE_NAME = 'Standard' +BLOCKQUOTE_STYLE_NAME = 'Block Quotation' +ITALIC_STYLE_NAME = 'Italic' +BOLD_STYLE_NAME = 'Bold' +CAPTION_STYLE_NAME = 'Caption' +UNDERLINE_STYLE_NAME = 'Underline' +CENTER_STYLE_NAME = 'CenterAligned' +TITLE_STYLE_NAME = 'Title' + + +def add_child(parent, child): + if hasattr(child, "qname"): + parent.addElement(child) + else: + parent.addText(child) + + +def calc_frame_dimensions(image_path, desired_width_cm): + with PILImage.open(image_path) as img: + orig_width, orig_height = img.size + + dpi = 96.0 + orig_width_cm = (orig_width / dpi) * 2.54 + orig_height_cm = (orig_height / dpi) * 2.54 + + scale = desired_width_cm / orig_width_cm + new_height_cm = orig_height_cm * scale + + return f"{desired_width_cm}cm", f"{new_height_cm}cm" + + +class ImageWrap: + def __init__(self, image_file, caption): + self.image_file = image_file + self.caption = caption + + def get_elements(self, doc): + embedded_href = doc.addPicture(self.image_file) + + desired_width = 13.5 + width_str, height_str = calc_frame_dimensions(self.image_file, desired_width) + + frm = Frame(width=width_str, height=height_str) + img = Image(href=embedded_href, type="simple", show="embed", actuate="onLoad") + frm.addElement(img) + + elements = [frm] + + if self.caption: + caption = P(stylename=CAPTION_STYLE_NAME) + caption.addText(self.caption) + elements.append(caption) + + return elements + + +class DocumentCreator: + def __init__(self): + self.doc = OpenDocumentText() + self.set_styles() + + def set_styles(self): + landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME) + landscape_props = PageLayoutProperties( + pagewidth="29.7cm", + pageheight="21.0cm", + printorientation="landscape", + margin="1cm" + ) + landscape_layout.addElement(landscape_props) + self.doc.automaticstyles.addElement(landscape_layout) + + masterpage = MasterPage(name="Standard", pagelayoutname=PAGE_LAYOUT_NAME) + self.doc.masterstyles.addElement(masterpage) + + # bold + style = Style(name=BOLD_STYLE_NAME, family="text") + style.addElement(TextProperties(attributes={ + 'fontweight': "bold", + 'fontweightasian': "bold", + 'fontweightcomplex': "bold" + })) + self.doc.automaticstyles.addElement(style) + + # italic + style = Style(name=ITALIC_STYLE_NAME, family="text") + style.addElement(TextProperties(attributes={ + 'fontstyle': "italic", + 'fontstyleasian': "italic", + 'fontstylecomplex': "italic" + })) + self.doc.automaticstyles.addElement(style) + + # caption + style = Style(name=CAPTION_STYLE_NAME, family="paragraph") + style.addElement(TextProperties(attributes={ + 'fontstyle': "italic", + 'fontstyleasian': "italic", + 'fontstylecomplex': "italic", + 'fontsize': '10pt', + 'color': '#777777' + })) + style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm')) + self.doc.automaticstyles.addElement(style) + + # underline + style = Style(name=UNDERLINE_STYLE_NAME, family="text") + style.addElement(TextProperties(attributes={ + 'textunderlinestyle': "solid", + 'textunderlinewidth': "auto" + })) + self.doc.automaticstyles.addElement(style) + + # blockquote + style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph") + style.addElement(ParagraphProperties(attributes={ + 'marginleft': '0.6cm', + 'margintop': '0.15cm', + 'marginbottom': '0.15cm', + })) + style.addElement(TextProperties(attributes={'color': '#378A62'})) + self.doc.styles.addElement(style) + + # title + style = Style(name=TITLE_STYLE_NAME, family="paragraph") + style.addElement(TextProperties(attributes={ + 'fontsize': '20pt', + 'fontweight': "bold", + 'fontweightasian': "bold", + 'fontweightcomplex': "bold" + })) + style.addElement(ParagraphProperties(textalign='center')) + self.doc.styles.addElement(style) + + # centered text + style = Style(name=CENTER_STYLE_NAME, family="paragraph") + style.addElement(ParagraphProperties(textalign="center")) + self.doc.automaticstyles.addElement(style) + + def process_inline(self, node): + if isinstance(node, str): + return [node] + + tag = node.name.lower() + + simple_tags = ( + ('strong', 'b'), + ('em', 'i'), + ('ins', 'u') + ) + simple_styles = ( + BOLD_STYLE_NAME, + ITALIC_STYLE_NAME, + UNDERLINE_STYLE_NAME + ) + for i, tags_list in enumerate(simple_tags): + if tag in tags_list: + span = Span(stylename=simple_styles[i]) + for child in node.contents: + for inline in self.process_inline(child): + add_child(span, inline) + return [span] + + if tag == "code": + return [Span(stylename="Code", text=node.get_text())] + + elif tag == "a": + return [A(href=node.get("href"), text=node.get_text())] + + elif tag == "img": + if node.name and node.name.lower() == "img": + return [self.process_img(node)] + + else: + result = [] + for child in node.contents: + result.extend(self.process_inline(child)) + return result + + def process_block(self, elem): + h_elem = self.try_process_heading(elem) + if h_elem is not None: + return h_elem + + tag = elem.name.lower() + if tag == "p": + is_centered = False + has_image = False + for child in elem.contents: + # try converting heading + h_elem = self.try_process_heading(child) + if h_elem is not None: + return h_elem + + if child.name: + if child.name.lower() == "img": + has_image = True + if child.name.lower() == "center": + for cchild in child.contents: + h_elem = self.try_process_heading(cchild) + if h_elem is not None: + return h_elem + is_centered = True + break + + if is_centered or has_image: + p_elem = P(stylename=CENTER_STYLE_NAME) + else: + p_elem = P() + + for child in elem.contents: + for inline in self.process_inline(child): + if has_image and isinstance(inline, ImageWrap): + image = inline.get_elements(self.doc) + p_elem.addElement(image[0]) + elems = [p_elem] + if len(image) == 2: + elems.append(image[1]) + return elems + + add_child(p_elem, inline) + + return p_elem + + elif tag == "blockquote": + items = [] + for child in elem.contents: + text = child.get_text() + if text.strip() == '': + continue + items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text)) + return items + + elif tag in ("ul", "ol"): + odf_list = List() + li_elements = elem.find_all("li", recursive=False) + for li in li_elements: + li_item = ListItem() + p = P() + for child in li.contents: + # if nested list is found, we'll process it later + if hasattr(child, "name") and child.name in ("ul", "ol"): + continue + for inline in self.process_inline(child): + add_child(p, inline) + li_item.addElement(p) + + # process nested lists + for child in li.contents: + if hasattr(child, "name") and child.name in ("ul", "ol"): + nested_list = self.process_block(child) + li_item.addElement(nested_list) + + odf_list.addElement(li_item) + return odf_list + + elif tag == "pre": + return P(stylename="Preformatted", text=elem.get_text()) + + elif tag == "hr": + return P(stylename=CENTER_STYLE_NAME, text='---') + + elif tag == "table": + odf_table = Table() + for tr in elem.find_all("tr"): + row = TableRow() + for cell in tr.find_all(["th", "td"]): + cell_elem = TableCell() + cell_html = "".join(str(child) for child in cell.contents) + cell_odf_elements = self.html_to_odf_elements(cell_html) + for el in cell_odf_elements: + cell_elem.addElement(el) + row.addElement(cell_elem) + odf_table.addElement(row) + return odf_table + + elif tag == "img": + return self.process_img(elem).get_elements(self.doc) + + elif tag == "br": + return LineBreak() + + else: + p_elem = P() + p_elem.addText(elem.get_text()) + return p_elem + + def try_process_heading(self, elem): + if not elem.name: + return + + tag = elem.name.lower() + if tag in ("h1", "h2", "h3", "h4", "h5", "h6"): + return H(outlinelevel=tag[1], text=elem.get_text()) + + def process_img(self, elem) -> ImageWrap: + href = elem.get("src") + saved_file = os.path.join( + os.path.dirname(__file__), '..', 'images', image_url_to_filename(href) + ) + if not os.path.exists(saved_file): + raise ValueError(f'image {saved_file} not found') + + alt = elem.get('alt') + return ImageWrap(saved_file, alt) + + def html_to_odf_elements(self, html): + soup = BeautifulSoup(html, "html.parser") + elements = [] + top_nodes = soup.body.contents if soup.body else soup.contents + + keeping = False + stack = [] + ending_headings = ( + 'дополнительное чтение', + 'основные источники', + 'источники', + 'дополнительные материалы', + 'additional reading', + 'further reading', + 'main sources', + 'additional sources', + ) + + for node in top_nodes: + if isinstance(node, str): + if not node.strip(): + continue + p = P() + p.addText(node) + elements.append(p) + elif node.name: + if node.name.lower() in ("h1", "h2", "h3", "h4") and node.get_text().strip().lower() in (end.lower() for end in ending_headings): + if stack: + elements.append(stack) + stack = [] + keeping = True + result = self.process_block(node) + if keeping: + stack.append(result) + else: + elements.append(result) + if stack: + elements.append(stack) + return elements + + def create(self, + orig: Article, + trans: Article, + output_odt: str, + with_title=False): + orig_elements = self.html_to_odf_elements(orig.html) + trans_elements = self.html_to_odf_elements(trans.html) + + max_len = max(len(orig_elements), len(trans_elements)) + while len(orig_elements) < max_len: + orig_elements.append(P()) + while len(trans_elements) < max_len: + trans_elements.append(P()) + + main_table = Table(name="ComparisonTable") + + col1 = TableColumn() + col2 = TableColumn() + + main_table.addElement(col1) + main_table.addElement(col2) + + if with_title: + # headings + header_row = TableRow() + + header_cell_left = TableCell() + header_cell_right = TableCell() + + header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title)) + header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title)) + + header_row.addElement(header_cell_left) + header_row.addElement(header_cell_right) + + main_table.addElement(header_row) + + # content + for i in range(max_len): + row = TableRow() + + cell_orig = TableCell() + cell_trans = TableCell() + + if isinstance(orig_elements[i], list): + for elem in orig_elements[i]: + cell_orig.addElement(elem) + else: + cell_orig.addElement(orig_elements[i]) + + if isinstance(trans_elements[i], list): + for elem in trans_elements[i]: + cell_trans.addElement(elem) + else: + cell_trans.addElement(trans_elements[i]) + + row.addElement(cell_orig) + row.addElement(cell_trans) + + main_table.addElement(row) + + self.doc.text.addElement(main_table) + self.doc.save(output_odt) diff --git a/idb/translator.py b/idb/translator.py index f316cd4..912bf98 100644 --- a/idb/translator.py +++ b/idb/translator.py @@ -2,11 +2,18 @@ import tiktoken from openai import OpenAI model = "gpt-4o" -system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations." -input_token_limit = 2000 +system_prompt = """ +Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form: +- Quotes (e.g., > quoted text). Can be multi-line. +- Links (e.g., [text](url)) +- Images (e.g., ![alt text](image_url)) + +Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content. +""" +input_token_limit = 3500 -def translate(text): +def translate_markdown(text): buf = [] bufsize = 0 cl = OpenAI() diff --git a/idb/util.py b/idb/util.py new file mode 100644 index 0000000..609a526 --- /dev/null +++ b/idb/util.py @@ -0,0 +1,44 @@ +import re, os.path +import requests +from urllib.parse import urlparse + + +def save_file(file, content): + with open(file, 'w') as f: + f.write(content) + + +def read_file(filename): + with open(filename) as f: + return f.read() + + +def name_from_url(url): + return os.path.basename(url[:-1]) + + +def image_url_to_filename(url): + parsed_url = urlparse(url) + filename = os.path.basename(parsed_url.path) + name, ext = os.path.splitext(filename) + date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path) + if not date_match: + raise ValueError("no valid date found in URL") + year = date_match.group(1) + day = date_match.group(3) if date_match.group(3) else "01" + return f"{year}{day}_{name}{ext}" + + +def extract_images_from_markdown(markdown_text): + image_pattern = r"!\[.*?\]\((.*?)\)" + images = re.findall(image_pattern, markdown_text) + return images + + +def download_file(url, filename): + response = requests.get(url, stream=True) + response.raise_for_status() + if response.status_code == 200: + with open(filename, 'wb') as file: + for chunk in response.iter_content(1024): + file.write(chunk) diff --git a/idb/wordpress.py b/idb/wordpress.py index 10559f9..7d2d264 100644 --- a/idb/wordpress.py +++ b/idb/wordpress.py @@ -1,13 +1,123 @@ -import requests +import requests, re from bs4 import BeautifulSoup from html import unescape -from markdownify import markdownify -from collections import namedtuple - -ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md']) +from markdownify import MarkdownConverter +from markdown import markdown +from enum import Enum -def fetch(url) -> ArticleContent: +class WordpressMarkdownConverter(MarkdownConverter): + def convert_p(self, el, text, convert_as_inline): + md = self.idb_convert_image(el) + if md is not None: + return md + if str(el).startswith('

'): + text = text.replace('\n\n', '
') + return f'

{text}
\n\n' + return super().convert_p(el, text, convert_as_inline) + + def _convert_hn(self, n, el, text, convert_as_inline): + md = self.idb_convert_image(el) + if md is not None: + return md + return super()._convert_hn(n, el, text, convert_as_inline) + + def convert_a(self, el, *args): + md = self.idb_convert_image(el) + if md is not None: + return md + return super().convert_a(el, *args) + + def convert_div(self, el, *args): + if str(el).startswith('
]*>.*?]+>]+>[^]*>(.*?)

', re.S) + match = pattern.search(html) + if match: + src, title = match.groups() + title = unescape(title) + return f'![{title}]({src})\n\n' + + +def _markdownify(html, **options): + return WordpressMarkdownConverter(**options).convert(html) + + +def markdown_from_html(html): + def mapper(s): + s = s.strip() + if s in ('#', '# #', '# # #', '##', '###', '___'): + return f'
{s}
' + return s + + md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip() + return '\n\n'.join(map(mapper, md.split('\n\n'))) + + +def html_from_markdown(s): + return markdown(s, extensions=['extra', 'tables']) + + +class Article: + title: str + html: str + md: str + + def __init__(self, title, html, md): + self.title = title + self.html = html + self.md = md + + @classmethod + def from_html(cls, title, html): + return Article(title, html, markdown_from_html(html)) + + @classmethod + def from_markdown(cls, title, md): + return Article(title, html_from_markdown(md), md) + + @classmethod + def from_markdown_file(cls, filename, with_title=True): + if with_title: + with open(filename) as f: + lines = f.readlines() + + first_line = lines[0].strip() + if not first_line.startswith('#'): + raise ValueError('first line must start with #') + title = first_line.lstrip('#').strip() + + lines.pop(0) + if lines and not lines[0].strip(): + lines.pop(0) + + md = '\n\n'.join(lines) + else: + with open(filename) as f: + md = f.read() + title = '' + + return cls.from_markdown(title, md) + + +class Language(Enum): + English = 'en' + Russian = 'ru' + + +def fetch_article(url) -> Article: response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') @@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent: html = str(soup.find("div", class_="entry-content")).strip() title = unescape(soup.find(class_="entry-title").get_text(strip=True)) - md = markdownify(html).strip() - return ArticleContent(title, html, md) + return Article.from_html(title, html) diff --git a/requirements.txt b/requirements.txt index 6ffb267..4b5d369 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ +odfpy~=1.4.1 +pillow~=11.1.0 beautifulsoup4~=4.13.3 -markdownify~=0.14.1 requests~=2.32.3 +markdownify~=0.14.1 +Markdown~=3.7 +tiktoken~=0.8.0 openai~=1.61.1 -python-dotenv~=1.0.1 -tiktoken~=0.8.0 \ No newline at end of file +python-dotenv~=1.0.1 \ No newline at end of file diff --git a/single_article.py b/single_article.py index 1c3e3bf..7fffeb8 100755 --- a/single_article.py +++ b/single_article.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 from argparse import ArgumentParser -from idb.wordpress import fetch -from idb.translator import translate +from idb import fetch_article, translate_markdown from dotenv import load_dotenv load_dotenv() @@ -18,8 +17,8 @@ if __name__ == '__main__': help="output files") args = parser.parse_args() - article = fetch(args.url) - translation = translate(article.md) + a = fetch_article(args.url) + translation = translate_markdown(a.md) - save(args.output[0], article.md) + save(args.output[0], a.md) save(args.output[1], translation) diff --git a/tzo_images.py b/tzo_images.py new file mode 100755 index 0000000..545522a --- /dev/null +++ b/tzo_images.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +import os.path +from idb import tzo_urls +from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown +from dotenv import load_dotenv + +load_dotenv() + + +if __name__ == '__main__': + for url in tzo_urls: + name = name_from_url(url) + markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt') + image_urls = extract_images_from_markdown(read_file(markdown_file)) + for image_url in image_urls: + image_name = image_url_to_filename(image_url) + output_file = os.path.join(os.path.dirname(__file__), 'images', image_name) + download_file(image_url, output_file) + print(f'{image_name} saved') \ No newline at end of file diff --git a/tzo_odt.py b/tzo_odt.py new file mode 100755 index 0000000..49896ee --- /dev/null +++ b/tzo_odt.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +import os.path +from idb import Article, DocumentCreator, tzo_urls +from idb.util import name_from_url + + +if __name__ == '__main__': + for url in tzo_urls: + name = name_from_url(url) + + orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False) + trans = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-en.txt'), with_title=False) + + doc = DocumentCreator() + doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt')) \ No newline at end of file