md -> odt converter, first commit

2025-03-06 01:58:58 +03:00 · 2025-03-06 01:58:58 +03:00 · 7f4b460c96
commit 7f4b460c96
parent 2ebf5f18fa
11 changed files with 696 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,11 @@
 /.idea
 /.venv
-/.env
+/.env
 /tzo_save.py
 /test*py
 /*.html
 /*.odt
 /*.md
 /*.txt
 /*.zip
 /.DS_Store
--- a/cartier_odt.py
+++ b/cartier_odt.py
@ -0,0 +1,24 @@
 #!/usr/bin/env python3
 import os.path
 from idb import Article, DocumentCreator
 from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
 if __name__ == '__main__':
    # Build the side-by-side ODT for one hard-coded article: load both
    # language versions, make sure every referenced image is cached locally,
    # then render the document next to this script.
    base_dir = os.path.dirname(__file__)
    name = 'cartier3'
    orig_path = os.path.join(base_dir, f'{name}_ru')
    trans_path = os.path.join(base_dir, f'{name}_en')
    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)
    # Only the original text is scanned for image URLs.
    for image_url in extract_images_from_markdown(read_file(orig_path)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(base_dir, 'images', image_name)
        # Skip the network round-trip when the image is already on disk.
        if not os.path.exists(output_file):
            download_file(image_url, output_file)
        print(f'{image_name} saved')
    DocumentCreator().create(orig, trans, os.path.join(base_dir, f'{name}.odt'))
--- a/idb/init.py
+++ b/idb/init.py
@ -0,0 +1,24 @@
 from .wordpress import Article, fetch_article
 from .translator import translate_markdown
 from .doc import DocumentCreator
 # Article URLs processed by the tzo_images.py and tzo_odt.py scripts.
 # Each URL is turned into a local file stem with idb.util.name_from_url,
 # which drops the final character before taking the basename, so every
 # entry here must keep its trailing slash.
 tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
 )
--- a/idb/doc.py
+++ b/idb/doc.py
@ -0,0 +1,424 @@
 import os.path
 from odf.opendocument import OpenDocumentText
 from odf.text import P, H, Span, A, LineBreak, List, ListItem
 from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
 from odf.table import TableColumn, TableCell, TableRow, Table
 from odf.draw import Frame, Image
 from PIL import Image as PILImage
 from bs4 import BeautifulSoup
 from idb import Article
 from idb.util import image_url_to_filename
 # Names shared between DocumentCreator.set_styles, which registers the page
 # layout and the named styles, and the conversion code that references them
 # through ``stylename=``.
 # Page layout / master page names.
 PAGE_LAYOUT_NAME = 'LandscapeLayout'
 MASTER_PAGE_NAME = 'Standard'
 # Paragraph style applied to each paragraph produced from a <blockquote>.
 BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
 # Text (span) styles for <em>/<i> and <strong>/<b>.
 ITALIC_STYLE_NAME = 'Italic'
 BOLD_STYLE_NAME = 'Bold'
 # Paragraph style for the caption placed after an image.
 CAPTION_STYLE_NAME = 'Caption'
 # Text (span) style for <ins>/<u>.
 UNDERLINE_STYLE_NAME = 'Underline'
 # Paragraph style for centered paragraphs, image paragraphs and <hr>.
 CENTER_STYLE_NAME = 'CenterAligned'
 # Paragraph style for the optional title row of the comparison table.
 TITLE_STYLE_NAME = 'Title'
 def add_child(parent, child):
    """Attach ``child`` to ``parent`` as an element or as text.
    Anything exposing a ``qname`` attribute is treated as an element and
    goes through ``parent.addElement``; everything else is handed to
    ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
 def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ``(width, height)`` length strings for an image scaled to a width.
    The image at ``image_path`` is opened only to read its pixel size. Pixels
    are converted to centimetres at a fixed 96 dpi and the height is scaled
    by the same factor as the width, so the aspect ratio is kept. Both
    values are formatted with a ``cm`` suffix.
    """
    cm_per_inch = 2.54
    dpi = 96.0
    with PILImage.open(image_path) as picture:
        px_width, px_height = picture.size
    width_cm = (px_width / dpi) * cm_per_inch
    height_cm = (px_height / dpi) * cm_per_inch
    ratio = desired_width_cm / width_cm
    return f"{desired_width_cm}cm", f"{height_cm * ratio}cm"
 class ImageWrap:
    """A local image file plus optional caption, not yet embedded in a document."""
    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption
    def get_elements(self, doc):
        """Embed the image into ``doc`` and return the ODF elements for it.
        Returns a list holding the image frame and, when a caption is set,
        a caption paragraph as the second item.
        """
        picture_href = doc.addPicture(self.image_file)
        # Every image is rendered 13.5 cm wide; the height follows the aspect ratio.
        width, height = calc_frame_dimensions(self.image_file, 13.5)
        frame = Frame(width=width, height=height)
        frame.addElement(Image(href=picture_href, type="simple", show="embed", actuate="onLoad"))
        if not self.caption:
            return [frame]
        caption_par = P(stylename=CAPTION_STYLE_NAME)
        caption_par.addText(self.caption)
        return [frame, caption_par]
 class DocumentCreator:
    def __init__(self):
        """Create an empty ODF text document and register its styles."""
        # The document object is kept on the instance; create() appends the
        # comparison table to it and saves it.
        self.doc = OpenDocumentText()
        self.set_styles()
    def set_styles(self):
        """Register the page layout, master page and named styles on ``self.doc``.
        Bold, italic, caption, underline and centered styles go into the
        document's automatic styles; blockquote and title go into its common
        styles.
        """
        automatic = self.doc.automaticstyles
        common = self.doc.styles
        def register(container, name, family, *properties):
            # Build one named style from its property elements and attach it.
            style = Style(name=name, family=family)
            for prop in properties:
                style.addElement(prop)
            container.addElement(style)
        bold = {
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }
        italic = {
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }
        # 29.7cm x 21.0cm landscape page with 1cm margins
        layout = PageLayout(name=PAGE_LAYOUT_NAME)
        layout.addElement(PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        ))
        automatic.addElement(layout)
        self.doc.masterstyles.addElement(
            MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        )
        # character styles
        register(automatic, BOLD_STYLE_NAME, "text", TextProperties(attributes={**bold}))
        register(automatic, ITALIC_STYLE_NAME, "text", TextProperties(attributes={**italic}))
        # image caption: small grey italic text, centered
        register(
            automatic, CAPTION_STYLE_NAME, "paragraph",
            TextProperties(attributes={**italic, 'fontsize': '10pt', 'color': '#777777'}),
            ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'),
        )
        register(automatic, UNDERLINE_STYLE_NAME, "text", TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        # blockquote: indented, coloured text
        register(
            common, BLOCKQUOTE_STYLE_NAME, "paragraph",
            ParagraphProperties(attributes={
                'marginleft': '0.6cm',
                'margintop': '0.15cm',
                'marginbottom': '0.15cm',
            }),
            TextProperties(attributes={'color': '#378A62'}),
        )
        # title: large bold text, centered
        register(
            common, TITLE_STYLE_NAME, "paragraph",
            TextProperties(attributes={'fontsize': '20pt', **bold}),
            ParagraphProperties(textalign='center'),
        )
        # plain centered paragraph
        register(automatic, CENTER_STYLE_NAME, "paragraph", ParagraphProperties(textalign="center"))
    def process_inline(self, node):
        """Convert one inline HTML node into a list of ODF inline items.
        ``node`` is either a text node (any ``str``) or a tag. The returned
        list may contain plain strings, ODF elements, and, for ``<img>``, an
        ``ImageWrap`` that the caller has to expand with ``get_elements``.
        Tags without a dedicated rule are flattened into their children.
        """
        if isinstance(node, str):
            return [node]
        tag = node.name.lower()
        span_styles = {
            'strong': BOLD_STYLE_NAME,
            'b': BOLD_STYLE_NAME,
            'em': ITALIC_STYLE_NAME,
            'i': ITALIC_STYLE_NAME,
            'ins': UNDERLINE_STYLE_NAME,
            'u': UNDERLINE_STYLE_NAME,
        }
        if tag in span_styles:
            span = Span(stylename=span_styles[tag])
            for child in node.contents:
                for inline in self.process_inline(child):
                    add_child(span, inline)
            return [span]
        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]
        if tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]
        if tag == "img":
            return [self.process_img(node)]
        if tag == "br":
            # Without this branch a <br> has no children and would flatten
            # to nothing, gluing the surrounding lines together.
            return [LineBreak()]
        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result
    def process_block(self, elem):
        """Convert one block-level HTML tag into ODF content.
        Returns either a single ODF element or a flat list of ODF elements
        (blockquote paragraphs, or an image with its caption); callers must
        accept both shapes. Every returned element is something that can be
        added to a table cell.
        Raises ValueError (from process_img) when a referenced image has not
        been downloaded.
        """
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem
        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # a heading nested in the paragraph replaces the whole paragraph
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem
                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break
            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()
            for child in elem.contents:
                for inline in self.process_inline(child):
                    if has_image and isinstance(inline, ImageWrap):
                        # the paragraph ends at its first image; the caption,
                        # if any, follows as a separate paragraph
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        return elems
                    add_child(p_elem, inline)
            return p_elem
        elif tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items
        elif tag in ("ul", "ol"):
            odf_list = List()
            for li in elem.find_all("li", recursive=False):
                li_item = ListItem()
                p = P()
                # caption paragraphs cannot be nested inside the item's
                # paragraph, so they are collected and added after it
                captions = []
                for child in li.contents:
                    # nested lists are handled after the item's own text
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        if isinstance(inline, ImageWrap):
                            # an ImageWrap is not an ODF node; embed its frame
                            # instead of passing the wrapper to addText
                            image = inline.get_elements(self.doc)
                            p.addElement(image[0])
                            captions.extend(image[1:])
                        else:
                            add_child(p, inline)
                li_item.addElement(p)
                for caption in captions:
                    li_item.addElement(caption)
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        li_item.addElement(self.process_block(child))
                odf_list.addElement(li_item)
            return odf_list
        elif tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())
        elif tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')
        elif tag == "table":
            def flatten(items):
                # html_to_odf_elements mixes single elements with lists of
                # elements; a cell can only take the elements themselves
                for item in items:
                    if isinstance(item, list):
                        yield from flatten(item)
                    else:
                        yield item
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    for el in flatten(self.html_to_odf_elements(cell_html)):
                        cell_elem.addElement(el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table
        elif tag == "img":
            return self.process_img(elem).get_elements(self.doc)
        elif tag == "br":
            # a line break is inline content; wrap it in a paragraph so the
            # result can be placed in a table cell like every other block
            p_elem = P()
            p_elem.addElement(LineBreak())
            return p_elem
        else:
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem
    def try_process_heading(self, elem):
        """Return an ODF heading for an ``<h1>``..``<h6>`` node, else ``None``.
        Nodes whose ``name`` is falsy (text nodes) are never headings. The
        digit of the tag name is passed on as the outline level.
        """
        name = elem.name
        if not name:
            return None
        tag = name.lower()
        if tag not in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return None
        return H(outlinelevel=tag[1], text=elem.get_text())
    def process_img(self, elem) -> ImageWrap:
        """Map an ``<img>`` tag to the already-downloaded local image file.
        The file is looked up in the ``images`` directory one level above
        this module, under the name derived from the tag's ``src`` by
        image_url_to_filename; the tag's ``alt`` text becomes the caption.
        Raises ValueError when that file does not exist.
        """
        images_dir = os.path.join(os.path.dirname(__file__), '..', 'images')
        saved_file = os.path.join(images_dir, image_url_to_filename(elem.get("src")))
        if os.path.exists(saved_file):
            return ImageWrap(saved_file, elem.get('alt'))
        raise ValueError(f'image {saved_file} not found')
    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of table-cell contents.
        Each item of the returned list is what goes into one cell: a single
        ODF element, or a flat list of ODF elements. From the first "sources"
        or "further reading" style heading onwards, blocks are grouped into
        one list per such heading so that each trailing section occupies a
        single cell.
        """
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents
        keeping = False
        stack = []
        # compared against the lower-cased, stripped heading text
        ending_headings = {
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        }
        for node in top_nodes:
            if isinstance(node, str):
                # bare text between blocks becomes its own paragraph
                if not node.strip():
                    continue
                p = P()
                p.addText(node)
                elements.append(p)
            elif node.name:
                is_ending_heading = (
                    node.name.lower() in ("h1", "h2", "h3", "h4")
                    and node.get_text().strip().lower() in ending_headings
                )
                if is_ending_heading:
                    # a new trailing section starts: flush the previous one
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
                if not keeping:
                    elements.append(result)
                elif isinstance(result, list):
                    # keep the group flat: create() adds every member of a
                    # group straight to the cell and cannot add a nested list
                    stack.extend(result)
                else:
                    stack.append(result)
        if stack:
            elements.append(stack)
        return elements
    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Render both articles side by side into a two-column table and save it.
        The original goes into the left column and the translation into the
        right one, one converted block per row; the shorter side is padded
        with empty paragraphs. With ``with_title`` the first row holds the two
        titles. The document is written to ``output_odt``.
        """
        def fill(cell, content):
            # a slot holds either one ODF element or a list of them
            for piece in (content if isinstance(content, list) else [content]):
                cell.addElement(piece)
            return cell
        left = self.html_to_odf_elements(orig.html)
        right = self.html_to_odf_elements(trans.html)
        row_count = max(len(left), len(right))
        left.extend(P() for _ in range(row_count - len(left)))
        right.extend(P() for _ in range(row_count - len(right)))
        table = Table(name="ComparisonTable")
        table.addElement(TableColumn())
        table.addElement(TableColumn())
        if with_title:
            title_row = TableRow()
            for article in (orig, trans):
                title_cell = TableCell()
                title_cell.addElement(P(stylename=TITLE_STYLE_NAME, text=article.title))
                title_row.addElement(title_cell)
            table.addElement(title_row)
        for left_item, right_item in zip(left, right):
            row = TableRow()
            row.addElement(fill(TableCell(), left_item))
            row.addElement(fill(TableCell(), right_item))
            table.addElement(row)
        self.doc.text.addElement(table)
        self.doc.save(output_odt)
--- a/idb/translator.py
+++ b/idb/translator.py
@ -2,11 +2,18 @@ import tiktoken
 from openai import OpenAI
 model = "gpt-4o"
-system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations."
+system_prompt = """
-input_token_limit = 2000
+Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
 - Quotes (e.g., > quoted text). Can be multi-line.
 - Links (e.g., [text](url))
 - Images (e.g., ![alt text](image_url))
 Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
 """
 input_token_limit = 3500
-def translate(text):
+def translate_markdown(text):
    buf = []
    bufsize = 0
    cl = OpenAI()
--- a/idb/util.py
+++ b/idb/util.py
@ -0,0 +1,44 @@
 import re, os.path
 import requests
 from urllib.parse import urlparse
 def save_file(file, content):
    """Write the string ``content`` to the path ``file`` as UTF-8 text.
    The encoding is fixed so that non-ASCII text (the articles are Russian)
    is written identically regardless of the platform's locale encoding.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)
 def read_file(filename):
    """Return the whole content of ``filename`` decoded as UTF-8 text.
    The encoding is fixed so that non-ASCII text (the articles are Russian)
    is read identically regardless of the platform's locale encoding.
    """
    with open(filename, encoding='utf-8') as f:
        return f.read()
 def name_from_url(url):
    """Return the last path segment of ``url``, with or without a trailing slash.
    Trailing slashes are stripped before taking the basename, so a URL that
    does not end in ``/`` no longer loses its final character.
    """
    return os.path.basename(url.rstrip('/'))
 def image_url_to_filename(url):
    """Derive the local file name for an image from its URL.
    The name is ``<year><day>_<basename>`` where year and day come from the
    first ``YYYY/MM/`` or ``YYYY/MM/DD`` sequence in the URL path; the day
    defaults to ``01`` when absent. The query string is ignored.
    Raises ValueError when the path contains no such date.
    """
    path = urlparse(url).path
    found = re.search(r'(\d{4})/(\d{2})/(\d{2})?', path)
    if found is None:
        raise ValueError("no valid date found in URL")
    stem, suffix = os.path.splitext(os.path.basename(path))
    # NOTE(review): the month is captured but does not take part in the name,
    # so same-named images from different months of one year share a file
    # name - confirm whether that is intended.
    year, _month, day = found.groups()
    return f"{year}{day or '01'}_{stem}{suffix}"
 def extract_images_from_markdown(markdown_text):
    """Return the targets of all ``![alt](target)`` images, in order of appearance."""
    return [found.group(1) for found in re.finditer(r"!\[.*?\]\((.*?)\)", markdown_text)]
 def download_file(url, filename, timeout=30):
    """Download ``url`` to ``filename``.
    The body is streamed into ``<filename>.part`` and moved into place only
    after it has been written completely, so an interrupted download never
    leaves a truncated file under the final name. As before, nothing is
    written for a successful status other than 200.
    Args:
        url: address to fetch.
        filename: destination path; its directory must already exist.
        timeout: seconds passed to ``requests.get`` so a stalled server
            cannot block the caller forever.
    Raises:
        requests.HTTPError: for 4xx/5xx responses.
    """
    partial = f'{filename}.part'
    # the context manager releases the connection even when the body is
    # not consumed
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        if response.status_code != 200:
            return
        with open(partial, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
    os.replace(partial, filename)
--- a/idb/wordpress.py
+++ b/idb/wordpress.py
@ -1,13 +1,123 @@
-import requests
+import requests, re
 from bs4 import BeautifulSoup
 from html import unescape
-from markdownify import markdownify
+from markdownify import MarkdownConverter
-from collections import namedtuple
+from markdown import markdown
-
+from enum import Enum
 ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
-def fetch(url) -> ArticleContent:
+class WordpressMarkdownConverter(MarkdownConverter):
    def convert_p(self, el, text, convert_as_inline):
        """Convert a paragraph, special-casing image and centered paragraphs.
        A paragraph recognised by idb_convert_image becomes a bare image; a
        paragraph whose markup starts with the centered-style opening tag is
        wrapped in ``<center>`` with blank lines turned into ``<br>``.
        """
        image_md = self.idb_convert_image(el)
        if image_md is not None:
            return image_md
        if not str(el).startswith('<p style="text-align:center;">'):
            return super().convert_p(el, text, convert_as_inline)
        joined = text.replace('\n\n', '<br>')
        return f'<center>{joined}</center>\n\n'
    def _convert_hn(self, n, el, text, convert_as_inline):
        """Convert a heading, unless idb_convert_image recognises it as an image."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super()._convert_hn(n, el, text, convert_as_inline)
        return image_md
    def convert_a(self, el, *args):
        """Convert a link, unless idb_convert_image recognises it as an image."""
        image_md = self.idb_convert_image(el)
        if image_md is None:
            return super().convert_a(el, *args)
        return image_md
    def convert_div(self, el, *args):
        """Convert a ``<div>``; captioned-image containers become one image."""
        is_caption_box = str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"')
        image_md = self.idb_convert_image(el) if is_caption_box else None
        if image_md is not None:
            return image_md
        # NOTE(review): every other div is delegated to the parent's *link*
        # converter - confirm this is deliberate and not a copy-paste slip.
        return super().convert_a(el, *args)
    def idb_convert_image(self, el):
        """Return Markdown image syntax for image-only markup, else ``None``.
        Two shapes are recognised from the element's string form: a heading
        or paragraph containing an ``<img>`` (no alt text is emitted), and a
        captioned-image ``<div>`` whose caption text becomes the alt text.
        Any query string on the image URL is dropped.
        """
        markup = str(el)
        inline = re.search(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"', markup)
        if inline is not None:
            return f'![]({inline.group(1)})\n\n'
        captioned = re.search(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', markup, re.S)
        if captioned is None:
            return None
        src, title = captioned.groups()
        return f'![{unescape(title)}]({src})\n\n'
 def _markdownify(html, **options):
    """Convert ``html`` to Markdown with WordpressMarkdownConverter and the given options."""
    converter = WordpressMarkdownConverter(**options)
    return converter.convert(html)
 def markdown_from_html(html):
    """Convert article HTML to Markdown, centering separator-only paragraphs.
    After conversion the text is split on blank lines; every chunk is
    stripped, and chunks consisting only of a separator such as ``#`` or
    ``___`` are wrapped in ``<center>``.
    """
    separators = ('#', '# #', '# # #', '##', '###', '___')
    md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip()
    chunks = []
    for chunk in md.split('\n\n'):
        chunk = chunk.strip()
        chunks.append(f'<center>{chunk}</center>' if chunk in separators else chunk)
    return '\n\n'.join(chunks)
 def html_from_markdown(s):
    """Render the Markdown string ``s`` to HTML with the ``extra`` and ``tables`` extensions."""
    enabled_extensions = ['extra', 'tables']
    return markdown(s, extensions=enabled_extensions)
 class Article:
    """An article held as a title plus matching HTML and Markdown bodies."""
    title: str
    html: str
    md: str
    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md
    @classmethod
    def from_html(cls, title, html):
        """Build an article from HTML; the Markdown body is derived from it."""
        return cls(title, html, markdown_from_html(html))
    @classmethod
    def from_markdown(cls, title, md):
        """Build an article from Markdown; the HTML body is rendered from it."""
        return cls(title, html_from_markdown(md), md)
    @staticmethod
    def _split_title(text):
        """Split ``text`` into ``(title, body)`` using its first line as title.
        The first line must start with ``#``; the heading markers and the
        surrounding whitespace are removed from the title. One blank line
        directly after the title is dropped and the rest of the text is
        returned unchanged. Raises ValueError when the first line is not a
        heading, which includes empty text.
        """
        first_line, _, rest = text.partition('\n')
        first_line = first_line.strip()
        if not first_line.startswith('#'):
            raise ValueError('first line must start with #')
        title = first_line.lstrip('#').strip()
        second_line, _, remainder = rest.partition('\n')
        if not second_line.strip():
            rest = remainder
        return title, rest
    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        """Load an article from a UTF-8 Markdown file.
        With ``with_title`` the first line must be a ``#`` heading and
        becomes the title; otherwise the whole file is the body and the title
        is empty. Raises ValueError when a title is expected but missing.
        """
        with open(filename, encoding='utf-8') as f:
            text = f.read()
        if not with_title:
            return cls.from_markdown('', text)
        # The body is kept verbatim: re-joining the lines with blank lines
        # in between would turn every source line into its own paragraph.
        title, md = cls._split_title(text)
        return cls.from_markdown(title, md)
 class Language(Enum):
    """Article languages, with a two-letter code as each member's value."""
    English = 'en'
    Russian = 'ru'
 def fetch_article(url) -> Article:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent:
    html = str(soup.find("div", class_="entry-content")).strip()
    title = unescape(soup.find(class_="entry-title").get_text(strip=True))
    md = markdownify(html).strip()
-    return ArticleContent(title, html, md)
+    return Article.from_html(title, html)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,9 @@
 odfpy~=1.4.1
 pillow~=11.1.0
 beautifulsoup4~=4.13.3
 markdownify~=0.14.1
 requests~=2.32.3
 markdownify~=0.14.1
 Markdown~=3.7
 tiktoken~=0.8.0
 openai~=1.61.1
-python-dotenv~=1.0.1
+python-dotenv~=1.0.1
 tiktoken~=0.8.0
--- a/single_article.py
+++ b/single_article.py
@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 from argparse import ArgumentParser
-from idb.wordpress import fetch
+from idb import fetch_article, translate_markdown
 from idb.translator import translate
 from dotenv import load_dotenv
 load_dotenv()
@ -18,8 +17,8 @@ if __name__ == '__main__':
                        help="output files")
    args = parser.parse_args()
-    article = fetch(args.url)
+    a = fetch_article(args.url)
-    translation = translate(article.md)
+    translation = translate_markdown(a.md)
-    save(args.output[0], article.md)
+    save(args.output[0], a.md)
    save(args.output[1], translation)
--- a/tzo_images.py
+++ b/tzo_images.py
@ -0,0 +1,19 @@
 #!/usr/bin/env python3
 import os.path
 from idb import tzo_urls
 from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
 from dotenv import load_dotenv
 load_dotenv()
 if __name__ == '__main__':
    # Download every image referenced by the Russian text of each article
    # into the images directory next to this script.
    base_dir = os.path.dirname(__file__)
    for url in tzo_urls:
        markdown_file = os.path.join(base_dir, 'tzo', f'{name_from_url(url)}-ru.txt')
        for image_url in extract_images_from_markdown(read_file(markdown_file)):
            image_name = image_url_to_filename(image_url)
            download_file(image_url, os.path.join(base_dir, 'images', image_name))
            print(f'{image_name} saved')
--- a/tzo_odt.py
+++ b/tzo_odt.py
@ -0,0 +1,15 @@
 #!/usr/bin/env python3
 import os.path
 from idb import Article, DocumentCreator, tzo_urls
 from idb.util import name_from_url
 if __name__ == '__main__':
    # Render one side-by-side ODT per article from its saved Russian and
    # English Markdown files.
    base_dir = os.path.dirname(__file__)
    def load(name, lang):
        # article texts are stored as tzo/<name>-<lang>.txt
        return Article.from_markdown_file(os.path.join(base_dir, 'tzo', f'{name}-{lang}.txt'), with_title=False)
    for url in tzo_urls:
        name = name_from_url(url)
        orig = load(name, 'ru')
        trans = load(name, 'en')
        DocumentCreator().create(orig, trans, os.path.join(base_dir, f'{name}.odt'))