md -> odt converter, first commit
parent 2ebf5f18fa
commit 7f4b460c96

10  .gitignore  vendored
@@ -1,3 +1,11 @@
/.idea
/.venv
/.env
/.env
/tzo_save.py
/test*py
/*.html
/*.odt
/*.md
/*.txt
/*.zip
/.DS_Store

24  cartier_odt.py  Executable file
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file


if __name__ == '__main__':
    name = 'cartier3'
    orig_path = os.path.join(os.path.dirname(__file__), f'{name}_ru')
    trans_path = os.path.join(os.path.dirname(__file__), f'{name}_en')

    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    image_urls = extract_images_from_markdown(read_file(orig_path))
    for image_url in image_urls:
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(os.path.dirname(__file__), 'images', image_name)
        if not os.path.exists(output_file):
            download_file(image_url, output_file)
            print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt'))

24  idb/__init__.py
@@ -0,0 +1,24 @@
from .wordpress import Article, fetch_article
from .translator import translate_markdown
from .doc import DocumentCreator

tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)

424  idb/doc.py  Normal file
@@ -0,0 +1,424 @@
import os.path

from odf.opendocument import OpenDocumentText
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image

from PIL import Image as PILImage

from bs4 import BeautifulSoup
from idb import Article
from idb.util import image_url_to_filename

PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
ITALIC_STYLE_NAME = 'Italic'
BOLD_STYLE_NAME = 'Bold'
CAPTION_STYLE_NAME = 'Caption'
UNDERLINE_STYLE_NAME = 'Underline'
CENTER_STYLE_NAME = 'CenterAligned'
TITLE_STYLE_NAME = 'Title'


def add_child(parent, child):
    if hasattr(child, "qname"):
        parent.addElement(child)
    else:
        parent.addText(child)


def calc_frame_dimensions(image_path, desired_width_cm):
    with PILImage.open(image_path) as img:
        orig_width, orig_height = img.size

    dpi = 96.0
    orig_width_cm = (orig_width / dpi) * 2.54
    orig_height_cm = (orig_height / dpi) * 2.54

    scale = desired_width_cm / orig_width_cm
    new_height_cm = orig_height_cm * scale

    return f"{desired_width_cm}cm", f"{new_height_cm}cm"


class ImageWrap:
    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption

    def get_elements(self, doc):
        embedded_href = doc.addPicture(self.image_file)

        desired_width = 13.5
        width_str, height_str = calc_frame_dimensions(self.image_file, desired_width)

        frm = Frame(width=width_str, height=height_str)
        img = Image(href=embedded_href, type="simple", show="embed", actuate="onLoad")
        frm.addElement(img)

        elements = [frm]

        if self.caption:
            caption = P(stylename=CAPTION_STYLE_NAME)
            caption.addText(self.caption)
            elements.append(caption)

        return elements


class DocumentCreator:
    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    def set_styles(self):
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_props = PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        )
        landscape_layout.addElement(landscape_props)
        self.doc.automaticstyles.addElement(landscape_layout)

        masterpage = MasterPage(name="Standard", pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        self.doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))
        self.doc.automaticstyles.addElement(style)

        # caption
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777'
        }))
        style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'))
        self.doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        self.doc.automaticstyles.addElement(style)

        # blockquote
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        self.doc.styles.addElement(style)

        # title
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        self.doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        self.doc.automaticstyles.addElement(style)

    def process_inline(self, node):
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()

        simple_tags = (
            ('strong', 'b'),
            ('em', 'i'),
            ('ins', 'u')
        )
        simple_styles = (
            BOLD_STYLE_NAME,
            ITALIC_STYLE_NAME,
            UNDERLINE_STYLE_NAME
        )
        for i, tags_list in enumerate(simple_tags):
            if tag in tags_list:
                span = Span(stylename=simple_styles[i])
                for child in node.contents:
                    for inline in self.process_inline(child):
                        add_child(span, inline)
                return [span]

        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]

        elif tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]

        elif tag == "img":
            if node.name and node.name.lower() == "img":
                return [self.process_img(node)]

        else:
            result = []
            for child in node.contents:
                result.extend(self.process_inline(child))
            return result

    def process_block(self, elem):
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # try converting heading
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem

                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            for child in elem.contents:
                for inline in self.process_inline(child):
                    if has_image and isinstance(inline, ImageWrap):
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        return elems

                    add_child(p_elem, inline)

            return p_elem

        elif tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        elif tag in ("ul", "ol"):
            odf_list = List()
            li_elements = elem.find_all("li", recursive=False)
            for li in li_elements:
                li_item = ListItem()
                p = P()
                for child in li.contents:
                    # if nested list is found, we'll process it later
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        add_child(p, inline)
                li_item.addElement(p)

                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        nested_list = self.process_block(child)
                        li_item.addElement(nested_list)

                odf_list.addElement(li_item)
            return odf_list

        elif tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())

        elif tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        elif tag == "table":
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    cell_odf_elements = self.html_to_odf_elements(cell_html)
                    for el in cell_odf_elements:
                        cell_elem.addElement(el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        elif tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        elif tag == "br":
            return LineBreak()

        else:
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem

    def try_process_heading(self, elem):
        if not elem.name:
            return

        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())

    def process_img(self, elem) -> ImageWrap:
        href = elem.get("src")
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')

        alt = elem.get('alt')
        return ImageWrap(saved_file, alt)

    def html_to_odf_elements(self, html):
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        keeping = False
        stack = []
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                p = P()
                p.addText(node)
                elements.append(p)
            elif node.name:
                if node.name.lower() in ("h1", "h2", "h3", "h4") and node.get_text().strip().lower() in (end.lower() for end in ending_headings):
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
                if keeping:
                    stack.append(result)
                else:
                    elements.append(result)
        if stack:
            elements.append(stack)
        return elements

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        max_len = max(len(orig_elements), len(trans_elements))
        while len(orig_elements) < max_len:
            orig_elements.append(P())
        while len(trans_elements) < max_len:
            trans_elements.append(P())

        main_table = Table(name="ComparisonTable")

        col1 = TableColumn()
        col2 = TableColumn()

        main_table.addElement(col1)
        main_table.addElement(col2)

        if with_title:
            # headings
            header_row = TableRow()

            header_cell_left = TableCell()
            header_cell_right = TableCell()

            header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title))
            header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title))

            header_row.addElement(header_cell_left)
            header_row.addElement(header_cell_right)

            main_table.addElement(header_row)

        # content
        for i in range(max_len):
            row = TableRow()

            cell_orig = TableCell()
            cell_trans = TableCell()

            if isinstance(orig_elements[i], list):
                for elem in orig_elements[i]:
                    cell_orig.addElement(elem)
            else:
                cell_orig.addElement(orig_elements[i])

            if isinstance(trans_elements[i], list):
                for elem in trans_elements[i]:
                    cell_trans.addElement(elem)
            else:
                cell_trans.addElement(trans_elements[i])

            row.addElement(cell_orig)
            row.addElement(cell_trans)

            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
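
As a quick sanity check of the frame sizing in calc_frame_dimensions above, a hedged worked example; the 1280x720 px image size is an illustrative assumption, not something from the repository:

# assumed input: a 1280x720 px image, fitted to the 13.5 cm width used in ImageWrap.get_elements
orig_width_cm = (1280 / 96.0) * 2.54    # ~33.87 cm at the assumed 96 dpi
orig_height_cm = (720 / 96.0) * 2.54    # ~19.05 cm
scale = 13.5 / orig_width_cm            # ~0.399
new_height_cm = orig_height_cm * scale  # ~7.59 cm, so the frame ends up "13.5cm" wide and roughly "7.59cm" tall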

idb/translator.py
@@ -2,11 +2,18 @@ import tiktoken
from openai import OpenAI

model = "gpt-4o"
system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations."
input_token_limit = 2000
system_prompt = """
Translate the following text from Russian to English, strictly preserving the markup and keeping the following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt](url))

Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
input_token_limit = 3500


def translate(text):
def translate_markdown(text):
    buf = []
    bufsize = 0
    cl = OpenAI()

44  idb/util.py  Normal file
@@ -0,0 +1,44 @@
import re, os.path
import requests
from urllib.parse import urlparse


def save_file(file, content):
    with open(file, 'w') as f:
        f.write(content)


def read_file(filename):
    with open(filename) as f:
        return f.read()


def name_from_url(url):
    return os.path.basename(url[:-1])


def image_url_to_filename(url):
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    name, ext = os.path.splitext(filename)
    date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path)
    if not date_match:
        raise ValueError("no valid date found in URL")
    year = date_match.group(1)
    day = date_match.group(3) if date_match.group(3) else "01"
    return f"{year}{day}_{name}{ext}"


def extract_images_from_markdown(markdown_text):
    image_pattern = r"!\[.*?\]\((.*?)\)"
    images = re.findall(image_pattern, markdown_text)
    return images


def download_file(url, filename):
    response = requests.get(url, stream=True)
    response.raise_for_status()
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
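
A hedged usage sketch of the two image helpers above; the URL is illustrative, and the resulting name follows from the regex (the month is unused, and the day falls back to "01" when the path has no day segment):

from idb.util import extract_images_from_markdown, image_url_to_filename

md = 'intro text ![cloud picture](https://example.files.wordpress.com/2012/12/cloud.jpg)'
urls = extract_images_from_markdown(md)   # ['https://example.files.wordpress.com/2012/12/cloud.jpg']
print(image_url_to_filename(urls[0]))     # '201201_cloud.jpg'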

125  idb/wordpress.py
@@ -1,13 +1,123 @@
import requests
import requests, re
from bs4 import BeautifulSoup
from html import unescape
from markdownify import markdownify
from collections import namedtuple

ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
from markdownify import MarkdownConverter
from markdown import markdown
from enum import Enum


def fetch(url) -> ArticleContent:
class WordpressMarkdownConverter(MarkdownConverter):
    def convert_p(self, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        if str(el).startswith('<p style="text-align:center;">'):
            text = text.replace('\n\n', '<br>')
            return f'<center>{text}</center>\n\n'
        return super().convert_p(el, text, convert_as_inline)

    def _convert_hn(self, n, el, text, convert_as_inline):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super()._convert_hn(n, el, text, convert_as_inline)

    def convert_a(self, el, *args):
        md = self.idb_convert_image(el)
        if md is not None:
            return md
        return super().convert_a(el, *args)

    def convert_div(self, el, *args):
        if str(el).startswith('<div class="wp-caption aligncenter" data-shortcode="caption"'):
            md = self.idb_convert_image(el)
            if md is not None:
                return md
        return super().convert_a(el, *args)

    def idb_convert_image(self, el):
        html = str(el)

        pattern = re.compile(r'^<(?:h[1-6]|p)[^>]*>.*?<img.*?src="([^"?]+)(?:\?[^"]*)?"')
        match = pattern.search(html)
        if match:
            return f'![]({match.group(1)})\n\n'

        pattern = re.compile(r'^<div class="wp-caption aligncenter" data-shortcode="caption"[^>]+><a[^>]+><img alt="[^"]*" aria-describedby="caption-attachment.*?src="([^"?]+)(?:\?[^"]*)?".*?/></a><p.*?id="caption-attachment[^"]+"[^>]*>(.*?)</p>', re.S)
        match = pattern.search(html)
        if match:
            src, title = match.groups()
            title = unescape(title)
            return f'![{title}]({src})\n\n'


def _markdownify(html, **options):
    return WordpressMarkdownConverter(**options).convert(html)


def markdown_from_html(html):
    def mapper(s):
        s = s.strip()
        if s in ('#', '# #', '# # #', '##', '###', '___'):
            return f'<center>{s}</center>'
        return s

    md = _markdownify(html, keep_inline_images_in=['a', 'h1', 'div', 'p']).strip()
    return '\n\n'.join(map(mapper, md.split('\n\n')))


def html_from_markdown(s):
    return markdown(s, extensions=['extra', 'tables'])


class Article:
    title: str
    html: str
    md: str

    def __init__(self, title, html, md):
        self.title = title
        self.html = html
        self.md = md

    @classmethod
    def from_html(cls, title, html):
        return Article(title, html, markdown_from_html(html))

    @classmethod
    def from_markdown(cls, title, md):
        return Article(title, html_from_markdown(md), md)

    @classmethod
    def from_markdown_file(cls, filename, with_title=True):
        if with_title:
            with open(filename) as f:
                lines = f.readlines()

            first_line = lines[0].strip()
            if not first_line.startswith('#'):
                raise ValueError('first line must start with #')
            title = first_line.lstrip('#').strip()

            lines.pop(0)
            if lines and not lines[0].strip():
                lines.pop(0)

            md = '\n\n'.join(lines)
        else:
            with open(filename) as f:
                md = f.read()
            title = ''

        return cls.from_markdown(title, md)


class Language(Enum):
    English = 'en'
    Russian = 'ru'


def fetch_article(url) -> Article:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

@@ -17,6 +127,5 @@ def fetch(url) -> ArticleContent:
    html = str(soup.find("div", class_="entry-content")).strip()
    title = unescape(soup.find(class_="entry-title").get_text(strip=True))
    md = markdownify(html).strip()

    return ArticleContent(title, html, md)
    return Article.from_html(title, html)

requirements.txt
@@ -1,6 +1,9 @@
odfpy~=1.4.1
pillow~=11.1.0
beautifulsoup4~=4.13.3
markdownify~=0.14.1
requests~=2.32.3
markdownify~=0.14.1
Markdown~=3.7
tiktoken~=0.8.0
openai~=1.61.1
python-dotenv~=1.0.1
tiktoken~=0.8.0
python-dotenv~=1.0.1

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
from idb.wordpress import fetch
from idb.translator import translate
from idb import fetch_article, translate_markdown
from dotenv import load_dotenv

load_dotenv()

@@ -18,8 +17,8 @@ if __name__ == '__main__':
                        help="output files")
    args = parser.parse_args()

    article = fetch(args.url)
    translation = translate(article.md)
    a = fetch_article(args.url)
    translation = translate_markdown(a.md)

    save(args.output[0], article.md)
    save(args.output[0], a.md)
    save(args.output[1], translation)

19  tzo_images.py  Executable file
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
import os.path
from idb import tzo_urls
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
from dotenv import load_dotenv

load_dotenv()


if __name__ == '__main__':
    for url in tzo_urls:
        name = name_from_url(url)
        markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
        image_urls = extract_images_from_markdown(read_file(markdown_file))
        for image_url in image_urls:
            image_name = image_url_to_filename(image_url)
            output_file = os.path.join(os.path.dirname(__file__), 'images', image_name)
            download_file(image_url, output_file)
            print(f'{image_name} saved')

15  tzo_odt.py  Executable file
@@ -0,0 +1,15 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator, tzo_urls
from idb.util import name_from_url


if __name__ == '__main__':
    for url in tzo_urls:
        name = name_from_url(url)

        orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)
        trans = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-en.txt'), with_title=False)

        doc = DocumentCreator()
        doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt'))
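
Taken together, the new modules form a small fetch -> translate -> download-images -> ODT pipeline. A hedged end-to-end sketch (the local file names and output path are illustrative; translate_markdown assumes an OpenAI key is available, e.g. via the .env file loaded with python-dotenv):

from idb import fetch_article, translate_markdown, Article, DocumentCreator
from idb.util import save_file, extract_images_from_markdown, image_url_to_filename, download_file

a = fetch_article('https://kniganews.org/2012/12/20/beyond-clouds-1/')   # WordPress post -> Article (html + md)
save_file('beyond-clouds-1_ru', a.md)
save_file('beyond-clouds-1_en', translate_markdown(a.md))                # Russian markdown -> English markdown

for url in extract_images_from_markdown(a.md):                           # cache referenced images locally
    download_file(url, 'images/' + image_url_to_filename(url))

orig = Article.from_markdown_file('beyond-clouds-1_ru', with_title=False)
trans = Article.from_markdown_file('beyond-clouds-1_en', with_title=False)
DocumentCreator().create(orig, trans, 'beyond-clouds-1.odt')             # two-column original/translation ODT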