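many changes: add ODT-to-Markdown reader (DocumentReader, odt_to_md.py), move TZO URL lists into idb/tzo.py, replace cartier_odt.py with single_odt.py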

2025-05-17 04:28:55 +03:00 · 2025-05-17 04:28:55 +03:00 · de9084dfd5
commit de9084dfd5
parent 7f4b460c96
10 changed files with 359 additions and 54 deletions
--- a/cartier_odt.py
+++ b/cartier_odt.py
@ -1,24 +0,0 @@
-#!/usr/bin/env python3
-import os.path
-from idb import Article, DocumentCreator
-from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
-
-
-if __name__ == '__main__':
-    name = 'cartier3'
-    orig_path = os.path.join(os.path.dirname(__file__), f'{name}_ru')
-    trans_path = os.path.join(os.path.dirname(__file__), f'{name}_en')
-
-    orig = Article.from_markdown_file(orig_path, with_title=False)
-    trans = Article.from_markdown_file(trans_path, with_title=False)
-
-    image_urls = extract_images_from_markdown(read_file(orig_path))
-    for image_url in image_urls:
-        image_name = image_url_to_filename(image_url)
-        output_file = os.path.join(os.path.dirname(__file__), 'images', image_name)
-        if not os.path.exists(output_file):
-            download_file(image_url, output_file)
-        print(f'{image_name} saved')
-
-    doc = DocumentCreator()
-    doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt'))
--- a/idb/init.py
+++ b/idb/init.py
@ -1,24 +1,4 @@
 from .wordpress import Article, fetch_article
 from .translator import translate_markdown
 from .doc import DocumentCreator
-
-tzo_urls = (
-    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
-    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
-    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
-    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
-    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
-    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
-    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
-    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
-    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
-    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
-    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
-    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
-    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
-    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
-    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
-    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
-    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
-    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
-)
+from .tzo import tzo_urls, after_tzo_urls
--- a/idb/doc.py
+++ b/idb/doc.py
@ -1,12 +1,14 @@
 import os.path
+import zipfile

-from odf.opendocument import OpenDocumentText
+from odf.opendocument import OpenDocumentText, load
 from odf.text import P, H, Span, A, LineBreak, List, ListItem
 from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
 from odf.table import TableColumn, TableCell, TableRow, Table
 from odf.draw import Frame, Image

 from PIL import Image as PILImage
+from io import BytesIO

 from bs4 import BeautifulSoup
 from idb import Article
@ -422,3 +424,180 @@ class DocumentCreator:

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
+
+
+class DocumentReader:
+    """Convert one column of an ODT "ComparisonTable" into Markdown.
+
+    The input file is opened twice: through odfpy (``self.doc``) to walk the
+    XML tree, and as a zip archive (``self.package``) so that embedded files
+    can be read by their path inside the package.
+
+    The zip archive stays open for the lifetime of the reader; call
+    ``close()`` or use the reader as a context manager to release it.
+    """
+
+    # Namespace URIs used to build the (namespace, local name) tuples that
+    # key the ``attributes`` dict of a node.
+    _FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
+    _STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'
+    _TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
+    _XLINK_NS = 'http://www.w3.org/1999/xlink'
+
+    def __init__(self, input_file):
+        """Load ``input_file`` and pre-compute the style lookup tables."""
+        self.doc = load(input_file)
+        self.package = zipfile.ZipFile(input_file)
+
+        # Paragraph style name -> text alignment (family="paragraph").
+        self.style_alignments = self.build_style_alignments(self.doc)
+        # Text style name -> bold/italic/underline flags (family="text").
+        self.text_style_formats = self.build_text_styles(self.doc)
+
+    def close(self):
+        """Close the zip archive opened by the constructor."""
+        self.package.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def _children_markdown(self, node, indent):
+        """Return the concatenated Markdown of every child of ``node``."""
+        return ''.join(self.parse_node(child, indent) for child in node.childNodes)
+
+    def parse_node(self, node, indent=0):
+        """Recursively convert ``node`` and its children to Markdown.
+
+        Args:
+            node: a plain string, a text node, or an element exposing
+                ``tagName`` and ``childNodes``.
+            indent: current list nesting depth; only list items increase it.
+
+        Returns:
+            The Markdown text for the node.
+        """
+        if isinstance(node, str):
+            return node
+        try:
+            if node.nodeType == node.TEXT_NODE:
+                return node.data
+        except AttributeError:
+            # Not a DOM-style text node; fall through to tag handling.
+            pass
+
+        tag = node.tagName
+        if tag == "text:h":
+            # Attributes are keyed by (namespace, local name) tuples, so the
+            # outline level has to be looked up with the text namespace.
+            level_str = node.attributes.get((self._TEXT_NS, 'outline-level'), "1")
+            try:
+                level = int(level_str)
+            except (TypeError, ValueError):
+                level = 1
+            # Markdown only has heading levels 1 through 6.
+            level = max(1, min(level, 6))
+            content = self._children_markdown(node, indent)
+            return f'{"#" * level} {content}\n\n'
+
+        if tag == 'text:p':
+            # A paragraph may carry no style name at all; normalise to ''
+            # so the string checks below cannot fail on None.
+            style = node.getAttribute('stylename') or ''
+            content = self._children_markdown(node, indent)
+
+            if style:
+                style_align = self.style_alignments.get(style, '')
+                if style_align == 'center' or style.lower() == 'center':
+                    return f"<center>{content}</center>\n\n"
+
+            if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
+                lines = content.splitlines()
+                content = "\n".join(["> " + line for line in lines])
+
+            return content + "\n\n"
+
+        if tag == "text:list":
+            return self._children_markdown(node, indent) + "\n"
+
+        if tag == "text:list-item":
+            item_text = self._children_markdown(node, indent + 1)
+            lines = item_text.splitlines()
+            if not lines:
+                return ""
+            # The first line carries the bullet; the remaining lines are
+            # indented one level deeper so they stay inside the item.
+            prefix = "    " * indent + "- "
+            continuation = "    " * (indent + 1)
+            new_lines = [prefix + lines[0]]
+            new_lines.extend(continuation + line for line in lines[1:])
+            return "\n".join(new_lines) + "\n"
+
+        if tag == "text:span":
+            style_name = node.getAttribute("stylename") or ""
+            content = self._children_markdown(node, indent)
+            fmt = self.text_style_formats.get(style_name, {})
+            md_text = content
+            if fmt.get("bold") and fmt.get("italic"):
+                md_text = f"***{md_text}***"
+            else:
+                if fmt.get("bold"):
+                    md_text = f"**{md_text}**"
+                if fmt.get("italic"):
+                    md_text = f"*{md_text}*"
+            if fmt.get("underline"):
+                md_text = f"<u>{md_text}</u>"
+            return md_text
+
+        if tag == "text:a":
+            href = node.getAttribute("href")
+            content = self._children_markdown(node, indent)
+            if href:
+                return f"[{content}]({href})"
+            return content
+
+        if tag == "text:line-break":
+            # Two trailing spaces force a hard line break in Markdown.
+            return "  \n"
+
+        if tag == "draw:frame":
+            md = ""
+            caption_text = ""
+            for child in node.childNodes:
+                child_tag = getattr(child, "tagName", None)
+                if child_tag == "draw:image":
+                    href = child.attributes.get((self._XLINK_NS, 'href'))
+                    # Without an href there is nothing to link to; skip the
+                    # image instead of emitting "![](None)".
+                    if href:
+                        md += f"![]({href})"
+                elif child_tag == "draw:caption":
+                    caption_text = self._children_markdown(child, indent).strip()
+            if caption_text:
+                md += "\n" + caption_text + "\n"
+            return md
+
+        # Unknown element: keep whatever its children render to.
+        return self._children_markdown(node, indent)
+
+    def get_embedded_image_size(self, file_name) -> tuple[int, int]:
+        """Return ``(width, height)`` of the package member ``file_name``."""
+        data = self.package.read(file_name)
+        with PILImage.open(BytesIO(data)) as img:
+            return img.size
+
+    def get_markdown(self, column=1) -> str:
+        """Render one column of the table named "ComparisonTable".
+
+        Args:
+            column: index into the cells of each row.
+
+        Returns:
+            The Markdown of the selected cell of every row that has at least
+            two cells, joined by blank lines.
+
+        Raises:
+            RuntimeError: if the document has no table named
+                "ComparisonTable".
+            IndexError: if ``column`` is out of range for a row.
+        """
+        comp_table = None
+        for tbl in self.doc.getElementsByType(Table):
+            if tbl.getAttribute("name") == "ComparisonTable":
+                comp_table = tbl
+                break
+        if comp_table is None:
+            raise RuntimeError("ComparisonTable not found in the document.")
+
+        md_lines = []
+        for row in comp_table.getElementsByType(TableRow):
+            cells = row.getElementsByType(TableCell)
+            if len(cells) >= 2:
+                cell_md = ''.join(self.parse_node(child) for child in cells[column].childNodes)
+                # Remove any extra whitespace.
+                md_lines.append(cell_md.strip())
+
+        return "\n\n".join(md_lines)
+
+    @staticmethod
+    def build_style_alignments(doc):
+        """Map paragraph style names to their lower-cased text alignment.
+
+        Both automatic and named styles are scanned; styles without a
+        text-align property are left out of the result.
+        """
+        alignments = {}
+        for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
+            if style.getAttribute('family') != 'paragraph':
+                continue
+            para_props = style.getElementsByType(ParagraphProperties)
+            if not para_props:
+                continue
+            attr_val = para_props[0].attributes.get((DocumentReader._FO_NS, 'text-align'), '')
+            if attr_val:
+                alignments[style.getAttribute('name')] = attr_val.lower().strip()
+        return alignments
+
+    @staticmethod
+    def build_text_styles(doc):
+        """Map text style names to ``{'bold', 'italic', 'underline'}`` flags.
+
+        Both automatic and named styles are scanned; styles without text
+        properties are left out of the result.
+        """
+        text_styles = {}
+        for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
+            if style.getAttribute('family') != 'text':
+                continue
+            text_props = style.getElementsByType(TextProperties)
+            if not text_props:
+                continue
+            props = text_props[0].attributes
+            fw = props.get((DocumentReader._FO_NS, 'font-weight'), '').lower()
+            fs = props.get((DocumentReader._FO_NS, 'font-style'), '').lower()
+            tu = props.get((DocumentReader._STYLE_NS, 'text-underline-style'), '').lower()
+            text_styles[style.getAttribute('name')] = {
+                'bold': 'bold' in fw or fw in ('700', '800', '900'),
+                'italic': 'italic' in fs,
+                'underline': bool(tu) and tu != 'none',
+            }
+        return text_styles
--- a/idb/translator.py
+++ b/idb/translator.py
@ -10,7 +10,7 @@ Translate the following text from Russian to English while strictly preserving t

 Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
 """
-input_token_limit = 3500
+input_token_limit = 5000


 def translate_markdown(text):
--- a/idb/tzo.py
+++ b/idb/tzo.py
@ -0,0 +1,78 @@
+import os
+import re
+from PIL import Image
+from collections import namedtuple
+from .util import image_url_to_filename
+
+# Article URLs of the "beyond-clouds" series on kniganews.org.
+tzo_urls = (
+    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
+    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
+    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
+    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
+    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
+    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
+    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
+    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
+    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
+    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
+    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
+    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
+    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
+    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
+    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
+    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
+    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
+    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
+)
+# Additional article URLs kept apart from the main series.
+after_tzo_urls = (
+    'https://kniganews.org/2012/11/17/langlands-plus/',
+)
+# One image record: source URL, local file name, resolved local path, and the
+# width/height read from the local file.
+ImageInfo = namedtuple('ImageInfo', ('url', 'local_name', 'local_path', 'width', 'height'))
+
+
+class ImageList:
+    """Collection of locally stored images that can be looked up by size."""
+
+    images: list[ImageInfo]
+
+    def __init__(self):
+        self.images = []
+
+    def add_image(self, url):
+        """Record the image for ``url`` together with its local file and size.
+
+        The local file name comes from ``image_url_to_filename`` and is
+        resolved inside the ``images`` directory next to this package. The
+        file must already exist, because it is opened to read its size.
+        """
+        local_name = image_url_to_filename(url)
+        local_path = os.path.realpath(os.path.join(
+            os.path.dirname(__file__),
+            '..',
+            'images',
+            local_name
+        ))
+        # Only the size is needed, so release the file handle right away.
+        with Image.open(local_path) as image:
+            width, height = image.size
+        self.images.append(ImageInfo(url, local_name, local_path, width, height))
+
+    def get_images_by_size(self, w, h) -> list[ImageInfo]:
+        """Return every recorded image whose size is exactly ``w`` x ``h``."""
+        return [image for image in self.images if image.width == w and image.height == h]
+
+
+def get_part_by_odt_name(name: str) -> int:
+    """Extract the part number from a ``beyond-clouds-<N>[v<M>].odt`` name.
+
+    Args:
+        name: bare file name (no directory), e.g. ``beyond-clouds-21.odt``
+            or ``beyond-clouds-21v2.odt``.
+
+    Returns:
+        The part number ``N`` as an integer; a ``v<M>`` suffix is ignored.
+
+    Raises:
+        ValueError: if ``name`` does not follow the expected pattern.
+    """
+    m = re.match(r'^beyond-clouds-(\d+)(?:v\d+)?\.odt$', name)
+    if not m:
+        raise ValueError('could not parse file name')
+    # The group is matched by \d+, so it is always a valid integer literal.
+    return int(m.group(1))
+
+
+def part_image_list(part) -> ImageList:
+    """Build the ``ImageList`` for one part from its Russian Markdown source.
+
+    Reads ``tzo/beyond-clouds-<part>-ru.txt`` (relative to the parent of this
+    package), collects the target of every Markdown image in it and adds
+    each one to a new ``ImageList``.
+    """
+    file = os.path.realpath(os.path.join(
+        os.path.dirname(__file__),
+        '..',
+        'tzo',
+        f'beyond-clouds-{part}-ru.txt',
+    ))
+    # The source is Russian text; decode it explicitly instead of relying on
+    # the platform default encoding.
+    # NOTE(review): assumes the file is stored as UTF-8 - confirm.
+    with open(file, encoding='utf-8') as f:
+        txt = f.read()
+    urls = re.findall(r'!\[.*?]\((.*?)\)', txt)
+
+    images = ImageList()
+    for url in urls:
+        images.add_image(url)
+
+    return images
--- a/idb/util.py
+++ b/idb/util.py
@ -21,12 +21,12 @@ def image_url_to_filename(url):
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    name, ext = os.path.splitext(filename)
-    date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path)
+    date_match = re.search(r'(\d{4})/(\d{2})', parsed_url.path)
    if not date_match:
        raise ValueError("no valid date found in URL")
    year = date_match.group(1)
-    day = date_match.group(3) if date_match.group(3) else "01"
-    return f"{year}{day}_{name}{ext}"
+    month = date_match.group(2)
+    return f"{year}{month}_{name}{ext}"


 def extract_images_from_markdown(markdown_text):
--- a/odt_to_md.py
+++ b/odt_to_md.py
@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import re
+
+from argparse import ArgumentParser
+from os.path import basename
+
+from idb import tzo
+from idb.doc import DocumentReader
+from idb.tzo import get_part_by_odt_name
+
+
+def tzo_replace_images(md: str,
+                       tzo_part: int,
+                       dr: DocumentReader):
+    """Swap embedded-image references in ``md`` for their source URLs.
+
+    Each Markdown image in ``md`` is looked up in the document package to
+    get its size, and is replaced by one Markdown image per entry of the
+    part's image list that has the same size (keeping the title, if any).
+    An image with no same-sized entry is replaced by an empty string.
+    """
+    il = tzo.part_image_list(tzo_part)
+
+    def _markdown_image(image, title) -> str:
+        # Render one image, with or without a quoted title.
+        return f'![]({image.url} "{title}")' if title else f'![]({image.url})'
+
+    def _repl(match: re.Match) -> str:
+        # The alt text of the original reference is not used.
+        _alt, embedded_path, title = match.groups()
+        width, height = dr.get_embedded_image_size(embedded_path)
+        same_size = il.get_images_by_size(width, height)
+        return ''.join(_markdown_image(candidate, title) for candidate in same_size)
+
+    image_pattern = re.compile(r'!\[(.*?)]\((\S+?)(?:\s+"(.*?)")?\)')
+    return image_pattern.sub(_repl, md)
+
+
+def main():
+    """Print the Markdown of one column of the input ODT file.
+
+    With ``--tzo`` the embedded images are additionally replaced through
+    ``tzo_replace_images``, using the part number parsed from the file name.
+    """
+    cli = ArgumentParser()
+    cli.add_argument('-i', '--input', required=True, type=str, help='Input file')
+    cli.add_argument('-c', '--column', default=1, type=int, help='Column number')
+    cli.add_argument('--tzo', action='store_true', help='TZO')
+    options = cli.parse_args()
+
+    document = DocumentReader(options.input)
+    markdown = document.get_markdown(options.column)
+    if options.tzo:
+        part = get_part_by_odt_name(basename(options.input))
+        markdown = tzo_replace_images(markdown, part, document)
+
+    print(markdown)
+
+
+if __name__ == '__main__':
+    main()
--- a/single_odt.py
+++ b/single_odt.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+import os.path
+from idb import Article, DocumentCreator
+from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
+from argparse import ArgumentParser
+
+
+if __name__ == '__main__':
+    # Command line: both Markdown inputs and the ODT output are mandatory.
+    cli = ArgumentParser()
+    cli.add_argument('--ru-file', type=str, required=True, help='russian input file')
+    cli.add_argument('--en-file', type=str, required=True, help='english input file')
+    cli.add_argument('--output', type=str, required=True, help='output ODT file')
+    options = cli.parse_args()
+
+    orig = Article.from_markdown_file(options.ru_file, with_title=False)
+    trans = Article.from_markdown_file(options.en_file, with_title=False)
+
+    # Download every image referenced by the Russian source into the
+    # "images" directory next to this script, skipping files already there.
+    images_dir = os.path.join(os.path.dirname(__file__), 'images')
+    for image_url in extract_images_from_markdown(read_file(options.ru_file)):
+        image_name = image_url_to_filename(image_url)
+        target = os.path.join(images_dir, image_name)
+        if not os.path.exists(target):
+            download_file(image_url, target)
+        print(f'{image_name} saved')
+
+    creator = DocumentCreator()
+    creator.create(orig, trans, options.output)
--- a/tzo_images.py
+++ b/tzo_images.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os.path
-from idb import tzo_urls
+from argparse import ArgumentParser
+from idb import tzo_urls, after_tzo_urls
 from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
 from dotenv import load_dotenv

@ -8,7 +9,13 @@ load_dotenv()


 if __name__ == '__main__':
-    for url in tzo_urls:
+    parser = ArgumentParser()
+    parser.add_argument('--after', action='store_true')
+    args = parser.parse_args()
+
+    urls = tzo_urls if not args.after else after_tzo_urls
+
+    for url in urls:
        name = name_from_url(url)
        markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
        image_urls = extract_images_from_markdown(read_file(markdown_file))
--- a/tzo_odt.py
+++ b/tzo_odt.py
@ -1,11 +1,18 @@
 #!/usr/bin/env python3
 import os.path
-from idb import Article, DocumentCreator, tzo_urls
+from argparse import ArgumentParser
+from idb import Article, DocumentCreator, tzo_urls, after_tzo_urls
 from idb.util import name_from_url


 if __name__ == '__main__':
-    for url in tzo_urls:
+    parser = ArgumentParser()
+    parser.add_argument('--after', action='store_true')
+    args = parser.parse_args()
+
+    urls = tzo_urls if not args.after else after_tzo_urls
+
+    for url in urls:
        name = name_from_url(url)

        orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)