diff --git a/cartier_odt.py b/cartier_odt.py deleted file mode 100755 index 07a9619..0000000 --- a/cartier_odt.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 -import os.path -from idb import Article, DocumentCreator -from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file - - -if __name__ == '__main__': - name = 'cartier3' - orig_path = os.path.join(os.path.dirname(__file__), f'{name}_ru') - trans_path = os.path.join(os.path.dirname(__file__), f'{name}_en') - - orig = Article.from_markdown_file(orig_path, with_title=False) - trans = Article.from_markdown_file(trans_path, with_title=False) - - image_urls = extract_images_from_markdown(read_file(orig_path)) - for image_url in image_urls: - image_name = image_url_to_filename(image_url) - output_file = os.path.join(os.path.dirname(__file__), 'images', image_name) - if not os.path.exists(output_file): - download_file(image_url, output_file) - print(f'{image_name} saved') - - doc = DocumentCreator() - doc.create(orig, trans, os.path.join(os.path.dirname(__file__), f'{name}.odt')) \ No newline at end of file diff --git a/idb/__init__.py b/idb/__init__.py index a2fd00b..897b087 100644 --- a/idb/__init__.py +++ b/idb/__init__.py @@ -1,24 +1,4 @@ from .wordpress import Article, fetch_article from .translator import translate_markdown from .doc import DocumentCreator - -tzo_urls = ( - 'https://kniganews.org/2012/12/20/beyond-clouds-1/', - 'https://kniganews.org/2012/12/21/beyond-clouds-21/', - 'https://kniganews.org/2012/12/22/beyond-clouds-22/', - 'https://kniganews.org/2012/12/23/beyond-clouds-31/', - 'https://kniganews.org/2012/12/24/beyond-clouds-32/', - 'https://kniganews.org/2012/12/25/beyond-clouds-33/', - 'https://kniganews.org/2012/12/28/beyond-clouds-41/', - 'https://kniganews.org/2012/12/29/beyond-clouds-42/', - 'https://kniganews.org/2012/12/30/beyond-clouds-43/', - 'https://kniganews.org/2013/01/01/beyond-clouds-44/', - 'https://kniganews.org/2013/01/06/beyond-clouds-51/', - 'https://kniganews.org/2013/01/07/beyond-clouds-52/', - 'https://kniganews.org/2013/02/16/beyond-clouds-53/', - 'https://kniganews.org/2013/03/25/beyond-clouds-61/', - 'https://kniganews.org/2013/05/10/beyond-clouds-62/', - 'https://kniganews.org/2013/06/17/beyond-clouds-731/', - 'https://kniganews.org/2013/08/07/beyond-clouds-732/', - 'https://kniganews.org/2013/09/17/beyond-clouds-73/' -) \ No newline at end of file +from .tzo import tzo_urls, after_tzo_urls \ No newline at end of file diff --git a/idb/doc.py b/idb/doc.py index 6977bf7..bd65ab6 100644 --- a/idb/doc.py +++ b/idb/doc.py @@ -1,12 +1,14 @@ import os.path +import zipfile -from odf.opendocument import OpenDocumentText +from odf.opendocument import OpenDocumentText, load from odf.text import P, H, Span, A, LineBreak, List, ListItem from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage from odf.table import TableColumn, TableCell, TableRow, Table from odf.draw import Frame, Image from PIL import Image as PILImage +from io import BytesIO from bs4 import BeautifulSoup from idb import Article @@ -422,3 +424,180 @@ class DocumentCreator: self.doc.text.addElement(main_table) self.doc.save(output_odt) + + +class DocumentReader: + def __init__(self, input_file): + self.doc = load(input_file) + self.package = zipfile.ZipFile(input_file) + + self.style_alignments = self.build_style_alignments(self.doc) # For paragraph alignment (family="paragraph") + self.text_style_formats = self.build_text_styles(self.doc) # For text formatting (family="text") + + def parse_node(self, node, indent=0): + if isinstance(node, str): + return node + try: + if node.nodeType == node.TEXT_NODE: + return node.data + except AttributeError: + pass + + tag = node.tagName + if tag == "text:h": + level_str = node.attributes.get("text:outline-level", "1") + try: + level = int(level_str) + except ValueError: + level = 1 + if level > 6: + level = 6 + content = ''.join([self.parse_node(child, indent) for child in node.childNodes]) + return f'{"#" * level} {content}\n\n' + + if tag == 'text:p': + style = node.getAttribute('stylename') + content = ''.join([self.parse_node(child, indent) for child in node.childNodes]) + + if style: + style_align = self.style_alignments.get(style, '') + if style_align == 'center' or style.lower() == 'center': + return f"
{content}
\n\n" + + if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'): + lines = content.splitlines() + content = "\n".join(["> " + line for line in lines]) + + return content + "\n\n" + + elif tag == "text:list": + md = "" + for child in node.childNodes: + md += self.parse_node(child, indent) + return md + "\n" + + elif tag == "text:list-item": + item_text = "" + for child in node.childNodes: + item_text += self.parse_node(child, indent + 1) + lines = item_text.splitlines() + if lines: + prefix = " " * indent + "- " + new_lines = [prefix + lines[0]] + for line in lines[1:]: + new_lines.append(" " * (indent + 1) + line) + return "\n".join(new_lines) + "\n" + return "" + + elif tag == "text:span": + style_name = node.getAttribute("stylename") or "" + content = ''.join([self.parse_node(child, indent) for child in node.childNodes]) + fmt = self.text_style_formats.get(style_name, {}) + md_text = content + if fmt.get("bold") and fmt.get("italic"): + md_text = f"***{md_text}***" + else: + if fmt.get("bold"): + md_text = f"**{md_text}**" + if fmt.get("italic"): + md_text = f"*{md_text}*" + if fmt.get("underline"): + md_text = f"{md_text}" + return md_text + + elif tag == "text:a": + href = node.getAttribute("href") + content = ''.join([self.parse_node(child, indent) for child in node.childNodes]) + if href: + return f"[{content}]({href})" + return content + + elif tag == "text:line-break": + return " \n" + + elif tag == "draw:frame": + md = "" + caption_text = "" + for child in node.childNodes: + if hasattr(child, "tagName"): + if child.tagName == "draw:image": + href = child.attributes.get(('http://www.w3.org/1999/xlink', 'href')) + md += f"![]({href})" + elif child.tagName == "draw:caption": + caption_text = ''.join([self.parse_node(c, indent) for c in child.childNodes]).strip() + if caption_text: + md += "\n" + caption_text + "\n" + return md + + else: + return ''.join([self.parse_node(child, indent) for child in node.childNodes]) + + def get_embedded_image_size(self, file_name) -> tuple[int, int]: + data = self.package.read(file_name) + img = PILImage.open(BytesIO(data)) + return img.size + + def get_markdown(self, column=1) -> str: + tables = self.doc.getElementsByType(Table) + comp_table = None + for tbl in tables: + if tbl.getAttribute("name") == "ComparisonTable": + comp_table = tbl + break + if not comp_table: + raise RuntimeError("ComparisonTable not found in the document.") + + md_lines = [] + rows = comp_table.getElementsByType(TableRow) + for row in rows: + cells = row.getElementsByType(TableCell) + if len(cells) >= 2: + right_cell = cells[column] + cell_md = "" + for child in right_cell.childNodes: + cell_md += self.parse_node(child) + # Remove any extra whitespace. + md_lines.append(cell_md.strip()) + + return "\n\n".join(md_lines) + + @staticmethod + def build_style_alignments(doc): + alignments = {} + for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]: + if style.getAttribute('family') == 'paragraph': + style_name = style.getAttribute('name') + para_props = style.getElementsByType(ParagraphProperties) + if para_props: + attr_val = para_props[0].attributes.get( + ('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'text-align'), '') + if attr_val: + alignments[style_name] = attr_val.lower().strip() + return alignments + + @staticmethod + def build_text_styles(doc): + text_styles = {} + for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]: + if style.getAttribute('family') == 'text': + style_name = style.getAttribute('name') + text_props = style.getElementsByType(TextProperties) + if text_props: + props = text_props[0].attributes + bold = False + italic = False + underline = False + fw = props.get(('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'font-weight'), + '').lower() + if 'bold' in fw or fw in ('700', '800', '900'): + bold = True + fs = props.get(('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'font-style'), + '').lower() + if 'italic' in fs: + italic = True + tu = props.get(('urn:oasis:names:tc:opendocument:xmlns:style:1.0', 'text-underline-style'), + '').lower() + if tu and tu != 'none': + underline = True + text_styles[style_name] = {'bold': bold, 'italic': italic, 'underline': underline} + return text_styles diff --git a/idb/translator.py b/idb/translator.py index 912bf98..76ef59a 100644 --- a/idb/translator.py +++ b/idb/translator.py @@ -10,7 +10,7 @@ Translate the following text from Russian to English while strictly preserving t Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content. """ -input_token_limit = 3500 +input_token_limit = 5000 def translate_markdown(text): diff --git a/idb/tzo.py b/idb/tzo.py new file mode 100644 index 0000000..0b28268 --- /dev/null +++ b/idb/tzo.py @@ -0,0 +1,78 @@ +import os +import re +from PIL import Image +from collections import namedtuple +from .util import image_url_to_filename + +tzo_urls = ( + 'https://kniganews.org/2012/12/20/beyond-clouds-1/', + 'https://kniganews.org/2012/12/21/beyond-clouds-21/', + 'https://kniganews.org/2012/12/22/beyond-clouds-22/', + 'https://kniganews.org/2012/12/23/beyond-clouds-31/', + 'https://kniganews.org/2012/12/24/beyond-clouds-32/', + 'https://kniganews.org/2012/12/25/beyond-clouds-33/', + 'https://kniganews.org/2012/12/28/beyond-clouds-41/', + 'https://kniganews.org/2012/12/29/beyond-clouds-42/', + 'https://kniganews.org/2012/12/30/beyond-clouds-43/', + 'https://kniganews.org/2013/01/01/beyond-clouds-44/', + 'https://kniganews.org/2013/01/06/beyond-clouds-51/', + 'https://kniganews.org/2013/01/07/beyond-clouds-52/', + 'https://kniganews.org/2013/02/16/beyond-clouds-53/', + 'https://kniganews.org/2013/03/25/beyond-clouds-61/', + 'https://kniganews.org/2013/05/10/beyond-clouds-62/', + 'https://kniganews.org/2013/06/17/beyond-clouds-731/', + 'https://kniganews.org/2013/08/07/beyond-clouds-732/', + 'https://kniganews.org/2013/09/17/beyond-clouds-73/' +) +after_tzo_urls = ( + 'https://kniganews.org/2012/11/17/langlands-plus/', +) +ImageInfo = namedtuple('ImageInfo', ('url', 'local_name', 'local_path', 'width', 'height')) + + +class ImageList: + images: list[ImageInfo] + + def __init__(self): + self.images = [] + + def add_image(self, url): + local_name = image_url_to_filename(url) + local_path = os.path.realpath(os.path.join( + os.path.dirname(__file__), + '..', + 'images', + local_name + )) + image = Image.open(local_path) + self.images.append(ImageInfo(url, local_name, local_path, image.size[0], image.size[1])) + + def get_images_by_size(self, w, h) -> list[ImageInfo]: + return list(filter(lambda image: image.width == w and image.height == h, self.images)) + + +def get_part_by_odt_name(name: str) -> int: + m = re.match(r'^beyond-clouds-(\d+)(?:v\d+)?\.odt$', name) + if not m: + raise ValueError('could not parse file name') + if not m.group(1).isnumeric(): + raise ValueError('extracted value is not a number') + return int(m.group(1)) + + +def part_image_list(part) -> ImageList: + file = os.path.realpath(os.path.join( + os.path.dirname(__file__), + '..', + 'tzo', + f'beyond-clouds-{part}-ru.txt', + )) + with open(file) as f: + txt = f.read() + urls = re.findall(r'!\[.*?]\((.*?)\)', txt) + + images = ImageList() + for url in urls: + images.add_image(url) + + return images \ No newline at end of file diff --git a/idb/util.py b/idb/util.py index 609a526..1581ae2 100644 --- a/idb/util.py +++ b/idb/util.py @@ -21,12 +21,12 @@ def image_url_to_filename(url): parsed_url = urlparse(url) filename = os.path.basename(parsed_url.path) name, ext = os.path.splitext(filename) - date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path) + date_match = re.search(r'(\d{4})/(\d{2})', parsed_url.path) if not date_match: raise ValueError("no valid date found in URL") year = date_match.group(1) - day = date_match.group(3) if date_match.group(3) else "01" - return f"{year}{day}_{name}{ext}" + month = date_match.group(2) + return f"{year}{month}_{name}{ext}" def extract_images_from_markdown(markdown_text): diff --git a/odt_to_md.py b/odt_to_md.py new file mode 100755 index 0000000..f09c0d6 --- /dev/null +++ b/odt_to_md.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +import re + +from argparse import ArgumentParser +from os.path import basename + +from idb import tzo +from idb.doc import DocumentReader +from idb.tzo import get_part_by_odt_name + + +def tzo_replace_images(md: str, + tzo_part: int, + dr: DocumentReader): + il = tzo.part_image_list(tzo_part) + + def _markdown_image(image, title) -> str: + if title: + return f'![]({image.url} "{title}")' + else: + return f'![]({image.url})' + + def _repl(match: re.Match) -> str: + orig_alt, path, title = match.groups() + w, h = dr.get_embedded_image_size(path) + found_images = il.get_images_by_size(w, h) + # if len(found_images) > 1: + # raise ValueError(f'more than one image found with size {w}x{h}') + return ''.join(list(map(lambda i: _markdown_image(i, title), found_images))) + + regex = re.compile(r'!\[(.*?)]\((\S+?)(?:\s+"(.*?)")?\)') + return regex.sub(_repl, md) + + +def main(): + parser = ArgumentParser() + parser.add_argument('-i', '--input', required=True, type=str, help='Input file') + parser.add_argument('-c', '--column', default=1, type=int, help='Column number') + parser.add_argument('--tzo', action='store_true', help='TZO') + args = parser.parse_args() + + reader = DocumentReader(args.input) + md = reader.get_markdown(args.column) + if args.tzo: + md = tzo_replace_images(md, get_part_by_odt_name(basename(args.input)), reader) + + print(md) + + +if __name__ == '__main__': + main() diff --git a/single_odt.py b/single_odt.py new file mode 100755 index 0000000..fac4c32 --- /dev/null +++ b/single_odt.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import os.path +from idb import Article, DocumentCreator +from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file +from argparse import ArgumentParser + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--ru-file', type=str, required=True, help='russian input file') + parser.add_argument('--en-file', type=str, required=True, help='english input file') + parser.add_argument('--output', type=str, required=True, help='output ODT file') + args = parser.parse_args() + + orig = Article.from_markdown_file(args.ru_file, with_title=False) + trans = Article.from_markdown_file(args.en_file, with_title=False) + + image_urls = extract_images_from_markdown(read_file(args.ru_file)) + for image_url in image_urls: + image_name = image_url_to_filename(image_url) + output_file = os.path.join(os.path.dirname(__file__), 'images', image_name) + if not os.path.exists(output_file): + download_file(image_url, output_file) + print(f'{image_name} saved') + + doc = DocumentCreator() + doc.create(orig, trans, args.output) \ No newline at end of file diff --git a/tzo_images.py b/tzo_images.py index 545522a..e434bf2 100755 --- a/tzo_images.py +++ b/tzo_images.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os.path -from idb import tzo_urls +from argparse import ArgumentParser +from idb import tzo_urls, after_tzo_urls from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown from dotenv import load_dotenv @@ -8,7 +9,13 @@ load_dotenv() if __name__ == '__main__': - for url in tzo_urls: + parser = ArgumentParser() + parser.add_argument('--after', action='store_true') + args = parser.parse_args() + + urls = tzo_urls if not args.after else after_tzo_urls + + for url in urls: name = name_from_url(url) markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt') image_urls = extract_images_from_markdown(read_file(markdown_file)) diff --git a/tzo_odt.py b/tzo_odt.py index 49896ee..aef78aa 100755 --- a/tzo_odt.py +++ b/tzo_odt.py @@ -1,11 +1,18 @@ #!/usr/bin/env python3 import os.path -from idb import Article, DocumentCreator, tzo_urls +from argparse import ArgumentParser +from idb import Article, DocumentCreator, tzo_urls, after_tzo_urls from idb.util import name_from_url if __name__ == '__main__': - for url in tzo_urls: + parser = ArgumentParser() + parser.add_argument('--after', action='store_true') + args = parser.parse_args() + + urls = tzo_urls if not args.after else after_tzo_urls + + for url in urls: name = name_from_url(url) orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)