def main(name='cartier3'):
    """Build the side-by-side ODT for one article.

    Reads ``<name>_ru`` (original) and ``<name>_en`` (translation) from the
    directory containing this script, downloads every image referenced in the
    original markdown into ``images/`` next to the script (skipping files that
    are already present), and writes ``<name>.odt`` to the same directory.

    :param name: base name shared by the two markdown files and the output.
    """
    base_dir = os.path.dirname(__file__)
    orig_path = os.path.join(base_dir, f'{name}_ru')
    trans_path = os.path.join(base_dir, f'{name}_en')

    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    # download_file() opens its target for writing, which fails when the
    # directory is missing, so make sure it exists before the first download.
    images_dir = os.path.join(base_dir, 'images')
    os.makedirs(images_dir, exist_ok=True)

    for image_url in extract_images_from_markdown(read_file(orig_path)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(images_dir, image_name)
        if not os.path.exists(output_file):
            download_file(image_url, output_file)
            print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(base_dir, f'{name}.odt'))


if __name__ == '__main__':
    main()
def add_child(parent, child):
    """Attach ``child`` to ``parent`` either as an element or as text.

    A child exposing a ``qname`` attribute is handed to
    ``parent.addElement``; anything else is handed to ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
class ImageWrap:
    """Pairs a local image file with an optional caption for later embedding."""

    def __init__(self, image_file, caption):
        self.caption = caption
        self.image_file = image_file

    def get_elements(self, doc):
        """Embed the image into ``doc`` and return the ODF nodes for it.

        The returned list always starts with the frame holding the picture;
        a caption paragraph follows when a non-empty caption was given.
        """
        picture_ref = doc.addPicture(self.image_file)

        # Frames are 13.5 cm wide; the height is scaled to match.
        frame_width, frame_height = calc_frame_dimensions(self.image_file, 13.5)

        frame = Frame(width=frame_width, height=frame_height)
        frame.addElement(
            Image(href=picture_ref, type="simple", show="embed", actuate="onLoad")
        )

        if not self.caption:
            return [frame]

        caption_par = P(stylename=CAPTION_STYLE_NAME)
        caption_par.addText(self.caption)
        return [frame, caption_par]
def process_inline(self, node):
    """Convert one inline HTML node into a list of ODF children.

    :param node: a plain string or a parsed HTML tag (must expose ``name``,
        ``contents``, ``get`` and ``get_text``).
    :return: a list whose items are strings, ODF elements, or ``ImageWrap``
        objects (for ``<img>``); the caller decides how to attach them.

    Strings are returned unchanged. ``strong``/``b``, ``em``/``i`` and
    ``ins``/``u`` become styled spans with their children converted
    recursively. ``code`` and ``a`` are flattened to their text content.
    Any other tag is transparent: its children are converted and returned
    in order.
    """
    if isinstance(node, str):
        return [node]

    tag = node.name.lower()

    span_styles = {
        'strong': BOLD_STYLE_NAME,
        'b': BOLD_STYLE_NAME,
        'em': ITALIC_STYLE_NAME,
        'i': ITALIC_STYLE_NAME,
        'ins': UNDERLINE_STYLE_NAME,
        'u': UNDERLINE_STYLE_NAME,
    }
    style_name = span_styles.get(tag)
    if style_name is not None:
        span = Span(stylename=style_name)
        for child in node.contents:
            for inline in self.process_inline(child):
                # NOTE(review): an <img> nested in a styled tag yields an
                # ImageWrap here, which has no ``qname`` and is therefore
                # passed to addText() by add_child -- confirm intended.
                add_child(span, inline)
        return [span]

    if tag == "code":
        return [Span(stylename="Code", text=node.get_text())]

    if tag == "a":
        return [A(href=node.get("href"), text=node.get_text())]

    if tag == "img":
        # The original re-tested the tag name here; that test was always
        # true, and its missing else-branch made the method appear able to
        # return None (which callers would then try to iterate).
        return [self.process_img(node)]

    result = []
    for child in node.contents:
        result.extend(self.process_inline(child))
    return result
def try_process_heading(self, elem):
    """Return an ODF heading for ``elem`` if it is an ``h1``..``h6`` tag.

    Yields ``None`` when ``elem`` has no tag name or the tag is not a
    heading. The outline level is the digit taken from the tag name.
    """
    raw_name = elem.name
    if not raw_name:
        return None

    lowered = raw_name.lower()
    if lowered not in ("h1", "h2", "h3", "h4", "h5", "h6"):
        return None

    return H(outlinelevel=lowered[1], text=elem.get_text())
def create(self,
           orig: Article,
           trans: Article,
           output_odt: str,
           with_title=False):
    """Write a two-column ODT comparing ``orig`` with ``trans``.

    Both articles' HTML is converted with ``html_to_odf_elements``; the
    n-th entry of each side shares a table row (left: original, right:
    translation). The shorter side is padded with empty paragraphs.

    :param orig: article shown in the left column.
    :param trans: article shown in the right column.
    :param output_odt: path the document is saved to.
    :param with_title: when true, prepend a row holding both titles.
    """
    def fill_cell(cell, item):
        # An entry is either a single ODF element or a list of them.
        # html_to_odf_elements collects process_block results into a list,
        # and process_block itself returns lists for images and
        # blockquotes, so lists can be nested; flatten recursively
        # instead of passing a list to addElement().
        if isinstance(item, list):
            for sub_item in item:
                fill_cell(cell, sub_item)
        else:
            cell.addElement(item)

    orig_elements = self.html_to_odf_elements(orig.html)
    trans_elements = self.html_to_odf_elements(trans.html)

    # Pad the shorter side so rows pair up one-to-one.
    row_count = max(len(orig_elements), len(trans_elements))
    orig_elements.extend(P() for _ in range(row_count - len(orig_elements)))
    trans_elements.extend(P() for _ in range(row_count - len(trans_elements)))

    main_table = Table(name="ComparisonTable")
    main_table.addElement(TableColumn())
    main_table.addElement(TableColumn())

    if with_title:
        header_row = TableRow()
        for title in (orig.title, trans.title):
            header_cell = TableCell()
            header_cell.addElement(P(stylename=TITLE_STYLE_NAME, text=title))
            header_row.addElement(header_cell)
        main_table.addElement(header_row)

    for orig_item, trans_item in zip(orig_elements, trans_elements):
        row = TableRow()
        for item in (orig_item, trans_item):
            cell = TableCell()
            fill_cell(cell, item)
            row.addElement(cell)
        main_table.addElement(row)

    self.doc.text.addElement(main_table)
    self.doc.save(output_odt)
def save_file(file, content):
    """Write ``content`` to ``file`` as UTF-8 text, replacing any old file.

    The encoding is fixed so non-ASCII (e.g. Russian) text does not depend
    on the platform's default encoding.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)


def read_file(filename):
    """Return the whole content of ``filename`` decoded as UTF-8."""
    with open(filename, encoding='utf-8') as f:
        return f.read()


def name_from_url(url):
    """Return the last path segment of ``url``.

    Trailing slashes are ignored, so ``.../beyond-clouds-1/`` and
    ``.../beyond-clouds-1`` both give ``beyond-clouds-1`` (the previous
    ``url[:-1]`` cut off the last character of URLs without a slash).
    """
    return os.path.basename(url.rstrip('/'))


def image_url_to_filename(url):
    """Derive a local file name ``YYYYMMDD_<basename>`` from an image URL.

    The date comes from a ``YYYY/MM/`` or ``YYYY/MM/DD/`` run in the URL
    path; a missing day becomes ``01``.

    :raises ValueError: if the path contains no ``YYYY/MM/`` sequence.
    """
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    # The day only counts when it is a path segment of its own ("DD/");
    # otherwise leading digits of the file name ("21-fig.jpg") would be
    # mistaken for it.
    date_match = re.search(r'(\d{4})/(\d{2})/(?:(\d{2})/)?', parsed_url.path)
    if not date_match:
        raise ValueError("no valid date found in URL")
    # The month is part of the prefix: without it, equally named images
    # uploaded in different months of one year map to the same file.
    year, month, day = date_match.groups(default="01")
    return f"{year}{month}{day}_{filename}"


def extract_images_from_markdown(markdown_text):
    """Return the targets of all ``![alt](target)`` images, in order."""
    return re.findall(r"!\[.*?\]\((.*?)\)", markdown_text)


def download_file(url, filename, timeout=30):
    """Stream ``url`` into ``filename``.

    :param timeout: seconds passed to ``requests.get`` so a stalled server
        cannot block forever.
    :raises requests.HTTPError: for 4xx/5xx responses.

    As before, the file is only written for a 200 response; other
    successful status codes leave ``filename`` untouched.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
'):
+ text = text.replace('\n\n', '
')
+ return f'