"""Create and read side-by-side (original / translation) ODT documents.

``DocumentCreator`` converts the HTML of two articles into ODF elements and
lays them out in a two-column table named ``ComparisonTable``.
``DocumentReader`` loads such a document and renders one column of that table
as Markdown.
"""
import os.path
import zipfile
from io import BytesIO

from bs4 import BeautifulSoup
from odf.draw import Frame, Image
from odf.opendocument import OpenDocumentText, load
from odf.style import (
    MasterPage,
    PageLayout,
    PageLayoutProperties,
    ParagraphProperties,
    Style,
    TextProperties,
)
from odf.table import Table, TableCell, TableColumn, TableRow
from odf.text import A, H, LineBreak, List, ListItem, P, Span
from PIL import Image as PILImage

from idb import Article
from idb.util import image_url_to_filename

PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
ITALIC_STYLE_NAME = 'Italic'
BOLD_STYLE_NAME = 'Bold'
CAPTION_STYLE_NAME = 'Caption'
UNDERLINE_STYLE_NAME = 'Underline'
CENTER_STYLE_NAME = 'CenterAligned'
TITLE_STYLE_NAME = 'Title'

# XML namespaces used as the first half of odfpy's (namespace, localname)
# attribute keys.
_FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
_STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'
_TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
_XLINK_NS = 'http://www.w3.org/1999/xlink'


def add_child(parent, child):
    """Append *child* to *parent*.

    Objects exposing a ``qname`` attribute are added as elements; anything
    else is added as text.
    """
    if hasattr(child, "qname"):
        parent.addElement(child)
    else:
        parent.addText(child)


def _add_all(parent, item):
    """Add *item* to *parent*, where *item* is one element or a (nested) list of them."""
    if isinstance(item, list):
        for sub_item in item:
            _add_all(parent, sub_item)
    else:
        parent.addElement(item)


def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ``(width, height)`` strings in cm for a frame of the given width.

    The height is scaled so the image at *image_path* keeps its aspect ratio.
    Pixel sizes are converted to centimetres using a fixed 96 DPI.
    """
    with PILImage.open(image_path) as img:
        orig_width, orig_height = img.size
    dpi = 96.0
    orig_width_cm = (orig_width / dpi) * 2.54
    orig_height_cm = (orig_height / dpi) * 2.54
    scale = desired_width_cm / orig_width_cm
    new_height_cm = orig_height_cm * scale
    return f"{desired_width_cm}cm", f"{new_height_cm}cm"


class ImageWrap:
    """An image file plus optional caption, not yet attached to a document."""

    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption

    def get_elements(self, doc):
        """Embed the image into *doc* and return the ODF elements for it.

        Returns a list holding the image frame and, when a caption is set, a
        caption paragraph as the second item.
        """
        embedded_href = doc.addPicture(self.image_file)
        desired_width = 13.5
        width_str, height_str = calc_frame_dimensions(self.image_file, desired_width)
        frm = Frame(width=width_str, height=height_str)
        img = Image(href=embedded_href, type="simple", show="embed", actuate="onLoad")
        frm.addElement(img)
        elements = [frm]
        if self.caption:
            caption = P(stylename=CAPTION_STYLE_NAME)
            caption.addText(self.caption)
            elements.append(caption)
        return elements


class DocumentCreator:
    """Builds a landscape ODT with two articles side by side in a table."""

    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    def set_styles(self):
        """Register the page layout and the text/paragraph styles on the document."""
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_props = PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        )
        landscape_layout.addElement(landscape_props)
        self.doc.automaticstyles.addElement(landscape_layout)

        masterpage = MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        self.doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))
        self.doc.automaticstyles.addElement(style)

        # caption
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777'
        }))
        style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'))
        self.doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        self.doc.automaticstyles.addElement(style)

        # blockquote
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        self.doc.styles.addElement(style)

        # title
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        self.doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        self.doc.automaticstyles.addElement(style)

    def process_inline(self, node):
        """Convert an inline HTML node into a list of inline items.

        Items are plain strings, ODF elements, or ``ImageWrap`` objects (for
        ``<img>``), which the caller must expand itself.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()
        simple_tags = (
            ('strong', 'b'),
            ('em', 'i'),
            ('ins', 'u')
        )
        simple_styles = (
            BOLD_STYLE_NAME,
            ITALIC_STYLE_NAME,
            UNDERLINE_STYLE_NAME
        )
        for tags_list, style_name in zip(simple_tags, simple_styles):
            if tag in tags_list:
                span = Span(stylename=style_name)
                for child in node.contents:
                    for inline in self.process_inline(child):
                        add_child(span, inline)
                return [span]

        if tag == "code":
            # NOTE(review): no style named "Code" is registered in set_styles.
            return [Span(stylename="Code", text=node.get_text())]
        if tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]
        if tag == "img":
            return [self.process_img(node)]

        # Unknown inline tag: flatten its children into the parent.
        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result

    def process_block(self, elem):
        """Convert a block-level HTML element to ODF.

        Returns a single ODF element, or a list of elements for blocks that
        expand to several (blockquotes, images, paragraphs holding an image).
        """
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # A heading wrapped in a paragraph replaces the whole paragraph.
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem
                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            for child in elem.contents:
                for inline in self.process_inline(child):
                    # Images are expanded wherever they surface (also when
                    # nested, e.g. inside <center>), never added as text.
                    if isinstance(inline, ImageWrap):
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        # NOTE: content after the first image is dropped.
                        return elems
                    add_child(p_elem, inline)
            return p_elem

        if tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        if tag in ("ul", "ol"):
            odf_list = List()
            for li in elem.find_all("li", recursive=False):
                li_item = ListItem()
                p = P()
                for child in li.contents:
                    # Nested lists are handled in the second pass below.
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        add_child(p, inline)
                li_item.addElement(p)
                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        li_item.addElement(self.process_block(child))
                odf_list.addElement(li_item)
            return odf_list

        if tag == "pre":
            # NOTE(review): no style named "Preformatted" is registered in set_styles.
            return P(stylename="Preformatted", text=elem.get_text())

        if tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        if tag == "table":
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    # Entries may themselves be lists of elements.
                    for el in self.html_to_odf_elements(cell_html):
                        _add_all(cell_elem, el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        if tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        if tag == "br":
            return LineBreak()

        # Fallback: keep only the text of unknown blocks.
        p_elem = P()
        p_elem.addText(elem.get_text())
        return p_elem

    def try_process_heading(self, elem):
        """Return an ODF heading for ``<h1>``..``<h6>`` nodes, otherwise ``None``."""
        if not elem.name:
            return None
        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())
        return None

    def process_img(self, elem) -> ImageWrap:
        """Resolve an ``<img>`` to its saved file and wrap it with its ``alt`` text.

        Raises:
            ValueError: if the expected image file does not exist.
        """
        href = elem.get("src")
        saved_file = os.path.join(
            os.path.dirname(__file__),
            '..',
            'images',
            image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')
        alt = elem.get('alt')
        return ImageWrap(saved_file, alt)

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of table-row entries.

        Each entry is either one ODF element or a flat list of elements that
        belong in the same cell. From the first "ending" heading (sources,
        further reading, ...) onwards, everything up to the next such heading
        is collected into one list entry.
        """
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        keeping = False
        stack = []
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )
        ending_set = frozenset(end.lower() for end in ending_headings)

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                result = P()
                result.addText(node)
            elif node.name:
                if (node.name.lower() in ("h1", "h2", "h3", "h4")
                        and node.get_text().strip().lower() in ending_set):
                    # A new ending section starts: flush the previous group.
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
            else:
                continue

            if keeping:
                # Keep the group flat so each item can go straight to addElement.
                if isinstance(result, list):
                    stack.extend(result)
                else:
                    stack.append(result)
            else:
                elements.append(result)

        if stack:
            elements.append(stack)
        return elements

    def create(self, orig: Article, trans: Article, output_odt: str, with_title=False):
        """Write a two-column comparison of *orig* and *trans* to *output_odt*.

        Row *i* holds the *i*-th entry of each article; the shorter side is
        padded with empty paragraphs. With *with_title*, a first row carrying
        both titles is added.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        max_len = max(len(orig_elements), len(trans_elements))
        while len(orig_elements) < max_len:
            orig_elements.append(P())
        while len(trans_elements) < max_len:
            trans_elements.append(P())

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            # headings
            header_row = TableRow()
            header_cell_left = TableCell()
            header_cell_right = TableCell()
            header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title))
            header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title))
            header_row.addElement(header_cell_left)
            header_row.addElement(header_cell_right)
            main_table.addElement(header_row)

        # content
        for orig_entry, trans_entry in zip(orig_elements, trans_elements):
            row = TableRow()
            cell_orig = TableCell()
            cell_trans = TableCell()
            _add_all(cell_orig, orig_entry)
            _add_all(cell_trans, trans_entry)
            row.addElement(cell_orig)
            row.addElement(cell_trans)
            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)


class DocumentReader:
    """Reads a comparison ODT and renders one table column as Markdown.

    The instance keeps the ODT package open as a zip archive; call ``close()``
    or use the instance as a context manager to release it.
    """

    def __init__(self, input_file):
        self.doc = load(input_file)
        self.package = zipfile.ZipFile(input_file)
        # For paragraph alignment (family="paragraph")
        self.style_alignments = self.build_style_alignments(self.doc)
        # For text formatting (family="text")
        self.text_style_formats = self.build_text_styles(self.doc)

    def close(self):
        """Close the underlying zip archive."""
        self.package.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def parse_node(self, node, indent=0):
        """Render an ODF node (or plain string) as Markdown text.

        *indent* is the current list nesting depth.
        """
        if isinstance(node, str):
            return node
        try:
            if node.nodeType == node.TEXT_NODE:
                return node.data
        except AttributeError:
            pass

        tag = node.tagName

        if tag == "text:h":
            # Attributes are keyed by (namespace, localname); a "prefix:name"
            # string key never matches and would always yield the default.
            level_str = node.attributes.get((_TEXT_NS, 'outline-level'), "1")
            try:
                level = int(level_str)
            except (TypeError, ValueError):
                level = 1
            level = min(max(level, 1), 6)
            content = ''.join(self.parse_node(child, indent) for child in node.childNodes)
            return f'{"#" * level} {content}\n\n'

        if tag == 'text:p':
            style = node.getAttribute('stylename')
            content = ''.join(self.parse_node(child, indent) for child in node.childNodes)
            if style:
                style_align = self.style_alignments.get(style, '')
                if style_align == 'center' or style.lower() == 'center':
                    # NOTE(review): the wrapper was missing from the reviewed
                    # text, leaving an unterminated string; <center> is
                    # presumed from the <center> handling on the HTML side.
                    return f"<center>{content}</center>\n\n"
                if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
                    lines = content.splitlines()
                    content = "\n".join(["> " + line for line in lines])
            return content + "\n\n"

        if tag == "text:list":
            md = ''.join(self.parse_node(child, indent) for child in node.childNodes)
            return md + "\n"

        if tag == "text:list-item":
            item_text = ''.join(self.parse_node(child, indent + 1) for child in node.childNodes)
            lines = item_text.splitlines()
            if lines:
                prefix = " " * indent + "- "
                new_lines = [prefix + lines[0]]
                for line in lines[1:]:
                    new_lines.append(" " * (indent + 1) + line)
                return "\n".join(new_lines) + "\n"
            return ""

        if tag == "text:span":
            style_name = node.getAttribute("stylename") or ""
            content = ''.join(self.parse_node(child, indent) for child in node.childNodes)
            fmt = self.text_style_formats.get(style_name, {})
            md_text = content
            if fmt.get("bold") and fmt.get("italic"):
                md_text = f"***{md_text}***"
            else:
                if fmt.get("bold"):
                    md_text = f"**{md_text}**"
                if fmt.get("italic"):
                    md_text = f"*{md_text}*"
            if fmt.get("underline"):
                # NOTE(review): this is a no-op as written; an underline
                # wrapper tag may be missing here. Confirm the intended markup.
                md_text = f"{md_text}"
            return md_text

        if tag == "text:a":
            href = node.getAttribute("href")
            content = ''.join(self.parse_node(child, indent) for child in node.childNodes)
            if href:
                return f"[{content}]({href})"
            return content

        if tag == "text:line-break":
            # NOTE(review): a Markdown hard break needs two trailing spaces;
            # confirm whether a single space is intended.
            return " \n"

        if tag == "draw:frame":
            md = ""
            caption_text = ""
            for child in node.childNodes:
                if hasattr(child, "tagName"):
                    if child.tagName == "draw:image":
                        href = child.attributes.get((_XLINK_NS, 'href'))
                        md += f"![]({href})"
                    elif child.tagName == "draw:caption":
                        caption_text = ''.join(
                            self.parse_node(c, indent) for c in child.childNodes
                        ).strip()
            if caption_text:
                md += "\n" + caption_text + "\n"
            return md

        # Any other element: render its children only.
        return ''.join(self.parse_node(child, indent) for child in node.childNodes)

    def get_embedded_image_size(self, file_name) -> tuple[int, int]:
        """Return the ``(width, height)`` in pixels of an image stored in the package."""
        data = self.package.read(file_name)
        with PILImage.open(BytesIO(data)) as img:
            return img.size

    def get_markdown(self, column=1) -> str:
        """Return the Markdown for one column of the ``ComparisonTable``.

        Rows with fewer than two cells, or without a cell at *column*, are
        skipped.

        Raises:
            RuntimeError: if the document has no table named ``ComparisonTable``.
        """
        comp_table = None
        for tbl in self.doc.getElementsByType(Table):
            if tbl.getAttribute("name") == "ComparisonTable":
                comp_table = tbl
                break
        if comp_table is None:
            raise RuntimeError("ComparisonTable not found in the document.")

        md_lines = []
        for row in comp_table.getElementsByType(TableRow):
            cells = row.getElementsByType(TableCell)
            if len(cells) >= 2 and column < len(cells):
                cell_md = ''.join(self.parse_node(child) for child in cells[column].childNodes)
                # Remove any extra whitespace.
                md_lines.append(cell_md.strip())
        return "\n\n".join(md_lines)

    @staticmethod
    def build_style_alignments(doc):
        """Map paragraph style names to their lower-cased ``fo:text-align`` value.

        Styles without a text-align property are omitted.
        """
        alignments = {}
        all_styles = [
            *doc.automaticstyles.getElementsByType(Style),
            *doc.styles.getElementsByType(Style),
        ]
        for style in all_styles:
            if style.getAttribute('family') != 'paragraph':
                continue
            style_name = style.getAttribute('name')
            para_props = style.getElementsByType(ParagraphProperties)
            if para_props:
                attr_val = para_props[0].attributes.get((_FO_NS, 'text-align'), '')
                if attr_val:
                    alignments[style_name] = attr_val.lower().strip()
        return alignments

    @staticmethod
    def build_text_styles(doc):
        """Map text style names to ``{'bold', 'italic', 'underline'}`` flag dicts.

        Styles without a text-properties child are omitted.
        """
        text_styles = {}
        all_styles = [
            *doc.automaticstyles.getElementsByType(Style),
            *doc.styles.getElementsByType(Style),
        ]
        for style in all_styles:
            if style.getAttribute('family') != 'text':
                continue
            style_name = style.getAttribute('name')
            text_props = style.getElementsByType(TextProperties)
            if not text_props:
                continue
            props = text_props[0].attributes

            fw = props.get((_FO_NS, 'font-weight'), '').lower()
            bold = 'bold' in fw or fw in ('700', '800', '900')

            fs = props.get((_FO_NS, 'font-style'), '').lower()
            italic = 'italic' in fs

            tu = props.get((_STYLE_NS, 'text-underline-style'), '').lower()
            underline = bool(tu) and tu != 'none'

            text_styles[style_name] = {'bold': bold, 'italic': italic, 'underline': underline}
        return text_styles