# idb_utils/idb/doc.py

import os.path
import zipfile

from odf.opendocument import OpenDocumentText, load
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image

from PIL import Image as PILImage
from io import BytesIO

from bs4 import BeautifulSoup
from idb import Article
from idb.util import image_url_to_filename

# Names of the ODF page layout, master page and styles used in this module.
# The styles are registered on the document by DocumentCreator.set_styles and
# are then referenced by name when paragraphs and spans are created.
PAGE_LAYOUT_NAME = 'LandscapeLayout'  # landscape page layout
MASTER_PAGE_NAME = 'Standard'  # master page name (same value set_styles gives the MasterPage)
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'  # paragraph style for <blockquote> content
ITALIC_STYLE_NAME = 'Italic'  # text style for <em>/<i>
BOLD_STYLE_NAME = 'Bold'  # text style for <strong>/<b>
CAPTION_STYLE_NAME = 'Caption'  # paragraph style for image captions
UNDERLINE_STYLE_NAME = 'Underline'  # text style for <ins>/<u>
CENTER_STYLE_NAME = 'CenterAligned'  # paragraph style for centered paragraphs
TITLE_STYLE_NAME = 'Title'  # paragraph style for the article title row


def add_child(parent, child):
    """Attach *child* to *parent* as an element or as plain text.

    Anything exposing a ``qname`` attribute is treated as an ODF element and
    goes through ``parent.addElement``; everything else is handed to
    ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)


def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ``(width, height)`` strings in cm for a frame of the given width.

    The image at *image_path* is opened only to read its pixel size. The
    height is scaled so the frame keeps the picture's aspect ratio; pixel
    sizes are converted to centimetres assuming 96 dpi.
    """
    assumed_dpi = 96.0

    def pixels_to_cm(pixels):
        return (pixels / assumed_dpi) * 2.54

    with PILImage.open(image_path) as picture:
        pixel_width, pixel_height = picture.size

    width_ratio = desired_width_cm / pixels_to_cm(pixel_width)
    scaled_height_cm = pixels_to_cm(pixel_height) * width_ratio

    return f"{desired_width_cm}cm", f"{scaled_height_cm}cm"


class ImageWrap:
    """An image waiting to be embedded: a local file path plus a caption.

    Instances are produced while walking the HTML; the picture is only added
    to a document when ``get_elements`` is called.
    """

    def __init__(self, image_file, caption):
        self.image_file, self.caption = image_file, caption

    def get_elements(self, doc):
        """Embed the picture in *doc* and return the ODF elements for it.

        Returns a list whose first item is the frame holding the image. When
        a caption is set, a second item follows: a paragraph in the caption
        style containing the caption text.
        """
        picture_href = doc.addPicture(self.image_file)
        # Frames are always 13.5 cm wide; the height follows the aspect ratio.
        frame_width, frame_height = calc_frame_dimensions(self.image_file, 13.5)

        frame = Frame(width=frame_width, height=frame_height)
        frame.addElement(
            Image(href=picture_href, type="simple", show="embed", actuate="onLoad")
        )

        if not self.caption:
            return [frame]

        caption_paragraph = P(stylename=CAPTION_STYLE_NAME)
        caption_paragraph.addText(self.caption)
        return [frame, caption_paragraph]


class DocumentCreator:
    def __init__(self):
        """Start a new, empty text document and register its styles."""
        document = OpenDocumentText()
        self.doc = document
        # set_styles reads self.doc, so it has to run after the assignment.
        self.set_styles()

    def set_styles(self):
        """Register the page layout, master page and styles on ``self.doc``.

        The landscape page layout and the bold, italic, caption, underline
        and centered styles are added to the document's automatic styles.
        The block-quotation and title styles are added to the common styles
        (``self.doc.styles``).
        """
        def register(container, name, family, *properties):
            # Build one style from its property elements (kept in the given
            # order) and add it to the chosen style container.
            style = Style(name=name, family=family)
            for prop in properties:
                style.addElement(prop)
            container.addElement(style)

        automatic = self.doc.automaticstyles
        common = self.doc.styles

        # A4 landscape page with 1 cm margins.
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_layout.addElement(PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        ))
        automatic.addElement(landscape_layout)

        # Use the shared constant instead of repeating the literal name.
        masterpage = MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        register(automatic, BOLD_STYLE_NAME, "text", TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))

        # italic
        register(automatic, ITALIC_STYLE_NAME, "text", TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))

        # caption
        register(
            automatic, CAPTION_STYLE_NAME, "paragraph",
            TextProperties(attributes={
                'fontstyle': "italic",
                'fontstyleasian': "italic",
                'fontstylecomplex': "italic",
                'fontsize': '10pt',
                'color': '#777777'
            }),
            ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'),
        )

        # underline
        register(automatic, UNDERLINE_STYLE_NAME, "text", TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))

        # blockquote
        register(
            common, BLOCKQUOTE_STYLE_NAME, "paragraph",
            ParagraphProperties(attributes={
                'marginleft': '0.6cm',
                'margintop': '0.15cm',
                'marginbottom': '0.15cm',
            }),
            TextProperties(attributes={'color': '#378A62'}),
        )

        # title
        register(
            common, TITLE_STYLE_NAME, "paragraph",
            TextProperties(attributes={
                'fontsize': '20pt',
                'fontweight': "bold",
                'fontweightasian': "bold",
                'fontweightcomplex': "bold"
            }),
            ParagraphProperties(textalign='center'),
        )

        # centered text
        register(automatic, CENTER_STYLE_NAME, "paragraph",
                 ParagraphProperties(textalign="center"))

    def process_inline(self, node):
        """Convert one inline HTML node into a list of ODF inline children.

        Each item of the returned list is one of:

        * a plain string (text),
        * an ODF element (``Span``, ``A`` or ``LineBreak``),
        * an ``ImageWrap`` for ``<img>``; the caller is responsible for
          embedding it via ``ImageWrap.get_elements``.

        Unknown tags are transparent: their children are converted and
        returned in place of the tag itself.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()

        span_style_by_tag = {
            'strong': BOLD_STYLE_NAME,
            'b': BOLD_STYLE_NAME,
            'em': ITALIC_STYLE_NAME,
            'i': ITALIC_STYLE_NAME,
            'ins': UNDERLINE_STYLE_NAME,
            'u': UNDERLINE_STYLE_NAME,
        }
        span_style = span_style_by_tag.get(tag)
        if span_style is not None:
            span = Span(stylename=span_style)
            for child in node.contents:
                for inline in self.process_inline(child):
                    add_child(span, inline)
            return [span]

        if tag == "code":
            # NOTE(review): the "Code" style is not registered by set_styles
            # in this file; confirm it is provided elsewhere.
            return [Span(stylename="Code", text=node.get_text())]

        if tag == "a":
            href = node.get("href")
            text = node.get_text()
            if not href:
                # An anchor without a target cannot become a link (href
                # would be passed to odfpy as None); keep only its text.
                return [text] if text else []
            return [A(href=href, text=text)]

        if tag == "br":
            # Keep explicit line breaks instead of dropping them: <br> has no
            # children, so the generic branch below would return nothing.
            return [LineBreak()]

        if tag == "img":
            return [self.process_img(node)]

        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result

    def process_block(self, elem):
        """Convert one block-level HTML tag into ODF content.

        Returns either a single ODF element or a flat list of elements
        (blockquotes, images, and paragraphs that contain an image return
        lists). Callers must be prepared for both shapes.

        Handled tags: headings, ``p``, ``blockquote``, ``ul``/``ol``, ``pre``,
        ``hr``, ``table``, ``img`` and ``br``. Any other tag becomes a plain
        paragraph holding the tag's text.
        """
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # A heading wrapped in <p> (directly or inside <center>)
                # replaces the whole paragraph.
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem

                if child.name:
                    if child.name.lower() == "img":
                        has_image = True
                    if child.name.lower() == "center":
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            for child in elem.contents:
                for inline in self.process_inline(child):
                    # Images may also arrive from nested tags (e.g. inside
                    # <center>), so test the value itself rather than the
                    # direct-child flag; an ImageWrap must never reach
                    # add_child, which would try to add it as text.
                    if isinstance(inline, ImageWrap):
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        # NOTE(review): inline content following the first
                        # image of a paragraph is not converted - confirm
                        # that this is intended.
                        return [p_elem, *image[1:]]

                    add_child(p_elem, inline)

            return p_elem

        elif tag == "blockquote":
            # Each non-blank child becomes one quotation paragraph; inline
            # formatting inside the quote is reduced to plain text.
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        elif tag in ("ul", "ol"):
            odf_list = List()
            for li in elem.find_all("li", recursive=False):
                li_item = ListItem()
                p = P()
                captions = []
                for child in li.contents:
                    # Nested lists are attached after the item's paragraph.
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        if isinstance(inline, ImageWrap):
                            # Embed the frame in the item's paragraph and put
                            # the caption paragraph (if any) right after it.
                            frame, *extra = inline.get_elements(self.doc)
                            p.addElement(frame)
                            captions.extend(extra)
                        else:
                            add_child(p, inline)
                li_item.addElement(p)
                for caption in captions:
                    li_item.addElement(caption)

                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        li_item.addElement(self.process_block(child))

                odf_list.addElement(li_item)
            return odf_list

        elif tag == "pre":
            # NOTE(review): the "Preformatted" style is not registered by
            # set_styles in this file; confirm it is provided elsewhere.
            return P(stylename="Preformatted", text=elem.get_text())

        elif tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        elif tag == "table":
            def flatten(items):
                # html_to_odf_elements may return lists among its entries;
                # a cell can only take individual elements.
                for item in items:
                    if isinstance(item, list):
                        yield from flatten(item)
                    else:
                        yield item

            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    for el in flatten(self.html_to_odf_elements(cell_html)):
                        cell_elem.addElement(el)
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        elif tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        elif tag == "br":
            # Block-level results end up inside table cells, where a bare
            # line break is not valid content; wrap it in a paragraph.
            p_elem = P()
            p_elem.addElement(LineBreak())
            return p_elem

        else:
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem

    def try_process_heading(self, elem):
        """Return an ODF heading for an ``h1``-``h6`` tag, otherwise ``None``.

        Nodes without a tag name (text nodes) also yield ``None``. The
        outline level is the digit of the tag name and the heading text is
        the tag's plain text.
        """
        name = elem.name
        if not name:
            return None

        tag = name.lower()
        if tag not in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return None

        return H(outlinelevel=tag[1], text=elem.get_text())

    def process_img(self, elem) -> ImageWrap:
        """Map an ``<img>`` tag to the local copy of its picture.

        The file is looked up in the ``images`` directory next to this
        module's parent directory, under the name ``image_url_to_filename``
        derives from the tag's ``src``. The ``alt`` text becomes the caption.

        Raises:
            ValueError: if the expected local file does not exist.
        """
        source_url = elem.get("src")
        file_name = image_url_to_filename(source_url)
        local_path = os.path.join(os.path.dirname(__file__), '..', 'images', file_name)

        if os.path.exists(local_path):
            return ImageWrap(local_path, elem.get('alt'))

        raise ValueError(f'image {local_path} not found')

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of comparison-row entries.

        Each entry of the returned list is meant to fill one table cell. An
        entry is either a single ODF element or a list of elements that
        belong together in the same cell.

        From the first "sources / further reading" style heading onwards,
        everything up to the next such heading (or the end of the fragment)
        is grouped into one flat list so that the whole section shares a
        single cell.
        """
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        keeping = False
        stack = []
        # Compared against the stripped, lower-cased heading text.
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )
        section_heading_tags = ("h1", "h2", "h3", "h4")

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                result = P()
                result.addText(node)
            elif node.name:
                if (node.name.lower() in section_heading_tags
                        and node.get_text().strip().lower() in ending_headings):
                    # A new closing section starts: emit the previous group.
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
            else:
                continue

            if not keeping:
                elements.append(result)
            elif isinstance(result, list):
                # Keep the group flat: a cell can only take elements, not
                # nested lists.
                stack.extend(result)
            else:
                # Bare text is grouped too, so document order is preserved.
                stack.append(result)

        if stack:
            elements.append(stack)
        return elements

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Build the two-column comparison table and save the document.

        The HTML of *orig* fills the left column and the HTML of *trans* the
        right column, one converted block per row. The shorter side is padded
        with empty paragraphs. With *with_title* set, a first row holding
        both article titles is added.

        The table is named ``ComparisonTable`` and the document is written
        to *output_odt*.
        """
        def fill_cell(cell, content):
            # An entry is one element or an (arbitrarily nested) list of them.
            if isinstance(content, list):
                for item in content:
                    fill_cell(cell, item)
            else:
                cell.addElement(content)

        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # Pad the shorter column so both sides have the same number of rows.
        row_count = max(len(orig_elements), len(trans_elements))
        orig_elements.extend(P() for _ in range(row_count - len(orig_elements)))
        trans_elements.extend(P() for _ in range(row_count - len(trans_elements)))

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            # headings
            header_row = TableRow()
            for article in (orig, trans):
                header_cell = TableCell()
                header_cell.addElement(P(stylename=TITLE_STYLE_NAME, text=article.title))
                header_row.addElement(header_cell)
            main_table.addElement(header_row)

        # content
        for left, right in zip(orig_elements, trans_elements):
            row = TableRow()
            for content in (left, right):
                cell = TableCell()
                fill_cell(cell, content)
                row.addElement(cell)
            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)


class DocumentReader:
    def __init__(self, input_file):
        """Load the ODF document at *input_file* and index its styles.

        The same input is opened twice: once through odfpy's ``load`` for the
        document tree and once as a zip archive, which
        ``get_embedded_image_size`` reads embedded files from.
        """
        self.doc = load(input_file)
        # NOTE(review): this ZipFile is not closed anywhere in the code
        # visible here - confirm the handle's lifetime is acceptable.
        self.package = zipfile.ZipFile(input_file)

        self.style_alignments = self.build_style_alignments(self.doc)  # For paragraph alignment (family="paragraph")
        self.text_style_formats = self.build_text_styles(self.doc)  # For text formatting (family="text")

    def parse_node(self, node, indent=0):
        """Render an ODF node and its subtree as Markdown text.

        Args:
            node: a plain string, a text node, or an element exposing
                ``tagName``, ``attributes``, ``getAttribute`` and
                ``childNodes``.
            indent: current list nesting depth; used to indent list items.

        Returns:
            The Markdown for the node. Headings and paragraphs end with a
            blank line. Elements without a dedicated rule contribute the
            concatenated Markdown of their children.
        """
        if isinstance(node, str):
            return node
        try:
            if node.nodeType == node.TEXT_NODE:
                return node.data
        except AttributeError:
            pass

        def render_children(child_indent):
            return ''.join(self.parse_node(child, child_indent) for child in node.childNodes)

        tag = node.tagName
        if tag == "text:h":
            # Attributes are keyed by (namespace, local name) tuples, so the
            # outline level has to be looked up that way; a prefixed string
            # key never matches and every heading would become level 1.
            raw_level = node.attributes.get(
                ('urn:oasis:names:tc:opendocument:xmlns:text:1.0', 'outline-level'), "1")
            try:
                level = int(raw_level)
            except (TypeError, ValueError):
                level = 1
            # Markdown has heading levels 1..6 only.
            level = max(1, min(level, 6))
            content = render_children(indent)
            return f'{"#" * level} {content}\n\n'

        if tag == 'text:p':
            # Paragraphs without a style name yield None; normalise to ''
            # so the string checks below cannot fail.
            style = node.getAttribute('stylename') or ''
            content = render_children(indent)

            if style:
                style_align = self.style_alignments.get(style, '')
                if style_align == 'center' or style.lower() == 'center':
                    return f"<center>{content}</center>\n\n"

            if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
                lines = content.splitlines()
                content = "\n".join(["> " + line for line in lines])

            return content + "\n\n"

        elif tag == "text:list":
            return render_children(indent) + "\n"

        elif tag == "text:list-item":
            item_text = render_children(indent + 1)
            lines = item_text.splitlines()
            if lines:
                prefix = "    " * indent + "- "
                new_lines = [prefix + lines[0]]
                # NOTE(review): lines coming from a nested list already carry
                # their own indentation and are indented again here - confirm
                # the resulting nesting depth is what consumers expect.
                for line in lines[1:]:
                    new_lines.append("    " * (indent + 1) + line)
                return "\n".join(new_lines) + "\n"
            return ""

        elif tag == "text:span":
            style_name = node.getAttribute("stylename") or ""
            content = render_children(indent)
            fmt = self.text_style_formats.get(style_name, {})
            md_text = content
            if fmt.get("bold") and fmt.get("italic"):
                md_text = f"***{md_text}***"
            else:
                if fmt.get("bold"):
                    md_text = f"**{md_text}**"
                if fmt.get("italic"):
                    md_text = f"*{md_text}*"
            if fmt.get("underline"):
                md_text = f"<u>{md_text}</u>"
            return md_text

        elif tag == "text:a":
            href = node.getAttribute("href")
            content = render_children(indent)
            if href:
                return f"[{content}]({href})"
            return content

        elif tag == "text:line-break":
            # Two trailing spaces are Markdown's hard line break.
            return "  \n"

        elif tag == "draw:frame":
            md = ""
            caption_text = ""
            for child in node.childNodes:
                if hasattr(child, "tagName"):
                    if child.tagName == "draw:image":
                        href = child.attributes.get(('http://www.w3.org/1999/xlink', 'href'))
                        md += f"![]({href})"
                    elif child.tagName == "draw:caption":
                        caption_text = ''.join([self.parse_node(c, indent) for c in child.childNodes]).strip()
            if caption_text:
                md += "\n" + caption_text + "\n"
            return md

        else:
            return render_children(indent)

    def get_embedded_image_size(self, file_name) -> tuple[int, int]:
        """Return the ``(width, height)`` in pixels of an embedded picture.

        *file_name* is the member name inside the document's zip package.
        The image is opened only to read its size and is closed again before
        returning.
        """
        data = self.package.read(file_name)
        with PILImage.open(BytesIO(data)) as img:
            return img.size

    def get_markdown(self, column=1) -> str:
        """Return one column of the comparison table as Markdown.

        Args:
            column: index of the cell to read in each row (0 is the first
                cell, 1 the second).

        Returns:
            The Markdown of the selected cell of every row that has at least
            two cells, joined by blank lines.

        Raises:
            RuntimeError: if the document has no table named
                ``ComparisonTable``.
        """
        comp_table = None
        for tbl in self.doc.getElementsByType(Table):
            if tbl.getAttribute("name") == "ComparisonTable":
                comp_table = tbl
                break
        if comp_table is None:
            raise RuntimeError("ComparisonTable not found in the document.")

        def owning_table(node):
            # Nearest enclosing table element of *node*, or None.
            parent = getattr(node, "parentNode", None)
            while parent is not None and getattr(parent, "tagName", None) != "table:table":
                parent = getattr(parent, "parentNode", None)
            return parent

        md_lines = []
        for row in comp_table.getElementsByType(TableRow):
            # getElementsByType also returns rows of tables nested inside
            # cells; only rows of the comparison table itself are wanted.
            if owning_table(row) is not comp_table:
                continue
            # Likewise, only the row's own cells count as columns.
            cells = [
                child for child in row.childNodes
                if getattr(child, "tagName", None) == "table:table-cell"
            ]
            if len(cells) >= 2:
                selected_cell = cells[column]
                cell_md = "".join(self.parse_node(child) for child in selected_cell.childNodes)
                # Remove any extra whitespace.
                md_lines.append(cell_md.strip())

        return "\n\n".join(md_lines)

    @staticmethod
    def build_style_alignments(doc):
        """Map paragraph style names to their text alignment.

        Looks at the automatic and the common styles of *doc*. Only
        paragraph-family styles whose first paragraph-properties element has
        a non-empty ``fo:text-align`` are included; values are lower-cased
        and stripped.
        """
        text_align_key = (
            'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'text-align')

        result = {}
        for container in (doc.automaticstyles, doc.styles):
            for style in container.getElementsByType(Style):
                if style.getAttribute('family') != 'paragraph':
                    continue
                name = style.getAttribute('name')
                paragraph_props = style.getElementsByType(ParagraphProperties)
                if not paragraph_props:
                    continue
                alignment = paragraph_props[0].attributes.get(text_align_key, '')
                if alignment:
                    result[name] = alignment.lower().strip()
        return result

    @staticmethod
    def build_text_styles(doc):
        """Map text style names to their bold / italic / underline flags.

        Looks at the automatic and the common styles of *doc*. Only
        text-family styles that have a text-properties element are included.
        Each value is a dict with the boolean keys ``bold``, ``italic`` and
        ``underline``.
        """
        fo_ns = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
        style_ns = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'

        result = {}
        for container in (doc.automaticstyles, doc.styles):
            for style in container.getElementsByType(Style):
                if style.getAttribute('family') != 'text':
                    continue
                name = style.getAttribute('name')
                property_elements = style.getElementsByType(TextProperties)
                if not property_elements:
                    continue
                props = property_elements[0].attributes

                weight = props.get((fo_ns, 'font-weight'), '').lower()
                is_bold = 'bold' in weight or weight in ('700', '800', '900')

                slant = props.get((fo_ns, 'font-style'), '').lower()
                is_italic = 'italic' in slant

                underline_style = props.get((style_ns, 'text-underline-style'), '').lower()
                is_underlined = underline_style not in ('', 'none')

                result[name] = {
                    'bold': is_bold,
                    'italic': is_italic,
                    'underline': is_underlined,
                }
        return result