604 lines
21 KiB
Python
604 lines
21 KiB
Python
import os.path
|
|
import zipfile
|
|
|
|
from odf.opendocument import OpenDocumentText, load
|
|
from odf.text import P, H, Span, A, LineBreak, List, ListItem
|
|
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
|
|
from odf.table import TableColumn, TableCell, TableRow, Table
|
|
from odf.draw import Frame, Image
|
|
|
|
from PIL import Image as PILImage
|
|
from io import BytesIO
|
|
|
|
from bs4 import BeautifulSoup
|
|
from idb import Article
|
|
from idb.util import image_url_to_filename
|
|
|
|
# Page-level style names.
PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'

# Paragraph-family style names.
TITLE_STYLE_NAME = 'Title'
CAPTION_STYLE_NAME = 'Caption'
CENTER_STYLE_NAME = 'CenterAligned'
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'

# Text-family (character) style names.
BOLD_STYLE_NAME = 'Bold'
ITALIC_STYLE_NAME = 'Italic'
UNDERLINE_STYLE_NAME = 'Underline'
|
|
|
|
|
|
def add_child(parent, child):
    """Attach *child* to *parent* as an element or as text.

    Anything exposing a ``qname`` attribute is handed to
    ``parent.addElement``; every other value is handed to ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
|
|
|
|
|
|
def calc_frame_dimensions(image_path, desired_width_cm):
    """Return ``(width, height)`` strings in cm for a frame of the given width.

    The image at *image_path* is opened with PIL only to read its pixel size.
    Pixels are converted to centimetres at a fixed 96 dpi and the height is
    scaled by the same factor that maps the image width to
    *desired_width_cm*.
    """
    with PILImage.open(image_path) as picture:
        px_width, px_height = picture.size

    assumed_dpi = 96.0

    def px_to_cm(pixels):
        return (pixels / assumed_dpi) * 2.54

    natural_width_cm = px_to_cm(px_width)
    natural_height_cm = px_to_cm(px_height)

    ratio = desired_width_cm / natural_width_cm
    scaled_height_cm = natural_height_cm * ratio

    return f"{desired_width_cm}cm", f"{scaled_height_cm}cm"
|
|
|
|
|
|
class ImageWrap:
    """Pairs an image file path with an optional caption.

    Instances are produced while walking inline HTML and are turned into ODF
    elements later, once the target document is known.
    """

    def __init__(self, image_file, caption):
        self.image_file = image_file
        self.caption = caption

    def get_elements(self, doc):
        """Embed the image in *doc* and return the ODF elements that show it.

        Returns a list holding the picture frame and, when a caption is set,
        a caption-styled paragraph after it.
        """
        picture_href = doc.addPicture(self.image_file)

        frame_width_cm = 13.5
        width, height = calc_frame_dimensions(self.image_file, frame_width_cm)

        frame = Frame(width=width, height=height)
        frame.addElement(
            Image(href=picture_href, type="simple", show="embed", actuate="onLoad")
        )

        if not self.caption:
            return [frame]

        caption_paragraph = P(stylename=CAPTION_STYLE_NAME)
        caption_paragraph.addText(self.caption)
        return [frame, caption_paragraph]
|
|
|
|
|
|
class DocumentCreator:
    """Build an ODT that shows an original article beside its translation.

    The document uses a landscape page layout and contains one table named
    ``ComparisonTable``. Each row pairs a block of the original (left cell)
    with the block at the same position in the translation (right cell).
    """

    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    @staticmethod
    def _flatten(item):
        """Yield the ODF elements in *item*, descending into nested lists."""
        if isinstance(item, list):
            for sub_item in item:
                yield from DocumentCreator._flatten(sub_item)
        else:
            yield item

    def _fill_cell(self, cell, content):
        """Add *content* (one element or a possibly nested list) to *cell*."""
        for element in self._flatten(content):
            cell.addElement(element)

    def set_styles(self):
        """Register the page layout and every named style used by this class."""
        landscape_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        landscape_props = PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm"
        )
        landscape_layout.addElement(landscape_props)
        self.doc.automaticstyles.addElement(landscape_layout)

        # Use the module constant rather than repeating its literal value.
        masterpage = MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        self.doc.masterstyles.addElement(masterpage)

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        self.doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic"
        }))
        self.doc.automaticstyles.addElement(style)

        # caption
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777'
        }))
        style.addElement(ParagraphProperties(textalign="center", margintop='0.15cm', marginbottom='0.15cm'))
        self.doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto"
        }))
        self.doc.automaticstyles.addElement(style)

        # blockquote
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        self.doc.styles.addElement(style)

        # title
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold"
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        self.doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        self.doc.automaticstyles.addElement(style)

    def process_inline(self, node):
        """Convert an inline HTML node into a list of inline pieces.

        Each piece is a plain string, an ODF element, or an ``ImageWrap``
        (for ``<img>``). Tags without a dedicated rule are transparent: their
        children are converted and returned in order.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()

        span_styles = {
            'strong': BOLD_STYLE_NAME,
            'b': BOLD_STYLE_NAME,
            'em': ITALIC_STYLE_NAME,
            'i': ITALIC_STYLE_NAME,
            'ins': UNDERLINE_STYLE_NAME,
            'u': UNDERLINE_STYLE_NAME,
        }
        if tag in span_styles:
            span = Span(stylename=span_styles[tag])
            for child in node.contents:
                for inline in self.process_inline(child):
                    add_child(span, inline)
            return [span]

        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]

        if tag == "a":
            return [A(href=node.get("href"), text=node.get_text())]

        if tag == "img":
            return [self.process_img(node)]

        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result

    def process_block(self, elem):
        """Convert a block-level HTML element into ODF content.

        Returns a single ODF element, or a list of elements for blockquotes,
        images and paragraphs that contain an image.
        """
        h_elem = self.try_process_heading(elem)
        if h_elem is not None:
            return h_elem

        tag = elem.name.lower()
        if tag == "p":
            is_centered = False
            has_image = False
            for child in elem.contents:
                # A heading directly inside the paragraph replaces the
                # whole paragraph.
                h_elem = self.try_process_heading(child)
                if h_elem is not None:
                    return h_elem

                if child.name:
                    child_tag = child.name.lower()
                    if child_tag == "img":
                        has_image = True
                    if child_tag == "center":
                        # Same rule for a heading wrapped in <center>.
                        for cchild in child.contents:
                            h_elem = self.try_process_heading(cchild)
                            if h_elem is not None:
                                return h_elem
                        is_centered = True
                        break

            if is_centered or has_image:
                p_elem = P(stylename=CENTER_STYLE_NAME)
            else:
                p_elem = P()

            for child in elem.contents:
                for inline in self.process_inline(child):
                    if has_image and isinstance(inline, ImageWrap):
                        # The paragraph ends at its first image: the frame
                        # goes into the paragraph and the caption, if any,
                        # follows as a separate paragraph.
                        image = inline.get_elements(self.doc)
                        p_elem.addElement(image[0])
                        elems = [p_elem]
                        if len(image) == 2:
                            elems.append(image[1])
                        return elems

                    add_child(p_elem, inline)

            return p_elem

        elif tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items

        elif tag in ("ul", "ol"):
            odf_list = List()
            for li in elem.find_all("li", recursive=False):
                li_item = ListItem()
                p = P()
                for child in li.contents:
                    # Nested lists are added after the item's own paragraph.
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        continue
                    for inline in self.process_inline(child):
                        add_child(p, inline)
                li_item.addElement(p)

                # process nested lists
                for child in li.contents:
                    if hasattr(child, "name") and child.name in ("ul", "ol"):
                        li_item.addElement(self.process_block(child))

                odf_list.addElement(li_item)
            return odf_list

        elif tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())

        elif tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        elif tag == "table":
            odf_table = Table()
            for tr in elem.find_all("tr"):
                row = TableRow()
                for cell in tr.find_all(["th", "td"]):
                    cell_elem = TableCell()
                    cell_html = "".join(str(child) for child in cell.contents)
                    # Converted content may contain lists of elements;
                    # addElement only accepts single elements.
                    self._fill_cell(cell_elem, self.html_to_odf_elements(cell_html))
                    row.addElement(cell_elem)
                odf_table.addElement(row)
            return odf_table

        elif tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        elif tag == "br":
            # Block results are placed in table cells, where a bare
            # text:line-break is not a legal child, so wrap it in a paragraph.
            br_paragraph = P()
            br_paragraph.addElement(LineBreak())
            return br_paragraph

        else:
            p_elem = P()
            p_elem.addText(elem.get_text())
            return p_elem

    def try_process_heading(self, elem):
        """Return an ODF heading for ``<h1>``..``<h6>``, otherwise ``None``."""
        if not elem.name:
            return None

        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())
        return None

    def process_img(self, elem) -> ImageWrap:
        """Resolve an ``<img>`` to its locally saved file.

        The file is looked up in the ``images`` directory next to this
        module's parent directory, under the name produced by
        ``image_url_to_filename`` for the ``src`` attribute.

        Raises:
            ValueError: if that file does not exist.
        """
        href = elem.get("src")
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')

        alt = elem.get('alt')
        return ImageWrap(saved_file, alt)

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of table-row payloads.

        Each entry is either one ODF element or a list of elements that
        belong in the same cell. From the first "ending" heading (sources,
        further reading, ...) onward, everything up to the next such heading
        is grouped into a single flat list.
        """
        soup = BeautifulSoup(html, "html.parser")
        elements = []
        top_nodes = soup.body.contents if soup.body else soup.contents

        keeping = False
        stack = []
        ending_headings = (
            'дополнительное чтение',
            'основные источники',
            'источники',
            'дополнительные материалы',
            'additional reading',
            'further reading',
            'main sources',
            'additional sources',
        )

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                result = P()
                result.addText(node)
            elif node.name:
                is_ending_heading = (
                    node.name.lower() in ("h1", "h2", "h3", "h4")
                    and node.get_text().strip().lower() in ending_headings
                )
                if is_ending_heading:
                    # Close the previous group, if any, and start a new one.
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
            else:
                continue

            if keeping:
                # Keep the group flat so every entry is a single element,
                # and keep bare text in document order with the group.
                stack.extend(self._flatten(result))
            else:
                elements.append(result)

        if stack:
            elements.append(stack)
        return elements

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Write the side-by-side comparison document to *output_odt*.

        Args:
            orig: article whose ``html`` fills the left column.
            trans: article whose ``html`` fills the right column.
            output_odt: path passed to the document's ``save``.
            with_title: when true, add a first row holding both ``title``
                values in the title style.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # Pad the shorter side with empty paragraphs so rows pair up. A fresh
        # P() per slot, because an element can only live in one parent.
        max_len = max(len(orig_elements), len(trans_elements))
        orig_elements.extend(P() for _ in range(max_len - len(orig_elements)))
        trans_elements.extend(P() for _ in range(max_len - len(trans_elements)))

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            # headings
            header_row = TableRow()

            header_cell_left = TableCell()
            header_cell_right = TableCell()

            header_cell_left.addElement(P(stylename=TITLE_STYLE_NAME, text=orig.title))
            header_cell_right.addElement(P(stylename=TITLE_STYLE_NAME, text=trans.title))

            header_row.addElement(header_cell_left)
            header_row.addElement(header_cell_right)

            main_table.addElement(header_row)

        # content
        for orig_content, trans_content in zip(orig_elements, trans_elements):
            row = TableRow()

            cell_orig = TableCell()
            cell_trans = TableCell()

            self._fill_cell(cell_orig, orig_content)
            self._fill_cell(cell_trans, trans_content)

            row.addElement(cell_orig)
            row.addElement(cell_trans)

            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
|
|
|
|
|
|
class DocumentReader:
    """Read a comparison ODT back and render one of its columns as Markdown.

    The reader keeps the ODT package open as a zip archive for
    ``get_embedded_image_size``. Call ``close()`` or use the instance as a
    context manager to release it.
    """

    _FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
    _STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'

    def __init__(self, input_file):
        self.doc = load(input_file)
        self.package = zipfile.ZipFile(input_file)

        # Paragraph style name -> text alignment (family="paragraph").
        self.style_alignments = self.build_style_alignments(self.doc)
        # Text style name -> bold/italic/underline flags (family="text").
        self.text_style_formats = self.build_text_styles(self.doc)

    def close(self):
        """Close the zip archive opened by the constructor."""
        self.package.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def parse_node(self, node, indent=0):
        """Render an ODF node, and everything below it, as Markdown text.

        Args:
            node: a plain string, a text node, or an element exposing
                ``tagName``, ``childNodes`` and ``getAttribute``.
            indent: list nesting depth, used to indent list items.
        """
        if isinstance(node, str):
            return node
        try:
            if node.nodeType == node.TEXT_NODE:
                return node.data
        except AttributeError:
            pass

        tag = node.tagName
        if tag == "text:h":
            # Element attributes are keyed by (namespace, localname), so a
            # "text:outline-level" string key never matches; go through
            # getAttribute like the other branches do.
            level_str = node.getAttribute("outlinelevel") or "1"
            try:
                level = int(level_str)
            except (TypeError, ValueError):
                level = 1
            level = max(1, min(level, 6))
            content = self._children_markdown(node, indent)
            return f'{"#" * level} {content}\n\n'

        if tag == 'text:p':
            style = node.getAttribute('stylename')
            content = self._children_markdown(node, indent)

            if style:
                style_align = self.style_alignments.get(style, '')
                if style_align == 'center' or style.lower() == 'center':
                    return f"<center>{content}</center>\n\n"

                if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
                    content = "\n".join("> " + line for line in content.splitlines())

            return content + "\n\n"

        if tag == "text:list":
            return self._children_markdown(node, indent) + "\n"

        if tag == "text:list-item":
            item_text = self._children_markdown(node, indent + 1)
            lines = item_text.splitlines()
            if not lines:
                return ""
            new_lines = [" " * indent + "- " + lines[0]]
            continuation = " " * (indent + 1)
            new_lines.extend(continuation + line for line in lines[1:])
            return "\n".join(new_lines) + "\n"

        if tag == "text:span":
            style_name = node.getAttribute("stylename") or ""
            md_text = self._children_markdown(node, indent)
            fmt = self.text_style_formats.get(style_name, {})
            if fmt.get("bold") and fmt.get("italic"):
                md_text = f"***{md_text}***"
            elif fmt.get("bold"):
                md_text = f"**{md_text}**"
            elif fmt.get("italic"):
                md_text = f"*{md_text}*"
            if fmt.get("underline"):
                md_text = f"<u>{md_text}</u>"
            return md_text

        if tag == "text:a":
            href = node.getAttribute("href")
            content = self._children_markdown(node, indent)
            if href:
                return f"[{content}]({href})"
            return content

        if tag == "text:line-break":
            # A Markdown hard line break needs at least two trailing spaces.
            return "  \n"

        if tag == "draw:frame":
            # NOTE(review): the draw:image branch here appended an empty
            # f-string and never used the xlink:href it looked up, so the
            # picture itself produces no output. That is kept as-is; confirm
            # what should be emitted for embedded images.
            caption_text = ""
            for child in node.childNodes:
                if getattr(child, "tagName", None) == "draw:caption":
                    caption_text = self._children_markdown(child, indent).strip()
            if caption_text:
                return "\n" + caption_text + "\n"
            return ""

        return self._children_markdown(node, indent)

    def _children_markdown(self, node, indent):
        """Concatenate the Markdown of every child of *node*."""
        return ''.join(self.parse_node(child, indent) for child in node.childNodes)

    def get_embedded_image_size(self, file_name) -> tuple[int, int]:
        """Return the pixel ``(width, height)`` of a picture in the package."""
        data = self.package.read(file_name)
        with PILImage.open(BytesIO(data)) as img:
            return img.size

    @staticmethod
    def _enclosing_table(node):
        """Return the nearest ``table:table`` ancestor of *node*, or None."""
        parent = getattr(node, "parentNode", None)
        while parent is not None:
            if getattr(parent, "tagName", None) == "table:table":
                return parent
            parent = getattr(parent, "parentNode", None)
        return None

    def get_markdown(self, column=1) -> str:
        """Render one column of ``ComparisonTable`` as Markdown.

        Args:
            column: zero-based cell index to read from each row.

        Raises:
            RuntimeError: if the document has no table named
                ``ComparisonTable``.
        """
        comp_table = None
        for tbl in self.doc.getElementsByType(Table):
            if tbl.getAttribute("name") == "ComparisonTable":
                comp_table = tbl
                break
        if comp_table is None:
            raise RuntimeError("ComparisonTable not found in the document.")

        md_lines = []
        for row in comp_table.getElementsByType(TableRow):
            # getElementsByType searches recursively; skip rows that belong
            # to a table nested inside one of the cells.
            if self._enclosing_table(row) is not comp_table:
                continue
            # Likewise, only cells that are direct children of this row.
            cells = [
                cell for cell in row.getElementsByType(TableCell)
                if cell.parentNode is row
            ]
            if len(cells) >= 2 and len(cells) > column:
                cell_md = ''.join(
                    self.parse_node(child) for child in cells[column].childNodes
                )
                # Remove any extra whitespace.
                md_lines.append(cell_md.strip())

        return "\n\n".join(md_lines)

    @staticmethod
    def build_style_alignments(doc):
        """Map each paragraph style name to its lower-cased ``text-align``.

        Styles without paragraph properties or without a ``text-align``
        value are omitted.
        """
        alignments = {}
        all_styles = [
            *doc.automaticstyles.getElementsByType(Style),
            *doc.styles.getElementsByType(Style),
        ]
        for style in all_styles:
            if style.getAttribute('family') != 'paragraph':
                continue
            para_props = style.getElementsByType(ParagraphProperties)
            if not para_props:
                continue
            attr_val = para_props[0].attributes.get(
                (DocumentReader._FO_NS, 'text-align'), '')
            if attr_val:
                alignments[style.getAttribute('name')] = attr_val.lower().strip()
        return alignments

    @staticmethod
    def build_text_styles(doc):
        """Map each text style name to its bold/italic/underline flags.

        Styles without text properties are omitted.
        """
        text_styles = {}
        all_styles = [
            *doc.automaticstyles.getElementsByType(Style),
            *doc.styles.getElementsByType(Style),
        ]
        for style in all_styles:
            if style.getAttribute('family') != 'text':
                continue
            text_props = style.getElementsByType(TextProperties)
            if not text_props:
                continue
            props = text_props[0].attributes

            fw = props.get((DocumentReader._FO_NS, 'font-weight'), '').lower()
            fs = props.get((DocumentReader._FO_NS, 'font-style'), '').lower()
            tu = props.get((DocumentReader._STYLE_NS, 'text-underline-style'), '').lower()

            text_styles[style.getAttribute('name')] = {
                'bold': 'bold' in fw or fw in ('700', '800', '900'),
                'italic': 'italic' in fs,
                'underline': bool(tu) and tu != 'none',
            }
        return text_styles
|