425 lines
14 KiB
Python
425 lines
14 KiB
Python
import os.path
|
|
|
|
from odf.opendocument import OpenDocumentText
|
|
from odf.text import P, H, Span, A, LineBreak, List, ListItem
|
|
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
|
|
from odf.table import TableColumn, TableCell, TableRow, Table
|
|
from odf.draw import Frame, Image
|
|
|
|
from PIL import Image as PILImage
|
|
|
|
from bs4 import BeautifulSoup
|
|
from idb import Article
|
|
from idb.util import image_url_to_filename
|
|
|
|
# --- Page set-up names -------------------------------------------------------
PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'

# --- Character (text-family) style names -------------------------------------
BOLD_STYLE_NAME = 'Bold'
ITALIC_STYLE_NAME = 'Italic'
UNDERLINE_STYLE_NAME = 'Underline'

# --- Paragraph-family style names --------------------------------------------
TITLE_STYLE_NAME = 'Title'
CAPTION_STYLE_NAME = 'Caption'
CENTER_STYLE_NAME = 'CenterAligned'
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
|
|
|
|
|
|
def add_child(parent, child):
    """Attach *child* to *parent* as an element or as plain text.

    Anything exposing a ``qname`` attribute is handed to
    ``parent.addElement``; everything else goes to ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
|
|
|
|
|
|
def calc_frame_dimensions(image_path, desired_width_cm):
    """Return frame dimensions that keep the image's aspect ratio.

    Args:
        image_path: Path of the image file; opened with PIL only to read
            its pixel size.
        desired_width_cm: Target frame width in centimetres.

    Returns:
        A ``(width, height)`` tuple of strings, each a number followed by
        ``cm`` (for example ``("13.5cm", "7.59375cm")``).
    """
    with PILImage.open(image_path) as img:
        width_px, height_px = img.size

    # Only the aspect ratio matters here: converting both pixel dimensions to
    # centimetres with one fixed DPI and then rescaling to the desired width
    # cancels the DPI factor out, so the height is computed directly.
    height_cm = desired_width_cm * height_px / width_px

    return f"{desired_width_cm}cm", f"{height_cm}cm"
|
|
|
|
|
|
class ImageWrap:
    """An image file paired with an optional caption, not yet attached to a document."""

    def __init__(self, image_file, caption):
        self.image_file, self.caption = image_file, caption

    def get_elements(self, doc):
        """Embed the image in *doc* and return the elements that display it.

        The picture is registered through ``doc.addPicture`` and wrapped in a
        13.5 cm wide frame whose height comes from ``calc_frame_dimensions``.

        Returns:
            ``[frame]`` when the caption is empty or ``None``; otherwise
            ``[frame, caption_paragraph]``, the paragraph using the caption
            style.
        """
        picture_ref = doc.addPicture(self.image_file)
        frame_width, frame_height = calc_frame_dimensions(self.image_file, 13.5)

        frame = Frame(width=frame_width, height=frame_height)
        frame.addElement(
            Image(href=picture_ref, type="simple", show="embed", actuate="onLoad")
        )

        if not self.caption:
            return [frame]

        caption_paragraph = P(stylename=CAPTION_STYLE_NAME)
        caption_paragraph.addText(self.caption)
        return [frame, caption_paragraph]
|
|
|
|
|
|
class DocumentCreator:
    """Build an ODT file with a two-column table: original article vs. translation.

    Each article's HTML is converted to a sequence of ODF block elements;
    the i-th block of the original and the i-th block of the translation
    share one table row.
    """

    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")

    # Heading levels that may open a trailing "sources / further reading" section.
    _SECTION_HEADING_TAGS = ("h1", "h2", "h3", "h4")

    # Lower-cased heading texts that open such a trailing section.
    _ENDING_HEADINGS = (
        'дополнительное чтение',
        'основные источники',
        'источники',
        'дополнительные материалы',
        'additional reading',
        'further reading',
        'main sources',
        'additional sources',
    )

    def __init__(self):
        self.doc = OpenDocumentText()
        self.set_styles()

    def set_styles(self):
        """Register the page layout, master page and all named styles on ``self.doc``.

        NOTE(review): the "Code" and "Preformatted" style names used by
        ``process_inline`` / ``process_block`` are not registered here.
        """
        doc = self.doc

        # A4 landscape page with 1 cm margins.
        page_layout = PageLayout(name=PAGE_LAYOUT_NAME)
        page_layout.addElement(PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm",
        ))
        doc.automaticstyles.addElement(page_layout)

        doc.masterstyles.addElement(
            MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        )

        # bold
        style = Style(name=BOLD_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold",
        }))
        doc.automaticstyles.addElement(style)

        # italic
        style = Style(name=ITALIC_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
        }))
        doc.automaticstyles.addElement(style)

        # caption
        style = Style(name=CAPTION_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
            'fontsize': '10pt',
            'color': '#777777',
        }))
        style.addElement(ParagraphProperties(
            textalign="center", margintop='0.15cm', marginbottom='0.15cm'
        ))
        doc.automaticstyles.addElement(style)

        # underline
        style = Style(name=UNDERLINE_STYLE_NAME, family="text")
        style.addElement(TextProperties(attributes={
            'textunderlinestyle': "solid",
            'textunderlinewidth': "auto",
        }))
        doc.automaticstyles.addElement(style)

        # blockquote (registered as a common style, not an automatic one)
        style = Style(name=BLOCKQUOTE_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(attributes={
            'marginleft': '0.6cm',
            'margintop': '0.15cm',
            'marginbottom': '0.15cm',
        }))
        style.addElement(TextProperties(attributes={'color': '#378A62'}))
        doc.styles.addElement(style)

        # title (registered as a common style, not an automatic one)
        style = Style(name=TITLE_STYLE_NAME, family="paragraph")
        style.addElement(TextProperties(attributes={
            'fontsize': '20pt',
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold",
        }))
        style.addElement(ParagraphProperties(textalign='center'))
        doc.styles.addElement(style)

        # centered text
        style = Style(name=CENTER_STYLE_NAME, family="paragraph")
        style.addElement(ParagraphProperties(textalign="center"))
        doc.automaticstyles.addElement(style)

    @staticmethod
    def _flatten(item):
        """Yield the non-list leaves of *item*, descending into nested lists.

        Block conversion may return a single element, a list of elements, or
        (for the grouped trailing section) a list that itself contains lists.
        """
        if isinstance(item, list):
            for sub_item in item:
                yield from DocumentCreator._flatten(sub_item)
        else:
            yield item

    def _attach_inline(self, parent, inline):
        """Attach one ``process_inline`` result to *parent*.

        An ``ImageWrap`` is embedded as its frame element instead of being
        handed to ``add_child`` (which would treat it as text).

        Returns:
            The image's caption paragraph when *inline* is an ``ImageWrap``
            that has one, otherwise ``None``.
        """
        if isinstance(inline, ImageWrap):
            parts = inline.get_elements(self.doc)
            parent.addElement(parts[0])
            return parts[1] if len(parts) > 1 else None
        add_child(parent, inline)
        return None

    def _styled_span(self, node, style_name):
        """Return a ``Span`` with *style_name* holding the converted children of *node*."""
        span = Span(stylename=style_name)
        for child in node.contents:
            for inline in self.process_inline(child):
                # A caption paragraph cannot live inside a span, so the
                # caption of an image nested here is left out.
                self._attach_inline(span, inline)
        return span

    def _filled_cell(self, content):
        """Return a new ``TableCell`` containing every element of *content*, flattened."""
        cell = TableCell()
        for element in self._flatten(content):
            cell.addElement(element)
        return cell

    def process_inline(self, node):
        """Convert an inline HTML node into a list of inline results.

        Args:
            node: A string (returned unchanged) or a parsed tag.

        Returns:
            A list whose items are strings, ODF elements, or ``ImageWrap``
            objects (for ``<img>``). Unknown tags are replaced by the
            flattened results of their children.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()

        if tag in ("strong", "b"):
            return [self._styled_span(node, BOLD_STYLE_NAME)]
        if tag in ("em", "i"):
            return [self._styled_span(node, ITALIC_STYLE_NAME)]
        if tag in ("ins", "u"):
            return [self._styled_span(node, UNDERLINE_STYLE_NAME)]

        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]

        if tag == "br":
            # Same mapping process_block uses; without it the tag has no
            # children and the line break would vanish from the paragraph.
            return [LineBreak()]

        if tag == "img":
            return [self.process_img(node)]

        if tag == "a":
            href = node.get("href")
            if href:
                return [A(href=href, text=node.get_text())]
            # An anchor without a target is not a link: fall through and
            # keep only its inline content.

        flattened = []
        for child in node.contents:
            flattened.extend(self.process_inline(child))
        return flattened

    def process_block(self, elem):
        """Convert a block-level HTML tag into ODF content.

        Returns:
            A single ODF element, or a list of elements for block quotes,
            images, and paragraphs that contain an image (the paragraph
            followed by the image caption paragraphs).
        """
        heading = self.try_process_heading(elem)
        if heading is not None:
            return heading

        tag = elem.name.lower()

        if tag == "p":
            return self._process_paragraph(elem)

        if tag == "blockquote":
            quoted = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                quoted.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return quoted

        if tag in ("ul", "ol"):
            return self._process_list(elem)

        if tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())

        if tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')

        if tag == "table":
            return self._process_table(elem)

        if tag == "img":
            return self.process_img(elem).get_elements(self.doc)

        if tag == "br":
            # Block results are placed straight into table cells; a line
            # break is inline content, so give it a paragraph to live in.
            paragraph = P()
            paragraph.addElement(LineBreak())
            return paragraph

        fallback = P()
        fallback.addText(elem.get_text())
        return fallback

    def _process_paragraph(self, elem):
        """Convert a ``<p>`` tag; see ``process_block`` for the return shape."""
        centered = False
        for child in elem.contents:
            # A heading wrapped in a paragraph replaces the whole paragraph.
            heading = self.try_process_heading(child)
            if heading is not None:
                return heading

            if child.name and child.name.lower() == "center":
                for grandchild in child.contents:
                    heading = self.try_process_heading(grandchild)
                    if heading is not None:
                        return heading
                centered = True
                break

        inlines = [
            inline
            for child in elem.contents
            for inline in self.process_inline(child)
        ]
        has_image = any(isinstance(inline, ImageWrap) for inline in inlines)

        if centered or has_image:
            paragraph = P(stylename=CENTER_STYLE_NAME)
        else:
            paragraph = P()

        # Keep everything in the paragraph, including content that follows
        # an image; captions are emitted as separate paragraphs after it.
        captions = []
        for inline in inlines:
            caption = self._attach_inline(paragraph, inline)
            if caption is not None:
                captions.append(caption)

        if has_image:
            return [paragraph, *captions]
        return paragraph

    def _process_list(self, elem):
        """Convert a ``<ul>``/``<ol>`` tag (and lists nested in its items) to an ODF ``List``."""
        odf_list = List()
        for li in elem.find_all("li", recursive=False):
            item = ListItem()
            paragraph = P()
            captions = []
            nested = []
            for child in li.contents:
                if hasattr(child, "name") and child.name in ("ul", "ol"):
                    # Nested lists are appended after the item's own text.
                    nested.append(child)
                    continue
                for inline in self.process_inline(child):
                    caption = self._attach_inline(paragraph, inline)
                    if caption is not None:
                        captions.append(caption)

            item.addElement(paragraph)
            for caption in captions:
                item.addElement(caption)
            for child in nested:
                item.addElement(self.process_block(child))

            odf_list.addElement(item)
        return odf_list

    def _process_table(self, elem):
        """Convert a ``<table>`` tag to an ODF ``Table``, one cell per ``<th>``/``<td>``."""
        odf_table = Table()
        for tr in elem.find_all("tr"):
            # Rows of a nested table belong to that table; it is converted
            # when the cell containing it is processed.
            if tr.find_parent("table") is not elem:
                continue
            row = TableRow()
            for cell in tr.find_all(["th", "td"], recursive=False):
                cell_html = "".join(str(child) for child in cell.contents)
                row.addElement(
                    self._filled_cell(self.html_to_odf_elements(cell_html))
                )
            odf_table.addElement(row)
        return odf_table

    def try_process_heading(self, elem):
        """Return an ``H`` element if *elem* is an ``h1``–``h6`` tag, else ``None``.

        The outline level is the digit of the tag name; the heading text is
        the tag's plain text.
        """
        if not elem.name:
            return None

        tag = elem.name.lower()
        if tag in self._HEADING_TAGS:
            return H(outlinelevel=tag[1], text=elem.get_text())
        return None

    def process_img(self, elem) -> ImageWrap:
        """Resolve an ``<img>`` tag to a locally stored file.

        The file is looked up in the ``images`` directory next to this
        module's parent directory, under the name produced by
        ``image_url_to_filename`` for the tag's ``src``.

        Returns:
            An ``ImageWrap`` with the file path and the tag's ``alt`` text.

        Raises:
            ValueError: If the expected file does not exist.
        """
        href = elem.get("src")
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')

        alt = elem.get('alt')
        return ImageWrap(saved_file, alt)

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of row-sized ODF results.

        Each top-level node yields one entry (an element or a list of
        elements). Once a heading whose text is one of the known
        "sources / further reading" titles is seen, that heading and every
        following node are collected into a single list entry, so the whole
        trailing section occupies one row; a further such heading starts a
        new group.

        Returns:
            A list whose entries are ODF elements or (possibly nested)
            lists of ODF elements.
        """
        soup = BeautifulSoup(html, "html.parser")
        top_nodes = soup.body.contents if soup.body else soup.contents

        elements = []
        group = []
        grouping = False

        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                block = P()
                block.addText(node)
            elif node.name:
                opens_section = (
                    node.name.lower() in self._SECTION_HEADING_TAGS
                    and node.get_text().strip().lower() in self._ENDING_HEADINGS
                )
                if opens_section:
                    if group:
                        elements.append(group)
                        group = []
                    grouping = True
                block = self.process_block(node)
            else:
                continue

            # Bare text goes through the same routing as tags so that it
            # keeps its position relative to a grouped trailing section.
            if grouping:
                group.append(block)
            else:
                elements.append(block)

        if group:
            elements.append(group)
        return elements

    def create(self,
               orig: Article,
               trans: Article,
               output_odt: str,
               with_title=False):
        """Write the side-by-side comparison document to *output_odt*.

        Args:
            orig: Article shown in the left column; its ``html`` (and, with
                *with_title*, its ``title``) is read.
            trans: Article shown in the right column; same attributes.
            output_odt: Destination passed to the document's ``save``.
            with_title: When true, a first row holding both titles is added.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # Pad the shorter side with empty paragraphs so rows pair up.
        row_count = max(len(orig_elements), len(trans_elements))
        orig_elements.extend(P() for _ in range(row_count - len(orig_elements)))
        trans_elements.extend(P() for _ in range(row_count - len(trans_elements)))

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            # headings
            header_row = TableRow()
            for title in (orig.title, trans.title):
                header_cell = TableCell()
                header_cell.addElement(P(stylename=TITLE_STYLE_NAME, text=title))
                header_row.addElement(header_cell)
            main_table.addElement(header_row)

        # content
        for left, right in zip(orig_elements, trans_elements):
            row = TableRow()
            row.addElement(self._filled_cell(left))
            row.addElement(self._filled_cell(right))
            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)
|