idb_utils/idb/doc.py
2025-03-06 02:04:06 +03:00

425 lines
14 KiB
Python

import os.path
from odf.opendocument import OpenDocumentText
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image
from PIL import Image as PILImage
from bs4 import BeautifulSoup
from idb import Article
from idb.util import image_url_to_filename
# Names of the page layout and master page registered on the document.
PAGE_LAYOUT_NAME = 'LandscapeLayout'
MASTER_PAGE_NAME = 'Standard'
# Names of the paragraph and text styles referenced while converting HTML.
BLOCKQUOTE_STYLE_NAME = 'Block Quotation'
ITALIC_STYLE_NAME = 'Italic'
BOLD_STYLE_NAME = 'Bold'
CAPTION_STYLE_NAME = 'Caption'
UNDERLINE_STYLE_NAME = 'Underline'
CENTER_STYLE_NAME = 'CenterAligned'
TITLE_STYLE_NAME = 'Title'
def add_child(parent, child):
    """Attach *child* to *parent* as an element or as text.

    Anything exposing a ``qname`` attribute is handed to
    ``parent.addElement``; everything else goes to ``parent.addText``.
    """
    attach = parent.addElement if hasattr(child, "qname") else parent.addText
    attach(child)
def calc_frame_dimensions(image_path, desired_width_cm):
    """Return frame width and height, as ``"<n>cm"`` strings, for an image.

    The image is opened with PIL to read its pixel size. The width is
    *desired_width_cm* and the height is scaled by the same factor, so the
    aspect ratio is kept. Pixel sizes are converted assuming 96 DPI.
    """
    cm_per_inch = 2.54
    dpi = 96.0
    with PILImage.open(image_path) as picture:
        px_width, px_height = picture.size
    width_cm = (px_width / dpi) * cm_per_inch
    height_cm = (px_height / dpi) * cm_per_inch
    scaled_height_cm = height_cm * (desired_width_cm / width_cm)
    return f"{desired_width_cm}cm", f"{scaled_height_cm}cm"
class ImageWrap:
    """An image file plus optional caption, not yet turned into ODF elements."""

    def __init__(self, image_file, caption):
        self.image_file, self.caption = image_file, caption

    def get_elements(self, doc):
        """Embed the picture in *doc* and return the elements that show it.

        The result is ``[frame]``, or ``[frame, caption_paragraph]`` when a
        non-empty caption is set. The frame is 13.5 cm wide and its height
        comes from ``calc_frame_dimensions``.
        """
        picture_ref = doc.addPicture(self.image_file)
        frame_width, frame_height = calc_frame_dimensions(self.image_file, 13.5)
        frame = Frame(width=frame_width, height=frame_height)
        frame.addElement(
            Image(href=picture_ref, type="simple", show="embed", actuate="onLoad")
        )
        if not self.caption:
            return [frame]
        caption_par = P(stylename=CAPTION_STYLE_NAME)
        caption_par.addText(self.caption)
        return [frame, caption_par]
class DocumentCreator:
    """Render an article and its translation side by side in an ODT file.

    The HTML of both articles is parsed with BeautifulSoup and converted to
    odfpy elements. ``create`` places the original in the left column of a
    two-column table and the translation in the right column, row by row.
    """

    # Lower-cased heading texts that open the trailing reference section
    # (sources, further reading). Everything from such a heading up to the
    # next one, or the end of the document, is kept together in one row.
    _ENDING_HEADINGS = frozenset((
        'дополнительное чтение',
        'основные источники',
        'источники',
        'дополнительные материалы',
        'additional reading',
        'further reading',
        'main sources',
        'additional sources',
    ))

    def __init__(self):
        """Create an empty text document and register the styles used later."""
        self.doc = OpenDocumentText()
        self.set_styles()

    @staticmethod
    def _make_style(name, family, *properties):
        """Build a ``Style`` holding *properties*, added in the given order."""
        style = Style(name=name, family=family)
        for props in properties:
            style.addElement(props)
        return style

    @staticmethod
    def _flatten(item):
        """Yield the elements of *item*, descending into nested lists."""
        if isinstance(item, list):
            for sub in item:
                yield from DocumentCreator._flatten(sub)
        else:
            yield item

    def _add_inline(self, parent, inline):
        """Add one result of ``process_inline`` to an inline-capable parent.

        An ``ImageWrap`` is not an ODF node, so it is expanded here and only
        its frame is embedded; the caption paragraph is a block element and
        is not nested inside *parent*.
        """
        if isinstance(inline, ImageWrap):
            parent.addElement(inline.get_elements(self.doc)[0])
        else:
            add_child(parent, inline)

    def set_styles(self):
        """Register the landscape page layout and all named styles."""
        layout = PageLayout(name=PAGE_LAYOUT_NAME)
        layout.addElement(PageLayoutProperties(
            pagewidth="29.7cm",
            pageheight="21.0cm",
            printorientation="landscape",
            margin="1cm",
        ))
        self.doc.automaticstyles.addElement(layout)
        self.doc.masterstyles.addElement(
            MasterPage(name=MASTER_PAGE_NAME, pagelayoutname=PAGE_LAYOUT_NAME)
        )

        bold = {
            'fontweight': "bold",
            'fontweightasian': "bold",
            'fontweightcomplex': "bold",
        }
        italic = {
            'fontstyle': "italic",
            'fontstyleasian': "italic",
            'fontstylecomplex': "italic",
        }
        automatic = self.doc.automaticstyles
        common = self.doc.styles

        automatic.addElement(self._make_style(
            BOLD_STYLE_NAME, "text",
            TextProperties(attributes=dict(bold)),
        ))
        automatic.addElement(self._make_style(
            ITALIC_STYLE_NAME, "text",
            TextProperties(attributes=dict(italic)),
        ))
        automatic.addElement(self._make_style(
            CAPTION_STYLE_NAME, "paragraph",
            TextProperties(attributes={
                **italic,
                'fontsize': '10pt',
                'color': '#777777',
            }),
            ParagraphProperties(
                textalign="center", margintop='0.15cm', marginbottom='0.15cm'
            ),
        ))
        automatic.addElement(self._make_style(
            UNDERLINE_STYLE_NAME, "text",
            TextProperties(attributes={
                'textunderlinestyle': "solid",
                'textunderlinewidth': "auto",
            }),
        ))
        common.addElement(self._make_style(
            BLOCKQUOTE_STYLE_NAME, "paragraph",
            ParagraphProperties(attributes={
                'marginleft': '0.6cm',
                'margintop': '0.15cm',
                'marginbottom': '0.15cm',
            }),
            TextProperties(attributes={'color': '#378A62'}),
        ))
        common.addElement(self._make_style(
            TITLE_STYLE_NAME, "paragraph",
            TextProperties(attributes={'fontsize': '20pt', **bold}),
            ParagraphProperties(textalign='center'),
        ))
        automatic.addElement(self._make_style(
            CENTER_STYLE_NAME, "paragraph",
            ParagraphProperties(textalign="center"),
        ))

    def process_inline(self, node):
        """Convert an inline HTML node into a list of inline items.

        Items are plain strings, odfpy elements, or ``ImageWrap`` objects;
        the caller expands an ``ImageWrap`` into its frame and caption.
        """
        if isinstance(node, str):
            return [node]

        tag = node.name.lower()
        span_styles = {
            'strong': BOLD_STYLE_NAME,
            'b': BOLD_STYLE_NAME,
            'em': ITALIC_STYLE_NAME,
            'i': ITALIC_STYLE_NAME,
            'ins': UNDERLINE_STYLE_NAME,
            'u': UNDERLINE_STYLE_NAME,
        }
        style_name = span_styles.get(tag)
        if style_name is not None:
            span = Span(stylename=style_name)
            for child in node.contents:
                for inline in self.process_inline(child):
                    self._add_inline(span, inline)
            return [span]

        if tag == "code":
            return [Span(stylename="Code", text=node.get_text())]
        if tag == "img":
            return [self.process_img(node)]
        if tag == "br":
            return [LineBreak()]
        if tag == "a":
            href = node.get("href")
            if href:
                return [A(href=href, text=node.get_text())]
            # An anchor without href has no target: fall through and emit
            # its content as ordinary inline items, not as a broken link.

        result = []
        for child in node.contents:
            result.extend(self.process_inline(child))
        return result

    def process_block(self, elem):
        """Convert a block-level HTML tag into ODF content.

        Returns a single odfpy element, or a list of elements for tags that
        expand to several siblings (blockquote, image, paragraph with an
        image).
        """
        heading = self.try_process_heading(elem)
        if heading is not None:
            return heading

        tag = elem.name.lower()
        if tag == "p":
            return self._process_paragraph(elem)
        if tag == "blockquote":
            items = []
            for child in elem.contents:
                text = child.get_text()
                if text.strip() == '':
                    continue
                items.append(P(stylename=BLOCKQUOTE_STYLE_NAME, text=text))
            return items
        if tag in ("ul", "ol"):
            return self._process_list(elem)
        if tag == "pre":
            return P(stylename="Preformatted", text=elem.get_text())
        if tag == "hr":
            return P(stylename=CENTER_STYLE_NAME, text='---')
        if tag == "table":
            return self._process_table(elem)
        if tag == "img":
            return self.process_img(elem).get_elements(self.doc)
        if tag == "br":
            # Block results become direct children of a table cell, where a
            # bare line break is not valid content, so wrap it in a paragraph.
            paragraph = P()
            paragraph.addElement(LineBreak())
            return paragraph

        fallback = P()
        fallback.addText(elem.get_text())
        return fallback

    def _process_paragraph(self, elem):
        """Convert a ``<p>``: a heading, a paragraph, or ``[paragraph, captions...]``."""
        is_centered = False
        for child in elem.contents:
            # A heading wrapped in a paragraph replaces the whole paragraph.
            heading = self.try_process_heading(child)
            if heading is not None:
                return heading
            if child.name and child.name.lower() == "center":
                for grandchild in child.contents:
                    heading = self.try_process_heading(grandchild)
                    if heading is not None:
                        return heading
                is_centered = True
                break

        inlines = [
            inline
            for child in elem.contents
            for inline in self.process_inline(child)
        ]
        # Detect images on the flattened items so that an <img> nested in
        # <center> or another wrapper is found as well.
        has_image = any(isinstance(inline, ImageWrap) for inline in inlines)

        if is_centered or has_image:
            p_elem = P(stylename=CENTER_STYLE_NAME)
        else:
            p_elem = P()

        captions = []
        for inline in inlines:
            if isinstance(inline, ImageWrap):
                frame, *caption = inline.get_elements(self.doc)
                p_elem.addElement(frame)
                captions.extend(caption)
            else:
                add_child(p_elem, inline)

        if has_image:
            # Content following an image stays in the paragraph; the caption
            # paragraphs come after it.
            return [p_elem, *captions]
        return p_elem

    def _process_list(self, elem):
        """Convert ``<ul>``/``<ol>`` into an ODF list, recursing into nested lists."""
        odf_list = List()
        for li in elem.find_all("li", recursive=False):
            li_item = ListItem()
            paragraph = P()
            nested = []
            for child in li.contents:
                if hasattr(child, "name") and child.name in ("ul", "ol"):
                    # Nested lists go after the item's own paragraph.
                    nested.append(child)
                    continue
                for inline in self.process_inline(child):
                    self._add_inline(paragraph, inline)
            li_item.addElement(paragraph)
            for sub_list in nested:
                li_item.addElement(self.process_block(sub_list))
            odf_list.addElement(li_item)
        return odf_list

    def _process_table(self, elem):
        """Convert ``<table>`` into an ODF table; cell HTML is converted recursively."""
        odf_table = Table()
        for tr in elem.find_all("tr"):
            # Rows may sit inside thead/tbody/tfoot, so search recursively,
            # but skip rows of nested tables: those are converted when the
            # cell that contains them is processed.
            if tr.find_parent("table") is not elem:
                continue
            row = TableRow()
            for cell in tr.find_all(["th", "td"], recursive=False):
                cell_elem = TableCell()
                cell_html = "".join(str(child) for child in cell.contents)
                for el in self._flatten(self.html_to_odf_elements(cell_html)):
                    cell_elem.addElement(el)
                row.addElement(cell_elem)
            odf_table.addElement(row)
        return odf_table

    def try_process_heading(self, elem):
        """Return an ODF heading for ``h1``..``h6`` nodes, otherwise ``None``."""
        if not elem.name:
            return None
        tag = elem.name.lower()
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            return H(outlinelevel=tag[1], text=elem.get_text())
        return None

    def process_img(self, elem) -> "ImageWrap":
        """Resolve an ``<img>`` to its saved file and wrap it with its alt text.

        Raises:
            ValueError: if the tag has no ``src`` or the saved file is missing.
        """
        href = elem.get("src")
        if not href:
            raise ValueError('image element has no src attribute')
        saved_file = os.path.join(
            os.path.dirname(__file__), '..', 'images', image_url_to_filename(href)
        )
        if not os.path.exists(saved_file):
            raise ValueError(f'image {saved_file} not found')
        return ImageWrap(saved_file, elem.get('alt'))

    def html_to_odf_elements(self, html):
        """Convert an HTML fragment into a list of table-row contents.

        Each item is an odfpy element or a flat list of elements that belong
        in the same cell. Once an h1-h4 heading from ``_ENDING_HEADINGS`` is
        seen, all following content is grouped into one list until the next
        such heading or the end of the input.
        """
        soup = BeautifulSoup(html, "html.parser")
        top_nodes = soup.body.contents if soup.body else soup.contents

        elements = []
        keeping = False
        stack = []
        for node in top_nodes:
            if isinstance(node, str):
                if not node.strip():
                    continue
                result = P()
                result.addText(node)
            elif node.name:
                is_ending_heading = (
                    node.name.lower() in ("h1", "h2", "h3", "h4")
                    and node.get_text().strip().lower() in self._ENDING_HEADINGS
                )
                if is_ending_heading:
                    if stack:
                        elements.append(stack)
                        stack = []
                    keeping = True
                result = self.process_block(node)
            else:
                continue

            if keeping:
                # Keep the group flat so every member can be added to a cell.
                stack.extend(self._flatten(result))
            else:
                elements.append(result)

        if stack:
            elements.append(stack)
        return elements

    def create(self,
               orig: "Article",
               trans: "Article",
               output_odt: str,
               with_title=False):
        """Write the side-by-side comparison document to *output_odt*.

        Args:
            orig: article whose ``html`` (and ``title``) fill the left column.
            trans: article whose ``html`` (and ``title``) fill the right column.
            output_odt: path passed to the document's ``save``.
            with_title: when true, add a first row holding both titles.
        """
        orig_elements = self.html_to_odf_elements(orig.html)
        trans_elements = self.html_to_odf_elements(trans.html)

        # Pad the shorter side with empty paragraphs so rows pair up.
        max_len = max(len(orig_elements), len(trans_elements))
        orig_elements.extend(P() for _ in range(max_len - len(orig_elements)))
        trans_elements.extend(P() for _ in range(max_len - len(trans_elements)))

        main_table = Table(name="ComparisonTable")
        main_table.addElement(TableColumn())
        main_table.addElement(TableColumn())

        if with_title:
            header_row = TableRow()
            for title in (orig.title, trans.title):
                header_cell = TableCell()
                header_cell.addElement(P(stylename=TITLE_STYLE_NAME, text=title))
                header_row.addElement(header_cell)
            main_table.addElement(header_row)

        for left, right in zip(orig_elements, trans_elements):
            row = TableRow()
            for content in (left, right):
                cell = TableCell()
                for elem in self._flatten(content):
                    cell.addElement(elem)
                row.addElement(cell)
            main_table.addElement(row)

        self.doc.text.addElement(main_table)
        self.doc.save(output_odt)