many changes

This commit is contained in:
E. S 2025-05-17 04:28:55 +03:00
parent 7f4b460c96
commit de9084dfd5
10 changed files with 359 additions and 54 deletions

View File

@ -1,24 +0,0 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
if __name__ == '__main__':
    # Build a single two-language ODT for the hard-coded article `name`:
    # read the "<name>_ru" / "<name>_en" markdown files that live next to this
    # script, make sure every image referenced by the Russian text is present
    # in ./images, then write "<name>.odt" alongside the script.
    name = 'cartier3'
    script_dir = os.path.dirname(__file__)

    orig_path = os.path.join(script_dir, f'{name}_ru')
    trans_path = os.path.join(script_dir, f'{name}_en')
    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    for image_url in extract_images_from_markdown(read_file(orig_path)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(script_dir, 'images', image_name)
        if os.path.exists(output_file):
            # Already downloaded on a previous run.
            continue
        download_file(image_url, output_file)
        print(f'{image_name} saved')

    creator = DocumentCreator()
    creator.create(orig, trans, os.path.join(script_dir, f'{name}.odt'))

View File

@ -1,24 +1,4 @@
from .wordpress import Article, fetch_article
from .translator import translate_markdown
from .doc import DocumentCreator
# kniganews.org post URLs whose slugs start with "beyond-clouds-", listed in
# ascending order of the date embedded in each URL path.
# NOTE(review): the `from .tzo import tzo_urls, ...` line that follows rebinds
# this name — confirm which definition is meant to survive.
tzo_urls = (
'https://kniganews.org/2012/12/20/beyond-clouds-1/',
'https://kniganews.org/2012/12/21/beyond-clouds-21/',
'https://kniganews.org/2012/12/22/beyond-clouds-22/',
'https://kniganews.org/2012/12/23/beyond-clouds-31/',
'https://kniganews.org/2012/12/24/beyond-clouds-32/',
'https://kniganews.org/2012/12/25/beyond-clouds-33/',
'https://kniganews.org/2012/12/28/beyond-clouds-41/',
'https://kniganews.org/2012/12/29/beyond-clouds-42/',
'https://kniganews.org/2012/12/30/beyond-clouds-43/',
'https://kniganews.org/2013/01/01/beyond-clouds-44/',
'https://kniganews.org/2013/01/06/beyond-clouds-51/',
'https://kniganews.org/2013/01/07/beyond-clouds-52/',
'https://kniganews.org/2013/02/16/beyond-clouds-53/',
'https://kniganews.org/2013/03/25/beyond-clouds-61/',
'https://kniganews.org/2013/05/10/beyond-clouds-62/',
'https://kniganews.org/2013/06/17/beyond-clouds-731/',
'https://kniganews.org/2013/08/07/beyond-clouds-732/',
'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)
from .tzo import tzo_urls, after_tzo_urls

View File

@ -1,12 +1,14 @@
import os.path
import zipfile
from odf.opendocument import OpenDocumentText
from odf.opendocument import OpenDocumentText, load
from odf.text import P, H, Span, A, LineBreak, List, ListItem
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
from odf.table import TableColumn, TableCell, TableRow, Table
from odf.draw import Frame, Image
from PIL import Image as PILImage
from io import BytesIO
from bs4 import BeautifulSoup
from idb import Article
@ -422,3 +424,180 @@ class DocumentCreator:
self.doc.text.addElement(main_table)
self.doc.save(output_odt)
class DocumentReader:
    """Read an ODT file and render cells of its comparison table as Markdown.

    The file is loaded with odfpy for the element tree and is additionally
    opened as a zip archive so that embedded files (such as pictures) can be
    read by name. The archive handle stays open for the lifetime of the
    reader; call :meth:`close` or use the reader as a context manager to
    release it.
    """

    # Attributes of loaded elements are keyed by (namespace URI, local name).
    _TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
    _FO_NS = 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'
    _STYLE_NS = 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'
    _XLINK_NS = 'http://www.w3.org/1999/xlink'

    def __init__(self, input_file):
        """Load ``input_file`` and index its paragraph and text styles."""
        self.doc = load(input_file)
        self.package = zipfile.ZipFile(input_file)
        # Style name -> text alignment, for styles with family="paragraph".
        self.style_alignments = self.build_style_alignments(self.doc)
        # Style name -> bold/italic/underline flags, for family="text".
        self.text_style_formats = self.build_text_styles(self.doc)

    def close(self):
        """Close the zip archive opened by the constructor."""
        self.package.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _children_markdown(self, node, indent):
        """Concatenate the Markdown of every child of ``node``."""
        return ''.join(self.parse_node(child, indent) for child in node.childNodes)

    @staticmethod
    def _int_attribute(node, key, default):
        """Return attribute ``key`` of ``node`` as an int, or ``default``."""
        try:
            return int(node.attributes.get(key, default))
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _apply_text_format(content, fmt):
        """Wrap ``content`` in the Markdown/HTML markers described by ``fmt``.

        Leading and trailing whitespace is kept outside the markers, and
        whitespace-only content is returned untouched, because emphasis
        delimiters adjacent to whitespace are not recognised as emphasis.
        """
        bold, italic = fmt.get('bold'), fmt.get('italic')
        underline = fmt.get('underline')
        if not (bold or italic or underline):
            return content
        core = content.strip()
        if not core:
            return content
        leading = content[:len(content) - len(content.lstrip())]
        trailing = content[len(content.rstrip()):]

        if bold and italic:
            core = f"***{core}***"
        elif bold:
            core = f"**{core}**"
        elif italic:
            core = f"*{core}*"
        if underline:
            core = f"<u>{core}</u>"
        return leading + core + trailing

    def parse_node(self, node, indent=0):
        """Recursively convert an ODF node (or plain string) to Markdown.

        :param node: a string, a text node, or an element exposing
            ``tagName``, ``attributes`` and ``childNodes``.
        :param indent: current list nesting depth; it controls the leading
            spaces emitted for list items.
        :return: the Markdown text for ``node`` and its descendants.
        """
        if isinstance(node, str):
            return node
        try:
            if node.nodeType == node.TEXT_NODE:
                return node.data
        except AttributeError:
            # Not a DOM-style text node; fall through to tag dispatch.
            pass

        tag = node.tagName

        if tag == "text:h":
            # Look the level up by (namespace, local name) like every other
            # attribute access in this class; a plain "text:outline-level"
            # string key never matches, which flattened all headings to "#".
            level = self._int_attribute(node, (self._TEXT_NS, 'outline-level'), 1)
            level = min(max(level, 1), 6)
            content = self._children_markdown(node, indent)
            return f'{"#" * level} {content}\n\n'

        if tag == 'text:p':
            style = node.getAttribute('stylename')
            content = self._children_markdown(node, indent)
            if style:
                style_align = self.style_alignments.get(style, '')
                if style_align == 'center' or style.lower() == 'center':
                    return f"<center>{content}</center>\n\n"
                if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
                    content = "\n".join("> " + line for line in content.splitlines())
            return content + "\n\n"

        if tag == "text:list":
            return self._children_markdown(node, indent) + "\n"

        if tag == "text:list-item":
            lines = self._children_markdown(node, indent + 1).splitlines()
            if not lines:
                return ""
            # First line carries the bullet; the rest are indented under it.
            rendered = [" " * indent + "- " + lines[0]]
            rendered.extend(" " * (indent + 1) + line for line in lines[1:])
            return "\n".join(rendered) + "\n"

        if tag == "text:span":
            style_name = node.getAttribute("stylename") or ""
            content = self._children_markdown(node, indent)
            fmt = self.text_style_formats.get(style_name, {})
            return self._apply_text_format(content, fmt)

        if tag == "text:a":
            href = node.getAttribute("href")
            content = self._children_markdown(node, indent)
            return f"[{content}]({href})" if href else content

        if tag == "text:line-break":
            # A Markdown hard break needs at least two trailing spaces.
            return "  \n"

        if tag == "text:s":
            # ODF stores runs of spaces as <text:s text:c="N"/>; N defaults to 1.
            return " " * self._int_attribute(node, (self._TEXT_NS, 'c'), 1)

        if tag == "text:tab":
            return "\t"

        if tag == "draw:frame":
            md = ""
            caption_text = ""
            for child in node.childNodes:
                child_tag = getattr(child, "tagName", None)
                if child_tag == "draw:image":
                    href = child.attributes.get((self._XLINK_NS, 'href'))
                    if href:  # skip images without a link instead of emitting "![](None)"
                        md += f"![]({href})"
                elif child_tag == "draw:caption":
                    caption_text = self._children_markdown(child, indent).strip()
            if caption_text:
                md += "\n" + caption_text + "\n"
            return md

        # Unknown element: keep whatever text its children produce.
        return self._children_markdown(node, indent)

    def get_embedded_image_size(self, file_name) -> tuple[int, int]:
        """Return ``(width, height)`` of the archive member ``file_name``.

        :raises KeyError: if the archive has no member with that name.
        """
        data = self.package.read(file_name)
        with PILImage.open(BytesIO(data)) as img:
            return img.size

    def get_markdown(self, column=1) -> str:
        """Render one column of the table named ``ComparisonTable``.

        :param column: zero-based index of the cell to take from each row.
        :return: the Markdown of the selected cells, one block per row,
            separated by blank lines. Rows with fewer than two cells are
            skipped.
        :raises RuntimeError: if the document has no ``ComparisonTable``.
        :raises IndexError: if ``column`` does not exist in a processed row.
        """
        comp_table = next(
            (tbl for tbl in self.doc.getElementsByType(Table)
             if tbl.getAttribute("name") == "ComparisonTable"),
            None,
        )
        if comp_table is None:
            raise RuntimeError("ComparisonTable not found in the document.")

        md_lines = []
        for row in comp_table.getElementsByType(TableRow):
            cells = row.getElementsByType(TableCell)
            if len(cells) < 2:
                continue
            try:
                cell = cells[column]
            except IndexError:
                raise IndexError(
                    f"column {column} is out of range for a row with {len(cells)} cells"
                ) from None
            cell_md = ''.join(self.parse_node(child) for child in cell.childNodes)
            # Remove any extra whitespace.
            md_lines.append(cell_md.strip())
        return "\n\n".join(md_lines)

    @staticmethod
    def build_style_alignments(doc):
        """Map paragraph style names to their lower-cased ``fo:text-align``.

        Both automatic and named styles are scanned; styles without a
        text-align value are left out of the result.
        """
        key = (DocumentReader._FO_NS, 'text-align')
        alignments = {}
        for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
            if style.getAttribute('family') != 'paragraph':
                continue
            para_props = style.getElementsByType(ParagraphProperties)
            if not para_props:
                continue
            attr_val = para_props[0].attributes.get(key, '')
            if attr_val:
                alignments[style.getAttribute('name')] = attr_val.lower().strip()
        return alignments

    @staticmethod
    def build_text_styles(doc):
        """Map text style names to ``{'bold', 'italic', 'underline'}`` flags.

        Both automatic and named styles are scanned; styles without text
        properties are left out of the result.
        """
        weight_key = (DocumentReader._FO_NS, 'font-weight')
        slant_key = (DocumentReader._FO_NS, 'font-style')
        underline_key = (DocumentReader._STYLE_NS, 'text-underline-style')

        text_styles = {}
        for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
            if style.getAttribute('family') != 'text':
                continue
            text_props = style.getElementsByType(TextProperties)
            if not text_props:
                continue
            props = text_props[0].attributes
            fw = props.get(weight_key, '').lower()
            fs = props.get(slant_key, '').lower()
            tu = props.get(underline_key, '').lower()
            text_styles[style.getAttribute('name')] = {
                'bold': 'bold' in fw or fw in ('700', '800', '900'),
                'italic': 'italic' in fs,
                'underline': bool(tu) and tu != 'none',
            }
        return text_styles

View File

@ -10,7 +10,7 @@ Translate the following text from Russian to English while strictly preserving t
Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
# NOTE(review): two consecutive assignments — as written, the 3500 value is
# immediately overridden by 5000. Presumably this is a cap on input tokens for
# the translation code; confirm against translate_markdown.
input_token_limit = 3500
input_token_limit = 5000
def translate_markdown(text):

78
idb/tzo.py Normal file
View File

@ -0,0 +1,78 @@
import os
import re
from PIL import Image
from collections import namedtuple
from .util import image_url_to_filename
# kniganews.org post URLs whose slugs start with "beyond-clouds-", listed in
# ascending order of the date embedded in each URL path.
tzo_urls = (
'https://kniganews.org/2012/12/20/beyond-clouds-1/',
'https://kniganews.org/2012/12/21/beyond-clouds-21/',
'https://kniganews.org/2012/12/22/beyond-clouds-22/',
'https://kniganews.org/2012/12/23/beyond-clouds-31/',
'https://kniganews.org/2012/12/24/beyond-clouds-32/',
'https://kniganews.org/2012/12/25/beyond-clouds-33/',
'https://kniganews.org/2012/12/28/beyond-clouds-41/',
'https://kniganews.org/2012/12/29/beyond-clouds-42/',
'https://kniganews.org/2012/12/30/beyond-clouds-43/',
'https://kniganews.org/2013/01/01/beyond-clouds-44/',
'https://kniganews.org/2013/01/06/beyond-clouds-51/',
'https://kniganews.org/2013/01/07/beyond-clouds-52/',
'https://kniganews.org/2013/02/16/beyond-clouds-53/',
'https://kniganews.org/2013/03/25/beyond-clouds-61/',
'https://kniganews.org/2013/05/10/beyond-clouds-62/',
'https://kniganews.org/2013/06/17/beyond-clouds-731/',
'https://kniganews.org/2013/08/07/beyond-clouds-732/',
'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)
# Separate one-element URL set; the scripts that accept an --after flag
# iterate over this tuple instead of tzo_urls when the flag is given.
after_tzo_urls = (
'https://kniganews.org/2012/11/17/langlands-plus/',
)
# One locally stored image: its source URL, the derived local file name, the
# resolved local path, and its pixel dimensions.
ImageInfo = namedtuple('ImageInfo', ('url', 'local_name', 'local_path', 'width', 'height'))


class ImageList:
    """Collection of locally stored images that can be looked up by size."""

    images: list[ImageInfo]

    def __init__(self):
        self.images = []

    def add_image(self, url):
        """Record the image for ``url`` together with its pixel size.

        The local file name is derived with ``image_url_to_filename`` and is
        resolved inside the ``images`` directory one level above this module.
        The file is opened only long enough to read its dimensions, so no
        file handle stays open per registered image. Errors raised while
        opening the file (for example when it does not exist) propagate to
        the caller.
        """
        local_name = image_url_to_filename(url)
        local_path = os.path.realpath(os.path.join(
            os.path.dirname(__file__),
            '..',
            'images',
            local_name
        ))
        with Image.open(local_path) as image:
            width, height = image.size
        self.images.append(ImageInfo(url, local_name, local_path, width, height))

    def get_images_by_size(self, w, h) -> list[ImageInfo]:
        """Return every recorded image whose size is exactly ``w`` x ``h``."""
        return [image for image in self.images if image.width == w and image.height == h]
def get_part_by_odt_name(name: str) -> int:
    """Extract the part number from an ODT file name.

    Accepted names look like ``beyond-clouds-<digits>.odt``, optionally with
    a ``v<digits>`` suffix right after the part number (for example
    ``beyond-clouds-731v2.odt``); the suffix is ignored.

    :param name: bare file name, without any directory component.
    :return: the part number as an integer.
    :raises ValueError: if ``name`` does not follow the expected pattern.
    """
    m = re.match(r'^beyond-clouds-(\d+)(?:v\d+)?\.odt$', name)
    if not m:
        raise ValueError(f'could not parse file name: {name!r}')
    # The group is guaranteed to consist of digits by the pattern itself, so
    # no further numeric check is needed before converting.
    return int(m.group(1))
def part_image_list(part) -> ImageList:
    """Build an :class:`ImageList` from the images referenced by one part.

    Reads ``tzo/beyond-clouds-<part>-ru.txt`` (relative to the directory above
    this module), collects the target of every Markdown image reference
    ``![...](...)`` and registers each one, in order of appearance.

    :param part: part identifier interpolated into the file name.
    :return: the populated image list.
    """
    file = os.path.realpath(os.path.join(
        os.path.dirname(__file__),
        '..',
        'tzo',
        f'beyond-clouds-{part}-ru.txt',
    ))
    # Decode explicitly rather than with the platform default, which would
    # break on non-ASCII text under non-UTF-8 locales.
    # NOTE(review): assumes the text files are stored as UTF-8 — confirm.
    with open(file, encoding='utf-8') as f:
        txt = f.read()

    images = ImageList()
    for url in re.findall(r'!\[.*?]\((.*?)\)', txt):
        images.add_image(url)
    return images

View File

@ -21,12 +21,12 @@ def image_url_to_filename(url):
parsed_url = urlparse(url)
filename = os.path.basename(parsed_url.path)
name, ext = os.path.splitext(filename)
date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path)
date_match = re.search(r'(\d{4})/(\d{2})', parsed_url.path)
if not date_match:
raise ValueError("no valid date found in URL")
year = date_match.group(1)
day = date_match.group(3) if date_match.group(3) else "01"
return f"{year}{day}_{name}{ext}"
month = date_match.group(2)
return f"{year}{month}_{name}{ext}"
def extract_images_from_markdown(markdown_text):

51
odt_to_md.py Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python3
import re
from argparse import ArgumentParser
from os.path import basename
from idb import tzo
from idb.doc import DocumentReader
from idb.tzo import get_part_by_odt_name
def tzo_replace_images(md: str,
                       tzo_part: int,
                       dr: DocumentReader):
    """Swap embedded-image references in ``md`` for their original URLs.

    For every Markdown image in ``md`` the embedded file's pixel size is read
    through ``dr`` and matched against the images listed for ``tzo_part``.
    The reference is replaced by one ``![](url)`` (keeping the title, if any)
    per image of that exact size: several matches are all emitted back to
    back, and no match replaces the reference with an empty string.

    :param md: Markdown text containing image references.
    :param tzo_part: part number used to load the candidate images.
    :param dr: reader used to measure the embedded images.
    :return: the Markdown text with the image references rewritten.
    """
    candidates = tzo.part_image_list(tzo_part)
    image_ref = re.compile(r'!\[(.*?)]\((\S+?)(?:\s+"(.*?)")?\)')

    def _render(image, title) -> str:
        return f'![]({image.url} "{title}")' if title else f'![]({image.url})'

    def _substitute(match: re.Match) -> str:
        _alt, embedded_path, title = match.groups()
        width, height = dr.get_embedded_image_size(embedded_path)
        same_size = candidates.get_images_by_size(width, height)
        return ''.join(_render(image, title) for image in same_size)

    return image_ref.sub(_substitute, md)
def main():
    """Print the Markdown of one comparison-table column of an ODT file.

    Command-line options: ``-i/--input`` (required) is the ODT file,
    ``-c/--column`` selects the table column (default 1), and ``--tzo``
    additionally rewrites image references via ``tzo_replace_images``, using
    the part number parsed from the input file's base name.
    """
    cli = ArgumentParser()
    cli.add_argument('-i', '--input', required=True, type=str, help='Input file')
    cli.add_argument('-c', '--column', default=1, type=int, help='Column number')
    cli.add_argument('--tzo', action='store_true', help='TZO')
    options = cli.parse_args()

    reader = DocumentReader(options.input)
    markdown = reader.get_markdown(options.column)
    if options.tzo:
        part = get_part_by_odt_name(basename(options.input))
        markdown = tzo_replace_images(markdown, part, reader)
    print(markdown)


if __name__ == '__main__':
    main()

27
single_odt.py Executable file
View File

@ -0,0 +1,27 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
from argparse import ArgumentParser
if __name__ == '__main__':
    # Build one two-language ODT from a pair of markdown files given on the
    # command line. Images referenced by the Russian file are downloaded into
    # the "images" directory next to this script unless already present.
    cli = ArgumentParser()
    cli.add_argument('--ru-file', type=str, required=True, help='russian input file')
    cli.add_argument('--en-file', type=str, required=True, help='english input file')
    cli.add_argument('--output', type=str, required=True, help='output ODT file')
    args = cli.parse_args()

    orig = Article.from_markdown_file(args.ru_file, with_title=False)
    trans = Article.from_markdown_file(args.en_file, with_title=False)

    images_dir = os.path.join(os.path.dirname(__file__), 'images')
    for image_url in extract_images_from_markdown(read_file(args.ru_file)):
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(images_dir, image_name)
        if os.path.exists(output_file):
            # Already downloaded on a previous run.
            continue
        download_file(image_url, output_file)
        print(f'{image_name} saved')

    creator = DocumentCreator()
    creator.create(orig, trans, args.output)

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import os.path
from idb import tzo_urls
from argparse import ArgumentParser
from idb import tzo_urls, after_tzo_urls
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
from dotenv import load_dotenv
@ -8,7 +9,13 @@ load_dotenv()
if __name__ == '__main__':
for url in tzo_urls:
parser = ArgumentParser()
parser.add_argument('--after', action='store_true')
args = parser.parse_args()
urls = tzo_urls if not args.after else after_tzo_urls
for url in urls:
name = name_from_url(url)
markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
image_urls = extract_images_from_markdown(read_file(markdown_file))

View File

@ -1,11 +1,18 @@
#!/usr/bin/env python3
import os.path
from idb import Article, DocumentCreator, tzo_urls
from argparse import ArgumentParser
from idb import Article, DocumentCreator, tzo_urls, after_tzo_urls
from idb.util import name_from_url
if __name__ == '__main__':
for url in tzo_urls:
parser = ArgumentParser()
parser.add_argument('--after', action='store_true')
args = parser.parse_args()
urls = tzo_urls if not args.after else after_tzo_urls
for url in urls:
name = name_from_url(url)
orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)