many changes
This commit is contained in:
parent
7f4b460c96
commit
de9084dfd5
@ -1,24 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import os.path
|
||||
from idb import Article, DocumentCreator
|
||||
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Build "<name>.odt" next to this script from the "<name>_ru" (original)
    # and "<name>_en" (translation) Markdown files, downloading any images
    # referenced by the original that are not yet present in ./images.
    name = 'cartier3'
    here = os.path.dirname(__file__)
    orig_path = os.path.join(here, f'{name}_ru')
    trans_path = os.path.join(here, f'{name}_en')

    orig = Article.from_markdown_file(orig_path, with_title=False)
    trans = Article.from_markdown_file(trans_path, with_title=False)

    image_urls = extract_images_from_markdown(read_file(orig_path))
    for image_url in image_urls:
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(here, 'images', image_name)
        if os.path.exists(output_file):
            # Already downloaded on a previous run.
            continue
        download_file(image_url, output_file)
        print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, os.path.join(here, f'{name}.odt'))
|
@ -1,24 +1,4 @@
|
||||
from .wordpress import Article, fetch_article
|
||||
from .translator import translate_markdown
|
||||
from .doc import DocumentCreator
|
||||
|
||||
tzo_urls = (
|
||||
'https://kniganews.org/2012/12/20/beyond-clouds-1/',
|
||||
'https://kniganews.org/2012/12/21/beyond-clouds-21/',
|
||||
'https://kniganews.org/2012/12/22/beyond-clouds-22/',
|
||||
'https://kniganews.org/2012/12/23/beyond-clouds-31/',
|
||||
'https://kniganews.org/2012/12/24/beyond-clouds-32/',
|
||||
'https://kniganews.org/2012/12/25/beyond-clouds-33/',
|
||||
'https://kniganews.org/2012/12/28/beyond-clouds-41/',
|
||||
'https://kniganews.org/2012/12/29/beyond-clouds-42/',
|
||||
'https://kniganews.org/2012/12/30/beyond-clouds-43/',
|
||||
'https://kniganews.org/2013/01/01/beyond-clouds-44/',
|
||||
'https://kniganews.org/2013/01/06/beyond-clouds-51/',
|
||||
'https://kniganews.org/2013/01/07/beyond-clouds-52/',
|
||||
'https://kniganews.org/2013/02/16/beyond-clouds-53/',
|
||||
'https://kniganews.org/2013/03/25/beyond-clouds-61/',
|
||||
'https://kniganews.org/2013/05/10/beyond-clouds-62/',
|
||||
'https://kniganews.org/2013/06/17/beyond-clouds-731/',
|
||||
'https://kniganews.org/2013/08/07/beyond-clouds-732/',
|
||||
'https://kniganews.org/2013/09/17/beyond-clouds-73/'
|
||||
)
|
||||
from .tzo import tzo_urls, after_tzo_urls
|
181
idb/doc.py
181
idb/doc.py
@ -1,12 +1,14 @@
|
||||
import os.path
|
||||
import zipfile
|
||||
|
||||
from odf.opendocument import OpenDocumentText
|
||||
from odf.opendocument import OpenDocumentText, load
|
||||
from odf.text import P, H, Span, A, LineBreak, List, ListItem
|
||||
from odf.style import Style, TextProperties, ParagraphProperties, PageLayout, PageLayoutProperties, MasterPage
|
||||
from odf.table import TableColumn, TableCell, TableRow, Table
|
||||
from odf.draw import Frame, Image
|
||||
|
||||
from PIL import Image as PILImage
|
||||
from io import BytesIO
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from idb import Article
|
||||
@ -422,3 +424,180 @@ class DocumentCreator:
|
||||
|
||||
self.doc.text.addElement(main_table)
|
||||
self.doc.save(output_odt)
|
||||
|
||||
|
||||
class DocumentReader:
|
||||
def __init__(self, input_file):
|
||||
self.doc = load(input_file)
|
||||
self.package = zipfile.ZipFile(input_file)
|
||||
|
||||
self.style_alignments = self.build_style_alignments(self.doc) # For paragraph alignment (family="paragraph")
|
||||
self.text_style_formats = self.build_text_styles(self.doc) # For text formatting (family="text")
|
||||
|
||||
def parse_node(self, node, indent=0):
|
||||
if isinstance(node, str):
|
||||
return node
|
||||
try:
|
||||
if node.nodeType == node.TEXT_NODE:
|
||||
return node.data
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
tag = node.tagName
|
||||
if tag == "text:h":
|
||||
level_str = node.attributes.get("text:outline-level", "1")
|
||||
try:
|
||||
level = int(level_str)
|
||||
except ValueError:
|
||||
level = 1
|
||||
if level > 6:
|
||||
level = 6
|
||||
content = ''.join([self.parse_node(child, indent) for child in node.childNodes])
|
||||
return f'{"#" * level} {content}\n\n'
|
||||
|
||||
if tag == 'text:p':
|
||||
style = node.getAttribute('stylename')
|
||||
content = ''.join([self.parse_node(child, indent) for child in node.childNodes])
|
||||
|
||||
if style:
|
||||
style_align = self.style_alignments.get(style, '')
|
||||
if style_align == 'center' or style.lower() == 'center':
|
||||
return f"<center>{content}</center>\n\n"
|
||||
|
||||
if style in ('Block Quotation', 'Quotations') or style.endswith('Quotation'):
|
||||
lines = content.splitlines()
|
||||
content = "\n".join(["> " + line for line in lines])
|
||||
|
||||
return content + "\n\n"
|
||||
|
||||
elif tag == "text:list":
|
||||
md = ""
|
||||
for child in node.childNodes:
|
||||
md += self.parse_node(child, indent)
|
||||
return md + "\n"
|
||||
|
||||
elif tag == "text:list-item":
|
||||
item_text = ""
|
||||
for child in node.childNodes:
|
||||
item_text += self.parse_node(child, indent + 1)
|
||||
lines = item_text.splitlines()
|
||||
if lines:
|
||||
prefix = " " * indent + "- "
|
||||
new_lines = [prefix + lines[0]]
|
||||
for line in lines[1:]:
|
||||
new_lines.append(" " * (indent + 1) + line)
|
||||
return "\n".join(new_lines) + "\n"
|
||||
return ""
|
||||
|
||||
elif tag == "text:span":
|
||||
style_name = node.getAttribute("stylename") or ""
|
||||
content = ''.join([self.parse_node(child, indent) for child in node.childNodes])
|
||||
fmt = self.text_style_formats.get(style_name, {})
|
||||
md_text = content
|
||||
if fmt.get("bold") and fmt.get("italic"):
|
||||
md_text = f"***{md_text}***"
|
||||
else:
|
||||
if fmt.get("bold"):
|
||||
md_text = f"**{md_text}**"
|
||||
if fmt.get("italic"):
|
||||
md_text = f"*{md_text}*"
|
||||
if fmt.get("underline"):
|
||||
md_text = f"<u>{md_text}</u>"
|
||||
return md_text
|
||||
|
||||
elif tag == "text:a":
|
||||
href = node.getAttribute("href")
|
||||
content = ''.join([self.parse_node(child, indent) for child in node.childNodes])
|
||||
if href:
|
||||
return f"[{content}]({href})"
|
||||
return content
|
||||
|
||||
elif tag == "text:line-break":
|
||||
return " \n"
|
||||
|
||||
elif tag == "draw:frame":
|
||||
md = ""
|
||||
caption_text = ""
|
||||
for child in node.childNodes:
|
||||
if hasattr(child, "tagName"):
|
||||
if child.tagName == "draw:image":
|
||||
href = child.attributes.get(('http://www.w3.org/1999/xlink', 'href'))
|
||||
md += f""
|
||||
elif child.tagName == "draw:caption":
|
||||
caption_text = ''.join([self.parse_node(c, indent) for c in child.childNodes]).strip()
|
||||
if caption_text:
|
||||
md += "\n" + caption_text + "\n"
|
||||
return md
|
||||
|
||||
else:
|
||||
return ''.join([self.parse_node(child, indent) for child in node.childNodes])
|
||||
|
||||
def get_embedded_image_size(self, file_name) -> tuple[int, int]:
|
||||
data = self.package.read(file_name)
|
||||
img = PILImage.open(BytesIO(data))
|
||||
return img.size
|
||||
|
||||
def get_markdown(self, column=1) -> str:
|
||||
tables = self.doc.getElementsByType(Table)
|
||||
comp_table = None
|
||||
for tbl in tables:
|
||||
if tbl.getAttribute("name") == "ComparisonTable":
|
||||
comp_table = tbl
|
||||
break
|
||||
if not comp_table:
|
||||
raise RuntimeError("ComparisonTable not found in the document.")
|
||||
|
||||
md_lines = []
|
||||
rows = comp_table.getElementsByType(TableRow)
|
||||
for row in rows:
|
||||
cells = row.getElementsByType(TableCell)
|
||||
if len(cells) >= 2:
|
||||
right_cell = cells[column]
|
||||
cell_md = ""
|
||||
for child in right_cell.childNodes:
|
||||
cell_md += self.parse_node(child)
|
||||
# Remove any extra whitespace.
|
||||
md_lines.append(cell_md.strip())
|
||||
|
||||
return "\n\n".join(md_lines)
|
||||
|
||||
@staticmethod
|
||||
def build_style_alignments(doc):
|
||||
alignments = {}
|
||||
for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
|
||||
if style.getAttribute('family') == 'paragraph':
|
||||
style_name = style.getAttribute('name')
|
||||
para_props = style.getElementsByType(ParagraphProperties)
|
||||
if para_props:
|
||||
attr_val = para_props[0].attributes.get(
|
||||
('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'text-align'), '')
|
||||
if attr_val:
|
||||
alignments[style_name] = attr_val.lower().strip()
|
||||
return alignments
|
||||
|
||||
@staticmethod
|
||||
def build_text_styles(doc):
|
||||
text_styles = {}
|
||||
for style in [*doc.automaticstyles.getElementsByType(Style), *doc.styles.getElementsByType(Style)]:
|
||||
if style.getAttribute('family') == 'text':
|
||||
style_name = style.getAttribute('name')
|
||||
text_props = style.getElementsByType(TextProperties)
|
||||
if text_props:
|
||||
props = text_props[0].attributes
|
||||
bold = False
|
||||
italic = False
|
||||
underline = False
|
||||
fw = props.get(('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'font-weight'),
|
||||
'').lower()
|
||||
if 'bold' in fw or fw in ('700', '800', '900'):
|
||||
bold = True
|
||||
fs = props.get(('urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0', 'font-style'),
|
||||
'').lower()
|
||||
if 'italic' in fs:
|
||||
italic = True
|
||||
tu = props.get(('urn:oasis:names:tc:opendocument:xmlns:style:1.0', 'text-underline-style'),
|
||||
'').lower()
|
||||
if tu and tu != 'none':
|
||||
underline = True
|
||||
text_styles[style_name] = {'bold': bold, 'italic': italic, 'underline': underline}
|
||||
return text_styles
|
||||
|
@ -10,7 +10,7 @@ Translate the following text from Russian to English while strictly preserving t
|
||||
|
||||
Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
|
||||
"""
|
||||
input_token_limit = 3500
|
||||
input_token_limit = 5000
|
||||
|
||||
|
||||
def translate_markdown(text):
|
||||
|
78
idb/tzo.py
Normal file
78
idb/tzo.py
Normal file
@ -0,0 +1,78 @@
|
||||
import os
|
||||
import re
|
||||
from PIL import Image
|
||||
from collections import namedtuple
|
||||
from .util import image_url_to_filename
|
||||
|
||||
# Article URLs of the "beyond-clouds" series, in the order they are listed.
tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/',
)

# Article URLs kept in a separate list from the series above.
after_tzo_urls = (
    'https://kniganews.org/2012/11/17/langlands-plus/',
)
|
||||
# One locally stored image: its source URL, local file name and path, and
# its pixel dimensions.
ImageInfo = namedtuple('ImageInfo', ('url', 'local_name', 'local_path', 'width', 'height'))


class ImageList:
    """Collection of locally stored images that can be looked up by size."""

    images: list[ImageInfo]

    def __init__(self):
        self.images = []

    def add_image(self, url):
        """Register the local copy of the image at *url*.

        The file is expected at ``../images/<local name>`` relative to this
        module, where the local name comes from ``image_url_to_filename``.
        The file is opened only to read its dimensions and closed again.
        """
        local_name = image_url_to_filename(url)
        local_path = os.path.realpath(os.path.join(
            os.path.dirname(__file__),
            '..',
            'images',
            local_name
        ))
        # The context manager releases the file handle right away instead of
        # keeping one open per registered image.
        with Image.open(local_path) as image:
            width, height = image.size
        self.images.append(ImageInfo(url, local_name, local_path, width, height))

    def get_images_by_size(self, w, h) -> list[ImageInfo]:
        """Return all registered images that are exactly *w* x *h* pixels."""
        return [image for image in self.images if image.width == w and image.height == h]
|
||||
|
||||
|
||||
def get_part_by_odt_name(name: str) -> int:
    """Extract the part number from a ``beyond-clouds-<part>[v<n>].odt`` name.

    :param name: bare file name (no directory), e.g. ``beyond-clouds-21.odt``
        or ``beyond-clouds-21v2.odt``.
    :return: the part number as an integer.
    :raises ValueError: if *name* does not follow that pattern.
    """
    # fullmatch rejects a trailing newline, which "$" would tolerate. The
    # captured group is guaranteed to be digits, so no further check is needed.
    m = re.fullmatch(r'beyond-clouds-(\d+)(?:v\d+)?\.odt', name)
    if m is None:
        raise ValueError('could not parse file name')
    return int(m.group(1))
|
||||
|
||||
|
||||
def part_image_list(part) -> ImageList:
    """Collect the images referenced by the Russian text of one part.

    Reads ``../tzo/beyond-clouds-<part>-ru.txt`` relative to this module,
    finds every Markdown image reference in it and registers each one in a
    new :class:`ImageList`.

    :param part: part number used to build the file name.
    :raises OSError: if the text file (or a referenced local image) is missing.
    """
    file = os.path.realpath(os.path.join(
        os.path.dirname(__file__),
        '..',
        'tzo',
        f'beyond-clouds-{part}-ru.txt',
    ))
    # Explicit encoding: the text is Russian and the platform default is not
    # UTF-8 everywhere. NOTE(review): assumes the files are stored as UTF-8.
    with open(file, encoding='utf-8') as f:
        txt = f.read()

    images = ImageList()
    for target in re.findall(r'!\[.*?]\((.*?)\)', txt):
        # A link target may carry a quoted title: ![alt](url "title").
        # Only the URL part identifies the image.
        pieces = target.split(maxsplit=1)
        url = pieces[0] if pieces else target
        images.add_image(url)

    return images
|
@ -21,12 +21,12 @@ def image_url_to_filename(url):
|
||||
parsed_url = urlparse(url)
|
||||
filename = os.path.basename(parsed_url.path)
|
||||
name, ext = os.path.splitext(filename)
|
||||
date_match = re.search(r'(\d{4})/(\d{2})/(\d{2})?', parsed_url.path)
|
||||
date_match = re.search(r'(\d{4})/(\d{2})', parsed_url.path)
|
||||
if not date_match:
|
||||
raise ValueError("no valid date found in URL")
|
||||
year = date_match.group(1)
|
||||
day = date_match.group(3) if date_match.group(3) else "01"
|
||||
return f"{year}{day}_{name}{ext}"
|
||||
month = date_match.group(2)
|
||||
return f"{year}{month}_{name}{ext}"
|
||||
|
||||
|
||||
def extract_images_from_markdown(markdown_text):
|
||||
|
51
odt_to_md.py
Executable file
51
odt_to_md.py
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python3
|
||||
import re
|
||||
|
||||
from argparse import ArgumentParser
|
||||
from os.path import basename
|
||||
|
||||
from idb import tzo
|
||||
from idb.doc import DocumentReader
|
||||
from idb.tzo import get_part_by_odt_name
|
||||
|
||||
|
||||
def tzo_replace_images(md: str,
                       tzo_part: int,
                       dr: DocumentReader):
    """Replace embedded-image references in *md* with the part's known images.

    Every Markdown image in *md* is measured through
    ``dr.get_embedded_image_size`` and replaced by all images of the given
    part that have exactly the same pixel size.

    :param md: Markdown text whose image targets are paths understood by
        ``dr.get_embedded_image_size``.
    :param tzo_part: part number passed to ``tzo.part_image_list``.
    :param dr: reader of the document the Markdown was produced from.
    :return: the Markdown with the image references substituted. A reference
        whose size matches no known image is removed.
    """
    il = tzo.part_image_list(tzo_part)

    def _markdown_image(image, title, alt='') -> str:
        # NOTE(review): the image is referenced by its original URL; confirm
        # that this (and not the local path) is what the output should use.
        if title:
            return f'![{alt}]({image.url} "{title}")'
        return f'![{alt}]({image.url})'

    def _repl(match: re.Match) -> str:
        orig_alt, path, title = match.groups()
        w, h = dr.get_embedded_image_size(path)
        # Several images may share one size; all of them are emitted so the
        # ambiguity stays visible in the output instead of aborting.
        found_images = il.get_images_by_size(w, h)
        return ''.join(_markdown_image(image, title, orig_alt) for image in found_images)

    regex = re.compile(r'!\[(.*?)]\((\S+?)(?:\s+"(.*?)")?\)')
    return regex.sub(_repl, md)
|
||||
|
||||
|
||||
def main():
    """Print the Markdown of one column of the ODT file given on the command line.

    With ``--tzo`` the embedded images are additionally replaced through
    ``tzo_replace_images``, using the part number parsed from the file name.
    """
    cli = ArgumentParser()
    argument_specs = (
        (('-i', '--input'), {'required': True, 'type': str, 'help': 'Input file'}),
        (('-c', '--column'), {'default': 1, 'type': int, 'help': 'Column number'}),
        (('--tzo',), {'action': 'store_true', 'help': 'TZO'}),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    opts = cli.parse_args()

    document = DocumentReader(opts.input)
    markdown = document.get_markdown(opts.column)
    if opts.tzo:
        part = get_part_by_odt_name(basename(opts.input))
        markdown = tzo_replace_images(markdown, part, document)

    print(markdown)


if __name__ == '__main__':
    main()
|
27
single_odt.py
Executable file
27
single_odt.py
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python3
|
||||
import os.path
|
||||
from idb import Article, DocumentCreator
|
||||
from idb.util import image_url_to_filename, download_file, extract_images_from_markdown, read_file
|
||||
from argparse import ArgumentParser
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Build one ODT file from a Russian original and its English translation,
    # downloading any images referenced by the original that are not yet
    # present in the ./images directory next to this script.
    cli = ArgumentParser()
    for flag, description in (
        ('--ru-file', 'russian input file'),
        ('--en-file', 'english input file'),
        ('--output', 'output ODT file'),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)
    args = cli.parse_args()

    orig = Article.from_markdown_file(args.ru_file, with_title=False)
    trans = Article.from_markdown_file(args.en_file, with_title=False)

    image_urls = extract_images_from_markdown(read_file(args.ru_file))
    script_dir = os.path.dirname(__file__)
    for image_url in image_urls:
        image_name = image_url_to_filename(image_url)
        output_file = os.path.join(script_dir, 'images', image_name)
        if os.path.exists(output_file):
            # Already downloaded on a previous run.
            continue
        download_file(image_url, output_file)
        print(f'{image_name} saved')

    doc = DocumentCreator()
    doc.create(orig, trans, args.output)
|
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import os.path
|
||||
from idb import tzo_urls
|
||||
from argparse import ArgumentParser
|
||||
from idb import tzo_urls, after_tzo_urls
|
||||
from idb.util import read_file, name_from_url, image_url_to_filename, download_file, extract_images_from_markdown
|
||||
from dotenv import load_dotenv
|
||||
|
||||
@ -8,7 +9,13 @@ load_dotenv()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
for url in tzo_urls:
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument('--after', action='store_true')
|
||||
args = parser.parse_args()
|
||||
|
||||
urls = tzo_urls if not args.after else after_tzo_urls
|
||||
|
||||
for url in urls:
|
||||
name = name_from_url(url)
|
||||
markdown_file = os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt')
|
||||
image_urls = extract_images_from_markdown(read_file(markdown_file))
|
||||
|
11
tzo_odt.py
11
tzo_odt.py
@ -1,11 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
import os.path
|
||||
from idb import Article, DocumentCreator, tzo_urls
|
||||
from argparse import ArgumentParser
|
||||
from idb import Article, DocumentCreator, tzo_urls, after_tzo_urls
|
||||
from idb.util import name_from_url
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
for url in tzo_urls:
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument('--after', action='store_true')
|
||||
args = parser.parse_args()
|
||||
|
||||
urls = tzo_urls if not args.after else after_tzo_urls
|
||||
|
||||
for url in urls:
|
||||
name = name_from_url(url)
|
||||
|
||||
orig = Article.from_markdown_file(os.path.join(os.path.dirname(__file__), 'tzo', f'{name}-ru.txt'), with_title=False)
|
||||
|
Loading…
x
Reference in New Issue
Block a user