# idb_utils/idb/tzo.py

import os
import re
from PIL import Image
from collections import namedtuple
from .util import image_url_to_filename

# URLs of the kniganews.org "beyond-clouds-*" posts, listed in ascending
# order of the publication date embedded in each URL path.
tzo_urls = (
    'https://kniganews.org/2012/12/20/beyond-clouds-1/',
    'https://kniganews.org/2012/12/21/beyond-clouds-21/',
    'https://kniganews.org/2012/12/22/beyond-clouds-22/',
    'https://kniganews.org/2012/12/23/beyond-clouds-31/',
    'https://kniganews.org/2012/12/24/beyond-clouds-32/',
    'https://kniganews.org/2012/12/25/beyond-clouds-33/',
    'https://kniganews.org/2012/12/28/beyond-clouds-41/',
    'https://kniganews.org/2012/12/29/beyond-clouds-42/',
    'https://kniganews.org/2012/12/30/beyond-clouds-43/',
    'https://kniganews.org/2013/01/01/beyond-clouds-44/',
    'https://kniganews.org/2013/01/06/beyond-clouds-51/',
    'https://kniganews.org/2013/01/07/beyond-clouds-52/',
    'https://kniganews.org/2013/02/16/beyond-clouds-53/',
    'https://kniganews.org/2013/03/25/beyond-clouds-61/',
    'https://kniganews.org/2013/05/10/beyond-clouds-62/',
    'https://kniganews.org/2013/06/17/beyond-clouds-731/',
    'https://kniganews.org/2013/08/07/beyond-clouds-732/',
    'https://kniganews.org/2013/09/17/beyond-clouds-73/'
)
# One further kniganews.org post URL, kept in its own tuple (the trailing
# comma makes this a one-element tuple rather than a plain string).
after_tzo_urls = (
    'https://kniganews.org/2012/11/17/langlands-plus/',
)
# Record describing one image: the URL it came from, its local file name,
# the local path of that file, and its width and height.
ImageInfo = namedtuple('ImageInfo', ('url', 'local_name', 'local_path', 'width', 'height'))


class ImageList:
    """Ordered collection of :class:`ImageInfo` records for local image files."""

    # Records in the order they were added via ``add_image``.
    images: list[ImageInfo]

    def __init__(self):
        self.images = []

    def add_image(self, url):
        """Record the local image that corresponds to *url*.

        The local file name is whatever ``image_url_to_filename(url)``
        returns; the file is looked up in the ``images`` directory that sits
        next to this module's parent directory. The image is opened only to
        read its dimensions.

        :param url: image URL to register.
        :raises OSError: propagated from ``Image.open`` when the file cannot
            be opened or identified as an image.
        """
        local_name = image_url_to_filename(url)
        local_path = os.path.realpath(os.path.join(
            os.path.dirname(__file__),
            '..',
            'images',
            local_name,
        ))
        # Image.open() is lazy and keeps the underlying file open; the
        # context manager closes it as soon as the size has been read, so
        # adding many images does not accumulate open file handles.
        with Image.open(local_path) as image:
            width, height = image.size
        self.images.append(ImageInfo(url, local_name, local_path, width, height))

    def get_images_by_size(self, w, h) -> list[ImageInfo]:
        """Return every recorded image whose size is exactly *w* x *h*.

        :param w: required width.
        :param h: required height.
        :return: a new list of matching records, in insertion order; empty
            when nothing matches.
        """
        return [info for info in self.images if info.width == w and info.height == h]


def get_part_by_odt_name(name: str) -> int:
    """Extract the part number from an ODT file name.

    Accepted names have the form ``beyond-clouds-<digits>.odt``, optionally
    with a ``v<digits>`` suffix before the extension (for example
    ``beyond-clouds-731v2.odt``); the suffix is ignored.

    :param name: file name to parse (no directory components).
    :return: the ``<digits>`` part of the name as an integer.
    :raises ValueError: if *name* does not have the expected form.
    """
    m = re.match(r'^beyond-clouds-(\d+)(?:v\d+)?\.odt$', name)
    if not m:
        raise ValueError('could not parse file name')
    # The group is captured by ``\d+``, so it consists solely of decimal
    # digits and needs no further numeric validation before conversion.
    return int(m.group(1))


def part_image_list(part) -> ImageList:
    """Collect the images referenced by the text file of one part.

    Reads ``beyond-clouds-<part>-ru.txt`` from the ``tzo`` directory next to
    this module's parent directory, finds every Markdown-style image
    reference ``![...](target)`` in it and registers each target with a new
    :class:`ImageList`, in order of appearance.

    :param part: value interpolated into the text file name.
    :return: the populated image list.
    """
    module_dir = os.path.dirname(__file__)
    text_path = os.path.realpath(
        os.path.join(module_dir, '..', 'tzo', f'beyond-clouds-{part}-ru.txt')
    )

    # NOTE(review): no explicit encoding is passed, so the text is decoded
    # with the platform default encoding.
    with open(text_path) as handle:
        content = handle.read()

    collected = ImageList()
    for reference in re.finditer(r'!\[.*?]\((.*?)\)', content):
        # Group 1 is the text between the parentheses of the reference.
        collected.add_image(reference.group(1))
    return collected