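"""Downloader for scanned press issues from retronews.fr.

The pv5web.retronews.fr API serves every page of a document as a grid of JPEG
tiles. Pages are fetched by a pool of PageFetchWorker threads, the tiles of
each page are stitched back together by TileMergeWorker threads using
ImageMagick's `convert`, and the page images are finally assembled into a
single PDF named after the publication date.
"""
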
import re
import requests
import imghdr
import json
import os
import queue
import shutil
import traceback
from ..util.util import safe_print, download_file, run
from typing import Optional
from threading import Thread
import urllib.error
_pages_queue = queue.Queue()
_merging_queue = queue.Queue()
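
# Default tile grid per page (HTILES columns by VTILES rows, unless probing
# discovers otherwise) and the sizes of the fetching/merging thread pools.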
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8
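
# Month names as they appear in retronews publication dates: English
# three-letter abbreviations plus a few full French names (juillet, aout,
# novembre) that show up in real URLs.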
MONTHS = dict(
jan=1,
feb=2,
mar=3,
apr=4,
may=5,
jun=6,
jul=7,
juillet=7,
aout=8,
aug=8,
sep=9,
oct=10,
nov=11,
novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
dec=12
)
def convert_date(s: str) -> tuple[str, str, str]:
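    """Convert a date as it appears in retronews URLs, e.g. '15-novembre-1905',
    into a tuple of (year, month, day) strings such as ('1905', '11', '15')."""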
m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
year = m[2]
month = '%02d' % MONTHS[m[1]]
day = m[0]
return year, month, day
def parse_url(url: str) -> tuple:
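    """Extract (publication date, collection id, document id) from a document URL;
    e.g. '.../journal/mercure-de-france/15-novembre-1905/118/2617647/1'
    yields ('15-novembre-1905', '118', '2617647')."""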
return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
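

# Thin wrappers around the pv5web.retronews.fr document API. Note that the
# tile endpoint takes the horizontal tile index before the vertical one.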
def _doc_info(collection_id, doc_id):
r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
return r.json()
def page_info(collection_id, doc_id, page):
r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
return r.json()
def thumbnail_url(collection_id, doc_id, page) -> str:
return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
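

# A single tile (or thumbnail) download running in its own thread.
# download_file() from ..util.util is expected to return True on success; when
# handle_http is set, HTTP errors are allowed to propagate as
# urllib.error.HTTPError and are swallowed here, so is_downloaded() reports
# failure instead of the whole worker crashing.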
class DownloaderThread(Thread):
_url: str
_save_as: str
_download_result: Optional[bool]
_handle_http: bool
user_info: dict
def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
super().__init__()
if user_info is None:
user_info = {}
if thread_name:
self.name = thread_name
self._url = url
self._save_as = save_as
self._download_result = None
self._handle_http = handle_http
self.user_info = user_info
def run(self):
try:
self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
except urllib.error.HTTPError:
pass
def is_downloaded(self) -> bool:
return self._download_result is True
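

# Stitches the downloaded tiles of a page back into a single JPEG: for every
# column h, the vertical tiles are stacked with `convert -append`, and the
# resulting columns are then joined with `convert +append`. Pages that only
# have a thumbnail.jpg fallback are copied as-is. Tile counts come from
# meta.json when the page was probed, otherwise the HTILES/VTILES defaults
# are assumed.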
class TileMergeWorker(Thread):
_working_dir: str
_number: int
def __init__(self, working_dir: str, number: int):
super().__init__()
self._working_dir = working_dir
self._number = number
def run(self):
safe_print(f'[tile merger {self._number}] started')
while not _merging_queue.empty():
try:
page = _merging_queue.get_nowait()
page_dir = os.path.join(self._working_dir, str(page))
thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
meta_path = os.path.join(page_dir, 'meta.json')
if os.path.exists(thumbnail_path):
shutil.copy(thumbnail_path, os.path.join(self._working_dir, f'{page}.jpg'))
continue
if os.path.exists(meta_path):
with open(meta_path, 'r') as f:
meta = json.loads(f.read())
htiles = meta['h']
vtiles = meta['v']
else:
htiles = HTILES
vtiles = VTILES
hfiles = []
for h in range(htiles):
vfiles = []
for v in range(vtiles):
vfiles.append(f'v{v}_h{h}.jpg')
run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
hfiles.append(f'{h}.jpg')
run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
# shutil.rmtree(page_dir)
safe_print(f'[tile merger {self._number}] page {page} done')
except queue.Empty:
break
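

# Downloads all tiles of the pages queued in _pages_queue. normal_dl() assumes
# the default HTILES x VTILES grid; probe_dl() tries a grid of up to 10x10
# tiles and records the discovered dimensions in meta.json; pages whose tiles
# come back corrupt are replaced with the API thumbnail.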
class PageFetchWorker(Thread):
_working_dir: str
_number: int
_failed: bool
_error: Optional[str]
_probe_pages: Optional[list[int]]
_probe_all: bool
def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
super().__init__()
self._working_dir = working_dir
self._number = number
self._collection_id = collection_id
self._doc_id = doc_id
self._failed = False
self._error = None
self._probe_pages = probe_pages
self._probe_all = probe_all
def run(self):
safe_print(f'[pf-{self._number}] started')
page = 0
try:
while not _pages_queue.empty():
try:
page = _pages_queue.get_nowait()
safe_print(f'[pf-{self._number}] page {page} started')
if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
self.probe_dl(page)
else:
try:
self.normal_dl(page)
except OSError:
safe_print(f'[pf-{self._number}] normal_dl() failed, trying probe_dl()')
self.probe_dl(page)
except queue.Empty:
break
except Exception as e:
self._failed = True
            self._error = f'while fetching page {page}: {e}\n' + traceback.format_exc()
def _get_page_dir(self, page):
page_dir = os.path.join(self._working_dir, str(page))
if not os.path.exists(page_dir):
os.makedirs(page_dir)
return page_dir
def is_failed(self) -> bool:
return self._failed
def get_error(self) -> str:
return self._error if self._error is not None else ''
def normal_dl(self, page):
page_dir = self._get_page_dir(page)
dl_tasks = []
for horiz_tile in range(HTILES):
for vert_tile in range(VTILES):
url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
if os.path.isfile(output_file):
if os.path.getsize(output_file) < 4:
os.unlink(output_file)
# safe_print(f'[pf-{self._number}] already exists')
continue
                # output_file already includes page_dir, so it must not be joined with it again
                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 thread_name=f'p{page}-v{vert_tile}-h{horiz_tile}'))
for task in dl_tasks:
task.start()
data_error = False
for task in dl_tasks:
task.join()
if not task.is_downloaded():
# safe_print(f'failed to download file {task._url}')
raise OSError(f'network error, failed to download {task._url}')
elif not imghdr.what(task._save_as):
data_error = True
if data_error:
self.thumbnail_dl(page)
else:
safe_print(f'[pf-{self._number}] page {page}: all files saved')
def probe_dl(self, page):
page_dir = self._get_page_dir(page)
real_h = 0
real_v = 0
data_error = False
dl_tasks = []
for h in range(10):
for v in range(10):
url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                # name probe tiles the same way normal_dl() does, since TileMergeWorker
                # later looks for v{v}_h{h}.jpg files
                output_file = os.path.join(page_dir, f'v{v}_h{h}.jpg')
if os.path.isfile(output_file):
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
if os.path.getsize(output_file) < 4:
os.unlink(output_file)
continue
                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 handle_http=True,
                                                 thread_name=f'p{page}-v{v}-h{h}',
                                                 user_info=dict(h=h, v=v)))
for task in dl_tasks:
task.start()
for task in dl_tasks:
task.join()
if task.is_downloaded():
task_h = task.user_info['h']
task_v = task.user_info['v']
if task_h > real_h:
real_h = task_h
if task_v > real_v:
real_v = task_v
if not imghdr.what(task._save_as):
data_error = True
# try:
# if not download_file(url, output_file, handle_http_errors=False):
# raise OSError('network failure')
# if not imghdr.what(output_file):
# data_error = True
# break
# real_v = v
# real_h = h
# safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
#
# except urllib.error.HTTPError:
# safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
# break
if data_error:
self.thumbnail_dl(page)
else:
with open(os.path.join(page_dir, 'meta.json'), 'w') as f:
f.write(json.dumps(dict(v=real_v+1, h=real_h+1)))
safe_print(f'[pf-{self._number}] page {page}: all files saved (seemingly...)')
def thumbnail_dl(self, page):
page_dir = self._get_page_dir(page)
thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
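

# End-to-end download of a single issue: parse the document URL, ask the API
# for the page count, fetch all pages with a pool of PageFetchWorker threads,
# merge their tiles with TileMergeWorker threads, assemble the pages into
# <output_root>/<YYYY-MM-DD>.pdf with `convert` and remove the per-issue
# working directory.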
def grab_magazine(url: str,
output_root: str,
probe_pages: Optional[list[int]] = None,
probe_all=False, only_fetch=False, force_overwrite=False):
try:
pub_date, collection_id, doc_id = parse_url(url)
except AttributeError:
return False
data = _doc_info(collection_id, doc_id)
pages = int(data['nbPages'])
print(f'found {pages} pages')
y, m, d = convert_date(pub_date)
if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
if not force_overwrite:
print(f'{y}-{m}-{d}.pdf already exists, not continuing')
return True
else:
os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
output_dir = os.path.join(output_root, pub_date)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# fetch pages
for page in range(pages):
_pages_queue.put(page+1)
pool = []
for i in range(PAGE_FETCHING_POOL_SIZE):
pool.append(PageFetchWorker(working_dir=output_dir,
number=i+1,
collection_id=collection_id,
doc_id=doc_id,
probe_pages=probe_pages,
probe_all=probe_all))
for worker in pool:
worker.start()
for worker in pool:
worker.join()
if worker.is_failed():
with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
f.write(f'error: {worker.get_error()}')
print(f'ERROR: failed to download {pub_date} magazine')
return False
if only_fetch:
return True
# merge tiles
    for page in range(1, pages + 1):
        _merging_queue.put(page)
pool = []
for i in range(TILE_MERGING_POOL_SIZE):
pool.append(TileMergeWorker(working_dir=output_dir, number=i+1))
for worker in pool:
worker.start()
try:
for worker in pool:
worker.join()
# merge images into pdf
files = [str(page + 1) + '.jpg' for page in range(pages)]
run(['convert', *files, os.path.join(output_root, f'{y}-{m}-{d}.pdf')], cwd=output_dir)
shutil.rmtree(output_dir)
    except Exception:
        traceback.print_exc()
return True
def set_tile_merging_pool_size(size):
global TILE_MERGING_POOL_SIZE
TILE_MERGING_POOL_SIZE = size
def set_page_fetching_pool_size(size):
global PAGE_FETCHING_POOL_SIZE
PAGE_FETCHING_POOL_SIZE = size
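

# A minimal usage sketch (hypothetical caller; the output path is an arbitrary
# example and the package providing ..util.util must be importable):
#
#   set_page_fetching_pool_size(4)
#   set_tile_merging_pool_size(4)
#   grab_magazine(
#       'https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1',
#       output_root='/tmp/retronews')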