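# Downloader for RetroNews (retronews.fr) issues: fetches page tiles from the
# pv5web API, stitches them into per-page JPEGs with ImageMagick's "convert",
# and assembles the pages into a single PDF.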
import imghdr
import json
import os
import queue
import re
import shutil
import traceback
import urllib.error

from threading import Thread
from typing import Optional

import requests

from ..util.util import safe_print, download_file, run

_pages_queue = queue.Queue()
_merging_queue = queue.Queue()

# Default tile grid for a page (HTILES columns x VTILES rows), used when no
# probed grid size (meta.json) is available.
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8

# Month names as they appear in the publication date part of RetroNews URLs;
# some issues use French month names.
MONTHS = dict(
    jan=1,
    feb=2,
    mar=3,
    apr=4,
    may=5,
    jun=6,
    jul=7,
    juillet=7,
    aout=8,
    aug=8,
    sep=9,
    oct=10,
    nov=11,
    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
    dec=12
)
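# Helpers for parsing issue URLs/dates and building pv5web API endpoint URLs.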
def convert_date(s: str) -> tuple[str, str, str]:
    """Convert a 'DD-month-YYYY' date from an issue URL into (year, month, day) strings."""
    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
    year = m[2]
    month = '%02d' % MONTHS[m[1]]
    day = m[0]
    return year, month, day


def parse_url(url: str) -> tuple:
    """Extract (publication_date, collection_id, document_id) from an issue URL."""
    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


def _doc_info(collection_id, doc_id):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
    return r.json()


def page_info(collection_id, doc_id, page):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
    return r.json()


def thumbnail_url(collection_id, doc_id, page) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'


def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
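# Downloads a single file in its own thread; the outcome is exposed through
# is_downloaded() so callers can retry or fall back.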
class DownloaderThread(Thread):
    _url: str
    _save_as: str
    _download_result: Optional[bool]
    _handle_http: bool
    user_info: dict

    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
        super().__init__()
        if user_info is None:
            user_info = {}
        if thread_name:
            self.name = thread_name

        self._url = url
        self._save_as = save_as
        self._download_result = None
        self._handle_http = handle_http
        self.user_info = user_info

    def run(self):
        try:
            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
        except urllib.error.HTTPError:
            pass

    def is_downloaded(self) -> bool:
        return self._download_result is True
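# Pulls page numbers from _merging_queue and stitches that page's v{v}_h{h}.jpg
# tiles into one JPEG: "convert -append" stacks each column vertically, then
# "convert +append" joins the columns horizontally. A saved thumbnail.jpg is
# copied as-is when the tiles had to be replaced by a thumbnail.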
class TileMergeWorker(Thread):
    _working_dir: str
    _number: int

    def __init__(self, working_dir: str, number: int):
        super().__init__()
        self._working_dir = working_dir
        self._number = number

    def run(self):
        safe_print(f'[tile merger {self._number}] started')

        while not _merging_queue.empty():
            try:
                page = _merging_queue.get_nowait()
                page_dir = os.path.join(self._working_dir, str(page))
                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                meta_path = os.path.join(page_dir, 'meta.json')

                if os.path.exists(thumbnail_path):
                    shutil.copy(thumbnail_path, os.path.join(self._working_dir, f'{page}.jpg'))
                    continue

                if os.path.exists(meta_path):
                    with open(meta_path, 'r') as f:
                        meta = json.loads(f.read())
                        htiles = meta['h']
                        vtiles = meta['v']
                else:
                    htiles = HTILES
                    vtiles = VTILES

                hfiles = []
                for h in range(htiles):
                    vfiles = []
                    for v in range(vtiles):
                        vfiles.append(f'v{v}_h{h}.jpg')
                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                    hfiles.append(f'{h}.jpg')

                run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
                # shutil.rmtree(page_dir)

                safe_print(f'[tile merger {self._number}] page {page} done')

            except queue.Empty:
                break
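# Pulls page numbers from _pages_queue and downloads their tiles. normal_dl()
# assumes the default HTILES x VTILES grid; probe_dl() probes up to a 10x10
# grid and records the discovered size in meta.json; thumbnail_dl() is the
# fallback when tiles come back corrupt.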
class PageFetchWorker(Thread):
    _working_dir: str
    _number: int
    _failed: bool
    _error: Optional[str]
    _probe_pages: Optional[list[int]]
    _probe_all: bool

    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
        super().__init__()
        self._working_dir = working_dir
        self._number = number
        self._collection_id = collection_id
        self._doc_id = doc_id
        self._failed = False
        self._error = None
        self._probe_pages = probe_pages
        self._probe_all = probe_all

    def run(self):
        safe_print(f'[pf-{self._number}] started')
        page = 0

        try:
            while not _pages_queue.empty():
                try:
                    page = _pages_queue.get_nowait()
                    safe_print(f'[pf-{self._number}] page {page} started')

                    if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
                        self.probe_dl(page)
                    else:
                        try:
                            self.normal_dl(page)
                        except OSError:
                            safe_print(f'[pf-{self._number}] normal_dl() failed, trying probe_dl()')
                            self.probe_dl(page)

                except queue.Empty:
                    break

        except Exception as e:
            self._failed = True
            self._error = f'while fetching page {page}: {str(e)}' + traceback.format_exc()

    def _get_page_dir(self, page):
        page_dir = os.path.join(self._working_dir, str(page))
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        return page_dir

    def is_failed(self) -> bool:
        return self._failed

    def get_error(self) -> str:
        return self._error if self._error is not None else ''

    def normal_dl(self, page):
        page_dir = self._get_page_dir(page)
        dl_tasks = []
        for horiz_tile in range(HTILES):
            for vert_tile in range(VTILES):
                url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                output_file = os.path.join(page_dir, f'v{vert_tile}_h{horiz_tile}.jpg')
                if os.path.isfile(output_file):
                    if os.path.getsize(output_file) < 4:
                        # placeholder/corrupt file: remove it and download again
                        os.unlink(output_file)
                    else:
                        # safe_print(f'[pf-{self._number}] already exists')
                        continue

                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 thread_name=f'p{page}-v{vert_tile}-h{horiz_tile}'))

        for task in dl_tasks:
            task.start()

        data_error = False

        for task in dl_tasks:
            task.join()
            if not task.is_downloaded():
                # safe_print(f'failed to download file {task._url}')
                raise OSError(f'network error, failed to download {task._url}')

            elif not imghdr.what(task._save_as):
                data_error = True

        if data_error:
            self.thumbnail_dl(page)
        else:
            safe_print(f'[pf-{self._number}] page {page}: all files saved')

    def probe_dl(self, page):
        page_dir = self._get_page_dir(page)
        real_h = 0
        real_v = 0
        data_error = False
        dl_tasks = []
        for h in range(10):
            for v in range(10):
                url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                # name the tiles the same way normal_dl() does, so TileMergeWorker can find them
                output_file = os.path.join(page_dir, f'v{v}_h{h}.jpg')
                if os.path.isfile(output_file):
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                    if os.path.getsize(output_file) < 4:
                        os.unlink(output_file)
                    else:
                        continue

                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 handle_http=True,
                                                 thread_name=f'p{page}-v{v}-h{h}',
                                                 user_info=dict(h=h, v=v)))

        for task in dl_tasks:
            task.start()
        for task in dl_tasks:
            task.join()

            if task.is_downloaded():
                task_h = task.user_info['h']
                task_v = task.user_info['v']
                if task_h > real_h:
                    real_h = task_h
                if task_v > real_v:
                    real_v = task_v

                if not imghdr.what(task._save_as):
                    data_error = True

        # try:
        #     if not download_file(url, output_file, handle_http_errors=False):
        #         raise OSError('network failure')
        #     if not imghdr.what(output_file):
        #         data_error = True
        #         break
        #     real_v = v
        #     real_h = h
        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
        #
        # except urllib.error.HTTPError:
        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
        #     break

        if data_error:
            self.thumbnail_dl(page)
        else:
            with open(os.path.join(page_dir, 'meta.json'), 'w') as f:
                f.write(json.dumps(dict(v=real_v+1, h=real_h+1)))
            safe_print(f'[pf-{self._number}] page {page}: all files saved (seemingly...)')

    def thumbnail_dl(self, page):
        page_dir = self._get_page_dir(page)
        thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
        if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
            raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
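# Entry point for one issue: parse the URL, download all pages concurrently,
# merge the tiles and build <year>-<month>-<day>.pdf under output_root.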
def grab_magazine(url: str,
                  output_root: str,
                  probe_pages: Optional[list[int]] = None,
                  probe_all=False, only_fetch=False, force_overwrite=False):
    try:
        pub_date, collection_id, doc_id = parse_url(url)
    except AttributeError:
        return False

    data = _doc_info(collection_id, doc_id)
    pages = int(data['nbPages'])
    print(f'found {pages} pages')

    y, m, d = convert_date(pub_date)
    if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
        if not force_overwrite:
            print(f'{y}-{m}-{d}.pdf already exists, not continuing')
            return True
        else:
            os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
            print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')

    output_dir = os.path.join(output_root, pub_date)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # fetch pages
    for page in range(pages):
        _pages_queue.put(page + 1)

    pool = []
    for i in range(PAGE_FETCHING_POOL_SIZE):
        pool.append(PageFetchWorker(working_dir=output_dir,
                                    number=i+1,
                                    collection_id=collection_id,
                                    doc_id=doc_id,
                                    probe_pages=probe_pages,
                                    probe_all=probe_all))
    for worker in pool:
        worker.start()

    for worker in pool:
        worker.join()
        if worker.is_failed():
            with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                f.write(f'error: {worker.get_error()}')
            print(f'ERROR: failed to download {pub_date} magazine')
            return False

    if only_fetch:
        return True

    # merge tiles
    for page in range(pages):
        _merging_queue.put(page + 1)

    pool = []
    for i in range(TILE_MERGING_POOL_SIZE):
        pool.append(TileMergeWorker(working_dir=output_dir, number=i+1))
    for worker in pool:
        worker.start()
    try:
        for worker in pool:
            worker.join()

        # merge images into pdf
        files = [str(page + 1) + '.jpg' for page in range(pages)]
        run(['convert', *files, os.path.join(output_root, f'{y}-{m}-{d}.pdf')], cwd=output_dir)
        shutil.rmtree(output_dir)
    except Exception:
        traceback.print_exc()

    return True
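# Concurrency knobs; call these before grab_magazine() if the default of 8
# threads per pool is not wanted.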
def set_tile_merging_pool_size(size):
    global TILE_MERGING_POOL_SIZE
    TILE_MERGING_POOL_SIZE = size


def set_page_fetching_pool_size(size):
    global PAGE_FETCHING_POOL_SIZE
    PAGE_FETCHING_POOL_SIZE = size