#!/usr/bin/env python3
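#
# Grabs magazine issues from retronews.fr: page tiles are downloaded
# concurrently, stitched together with ImageMagick's `convert`, and the pages
# are then combined into one PDF per issue.  Relies on the local `retronews`
# helper module (tile_url, thumbnail_url, doc_info, parse_url, convert_date)
# and the local `database` module.
#
# Example invocation (the script name here is just a placeholder):
#   ./grab.py --url 'https://www.retronews.fr/journal/...' --output ./issues --continue-prev
#
# Note: the script uses the `imghdr` module, which was removed in Python 3.13,
# so it needs an older interpreter.
#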
import logging
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import sys
import json
import re
import imghdr
import requests
import urllib.request
import urllib.error
import http.client
import subprocess
import shutil
import queue
import traceback
import retronews

from database import Database
from typing import Optional
from threading import Thread, Lock
from time import sleep
from argparse import ArgumentParser

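# Default tile grid of a page (HTILES columns by VTILES rows) and the sizes of
# the worker pools; the pool sizes can be overridden with --merge-threads and
# --fetch-threads.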
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8

database = Database()
print_lock = Lock()

pages_queue = queue.Queue()
merging_queue = queue.Queue()


def safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)


def run(args: list, **kwargs):
    p = subprocess.run(args, **kwargs)
    if p.returncode != 0:
        raise OSError(f'{args[0]} returned {p.returncode} (' + ' '.join(args) + ')')


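# Downloads a single URL in its own thread via download_file().  When handle_http
# is True, HTTP errors from download_file() propagate into this thread and are
# swallowed here, leaving the download marked as not done; probe_dl() relies on
# that to detect where a page's tile grid ends.  user_info carries the tile
# coordinates back to the caller.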
class DownloaderThread(Thread):
    _url: str
    _save_as: str
    _download_result: Optional[bool]
    _handle_http: bool
    user_info: dict

    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
        super().__init__()
        if user_info is None:
            user_info = {}
        if thread_name:
            self.name = thread_name

        self._url = url
        self._save_as = save_as
        self._download_result = None
        self._handle_http = handle_http
        self.user_info = user_info

    def run(self):
        try:
            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
        except urllib.error.HTTPError:
            pass

    def is_downloaded(self) -> bool:
        return self._download_result is True


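# Merges the downloaded tiles of one page into a single JPEG with ImageMagick:
# each column of tiles ({h}x{v}.jpg) is stacked with `convert -append`, then the
# columns are joined side by side with `convert +append`.  Tile counts are read
# from the page's meta.json (written by probe_dl) or fall back to HTILES/VTILES.
# If only a thumbnail.jpg exists for the page, it is copied as the page image.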
class TileMergeWorker(Thread):
    _working_dir: str
    _number: int

    def __init__(self, working_dir: str, number: int):
        super().__init__()
        self._working_dir = working_dir
        self._number = number

    def run(self):
        safe_print(f'[tile merger {self._number}] started')

        while not merging_queue.empty():
            try:
                page = merging_queue.get_nowait()
                page_dir = os.path.join(self._working_dir, str(page))
                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                meta_path = os.path.join(page_dir, 'meta.json')

                if os.path.exists(thumbnail_path):
                    shutil.copy(thumbnail_path, os.path.join(self._working_dir, f'{page}.jpg'))
                    continue

                if os.path.exists(meta_path):
                    with open(meta_path, 'r') as f:
                        meta = json.loads(f.read())
                        htiles = meta['h']
                        vtiles = meta['v']
                else:
                    htiles = HTILES
                    vtiles = VTILES

                hfiles = []
                for h in range(htiles):
                    vfiles = []
                    for v in range(vtiles):
                        vfiles.append(f'{h}x{v}.jpg')
                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                    hfiles.append(f'{h}.jpg')

                run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
                # shutil.rmtree(page_dir)

                safe_print(f'[tile merger {self._number}] page {page} done')

            except queue.Empty:
                break


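# Fetches all tiles of the pages taken from pages_queue.  normal_dl() assumes
# the default HTILES x VTILES grid; probe_dl() probes a grid of up to 10x10
# tiles and records the discovered dimensions in meta.json; thumbnail_dl() is
# the fallback when downloaded tiles turn out to be corrupt.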
class PageFetchWorker(Thread):
    _working_dir: str
    _number: int
    _failed: bool
    _error: Optional[str]
    _probe_pages: list[int]
    _probe_all: bool

    def __init__(self, working_dir: str, number: int, collection_id, doc_id,
                 probe_pages: Optional[list[int]] = None, probe_all=False):
        super().__init__()
        self._working_dir = working_dir
        self._number = number
        self._collection_id = collection_id
        self._doc_id = doc_id
        self._failed = False
        self._error = None
        # probe_pages may be None (no --fetch-probe-pages given); normalize to a list
        self._probe_pages = probe_pages if probe_pages is not None else []
        self._probe_all = probe_all

    def run(self):
        safe_print(f'[pf-{self._number}] started')
        page = 0

        try:
            while not pages_queue.empty():
                try:
                    page = pages_queue.get_nowait()
                    safe_print(f'[pf-{self._number}] page {page} started')

                    if self._probe_all or page in self._probe_pages:
                        self.probe_dl(page)
                    else:
                        try:
                            self.normal_dl(page)
                        except OSError:
                            safe_print(f'[pf-{self._number}] normal_dl() failed, trying probe_dl()')
                            self.probe_dl(page)

                except queue.Empty:
                    break

        except Exception as e:
            self._failed = True
            self._error = f'while fetching page {page}: {str(e)}'

    def _get_page_dir(self, page):
        page_dir = os.path.join(self._working_dir, str(page))
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        return page_dir

    def is_failed(self) -> bool:
        return self._failed

    def get_error(self) -> str:
        return self._error if self._error is not None else ''

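    # Both download methods treat an existing tile file smaller than 4 bytes as
    # a corrupt leftover from an interrupted run: it is deleted and skipped for
    # this pass, so a subsequent run fetches it again.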
    def normal_dl(self, page):
        page_dir = self._get_page_dir(page)
        dl_tasks = []
        for horiz_tile in range(HTILES):
            for vert_tile in range(VTILES):
                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                # tiles are saved as {h}x{v}.jpg, which is what TileMergeWorker expects
                output_file = os.path.join(page_dir, f'{horiz_tile}x{vert_tile}.jpg')
                if os.path.isfile(output_file):
                    if os.path.getsize(output_file) < 4:
                        os.unlink(output_file)
                    # safe_print(f'[pf-{self._number}] already exists')
                    continue

                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 thread_name=f'p{page}-v{vert_tile}-h{horiz_tile}'))

        for task in dl_tasks:
            task.start()

        data_error = False

        for task in dl_tasks:
            task.join()
            if not task.is_downloaded():
                # safe_print(f'failed to download file {task._url}')
                raise OSError(f'network error, failed to download {task._url}')
            elif not imghdr.what(task._save_as):
                data_error = True

        if data_error:
            self.thumbnail_dl(page)
        else:
            safe_print(f'[pf-{self._number}] page {page}: all files saved')

    def probe_dl(self, page):
        page_dir = self._get_page_dir(page)
        real_h = 0
        real_v = 0
        data_error = False
        dl_tasks = []
        for h in range(10):
            for v in range(10):
                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                output_file = os.path.join(page_dir, f'{h}x{v}.jpg')
                if os.path.isfile(output_file):
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                    if os.path.getsize(output_file) < 4:
                        os.unlink(output_file)
                    continue

                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 handle_http=True,
                                                 thread_name=f'p{page}-v{v}-h{h}',
                                                 user_info=dict(h=h, v=v)))

        for task in dl_tasks:
            task.start()
        for task in dl_tasks:
            task.join()

            if task.is_downloaded():
                task_h = task.user_info['h']
                task_v = task.user_info['v']
                if task_h > real_h:
                    real_h = task_h
                if task_v > real_v:
                    real_v = task_v

                if not imghdr.what(task._save_as):
                    data_error = True

        # try:
        #     if not download_file(url, output_file, handle_http_errors=False):
        #         raise OSError('network failure')
        #     if not imghdr.what(output_file):
        #         data_error = True
        #         break
        #     real_v = v
        #     real_h = h
        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
        #
        # except urllib.error.HTTPError:
        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
        #     break

        if data_error:
            self.thumbnail_dl(page)
        else:
            with open(os.path.join(page_dir, 'meta.json'), 'w') as f:
                f.write(json.dumps(dict(v=real_v+1, h=real_h+1)))
            safe_print(f'[pf-{self._number}] page {page}: all files saved (seemingly...)')

    def thumbnail_dl(self, page):
        page_dir = self._get_page_dir(page)
        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')


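# Downloads url to output with urllib, retrying up to 3 times when the remote
# side drops the connection.  With handle_http_errors=True an HTTP error is
# reported and False is returned; with handle_http_errors=False it is re-raised
# so the caller can treat it as "tile does not exist" (see probe_dl).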
def download_file(url, output, handle_http_errors=True) -> bool:
    tries_left = 3
    ok = False
    while tries_left > 0:
        try:
            urllib.request.urlretrieve(url, output)
            ok = True
            break
        except http.client.RemoteDisconnected:
            ok = False
            print(' caught an exception, sleeping for 2 seconds and retrying...')
            sleep(2)
            tries_left -= 1
        except urllib.error.HTTPError as e:
            if not handle_http_errors:
                raise e
            else:
                print(f' failed to download {url}: {str(e)}')
                return False
    return ok


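# Downloads one magazine issue: parses the issue URL, asks RetroNews how many
# pages it has, fetches all page tiles with a pool of PageFetchWorker threads,
# merges the tiles with a pool of TileMergeWorker threads and finally combines
# the per-page JPEGs into a single yyyy-mm-dd.pdf with ImageMagick.  Returns
# False when the URL cannot be parsed or fetching failed, True otherwise.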
def grab_magazine(url: str,
                  output_root: str,
                  probe_pages: Optional[list[int]] = None,
                  probe_all=False, only_fetch=False, force_overwrite=False):
    try:
        pub_date, collection_id, doc_id = retronews.parse_url(url)
    except AttributeError:
        return False

    data = retronews.doc_info(collection_id, doc_id)
    pages = int(data['nbPages'])
    print(f'found {pages} pages')

    y, m, d = retronews.convert_date(pub_date)
    if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
        if not force_overwrite:
            print(f'{y}-{m}-{d}.pdf already exists, not continuing')
            return True
        else:
            os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
            print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')

    output_dir = os.path.join(output_root, pub_date)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # fetch pages
    for page in range(pages):
        pages_queue.put(page + 1)

    pool = []
    for i in range(PAGE_FETCHING_POOL_SIZE):
        pool.append(PageFetchWorker(working_dir=output_dir,
                                    number=i+1,
                                    collection_id=collection_id,
                                    doc_id=doc_id,
                                    probe_pages=probe_pages,
                                    probe_all=probe_all))
    for worker in pool:
        worker.start()

    for worker in pool:
        worker.join()
        if worker.is_failed():
            with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                f.write(f'error: {worker.get_error()}')
            print(f'ERROR: failed to download {pub_date} magazine')
            return False

    if only_fetch:
        return True

    # merge tiles
    for page in range(pages):
        merging_queue.put(page + 1)

    pool = []
    for i in range(TILE_MERGING_POOL_SIZE):
        pool.append(TileMergeWorker(working_dir=output_dir, number=i+1))
    for worker in pool:
        worker.start()
    try:
        for worker in pool:
            worker.join()

        # merge images into pdf
        files = [str(page + 1) + '.jpg' for page in range(pages)]
        run(['convert', *files, os.path.join(output_root, f'{y}-{m}-{d}.pdf')], cwd=output_dir)
        shutil.rmtree(output_dir)
    except:
        traceback.print_exc()

    return True


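# CLI entry point.  With --continue-next / --continue-prev the script keeps
# walking through issues by following the SUIVANT / PRÉCÉDENT links found in
# the RetroNews page HTML, until such a link can no longer be found.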
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--url', type=str, required=True)
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--continue-prev', action='store_true',
                        help='keep scraping backwards in time')
    parser.add_argument('--continue-next', action='store_true',
                        help='keep scraping forwards in time')
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when the count of vertical and horizontal tiles is unknown')

    args = parser.parse_args()

    with_continuation = args.continue_prev or args.continue_next
    if args.fetch_probe_pages and with_continuation:
        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one-time hack')
    if args.only_fetch and with_continuation:
        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one-time hack')

    TILE_MERGING_POOL_SIZE = args.merge_threads
    PAGE_FETCHING_POOL_SIZE = args.fetch_threads

    url = args.url
    while True:
        print(f'grabbing {url}...')
        if not grab_magazine(url,
                             output_root=args.output,
                             probe_pages=args.fetch_probe_pages,
                             probe_all=args.force_probe,
                             only_fetch=args.only_fetch,
                             force_overwrite=args.force_overwrite):
            logging.error('failed to grab')
            break

        if not args.continue_prev and not args.continue_next:
            break

        r = requests.get(url)

        try:
            next_url = None
            if args.continue_next:
                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
            elif args.continue_prev:
                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]

            if not next_url:
                break

            if next_url.startswith('/'):
                next_url = f'https://www.retronews.fr{next_url}'

            url = next_url

        except:
            print('error: failed to find next/previous link! exiting')
            break