Evgeny Zinoviev 2023-02-18 20:07:00 +03:00
parent 209c6404eb
commit 3847423443
7 changed files with 114 additions and 40 deletions

.gitignore (new file)

@@ -0,0 +1,5 @@
+/.idea
+/venv
+/*.sqlite3
+__pycache__
+/test.py


@@ -20,5 +20,5 @@ if __name__ == '__main__':
             print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
             continue
         ratio = width/height
-        # TODO
-        # print(f'[]')
+        if ratio >= 0.8:
+            print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')

database/__init__.py (new file)

@@ -0,0 +1 @@
+from .database import Database


@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)
-        file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()


@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
 import os
 import sys
 import json
@@ -13,6 +17,7 @@ import shutil
 import queue
 import traceback
 import retronews
+import logging
 from database import Database
 from typing import Optional
@@ -20,6 +25,8 @@ from threading import Thread, Lock
 from time import sleep
 from argparse import ArgumentParser
+warnings.filterwarnings("ignore", category=DeprecationWarning)
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8
@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
     _url: str
     _save_as: str
     _download_result: Optional[bool]
+    _handle_http: bool
+    user_info: dict

-    def __init__(self, url: str, save_as: str, thread_name=None):
+    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
         super().__init__()
+        if user_info is None:
+            user_info = {}
         if thread_name:
             self.name = thread_name
         self._url = url
         self._save_as = save_as
         self._download_result = None
+        self._handle_http = handle_http
+        self.user_info = user_info

     def run(self):
-        self._download_result = download_file(self._url, self._save_as)
+        try:
+            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
+        except urllib.error.HTTPError:
+            pass

     def is_downloaded(self) -> bool:
         return self._download_result is True
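Note on the DownloaderThread changes above: with handle_http=True the thread catches urllib.error.HTTPError itself, so a 404 on a tile that does not exist simply leaves is_downloaded() returning False instead of killing the probe, and user_info lets the caller recover which tile a finished thread was responsible for. A minimal usage sketch (the URL and paths here are hypothetical):

    t = DownloaderThread(url='https://example.org/tiles/0x0.jpg',  # hypothetical tile URL
                         save_as='/tmp/page1/0x0.jpg',
                         thread_name='p1-v0-h0',
                         handle_http=True,   # a 404 is swallowed; is_downloaded() just stays False
                         user_info=dict(h=0, v=0))
    t.start()
    t.join()
    if t.is_downloaded():
        print('tile', t.user_info, 'saved')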
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
             try:
                 page = merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
-                thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
+                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')

                 if os.path.exists(thumbnail_path):
@@ -100,12 +116,12 @@
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'v{v}_h{h}.jpg')
+                        vfiles.append(f'{h}x{v}.jpg')
-                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
-                    hfiles.append(f'_v_{h}.jpg')
+                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
+                    hfiles.append(f'{h}.jpg')
                 run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
-                shutil.rmtree(page_dir)
+                # shutil.rmtree(page_dir)
                 safe_print(f'[tile merger {self._number}] page {page} done')
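The merge loop above shells out to ImageMagick twice per page: convert -append stacks each column's tiles top-to-bottom, then convert +append joins the resulting column strips left-to-right. A standalone sketch of the same two-pass merge, assuming the {h}x{v}.jpg tile naming introduced by this commit and a convert binary on PATH:

    from subprocess import run

    def merge_page_tiles(page_dir: str, out_path: str, htiles: int, vtiles: int) -> None:
        # first pass: stack each column's vertical tiles top-to-bottom (-append)
        hfiles = []
        for h in range(htiles):
            vfiles = [f'{h}x{v}.jpg' for v in range(vtiles)]
            run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir, check=True)
            hfiles.append(f'{h}.jpg')
        # second pass: join the column strips left-to-right (+append)
        run(['convert', '+append', *hfiles, out_path], cwd=page_dir, check=True)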
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
     _failed: bool
     _error: Optional[str]
     _probe_pages: Optional[list[int]]
+    _probe_all: bool

-    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
         super().__init__()
         self._working_dir = working_dir
         self._number = number
@@ -129,6 +146,7 @@
         self._failed = False
         self._error = None
         self._probe_pages = probe_pages
+        self._probe_all = probe_all

     def run(self):
         safe_print(f'[pf-{self._number}] started')
@@ -140,7 +158,7 @@
                 page = pages_queue.get_nowait()
                 safe_print(f'[pf-{self._number}] page {page} started')

-                if page in self._probe_pages:
+                if self._probe_all or page in self._probe_pages:
                     self.probe_dl(page)
                 else:
                     try:
@@ -209,28 +227,52 @@
         real_h = 0
         real_v = 0
         data_error = False
-        for h in range(5):
-            for v in range(5):
+        dl_tasks = []
+        for h in range(10):
+            for v in range(10):
                 url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
-                output_file = f'{page_dir}/v{v}_h{h}.jpg'
+                output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                     if os.path.getsize(output_file) < 4:
                         os.unlink(output_file)
                     continue

-                try:
-                    if not download_file(url, output_file, handle_http_errors=False):
-                        raise OSError('network failure')
-                    if not imghdr.what(output_file):
-                        data_error = True
-                        break
-                    real_v = v
-                    real_h = h
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
-                except urllib.error.HTTPError:
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
-                    break
+                dl_tasks.append(DownloaderThread(url=url,
+                                                 save_as=os.path.join(page_dir, output_file),
+                                                 handle_http=True,
+                                                 thread_name=f'p{page}-v{v}-h{h}',
+                                                 user_info=dict(h=h, v=v)))
+
+        for task in dl_tasks:
+            task.start()
+
+        for task in dl_tasks:
+            task.join()
+            if task.is_downloaded():
+                task_h = task.user_info['h']
+                task_v = task.user_info['v']
+                if task_h > real_h:
+                    real_h = task_h
+                if task_v > real_v:
+                    real_v = task_v
+            if not imghdr.what(task._save_as):
+                data_error = True
+
+        # try:
+        #     if not download_file(url, output_file, handle_http_errors=False):
+        #         raise OSError('network failure')
+        #     if not imghdr.what(output_file):
+        #         data_error = True
+        #         break
+        #     real_v = v
+        #     real_h = h
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
+        #
+        # except urllib.error.HTTPError:
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
+        #     break

         if data_error:
             self.thumbnail_dl(page)
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
-                  only_fetch=False, force_overwrite=False):
+                  probe_all=False, only_fetch=False, force_overwrite=False):
-    pub_date, collection_id, doc_id = retronews.parse_url(url)
+    try:
+        pub_date, collection_id, doc_id = retronews.parse_url(url)
+    except AttributeError:
+        return False

-    data = retronews.api_doc_info(collection_id, doc_id)
+    data = retronews.doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')
@@ -283,7 +328,7 @@
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
-            return
+            return True
         else:
             os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
             print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
@@ -302,7 +347,8 @@
                                      number=i+1,
                                      collection_id=collection_id,
                                      doc_id=doc_id,
-                                     probe_pages=probe_pages))
+                                     probe_pages=probe_pages,
+                                     probe_all=probe_all))

     for worker in pool:
         worker.start()
@@ -312,10 +358,10 @@
             with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                 f.write(f'error: {worker.get_error()}')
             print(f'ERROR: failed to download {pub_date} magazine')
-            return
+            return False

     if only_fetch:
-        return
+        return True

     # merge tiles
     for page in range(pages):
@@ -338,6 +384,8 @@
     except:
         traceback.print_exc()

+    return True

 if __name__ == '__main__':
     parser = ArgumentParser()
@@ -354,6 +402,8 @@ if __name__ == '__main__':
                         help='only fetch magazine tiles and exit, do not merge anything')
     parser.add_argument('--force-overwrite', action='store_true',
                         help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
     parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                         help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
@@ -371,11 +421,14 @@
     url = args.url
     while True:
         print(f'grabbing {url}...')
-        grab_magazine(url,
+        if not grab_magazine(url,
                       output_root=args.output,
                       probe_pages=args.fetch_probe_pages,
-                      only_fetch=args.only_fetch,
-                      force_overwrite=args.force_overwrite)
+                      probe_all=args.force_probe,
+                      only_fetch=args.only_fetch,
+                      force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break

         if not args.continue_prev and not args.continue_next:
             break
@@ -383,11 +436,16 @@
     r = requests.get(url)
     try:
+        next_url = None
         if args.continue_next:
             next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
         elif args.continue_prev:
             next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]

+        if not next_url:
+            break

         if next_url.startswith('/'):
             next_url = f'https://www.retronews.fr{next_url}'

retronews/__init__.py (new file)

@@ -0,0 +1,8 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url
+)


@@ -9,6 +9,8 @@ MONTHS = dict(
     may=5,
     jun=6,
     jul=7,
+    juillet=7,
+    aout=8,
     aug=8,
     sep=9,
     oct=10,
@@ -27,7 +29,7 @@ def convert_date(s: str) -> tuple[str, str, str]:
 def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()

 def doc_info(collection_id, doc_id):
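The old parse_url pattern only matched a fixed list of Mercure publication slugs; the relaxed pattern accepts any slug made of word characters, digits and hyphens, and still captures the date segment, collection id and document id that grab_magazine unpacks. A quick check against a made-up URL of the same shape:

    import re

    # hypothetical document URL; only the path shape matters here
    url = 'https://www.retronews.fr/journal/le-petit-parisien/18-fevrier-1923/12345/678901/1'
    print(re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups())
    # -> ('18-fevrier-1923', '12345', '678901')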