Evgeny Zinoviev 2023-02-18 20:07:00 +03:00
parent 209c6404eb
commit 3847423443
7 changed files with 114 additions and 40 deletions

.gitignore (new file)

@@ -0,0 +1,5 @@
/.idea
/venv
/*.sqlite3
__pycache__
/test.py


@@ -20,5 +20,5 @@ if __name__ == '__main__':
print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
continue
ratio = width/height
# TODO
# print(f'[]')
if ratio >= 0.8:
print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')

database/__init__.py (new file)

@@ -0,0 +1 @@
from .database import Database

database/database.py

@@ -13,7 +13,7 @@ class Database:
def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__)
file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
self.sqlite = sqlite3.connect(file, check_same_thread=False)
self.lock = threading.Lock()
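
For reference, the Database constructor pairs check_same_thread=False with an explicit threading.Lock so one SQLite connection can be shared across the worker threads. A minimal sketch of that pattern, with a hypothetical query helper that is not part of this commit:

import os
import sqlite3
import threading

class Database:
    def __init__(self):
        # the database file now lives one level above the package directory
        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

    def query(self, sql, params=()):
        # hypothetical helper: serialize all access to the shared connection
        with self.lock:
            cur = self.sqlite.cursor()
            cur.execute(sql, params)
            return cur.fetchall()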


@@ -1,4 +1,8 @@
#!/usr/bin/env python3
import logging
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os
import sys
import json
@@ -13,6 +17,7 @@ import shutil
import queue
import traceback
import retronews
import logging
from database import Database
from typing import Optional
@@ -20,6 +25,8 @@ from threading import Thread, Lock
from time import sleep
from argparse import ArgumentParser
warnings.filterwarnings("ignore", category=DeprecationWarning)
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
_url: str
_save_as: str
_download_result: Optional[bool]
_handle_http: bool
user_info: dict
def __init__(self, url: str, save_as: str, thread_name=None):
def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
super().__init__()
if user_info is None:
user_info = {}
if thread_name:
self.name = thread_name
self._url = url
self._save_as = save_as
self._download_result = None
self._handle_http = handle_http
self.user_info = user_info
def run(self):
self._download_result = download_file(self._url, self._save_as)
try:
self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
except urllib.error.HTTPError:
pass
def is_downloaded(self) -> bool:
return self._download_result is True
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
try:
page = merging_queue.get_nowait()
page_dir = os.path.join(self._working_dir, str(page))
thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
meta_path = os.path.join(page_dir, 'meta.json')
if os.path.exists(thumbnail_path):
@@ -100,12 +116,12 @@
for h in range(htiles):
vfiles = []
for v in range(vtiles):
vfiles.append(f'v{v}_h{h}.jpg')
run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
hfiles.append(f'_v_{h}.jpg')
vfiles.append(f'{h}x{v}.jpg')
run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
hfiles.append(f'{h}.jpg')
run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
shutil.rmtree(page_dir)
# shutil.rmtree(page_dir)
safe_print(f'[tile merger {self._number}] page {page} done')
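
The renamed tiles follow an {h}x{v}.jpg scheme, and the merge itself is two ImageMagick passes: convert -append stacks each column vertically, then convert +append joins the columns into the full page. A self-contained sketch of that step, assuming ImageMagick's convert is on PATH:

import os
from subprocess import run

def merge_page(page_dir: str, htiles: int, vtiles: int, out_file: str) -> None:
    hfiles = []
    for h in range(htiles):
        # stack one column of tiles top-to-bottom: {h}x0.jpg, {h}x1.jpg, ...
        vfiles = [f'{h}x{v}.jpg' for v in range(vtiles)]
        run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir, check=True)
        hfiles.append(f'{h}.jpg')
    # join the column strips left-to-right into the final page image
    run(['convert', '+append', *hfiles, out_file], cwd=page_dir, check=True)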
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
_failed: bool
_error: Optional[str]
_probe_pages: Optional[list[int]]
_probe_all: bool
def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
super().__init__()
self._working_dir = working_dir
self._number = number
@@ -129,6 +146,7 @@
self._failed = False
self._error = None
self._probe_pages = probe_pages
self._probe_all = probe_all
def run(self):
safe_print(f'[pf-{self._number}] started')
@@ -140,7 +158,7 @@
page = pages_queue.get_nowait()
safe_print(f'[pf-{self._number}] page {page} started')
if page in self._probe_pages:
if self._probe_all or page in self._probe_pages:
self.probe_dl(page)
else:
try:
@@ -209,28 +227,52 @@
real_h = 0
real_v = 0
data_error = False
for h in range(5):
for v in range(5):
dl_tasks = []
for h in range(10):
for v in range(10):
url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
output_file = f'{page_dir}/v{v}_h{h}.jpg'
output_file = f'{page_dir}/{h}x{v}.jpg'
if os.path.isfile(output_file):
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
if os.path.getsize(output_file) < 4:
os.unlink(output_file)
continue
try:
if not download_file(url, output_file, handle_http_errors=False):
raise OSError('network failure')
if not imghdr.what(output_file):
data_error = True
break
real_v = v
real_h = h
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
except urllib.error.HTTPError:
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
break
dl_tasks.append(DownloaderThread(url=url,
save_as=os.path.join(page_dir, output_file),
handle_http=True,
thread_name=f'p{page}-v{v}-h{h}',
user_info=dict(h=h, v=v)))
for task in dl_tasks:
task.start()
for task in dl_tasks:
task.join()
if task.is_downloaded():
task_h = task.user_info['h']
task_v = task.user_info['v']
if task_h > real_h:
real_h = task_h
if task_v > real_v:
real_v = task_v
if not imghdr.what(task._save_as):
data_error = True
# try:
# if not download_file(url, output_file, handle_http_errors=False):
# raise OSError('network failure')
# if not imghdr.what(output_file):
# data_error = True
# break
# real_v = v
# real_h = h
# safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
#
# except urllib.error.HTTPError:
# safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
# break
if data_error:
self.thumbnail_dl(page)
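
Probing now fans out one DownloaderThread per candidate tile over a 10x10 grid, carries the tile coordinates in user_info, and takes the largest h and v that actually produced a file as the page's real tile counts, falling back to the thumbnail when imghdr rejects a tile. A reduced sketch of that pattern, reusing the DownloaderThread class and retronews.tile_url from this commit:

import imghdr
import os
import retronews

def probe_page(page_dir, collection_id, doc_id, page, grid=10):
    tasks = []
    for h in range(grid):
        for v in range(grid):
            url = retronews.tile_url(collection_id, doc_id, page, h_tile=h, v_tile=v)
            # DownloaderThread is the class defined earlier in this diff
            tasks.append(DownloaderThread(url=url,
                                          save_as=os.path.join(page_dir, f'{h}x{v}.jpg'),
                                          handle_http=True,  # swallow HTTP errors for tiles that do not exist
                                          thread_name=f'p{page}-v{v}-h{h}',
                                          user_info=dict(h=h, v=v)))
    for task in tasks:
        task.start()
    real_h = real_v = 0
    data_error = False
    for task in tasks:
        task.join()
        if task.is_downloaded():
            # the highest coordinates that downloaded define the real grid size
            real_h = max(real_h, task.user_info['h'])
            real_v = max(real_v, task.user_info['v'])
            if not imghdr.what(task._save_as):
                data_error = True
    return real_h, real_v, data_error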
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
def grab_magazine(url: str,
output_root: str,
probe_pages: Optional[list[int]] = None,
only_fetch=False, force_overwrite=False):
probe_all=False, only_fetch=False, force_overwrite=False):
try:
pub_date, collection_id, doc_id = retronews.parse_url(url)
except AttributeError:
return False
data = retronews.api_doc_info(collection_id, doc_id)
data = retronews.doc_info(collection_id, doc_id)
pages = int(data['nbPages'])
print(f'found {pages} pages')
@@ -283,7 +328,7 @@
if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
if not force_overwrite:
print(f'{y}-{m}-{d}.pdf already exists, not continuing')
return
return True
else:
os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
@@ -302,7 +347,8 @@
number=i+1,
collection_id=collection_id,
doc_id=doc_id,
probe_pages=probe_pages))
probe_pages=probe_pages,
probe_all=probe_all))
for worker in pool:
worker.start()
@@ -312,10 +358,10 @@
with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
f.write(f'error: {worker.get_error()}')
print(f'ERROR: failed to download {pub_date} magazine')
return
return False
if only_fetch:
return
return True
# merge tiles
for page in range(pages):
@@ -338,6 +384,8 @@
except:
traceback.print_exc()
return True
if __name__ == '__main__':
parser = ArgumentParser()
@@ -354,6 +402,8 @@
help='only fetch magazine tiles and exit, do not merge anything')
parser.add_argument('--force-overwrite', action='store_true',
help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
parser.add_argument('--force-probe', action='store_true',
help='force all pages to use the \'probe\' method')
parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
@@ -371,11 +421,14 @@
url = args.url
while True:
print(f'grabbing {url}...')
grab_magazine(url,
if not grab_magazine(url,
output_root=args.output,
probe_pages=args.fetch_probe_pages,
probe_all=args.force_probe,
only_fetch=args.only_fetch,
force_overwrite=args.force_overwrite)
force_overwrite=args.force_overwrite):
logging.error('failed to grab')
break
if not args.continue_prev and not args.continue_next:
break
@@ -383,11 +436,16 @@
r = requests.get(url)
try:
next_url = None
if args.continue_next:
next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
elif args.continue_prev:
next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
if not next_url:
if not next_url:
break
if next_url.startswith('/'):
next_url = f'https://www.retronews.fr{next_url}'
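
Continuation mode finds the adjacent issue by scraping the SUIVANT / PRÉCÉDENT anchors out of the issue page and absolutizing site-relative links. The same logic, condensed into a helper that does not exist in the diff:

import re
import requests
from typing import Optional

def find_adjacent_issue(url: str, forward: bool = True) -> Optional[str]:
    # fetch the issue page and pull the next/previous link out of the HTML
    html = requests.get(url).text
    if forward:
        m = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', html, re.S)
    else:
        m = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', html, re.S)
    if not m:
        return None
    next_url = m.group(1)
    if next_url.startswith('/'):
        # retronews serves site-relative hrefs; prepend the host
        next_url = f'https://www.retronews.fr{next_url}'
    return next_url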

retronews/__init__.py (new file)

@@ -0,0 +1,8 @@
from .retronews import (
convert_date,
parse_url,
doc_info,
page_info,
thumbnail_url,
tile_url
)

retronews/retronews.py

@@ -9,6 +9,8 @@ MONTHS = dict(
may=5,
jun=6,
jul=7,
juillet=7,
aout=8,
aug=8,
sep=9,
oct=10,
@@ -27,7 +29,7 @@ def convert_date(s: str) -> tuple[str, str, str]:
def parse_url(url: str) -> tuple:
return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()
return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
def doc_info(collection_id, doc_id):
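
The relaxed parse_url regex accepts any publication slug instead of the previous hard-coded list, returning the date slug, collection id and doc id from the path. A small illustration; the URL below is invented and only its shape matters:

import re

def parse_url(url: str) -> tuple:
    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()

# hypothetical issue URL of the expected shape
pub_date, collection_id, doc_id = parse_url('https://www.retronews.fr/some-publication/15-aout-1788/12345/67890/')
# pub_date == '15-aout-1788', collection_id == '12345', doc_id == '67890'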