wip

commit 3847423443
parent 209c6404eb
.gitignore (vendored, new file, +5)
@@ -0,0 +1,5 @@
+/.idea
+/venv
+/*.sqlite3
+__pycache__
+/test.py
@@ -20,5 +20,5 @@ if __name__ == '__main__':
             print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
             continue
         ratio = width/height
-        # TODO
-        # print(f'[]')
+        if ratio >= 0.8:
+            print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')
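Note: the replaced TODO reports pages whose width-to-height ratio is at least 0.8, i.e. scans that are roughly square or wider; a 1000x1400 px page (ratio ~0.71) stays silent, while a 1200x1400 px page (ratio ~0.86) is printed. A minimal standalone sketch of the same filter, assuming the `doc` dict and a per-page (width, height) mapping shaped like the surrounding lines imply:

    def report_wide_pages(doc: dict, sizes: dict, threshold: float = 0.8):
        # sizes maps page number -> (width, height) in pixels (assumed layout)
        for page, (width, height) in sizes.items():
            if not width or not height:
                print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
                continue
            ratio = width / height
            if ratio >= threshold:
                print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')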
database/__init__.py (new file, +1)
@@ -0,0 +1 @@
+from .database import Database
@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)

-        file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()
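Note: since `database.py` now sits inside the `database/` package, the SQLite file is resolved one directory up. The connection is created with `check_same_thread=False`, so the instance-wide `threading.Lock` is what serializes access from the worker threads. A minimal sketch of how a method on this class would presumably use the lock (the `add_page` name and the table schema are assumptions, not part of this diff):

    def add_page(self, collection_id, doc_id, page, path):
        # Every statement runs under the lock because the single sqlite3
        # connection is shared across threads (check_same_thread=False).
        with self.lock:
            cur = self.sqlite.cursor()
            cur.execute('INSERT INTO pages (collection_id, doc_id, page, path) VALUES (?, ?, ?, ?)',
                        (collection_id, doc_id, page, path))
            self.sqlite.commit()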
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 import os
 import sys
 import json
@@ -13,6 +17,7 @@ import shutil
 import queue
 import traceback
 import retronews
 import logging

+from database import Database
 from typing import Optional
@@ -20,6 +25,8 @@ from threading import Thread, Lock
 from time import sleep
 from argparse import ArgumentParser

+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8
@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
     _url: str
     _save_as: str
     _download_result: Optional[bool]
+    _handle_http: bool
+    user_info: dict

-    def __init__(self, url: str, save_as: str, thread_name=None):
+    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
         super().__init__()
+        if user_info is None:
+            user_info = {}
         if thread_name:
             self.name = thread_name

         self._url = url
         self._save_as = save_as
         self._download_result = None
+        self._handle_http = handle_http
+        self.user_info = user_info

     def run(self):
-        self._download_result = download_file(self._url, self._save_as)
+        try:
+            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
+        except urllib.error.HTTPError:
+            pass

     def is_downloaded(self) -> bool:
         return self._download_result is True
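Note: `DownloaderThread` can now swallow HTTP errors itself (`handle_http=True`) and carries an arbitrary `user_info` dict so the caller can tell finished threads apart. A rough usage sketch under those assumptions (URLs and paths below are placeholders):

    tasks = [DownloaderThread(url=f'https://example.org/tile_{i}.jpg',
                              save_as=f'/tmp/tile_{i}.jpg',
                              handle_http=True,
                              thread_name=f'tile-{i}',
                              user_info=dict(index=i))
             for i in range(4)]
    for t in tasks:
        t.start()
    for t in tasks:
        t.join()
        if t.is_downloaded():
            print('got tile', t.user_info['index'])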
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
             try:
                 page = merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
-                thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
+                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')

                 if os.path.exists(thumbnail_path):
@@ -100,12 +116,12 @@
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'v{v}_h{h}.jpg')
-                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
-                    hfiles.append(f'_v_{h}.jpg')
+                        vfiles.append(f'{h}x{v}.jpg')
+                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
+                    hfiles.append(f'{h}.jpg')

                 run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
-                shutil.rmtree(page_dir)
+                # shutil.rmtree(page_dir)

                 safe_print(f'[tile merger {self._number}] page {page} done')

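Note: tiles are now named `{h}x{v}.jpg` (column x row). Each column is first stacked top-to-bottom with ImageMagick's `convert -append`, then the column strips are joined left-to-right with `+append` into the final page image. A minimal sketch of the same two-step merge outside the worker, assuming `convert` is on PATH and the tile counts are known:

    import os
    from subprocess import run

    def merge_page(page_dir: str, out_path: str, htiles: int, vtiles: int) -> None:
        # Stack each column vertically (-append), then join columns horizontally (+append).
        hfiles = []
        for h in range(htiles):
            column = [f'{h}x{v}.jpg' for v in range(vtiles)]
            run(['convert', '-append', *column, f'{h}.jpg'], cwd=page_dir, check=True)
            hfiles.append(f'{h}.jpg')
        run(['convert', '+append', *hfiles, out_path], cwd=page_dir, check=True)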
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
     _failed: bool
     _error: Optional[str]
     _probe_pages: Optional[list[int]]
+    _probe_all: bool

-    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
         super().__init__()
         self._working_dir = working_dir
         self._number = number
@@ -129,6 +146,7 @@
         self._failed = False
         self._error = None
         self._probe_pages = probe_pages
+        self._probe_all = probe_all

     def run(self):
         safe_print(f'[pf-{self._number}] started')
@@ -140,7 +158,7 @@
                 page = pages_queue.get_nowait()
                 safe_print(f'[pf-{self._number}] page {page} started')

-                if page in self._probe_pages:
+                if self._probe_all or page in self._probe_pages:
                     self.probe_dl(page)
                 else:
                     try:
@@ -209,28 +227,52 @@ class PageFetchWorker(Thread):
         real_h = 0
         real_v = 0
         data_error = False
-        for h in range(5):
-            for v in range(5):
+        dl_tasks = []
+        for h in range(10):
+            for v in range(10):
                 url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
-                output_file = f'{page_dir}/v{v}_h{h}.jpg'
+                output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
+                    if os.path.getsize(output_file) < 4:
+                        os.unlink(output_file)
                     continue
-                try:
-                    if not download_file(url, output_file, handle_http_errors=False):
-                        raise OSError('network failure')
-                    if not imghdr.what(output_file):
-                        data_error = True
-                        break
-                    real_v = v
-                    real_h = h
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
-
-                except urllib.error.HTTPError:
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
-                    break
+                dl_tasks.append(DownloaderThread(url=url,
+                                                 save_as=os.path.join(page_dir, output_file),
+                                                 handle_http=True,
+                                                 thread_name=f'p{page}-v{v}-h{h}',
+                                                 user_info=dict(h=h, v=v)))
+
+        for task in dl_tasks:
+            task.start()
+        for task in dl_tasks:
+            task.join()
+
+            if task.is_downloaded():
+                task_h = task.user_info['h']
+                task_v = task.user_info['v']
+                if task_h > real_h:
+                    real_h = task_h
+                if task_v > real_v:
+                    real_v = task_v
+
+            if not imghdr.what(task._save_as):
+                data_error = True
+
+        # try:
+        #     if not download_file(url, output_file, handle_http_errors=False):
+        #         raise OSError('network failure')
+        #     if not imghdr.what(output_file):
+        #         data_error = True
+        #         break
+        #     real_v = v
+        #     real_h = h
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
+        #
+        # except urllib.error.HTTPError:
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
+        #     break

         if data_error:
             self.thumbnail_dl(page)
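Note: the probe path no longer walks the tile grid sequentially; it now starts one `DownloaderThread` per candidate (h, v) position (up to 10x10), lets requests for nonexistent tiles fail quietly (`handle_http=True`), and derives the real grid size from whichever downloads succeeded via the `user_info` tags. This trades many parallel requests for much lower per-page latency. A compact sketch of the idea, assuming the script's existing `retronews`, `os` and `DownloaderThread` names:

    def probe_grid(collection_id, doc_id, page, page_dir, max_tiles=10):
        # The largest successful (h, v) indices give the tile grid size.
        tasks = []
        for h in range(max_tiles):
            for v in range(max_tiles):
                url = retronews.tile_url(collection_id, doc_id, page, h_tile=h, v_tile=v)
                tasks.append(DownloaderThread(url=url,
                                              save_as=os.path.join(page_dir, f'{h}x{v}.jpg'),
                                              handle_http=True,
                                              user_info=dict(h=h, v=v)))
        for t in tasks:
            t.start()
        real_h = real_v = 0
        for t in tasks:
            t.join()
            if t.is_downloaded():
                real_h = max(real_h, t.user_info['h'])
                real_v = max(real_v, t.user_info['v'])
        return real_h + 1, real_v + 1  # tile counts, not indices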
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
-                  only_fetch=False, force_overwrite=False):
+                  probe_all=False, only_fetch=False, force_overwrite=False):
+    try:
         pub_date, collection_id, doc_id = retronews.parse_url(url)
+    except AttributeError:
+        return False

-    data = retronews.api_doc_info(collection_id, doc_id)
+    data = retronews.doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')

@@ -283,7 +328,7 @@ def grab_magazine(url: str,
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
-            return
+            return True
         else:
             os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
             print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
@@ -302,7 +347,8 @@
                                     number=i+1,
                                     collection_id=collection_id,
                                     doc_id=doc_id,
-                                    probe_pages=probe_pages))
+                                    probe_pages=probe_pages,
+                                    probe_all=probe_all))
     for worker in pool:
         worker.start()

@@ -312,10 +358,10 @@
             with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                 f.write(f'error: {worker.get_error()}')
             print(f'ERROR: failed to download {pub_date} magazine')
-            return
+            return False

     if only_fetch:
-        return
+        return True

     # merge tiles
     for page in range(pages):
@@ -338,6 +384,8 @@
     except:
         traceback.print_exc()

+    return True
+

 if __name__ == '__main__':
     parser = ArgumentParser()
@@ -354,6 +402,8 @@ if __name__ == '__main__':
                         help='only fetch magazine tiles and exit, do not merge anything')
     parser.add_argument('--force-overwrite', action='store_true',
                         help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
     parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                         help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

@@ -371,11 +421,14 @@
     url = args.url
     while True:
         print(f'grabbing {url}...')
-        grab_magazine(url,
+        if not grab_magazine(url,
                      output_root=args.output,
                      probe_pages=args.fetch_probe_pages,
+                     probe_all=args.force_probe,
                      only_fetch=args.only_fetch,
-                     force_overwrite=args.force_overwrite)
+                     force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break

         if not args.continue_prev and not args.continue_next:
             break
@@ -383,11 +436,16 @@
         r = requests.get(url)

         try:
-            next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+            next_url = None
+            if args.continue_next:
+                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+            elif args.continue_prev:
+                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
             if not next_url:
                 break

             if next_url.startswith('/'):
                 next_url = f'https://www.retronews.fr{next_url}'

retronews/__init__.py (new file, +8)
@@ -0,0 +1,8 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url
+)
@@ -9,6 +9,8 @@ MONTHS = dict(
     may=5,
     jun=6,
     jul=7,
+    juillet=7,
+    aout=8,
     aug=8,
     sep=9,
     oct=10,
@@ -27,7 +29,7 @@ def convert_date(s: str) -> tuple[str, str, str]:


 def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


 def doc_info(collection_id, doc_id):
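Note: `parse_url` no longer hard-codes the handful of Mercure titles; any journal slug made of word characters and dashes is accepted in the first path segment. A quick example of what the relaxed pattern extracts (the URL below is made up to match the pattern, not taken from the site):

    import re

    url = 'https://www.retronews.fr/some-journal-slug/15-aout-1778/123/4567/'
    m = re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url)
    print(m.groups())  # ('15-aout-1778', '123', '4567')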