wip

commit 3847423443
parent 209c6404eb

.gitignore (vendored, new file, +5)
@@ -0,0 +1,5 @@
+/.idea
+/venv
+/*.sqlite3
+__pycache__
+/test.py

@@ -20,5 +20,5 @@ if __name__ == '__main__':
             print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
             continue
         ratio = width/height
-        # TODO
-        # print(f'[]')
+        if ratio >= 0.8:
+            print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')

database/__init__.py (new file, +1)

@@ -0,0 +1 @@
+from .database import Database

@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)

-        file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()

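Note: the relocated database file now resolves one level above the database package. A minimal illustration of how the '..' component behaves (the /project path is made up; only the file name comes from the diff):

    import os

    pkg_file = '/project/database/database.py'   # hypothetical location of this module
    db_path = os.path.normpath(os.path.join(os.path.dirname(pkg_file), '..', 'mdf-retrobase.sqlite3'))
    print(db_path)  # -> /project/mdf-retrobase.sqlite3
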
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 import os
 import sys
 import json

@@ -13,6 +17,7 @@ import shutil
 import queue
 import traceback
 import retronews
+import logging

 from database import Database
 from typing import Optional

@@ -20,6 +25,8 @@ from threading import Thread, Lock
 from time import sleep
 from argparse import ArgumentParser

+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8

@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
     _url: str
     _save_as: str
     _download_result: Optional[bool]
+    _handle_http: bool
+    user_info: dict

-    def __init__(self, url: str, save_as: str, thread_name=None):
+    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
         super().__init__()
+        if user_info is None:
+            user_info = {}
         if thread_name:
             self.name = thread_name

         self._url = url
         self._save_as = save_as
         self._download_result = None
+        self._handle_http = handle_http
+        self.user_info = user_info

     def run(self):
-        self._download_result = download_file(self._url, self._save_as)
+        try:
+            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
+        except urllib.error.HTTPError:
+            pass

     def is_downloaded(self) -> bool:
         return self._download_result is True

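The reworked run() relies on download_file() raising urllib.error.HTTPError when handle_http_errors is False. The helper itself is not shown in this diff; a rough sketch of the contract it is assumed to follow (urllib-based, illustrative only):

    import urllib.request
    import urllib.error

    def download_file(url: str, output: str, handle_http_errors=True) -> bool:
        # Sketch: fetch url into output; swallow HTTP errors and report them
        # through the return value, unless the caller asked to handle them itself.
        try:
            urllib.request.urlretrieve(url, output)
            return True
        except urllib.error.HTTPError:
            if handle_http_errors:
                return False
            raise
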
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
             try:
                 page = merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
-                thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
+                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')

                 if os.path.exists(thumbnail_path):

@@ -100,12 +116,12 @@
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'v{v}_h{h}.jpg')
-                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
-                    hfiles.append(f'_v_{h}.jpg')
+                        vfiles.append(f'{h}x{v}.jpg')
+                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
+                    hfiles.append(f'{h}.jpg')

                 run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
-                shutil.rmtree(page_dir)
+                # shutil.rmtree(page_dir)

                 safe_print(f'[tile merger {self._number}] page {page} done')

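With the new {h}x{v}.jpg naming, each column of tiles is stacked top-to-bottom with ImageMagick's convert -append, and the resulting column strips are then glued left-to-right with convert +append. A standalone sketch of the same idea for a 2-column by 3-row grid (directory and file names are illustrative):

    from subprocess import run

    htiles, vtiles = 2, 3
    hfiles = []
    for h in range(htiles):
        vfiles = [f'{h}x{v}.jpg' for v in range(vtiles)]                          # tiles of one column
        run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd='page', check=True)  # stack vertically
        hfiles.append(f'{h}.jpg')
    run(['convert', '+append', *hfiles, 'page.jpg'], cwd='page', check=True)      # join columns horizontally
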
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
     _failed: bool
     _error: Optional[str]
     _probe_pages: Optional[list[int]]
+    _probe_all: bool

-    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
         super().__init__()
         self._working_dir = working_dir
         self._number = number

@@ -129,6 +146,7 @@ class PageFetchWorker(Thread):
         self._failed = False
         self._error = None
         self._probe_pages = probe_pages
+        self._probe_all = probe_all

     def run(self):
         safe_print(f'[pf-{self._number}] started')

@@ -140,7 +158,7 @@ class PageFetchWorker(Thread):
                 page = pages_queue.get_nowait()
                 safe_print(f'[pf-{self._number}] page {page} started')

-                if page in self._probe_pages:
+                if self._probe_all or page in self._probe_pages:
                     self.probe_dl(page)
                 else:
                     try:

@@ -209,28 +227,52 @@ class PageFetchWorker(Thread):
         real_h = 0
         real_v = 0
         data_error = False
-        for h in range(5):
-            for v in range(5):
+        dl_tasks = []
+        for h in range(10):
+            for v in range(10):
                 url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
-                output_file = f'{page_dir}/v{v}_h{h}.jpg'
+                output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                     if os.path.getsize(output_file) < 4:
                         os.unlink(output_file)
                     continue
-                try:
-                    if not download_file(url, output_file, handle_http_errors=False):
-                        raise OSError('network failure')
-                    if not imghdr.what(output_file):
-                        data_error = True
-                        break
-                    real_v = v
-                    real_h = h
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')

-                except urllib.error.HTTPError:
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
-                    break
+                dl_tasks.append(DownloaderThread(url=url,
+                                                 save_as=os.path.join(page_dir, output_file),
+                                                 handle_http=True,
+                                                 thread_name=f'p{page}-v{v}-h{h}',
+                                                 user_info=dict(h=h, v=v)))
+
+        for task in dl_tasks:
+            task.start()
+        for task in dl_tasks:
+            task.join()
+
+            if task.is_downloaded():
+                task_h = task.user_info['h']
+                task_v = task.user_info['v']
+                if task_h > real_h:
+                    real_h = task_h
+                if task_v > real_v:
+                    real_v = task_v
+
+            if not imghdr.what(task._save_as):
+                data_error = True
+
+        # try:
+        #     if not download_file(url, output_file, handle_http_errors=False):
+        #         raise OSError('network failure')
+        #     if not imghdr.what(output_file):
+        #         data_error = True
+        #         break
+        #     real_v = v
+        #     real_h = h
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
+        #
+        # except urllib.error.HTTPError:
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
+        #     break

         if data_error:
             self.thumbnail_dl(page)

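For reference, the new probing strategy is: queue a DownloaderThread for every candidate tile of a 10x10 grid, run them all, and take the largest h and v indices that actually produced a file as the real grid size. The same idea in a compressed, self-contained sketch (probe_grid and fetch are made-up names; the script itself uses DownloaderThread and download_file):

    import threading

    def probe_grid(fetch, max_h=10, max_v=10):
        # fetch(h, v) -> True if the tile at column h, row v exists
        results = {}

        def worker(h, v):
            results[(h, v)] = fetch(h, v)

        threads = [threading.Thread(target=worker, args=(h, v))
                   for h in range(max_h) for v in range(max_v)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        found = [hv for hv, ok in results.items() if ok]
        real_h = max((h for h, _ in found), default=0)
        real_v = max((v for _, v in found), default=0)
        return real_h, real_v
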
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
-                  only_fetch=False, force_overwrite=False):
+                  probe_all=False, only_fetch=False, force_overwrite=False):
-    pub_date, collection_id, doc_id = retronews.parse_url(url)
+    try:
+        pub_date, collection_id, doc_id = retronews.parse_url(url)
+    except AttributeError:
+        return False

-    data = retronews.api_doc_info(collection_id, doc_id)
+    data = retronews.doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')

@@ -283,7 +328,7 @@ def grab_magazine(url: str,
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
-            return
+            return True
         else:
             os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
             print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')

@@ -302,7 +347,8 @@ def grab_magazine(url: str,
                                      number=i+1,
                                      collection_id=collection_id,
                                      doc_id=doc_id,
-                                     probe_pages=probe_pages))
+                                     probe_pages=probe_pages,
+                                     probe_all=probe_all))
     for worker in pool:
         worker.start()

@@ -312,10 +358,10 @@ def grab_magazine(url: str,
             with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                 f.write(f'error: {worker.get_error()}')
             print(f'ERROR: failed to download {pub_date} magazine')
-            return
+            return False

     if only_fetch:
-        return
+        return True

     # merge tiles
     for page in range(pages):

@@ -338,6 +384,8 @@ def grab_magazine(url: str,
         except:
             traceback.print_exc()

+    return True
+

 if __name__ == '__main__':
     parser = ArgumentParser()

@@ -354,6 +402,8 @@ if __name__ == '__main__':
                         help='only fetch magazine tiles and exit, do not merge anything')
     parser.add_argument('--force-overwrite', action='store_true',
                         help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
     parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                         help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

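A hypothetical invocation combining the new switch with the existing ones might look like: python3 grab.py --force-probe --force-overwrite --output ./issues <document url>. The script name and the --output option are assumptions for illustration; only --force-probe, --force-overwrite and --fetch-probe-pages are shown in this view.
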
@@ -371,11 +421,14 @@ if __name__ == '__main__':
     url = args.url
     while True:
         print(f'grabbing {url}...')
-        grab_magazine(url,
+        if not grab_magazine(url,
                       output_root=args.output,
                       probe_pages=args.fetch_probe_pages,
-                      only_fetch=args.only_fetch,
-                      force_overwrite=args.force_overwrite)
+                      probe_all=args.force_probe,
+                      only_fetch=args.only_fetch,
+                      force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break

         if not args.continue_prev and not args.continue_next:
             break

@@ -383,11 +436,16 @@ if __name__ == '__main__':
         r = requests.get(url)

         try:
+            next_url = None
             if args.continue_next:
                 next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
             elif args.continue_prev:
                 next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
+            if not next_url:
+                break

             if next_url.startswith('/'):
                 next_url = f'https://www.retronews.fr{next_url}'

retronews/__init__.py (new file, +8)

@@ -0,0 +1,8 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url
+)

@@ -9,6 +9,8 @@ MONTHS = dict(
     may=5,
     jun=6,
     jul=7,
+    juillet=7,
+    aout=8,
     aug=8,
     sep=9,
     oct=10,

@@ -27,7 +29,7 @@ def convert_date(s: str) -> tuple[str, str, str]:


 def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


 def doc_info(collection_id, doc_id):
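
The loosened parse_url() pattern accepts any journal slug rather than the handful of hard-coded Mercure titles. A quick check of what it captures, using a made-up URL of the shape the script expects (date slug, collection id, document id):

    import re

    url = 'https://www.retronews.fr/some-journal/15-mai-1913/123/456789/1'
    m = re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url)
    print(m.groups())  # ('15-mai-1913', '123', '456789')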