upd
This commit is contained in:
parent 3847423443
commit abd1975def
dl-from-db.py (new Executable file, 45 lines)

@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import logging
+
+from mdf import Database, retronews
+from argparse import ArgumentParser
+
+database = Database()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--output', type=str, required=True,
+                        help='output directory')
+    parser.add_argument('--from-date', type=str)
+    parser.add_argument('--to-date', type=str)
+    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--only-fetch', action='store_true',
+                        help='only fetch magazine tiles and exit, do not merge anything')
+    parser.add_argument('--force-overwrite', action='store_true',
+                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
+
+    args = parser.parse_args()
+
+    retronews.set_tile_merging_pool_size(args.merge_threads)
+    retronews.set_page_fetching_pool_size(args.fetch_threads)
+
+    for doc in database.get_documents((args.from_date, args.to_date)):
+        url = doc['url']
+        print(f'grabbing {url}...')
+        if not retronews.grab_magazine(url,
+                                       output_root=args.output,
+                                       probe_pages=args.fetch_probe_pages,
+                                       probe_all=args.force_probe,
+                                       only_fetch=args.only_fetch,
+                                       force_overwrite=args.force_overwrite):
+            logging.error(f'failed to grab {url}')
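
A minimal sketch (not part of the commit) of driving the same workflow from Python rather than through this wrapper script; Database, get_documents() and grab_magazine() come from the diffs in this commit, while the date range and output directory are made-up values:

    from mdf import Database, retronews

    db = Database()
    for doc in db.get_documents(('1905-01-01', '1905-12-31')):
        if not retronews.grab_magazine(doc['url'], output_root='./out'):
            print(f"failed to grab {doc['url']}")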
dl-retronews.py (new Executable file, 80 lines)

@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import re
+import requests
+import logging
+
+from mdf import retronews
+from argparse import ArgumentParser
+
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--url', type=str, required=True)
+    parser.add_argument('--output', type=str, required=True,
+                        help='output directory')
+    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--continue-prev', action='store_true',
+                        help='keep scrapping backwards in time')
+    parser.add_argument('--continue-next', action='store_true',
+                        help='keep scrapping forwards in time')
+    parser.add_argument('--only-fetch', action='store_true',
+                        help='only fetch magazine tiles and exit, do not merge anything')
+    parser.add_argument('--force-overwrite', action='store_true',
+                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
+
+    args = parser.parse_args()
+
+    with_continuation = args.continue_prev or args.continue_next
+    if args.fetch_probe_pages and with_continuation:
+        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
+    if args.only_fetch and with_continuation:
+        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
+
+    TILE_MERGING_POOL_SIZE = args.merge_threads
+    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
+
+    url = args.url
+    while True:
+        print(f'grabbing {url}...')
+        if not retronews.grab_magazine(url,
+                                       output_root=args.output,
+                                       probe_pages=args.fetch_probe_pages,
+                                       probe_all=args.force_probe,
+                                       only_fetch=args.only_fetch,
+                                       force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break
+
+        if not args.continue_prev and not args.continue_next:
+            break
+
+        r = requests.get(url)
+
+        try:
+            next_url = None
+            if args.continue_next:
+                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+            elif args.continue_prev:
+                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
+            if not next_url:
+                break
+
+            if next_url.startswith('/'):
+                next_url = f'https://www.retronews.fr{next_url}'
+
+            url = next_url
+
+        except:
+            print('error: failed to find previous link! exiting')
+            break
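
A short sketch (not part of the commit) of what the SUIVANT regex above extracts; the HTML fragment is invented but shaped like the retronews.fr navigation link the script scrapes:

    import re

    html = '<a class="float-right pt-4 text-secondary" href="/journal/mercure-de-france/01-decembre-1905/118/2617648/1">SUIVANT</a>'
    m = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', html, re.S)
    print(m.groups()[0])  # -> /journal/mercure-de-france/01-decembre-1905/118/2617648/1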
mdf/__init__.py (new Normal file, 3 lines)

@@ -0,0 +1,3 @@
+from .retronews import retronews
+from .util import util
+from .database import Database
@@ -1,7 +1,7 @@
 import sqlite3
 import logging
 import os.path
-import retronews
+from ..retronews import retronews
 import threading
 
 from typing import Optional
@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)
 
-        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()
 
@@ -125,10 +125,12 @@ class Database:
         sql = "SELECT issue_date, url, pages FROM mdf_links"
         if range:
             sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
+        sql += " ORDER BY issue_date"
         cur.execute(sql)
         for issue_date, url, pages in cur.fetchall():
            pub_date, collection_id, doc_id = retronews.parse_url(url)
            docs.append(dict(
+               url=url,
                collection_id=collection_id,
                doc_id=doc_id,
                pages=pages
mdf/retronews/__init__.py (new Normal file, 15 lines)

@@ -0,0 +1,15 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    _doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url,
+    HTILES,
+    VTILES,
+    PAGE_FETCHING_POOL_SIZE,
+    TILE_MERGING_POOL_SIZE,
+    set_tile_merging_pool_size,
+    set_page_fetching_pool_size,
+    grab_magazine
+)
grab-retronews.py → mdf/retronews/retronews.py (Executable file → Normal file, 214 lines)

@@ -1,53 +1,75 @@
-#!/usr/bin/env python3
-import logging
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-import os
-import sys
-import json
 import re
-import imghdr
 import requests
-import urllib.request
-import urllib.error
-import http.client
-import subprocess
-import shutil
+import imghdr
+import json
+import os
 import queue
+import shutil
 import traceback
-import retronews
-import logging
 
-from database import Database
+from ..util.util import safe_print, download_file, run
 from typing import Optional
-from threading import Thread, Lock
-from time import sleep
-from argparse import ArgumentParser
+from threading import Thread
+import urllib.error
 
-warnings.filterwarnings("ignore", category=DeprecationWarning)
+_pages_queue = queue.Queue()
+_merging_queue = queue.Queue()
 
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8
 PAGE_FETCHING_POOL_SIZE = 8
 
-database = Database()
-print_lock = Lock()
-
-pages_queue = queue.Queue()
-merging_queue = queue.Queue()
+MONTHS = dict(
+    jan=1,
+    feb=2,
+    mar=3,
+    apr=4,
+    may=5,
+    jun=6,
+    jul=7,
+    juillet=7,
+    aout=8,
+    aug=8,
+    sep=9,
+    oct=10,
+    nov=11,
+    novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
+    dec=12
+)
 
 
-def safe_print(*args, **kwargs):
-    with print_lock:
-        print(*args, **kwargs)
+def convert_date(s: str) -> tuple[str, str, str]:
+    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
+    year = m[2]
+    month = '%02d' % MONTHS[m[1]]
+    day = m[0]
+    return year, month, day
+
+
+def parse_url(url: str) -> tuple:
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
+
+
+def _doc_info(collection_id, doc_id):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
+    return r.json()
+
+
+def page_info(collection_id, doc_id, page):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
+    return r.json()
+
+
+def thumbnail_url(collection_id, doc_id, page) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
+
+
+def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
 
 
-def run(args: list, **kwargs):
-    p = subprocess.run(args, **kwargs)
-    if p.returncode != 0:
-        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
-
-
 class DownloaderThread(Thread):
@@ -92,9 +114,9 @@ class TileMergeWorker(Thread):
     def run(self):
         safe_print(f'[tile merger {self._number}] started')
 
-        while not merging_queue.empty():
+        while not _merging_queue.empty():
             try:
-                page = merging_queue.get_nowait()
+                page = _merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
                 thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')
@@ -116,7 +138,7 @@ class TileMergeWorker(Thread):
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'{h}x{v}.jpg')
+                        vfiles.append(f'v{v}_h{h}.jpg')
                     run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                     hfiles.append(f'{h}.jpg')
 
@@ -153,12 +175,12 @@ class PageFetchWorker(Thread):
         page = 0
 
         try:
-            while not pages_queue.empty():
+            while not _pages_queue.empty():
                 try:
-                    page = pages_queue.get_nowait()
+                    page = _pages_queue.get_nowait()
                     safe_print(f'[pf-{self._number}] page {page} started')
 
-                    if self._probe_all or page in self._probe_pages:
+                    if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
                         self.probe_dl(page)
                     else:
                         try:
@@ -172,7 +194,7 @@ class PageFetchWorker(Thread):
 
         except Exception as e:
             self._failed = True
-            self._error = f'while fetching page {page}: {str(e)}'
+            self._error = f'while fetching page {page}: {str(e)}' + traceback.format_exc()
 
     def _get_page_dir(self, page):
         page_dir = os.path.join(self._working_dir, str(page))
@@ -191,7 +213,7 @@ class PageFetchWorker(Thread):
         dl_tasks = []
         for horiz_tile in range(HTILES):
             for vert_tile in range(VTILES):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                 output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
                 if os.path.isfile(output_file):
                     if os.path.getsize(output_file) < 4:
@@ -230,7 +252,7 @@ class PageFetchWorker(Thread):
         dl_tasks = []
         for h in range(10):
             for v in range(10):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                 output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
@@ -283,48 +305,26 @@ class PageFetchWorker(Thread):
 
     def thumbnail_dl(self, page):
         page_dir = self._get_page_dir(page)
-        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
-        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
-            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+        thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
+        if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
+            raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
         safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
 
 
-def download_file(url, output, handle_http_errors=True) -> bool:
-    tries_left = 3
-    ok = False
-    while tries_left > 0:
-        try:
-            urllib.request.urlretrieve(url, output)
-            ok = True
-            break
-        except http.client.RemoteDisconnected:
-            ok = False
-            print(' caught an exception, sleeping for 2 seconds and retrying...')
-            sleep(2)
-            tries_left -= 1
-        except urllib.error.HTTPError as e:
-            if not handle_http_errors:
-                raise e
-            else:
-                print(f' failed to download {url}: {str(e)}')
-                return False
-    return ok
-
-
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
                   probe_all=False, only_fetch=False, force_overwrite=False):
     try:
-        pub_date, collection_id, doc_id = retronews.parse_url(url)
+        pub_date, collection_id, doc_id = parse_url(url)
     except AttributeError:
         return False
 
-    data = retronews.doc_info(collection_id, doc_id)
+    data = _doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')
 
-    y, m, d = retronews.convert_date(pub_date)
+    y, m, d = convert_date(pub_date)
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
@@ -339,7 +339,7 @@ def grab_magazine(url: str,
 
     # fetch pages
     for page in range(pages):
-        pages_queue.put(page+1)
+        _pages_queue.put(page+1)
 
     pool = []
     for i in range(PAGE_FETCHING_POOL_SIZE):
@@ -366,7 +366,7 @@ def grab_magazine(url: str,
     # merge tiles
     for page in range(pages):
         page += 1
-        merging_queue.put(page)
+        _merging_queue.put(page)
 
     pool = []
     for i in range(TILE_MERGING_POOL_SIZE):
@@ -387,71 +387,11 @@ def grab_magazine(url: str,
     return True
 
 
-if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--url', type=str, required=True)
-    parser.add_argument('--output', type=str, required=True,
-                        help='output directory')
-    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
-    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
-    parser.add_argument('--continue-prev', action='store_true',
-                        help='keep scrapping backwards in time')
-    parser.add_argument('--continue-next', action='store_true',
-                        help='keep scrapping forwards in time')
-    parser.add_argument('--only-fetch', action='store_true',
-                        help='only fetch magazine tiles and exit, do not merge anything')
-    parser.add_argument('--force-overwrite', action='store_true',
-                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
-    parser.add_argument('--force-probe', action='store_true',
-                        help='force all pages to use the \'probe\' method')
-    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
-                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
-
-    args = parser.parse_args()
-
-    with_continuation = args.continue_prev or args.continue_next
-    if args.fetch_probe_pages and with_continuation:
-        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
-    if args.only_fetch and with_continuation:
-        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
-
-    TILE_MERGING_POOL_SIZE = args.merge_threads
-    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
-
-    url = args.url
-    while True:
-        print(f'grabbing {url}...')
-        if not grab_magazine(url,
-                             output_root=args.output,
-                             probe_pages=args.fetch_probe_pages,
-                             probe_all=args.force_probe,
-                             only_fetch=args.only_fetch,
-                             force_overwrite=args.force_overwrite):
-            logging.error('failed to grab')
-            break
-
-        if not args.continue_prev and not args.continue_next:
-            break
-
-        r = requests.get(url)
-
-        try:
-            next_url = None
-            if args.continue_next:
-                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
-            elif args.continue_prev:
-                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
-
-            if not next_url:
-                break
-
-            if next_url.startswith('/'):
-                next_url = f'https://www.retronews.fr{next_url}'
-
-            url = next_url
-
-        except:
-            print('error: failed to find previous link! exiting')
-            break
+def set_tile_merging_pool_size(size):
+    global TILE_MERGING_POOL_SIZE
+    TILE_MERGING_POOL_SIZE = size
+
+
+def set_page_fetching_pool_size(size):
+    global PAGE_FETCHING_POOL_SIZE
+    PAGE_FETCHING_POOL_SIZE = size
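
For orientation, a sketch (not part of the commit) that exercises the relocated helpers on the URL already quoted in the MONTHS comment above; the import path assumes the new mdf package layout introduced by this commit:

    from mdf import retronews

    url = 'https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1'
    pub_date, collection_id, doc_id = retronews.parse_url(url)   # ('15-novembre-1905', '118', '2617647')
    print(retronews.convert_date(pub_date))                      # ('1905', '11', '15')
    print(retronews.tile_url(collection_id, doc_id, 1, v_tile=0, h_tile=0))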
mdf/util/__init__.py (new Normal file, 0 lines)

mdf/util/util.py (new Normal file, 44 lines)

@@ -0,0 +1,44 @@
+import subprocess
+import urllib.request
+import urllib.error
+
+from time import sleep
+from threading import Lock
+import http.client
+
+
+_print_lock = Lock()
+
+
+def safe_print(*args, **kwargs):
+    with _print_lock:
+        print(*args, **kwargs)
+
+
+def run(args: list, **kwargs):
+    p = subprocess.run(args, **kwargs)
+    if p.returncode != 0:
+        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
+
+
+def download_file(url, output, handle_http_errors=True) -> bool:
+    tries_left = 3
+    ok = False
+    while tries_left > 0:
+        try:
+            urllib.request.urlretrieve(url, output)
+            ok = True
+            break
+        except http.client.RemoteDisconnected:
+            ok = False
+            print(' caught an exception, sleeping for 2 seconds and retrying...')
+            sleep(2)
+            tries_left -= 1
+        except urllib.error.HTTPError as e:
+            if not handle_http_errors:
+                raise e
+            else:
+                print(f' failed to download {url}: {str(e)}')
+                return False
+    return ok
+
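
A brief sketch (not part of the commit) of the helpers above; the URL and output path are placeholders:

    from mdf.util.util import download_file, safe_print

    if download_file('https://example.com/tile.jpg', '/tmp/tile.jpg'):
        safe_print('saved /tmp/tile.jpg')
    else:
        safe_print('gave up after retries or a handled HTTP error')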
@@ -1,8 +0,0 @@
-from .retronews import (
-    convert_date,
-    parse_url,
-    doc_info,
-    page_info,
-    thumbnail_url,
-    tile_url
-)
@@ -1,50 +0,0 @@
-import re
-import requests
-
-MONTHS = dict(
-    jan=1,
-    feb=2,
-    mar=3,
-    apr=4,
-    may=5,
-    jun=6,
-    jul=7,
-    juillet=7,
-    aout=8,
-    aug=8,
-    sep=9,
-    oct=10,
-    nov=11,
-    novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
-    dec=12
-)
-
-
-def convert_date(s: str) -> tuple[str, str, str]:
-    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
-    year = m[2]
-    month = '%02d' % MONTHS[m[1]]
-    day = m[0]
-    return year, month, day
-
-
-def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
-
-
-def doc_info(collection_id, doc_id):
-    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
-    return r.json()
-
-
-def page_info(collection_id, doc_id, page):
-    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
-    return r.json()
-
-
-def thumbnail_url(collection_id, doc_id, page) -> str:
-    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
-
-
-def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
-    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'