upd

This commit is contained in:
parent 3847423443
commit abd1975def

dl-from-db.py (new executable file, 45 lines)
@@ -0,0 +1,45 @@
#!/usr/bin/env python3

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import logging

from mdf import Database, retronews
from argparse import ArgumentParser

database = Database()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--from-date', type=str)
    parser.add_argument('--to-date', type=str)
    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

    args = parser.parse_args()

    retronews.set_tile_merging_pool_size(args.merge_threads)
    retronews.set_page_fetching_pool_size(args.fetch_threads)

    for doc in database.get_documents((args.from_date, args.to_date)):
        url = doc['url']
        print(f'grabbing {url}...')
        if not retronews.grab_magazine(url,
                                       output_root=args.output,
                                       probe_pages=args.fetch_probe_pages,
                                       probe_all=args.force_probe,
                                       only_fetch=args.only_fetch,
                                       force_overwrite=args.force_overwrite):
            logging.error(f'failed to grab {url}')
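A note on usage: the same loop can be driven from Python rather than the CLI. Below is a minimal sketch, assuming the mdf package from this commit is importable; the dates, thread counts and output directory are made up, and the YYYY-MM-DD format of issue_date is assumed (matching the yyyy-mm-dd.pdf naming above).

# illustrative sketch only, not part of the commit
from mdf import Database, retronews

db = Database()
retronews.set_tile_merging_pool_size(4)     # same knobs as --merge-threads / --fetch-threads
retronews.set_page_fetching_pool_size(4)

for doc in db.get_documents(('1905-01-01', '1905-12-31')):   # hypothetical date range
    if not retronews.grab_magazine(doc['url'], output_root='/tmp/mdf'):
        print(f"failed to grab {doc['url']}")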
dl-retronews.py (new executable file, 80 lines)

@@ -0,0 +1,80 @@
#!/usr/bin/env python3

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import re
import requests
import logging

from mdf import retronews
from argparse import ArgumentParser


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--url', type=str, required=True)
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--continue-prev', action='store_true',
                        help='keep scrapping backwards in time')
    parser.add_argument('--continue-next', action='store_true',
                        help='keep scrapping forwards in time')
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

    args = parser.parse_args()

    with_continuation = args.continue_prev or args.continue_next
    if args.fetch_probe_pages and with_continuation:
        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
    if args.only_fetch and with_continuation:
        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')

    TILE_MERGING_POOL_SIZE = args.merge_threads
    PAGE_FETCHING_POOL_SIZE = args.fetch_threads

    url = args.url
    while True:
        print(f'grabbing {url}...')
        if not retronews.grab_magazine(url,
                                       output_root=args.output,
                                       probe_pages=args.fetch_probe_pages,
                                       probe_all=args.force_probe,
                                       only_fetch=args.only_fetch,
                                       force_overwrite=args.force_overwrite):
            logging.error('failed to grab')
            break

        if not args.continue_prev and not args.continue_next:
            break

        r = requests.get(url)

        try:
            next_url = None
            if args.continue_next:
                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
            elif args.continue_prev:
                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]

            if not next_url:
                break

            if next_url.startswith('/'):
                next_url = f'https://www.retronews.fr{next_url}'

            url = next_url

        except:
            print('error: failed to find previous link! exiting')
            break
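The --continue-next / --continue-prev loop above finds the adjacent issue by scraping the SUIVANT and PRÉCÉDENT navigation links out of the page HTML and prefixing relative paths with https://www.retronews.fr. A quick illustration of what the first regex extracts, run against a made-up snippet shaped like that markup (not actual RetroNews output):

import re

html = '<a class="float-right pt-4 text-secondary" href="/journal/mercure-de-france/01-decembre-1905/118/2617648/1">SUIVANT'  # hypothetical markup
m = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', html, re.S)
print(m.groups()[0])  # /journal/mercure-de-france/01-decembre-1905/118/2617648/1, then prefixed with the site root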
mdf/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from .retronews import retronews
from .util import util
from .database import Database
mdf/database.py

@@ -1,7 +1,7 @@
 import sqlite3
 import logging
 import os.path
-import retronews
+from ..retronews import retronews
 import threading
 
 from typing import Optional
@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)
 
-        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()
 
@@ -125,10 +125,12 @@ class Database:
         sql = "SELECT issue_date, url, pages FROM mdf_links"
         if range:
             sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
         sql += " ORDER BY issue_date"
         cur.execute(sql)
         for issue_date, url, pages in cur.fetchall():
             pub_date, collection_id, doc_id = retronews.parse_url(url)
             docs.append(dict(
                 url=url,
+                collection_id=collection_id,
+                doc_id=doc_id,
                 pages=pages
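Note that the date filter in get_documents is plain f-string interpolation, so the (--from-date, --to-date) pair from dl-from-db.py lands verbatim inside a BETWEEN clause. A short illustration with hypothetical dates (the values are not parameterized, so they are trusted input):

date_range = ('1905-01-01', '1905-12-31')  # hypothetical --from-date / --to-date values

sql = "SELECT issue_date, url, pages FROM mdf_links"
if date_range:
    sql += f" WHERE issue_date BETWEEN '{date_range[0]}' AND '{date_range[1]}'"
sql += " ORDER BY issue_date"

print(sql)
# SELECT issue_date, url, pages FROM mdf_links WHERE issue_date BETWEEN '1905-01-01' AND '1905-12-31' ORDER BY issue_date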
mdf/retronews/__init__.py (new file, 15 lines)

@@ -0,0 +1,15 @@
from .retronews import (
    convert_date,
    parse_url,
    _doc_info,
    page_info,
    thumbnail_url,
    tile_url,
    HTILES,
    VTILES,
    PAGE_FETCHING_POOL_SIZE,
    TILE_MERGING_POOL_SIZE,
    set_tile_merging_pool_size,
    set_page_fetching_pool_size,
    grab_magazine
)
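These re-exports are the URL and date parsing primitives the scripts above rely on; a quick illustration using the issue URL quoted in a comment further down in retronews.py:

from mdf import retronews

url = 'https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1'
pub_date, collection_id, doc_id = retronews.parse_url(url)
print(pub_date, collection_id, doc_id)    # 15-novembre-1905 118 2617647
print(retronews.convert_date(pub_date))   # ('1905', '11', '15')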
grab-retronews.py → mdf/retronews/retronews.py (renamed; executable file → normal file; 214 lines changed)

@@ -1,53 +1,75 @@
#!/usr/bin/env python3
import logging
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import sys
import json
import re
import imghdr
import requests
import urllib.request
import urllib.error
import http.client
import subprocess
import shutil
import imghdr
import json
import os
import queue
import shutil
import traceback
import retronews
import logging

from database import Database
from ..util.util import safe_print, download_file, run
from typing import Optional
from threading import Thread, Lock
from time import sleep
from argparse import ArgumentParser
from threading import Thread
import urllib.error

warnings.filterwarnings("ignore", category=DeprecationWarning)
_pages_queue = queue.Queue()
_merging_queue = queue.Queue()

VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8

database = Database()
print_lock = Lock()

pages_queue = queue.Queue()
merging_queue = queue.Queue()
MONTHS = dict(
    jan=1,
    feb=2,
    mar=3,
    apr=4,
    may=5,
    jun=6,
    jul=7,
    juillet=7,
    aout=8,
    aug=8,
    sep=9,
    oct=10,
    nov=11,
    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
    dec=12
)


def safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)
def convert_date(s: str) -> tuple[str, str, str]:
    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
    year = m[2]
    month = '%02d' % MONTHS[m[1]]
    day = m[0]
    return year, month, day


def parse_url(url: str) -> tuple:
    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


def _doc_info(collection_id, doc_id):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
    return r.json()


def page_info(collection_id, doc_id, page):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
    return r.json()


def thumbnail_url(collection_id, doc_id, page) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'


def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'


def run(args: list, **kwargs):
    p = subprocess.run(args, **kwargs)
    if p.returncode != 0:
        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')


class DownloaderThread(Thread):
@@ -92,9 +114,9 @@ class TileMergeWorker(Thread):
     def run(self):
         safe_print(f'[tile merger {self._number}] started')
 
-        while not merging_queue.empty():
+        while not _merging_queue.empty():
             try:
-                page = merging_queue.get_nowait()
+                page = _merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
                 thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')
@@ -116,7 +138,7 @@ class TileMergeWorker(Thread):
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'{h}x{v}.jpg')
+                        vfiles.append(f'v{v}_h{h}.jpg')
                     run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                     hfiles.append(f'{h}.jpg')
 
@@ -153,12 +175,12 @@ class PageFetchWorker(Thread):
        page = 0
 
        try:
-           while not pages_queue.empty():
+           while not _pages_queue.empty():
                try:
-                   page = pages_queue.get_nowait()
+                   page = _pages_queue.get_nowait()
                    safe_print(f'[pf-{self._number}] page {page} started')
 
-                   if self._probe_all or page in self._probe_pages:
+                   if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
                        self.probe_dl(page)
                    else:
                        try:
@@ -172,7 +194,7 @@ class PageFetchWorker(Thread):
 
        except Exception as e:
            self._failed = True
-           self._error = f'while fetching page {page}: {str(e)}'
+           self._error = f'while fetching page {page}: {str(e)}' + traceback.format_exc()
 
    def _get_page_dir(self, page):
        page_dir = os.path.join(self._working_dir, str(page))
@@ -191,7 +213,7 @@ class PageFetchWorker(Thread):
        dl_tasks = []
        for horiz_tile in range(HTILES):
            for vert_tile in range(VTILES):
-               url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+               url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
                if os.path.isfile(output_file):
                    if os.path.getsize(output_file) < 4:
@@ -230,7 +252,7 @@ class PageFetchWorker(Thread):
        dl_tasks = []
        for h in range(10):
            for v in range(10):
-               url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+               url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                output_file = f'{page_dir}/{h}x{v}.jpg'
                if os.path.isfile(output_file):
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
@@ -283,48 +305,26 @@ class PageFetchWorker(Thread):
 
    def thumbnail_dl(self, page):
        page_dir = self._get_page_dir(page)
-       thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
-       if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
-           raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+       thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
+       if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
+           raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
 
 
-def download_file(url, output, handle_http_errors=True) -> bool:
-    tries_left = 3
-    ok = False
-    while tries_left > 0:
-        try:
-            urllib.request.urlretrieve(url, output)
-            ok = True
-            break
-        except http.client.RemoteDisconnected:
-            ok = False
-            print(' caught an exception, sleeping for 2 seconds and retrying...')
-            sleep(2)
-            tries_left -= 1
-        except urllib.error.HTTPError as e:
-            if not handle_http_errors:
-                raise e
-            else:
-                print(f' failed to download {url}: {str(e)}')
-                return False
-    return ok
-
-
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
                   probe_all=False, only_fetch=False, force_overwrite=False):
     try:
-        pub_date, collection_id, doc_id = retronews.parse_url(url)
+        pub_date, collection_id, doc_id = parse_url(url)
     except AttributeError:
         return False
 
-    data = retronews.doc_info(collection_id, doc_id)
+    data = _doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')
 
-    y, m, d = retronews.convert_date(pub_date)
+    y, m, d = convert_date(pub_date)
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
@@ -339,7 +339,7 @@ def grab_magazine(url: str,
 
     # fetch pages
     for page in range(pages):
-        pages_queue.put(page+1)
+        _pages_queue.put(page+1)
 
     pool = []
     for i in range(PAGE_FETCHING_POOL_SIZE):
@@ -366,7 +366,7 @@ def grab_magazine(url: str,
     # merge tiles
     for page in range(pages):
         page += 1
-        merging_queue.put(page)
+        _merging_queue.put(page)
 
     pool = []
     for i in range(TILE_MERGING_POOL_SIZE):
@@ -387,71 +387,11 @@ def grab_magazine(url: str,
     return True
 
 
-if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--url', type=str, required=True)
-    parser.add_argument('--output', type=str, required=True,
-                        help='output directory')
-    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
-    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
-    parser.add_argument('--continue-prev', action='store_true',
-                        help='keep scrapping backwards in time')
-    parser.add_argument('--continue-next', action='store_true',
-                        help='keep scrapping forwards in time')
-    parser.add_argument('--only-fetch', action='store_true',
-                        help='only fetch magazine tiles and exit, do not merge anything')
-    parser.add_argument('--force-overwrite', action='store_true',
-                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
-    parser.add_argument('--force-probe', action='store_true',
-                        help='force all pages to use the \'probe\' method')
-    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
-                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
+def set_tile_merging_pool_size(size):
+    global TILE_MERGING_POOL_SIZE
+    TILE_MERGING_POOL_SIZE = size
 
-    args = parser.parse_args()
-
-    with_continuation = args.continue_prev or args.continue_next
-    if args.fetch_probe_pages and with_continuation:
-        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
-    if args.only_fetch and with_continuation:
-        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
-
-    TILE_MERGING_POOL_SIZE = args.merge_threads
-    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
-
-    url = args.url
-    while True:
-        print(f'grabbing {url}...')
-        if not grab_magazine(url,
-                             output_root=args.output,
-                             probe_pages=args.fetch_probe_pages,
-                             probe_all=args.force_probe,
-                             only_fetch=args.only_fetch,
-                             force_overwrite=args.force_overwrite):
-            logging.error('failed to grab')
-            break
-
-        if not args.continue_prev and not args.continue_next:
-            break
-
-        r = requests.get(url)
-
-        try:
-            next_url = None
-            if args.continue_next:
-                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
-            elif args.continue_prev:
-                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
-
-            if not next_url:
-                break
-
-            if next_url.startswith('/'):
-                next_url = f'https://www.retronews.fr{next_url}'
-
-            url = next_url
-
-        except:
-            print('error: failed to find previous link! exiting')
-            break
+
+def set_page_fetching_pool_size(size):
+    global PAGE_FETCHING_POOL_SIZE
+    PAGE_FETCHING_POOL_SIZE = size
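The TileMergeWorker and PageFetchWorker hunks above also bring the tile naming into agreement: the fetcher writes each tile as v{v}_h{h}.jpg and the merger stacks every column with ImageMagick's convert -append. A minimal sketch of that column merge, assuming the default 2x3 grid (HTILES=2, VTILES=3), ImageMagick on PATH, and a page directory that already holds the tiles; the final horizontal join of the per-column images is outside the quoted hunks.

import subprocess

HTILES, VTILES = 2, 3  # module defaults shown above

def merge_columns(page_dir: str) -> list[str]:
    # mirrors the TileMergeWorker hunk: stack the tiles of each column top-to-bottom
    hfiles = []
    for h in range(HTILES):
        vfiles = [f'v{v}_h{h}.jpg' for v in range(VTILES)]
        subprocess.run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir, check=True)
        hfiles.append(f'{h}.jpg')
    return hfiles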
mdf/util/__init__.py (new empty file, 0 lines)
mdf/util/util.py (new file, 44 lines)

@@ -0,0 +1,44 @@
import subprocess
import urllib.request
import urllib.error

from time import sleep
from threading import Lock
import http.client


_print_lock = Lock()


def safe_print(*args, **kwargs):
    with _print_lock:
        print(*args, **kwargs)


def run(args: list, **kwargs):
    p = subprocess.run(args, **kwargs)
    if p.returncode != 0:
        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')


def download_file(url, output, handle_http_errors=True) -> bool:
    tries_left = 3
    ok = False
    while tries_left > 0:
        try:
            urllib.request.urlretrieve(url, output)
            ok = True
            break
        except http.client.RemoteDisconnected:
            ok = False
            print(' caught an exception, sleeping for 2 seconds and retrying...')
            sleep(2)
            tries_left -= 1
        except urllib.error.HTTPError as e:
            if not handle_http_errors:
                raise e
            else:
                print(f' failed to download {url}: {str(e)}')
                return False
    return ok
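A small usage sketch of the helpers gathered in mdf/util/util.py, as the retronews module consumes them; the URL, path and command are placeholders.

from mdf.util.util import safe_print, download_file, run

# download_file() retries up to 3 times on dropped connections and, with
# handle_http_errors=True, returns False instead of raising on HTTP errors.
ok = download_file('https://example.org/tile.jpg', '/tmp/tile.jpg')   # placeholder URL/path
safe_print('downloaded' if ok else 'failed')

# run() wraps subprocess.run() and raises OSError on a non-zero exit code.
run(['true'])   # placeholder command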
@ -1,8 +0,0 @@
|
||||
from .retronews import (
|
||||
convert_date,
|
||||
parse_url,
|
||||
doc_info,
|
||||
page_info,
|
||||
thumbnail_url,
|
||||
tile_url
|
||||
)
|
@ -1,50 +0,0 @@
|
||||
import re
|
||||
import requests
|
||||
|
||||
MONTHS = dict(
|
||||
jan=1,
|
||||
feb=2,
|
||||
mar=3,
|
||||
apr=4,
|
||||
may=5,
|
||||
jun=6,
|
||||
jul=7,
|
||||
juillet=7,
|
||||
aout=8,
|
||||
aug=8,
|
||||
sep=9,
|
||||
oct=10,
|
||||
nov=11,
|
||||
novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
|
||||
dec=12
|
||||
)
|
||||
|
||||
|
||||
def convert_date(s: str) -> tuple[str, str, str]:
|
||||
m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
|
||||
year = m[2]
|
||||
month = '%02d' % MONTHS[m[1]]
|
||||
day = m[0]
|
||||
return year, month, day
|
||||
|
||||
|
||||
def parse_url(url: str) -> tuple:
|
||||
return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
|
||||
|
||||
|
||||
def doc_info(collection_id, doc_id):
|
||||
r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
|
||||
return r.json()
|
||||
|
||||
|
||||
def page_info(collection_id, doc_id, page):
|
||||
r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
|
||||
return r.json()
|
||||
|
||||
|
||||
def thumbnail_url(collection_id, doc_id, page) -> str:
|
||||
return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
|
||||
|
||||
|
||||
def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
|
||||
return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
|