Evgeny Zinoviev 2023-12-30 15:37:08 +03:00
parent 3847423443
commit abd1975def
11 changed files with 268 additions and 197 deletions

dl-from-db.py (new executable file, 45 additions)

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
from mdf import Database, retronews
from argparse import ArgumentParser

database = Database()

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--from-date', type=str)
    parser.add_argument('--to-date', type=str)
    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
    args = parser.parse_args()

    retronews.set_tile_merging_pool_size(args.merge_threads)
    retronews.set_page_fetching_pool_size(args.fetch_threads)

    for doc in database.get_documents((args.from_date, args.to_date)):
        url = doc['url']
        print(f'grabbing {url}...')
        if not retronews.grab_magazine(url,
                                       output_root=args.output,
                                       probe_pages=args.fetch_probe_pages,
                                       probe_all=args.force_probe,
                                       only_fetch=args.only_fetch,
                                       force_overwrite=args.force_overwrite):
            logging.error(f'failed to grab {url}')

dl-retronews.py (new executable file, 80 additions)

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import re
import requests
import logging
from mdf import retronews
from argparse import ArgumentParser

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--url', type=str, required=True)
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--continue-prev', action='store_true',
                        help='keep scraping backwards in time')
    parser.add_argument('--continue-next', action='store_true',
                        help='keep scraping forwards in time')
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
    args = parser.parse_args()

    with_continuation = args.continue_prev or args.continue_next
    if args.fetch_probe_pages and with_continuation:
        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
    if args.only_fetch and with_continuation:
        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')

    # apply the requested pool sizes to the retronews module
    retronews.set_tile_merging_pool_size(args.merge_threads)
    retronews.set_page_fetching_pool_size(args.fetch_threads)

    url = args.url
    while True:
        print(f'grabbing {url}...')
        if not retronews.grab_magazine(url,
                                       output_root=args.output,
                                       probe_pages=args.fetch_probe_pages,
                                       probe_all=args.force_probe,
                                       only_fetch=args.only_fetch,
                                       force_overwrite=args.force_overwrite):
            logging.error('failed to grab')
            break

        if not args.continue_prev and not args.continue_next:
            break

        r = requests.get(url)
        try:
            next_url = None
            if args.continue_next:
                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
            elif args.continue_prev:
                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
            if not next_url:
                break
            if next_url.startswith('/'):
                next_url = f'https://www.retronews.fr{next_url}'
            url = next_url
        except:
            print('error: failed to find previous link! exiting')
            break

mdf/__init__.py (new file, 3 additions)

@@ -0,0 +1,3 @@
from .retronews import retronews
from .util import util
from .database import Database

(modified file)
@@ -1,7 +1,7 @@
 import sqlite3
 import logging
 import os.path
-import retronews
+from ..retronews import retronews
 import threading
 from typing import Optional
@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)
-        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()
@@ -125,10 +125,12 @@ class Database:
         sql = "SELECT issue_date, url, pages FROM mdf_links"
         if range:
             sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
+        sql += " ORDER BY issue_date"
         cur.execute(sql)
         for issue_date, url, pages in cur.fetchall():
             pub_date, collection_id, doc_id = retronews.parse_url(url)
             docs.append(dict(
+                url=url,
                 collection_id=collection_id,
                 doc_id=doc_id,
                 pages=pages

mdf/retronews/__init__.py (new file, 15 additions)

@@ -0,0 +1,15 @@
from .retronews import (
    convert_date,
    parse_url,
    _doc_info,
    page_info,
    thumbnail_url,
    tile_url,
    HTILES,
    VTILES,
    PAGE_FETCHING_POOL_SIZE,
    TILE_MERGING_POOL_SIZE,
    set_tile_merging_pool_size,
    set_page_fetching_pool_size,
    grab_magazine
)

grab-retronews.py → mdf/retronews/retronews.py (renamed, executable file → normal file, 214 lines changed)

@@ -1,53 +1,75 @@
-#!/usr/bin/env python3
-import logging
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-import os
-import sys
-import json
 import re
-import imghdr
 import requests
-import urllib.request
-import urllib.error
-import http.client
-import subprocess
-import shutil
+import imghdr
+import json
+import os
 import queue
+import shutil
 import traceback
-import retronews
-import logging
-from database import Database
+from ..util.util import safe_print, download_file, run
 from typing import Optional
-from threading import Thread, Lock
-from time import sleep
-from argparse import ArgumentParser
-warnings.filterwarnings("ignore", category=DeprecationWarning)
+from threading import Thread
+import urllib.error
+_pages_queue = queue.Queue()
+_merging_queue = queue.Queue()
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8
 PAGE_FETCHING_POOL_SIZE = 8
-database = Database()
-print_lock = Lock()
-pages_queue = queue.Queue()
-merging_queue = queue.Queue()
+MONTHS = dict(
+    jan=1,
+    feb=2,
+    mar=3,
+    apr=4,
+    may=5,
+    jun=6,
+    jul=7,
+    juillet=7,
+    aout=8,
+    aug=8,
+    sep=9,
+    oct=10,
+    nov=11,
+    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
+    dec=12
+)
-def safe_print(*args, **kwargs):
-    with print_lock:
-        print(*args, **kwargs)
-def run(args: list, **kwargs):
-    p = subprocess.run(args, **kwargs)
-    if p.returncode != 0:
-        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
+def convert_date(s: str) -> tuple[str, str, str]:
+    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
+    year = m[2]
+    month = '%02d' % MONTHS[m[1]]
+    day = m[0]
+    return year, month, day
+def parse_url(url: str) -> tuple:
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
+def _doc_info(collection_id, doc_id):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
+    return r.json()
+def page_info(collection_id, doc_id, page):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
+    return r.json()
+def thumbnail_url(collection_id, doc_id, page) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
+def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
 class DownloaderThread(Thread):
@@ -92,9 +114,9 @@ class TileMergeWorker(Thread):
     def run(self):
         safe_print(f'[tile merger {self._number}] started')
-        while not merging_queue.empty():
+        while not _merging_queue.empty():
             try:
-                page = merging_queue.get_nowait()
+                page = _merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
                 thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')
@@ -116,7 +138,7 @@ class TileMergeWorker(Thread):
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'{h}x{v}.jpg')
+                        vfiles.append(f'v{v}_h{h}.jpg')
                     run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                     hfiles.append(f'{h}.jpg')
@@ -153,12 +175,12 @@ class PageFetchWorker(Thread):
         page = 0
         try:
-            while not pages_queue.empty():
+            while not _pages_queue.empty():
                 try:
-                    page = pages_queue.get_nowait()
+                    page = _pages_queue.get_nowait()
                     safe_print(f'[pf-{self._number}] page {page} started')
-                    if self._probe_all or page in self._probe_pages:
+                    if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
                         self.probe_dl(page)
                     else:
                         try:
@@ -172,7 +194,7 @@ class PageFetchWorker(Thread):
         except Exception as e:
             self._failed = True
-            self._error = f'while fetching page {page}: {str(e)}'
+            self._error = f'while fetching page {page}: {str(e)}' + traceback.format_exc()
     def _get_page_dir(self, page):
         page_dir = os.path.join(self._working_dir, str(page))
@@ -191,7 +213,7 @@ class PageFetchWorker(Thread):
         dl_tasks = []
         for horiz_tile in range(HTILES):
             for vert_tile in range(VTILES):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                 output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
                 if os.path.isfile(output_file):
                     if os.path.getsize(output_file) < 4:
@@ -230,7 +252,7 @@ class PageFetchWorker(Thread):
         dl_tasks = []
         for h in range(10):
             for v in range(10):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                 output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
@@ -283,48 +305,26 @@ class PageFetchWorker(Thread):
     def thumbnail_dl(self, page):
         page_dir = self._get_page_dir(page)
-        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
-        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
-            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+        thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
+        if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
+            raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
         safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
-def download_file(url, output, handle_http_errors=True) -> bool:
-    tries_left = 3
-    ok = False
-    while tries_left > 0:
-        try:
-            urllib.request.urlretrieve(url, output)
-            ok = True
-            break
-        except http.client.RemoteDisconnected:
-            ok = False
-            print(' caught an exception, sleeping for 2 seconds and retrying...')
-            sleep(2)
-            tries_left -= 1
-        except urllib.error.HTTPError as e:
-            if not handle_http_errors:
-                raise e
-            else:
-                print(f' failed to download {url}: {str(e)}')
-                return False
-    return ok
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
                   probe_all=False, only_fetch=False, force_overwrite=False):
     try:
-        pub_date, collection_id, doc_id = retronews.parse_url(url)
+        pub_date, collection_id, doc_id = parse_url(url)
     except AttributeError:
         return False
-    data = retronews.doc_info(collection_id, doc_id)
+    data = _doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')
-    y, m, d = retronews.convert_date(pub_date)
+    y, m, d = convert_date(pub_date)
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
@@ -339,7 +339,7 @@ def grab_magazine(url: str,
     # fetch pages
     for page in range(pages):
-        pages_queue.put(page+1)
+        _pages_queue.put(page+1)
     pool = []
     for i in range(PAGE_FETCHING_POOL_SIZE):
@@ -366,7 +366,7 @@ def grab_magazine(url: str,
     # merge tiles
     for page in range(pages):
         page += 1
-        merging_queue.put(page)
+        _merging_queue.put(page)
     pool = []
     for i in range(TILE_MERGING_POOL_SIZE):
@@ -387,71 +387,11 @@ def grab_magazine(url: str,
     return True
-if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--url', type=str, required=True)
-    parser.add_argument('--output', type=str, required=True,
-                        help='output directory')
-    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
-    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
-    parser.add_argument('--continue-prev', action='store_true',
-                        help='keep scrapping backwards in time')
-    parser.add_argument('--continue-next', action='store_true',
-                        help='keep scrapping forwards in time')
-    parser.add_argument('--only-fetch', action='store_true',
-                        help='only fetch magazine tiles and exit, do not merge anything')
-    parser.add_argument('--force-overwrite', action='store_true',
-                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
-    parser.add_argument('--force-probe', action='store_true',
-                        help='force all pages to use the \'probe\' method')
-    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
-                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
-    args = parser.parse_args()
-    with_continuation = args.continue_prev or args.continue_next
-    if args.fetch_probe_pages and with_continuation:
-        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
-    if args.only_fetch and with_continuation:
-        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
-    TILE_MERGING_POOL_SIZE = args.merge_threads
-    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
-    url = args.url
-    while True:
-        print(f'grabbing {url}...')
-        if not grab_magazine(url,
-                             output_root=args.output,
-                             probe_pages=args.fetch_probe_pages,
-                             probe_all=args.force_probe,
-                             only_fetch=args.only_fetch,
-                             force_overwrite=args.force_overwrite):
-            logging.error('failed to grab')
-            break
-        if not args.continue_prev and not args.continue_next:
-            break
-        r = requests.get(url)
-        try:
-            next_url = None
-            if args.continue_next:
-                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
-            elif args.continue_prev:
-                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
-            if not next_url:
-                break
-            if next_url.startswith('/'):
-                next_url = f'https://www.retronews.fr{next_url}'
-            url = next_url
-        except:
-            print('error: failed to find previous link! exiting')
-            break
+def set_tile_merging_pool_size(size):
+    global TILE_MERGING_POOL_SIZE
+    TILE_MERGING_POOL_SIZE = size
+def set_page_fetching_pool_size(size):
+    global PAGE_FETCHING_POOL_SIZE
+    PAGE_FETCHING_POOL_SIZE = size

mdf/util/__init__.py (new file, empty)

mdf/util/util.py (new file, 44 additions)

@@ -0,0 +1,44 @@
import subprocess
import urllib.request
import urllib.error
from time import sleep
from threading import Lock
import http.client

_print_lock = Lock()


def safe_print(*args, **kwargs):
    with _print_lock:
        print(*args, **kwargs)


def run(args: list, **kwargs):
    p = subprocess.run(args, **kwargs)
    if p.returncode != 0:
        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')


def download_file(url, output, handle_http_errors=True) -> bool:
    tries_left = 3
    ok = False
    while tries_left > 0:
        try:
            urllib.request.urlretrieve(url, output)
            ok = True
            break
        except http.client.RemoteDisconnected:
            ok = False
            print(' caught an exception, sleeping for 2 seconds and retrying...')
            sleep(2)
            tries_left -= 1
        except urllib.error.HTTPError as e:
            if not handle_http_errors:
                raise e
            else:
                print(f' failed to download {url}: {str(e)}')
                return False
    return ok

(deleted file)

@@ -1,8 +0,0 @@
from .retronews import (
    convert_date,
    parse_url,
    doc_info,
    page_info,
    thumbnail_url,
    tile_url
)

(deleted file)

@@ -1,50 +0,0 @@
import re
import requests

MONTHS = dict(
    jan=1,
    feb=2,
    mar=3,
    apr=4,
    may=5,
    jun=6,
    jul=7,
    juillet=7,
    aout=8,
    aug=8,
    sep=9,
    oct=10,
    nov=11,
    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
    dec=12
)


def convert_date(s: str) -> tuple[str, str, str]:
    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
    year = m[2]
    month = '%02d' % MONTHS[m[1]]
    day = m[0]
    return year, month, day


def parse_url(url: str) -> tuple:
    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


def doc_info(collection_id, doc_id):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
    return r.json()


def page_info(collection_id, doc_id, page):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
    return r.json()


def thumbnail_url(collection_id, doc_id, page) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'


def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'