commit 209c6404eb
    initial

check-ratio.py (new executable file, 24 lines)
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
from database import Database
from argparse import ArgumentParser


if __name__ == '__main__':
    db = Database()
    parser = ArgumentParser()
    parser.add_argument('--from', type=str, required=True, dest='date_from',
                        help='date formatted as yyyy-mm-dd')
    parser.add_argument('--to', type=str, required=True, dest='date_to',
                        help='date formatted as yyyy-mm-dd')
    args = parser.parse_args()

    docs = db.get_documents((args.date_from, args.date_to))
    for doc in docs:
        pages = db.get_doc_pages(doc['collection_id'], doc['doc_id'])
        for page, width, height, dpi in pages:
            if width == 0 or height == 0:
                # doc dicts returned by get_documents() carry 'doc_id', not 'page_id'
                print(f'ERROR: {doc["collection_id"]}/{doc["doc_id"]}/{page}: width or height is zero')
                continue
            ratio = width/height
            # TODO
            # print(f'[]')
database.py (new file, 152 lines)
@@ -0,0 +1,152 @@
import sqlite3
import logging
import os.path
import retronews
import threading

from typing import Optional


class Database:
    SCHEMA = 6

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

        file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

        sqlite_version = self._get_sqlite_version()
        self.logger.debug(f'SQLite version: {sqlite_version}')

        schema_version = self.schema_get_version()
        self.logger.debug(f'Schema version: {schema_version}')

        self.schema_init(schema_version)
        self.schema_set_version(self.SCHEMA)

    def __del__(self):
        if self.sqlite:
            self.sqlite.commit()
            self.sqlite.close()

    def _get_sqlite_version(self) -> str:
        cursor = self.sqlite.cursor()
        cursor.execute("SELECT sqlite_version()")
        return cursor.fetchone()[0]

    def schema_get_version(self) -> int:
        cursor = self.sqlite.execute('PRAGMA user_version')
        return int(cursor.fetchone()[0])

    def schema_set_version(self, v) -> None:
        self.sqlite.execute('PRAGMA user_version={:d}'.format(v))
        self.logger.info(f'Schema set to {v}')

    def cursor(self) -> sqlite3.Cursor:
        return self.sqlite.cursor()

    def commit(self) -> None:
        return self.sqlite.commit()

    def schema_init(self, version: int) -> None:
        cursor = self.cursor()

        if version < 1:
            # timestamps
            cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_links (
                issue_date TEXT PRIMARY KEY,
                url TEXT NOT NULL,
                pages INTEGER NOT NULL
            )""")

        if version < 2:
            cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_pages (
                collection_id INTEGER NOT NULL,
                doc_id INTEGER NOT NULL,
                page INTEGER NOT NULL,
                height INTEGER NOT NULL,
                width INTEGER NOT NULL,
                dpi INTEGER NOT NULL
            )""")
            cursor.execute("""CREATE UNIQUE INDEX mdf_pages_idx ON mdf_pages (collection_id, doc_id, page)""")

        if version < 3:
            cursor.execute("ALTER TABLE mdf_pages ADD fail INTEGER NOT NULL")

        if version < 4:
            cursor.execute("""CREATE INDEX mdf_pages_fail_idx ON mdf_pages (fail)""")

        if version < 5:
            for col in ('collection_id', 'doc_id'):
                cursor.execute(f"ALTER TABLE mdf_links ADD {col} INTEGER NOT NULL DEFAULT '0'")
            cursor.execute("CREATE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)")

        if version < 6:
            cursor.execute("DROP INDEX mdf_links_col_doc_idx")
            cursor.execute("CREATE UNIQUE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)")

        self.commit()

    def add_link(self, issue_date: str, url: str, pages: int):
        with self.lock:
            self.cursor().execute("REPLACE INTO mdf_links (issue_date, url, pages) VALUES (?, ?, ?)",
                                  (issue_date, url, str(pages)))
            self.commit()

    def add_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int):
        with self.lock:
            self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, ?, ?, ?, 0)",
                                  (collection_id, doc_id, page, width, height, dpi))
            self.commit()

    def update_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int):
        with self.lock:
            self.cursor().execute("UPDATE mdf_pages SET width=?, height=?, dpi=?, fail=0 WHERE collection_id=? AND doc_id=? AND page=?",
                                  (width, height, dpi, collection_id, doc_id, page))
            self.commit()

    def add_page_failed(self, collection_id, doc_id, page):
        with self.lock:
            self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, 0, 0, 0, 1)",
                                  (collection_id, doc_id, page))
            self.commit()

    def get_existing_pages(self, fail=0):
        cur = self.cursor()
        cur.execute("SELECT collection_id, doc_id, page FROM mdf_pages WHERE fail=?", (fail,))
        return cur.fetchall()

    def get_documents(self, range: Optional[tuple[str, str]] = None):
        cur = self.cursor()
        docs = []

        sql = "SELECT issue_date, url, pages FROM mdf_links"
        if range:
            sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
        cur.execute(sql)
        for issue_date, url, pages in cur.fetchall():
            pub_date, collection_id, doc_id = retronews.parse_url(url)
            docs.append(dict(
                collection_id=collection_id,
                doc_id=doc_id,
                pages=pages
            ))

        return docs

    def get_doc_pages(self, collection_id, doc_id):
        cur = self.cursor()
        cur.execute("SELECT page, width, height, dpi FROM mdf_pages WHERE collection_id=? AND doc_id=?",
                    (collection_id, doc_id))
        return cur.fetchall()

    def fix_documents(self):
        cur = self.cursor()
        cur.execute("SELECT issue_date, url FROM mdf_links")
        for issue_date, url in cur.fetchall():
            pub_date, cid, did = retronews.parse_url(url)
            cur.execute("UPDATE mdf_links SET collection_id=?, doc_id=? WHERE issue_date=?",
                        (cid, did, issue_date))
        self.commit()
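For orientation, a minimal usage sketch of the Database class above, tying it to the scripts that follow. The date range is an invented placeholder; everything else uses methods defined in this file.

# sketch only: query the index built by grab-links.py / fill-pages-info.py
from database import Database

db = Database()

# issues recorded in mdf_links between two dates (placeholder range)
for doc in db.get_documents(('1905-01-01', '1905-12-31')):
    # per-page geometry stored in mdf_pages
    for page, width, height, dpi in db.get_doc_pages(doc['collection_id'], doc['doc_id']):
        print(doc['collection_id'], doc['doc_id'], page, width, height, dpi)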
fill-pages-info.py (new executable file, 107 lines)
@@ -0,0 +1,107 @@
#!/usr/bin/env python3

import retronews
import threading
import queue
import sqlite3

from database import Database
from argparse import ArgumentParser

db = Database()
print_lock = threading.Lock()
ok_lock = threading.Lock()
fail_lock = threading.Lock()
tasks_queue = queue.Queue()

done_ok = 0
done_fail = 0


def incr_ok():
    global done_ok
    with ok_lock:
        done_ok += 1
        print_state()


def incr_fail():
    global done_fail
    with fail_lock:
        done_fail += 1
        print_state()


def print_state():
    with print_lock:
        print(f'ok={done_ok} fail={done_fail}')


class PageWorker(threading.Thread):
    _do_update: bool

    def __init__(self, do_update: bool):
        super().__init__()
        self._do_update = do_update

    def run(self):
        while not tasks_queue.empty():
            try:
                collection_id, doc_id, page = tasks_queue.get_nowait()
                try:
                    info = retronews.page_info(collection_id, doc_id, page)
                    try:
                        f = getattr(db, 'add_page' if not self._do_update else 'update_page')
                        f(collection_id, doc_id, page, info['width'], info['height'], info['dpi'])
                    except sqlite3.IntegrityError:
                        with print_lock:
                            print(f'error: unique failed for ({collection_id}, {doc_id}, {page})')
                    incr_ok()
                except:
                    # traceback.print_exc()
                    if self._do_update:
                        with print_lock:
                            print(f'error: skipping updating the page ({collection_id}, {doc_id}, {page}) cause failed again')
                    else:
                        db.add_page_failed(collection_id, doc_id, page)
                    incr_fail()

            except queue.Empty:
                break


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--threads', type=int, required=True)
    parser.add_argument('--fails', action='store_true')
    args = parser.parse_args()

    if args.fails:
        pages = db.get_existing_pages(fail=1)
        for cid, did, page in pages:
            tasks_queue.put((cid, did, page))
        pages = None
    else:
        ex_pages = db.get_existing_pages()
        ex_map = {}
        for cid, did, page in ex_pages:
            ex_map[f'{cid}_{did}_{page}'] = 1

        docs = db.get_documents()
        for doc in docs:
            for page in range(doc['pages']):
                page += 1
                if f"{doc['collection_id']}_{doc['doc_id']}_{page}" not in ex_map:
                    tasks_queue.put((doc['collection_id'], doc['doc_id'], page))

        ex_pages = None
        ex_map = None
        docs = None

    pool = []
    for i in range(args.threads):
        pool.append(PageWorker(do_update=args.fails))
    for t in pool:
        t.start()
    for t in pool:
        t.join()
fix-db.py (new executable file, 7 lines)
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
from database import Database


if __name__ == '__main__':
    db = Database()
    docs = db.fix_documents()
grab-links.py (new executable file, 56 lines)
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
import re
import requests
import traceback

from retronews import convert_date, parse_url
from argparse import ArgumentParser
from database import Database


if __name__ == '__main__':
    database = Database()

    parser = ArgumentParser()
    parser.add_argument('--initial-url', type=str, required=True)
    parser.add_argument('--forwards', action='store_true')
    parser.add_argument('--backwards', action='store_true')

    args = parser.parse_args()
    if not args.forwards and not args.backwards:
        # parser.error() prints the message and exits; argparse.ArgumentError needs an argument object
        parser.error('either --forwards or --backwards is required!')

    url = args.initial_url
    while True:
        print(f'grabbing {url}...')
        try:
            pub_date, collection_id, doc_id = parse_url(url)
        except AttributeError:
            break

        r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
        data = r.json()
        pages = int(data['nbPages'])

        y, m, d = convert_date(pub_date)
        issue_date = f'{y}-{m}-{d}'

        print(f'adding {issue_date}')
        database.add_link(issue_date, url, pages)

        r = requests.get(url)

        try:
            if args.forwards:
                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
            elif args.backwards:
                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]

            if next_url.startswith('/'):
                next_url = f'https://www.retronews.fr{next_url}'

            url = next_url
        except:
            traceback.print_exc()
            print('error: failed to find previous link! exiting')
            break
grab-retronews.py (new executable file, 399 lines)
@@ -0,0 +1,399 @@
#!/usr/bin/env python3
import os
import sys
import json
import re
import imghdr
import requests
import urllib.request
import urllib.error
import http.client
import subprocess
import shutil
import queue
import traceback
import retronews

from database import Database
from typing import Optional
from threading import Thread, Lock
from time import sleep
from argparse import ArgumentParser

VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8

database = Database()
print_lock = Lock()

pages_queue = queue.Queue()
merging_queue = queue.Queue()


def safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)


def run(args: list, **kwargs):
    p = subprocess.run(args, **kwargs)
    if p.returncode != 0:
        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')


class DownloaderThread(Thread):
    _url: str
    _save_as: str
    _download_result: Optional[bool]

    def __init__(self, url: str, save_as: str, thread_name=None):
        super().__init__()
        if thread_name:
            self.name = thread_name

        self._url = url
        self._save_as = save_as
        self._download_result = None

    def run(self):
        self._download_result = download_file(self._url, self._save_as)

    def is_downloaded(self) -> bool:
        return self._download_result is True


class TileMergeWorker(Thread):
    _working_dir: str
    _number: int

    def __init__(self, working_dir: str, number: int):
        super().__init__()
        self._working_dir = working_dir
        self._number = number

    def run(self):
        safe_print(f'[tile merger {self._number}] started')

        while not merging_queue.empty():
            try:
                page = merging_queue.get_nowait()
                page_dir = os.path.join(self._working_dir, str(page))
                # the thumbnail fallback is saved into the page directory by PageFetchWorker.thumbnail_dl()
                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                meta_path = os.path.join(page_dir, 'meta.json')

                if os.path.exists(thumbnail_path):
                    shutil.copy(thumbnail_path, os.path.join(self._working_dir, f'{page}.jpg'))
                    continue

                if os.path.exists(meta_path):
                    with open(meta_path, 'r') as f:
                        meta = json.loads(f.read())
                        htiles = meta['h']
                        vtiles = meta['v']
                else:
                    htiles = HTILES
                    vtiles = VTILES

                hfiles = []
                for h in range(htiles):
                    vfiles = []
                    for v in range(vtiles):
                        vfiles.append(f'v{v}_h{h}.jpg')
                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
                    hfiles.append(f'_v_{h}.jpg')

                run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
                shutil.rmtree(page_dir)

                safe_print(f'[tile merger {self._number}] page {page} done')

            except queue.Empty:
                break


class PageFetchWorker(Thread):
    _working_dir: str
    _number: int
    _failed: bool
    _error: Optional[str]
    _probe_pages: Optional[list[int]]

    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
        super().__init__()
        self._working_dir = working_dir
        self._number = number
        self._collection_id = collection_id
        self._doc_id = doc_id
        self._failed = False
        self._error = None
        self._probe_pages = probe_pages

    def run(self):
        safe_print(f'[pf-{self._number}] started')
        page = 0

        try:
            while not pages_queue.empty():
                try:
                    page = pages_queue.get_nowait()
                    safe_print(f'[pf-{self._number}] page {page} started')

                    # probe_pages may be None when --fetch-probe-pages is not given
                    if self._probe_pages and page in self._probe_pages:
                        self.probe_dl(page)
                    else:
                        try:
                            self.normal_dl(page)
                        except OSError:
                            safe_print(f'[pf-{self._number}] normal_dl() failed, trying probe_dl()')
                            self.probe_dl(page)

                except queue.Empty:
                    break

        except Exception as e:
            self._failed = True
            self._error = f'while fetching page {page}: {str(e)}'

    def _get_page_dir(self, page):
        page_dir = os.path.join(self._working_dir, str(page))
        if not os.path.exists(page_dir):
            os.makedirs(page_dir)
        return page_dir

    def is_failed(self) -> bool:
        return self._failed

    def get_error(self) -> str:
        return self._error if self._error is not None else ''

    def normal_dl(self, page):
        page_dir = self._get_page_dir(page)
        dl_tasks = []
        for horiz_tile in range(HTILES):
            for vert_tile in range(VTILES):
                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
                if os.path.isfile(output_file):
                    if os.path.getsize(output_file) < 4:
                        os.unlink(output_file)
                    # safe_print(f'[pf-{self._number}] already exists')
                    continue

                # output_file is already a full path inside page_dir
                dl_tasks.append(DownloaderThread(url=url,
                                                 save_as=output_file,
                                                 thread_name=f'p{page}-v{vert_tile}-h{horiz_tile}'))

        for task in dl_tasks:
            task.start()

        data_error = False

        for task in dl_tasks:
            task.join()
            if not task.is_downloaded():
                # safe_print(f'failed to download file {task._url}')
                raise OSError(f'network error, failed to download {task._url}')

            elif not imghdr.what(task._save_as):
                data_error = True

        if data_error:
            self.thumbnail_dl(page)
        else:
            safe_print(f'[pf-{self._number}] page {page}: all files saved')

    def probe_dl(self, page):
        page_dir = self._get_page_dir(page)
        real_h = 0
        real_v = 0
        data_error = False
        for h in range(5):
            for v in range(5):
                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                output_file = f'{page_dir}/v{v}_h{h}.jpg'
                if os.path.isfile(output_file):
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                    if os.path.getsize(output_file) < 4:
                        os.unlink(output_file)
                    continue
                try:
                    if not download_file(url, output_file, handle_http_errors=False):
                        raise OSError('network failure')
                    if not imghdr.what(output_file):
                        data_error = True
                        break
                    real_v = v
                    real_h = h
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')

                except urllib.error.HTTPError:
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
                    break

        if data_error:
            self.thumbnail_dl(page)
        else:
            with open(os.path.join(page_dir, 'meta.json'), 'w') as f:
                f.write(json.dumps(dict(v=real_v+1, h=real_h+1)))
            safe_print(f'[pf-{self._number}] page {page}: all files saved (seemingly...)')

    def thumbnail_dl(self, page):
        page_dir = self._get_page_dir(page)
        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')


def download_file(url, output, handle_http_errors=True) -> bool:
    tries_left = 3
    ok = False
    while tries_left > 0:
        try:
            urllib.request.urlretrieve(url, output)
            ok = True
            break
        except http.client.RemoteDisconnected:
            ok = False
            print(' caught an exception, sleeping for 2 seconds and retrying...')
            sleep(2)
            tries_left -= 1
        except urllib.error.HTTPError as e:
            if not handle_http_errors:
                raise e
            else:
                print(f' failed to download {url}: {str(e)}')
                return False
    return ok


def grab_magazine(url: str,
                  output_root: str,
                  probe_pages: Optional[list[int]] = None,
                  only_fetch=False, force_overwrite=False):
    pub_date, collection_id, doc_id = retronews.parse_url(url)

    data = retronews.doc_info(collection_id, doc_id)
    pages = int(data['nbPages'])
    print(f'found {pages} pages')

    y, m, d = retronews.convert_date(pub_date)
    if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
        if not force_overwrite:
            print(f'{y}-{m}-{d}.pdf already exists, not continuing')
            return
        else:
            os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
            print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')

    output_dir = os.path.join(output_root, pub_date)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # fetch pages
    for page in range(pages):
        pages_queue.put(page+1)

    pool = []
    for i in range(PAGE_FETCHING_POOL_SIZE):
        pool.append(PageFetchWorker(working_dir=output_dir,
                                    number=i+1,
                                    collection_id=collection_id,
                                    doc_id=doc_id,
                                    probe_pages=probe_pages))
    for worker in pool:
        worker.start()

    for worker in pool:
        worker.join()
        if worker.is_failed():
            with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                f.write(f'error: {worker.get_error()}')
            print(f'ERROR: failed to download {pub_date} magazine')
            return

    if only_fetch:
        return

    # merge tiles
    for page in range(pages):
        page += 1
        merging_queue.put(page)

    pool = []
    for i in range(TILE_MERGING_POOL_SIZE):
        pool.append(TileMergeWorker(working_dir=output_dir, number=i+1))
    for worker in pool:
        worker.start()
    try:
        for worker in pool:
            worker.join()

        # merge images into pdf
        files = [str(page + 1) + '.jpg' for page in range(pages)]
        run(['convert', *files, os.path.join(output_root, f'{y}-{m}-{d}.pdf')], cwd=output_dir)
        shutil.rmtree(output_dir)
    except:
        traceback.print_exc()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--url', type=str, required=True)
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--continue-prev', action='store_true',
                        help='keep scraping backwards in time')
    parser.add_argument('--continue-next', action='store_true',
                        help='keep scraping forwards in time')
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

    args = parser.parse_args()

    with_continuation = args.continue_prev or args.continue_next
    if args.fetch_probe_pages and with_continuation:
        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
    if args.only_fetch and with_continuation:
        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')

    TILE_MERGING_POOL_SIZE = args.merge_threads
    PAGE_FETCHING_POOL_SIZE = args.fetch_threads

    url = args.url
    while True:
        print(f'grabbing {url}...')
        grab_magazine(url,
                      output_root=args.output,
                      probe_pages=args.fetch_probe_pages,
                      only_fetch=args.only_fetch,
                      force_overwrite=args.force_overwrite)

        if not args.continue_prev and not args.continue_next:
            break

        r = requests.get(url)

        try:
            if args.continue_next:
                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
            elif args.continue_prev:
                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]

            if next_url.startswith('/'):
                next_url = f'https://www.retronews.fr{next_url}'

            url = next_url

        except:
            print('error: failed to find previous link! exiting')
            break
retronews.py (new file, 48 lines)
@@ -0,0 +1,48 @@
import re
import requests

MONTHS = dict(
    jan=1,
    feb=2,
    mar=3,
    apr=4,
    may=5,
    jun=6,
    jul=7,
    aug=8,
    sep=9,
    oct=10,
    nov=11,
    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
    dec=12
)


def convert_date(s: str) -> tuple[str, str, str]:
    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
    year = m[2]
    month = '%02d' % MONTHS[m[1]]
    day = m[0]
    return year, month, day


def parse_url(url: str) -> tuple:
    return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()


def doc_info(collection_id, doc_id):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
    return r.json()


def page_info(collection_id, doc_id, page):
    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
    return r.json()


def thumbnail_url(collection_id, doc_id, page) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'


def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
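For orientation, a small sketch of how these helpers fit together. It reuses the URL already cited in the MONTHS comment; the API calls are left commented out since they hit the live retronews service.

# sketch only: date and URL parsing helpers from retronews.py
import retronews

url = 'https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1'
pub_date, collection_id, doc_id = retronews.parse_url(url)  # ('15-novembre-1905', '118', '2617647')
y, m, d = retronews.convert_date(pub_date)                  # ('1905', '11', '15')
print(f'{y}-{m}-{d}', collection_id, doc_id)

# live API calls (shape as used by the scripts above: doc_info()['nbPages'], page_info()['width'], ...):
# info = retronews.doc_info(collection_id, doc_id)
# tile = retronews.tile_url(collection_id, doc_id, 1, v_tile=0, h_tile=0)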