commit 5fd7512f90
initial

.gitignore (vendored, new file, 4 lines)
@@ -0,0 +1,4 @@
/.idea
/.venv
/fb.sqlite3
__pycache__

dl.py (executable, new file, 59 lines)
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
import os
import requests

from fb import Database
from fb.util import get_dl_url, get_long_book_id
from argparse import ArgumentParser


def download_file(url, dst, cookie, user_agent):
    streamed_response = requests.get(url, stream=True,
                                     headers={
                                         'User-Agent': user_agent,
                                         'Cookie': cookie
                                     })
    with open(dst, 'wb') as file:
        for chunk in streamed_response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)


def remove_from_my_books(bid, cookie, user_agent):
    r = requests.post('https://www.forgottenbooks.com/books-remove', data={
        'p': get_long_book_id(bid),
    }, headers={
        'User-Agent': user_agent,
        'Cookie': cookie
    })
    r.raise_for_status()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--from-id', type=int, required=True)
    parser.add_argument('--to-id', type=int, required=True)
    parser.add_argument('--output-directory', type=str, required=True)
    parser.add_argument('--cookie', type=str, required=True)
    parser.add_argument('--user-agent', type=str,
                        default='Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0')
    args = parser.parse_args()

    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    db = Database()
    ids = db.get_ids(args.from_id, args.to_id)
    for i in ids:
        url = get_dl_url(i)
        skip = False
        dst = os.path.join(args.output_directory, f'{i}.pdf')
        if not os.path.exists(dst):
            download_file(url, dst, args.cookie, args.user_agent)
        else:
            print(f'{i} already downloaded, removing from my books')
            skip = True

        remove_from_my_books(i, args.cookie, args.user_agent)
        if not skip:
            print(f"saved {i}")

fb/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .database import Database

fb/database.py (new file, 118 lines)
@@ -0,0 +1,118 @@
import sqlite3
import logging
import os
import threading

from .util import stringify
from typing import Optional


class Database:
    SCHEMA = 2

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

        config_dir = os.path.join(
            os.getenv('HOME'),
            '.config',
            'forgottenbooks'
        )
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)

        file = os.path.join(config_dir, 'fb.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

        sqlite_version = self._get_sqlite_version()
        self.logger.debug(f'SQLite version: {sqlite_version}')

        schema_version = self.schema_get_version()
        self.logger.debug(f'Schema version: {schema_version}')

        self.schema_init(schema_version)
        self.schema_set_version(self.SCHEMA)

    def __del__(self):
        if self.sqlite:
            self.sqlite.commit()
            self.sqlite.close()

    def _get_sqlite_version(self) -> str:
        cursor = self.sqlite.cursor()
        cursor.execute("SELECT sqlite_version()")
        return cursor.fetchone()[0]

    def schema_get_version(self) -> int:
        cursor = self.sqlite.execute('PRAGMA user_version')
        return int(cursor.fetchone()[0])

    def schema_set_version(self, v) -> None:
        self.sqlite.execute('PRAGMA user_version={:d}'.format(v))
        self.logger.info(f'Schema set to {v}')

    def cursor(self) -> sqlite3.Cursor:
        return self.sqlite.cursor()

    def commit(self) -> None:
        return self.sqlite.commit()

    def schema_init(self, version: int) -> None:
        cursor = self.cursor()

        if version < 1:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                author TEXT NOT NULL,
                meta TEXT NOT NULL,
                bc TEXT NOT NULL,
                details TEXT NOT NULL
            )""")

        if version < 2:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books_failed (id INTEGER PRIMARY KEY)""")

    def add_book(self,
                 book_id: int,
                 name: str,
                 author: str,
                 meta: list,
                 bc: list,
                 details: dict):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books (id, name, author, meta, bc, details) VALUES (?, ?, ?, ?, ?, ?)",
                        (str(book_id), name, author, stringify(meta), stringify(bc), stringify(details)))
            cur.close()
            self.commit()

    def add_failed_book(self, book_id: int):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books_failed (id) VALUES (?)", (str(book_id),))
            cur.close()

    def get_max_book_id(self) -> Optional[int]:
        cur = self.cursor()
        cur.execute("SELECT MAX(id) FROM books")
        res = cur.fetchone()[0]
        cur.close()
        return int(res) if res is not None else None

    def get_book(self, book_id) -> Optional[dict]:
        cur = self.cursor()
        cur.execute("SELECT * FROM books WHERE id=?", (book_id,))
        all = cur.fetchall()
        cur.close()
        return all[0] if len(all) > 0 else None

    def get_ids(self, id_from: int, id_to: int) -> list[int]:
        cur = self.cursor()
        cur.execute("SELECT id FROM books WHERE id BETWEEN ? AND ? ORDER BY id", (id_from, id_to,))
        l = []
        for row in cur.fetchall():
            l.append(int(row[0]))
        cur.close()
        return l

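For orientation, here is a minimal usage sketch of the Database class above. The book id and field values are placeholders for illustration, not data from the repository; the meta/bc/details arguments are serialized internally via stringify() from fb/util.py, whose exact format is not shown in this diff.

    from fb import Database

    db = Database()  # opens (creating if needed) ~/.config/forgottenbooks/fb.sqlite3
    db.add_book(123, 'Example Title', 'Example Author',
                meta=[['Pages', '100']], bc=['Example Category'], details={'Format': 'PDF'})
    print(db.get_max_book_id())   # 123
    print(db.get_ids(100, 200))   # [123]
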
fb/util.py (new file, 14486 lines)
File diff suppressed because it is too large.
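Although the fb/util.py diff is suppressed, the other files depend on a handful of helpers from it. The stubs below only restate the interface implied by those call sites; the return types are assumptions and the bodies are deliberately left empty, since the actual implementation is not shown here.

    from typing import Any

    def get_fb_url(book_id: int) -> str:
        # implied by main.py: URL of the book page that gets scraped
        ...

    def get_dl_url(book_id: int) -> str:
        # implied by dl.py: URL used to download the book's PDF
        ...

    def get_long_book_id(book_id: int) -> str:
        # implied by dl.py: value posted as 'p' to /books-remove
        ...

    def get_useragent() -> str:
        # implied by main.py: a User-Agent string for scraping requests
        ...

    def stringify(value: Any) -> str:
        # implied by fb/database.py: serialization of the meta/bc/details columns
        ...
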
main.py (executable, new file, 179 lines)
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import requests
import threading
import random
import urllib3

from queue import Queue
from bs4 import BeautifulSoup
from fb import Database
from fb.util import get_fb_url, get_useragent
from argparse import ArgumentParser, ArgumentError
from sqlite3 import IntegrityError

db: Database

my_proxies = """
127.0.0.1:1077
127.0.0.1:1079
127.0.0.1:1074
127.0.0.1:1076
127.0.0.1:1071
127.0.0.1:1081
127.0.0.1:1069
"""
my_proxies = list(set(my_proxies.strip().split()))


class TooManyRequests(Exception):
    pass


def parse_book_page(book_id: int, proxy: str):
    headers = {
        'User-Agent': get_useragent()
    }

    url = get_fb_url(book_id)
    proxy = f'socks5://{proxy}'
    r = requests.get(url,
                     headers=headers,
                     proxies=dict(http=proxy, https=proxy))
    if r.status_code != 200:
        if r.status_code == 429:
            raise TooManyRequests()

        # print(f'{book_id} code {r.status_code}')
        return False

    html = BeautifulSoup(r.text, "html.parser")
    desc = html.select_one('meta[name="description"]')['content']

    # extract useful info from meta tags
    metainfo = []
    try:
        if desc.startswith('; '):
            desc = desc[2:]
        for item in desc.split('; '):
            colon_pos = item.index(':')
            key = item[0:colon_pos]
            val = item[colon_pos+2:]
            metainfo.append([key, val])
    except ValueError:
        metainfo.append(desc)
        pass

    # parse name and author
    name = html.select_one('div.overview h1').text
    author = html.select_one('div.overview h2 i').text

    # parse breadcrumbs hierarchy
    bc = html.select('ul.breadcrumb li.breadcrumb-item')
    bc_hierarchy = []
    bc_first_skipped = False
    if bc:
        for bc_item in bc:
            if not bc_first_skipped:
                bc_first_skipped = True
                continue
            bc_hierarchy.append(bc_item.text)

    # book info table
    details = {}
    rows = html.select('table[width="400"] tr')
    if rows:
        for r in rows:
            cols = r.select('td')
            details[cols[0].text] = cols[1].text

    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
    return True


def worker(task_queue, print_lock, proxy):
    while not task_queue.empty():
        book_id = task_queue.get()
        db_error = False
        result = None

        try:
            result = parse_book_page(book_id, proxy)
        except IntegrityError:
            db_error = True
        except (requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, urllib3.exceptions.ProtocolError, TooManyRequests):
            task_queue.put(book_id)
            db.add_failed_book(book_id)
            print(f'{book_id}: failed due to network error, proxy = {proxy}')
            continue
        except requests.exceptions.ChunkedEncodingError:
            print(f'{book_id} causes weird error')
            continue

        if result is not False:
            with print_lock:
                print(f"{book_id} " + ("done" if not db_error else " raised db error"))
        task_queue.task_done()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--book-id', type=int)
    parser.add_argument('--continue', action='store_true')
    parser.add_argument('--max-book-id', type=int, default=1500000)
    parser.add_argument('--find-gaps', action='store_true')
    args = parser.parse_args()

    db = Database()

    if args.find_gaps:
        id_from = 100000
        id_to = 1400000
        ids_in_db = db.get_ids(id_from, id_to)
        task_queue = Queue()
        print_lock = threading.Lock()

        for i in range(id_from, id_to+1):
            if i not in ids_in_db:
                task_queue.put(i)

        threads = []
        for proxy in my_proxies:
            for i in range(4):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)

        for thread in threads:
            thread.join()

    elif hasattr(args, 'continue') and getattr(args, 'continue') is True:
        if args.book_id:
            last_book_id = args.book_id
        else:
            last_book_id = db.get_max_book_id()
            if last_book_id is None:
                last_book_id = 0

        task_queue = Queue()
        print_lock = threading.Lock()

        for task_number in range(last_book_id + 1, args.max_book_id):
            task_queue.put(task_number)

        threads = []
        for proxy in my_proxies:
            for i in range(3):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)

        for thread in threads:
            thread.join()
    else:
        if not args.book_id:
            raise ArgumentError(None, '--book-id is required')
        proxy = random.choice(my_proxies)
        book = db.get_book(args.book_id)
        if book:
            raise RuntimeError('this book is already in the database')
        parse_book_page(args.book_id, proxy)

requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
requests~=2.31.0
urllib3~=2.2.1
beautifulsoup4~=4.12.3