commit 5fd7512f90
Evgeny Zinoviev 2024-06-16 00:04:44 +03:00
7 changed files with 14850 additions and 0 deletions

.gitignore vendored Normal file (4 additions)

@@ -0,0 +1,4 @@
/.idea
/.venv
/fb.sqlite3
__pycache__

dl.py Executable file (59 additions)

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
import os
import requests

from fb import Database
from fb.util import get_dl_url, get_long_book_id
from argparse import ArgumentParser


def download_file(url, dst, cookie, user_agent):
    streamed_response = requests.get(url, stream=True,
                                     headers={
                                         'User-Agent': user_agent,
                                         'Cookie': cookie
                                     })
    with open(dst, 'wb') as file:
        for chunk in streamed_response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)


def remove_from_my_books(bid, cookie, user_agent):
    r = requests.post('https://www.forgottenbooks.com/books-remove', data={
        'p': get_long_book_id(bid),
    }, headers={
        'User-Agent': user_agent,
        'Cookie': cookie
    })
    r.raise_for_status()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--from-id', type=int, required=True)
    parser.add_argument('--to-id', type=int, required=True)
    parser.add_argument('--output-directory', type=str, required=True)
    parser.add_argument('--cookie', type=str, required=True)
    parser.add_argument('--user-agent', type=str,
                        default='Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0')
    args = parser.parse_args()

    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    db = Database()
    ids = db.get_ids(args.from_id, args.to_id)
    for i in ids:
        url = get_dl_url(i)
        skip = False
        dst = os.path.join(args.output_directory, f'{i}.pdf')
        if not os.path.exists(dst):
            download_file(url, dst, args.cookie, args.user_agent)
        else:
            print(f'{i} already downloaded, removing from my books')
            skip = True
        remove_from_my_books(i, args.cookie, args.user_agent)
        if not skip:
            print(f"saved {i}")

fb/__init__.py Normal file (1 addition)

@@ -0,0 +1 @@
from .database import Database

fb/database.py Normal file (118 additions)

@@ -0,0 +1,118 @@
import sqlite3
import logging
import os
import threading

from .util import stringify
from typing import Optional


class Database:
    SCHEMA = 2

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

        config_dir = os.path.join(
            os.getenv('HOME'),
            '.config',
            'forgottenbooks'
        )
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)

        file = os.path.join(config_dir, 'fb.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

        sqlite_version = self._get_sqlite_version()
        self.logger.debug(f'SQLite version: {sqlite_version}')

        schema_version = self.schema_get_version()
        self.logger.debug(f'Schema version: {schema_version}')

        self.schema_init(schema_version)
        self.schema_set_version(self.SCHEMA)

    def __del__(self):
        if self.sqlite:
            self.sqlite.commit()
            self.sqlite.close()

    def _get_sqlite_version(self) -> str:
        cursor = self.sqlite.cursor()
        cursor.execute("SELECT sqlite_version()")
        return cursor.fetchone()[0]

    def schema_get_version(self) -> int:
        cursor = self.sqlite.execute('PRAGMA user_version')
        return int(cursor.fetchone()[0])

    def schema_set_version(self, v) -> None:
        self.sqlite.execute('PRAGMA user_version={:d}'.format(v))
        self.logger.info(f'Schema set to {v}')

    def cursor(self) -> sqlite3.Cursor:
        return self.sqlite.cursor()

    def commit(self) -> None:
        return self.sqlite.commit()

    def schema_init(self, version: int) -> None:
        cursor = self.cursor()
        if version < 1:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                author TEXT NOT NULL,
                meta TEXT NOT NULL,
                bc TEXT NOT NULL,
                details TEXT NOT NULL
            )""")
        if version < 2:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books_failed (id INTEGER PRIMARY KEY)""")

    def add_book(self,
                 book_id: int,
                 name: str,
                 author: str,
                 meta: list,
                 bc: list,
                 details: dict):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books (id, name, author, meta, bc, details) VALUES (?, ?, ?, ?, ?, ?)",
                        (str(book_id), name, author, stringify(meta), stringify(bc), stringify(details)))
            cur.close()
            self.commit()

    def add_failed_book(self, book_id: int):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books_failed (id) VALUES (?)", (str(book_id),))
            cur.close()

    def get_max_book_id(self) -> Optional[int]:
        cur = self.cursor()
        cur.execute("SELECT MAX(id) FROM books")
        res = cur.fetchone()[0]
        cur.close()
        return int(res) if res is not None else None

    def get_book(self, book_id) -> Optional[tuple]:
        cur = self.cursor()
        cur.execute("SELECT * FROM books WHERE id=?", (book_id,))
        rows = cur.fetchall()
        cur.close()
        return rows[0] if len(rows) > 0 else None

    def get_ids(self, id_from: int, id_to: int) -> list[int]:
        cur = self.cursor()
        cur.execute("SELECT id FROM books WHERE id BETWEEN ? AND ? ORDER BY id", (id_from, id_to))
        ids = [int(row[0]) for row in cur.fetchall()]
        cur.close()
        return ids
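
For reference, a short usage sketch of the Database class as main.py and dl.py use it; the book id and field values below are made-up examples, not data from the site:

from fb import Database

db = Database()  # opens ~/.config/forgottenbooks/fb.sqlite3 and migrates the schema

# hypothetical example values, only to illustrate the call shapes
db.add_book(123456, 'Some Title', 'Some Author',
            meta=[['Pages', '300']],
            bc=['Category', 'Subcategory'],
            details={'Language': 'English'})

print(db.get_max_book_id())        # highest stored id, or None for an empty database
print(db.get_book(123456))         # the stored row as a tuple, or None
print(db.get_ids(100000, 200000))  # all stored ids in the range, ascending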

fb/util.py Normal file (14486 additions)

File diff suppressed because it is too large

main.py Executable file (179 additions)

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import requests
import threading
import random
import urllib3

from queue import Queue
from bs4 import BeautifulSoup
from fb import Database
from fb.util import get_fb_url, get_useragent
from argparse import ArgumentParser, ArgumentError
from sqlite3 import IntegrityError

db: Database

my_proxies = """
127.0.0.1:1077
127.0.0.1:1079
127.0.0.1:1074
127.0.0.1:1076
127.0.0.1:1071
127.0.0.1:1081
127.0.0.1:1069
"""
my_proxies = list(set(my_proxies.strip().split()))


class TooManyRequests(Exception):
    pass


def parse_book_page(book_id: int, proxy: str):
    headers = {
        'User-Agent': get_useragent()
    }
    url = get_fb_url(book_id)
    proxy = f'socks5://{proxy}'
    r = requests.get(url,
                     headers=headers,
                     proxies=dict(http=proxy, https=proxy))
    if r.status_code != 200:
        if r.status_code == 429:
            raise TooManyRequests()
        # print(f'{book_id} code {r.status_code}')
        return False

    html = BeautifulSoup(r.text, "html.parser")
    desc = html.select_one('meta[name="description"]')['content']

    # extract useful info from meta tags
    metainfo = []
    try:
        if desc.startswith('; '):
            desc = desc[2:]
        for item in desc.split('; '):
            colon_pos = item.index(':')
            key = item[0:colon_pos]
            val = item[colon_pos+2:]
            metainfo.append([key, val])
    except ValueError:
        metainfo.append(desc)

    # parse name and author
    name = html.select_one('div.overview h1').text
    author = html.select_one('div.overview h2 i').text

    # parse breadcrumbs hierarchy
    bc = html.select('ul.breadcrumb li.breadcrumb-item')
    bc_hierarchy = []
    bc_first_skipped = False
    if bc:
        for bc_item in bc:
            if not bc_first_skipped:
                bc_first_skipped = True
                continue
            bc_hierarchy.append(bc_item.text)

    # book info table
    details = {}
    rows = html.select('table[width="400"] tr')
    if rows:
        for row in rows:
            cols = row.select('td')
            details[cols[0].text] = cols[1].text

    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
    return True


def worker(task_queue, print_lock, proxy):
    while not task_queue.empty():
        book_id = task_queue.get()
        db_error = False
        result = None
        try:
            result = parse_book_page(book_id, proxy)
        except IntegrityError:
            db_error = True
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                urllib3.exceptions.ProtocolError,
                TooManyRequests):
            task_queue.put(book_id)
            db.add_failed_book(book_id)
            print(f'{book_id}: failed due to network error, proxy = {proxy}')
            continue
        except requests.exceptions.ChunkedEncodingError:
            print(f'{book_id} causes weird error')
            continue
        if result is not False:
            with print_lock:
                print(f"{book_id} " + ("done" if not db_error else "raised db error"))
        task_queue.task_done()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--book-id', type=int)
    parser.add_argument('--continue', action='store_true')
    parser.add_argument('--max-book-id', type=int, default=1500000)
    parser.add_argument('--find-gaps', action='store_true')
    args = parser.parse_args()

    db = Database()

    if args.find_gaps:
        id_from = 100000
        id_to = 1400000
        # use a set: membership is tested for every id in the range below
        ids_in_db = set(db.get_ids(id_from, id_to))

        task_queue = Queue()
        print_lock = threading.Lock()
        for i in range(id_from, id_to+1):
            if i not in ids_in_db:
                task_queue.put(i)

        threads = []
        for proxy in my_proxies:
            for i in range(4):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()

    elif getattr(args, 'continue'):
        if args.book_id:
            last_book_id = args.book_id
        else:
            last_book_id = db.get_max_book_id()
            if last_book_id is None:
                last_book_id = 0

        task_queue = Queue()
        print_lock = threading.Lock()
        for task_number in range(last_book_id + 1, args.max_book_id):
            task_queue.put(task_number)

        threads = []
        for proxy in my_proxies:
            for i in range(3):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()

    else:
        if not args.book_id:
            raise ArgumentError(None, '--book-id is required')
        proxy = random.choice(my_proxies)
        book = db.get_book(args.book_id)
        if book:
            raise RuntimeError('this book is already in the database')
        parse_book_page(args.book_id, proxy)
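
The least obvious step in parse_book_page is the meta-description parsing, so here is the same loop run standalone on a made-up description string; the real format served by forgottenbooks.com is not shown in this commit:

# A made-up description string, only to illustrate how the metainfo loop splits it.
desc = 'Author: John Smith; Published: 1913; Pages: 412'

metainfo = []
try:
    if desc.startswith('; '):
        desc = desc[2:]
    for item in desc.split('; '):
        colon_pos = item.index(':')  # raises ValueError if an item has no colon
        key = item[0:colon_pos]      # text before the colon
        val = item[colon_pos+2:]     # text after the colon and the space
        metainfo.append([key, val])
except ValueError:
    metainfo.append(desc)            # fall back to storing the raw string

print(metainfo)  # [['Author', 'John Smith'], ['Published', '1913'], ['Pages', '412']]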

requirements.txt Normal file (3 additions)

@@ -0,0 +1,3 @@
requests~=2.31.0
urllib3~=2.2.1
beautifulsoup4~=4.12.3