commit 5fd7512f90
Evgeny Zinoviev 2024-06-16 00:04:44 +03:00
7 changed files with 14850 additions and 0 deletions

.gitignore vendored Normal file (4 additions)

@@ -0,0 +1,4 @@
/.idea
/.venv
/fb.sqlite3
__pycache__

dl.py Executable file (59 additions)

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
import os
import requests

from fb import Database
from fb.util import get_dl_url, get_long_book_id
from argparse import ArgumentParser


def download_file(url, dst, cookie, user_agent):
    streamed_response = requests.get(url, stream=True,
                                     headers={
                                         'User-Agent': user_agent,
                                         'Cookie': cookie
                                     })
    with open(dst, 'wb') as file:
        for chunk in streamed_response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                file.write(chunk)


def remove_from_my_books(bid, cookie, user_agent):
    r = requests.post('https://www.forgottenbooks.com/books-remove', data={
        'p': get_long_book_id(bid),
    }, headers={
        'User-Agent': user_agent,
        'Cookie': cookie
    })
    r.raise_for_status()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--from-id', type=int, required=True)
    parser.add_argument('--to-id', type=int, required=True)
    parser.add_argument('--output-directory', type=str, required=True)
    parser.add_argument('--cookie', type=str, required=True)
    parser.add_argument('--user-agent', type=str,
                        default='Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0')
    args = parser.parse_args()

    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    db = Database()
    ids = db.get_ids(args.from_id, args.to_id)
    for i in ids:
        url = get_dl_url(i)
        skip = False
        dst = os.path.join(args.output_directory, f'{i}.pdf')
        if not os.path.exists(dst):
            download_file(url, dst, args.cookie, args.user_agent)
        else:
            print(f'{i} already downloaded, removing from my books')
            skip = True
        remove_from_my_books(i, args.cookie, args.user_agent)
        if not skip:
            print(f"saved {i}")

fb/__init__.py Normal file (1 addition)

@@ -0,0 +1 @@
from .database import Database

fb/database.py Normal file (118 additions)

@@ -0,0 +1,118 @@
import sqlite3
import logging
import os
import threading

from .util import stringify
from typing import Optional


class Database:
    SCHEMA = 2

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

        config_dir = os.path.join(
            os.getenv('HOME'),
            '.config',
            'forgottenbooks'
        )
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)

        file = os.path.join(config_dir, 'fb.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

        sqlite_version = self._get_sqlite_version()
        self.logger.debug(f'SQLite version: {sqlite_version}')

        schema_version = self.schema_get_version()
        self.logger.debug(f'Schema version: {schema_version}')

        self.schema_init(schema_version)
        self.schema_set_version(self.SCHEMA)

    def __del__(self):
        if self.sqlite:
            self.sqlite.commit()
            self.sqlite.close()

    def _get_sqlite_version(self) -> str:
        cursor = self.sqlite.cursor()
        cursor.execute("SELECT sqlite_version()")
        return cursor.fetchone()[0]

    def schema_get_version(self) -> int:
        cursor = self.sqlite.execute('PRAGMA user_version')
        return int(cursor.fetchone()[0])

    def schema_set_version(self, v) -> None:
        self.sqlite.execute('PRAGMA user_version={:d}'.format(v))
        self.logger.info(f'Schema set to {v}')

    def cursor(self) -> sqlite3.Cursor:
        return self.sqlite.cursor()

    def commit(self) -> None:
        return self.sqlite.commit()

    def schema_init(self, version: int) -> None:
        cursor = self.cursor()
        if version < 1:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                author TEXT NOT NULL,
                meta TEXT NOT NULL,
                bc TEXT NOT NULL,
                details TEXT NOT NULL
            )""")
        if version < 2:
            cursor.execute("""CREATE TABLE IF NOT EXISTS books_failed (id INTEGER PRIMARY KEY)""")

    def add_book(self,
                 book_id: int,
                 name: str,
                 author: str,
                 meta: list,
                 bc: list,
                 details: dict):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books (id, name, author, meta, bc, details) VALUES (?, ?, ?, ?, ?, ?)",
                        (str(book_id), name, author, stringify(meta), stringify(bc), stringify(details)))
            cur.close()
            self.commit()

    def add_failed_book(self, book_id: int):
        with self.lock:
            cur = self.cursor()
            cur.execute("INSERT INTO books_failed (id) VALUES (?)", (str(book_id),))
            cur.close()

    def get_max_book_id(self) -> Optional[int]:
        cur = self.cursor()
        cur.execute("SELECT MAX(id) FROM books")
        res = cur.fetchone()[0]
        cur.close()
        return int(res) if res is not None else None

    def get_book(self, book_id) -> Optional[tuple]:
        cur = self.cursor()
        cur.execute("SELECT * FROM books WHERE id=?", (book_id,))
        rows = cur.fetchall()
        cur.close()
        return rows[0] if len(rows) > 0 else None

    def get_ids(self, id_from: int, id_to: int) -> list[int]:
        cur = self.cursor()
        cur.execute("SELECT id FROM books WHERE id BETWEEN ? AND ? ORDER BY id", (id_from, id_to))
        ids = [int(row[0]) for row in cur.fetchall()]
        cur.close()
        return ids
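
For reference, a short usage sketch of the Database class as main.py and dl.py use it; the book id and field values below are made-up examples, not data from the site:

from fb import Database

db = Database()  # opens ~/.config/forgottenbooks/fb.sqlite3 and migrates the schema

# hypothetical example values, only to illustrate the call shapes
db.add_book(123456, 'Some Title', 'Some Author',
            meta=[['Pages', '300']],
            bc=['Category', 'Subcategory'],
            details={'Language': 'English'})

print(db.get_max_book_id())        # highest stored id, or None for an empty database
print(db.get_book(123456))         # the stored row as a tuple, or None
print(db.get_ids(100000, 200000))  # all stored ids in the range, ascending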

fb/util.py Normal file (14486 additions)

File diff suppressed because it is too large

main.py Executable file (179 additions)

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import requests
import threading
import random
import urllib3

from queue import Queue
from bs4 import BeautifulSoup
from fb import Database
from fb.util import get_fb_url, get_useragent
from argparse import ArgumentParser, ArgumentError
from sqlite3 import IntegrityError

db: Database

my_proxies = """
127.0.0.1:1077
127.0.0.1:1079
127.0.0.1:1074
127.0.0.1:1076
127.0.0.1:1071
127.0.0.1:1081
127.0.0.1:1069
"""
my_proxies = list(set(my_proxies.strip().split()))


class TooManyRequests(Exception):
    pass


def parse_book_page(book_id: int, proxy: str):
    headers = {
        'User-Agent': get_useragent()
    }
    url = get_fb_url(book_id)
    proxy = f'socks5://{proxy}'
    r = requests.get(url,
                     headers=headers,
                     proxies=dict(http=proxy, https=proxy))
    if r.status_code != 200:
        if r.status_code == 429:
            raise TooManyRequests()
        # print(f'{book_id} code {r.status_code}')
        return False

    html = BeautifulSoup(r.text, "html.parser")
    desc = html.select_one('meta[name="description"]')['content']

    # extract useful info from meta tags
    metainfo = []
    try:
        if desc.startswith('; '):
            desc = desc[2:]
        for item in desc.split('; '):
            colon_pos = item.index(':')
            key = item[0:colon_pos]
            val = item[colon_pos+2:]
            metainfo.append([key, val])
    except ValueError:
        metainfo.append(desc)

    # parse name and author
    name = html.select_one('div.overview h1').text
    author = html.select_one('div.overview h2 i').text

    # parse breadcrumbs hierarchy
    bc = html.select('ul.breadcrumb li.breadcrumb-item')
    bc_hierarchy = []
    bc_first_skipped = False
    if bc:
        for bc_item in bc:
            if not bc_first_skipped:
                bc_first_skipped = True
                continue
            bc_hierarchy.append(bc_item.text)

    # book info table
    details = {}
    rows = html.select('table[width="400"] tr')
    if rows:
        for row in rows:
            cols = row.select('td')
            details[cols[0].text] = cols[1].text

    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
    return True


def worker(task_queue, print_lock, proxy):
    while not task_queue.empty():
        book_id = task_queue.get()
        db_error = False
        result = None
        try:
            result = parse_book_page(book_id, proxy)
        except IntegrityError:
            db_error = True
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                urllib3.exceptions.ProtocolError,
                TooManyRequests):
            task_queue.put(book_id)
            db.add_failed_book(book_id)
            print(f'{book_id}: failed due to network error, proxy = {proxy}')
            continue
        except requests.exceptions.ChunkedEncodingError:
            print(f'{book_id} causes weird error')
            continue
        if result is not False:
            with print_lock:
                print(f"{book_id} " + ("done" if not db_error else "raised db error"))
        task_queue.task_done()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--book-id', type=int)
    parser.add_argument('--continue', action='store_true')
    parser.add_argument('--max-book-id', type=int, default=1500000)
    parser.add_argument('--find-gaps', action='store_true')
    args = parser.parse_args()

    db = Database()

    if args.find_gaps:
        id_from = 100000
        id_to = 1400000
        # use a set: membership is tested for every id in the range below
        ids_in_db = set(db.get_ids(id_from, id_to))

        task_queue = Queue()
        print_lock = threading.Lock()
        for i in range(id_from, id_to+1):
            if i not in ids_in_db:
                task_queue.put(i)

        threads = []
        for proxy in my_proxies:
            for i in range(4):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()

    elif getattr(args, 'continue'):
        if args.book_id:
            last_book_id = args.book_id
        else:
            last_book_id = db.get_max_book_id()
            if last_book_id is None:
                last_book_id = 0

        task_queue = Queue()
        print_lock = threading.Lock()
        for task_number in range(last_book_id + 1, args.max_book_id):
            task_queue.put(task_number)

        threads = []
        for proxy in my_proxies:
            for i in range(3):
                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()

    else:
        if not args.book_id:
            raise ArgumentError(None, '--book-id is required')
        proxy = random.choice(my_proxies)
        book = db.get_book(args.book_id)
        if book:
            raise RuntimeError('this book is already in the database')
        parse_book_page(args.book_id, proxy)
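
The least obvious step in parse_book_page is the meta-description parsing, so here is the same loop run standalone on a made-up description string; the real format served by forgottenbooks.com is not shown in this commit:

# A made-up description string, only to illustrate how the metainfo loop splits it.
desc = 'Author: John Smith; Published: 1913; Pages: 412'

metainfo = []
try:
    if desc.startswith('; '):
        desc = desc[2:]
    for item in desc.split('; '):
        colon_pos = item.index(':')  # raises ValueError if an item has no colon
        key = item[0:colon_pos]      # text before the colon
        val = item[colon_pos+2:]     # text after the colon and the space
        metainfo.append([key, val])
except ValueError:
    metainfo.append(desc)            # fall back to storing the raw string

print(metainfo)  # [['Author', 'John Smith'], ['Published', '1913'], ['Pages', '412']]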

requirements.txt Normal file (3 additions)

@@ -0,0 +1,3 @@
requests~=2.31.0
urllib3~=2.2.1
beautifulsoup4~=4.12.3