Evgeny Zinoviev 5fd7512f90 initial
2024-06-16 00:31:39 +03:00

180 lines
5.1 KiB
Python
Executable File

#!/usr/bin/env python3
import random
import threading
from argparse import ArgumentError, ArgumentParser
from queue import Empty, Queue
from sqlite3 import IntegrityError

import requests
import urllib3
from bs4 import BeautifulSoup

from fb import Database
from fb.util import get_fb_url, get_useragent
# Global database handle; assigned in the __main__ section below.
db: Database

# SOCKS5 proxy endpoints (host:port). split() tokenises on any whitespace,
# set() drops accidental duplicates before the worker fan-out.
my_proxies = list(set(
    """
    127.0.0.1:1077
    127.0.0.1:1079
    127.0.0.1:1074
    127.0.0.1:1076
    127.0.0.1:1071
    127.0.0.1:1081
    127.0.0.1:1069
    """.split()
))
class TooManyRequests(Exception):
    """Raised when the remote server answers HTTP 429 (rate limited)."""
def parse_book_page(book_id: int, proxy: str) -> bool:
    """Fetch one book page through a SOCKS5 proxy, parse it and store it.

    Stores the parsed record via the module-global ``db``.

    :param book_id: numeric id of the book page to fetch
    :param proxy: proxy endpoint as ``host:port``
    :return: True when the page was parsed and stored, False on a non-200
             response (other than 429)
    :raises TooManyRequests: when the server answers HTTP 429
    """
    headers = {
        'User-Agent': get_useragent()
    }
    url = get_fb_url(book_id)
    proxy = f'socks5://{proxy}'
    # Bugfix: the request had no timeout, so a single stalled proxy would
    # hang its worker thread forever.  The resulting Timeout exceptions are
    # handled by the caller as ordinary network errors.
    r = requests.get(url,
                     headers=headers,
                     proxies=dict(http=proxy, https=proxy),
                     timeout=30)
    if r.status_code != 200:
        if r.status_code == 429:
            raise TooManyRequests()
        # print(f'{book_id} code {r.status_code}')
        return False

    html = BeautifulSoup(r.text, "html.parser")
    desc = html.select_one('meta[name="description"]')['content']

    # extract useful info from the description meta tag, expected shape:
    # "key: value; key: value; ..."
    metainfo = []
    try:
        if desc.startswith('; '):
            desc = desc[2:]
        for item in desc.split('; '):
            colon_pos = item.index(':')
            key = item[0:colon_pos]
            val = item[colon_pos+2:]
            metainfo.append([key, val])
    except ValueError:
        # description did not follow the "key: value" shape — keep it raw
        metainfo.append(desc)

    # parse name and author
    name = html.select_one('div.overview h1').text
    author = html.select_one('div.overview h2 i').text

    # parse breadcrumbs hierarchy, skipping the first (root) crumb
    bc = html.select('ul.breadcrumb li.breadcrumb-item')
    bc_hierarchy = [bc_item.text for bc_item in bc[1:]]

    # book info table (two-column rows: label / value).
    # Bugfix: the loop variable used to be named `r`, shadowing the
    # response object above.
    details = {}
    for row in html.select('table[width="400"] tr'):
        cols = row.select('td')
        details[cols[0].text] = cols[1].text

    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
    return True
def worker(task_queue, print_lock, proxy):
    """Drain book ids from ``task_queue``, parsing each through ``proxy``.

    Runs until the queue is exhausted.  Transient network failures re-queue
    the id so another worker (possibly on a healthier proxy) retries it.

    :param task_queue: queue.Queue of int book ids
    :param print_lock: threading.Lock serialising progress output
    :param proxy: proxy endpoint as ``host:port``
    """
    while True:
        # Bugfix: `while not empty(): get()` was racy — with one item left,
        # two threads could both pass the empty() check and the loser would
        # block forever on the blocking get().  get_nowait() makes queue
        # exhaustion an explicit, safe exit condition.
        try:
            book_id = task_queue.get_nowait()
        except Empty:
            break

        db_error = False
        result = None
        try:
            result = parse_book_page(book_id, proxy)
        except IntegrityError:
            db_error = True
        # Timeout covers both ConnectTimeout (as before) and ReadTimeout,
        # so a mid-response stall no longer kills the thread.
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
                urllib3.exceptions.ProtocolError,
                TooManyRequests):
            task_queue.put(book_id)
            db.add_failed_book(book_id)
            print(f'{book_id}: failed due to network error, proxy = {proxy}')
            continue
        except requests.exceptions.ChunkedEncodingError:
            print(f'{book_id} causes weird error')
            continue

        if result is not False:
            with print_lock:
                print(f"{book_id} " + ("done" if not db_error else " raised db error"))
        task_queue.task_done()
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--book-id', type=int)
    parser.add_argument('--continue', action='store_true')
    parser.add_argument('--max-book-id', type=int, default=1500000)
    parser.add_argument('--find-gaps', action='store_true')
    args = parser.parse_args()

    db = Database()

    def spawn_and_join(task_queue, print_lock, threads_per_proxy):
        # Helper deduplicating the two previously copy-pasted spawn/join
        # sections (they differed only in threads-per-proxy: 4 vs 3).
        threads = []
        for proxy in my_proxies:
            for _ in range(threads_per_proxy):
                thread = threading.Thread(target=worker,
                                          args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()

    if args.find_gaps:
        # Re-fetch ids missing from the database within a fixed window.
        id_from = 100000
        id_to = 1400000
        ids_in_db = db.get_ids(id_from, id_to)
        task_queue = Queue()
        print_lock = threading.Lock()
        for i in range(id_from, id_to + 1):
            if i not in ids_in_db:
                task_queue.put(i)
        spawn_and_join(task_queue, print_lock, 4)

    # argparse always sets the attribute for a declared option, so the old
    # hasattr() check was always true; getattr is needed only because
    # `continue` is a Python keyword.
    elif getattr(args, 'continue'):
        # Resume scraping from the given (or highest stored) book id.
        last_book_id = args.book_id if args.book_id else db.get_max_book_id()
        if last_book_id is None:
            last_book_id = 0
        task_queue = Queue()
        print_lock = threading.Lock()
        for task_number in range(last_book_id + 1, args.max_book_id):
            task_queue.put(task_number)
        spawn_and_join(task_queue, print_lock, 3)

    else:
        # Single-book mode.
        if not args.book_id:
            raise ArgumentError(None, '--book-id is required')
        proxy = random.choice(my_proxies)
        if db.get_book(args.book_id):
            raise RuntimeError('this book is already in the database')
        parse_book_page(args.book_id, proxy)