180 lines
5.1 KiB
Python
Executable File
180 lines
5.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import requests
|
|
import threading
|
|
import random
|
|
import urllib3
|
|
|
|
from queue import Queue
|
|
from bs4 import BeautifulSoup
|
|
from fb import Database
|
|
from fb.util import get_fb_url, get_useragent
|
|
from argparse import ArgumentParser, ArgumentError
|
|
from sqlite3 import IntegrityError
|
|
|
|
db: Database
|
|
|
|
my_proxies = """
|
|
127.0.0.1:1077
|
|
127.0.0.1:1079
|
|
127.0.0.1:1074
|
|
127.0.0.1:1076
|
|
127.0.0.1:1071
|
|
127.0.0.1:1081
|
|
127.0.0.1:1069
|
|
"""
|
|
my_proxies = list(set(my_proxies.strip().split()))
|
|
|
|
|
|
class TooManyRequests(Exception):
|
|
pass
|
|
|
|
|
|
def parse_book_page(book_id: int, proxy: str):
|
|
headers = {
|
|
'User-Agent': get_useragent()
|
|
}
|
|
|
|
url = get_fb_url(book_id)
|
|
proxy = f'socks5://{proxy}'
|
|
r = requests.get(url,
|
|
headers=headers,
|
|
proxies=dict(http=proxy, https=proxy))
|
|
if r.status_code != 200:
|
|
if r.status_code == 429:
|
|
raise TooManyRequests()
|
|
|
|
# print(f'{book_id} code {r.status_code}')
|
|
return False
|
|
|
|
html = BeautifulSoup(r.text, "html.parser")
|
|
desc = html.select_one('meta[name="description"]')['content']
|
|
|
|
# extract useful info from meta tags
|
|
metainfo = []
|
|
try:
|
|
if desc.startswith('; '):
|
|
desc = desc[2:]
|
|
for item in desc.split('; '):
|
|
colon_pos = item.index(':')
|
|
key = item[0:colon_pos]
|
|
val = item[colon_pos+2:]
|
|
metainfo.append([key, val])
|
|
except ValueError:
|
|
metainfo.append(desc)
|
|
pass
|
|
|
|
# parse name and author
|
|
name = html.select_one('div.overview h1').text
|
|
author = html.select_one('div.overview h2 i').text
|
|
|
|
# parse breadcrumbs hierarchy
|
|
bc = html.select('ul.breadcrumb li.breadcrumb-item')
|
|
bc_hierarchy = []
|
|
bc_first_skipped = False
|
|
if bc:
|
|
for bc_item in bc:
|
|
if not bc_first_skipped:
|
|
bc_first_skipped = True
|
|
continue
|
|
bc_hierarchy.append(bc_item.text)
|
|
|
|
# book info table
|
|
details = {}
|
|
rows = html.select('table[width="400"] tr')
|
|
if rows:
|
|
for r in rows:
|
|
cols = r.select('td')
|
|
details[cols[0].text] = cols[1].text
|
|
|
|
db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
|
|
return True
|
|
|
|
|
|
def worker(task_queue, print_lock, proxy):
|
|
while not task_queue.empty():
|
|
book_id = task_queue.get()
|
|
db_error = False
|
|
result = None
|
|
|
|
try:
|
|
result = parse_book_page(book_id, proxy)
|
|
except IntegrityError:
|
|
db_error = True
|
|
except (requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, urllib3.exceptions.ProtocolError, TooManyRequests):
|
|
task_queue.put(book_id)
|
|
db.add_failed_book(book_id)
|
|
print(f'{book_id}: failed due to network error, proxy = {proxy}')
|
|
continue
|
|
except requests.exceptions.ChunkedEncodingError:
|
|
print(f'{book_id} causes weird error')
|
|
continue
|
|
|
|
if result is not False:
|
|
with print_lock:
|
|
print(f"{book_id} " + ("done" if not db_error else " raised db error"))
|
|
task_queue.task_done()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = ArgumentParser()
|
|
parser.add_argument('--book-id', type=int)
|
|
parser.add_argument('--continue', action='store_true')
|
|
parser.add_argument('--max-book-id', type=int, default=1500000)
|
|
parser.add_argument('--find-gaps', action='store_true')
|
|
args = parser.parse_args()
|
|
|
|
db = Database()
|
|
|
|
if args.find_gaps:
|
|
id_from = 100000
|
|
id_to = 1400000
|
|
ids_in_db = db.get_ids(id_from, id_to)
|
|
task_queue = Queue()
|
|
print_lock = threading.Lock()
|
|
|
|
for i in range(id_from, id_to+1):
|
|
if i not in ids_in_db:
|
|
task_queue.put(i)
|
|
|
|
threads = []
|
|
for proxy in my_proxies:
|
|
for i in range(4):
|
|
thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
|
|
thread.start()
|
|
threads.append(thread)
|
|
|
|
for thread in threads:
|
|
thread.join()
|
|
|
|
elif hasattr(args, 'continue') and getattr(args, 'continue') is True:
|
|
if args.book_id:
|
|
last_book_id = args.book_id
|
|
else:
|
|
last_book_id = db.get_max_book_id()
|
|
if last_book_id is None:
|
|
last_book_id = 0
|
|
|
|
task_queue = Queue()
|
|
print_lock = threading.Lock()
|
|
|
|
for task_number in range(last_book_id + 1, args.max_book_id):
|
|
task_queue.put(task_number)
|
|
|
|
threads = []
|
|
for proxy in my_proxies:
|
|
for i in range(3):
|
|
thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
|
|
thread.start()
|
|
threads.append(thread)
|
|
|
|
for thread in threads:
|
|
thread.join()
|
|
else:
|
|
if not args.book_id:
|
|
raise ArgumentError(None, '--book-id is required')
|
|
proxy = random.choice(my_proxies)
|
|
book = db.get_book(args.book_id)
|
|
if book:
|
|
raise RuntimeError('this book is already in the database')
|
|
parse_book_page(args.book_id, proxy)
|