This commit is contained in:
Evgeny Zinoviev 2021-04-20 00:42:06 +03:00
parent a00b0b2cbf
commit 36f59a3a84
4 changed files with 114 additions and 27 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
venv venv
.idea .idea
output.csv output.csv
__pycache__

View File

@ -1,4 +1,4 @@
import requests, textract, re, os, tempfile, random, string, csv import requests, textract, re, os, tempfile, random, string
from argparse import ArgumentParser from argparse import ArgumentParser
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -13,21 +13,24 @@ regex = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|
def strgen(n: int) -> str: def strgen(n: int) -> str:
return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(n)).lower() return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(n)).lower()
def get_links(s: str) -> list[str]:
    """Return the unique matches of the module-level ``regex`` found in *s*.

    Matches are returned in order of first appearance. De-duplicating through
    a ``set`` would yield an arbitrary, run-dependent order, which makes any
    output built from this list (e.g. a newline-joined CSV cell) unstable
    between runs.
    """
    # dict preserves insertion order, so this is an order-keeping de-dupe
    return list(dict.fromkeys(re.findall(regex, s)))
class MosGorSud:
def __init__(self, url: str, upto_pages: int, output: str):
self.url = url
self.upto_pages = upto_pages
self.output = output
def go(self): class MGSPiracy:
f = open(self.output, 'w', newline='') BASE_URL = "https://mos-gorsud.ru/mgs/defend"
csv_writer = csv.writer(f)
for page in range(1, self.upto_pages): def __init__(self, from_page: int, to_page: int):
self.from_page = from_page
self.to_page = to_page
def get_cases(self) -> list[dict]:
cases = []
for page in range(self.from_page, self.to_page+1):
print(f'page {page}') print(f'page {page}')
url = self.url + '?page=' + str(page) url = self.BASE_URL + '?page=' + str(page)
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser") soup = BeautifulSoup(r.text, "html.parser")
@ -39,19 +42,25 @@ class MosGorSud:
date = cols[0].get_text().strip() date = cols[0].get_text().strip()
statement_number = cols[1].get_text().strip() statement_number = cols[1].get_text().strip()
applicant = cols[3].get_text().strip() applicant = cols[3].get_text().strip()
objects = cols[4].get_text().strip() object = cols[4].get_text().strip()
link = self.full_link(cols[5].find('a')['href']) link = self.mgs_url(cols[5].find('a')['href'])
text = mgs.get_document_text(link) decision_text = self.get_document_text(link)
links = '\n'.join(mgs.get_links(text)) violation_links = '\n'.join(get_links(decision_text))
# print(f'date={date}, stmt_number={statement_number}, applicant={applicant} objects={objects} link={link}') cases.append(dict(
date=date,
statement_number=statement_number,
applicant=applicant,
object=object,
doc_link=link,
violation_links=violation_links,
decision_text=decision_text
))
csv_writer.writerow((date, statement_number, applicant, objects, link, links)) return cases
f.close() def mgs_url(self, url: str) -> str:
def full_link(self, url: str) -> str:
if not url.startswith('http:') and not url.startswith('https:'): if not url.startswith('http:') and not url.startswith('https:'):
if not url.startswith('/'): if not url.startswith('/'):
url = '/' + url url = '/' + url
@ -75,9 +84,6 @@ class MosGorSud:
return text return text
def get_links(self, s: str) -> list[str]:
return list(set(re.findall(regex, s)))
if __name__ == '__main__': if __name__ == '__main__':
argp = ArgumentParser() argp = ArgumentParser()
@ -87,7 +93,7 @@ if __name__ == '__main__':
help='Last page to parse') help='Last page to parse')
args = argp.parse_args() args = argp.parse_args()
mgs = MosGorSud(url="https://mos-gorsud.ru/mgs/defend", mgs = MGSPiracy(url="https://mos-gorsud.ru/mgs/defend",
upto_pages=args.upto_pages, to_page=args.upto_pages,
output=args.output) output=args.output)
mgs.go() mgs.get_cases()

50
telegram_notify.py Normal file
View File

@ -0,0 +1,50 @@
import requests
from mgs import MGSPiracy
from argparse import ArgumentParser
from jstate import JState
def find_new_matches(cases: list[dict], seen: list, domains: list[str]) -> list[str]:
    """Collect one notification line per not-yet-seen case mentioning a domain.

    :param cases: case dicts; ``statement_number`` and ``decision_text`` are read
    :param seen: statement numbers already reported; matched numbers are
        appended to this list in place so the caller can persist it
    :param domains: substrings to look for in each case's decision text
    :return: lines of the form ``"<domain> found in <statement_number>"``
    """
    results = []
    for case in cases:
        number = case['statement_number']
        if number in seen:
            continue

        # report a case once, for the first domain that matches
        for mydomain in domains:
            if mydomain in case['decision_text']:
                results.append('%s found in %s' % (mydomain, number))
                seen.append(number)
                break
    return results


def build_message(results: list[str]) -> str:
    """Return the Telegram message text: one header line, then one finding per line."""
    # The header is prepended once; using it as the join separator would drop
    # it for a single finding and repeat it between multiple findings.
    return 'new mos-gorsud findings:\n' + '\n'.join(results)


if __name__ == '__main__':
    # parse arguments
    parser = ArgumentParser()
    parser.add_argument('--state-file', required=True)
    parser.add_argument('--token', help='Telegram bot token', required=True)
    parser.add_argument('--chat-id', type=int, help='Telegram chat id (with bot)', required=True)
    parser.add_argument('--from', type=int, default=1, help='First page', dest='_from')
    parser.add_argument('--to', type=int, default=5, help='Last page')
    parser.add_argument('--domains', nargs='+', required=True)
    args = parser.parse_args()

    # get recent cases
    mgs = MGSPiracy(from_page=args._from, to_page=args.to)
    cases = mgs.get_cases()

    # read state
    jst = JState(args.state_file, default=dict(cases=[]))
    data = jst.read()

    # loop through cases; data['cases'] is extended in place with new matches
    results = find_new_matches(cases, data['cases'], args.domains)

    # remember found cases
    jst.write(data)

    # if found anything, send to telegram
    if results:
        requests.post('https://api.telegram.org/bot%s/sendMessage' % args.token, data={
            'chat_id': args.chat_id,
            'text': build_message(results),
        })

30
to_csv.py Normal file
View File

@ -0,0 +1,30 @@
import csv
from mgs import MGSPiracy
from argparse import ArgumentParser
# keys of a case dict that are exported, in CSV column order
CSV_FIELDS = ('date', 'statement_number', 'applicant', 'object', 'doc_link', 'violation_links')


def write_cases(cases, stream) -> None:
    """Write one CSV row per case to *stream*.

    :param cases: iterable of case dicts containing every key in ``CSV_FIELDS``
    :param stream: writable text file object (open real files with ``newline=''``)
    """
    csv_writer = csv.writer(stream)
    csv_writer.writerows(tuple(case[field] for field in CSV_FIELDS) for case in cases)


if __name__ == '__main__':
    # parse arguments
    argp = ArgumentParser()
    argp.add_argument('--output', type=str, default='output.csv', help='CSV output file name')
    # NOTE(review): the default first page is 0 here but 1 in telegram_notify.py -
    # confirm which one the site expects before aligning them.
    argp.add_argument('--from', type=int, default=0, help='First page', dest='_from')
    argp.add_argument('--to', type=int, default=10, help='Last page')
    args = argp.parse_args()

    # get cases
    mgs = MGSPiracy(from_page=args._from, to_page=args.to)
    cases = mgs.get_cases()

    # write to csv; the context manager closes the file even if a row fails
    with open(args.output, 'w', newline='') as f:
        write_cases(cases, f)