261 lines
6.4 KiB
Python
261 lines
6.4 KiB
Python
import os
|
||
import json
|
||
import re
|
||
import datetime
|
||
import time
|
||
from util import split_sen, rot_ru
|
||
|
||
# Directory containing this module (symlinks resolved via realpath);
# load_data() resolves data.json relative to it.
CWD = os.path.dirname(os.path.realpath(__file__))
|
||
|
||
def _data_sort_len(i):
    """Sort key: the length of the record's ``'text'`` value."""
    text = i['text']
    return len(text)
|
||
|
||
def _data_sort_date(i):
    """Sort key: the record's ``'date'`` value as whole seconds.

    The date string is parsed with the ``%d/%m/%y`` format and converted
    through ``time.mktime``, i.e. it is interpreted as local time.
    """
    parsed = datetime.datetime.strptime(i['date'], '%d/%m/%y')
    seconds = time.mktime(parsed.timetuple())
    return int(seconds)
|
||
|
||
# sort: 'len', 'date'
def load_data(sort='len', sort_reverse=False, date=None, type=None):
    """Load, normalise, sort and optionally filter the records of ``data.json``.

    ``data.json`` is read from the directory stored in ``CWD``.  Every record
    ends up with a string ``'type'`` (``'1'`` when the field is missing), and
    records whose ``'text'`` is empty are dropped as placeholders.

    Args:
        sort: ``'len'`` to order by text length, ``'date'`` to order by the
            record's ``'date'`` field.  Sorting by date reads ``'date'`` from
            every remaining record, so each of them must carry that field.
        sort_reverse: Sort in descending order when true.
        date: If truthy, keep only records whose ``'date'`` equals this value.
        type: If truthy, keep only records whose ``'type'`` equals
            ``str(type)``.

    Returns:
        A list of record dicts.

    Raises:
        ValueError: If ``sort`` is neither ``'len'`` nor ``'date'``.
    """
    # Resolve the sort key first so a bad ``sort`` fails before any file I/O.
    if sort == 'len':
        sort_key = _data_sort_len
    elif sort == 'date':
        sort_key = _data_sort_date
    else:
        # ValueError replaces a reference to the undefined name ``Error``,
        # which surfaced as a NameError instead of this message.
        raise ValueError("Unknown sort type " + str(sort))

    # Explicit UTF-8: the default encoding is platform dependent.
    with open(os.path.join(CWD, "data.json"), encoding="utf-8") as f:
        data = json.load(f)

    # stringify all types; records without a type count as type '1'
    for item in data:
        item['type'] = str(item.get('type', '1'))

    # ignore placeholders
    data = [item for item in data if item['text'] != '']

    # sort
    data = sorted(data, key=sort_key, reverse=sort_reverse)

    # filter by date
    if date:
        data = [item for item in data if 'date' in item and item['date'] == date]

    # filter by type
    if type:
        wanted = str(type)
        data = [item for item in data if item['type'] == wanted]

    return data
|
||
|
||
def clean_string(s, remove_junk=False):
    """Normalise a raw message before its words are decoded.

    The text gets a space after every ``)`` and after ``!``, ``.``, ``]`` or
    ``,`` (unless a ``)`` follows), is upper-cased, has ``/`` turned into a
    space and runs of whitespace collapsed.  A few multi-word names are then
    glued together with underscores so they count as a single word.

    Args:
        s: The raw text.
        remove_junk: When true, drop the known filler words and then delete
            the known filler phrases before the names are glued.

    Returns:
        The cleaned, upper-cased, single-spaced string.
    """
    s = s.replace(')', ') ')
    s = re.sub(r'(\!|\.|\]|\,)([^\)])', r'\1 \2', s)
    s = s.upper().replace('/', ' ')
    s = re.sub(r'\s+', ' ', s).strip()

    # Filler vocabulary.  The order matters for the multi-word entries,
    # which are removed one after another by plain substring replacement.
    junks = (
        'ВОЕННОЕ',
        'ВЫШЕСТОЯЩИХ',
        'ПРАВО',
        'ПРАВИЛАМ ВОЙНЫ',
        'ВЫПИСКА',
        'КОНТРОЛЬ',
        'ИХ',
        'ПО',
        'НАВЫКИ',
        'С ВЫШЕСТОЯЩИМИ',
        #'ПРИСУТСТВИЕ',
        #'ЛИНЕЙНО',
        'ИНСТРУКЦИИ',
        'ЗАКОННО',
        'ПОХЛЕБКА',
        'СВЯЗЕЙ',
        'ЖУЮЩЕГО ХРЯЩИ',
        'ИНДЕКСИРОВАН БЕЗУКОРИЗНЕНННО',
        'ИНДЕКСИРОВАН БЕЗУКОРИЗНЕННО',
        'ОТКЛАДЫВАЕТСЯ ЛИНЕЙНО',
        'УСТАЛИ СМОТРЯЩИХ',
        '- ЕГО ВЕЛИЧЕСТВО',
        'ГУБЕРНИЯ',
        'С ВЫШЕСТОЯЩИМИ КОНТРОЛЬ',
        'С ЛОКАЦИИ',
        'КАЗНЬ ВЫШЕСТОЯЩ',
        #'КАЗНЬ',
        'ГУБЕРНИЯ',
        'ПРОВЕРКИ',
        'УСТАНОВЛЕНО',
        'ПОБЕДИТЕЛЕМ',
        #'СТАЛЬНЫЕ',
        'НЕРВЫ',
        'ДАРОВАНО',
        #'ТРАНСПОРТИРОВКА',
        'ОДОБРЕНО',
        'ПРОЯВЛЕНИЯ',
        'УЗАКОНЕНО',
        'ИМЕЕТСЯ',
        'ЗНАЛ',
        'НЕ ПРИМЕЧЕНО',
        'НА СЕВЕР',
        'ПРИГОВОРИТЬ',
        'ШЕСТВУЕМ',
        'ДАГОН',
        'ДА МЕРЗНУЩИЙ',
        'КОФЕ',
        #'РЕАГИРОВАНИЕ',
        'УКАЗАНО',
        '- ВЫСОКИЙ ТИТУЛ',
        'ЗАКАЗ',
        'ЧЕРТЫ ЛИЦА',

        # english
        'SCHOOL ON THE RIGHT',
        'WILL NOT ALLOW',
        'FLYWHEEL',
        'TRIUMPHANTLY',
        'BEING USED',
        'NICE',
        'UMBRELLA',
        #'BIOROBOT',
        'CONSERVATISM',
        'WAS ESTABLISHED',
        'WITH A PASSWORD',
        'ANT',
        'YEAR',
        'RECOGNIZED',
        'SEARCHED',
        #'LEGAL',
        #'FIGHTING'
    )

    # entries without spaces: matched against whole tokens
    junks_words = [entry for entry in junks if ' ' not in entry]

    # entries with spaces: removed as substrings
    junks_nwords = [entry for entry in junks if entry not in junks_words]

    if remove_junk:
        kept = []
        for token in s.split(' '):
            # Compare the token with every '.' and '!' and a trailing ','
            # stripped, but keep the token itself untouched when it stays.
            bare = re.sub(r'\.|\!|,$', '', token)
            if bare not in junks_words:
                kept.append(token)
        s = ' '.join(kept)

        for phrase in junks_nwords:
            s = s.replace(phrase, '')

    # Glue multi-word names so that they are treated as one word later on.
    glued_names = (
        ('Х О Р Т И Ц А', 'Х_О_Р_Т_И_Ц_А'),
        ('ЯРОСЛАВСКАЯ ГУБЕРНИЯ', 'ЯРОСЛАВСКАЯ_ГУБЕРНИЯ'),
        ('ЩУКА В МЕШКЕ', 'ЩУКА_В_МЕШКЕ'),
        ('Ъ - ВЕЛИЧЕСТВЕННО', 'Ъ_-_ВЕЛИЧЕСТВЕННО'),
    )
    for spaced, joined in glued_names:
        s = s.replace(spaced, joined)

    return re.sub(r'\s+', ' ', s).strip()
|
||
|
||
def decode(s, is_url=False):
    """Build the hidden string from the first character of each word.

    Words are obtained by splitting on single spaces.  Empty words and a
    lone ``!`` are skipped.  A word that starts with a digit is copied whole.
    When ``is_url`` is true, a word ending in ``://`` contributes its first
    character followed by ``://``.  Any other word contributes only its
    first character.

    Args:
        s: The (already cleaned) text.
        is_url: Enable the special handling of ``scheme://`` words.

    Returns:
        The concatenated pieces as one string.
    """
    pieces = []
    for raw in s.split(' '):
        token = raw.strip()
        if token in ('', '!'):
            continue

        if re.match(r'^\d+', token):
            pieces.append(token)
        elif is_url and token.endswith('://'):
            pieces.append(token[0] + '://')
        else:
            pieces.append(token[0])

    return ''.join(pieces)
|
||
|
||
def decode2(s):
    """Build the hidden string from the second word of every sentence.

    The text is split into sentences on runs of ``?``, ``.`` and ``!``.  Each
    sentence contributes the first character of its second word.

    Empty tokens produced by repeated spaces are ignored, and a sentence
    without a second word contributes nothing.  Both cases used to raise
    ``IndexError``.

    Args:
        s: The text to decode.

    Returns:
        The collected characters as one string.
    """
    letters = []
    for sentence in re.split(r'[\?\.\!]+', s):
        # Split on single spaces as before, but drop the empty tokens that
        # doubled spaces produce so index 1 is always a real word.
        words = [word for word in sentence.strip().split(' ') if word]
        if len(words) < 2:
            continue
        letters.append(words[1][0])

    return ''.join(letters)
|
||
|
||
def decode3(s):
    """Decode one letter per sentence using positional probes.

    The text is split into sentences on runs of ``?``, ``.`` and ``!``.  Each
    sentence has spaces and hyphens removed and is upper-cased, then the
    first matching rule decides the output letter:

    1. the first character, if it is ``Ш``, ``Щ`` or ``И``;
    2. otherwise the first ``(index, letter)`` probe that matches, tried in
       this order: 4 ``Й``, 7 ``М``, 4 ``А``, 2 ``Р``, 1 ``У``, 9 ``Ю``;
    3. otherwise ``?``.

    A leftover debugging ``print`` followed by an unconditional ``continue``
    previously skipped all of these rules, so the result was always empty;
    that has been removed.  Probes beyond the end of a short sentence now
    simply fail to match instead of raising ``IndexError``.

    Args:
        s: The text to decode.

    Returns:
        One character per non-empty sentence, concatenated.
    """
    # (index, letter) probes tried in order once the first-letter rule fails.
    probes = ((4, 'Й'), (7, 'М'), (4, 'А'), (2, 'Р'), (1, 'У'), (9, 'Ю'))

    letters = []
    for sentence in re.split(r'[\?\.\!]+', s):
        compact = sentence.strip().replace(' ', '').replace('-', '')
        if not compact:
            continue
        compact = compact.upper()

        if compact[0] in ('Ш', 'Щ', 'И'):
            letters.append(compact[0])
            continue

        for index, expected in probes:
            # Slicing yields '' past the end, so short sentences are safe.
            if compact[index:index + 1] == expected:
                letters.append(expected)
                break
        else:
            letters.append('?')

    return ''.join(letters)
|
||
|
||
|
||
def decode_2char_rot_minus3(s):
    """Collect the second character of every sentence, then rotate the result.

    ``split_sen`` breaks the text into sentences.  Spaces and apostrophes
    are removed from each one.  A sentence that consists only of digits once
    ``.``, ``!``, ``?``, ``\\`` and ``/`` are ignored is copied whole; any
    other sentence contributes its second character.

    The collected string is handed to ``rot_ru(..., return_list=True)`` and
    the second-to-last element of the returned list is the result.
    NOTE(review): presumably that element is the rotation by -3 the function
    name refers to -- confirm against ``util.rot_ru``.

    Args:
        s: The text to decode.

    Returns:
        The selected rotation of the collected characters.
    """
    collected = []
    for sentence in split_sen(s):
        compact = sentence.replace(' ', '').replace("'", '')
        without_punct = re.sub(r'[\.\!\?\\/]', '', compact)
        if without_punct.isdigit():
            collected.append(compact)
        else:
            collected.append(compact[1])

    rotations = rot_ru(''.join(collected), return_list=True)
    return rotations[len(rotations) - 2]
|
||
|
||
|
||
|
||
|
||
# s: source
# t: type
def decode_auto(s, t, reverse_decoded=False, remove_junk=True):
    """Decode ``s`` with the decoder selected by the message type ``t``.

    Args:
        s: The source text.
        t: The message type: ``'1'`` (clean the text, then take first
            letters), ``'2'``, ``'3'`` or ``'2char_rot-3'``.  The value is
            passed through ``str()`` first, so integer types are accepted.
        reverse_decoded: Reverse the decoded string before returning it.
        remove_junk: Forwarded to ``clean_string``; only used for type
            ``'1'``.

    Returns:
        The decoded string.

    Raises:
        ValueError: If ``t`` is not a known type.  This used to surface as
            an ``UnboundLocalError`` because no result was ever assigned.
    """
    t = str(t)

    if t == '1':
        result = decode(clean_string(s, remove_junk=remove_junk))
    elif t == '2':
        result = decode2(s)
    elif t == '3':
        result = decode3(s)
    elif t == '2char_rot-3':
        result = decode_2char_rot_minus3(s)
    else:
        raise ValueError("Unknown decode type " + t)

    if reverse_decoded:
        # reverse string
        result = result[::-1]

    return result
|