507 lines
13 KiB
Python
507 lines
13 KiB
Python
import re, sys
|
||
from pprint import pprint
|
||
import operator
|
||
from termcolor import cprint
|
||
import itertools
|
||
|
||
# The 33 lowercase letters of the Russian alphabet, in alphabetical order.
RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# The ten Russian vowel letters ("glas" presumably abbreviates the
# transliterated Russian word for "vowel").
GLAS_LETTERS = 'аеёиоуыэюя'
# The remaining 23 letters: the consonants plus the hard and soft signs.
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'

# The 26 lowercase letters of the English alphabet.
ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
|
||
|
||
# Reference frequency of every Russian letter, expressed as a fraction
# (the 33 values add up to roughly 1).  Entries are listed from the most
# to the least frequent letter.  Read by analyze_letters_frequency_type4
# as the default reference table.
RU_FREQ = {
    'о': 0.10983,
    'е': 0.08483,
    'а': 0.07998,
    'и': 0.07367,
    'н': 0.067,
    'т': 0.06318,
    'с': 0.05473,
    'р': 0.04746,
    'в': 0.04533,
    'л': 0.04343,
    'к': 0.03486,
    'м': 0.03203,
    'д': 0.02977,
    'п': 0.02804,
    'у': 0.02615,
    'я': 0.02001,
    'ы': 0.01898,
    'ь': 0.01735,
    'г': 0.01687,
    'з': 0.01641,
    'б': 0.01592,
    'ч': 0.0145,
    'й': 0.01208,
    'х': 0.00966,
    'ж': 0.0094,
    'ш': 0.00718,
    'ю': 0.00639,
    'ц': 0.00486,
    'щ': 0.00361,
    'э': 0.00331,
    'ф': 0.00267,
    'ъ': 0.00037,
    'ё': 0.00013
}
|
||
|
||
# Reference frequency of every English letter, written as a percentage for
# readability.  Keys are UPPERCASE letters.
# NOTE(review): the original table had no 'Q' entry and its 25 values added
# up to 99.8 %; 'Q': 0.2 completes it to 100 % - confirm against the table
# this data was copied from.
_ENG_FREQ_PERCENT = {
    'A': 8.1,
    'B': 1.4,
    'C': 2.7,
    'D': 3.9,
    'E': 13.0,
    'F': 2.9,
    'G': 2.0,
    'H': 5.2,
    'I': 6.5,
    'J': 0.2,
    'K': 0.4,
    'L': 3.4,
    'M': 2.5,
    'N': 7.2,
    'O': 7.9,
    'P': 2.0,
    'Q': 0.2,
    'R': 6.9,
    'S': 6.1,
    'T': 10.5,
    'U': 2.4,
    'V': 0.9,
    'W': 1.5,
    'X': 0.2,
    'Y': 1.9,
    'Z': 0.1,
}

# Same table as fractions (0..1), the unit RU_FREQ uses.  Built as a new
# dict instead of rewriting the values of a dict while iterating over it.
ENG_FREQ = {
    letter: percent / 100
    for letter, percent in _ENG_FREQ_PERCENT.items()
}
|
||
|
||
|
||
# Lowercase Russian words and surnames.
# NOTE(review): presumably the candidate list handed to bf_find_words; no
# use of this constant is visible in this part of the file - confirm.
BF_NAMES = [
    'марширующе',
    'марширующий',
    'свободин',
    'мовсаев',
    'щиголев',
    'щиголёв',
]
|
||
|
||
# Lowercase word list; the trailing comment on each entry is its English
# gloss.  Lines that are commented out are prefixes/suffixes ("c-", "-nyth",
# ...) and are therefore not part of the list.
# NOTE(review): no use of this constant is visible in this part of the file.
CTHULHU_DICT = [
    "ah",  # generic action, e.g. greet, eat, do
    "'ai",  # speak / call
    "athg",  # sign (contract) / agree to
    "'bthnk",  # body / essence
    "bug",  # go
    # c- (prefix): we / our
    "ch'",  # cross over / travel
    "chtenff",  # brotherhood / society
    "ebumna",  # pit
    "ee",  # answers
    "ehye",  # cohesion / integrity
    "ep",  # after; with
    # NOTE(review): "hai" is listed twice (here and further down as "now").
    # This entry together with the dangling "with" in the gloss of "ep"
    # looks like one source line that was split in two - confirm.
    "hai",  # later / then
    # f'- (prefix): they / their
    "'fhalma",  # mother
    "fhtagn",  # wait / sleep
    "fm'latgh",  # burn
    "ftaghu",  # skin / boundary
    "geb",  # here
    "gnaiih",  # father
    "gof'nn",  # children
    "goka",  # grant
    "gotha",  # wish
    "grah'n",  # lost one / larva
    # h'- (prefix): it / its
    "hafh'drn",  # priest / summoner
    "hai",  # now
    "hlirgh",  # heretic
    "hrii",  # followers
    "hupadgh",  # born of
    "ilyaa",  # expect / await
    "k'yarnak",  # share / exchange
    "kadishtu",  # understand / know
    "kn'a",  # question
    "li'hee",  # on pain of
    "llll",  # at / beside
    "lloig",  # mind / psyche
    "lw'nafh",  # dream / transmit
    "mg",  # (conjunction) yet
    "mnahn'",  # worthless
    "n'gha",  # death
    "n'ghft",  # darkness
    # na- (prefix): (contraction of nafl-)
    # nafl- (prefix): not / (not-present tense)
    # ng- (prefix): (conjunction) and / then
    "nglui",  # threshold
    "nilgh'ri",  # anything / everything
    # nnn- (prefix): watch / protect
    "nog",  # come
    "nw",  # head / place
    # -nyth (suffix): servant of
    # -og (suffix): (emphatic)
    "ooboshu",  # visit
    # -or (suffix): force from / aspect of
    "orr'e",  # soul / spirit
    # -oth (suffix): native of
    # ph'- (prefix): over / beyond
    "phlegeth",  # realm of information
    "r'luh",  # secret / hidden
    "ron",  # religion / cult
    "s'uhn",  # pact
    "sgn'wahl",  # share space
    "shagg",  # realm of dreams
    "shogg",  # realm of darkness
    "shtunggli",  # notify / contact
    "shugg",  # realm of Earth
    "sll'ha",  # invite
    "stell'bsna",  # ask / pray for
    "syha'h",  # eternity
    "tharanak",  # promise / bring
    "throd",  # tremble
    "uaaah",  # (finish spell)
    "uh'e",  # people / crowd
    "uln",  # call / summon
    "vulgtlagln",  # pray to
    "vulgtm",  # prayer
    "wgah'n",  # reside in / control
    "y'hah",  # amen
    # y- (prefix): I / my
    "ya",  # I
    # -yar (suffix): time of / moment
    "zhro",  # (lift spell)
]
|
||
|
||
def plural(n, words):
    """Pick the singular or plural form of a word for the count `n`.

    `words` holds both forms separated by a single space, singular first
    (for example 'word words').  The singular form is returned only when
    `n` equals 1; every other count yields the plural form.
    """
    forms = words.split(' ')
    if n == 1:
        return forms[0]
    return forms[1]
|
||
|
||
# split text to sentences
|
||
def split_sen(s, smart=True):
    """Split text into a list of sentences.

    The text is stripped first.  A sentence ends at the nearest '. ', '? '
    or '! ' (punctuation followed by one space): the punctuation mark stays
    with the sentence, the space is dropped.  Whatever follows the last
    terminator becomes the final sentence.

    With `smart=True` every sentence is additionally split around numeric
    tokens.  A token is numeric when only digits remain after removing
    '.', '!', '?', backslash and '/'.  Each numeric token becomes an item of
    its own and the words before/after it are re-joined with single spaces.

    Returns a list of strings (empty for empty or whitespace-only input).
    """
    s = s.strip()
    endings = ('. ', '? ', '! ')

    lines = []
    pos = 0
    while pos < len(s):
        # Positions of every terminator that still occurs at or after `pos`.
        hits = [i for i in (s.find(end, pos) for end in endings) if i != -1]
        if hits:
            cut = min(hits)
            # `cut` can legitimately be 0 (text that starts with a
            # terminator), so the decision is "was anything found", never
            # the truthiness of the index itself.
            lines.append(s[pos:cut + 1])
            pos = cut + 2
        else:
            lines.append(s[pos:])
            pos = len(s)

    if not smart:
        return lines

    result_lines = []
    for line in lines:
        buf = []
        for w in re.split(r'\s+', line):
            if re.sub(r'[\.\!\?\\/]', '', w).isdigit():
                # Flush the words collected so far, then emit the number
                # as a separate item.
                if buf:
                    result_lines.append(' '.join(buf))
                    buf = []
                result_lines.append(w)
            else:
                buf.append(w)
        if buf:
            result_lines.append(' '.join(buf))

    return result_lines
|
||
|
||
def analyze_sentences(lines, not_used=False):
    """Print a coloured statistics table for a list of sentences.

    One row is printed per sentence: its 1-based number, the sentence padded
    to a common width, the word count, the character count with and without
    spaces, and the number of distinct Russian letters it contains.  A
    summary line with the number of distinct Russian letters across all
    sentences follows.

    With `not_used=True` the Russian letters that occur in none of the
    sentences are listed as well.  Nothing is returned.
    """
    # Statistics start one column past the longest sentence.
    max_len = max([len(line) for line in lines] + [0]) + 1

    for number, line in enumerate(lines, start=1):
        word_count = len(re.split(r'\s+', line))

        cprint('%2d. ' % number, 'cyan', end='')

        print(line, end='')
        if len(line) < max_len:
            print(' ' * (max_len - len(line)), end='')

        cprint(str(word_count), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(word_count, 'word words'), 'green', end='')

        cprint(' %d' % len(line), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(line.replace(' ', '')), 'yellow', attrs=['bold'], end='')

        cprint(' chars ', 'yellow', end='')

        distinct = unique_letters_amount(line)
        cprint('(', 'red', end='')
        cprint(distinct, 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')

    cprint('Total unique characters: %d\n' % unique_letters_amount(''.join(lines)), 'white', attrs=['bold'], end='')

    if not not_used:
        return

    lowered_text = ''.join(lines).lower()
    missing = []
    for letter in RU_ALPHABET:
        if letter in lowered_text or letter in missing:
            continue
        missing.append(letter)
    cprint('Not used letters: %s' % ', '.join(missing), 'white', attrs=['bold'])
|
||
|
||
|
||
class LetterFreq:
    """Mutable pair of a letter and its frequency."""

    def __init__(self, letter, freq):
        # letter: the character being counted
        # freq:   its frequency; rendered with %f by __repr__
        self.letter, self.freq = letter, freq

    def __repr__(self):
        shown = (self.letter, self.freq)
        return '<LetterFreq of %s = %f>' % shown
|
||
|
||
def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False, only_unique=False, print_result=True):
    """Guess a substitution table by matching letter frequencies.

    The text is lowercased and reduced to the apostrophe plus the letters of
    the source alphabet (ENG_ALPHABET when `source_eng` is true, RU_ALPHABET
    otherwise).  Each distinct remaining character gets its share of the
    reduced text as frequency.  Going from the most to the least frequent
    character, the letter of the reference table (ENG_FREQ when `eng` is
    true, RU_FREQ otherwise) with the closest frequency is chosen for it.

    With `only_unique=True` a reference letter is handed out at most once;
    a character for which no letter is left is mapped to '?'.
    With `print_result=True` one line per character is printed.

    Returns a dict {source character: guessed reference letter}.
    """
    source_alphabet = ENG_ALPHABET if source_eng else RU_ALPHABET
    reference = ENG_FREQ if eng else RU_FREQ

    # Keep only the characters that take part in the analysis.
    kept = [ch for ch in s_in.lower() if ch == "'" or ch in source_alphabet]
    total = len(kept)

    # Occurrences per character, in order of first appearance.
    counts = {}
    for ch in kept:
        counts[ch] = counts.get(ch, 0) + 1

    ranked = [LetterFreq(ch, amount / total) for ch, amount in counts.items()]
    # Stable sort: characters with equal frequency keep first-appearance order.
    ranked.sort(key=operator.attrgetter('freq'), reverse=True)

    repl = {}
    taken = set()
    for entry in ranked:
        best_delta = 1
        best_letter = '?'
        best_freq = 0

        for candidate, candidate_freq in reference.items():
            if only_unique and candidate in taken:
                continue
            delta = abs(candidate_freq - entry.freq)
            if delta < best_delta:
                best_delta = delta
                best_letter = candidate
                best_freq = candidate_freq

        repl[entry.letter] = best_letter
        taken.add(best_letter)

        if print_result:
            print('%s (%f) ---> %s (%f)' % (entry.letter, entry.freq, best_letter, best_freq))

    return repl
|
||
|
||
# s - lowercase string
|
||
# repl - { from: to } dict
|
||
def replace_by_dict(s, repl):
    """Return `s` with every character found in `repl` substituted.

    `repl` maps a character to its replacement string; characters that are
    not keys of `repl` are copied unchanged.  Lookup is case-sensitive.
    """
    return ''.join(repl[ch] if ch in repl else ch for ch in s)
|
||
|
||
def print_hl(s, tohl, end="\n"):
    """Print `s` with every occurrence of the character `tohl` highlighted.

    The comparison ignores case.  Matching characters are printed in bold
    underlined red, all others as plain text; `end` is printed after the
    last character.

    Returns the list of 0-based positions in `s` that were highlighted.
    """
    indexes = []
    for position, ch in enumerate(s):
        if ch.lower() == tohl.lower():
            indexes.append(position)
            cprint(ch, 'red', attrs=['bold', 'underline'], end='')
        else:
            print(ch, end='')
    print('', end=end)
    return indexes
|
||
|
||
def rot_en(s):
    """Print all 26 Caesar rotations of `s` over the English alphabet.

    The text is uppercased first.  Line k (k = 0..25) shows every letter
    A-Z shifted forward by k positions, wrapping around after Z; any other
    character is printed unchanged.  Nothing is returned.
    """
    letters = ''.join(chr(code) for code in range(65, 91))  # 'A'..'Z'
    text = s.upper()

    for shift in range(26):
        rotated = []
        for ch in text:
            position = letters.find(ch)
            if position == -1:
                rotated.append(ch)
            else:
                rotated.append(letters[(position + shift) % 26])
        print(''.join(rotated))
|
||
|
||
def rot_ru(s, return_list=False):
    """Produce all 33 Caesar rotations of `s` over the Russian alphabet.

    The text is uppercased first.  Rotation k (k = 0..32) shifts every
    letter of RU_ALPHABET forward by k positions with wrap-around; any
    other character is kept unchanged.

    With `return_list=False` each rotation is printed on its own line and
    None is returned; with `return_list=True` nothing is printed and the
    list of the 33 rotated strings is returned.
    """
    letters = RU_ALPHABET.upper()
    text = s.upper()

    rotations = []
    for shift in range(33):
        rotated = ''.join(
            letters[(letters.index(ch) + shift) % 33] if ch in letters else ch
            for ch in text
        )
        if return_list:
            rotations.append(rotated)
        else:
            print(rotated)

    return rotations if return_list else None
|
||
|
||
def bf_all(table):
    """Return every combination that takes one item from each row of `table`.

    The result is a list of tuples in the order produced by
    itertools.product.
    """
    combinations = itertools.product(*table)
    return list(combinations)
|
||
|
||
def spaceitout(string, amount):
    """Return `string` with `amount` spaces inserted between its characters.

    `amount` may be an int or anything int() accepts (e.g. the string "2");
    int() raises ValueError/TypeError for values it cannot convert.
    Leading and trailing whitespace of the result is stripped.
    """
    # The converted value must be the one used for the repetition: a
    # numeric string would otherwise fail in `" " * amount`.
    gap = " " * int(amount)
    return gap.join(string).strip()
|
||
|
||
def unique_letters_amount(line):
    """Count how many different letters of RU_ALPHABET occur in `line`.

    The check ignores case; characters outside the Russian alphabet do not
    contribute to the count.
    """
    upper_line = line.upper()
    return sum(1 for letter in RU_ALPHABET.upper() if letter in upper_line)
|
||
|
||
def bf_find_words(lines, words, nospaces=False):
    """Report where a word can be spelled by taking one letter per line.

    For every start position in `lines` and every word in `words`, the word
    matches when its i-th letter occurs (case-insensitively) somewhere in
    the i-th consecutive line counted from that start.  Every match is
    printed as a coloured report: a header with the word and the 1-based
    line range, then one row per line with the matching letter highlighted
    and the 1-based positions of that letter within the line.

    With `nospaces=True` all whitespace is removed from the lines first, so
    the printed positions refer to the whitespace-free text.

    Nothing is returned.
    """
    # Length of the shortest word; it bounds the last useful start line.
    # NOTE(review): the search starts at 100, so the value is never larger
    # than 100 (also when `words` is empty).
    min_word_len = 100
    for w in words:
        if len(w) < min_word_len:
            min_word_len = len(w)

    if nospaces:
        lines = list(map(lambda s: re.sub(r'\s+', '', s), lines))

    for line_start in range(0, len(lines) - min_word_len + 1):
        cur_lines = lines[line_start:]
        for w in words:
            w = w.lower()
            # Not enough lines left to give every letter its own line.
            if len(w) > len(cur_lines):
                continue

            found = True
            # Longest line inspected for this word; used to align the rows.
            max_sen_len = 0
            for i in range(0, len(w)):
                line = cur_lines[i].lower()
                if len(line) > max_sen_len:
                    max_sen_len = len(line)
                if w[i] not in line:
                    found = False
                    break

            if found:
                # word <word> found in <n>-<m>
                print('word ', end='')
                cprint(w, 'white', attrs=['bold', 'underline'], end='')
                print(' found in ', end='')
                cprint(line_start+1, 'white', attrs=['bold'], end='')
                print('-', end='')
                cprint(line_start+len(w), 'white', attrs=['bold'], end='')
                print(':')

                for i in range(0, len(w)):
                    line = cur_lines[i]
                    print(' ', end='')
                    cprint('%2d. ' % (line_start+i+1), 'cyan', end='')
                    # print_hl prints the line and returns the 0-based
                    # positions of the highlighted letter.
                    indexes = print_hl(line, w[i], end='')
                    if len(line) < max_sen_len:
                        print(' ' * (max_sen_len - len(line)), end='')

                    cprint(' %s. ' % (w[i]), 'cyan', end='')

                    # Convert to 1-based positions for display.
                    indexes = tuple(map(lambda n: str(n+1), indexes))
                    # NOTE(review): this loop reuses the name `i` of the
                    # enclosing loop; `i` is not read again afterwards and
                    # the outer range() iteration is not affected.
                    for i, n in enumerate(indexes):
                        cprint(n, 'green', end='')
                        if i < len(indexes) - 1:
                            print(', ', end='')

                    print()

                print()
|
||
|
||
|
||
class BFGrepDictionary:
    """Enumerate candidate words that take one letter from each line.

    Every input line is reduced to its set of distinct lowercase characters
    ('.', '!', '?' and whitespace removed).  `go()` then walks the lines in
    order and builds every string that uses exactly one character per line,
    subject to two filters applied in `walk`.  Each finished candidate is
    passed to `check_word`, which currently just prints it.
    """

    def __init__(self, lines, dict_file):
        """Prepare the per-line character lists.

        lines:     iterable of strings, one per letter position.
        dict_file: stored on the instance; none of the methods visible in
                   this class reads it.
        """
        def prepare_line(line):
            line = re.sub(r'[\.\!\?\s]', '', line).lower()
            # sorted() instead of list(): iteration order of a set of
            # strings changes with the hash seed, which made the order of
            # the generated candidates differ from run to run.
            return sorted(set(line))

        self.lines = [prepare_line(line) for line in lines]
        self.lines_count = len(self.lines)
        self.dict_file = dict_file

    def go(self):
        """Generate all candidates; does nothing when there are no lines."""
        if not self.lines:
            # walk(0, '') would index into an empty list.
            return
        self.walk(0, '')

    def walk(self, start_line, buf):
        """Extend the prefix `buf` with each usable letter of `start_line`.

        Recurses into the next line until the last one is reached, where
        the finished candidate is handed to `check_word`.
        """
        is_last_line = start_line >= self.lines_count - 1
        for letter in self.lines[start_line]:
            # 'ы' is never accepted as the first letter (presumably because
            # Russian words do not start with it).
            if start_line == 0 and letter == 'ы':
                continue
            # Never repeat the letter chosen for the previous line.
            if start_line > 0 and letter == buf[-1:]:
                continue
            if is_last_line:
                self.check_word(buf + letter)
            else:
                self.walk(start_line + 1, buf + letter)

    def check_word(self, s):
        """Handle one finished candidate; currently it is only printed."""
        print(s)
|
||
|
||
# def grep():
|
||
# cmd = 'cat /tmp/all.txt | grep --color=never "%s" | xargs' % word
|
||
# #print(cmd)
|
||
# result = subprocess.check_output(cmd, shell=True, cwd=CWD).strip().decode('utf8')
|
||
# if result:
|
||
# result = result.replace("\n", ' ')
|
||
# return result.split(' ')
|
||
# else:
|
||
# return None
|