triumfalno/util.py
2017-05-13 13:45:46 +03:00

507 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re, sys
from pprint import pprint
import operator
from termcolor import cprint
import itertools
# Lowercase Russian alphabet: all 33 letters in alphabetical order.
RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Russian vowel letters ("glasnye").
GLAS_LETTERS = 'аеёиоуыэюя'
# The remaining 23 Russian letters ("soglasnye" = consonants); note that the
# hard and soft signs (ъ, ь) are kept in this group as well.
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'
# Lowercase Latin alphabet, 26 letters.
ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
# Reference frequency of every lowercase Russian letter, stored as a fraction
# in the 0..1 range and listed in descending order of frequency.
# Used as the default reference table by analyze_letters_frequency_type4().
RU_FREQ = {
'о': 0.10983,
'е': 0.08483,
'а': 0.07998,
'и': 0.07367,
'н': 0.067,
'т': 0.06318,
'с': 0.05473,
'р': 0.04746,
'в': 0.04533,
'л': 0.04343,
'к': 0.03486,
'м': 0.03203,
'д': 0.02977,
'п': 0.02804,
'у': 0.02615,
'я': 0.02001,
'ы': 0.01898,
'ь': 0.01735,
'г': 0.01687,
'з': 0.01641,
'б': 0.01592,
'ч': 0.0145,
'й': 0.01208,
'х': 0.00966,
'ж': 0.0094,
'ш': 0.00718,
'ю': 0.00639,
'ц': 0.00486,
'щ': 0.00361,
'э': 0.00331,
'ф': 0.00267,
'ъ': 0.00037,
'ё': 0.00013
}
# Reference frequency of every uppercase English letter.
# The literal below is written in percent; the loop that follows converts the
# values to fractions so the table is on the same 0..1 scale as RU_FREQ.
ENG_FREQ = {
    'A': 8.1,
    'B': 1.4,
    'C': 2.7,
    'D': 3.9,
    'E': 13.0,
    'F': 2.9,
    'G': 2.0,
    'H': 5.2,
    'I': 6.5,
    'J': 0.2,
    'K': 0.4,
    'L': 3.4,
    'M': 2.5,
    'N': 7.2,
    'O': 7.9,
    'P': 2.0,
    # 'Q' was missing from the table; the other 25 entries add up to 99.8 %,
    # so 0.2 % is the value that completes it to 100 %.
    'Q': 0.2,
    'R': 6.9,
    'S': 6.1,
    'T': 10.5,
    'U': 2.4,
    'V': 0.9,
    'W': 1.5,
    'X': 0.2,
    'Y': 1.9,
    'Z': 0.1,
}
# Percent -> fraction. Only existing keys are reassigned, so mutating the
# dict while iterating over it is safe here.
for k, v in ENG_FREQ.items():
    ENG_FREQ[k] = v/100
# Lowercase Russian words / surnames (note the е/ё spelling variants of the
# last one). Not referenced elsewhere in the visible code; presumably the
# candidate word list passed to bf_find_words() -- TODO confirm with callers.
BF_NAMES = [
'марширующе',
'марширующий',
'свободин',
'мовсаев',
'щиголев',
'щиголёв',
]
# Word list with the English gloss of every entry in its trailing comment.
# Prefix and suffix entries are kept only as comments, so they are not part of
# the list. Judging by the name this is Cthulhu-mythos vocabulary; it is not
# referenced elsewhere in the visible code.
CTHULHU_DICT = [
"ah", #generic action, e.g. greet, eat, do
"'ai", #speak / call
"athg", #sign (contract) / agree to
"'bthnk", #body / essence
"bug", #go
#"c- (prefix) we / our
"ch'", # cross over / travel
"chtenff", # brotherhood / society
"ebumna", # pit
"ee", # answers
"ehye", # cohesion / integrity
"ep", # after; with
"hai", # later / then -- NOTE(review): "hai" appears again below ("now"), so the list holds it twice
#"f'- (prefix) they / their
"'fhalma", # mother
"fhtagn", # wait / sleep
"fm'latgh", # burn
"ftaghu", # skin / boundary
"geb", # here
"gnaiih", # father
"gof'nn", # children
"goka", # grant
"gotha", # wish
"grah'n", # lost one / larva
#"h'- (prefix) it / its
"hafh'drn", # priest / summoner
"hai", # now
"hlirgh", # heretic
"hrii", # followers
"hupadgh", # born of
"ilyaa", # expect / await
"k'yarnak", # share / exchange
"kadishtu", # understand / know
"kn'a", # question
"li'hee", # on pain of
"llll", # at / beside
"lloig", # mind / psyche
"lw'nafh", # dream / transmit
"mg", # (conjunction) yet
"mnahn'", # worthless
"n'gha", # death
"n'ghft", # darkness
#"na- (prefix) (contraction of nafl-)
#"nafl- (prefix) not / (not-present tense)
#"ng- (prefix) (conjunction) and / then
"nglui", # threshold
"nilgh'ri", # anything / everything
#"nnn- (prefix) watch / protect
"nog", # come
"nw", # head / place
#"-nyth (suffix) servant of
#"-og (suffix) (emphatic)
"ooboshu", # visit
#"-or (suffix) force from / aspect of
"orr'e", # soul / spirit
#"-oth (suffix) native of
#"ph'- (prefix) over / beyond
"phlegeth", # realm of information
"r'luh", # secret / hidden
"ron", # religion / cult
"s'uhn", # pact
"sgn'wahl", # share space
"shagg", # realm of dreams
"shogg", # realm of darkness
"shtunggli", # notify / contact
"shugg", # realm of Earth
"sll'ha", # invite
"stell'bsna", # ask / pray for
"syha'h", # eternity
"tharanak", # promise / bring
"throd", # tremble
"uaaah", # (finish spell)
"uh'e", # people / crowd
"uln", # call / summon
"vulgtlagln", # pray to
"vulgtm", # prayer
"wgah'n", # reside in / control
"y'hah", # amen
#"y- (prefix) I / my
"ya", # I
#"-yar (suffix) time of / moment
"zhro", # (lift spell)
]
def plural(n, words):
    """Choose the singular or plural form for the count ``n``.

    ``words`` holds both forms separated by a single space, singular first
    (e.g. ``'word words'``). The first form is returned when ``n == 1``, the
    second one for every other count.
    """
    forms = words.split(' ')
    if n == 1:
        return forms[0]
    return forms[1]
# split text to sentences
def split_sen(s, smart=True):
    """Split text into sentences.

    A sentence ends at '.', '?' or '!' when that character is followed by a
    space; the terminator stays with its sentence and the space is dropped.
    Whatever follows the last terminator becomes the final sentence.

    Parameters:
        s: the text to split; surrounding whitespace is stripped first.
        smart: when true, every whitespace-separated token that is a number
            (after removing '.', '!', '?', '\\' and '/') is additionally
            split off into an item of its own.

    Returns:
        A list of strings (empty for empty input).
    """
    s = s.strip()
    # One pattern instead of three separate index() scans: the earliest match
    # is the nearest sentence terminator.
    ending = re.compile(r'[.?!] ')
    lines = []
    pos = 0
    while pos < len(s):
        match = ending.search(s, pos)
        # Compare against None explicitly: a terminator at index 0 is a valid
        # hit (the previous truthiness test treated it as "nothing found").
        if match is None:
            lines.append(s[pos:])
            break
        lines.append(s[pos:match.start() + 1])
        # Skip the terminator and the space after it.
        pos = match.start() + 2
    if not smart:
        return lines

    result_lines = []
    for line in lines:
        buf = []
        for w in re.split(r'\s+', line):
            if re.sub(r'[\.\!\?\\/]', '', w).isdigit():
                # A number: flush the words collected so far, then emit the
                # number as a separate item.
                if buf:
                    result_lines.append(' '.join(buf))
                    buf = []
                result_lines.append(w)
            else:
                buf.append(w)
        if buf:
            result_lines.append(' '.join(buf))
    return result_lines
def analyze_sentences(lines, not_used=False):
    """Print a colourised statistics table for a list of sentences.

    Each row shows the 1-based sentence number, the sentence padded to a
    common width, its word count, its length with and without spaces, and the
    number of distinct Russian letters it contains. A summary line with the
    distinct-letter count of all sentences together follows.

    Parameters:
        lines: list of sentence strings.
        not_used: when true, also print the Russian letters that occur in
            none of the sentences.
    """
    # One column wider than the longest sentence, so that every row is padded
    # by at least one space.
    column_width = max((len(text) for text in lines), default=0) + 1

    for number, text in enumerate(lines, start=1):
        word_total = len(re.split(r'\s+', text))
        cprint('%2d. ' % number, 'cyan', end='')
        print(text, end='')
        gap = column_width - len(text)
        if gap > 0:
            print(' ' * gap, end='')
        cprint(str(word_total), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(word_total, 'word words'), 'green', end='')
        cprint(' %d' % len(text), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(text.replace(' ', '')), 'yellow', attrs=['bold'], end='')
        cprint(' chars ', 'yellow', end='')
        cprint('(', 'red', end='')
        cprint(unique_letters_amount(text), 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')

    joined = ''.join(lines)
    cprint('Total unique characters: %d\n' % unique_letters_amount(joined), 'white', attrs=['bold'], end='')

    if not_used:
        lowered = joined.lower()
        missing = [letter for letter in RU_ALPHABET if letter not in lowered]
        cprint('Not used letters: %s' % ', '.join(missing), 'white', attrs=['bold'])
class LetterFreq:
    """Mutable pair of a letter and its frequency."""

    def __init__(self, letter, freq):
        self.letter, self.freq = letter, freq

    def __repr__(self):
        fields = (self.letter, self.freq)
        return '<LetterFreq of %s = %f>' % fields
def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False, only_unique=False, print_result=True):
    """Guess a substitution table by matching letter frequencies.

    The input is lowercased and reduced to apostrophes plus the letters of the
    source alphabet. For every distinct character, most frequent first, the
    reference letter whose frequency is numerically closest is chosen.

    Parameters:
        s_in: the text to analyse.
        eng: compare against ENG_FREQ instead of RU_FREQ.
        source_eng: filter the input with ENG_ALPHABET instead of RU_ALPHABET.
        only_unique: never assign the same reference letter twice.
        print_result: print one ``x (freq) ---> y (freq)`` line per character.

    Returns:
        A dict mapping each input character to the chosen reference letter,
        or to '?' when no reference letter qualified.
    """
    source_alphabet = ENG_ALPHABET if source_eng else RU_ALPHABET
    reference = ENG_FREQ if eng else RU_FREQ

    # Keep only the characters that take part in the statistics.
    kept = [ch for ch in s_in.lower() if ch == "'" or ch in source_alphabet]
    text = ''.join(kept)

    # One LetterFreq per distinct character, in order of first appearance.
    seen = []
    observed = []
    for ch in kept:
        if ch not in seen:
            seen.append(ch)
            observed.append(LetterFreq(ch, 0))
    for entry in observed:
        entry.freq = text.count(entry.letter) / len(text)

    # Most frequent first; the sort is stable, so ties keep appearance order.
    observed.sort(key=operator.attrgetter('freq'), reverse=True)

    mapping = {}
    taken = []
    for entry in observed:
        best_delta = 1
        best_letter = '?'
        best_freq = 0
        for candidate, candidate_freq in reference.items():
            if only_unique and candidate in taken:
                continue
            delta = abs(candidate_freq - entry.freq)
            if delta < best_delta:
                best_delta = delta
                best_letter = candidate
                best_freq = candidate_freq
        mapping[entry.letter] = best_letter
        taken.append(best_letter)
        if print_result:
            print('%s (%f) ---> %s (%f)' % (entry.letter, entry.freq, best_letter, best_freq))
    return mapping
# s - lowercase string
# repl - { from: to } dict
def replace_by_dict(s, repl):
    """Substitute every character of ``s`` that is a key of ``repl``.

    Characters without an entry in ``repl`` are copied unchanged.
    """
    return ''.join(repl.get(ch, ch) for ch in s)
def print_hl(s, tohl, end="\n"):
    """Print ``s`` with every occurrence of the character ``tohl`` highlighted.

    The comparison ignores case. Matching characters are printed in bold
    underlined red, all others unchanged; ``end`` is written after the string.

    Returns:
        The list of 0-based positions at which ``tohl`` was found.
    """
    hits = []
    for pos, ch in enumerate(s):
        if ch.lower() == tohl.lower():
            hits.append(pos)
            cprint(ch, 'red', attrs=['bold', 'underline'], end='')
        else:
            print(ch, end='')
    print('', end=end)
    return hits
def rot_en(s, return_list=False):
    """Produce all 26 Caesar rotations of ``s`` over the English alphabet.

    The text is uppercased first. Row ``i`` (0..25) has every letter A-Z
    shifted forward by ``i`` positions, wrapping around; any other character
    is left untouched. Row 0 is therefore the uppercased input itself.

    Parameters:
        s: the text to rotate.
        return_list: when false (the default) each row is printed on its own
            line and nothing is returned; when true the rows are returned as
            a list instead, mirroring rot_ru().

    Returns:
        A list of 26 strings if ``return_list`` is true, otherwise None.
    """
    alphabet = ''.join(chr(code) for code in range(65, 91))
    s = s.upper()
    result = []
    for shift in range(26):
        # Translating through the rotated alphabet shifts each letter by
        # ``shift`` and leaves characters outside A-Z alone.
        rotated = alphabet[shift:] + alphabet[:shift]
        line = s.translate(str.maketrans(alphabet, rotated))
        if return_list:
            result.append(line)
        else:
            print(line)
    if return_list:
        return result
def rot_ru(s, return_list=False):
    """Produce all 33 Caesar rotations of ``s`` over the Russian alphabet.

    The text is uppercased first. Row ``i`` (0..32) has every Russian letter
    shifted forward by ``i`` positions, wrapping around; any other character
    is left untouched.

    Parameters:
        s: the text to rotate.
        return_list: when false each row is printed; when true the rows are
            collected and returned.

    Returns:
        A list of 33 strings if ``return_list`` is true, otherwise None.
    """
    upper_alphabet = RU_ALPHABET.upper()
    text = s.upper()
    rows = []
    for shift in range(0, 33):
        # The alphabet rotated left by ``shift`` gives, position by position,
        # the replacement of every letter.
        rotated = upper_alphabet[shift:] + upper_alphabet[:shift]
        row = text.translate(str.maketrans(upper_alphabet, rotated))
        if return_list:
            rows.append(row)
        else:
            print(row)
    if return_list:
        return rows
def bf_all(table):
    """Return every combination that takes one item from each row of ``table``.

    The result is a list of tuples in the order of ``itertools.product``.
    """
    combos = itertools.product(*table)
    return [*combos]
def spaceitout(string,amount):
    """Insert ``amount`` spaces between the characters of ``string``.

    Parameters:
        string: the text to spread out.
        amount: number of spaces between neighbouring characters; anything
            ``int()`` accepts, e.g. ``2`` or ``'2'``.

    Returns:
        The spaced-out text with leading and trailing whitespace stripped.
    """
    # The converted value was previously computed but never used, so passing
    # a numeric string raised TypeError in the multiplication.
    amountint = int(amount)
    return (" " * amountint).join(string).strip()
def unique_letters_amount(line):
    """Count how many different Russian letters occur in ``line``.

    The check ignores case; characters outside RU_ALPHABET are not counted.
    """
    upper_line = line.upper()
    return sum(1 for letter in RU_ALPHABET.upper() if letter in upper_line)
def bf_find_words(lines, words, nospaces=False):
    """Search for words spelled "vertically" through consecutive lines.

    A word matches at a starting line when its first letter occurs somewhere
    in that line, its second letter in the next line, and so on (ignoring
    case). Every match is printed as a report: a header with the 1-based line
    range, then one row per letter showing the line with the letter
    highlighted and the 1-based positions where it occurs.

    Parameters:
        lines: list of strings to search in.
        words: candidate words.
        nospaces: remove all whitespace from the lines before searching, so
            the reported positions count non-space characters only.
    """
    # Length of the shortest candidate (capped at 100); it limits how far
    # down a match may still start.
    shortest = 100
    for word in words:
        shortest = min(shortest, len(word))

    if nospaces:
        lines = [re.sub(r'\s+', '', text) for text in lines]

    for start in range(len(lines) - shortest + 1):
        window = lines[start:]
        for word in words:
            word = word.lower()
            if len(word) > len(window):
                continue
            rows = window[:len(word)]
            if not all(letter in row.lower() for letter, row in zip(word, rows)):
                continue

            # Width used to align the column that follows the lines.
            widest = max((len(row.lower()) for row in rows), default=0)

            # word <word> found in <n>-<m>
            print('word ', end='')
            cprint(word, 'white', attrs=['bold', 'underline'], end='')
            print(' found in ', end='')
            cprint(start + 1, 'white', attrs=['bold'], end='')
            print('-', end='')
            cprint(start + len(word), 'white', attrs=['bold'], end='')
            print(':')

            for offset, (letter, row) in enumerate(zip(word, rows)):
                print(' ', end='')
                cprint('%2d. ' % (start + offset + 1), 'cyan', end='')
                hits = print_hl(row, letter, end='')
                if len(row) < widest:
                    print(' ' * (widest - len(row)), end='')
                cprint(' %s. ' % (letter), 'cyan', end='')
                labels = [str(pos + 1) for pos in hits]
                for k, label in enumerate(labels):
                    cprint(label, 'green', end='')
                    if k < len(labels) - 1:
                        print(', ', end='')
                print()
            print()
class BFGrepDictionary:
    """Enumerate the strings formed by taking one letter from each line.

    Every input line is reduced to its set of distinct lowercase characters
    (sentence punctuation and whitespace removed). ``go()`` then walks the
    lines in order and hands every candidate string to ``check_word()``.
    Candidates that start with 'ы' or repeat the same letter twice in a row
    are skipped.

    Attributes:
        lines: list of per-line character lists, each sorted.
        lines_count: number of lines.
        dict_file: stored as given; not used by the code in this class.
    """

    def __init__(self, lines, dict_file):
        def prepare_line(line):
            line = re.sub(r'[\.\!\?\s]', '', line)
            line = line.lower()
            # Sorted rather than list(set(...)): iteration order of a set of
            # strings changes between interpreter runs, which made the order
            # of the enumerated candidates unpredictable.
            return sorted(set(line))

        self.lines = [prepare_line(line) for line in lines]
        self.lines_count = len(self.lines)
        self.dict_file = dict_file

    def go(self):
        """Start the enumeration; does nothing when there are no lines."""
        if not self.lines:
            # walk() indexes self.lines[0], which would raise IndexError.
            return
        self.walk(0, '')

    def walk(self, start_line, buf):
        """Extend ``buf`` with each letter of line ``start_line`` and recurse.

        When ``start_line`` is the last line the completed string is passed
        to ``check_word()`` instead of recursing further.
        """
        is_last_line = start_line + 1 > self.lines_count - 1
        for letter in self.lines[start_line]:
            if start_line == 0 and letter == 'ы':
                # Candidates are never allowed to begin with this letter.
                continue
            if start_line > 0 and letter == buf[-1:]:
                # Do not produce the same letter twice in a row.
                continue
            if is_last_line:
                self.check_word(buf + letter)
            else:
                self.walk(start_line + 1, buf + letter)

    def check_word(self, s):
        """Handle one finished candidate; currently it is just printed."""
        print(s)
# def grep():
# cmd = 'cat /tmp/all.txt | grep --color=never "%s" | xargs' % word
# #print(cmd)
# result = subprocess.check_output(cmd, shell=True, cwd=CWD).strip().decode('utf8')
# if result:
# result = result.replace("\n", ' ')
# return result.split(' ')
# else:
# return None