# triumfalno/util.py

import re, sys
from pprint import pprint
import operator
from termcolor import cprint
import itertools

# All 33 lowercase letters of the Russian alphabet, in alphabetical order.
RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Russian vowels ("glasnye").
GLAS_LETTERS = 'аеёиоуыэюя'
# Every remaining letter: the consonants ("soglasnye") plus the hard/soft
# signs 'ъ' and 'ь'.
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'

# The 26 lowercase letters of the English alphabet.
ENG_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

# Reference frequency of each lowercase Russian letter, expressed as a
# fraction (the values add up to ~1.0). Entries are listed from the most
# frequent letter to the least frequent one.
RU_FREQ = {
    'о': 0.10983,
    'е': 0.08483,
    'а': 0.07998,
    'и': 0.07367,
    'н': 0.067,
    'т': 0.06318,
    'с': 0.05473,
    'р': 0.04746,
    'в': 0.04533,
    'л': 0.04343,
    'к': 0.03486,
    'м': 0.03203,
    'д': 0.02977,
    'п': 0.02804,
    'у': 0.02615,
    'я': 0.02001,
    'ы': 0.01898,
    'ь': 0.01735,
    'г': 0.01687,
    'з': 0.01641,
    'б': 0.01592,
    'ч': 0.0145,
    'й': 0.01208,
    'х': 0.00966,
    'ж': 0.0094,
    'ш': 0.00718,
    'ю': 0.00639,
    'ц': 0.00486,
    'щ': 0.00361,
    'э': 0.00331,
    'ф': 0.00267,
    'ъ': 0.00037,
    'ё': 0.00013
}

# Reference frequency of each English letter, keyed by the UPPERCASE letter.
# The literal below is written in percent; the loop that follows rescales the
# values in place to fractions so they are comparable with RU_FREQ.
ENG_FREQ = {
    'A': 8.1,
    'B': 1.4,
    'C': 2.7,
    'D': 3.9,
    'E': 13.0,
    'F': 2.9,
    'G': 2.0,
    'H': 5.2,
    'I': 6.5,
    'J': 0.2,
    'K': 0.4,
    'L': 3.4,
    'M': 2.5,
    'N': 7.2,
    'O': 7.9,
    'P': 2.0,
    # 'Q' was missing from the table; the other 25 entries add up to 99.8 %,
    # so 0.2 % is the value that completes the distribution.
    'Q': 0.2,
    'R': 6.9,
    'S': 6.1,
    'T': 10.5,
    'U': 2.4,
    'V': 0.9,
    'W': 1.5,
    'X': 0.2,
    'Y': 1.9,
    'Z': 0.1,
}
# Percent -> fraction. Only values are reassigned (no keys are added or
# removed), so updating the dict while iterating over it is safe.
for k, v in ENG_FREQ.items():
    ENG_FREQ[k] = v/100


# Lowercase Russian words / surnames.
# NOTE(review): presumably the candidate list fed to the brute-force helpers
# (e.g. bf_find_words) - nothing in this file uses it, verify against callers.
BF_NAMES = [
    'марширующе',
    'марширующий',
    'свободин',
    'мовсаев',
    'щиголев',
    'щиголёв',
]

# Word list with an English gloss in the comment next to each entry.
# Prefixes and suffixes are kept only as commented-out lines, so they are NOT
# part of the list at runtime.
# NOTE(review): "hai" appears twice (glossed "later / then" and "now"), so the
# list contains a duplicate - confirm whether the first one is intentional.
CTHULHU_DICT = [
    "ah", # generic action, e.g. greet, eat, do
    "'ai", # speak / call
    "athg", # sign (contract) / agree to
    "'bthnk", # body / essence
    "bug", # go
    #"c- (prefix) we / our
    "ch'", # cross over / travel
    "chtenff", # brotherhood / society
    "ebumna", # pit
    "ee", # answers
    "ehye", # cohesion / integrity
    "ep", # after; with
    "hai", # later / then
    #"f'- (prefix) they / their
    "'fhalma", # mother
    "fhtagn", # wait / sleep
    "fm'latgh", # burn
    "ftaghu", # skin / boundary
    "geb", # here
    "gnaiih", # father
    "gof'nn", # children
    "goka", # grant
    "gotha", # wish
    "grah'n", # lost one / larva
    #"h'- (prefix) it / its
    "hafh'drn", # priest / summoner
    "hai", # now
    "hlirgh", # heretic
    "hrii", # followers
    "hupadgh", # born of
    "ilyaa", # expect / await
    "k'yarnak", # share / exchange
    "kadishtu", # understand / know
    "kn'a", # question
    "li'hee", # on pain of
    "llll", # at / beside
    "lloig", # mind / psyche
    "lw'nafh", # dream / transmit
    "mg", # (conjunction) yet
    "mnahn'", # worthless
    "n'gha", # death
    "n'ghft", # darkness
    #"na- (prefix) (contraction of nafl-)
    #"nafl- (prefix) not / (not-present tense)
    #"ng- (prefix) (conjunction) and / then
    "nglui", # threshold
    "nilgh'ri", # anything / everything
    #"nnn- (prefix) watch / protect
    "nog", # come
    "nw", # head / place
    #"-nyth (suffix) servant of
    #"-og (suffix) (emphatic)
    "ooboshu", # visit
    #"-or (suffix) force from / aspect of
    "orr'e", # soul / spirit
    #"-oth (suffix) native of
    #"ph'- (prefix) over / beyond
    "phlegeth", # realm of information
    "r'luh", # secret / hidden
    "ron", # religion / cult
    "s'uhn", # pact
    "sgn'wahl", # share space
    "shagg", # realm of dreams
    "shogg", # realm of darkness
    "shtunggli", # notify / contact
    "shugg", # realm of Earth
    "sll'ha", # invite
    "stell'bsna", # ask / pray for
    "syha'h", # eternity
    "tharanak", # promise / bring
    "throd", # tremble
    "uaaah", # (finish spell)
    "uh'e", # people / crowd
    "uln", # call / summon
    "vulgtlagln", # pray to
    "vulgtm", # prayer
    "wgah'n", # reside in / control
    "y'hah", # amen
    #"y- (prefix) I / my
    "ya", # I
    #"-yar (suffix) time of / moment
    "zhro", # (lift spell)
]

def plural(n, words):
    """Pick the singular or plural form of a word for the count *n*.

    *words* holds both forms separated by a single space, singular first
    (e.g. ``'word words'``). The first form is returned when ``n == 1``,
    the second one for every other count.
    """
    forms = words.split(' ')
    form_index = 0 if n == 1 else 1
    return forms[form_index]

def split_sen(s, smart=True):
    """Split text into sentences.

    The text is stripped, then cut after every '.', '?' or '!' that is
    followed by a space. The punctuation mark stays with its sentence and the
    space after it is dropped. Whatever follows the last terminator becomes
    the final sentence.

    With ``smart=True`` every sentence is split further: a whitespace-separated
    token that consists only of digits once the characters ``. ! ? \\ /`` are
    removed (e.g. ``12``, ``3.``, ``1/2``) becomes an entry of its own, and the
    words before and after it become separate entries.

    Returns a list of strings; an empty or whitespace-only text gives ``[]``.
    """
    s = s.strip()
    endings = ('. ', '? ', '! ')

    lines = []
    pos = 0
    while pos < len(s):
        # Position of the nearest terminator at or after `pos`, if any.
        hits = [i for i in (s.find(end, pos) for end in endings) if i != -1]
        if hits:
            # A hit at index 0 is a valid match, so test the list rather than
            # the truthiness of the index itself.
            cut = min(hits)
            lines.append(s[pos:cut + 1])
            pos = cut + 2
        else:
            lines.append(s[pos:])
            break

    if not smart:
        return lines

    result_lines = []
    for line in lines:
        buf = []
        for w in re.split(r'\s+', line):
            if re.sub(r'[\.\!\?\\/]', '', w).isdigit():
                # Flush the words collected so far, then emit the number as
                # a stand-alone entry.
                if buf:
                    result_lines.append(' '.join(buf))
                    buf = []
                result_lines.append(w)
            else:
                buf.append(w)
        if buf:
            result_lines.append(' '.join(buf))

    return result_lines

def analyze_sentences(lines, not_used=False):
    """Print a colored statistics table for a list of sentences.

    One row is printed per sentence: its 1-based number, the sentence padded
    to a common width, the word count, the length with and without spaces,
    and the number of distinct Russian letters it contains. A final line
    reports the number of distinct Russian letters across all sentences.

    With ``not_used=True`` an extra line lists the letters of RU_ALPHABET that
    occur in none of the sentences.
    """
    # One column wider than the longest sentence, so the statistics line up
    # and are always separated from the text by at least one space.
    column_width = max((len(text) for text in lines), default=0) + 1

    for number, text in enumerate(lines, start=1):
        word_count = len(re.split(r'\s+', text))

        cprint('%2d. ' % number, 'cyan', end='')
        print(text.ljust(column_width), end='')

        cprint(str(word_count), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(word_count, 'word words'), 'green', end='')

        cprint(' %d' % len(text), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(text.replace(' ', '')), 'yellow', attrs=['bold'], end='')
        cprint(' chars ', 'yellow', end='')

        cprint('(', 'red', end='')
        cprint(unique_letters_amount(text), 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')

    whole_text = ''.join(lines)
    cprint('Total unique characters: %d\n' % unique_letters_amount(whole_text), 'white', attrs=['bold'], end='')

    if not_used:
        lowered = whole_text.lower()
        missing = [letter for letter in RU_ALPHABET if letter not in lowered]
        cprint('Not used letters: %s' % ', '.join(missing), 'white', attrs=['bold'])


class LetterFreq:
    """A letter together with its frequency."""

    def __init__(self, letter, freq):
        self.letter, self.freq = letter, freq

    def __repr__(self):
        fields = (self.letter, self.freq)
        return '<LetterFreq of %s = %f>' % fields

def analyze_letters_frequency_type4(s_in, eng=False, source_eng=False, only_unique=False, print_result=True):
    """Guess a substitution table by matching letter frequencies.

    The text is lowercased and only apostrophes and letters of the source
    alphabet (ENG_ALPHABET if ``source_eng`` else RU_ALPHABET) are counted.
    Each distinct symbol, from most to least frequent, is paired with the
    letter of the reference table (ENG_FREQ if ``eng`` else RU_FREQ) whose
    frequency is closest to the observed one.

    With ``only_unique=True`` a reference letter is handed out at most once;
    a symbol that finds no candidate is mapped to ``'?'``.
    With ``print_result=True`` every pairing is printed as it is made.

    Returns a ``{symbol: reference_letter}`` dict.
    """
    source_alphabet = ENG_ALPHABET if source_eng else RU_ALPHABET
    reference = ENG_FREQ if eng else RU_FREQ

    # Occurrences per symbol; dict order is the order of first appearance.
    counts = {}
    for ch in s_in.lower():
        if ch == "'" or ch in source_alphabet:
            counts[ch] = counts.get(ch, 0) + 1
    total = sum(counts.values())

    observed = [LetterFreq(ch, n / total) for ch, n in counts.items()]
    # Most frequent first; the sort is stable, so ties keep appearance order.
    observed.sort(key=operator.attrgetter('freq'), reverse=True)

    repl = {}
    taken = set()
    for lf in observed:
        best_letter = '?'
        best_freq = 0
        best_delta = 1

        for candidate, candidate_freq in reference.items():
            if only_unique and candidate in taken:
                continue
            delta = abs(candidate_freq - lf.freq)
            # Strict comparison: on a tie the earlier table entry wins.
            if delta < best_delta:
                best_letter = candidate
                best_freq = candidate_freq
                best_delta = delta

        repl[lf.letter] = best_letter
        taken.add(best_letter)

        if print_result:
            print('%s (%f) ---> %s (%f)' % (lf.letter, lf.freq, best_letter, best_freq))

    return repl

def replace_by_dict(s, repl):
    """Substitute characters of *s* according to the *repl* mapping.

    s    -- the string to translate (lookups are case-sensitive, so pass a
            lowercase string when the table has lowercase keys)
    repl -- ``{from: to}`` dict

    Characters without an entry in *repl* are copied unchanged.
    """
    return ''.join(repl.get(ch, ch) for ch in s)

def print_hl(s, tohl, end="\n"):
    """Print *s* with every occurrence of the character *tohl* highlighted.

    The comparison is case-insensitive. Matching characters are printed in
    bold underlined red, all others are printed as is. *end* is written after
    the last character.

    Returns the list of 0-based positions of the highlighted characters.
    """
    hits = []
    for pos, ch in enumerate(s):
        if ch.lower() == tohl.lower():
            hits.append(pos)
            cprint(ch, 'red', attrs=['bold', 'underline'], end='')
        else:
            print(ch, end='')
    print('', end=end)
    return hits

def rot_en(s):
    """Print all 26 Caesar rotations of *s* over the English alphabet.

    The text is uppercased first. One line is printed per shift, from 0 (the
    uppercased text itself) to 25. Characters outside A-Z are printed
    unchanged.
    """
    alphabet = ''.join(chr(code) for code in range(65, 91))
    text = s.upper()

    for shift in range(26):
        # Map every letter to the one `shift` places further on, wrapping
        # around at the end of the alphabet.
        rotated = alphabet[shift:] + alphabet[:shift]
        print(text.translate(str.maketrans(alphabet, rotated)))

def rot_ru(s, return_list=False):
    """Produce all 33 Caesar rotations of *s* over the Russian alphabet.

    The text is uppercased first. Shifts run from 0 (the uppercased text
    itself) to 32. Characters that are not Russian letters are kept unchanged.

    With ``return_list=True`` the rotations are returned as a list of strings.
    Otherwise each rotation is printed on its own line and the function
    returns None.
    """
    alphabet = RU_ALPHABET.upper()
    text = s.upper()

    rotations = [
        text.translate(str.maketrans(alphabet, alphabet[shift:] + alphabet[:shift]))
        for shift in range(0, 33)
    ]

    if return_list:
        return rotations

    for rotation in rotations:
        print(rotation)

def bf_all(table):
    """Return every combination that takes one item from each row of *table*.

    The result is the Cartesian product of the rows, as a list of tuples.
    """
    return [*itertools.product(*table)]

def spaceitout(string, amount):
    """Spread the characters of *string* apart with *amount* spaces.

    *amount* may be an int or anything ``int()`` accepts (e.g. ``"2"``).
    Leading and trailing whitespace is stripped from the result.

    >>> spaceitout("abc", 2)
    'a  b  c'
    """
    # The converted value must be the one used for the padding: multiplying
    # a string by the raw argument fails when it is passed as text.
    amountint = int(amount)
    return (" " * amountint).join(string).strip()

def unique_letters_amount(line):
    """Count how many distinct Russian letters occur in *line*.

    The check is case-insensitive; characters outside RU_ALPHABET are ignored.
    """
    present = set(line.upper())
    return sum(1 for letter in RU_ALPHABET.upper() if letter in present)

def bf_find_words(lines, words, nospaces=False):
    """Search consecutive lines for words hidden one letter per line.

    A word of length k is "found" at line n when its 1st letter occurs
    somewhere in line n, its 2nd letter in line n+1, and so on up to line
    n+k-1 (case-insensitive). Every starting line is tried for every word.

    For each hit a colored report is printed: the word, the 1-based range of
    lines, and each of those lines with the matching letter highlighted,
    followed by the letter and the 1-based positions where it occurs.

    With ``nospaces=True`` all whitespace is removed from the lines first, so
    the reported positions do not count spaces.
    """
    # Shortest word length, capped at 100: there is no point starting closer
    # to the end of `lines` than that.
    min_word_len = min([100] + [len(word) for word in words])

    if nospaces:
        lines = [re.sub(r'\s+', '', text) for text in lines]

    for line_start in range(len(lines) - min_word_len + 1):
        cur_lines = lines[line_start:]

        for word in words:
            w = word.lower()
            if len(w) > len(cur_lines):
                continue

            # Lowercased line paired with the letter it has to contain.
            if not all(ch in text.lower() for ch, text in zip(w, cur_lines)):
                continue

            # Width of the longest line involved, used to align the report.
            max_sen_len = max((len(text.lower()) for text in cur_lines[:len(w)]), default=0)

            # word <word> found in <n>-<m>
            print('word ', end='')
            cprint(w, 'white', attrs=['bold', 'underline'], end='')
            print(' found in ', end='')
            cprint(line_start+1, 'white', attrs=['bold'], end='')
            print('-', end='')
            cprint(line_start+len(w), 'white', attrs=['bold'], end='')
            print(':')

            for offset, (ch, text) in enumerate(zip(w, cur_lines)):
                print('  ', end='')
                cprint('%2d. ' % (line_start+offset+1), 'cyan', end='')
                hits = print_hl(text, ch, end='')
                if len(text) < max_sen_len:
                    print(' ' * (max_sen_len - len(text)), end='')

                cprint(' %s. ' % (ch), 'cyan', end='')

                # 1-based positions, comma separated.
                labels = [str(pos + 1) for pos in hits]
                for k, label in enumerate(labels):
                    if k > 0:
                        print(', ', end='')
                    cprint(label, 'green', end='')

                print()

            print()


class BFGrepDictionary:
    """Brute-force every word that takes one letter from each line.

    Each input line is reduced to the sorted list of its distinct lowercase
    characters, with whitespace and ``. ! ?`` removed. ``go()`` then walks the
    lines in order and builds every candidate word that uses exactly one
    character per line, skipping candidates that start with 'ы' or that would
    repeat the same character twice in a row. Each complete candidate is
    passed to ``check_word()``, which currently just prints it.

    ``dict_file`` is stored but not used by the code in this class.
    """

    def __init__(self, lines, dict_file):
        def prepare_line(line):
            line = re.sub(r'[\.\!\?\s]', '', line).lower()
            # Sorting pins the order: plain set iteration order for strings
            # differs between interpreter runs (hash randomization), which
            # made the order of the generated words unpredictable.
            return sorted(set(line))

        self.lines = [prepare_line(line) for line in lines]
        self.lines_count = len(self.lines)
        self.dict_file = dict_file

    def go(self):
        """Generate all candidate words, starting from the first line."""
        # No lines means no candidates; walking would index an empty list.
        if not self.lines:
            return
        self.walk(0, '')

    def walk(self, start_line, buf):
        """Extend the prefix *buf* with each usable letter of line *start_line*.

        Recurses into the next line until the last one is reached, where the
        finished word is handed to ``check_word()``.
        """
        is_last_line = start_line >= self.lines_count - 1
        for letter in self.lines[start_line]:
            # A word never starts with 'ы'.
            if start_line == 0 and letter == 'ы':
                continue
            # Skip candidates with the same letter twice in a row.
            if start_line > 0 and letter == buf[-1:]:
                continue
            if is_last_line:
                self.check_word(buf + letter)
            else:
                self.walk(start_line + 1, buf + letter)

    def check_word(self, s):
        """Handle one complete candidate word (prints it)."""
        print(s)

#    def grep():
#        cmd = 'cat /tmp/all.txt | grep --color=never "%s" | xargs' % word
#        #print(cmd)
#        result = subprocess.check_output(cmd, shell=True, cwd=CWD).strip().decode('utf8')
#        if result:
#            result = result.replace("\n", ' ')
#            return result.split(' ')
#        else:
#            return None