summaryrefslogtreecommitdiff
path: root/python/problems/dictionaries/text/tmp.py
blob: bf56c4f5fd416e87a877ba7c9859cc3d29ad6d2b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import collections

def following_words(txt):
    """Map every word in *txt* to the list of words that directly follow it.

    The text is split on whitespace. Followers are kept in order of
    appearance and duplicates are preserved, so the list length reflects how
    often each successor occurs. A word that only appears as the very last
    token gets no entry of its own.

    Returns a ``collections.defaultdict(list)``.
    """
    successors = collections.defaultdict(list)
    tokens = iter(txt.split())
    # Walk the tokens once, remembering the previous one to pair it with
    # the current one.
    previous = next(tokens, None)
    for current in tokens:
        successors[previous].append(current)
        previous = current
    return successors

def freq_following_word(txt):
    """Map every word in *txt* to its single most frequent follower.

    Followers are collected with ``following_words``. When several followers
    are equally frequent, the alphabetically smallest one is chosen.

    The mapping returned is the one produced by ``following_words`` with its
    values replaced, so it keeps that mapping's type and lookup behaviour.
    """
    successors = following_words(txt)
    # Only values of existing keys are reassigned, so iterating over
    # items() while updating is safe.
    for word, followers in successors.items():
        counts = collections.Counter(followers)
        # Highest count first, ties broken by the word itself.
        successors[word] = min(counts, key=lambda w: (-counts[w], w))
    return successors

def text(word, freq, num):
    """Generate up to *num* words by repeatedly following *freq*.

    Starting with *word*, each step appends the current word and then moves
    on to ``freq[current]``.

    Args:
        word: The first word of the generated text.
        freq: Mapping from a word to the word that should follow it.
        num: Maximum number of words to produce.

    Returns:
        The generated words joined by single spaces. The result is shorter
        than *num* words when a word without a known follower is reached,
        and empty when *num* is not positive.
    """
    words = []
    for _ in range(num):
        words.append(word)
        # .get() instead of indexing: a plain dict would raise KeyError on a
        # word without a follower, and a defaultdict would silently insert
        # the missing key and hand back its (non-string) default value.
        word = freq.get(word)
        if not word:
            # Dead end (missing key or empty default): stop instead of
            # failing in the lookup or in the final join.
            break
    return ' '.join(words)


import urllib.request
# Small built-in sample text used for the demo run below.
txt = 'danes je lep dan danes sije sonce danes sije dan ki je sonce'
# A larger corpus can be fetched instead (needs network access):
#urllib.request.urlopen('http://squeeb1134.tripod.com/1984.txt').read().decode('utf8')

# Print five words starting at 'danes', always taking the most frequent
# follower of the current word.
print(
    text(
        'danes',
        freq_following_word(txt),
        5,
    )
)