diff options
Diffstat (limited to 'python/problems/dictionaries/text/tmp.py')
-rw-r--r-- | python/problems/dictionaries/text/tmp.py | 29 |
1 files changed, 29 insertions, 0 deletions
"""Generate text by repeatedly following each word's most frequent successor."""

import collections
import urllib.request


def following_words(txt):
    """Map every word in *txt* to the list of words that directly follow it.

    The text is split on whitespace. Successors are listed in order of
    appearance, with repetitions kept so they can be counted later. A word
    that only occurs as the very last word has no entry.

    Returns a ``collections.defaultdict(list)``.
    """
    words = txt.split()
    freq = collections.defaultdict(list)
    # Pair each word with its right-hand neighbour.
    for word, next_word in zip(words, words[1:]):
        freq[word].append(next_word)
    return freq


def freq_following_word(txt):
    """Map every word in *txt* to its most frequent successor.

    Ties between equally frequent successors are broken alphabetically
    (the smallest word wins). Words without any successor are absent from
    the result.

    Returns a plain ``dict`` so that looking up an unknown word cannot
    silently insert an empty list into a word -> word mapping.
    """
    most_frequent = {}
    for word, successors in following_words(txt).items():
        counts = collections.Counter(successors)
        # Highest count first; alphabetical order decides ties.
        best, _ = min(counts.items(), key=lambda item: (-item[1], item[0]))
        most_frequent[word] = best
    return most_frequent


def text(word, freq, num):
    """Build a space-separated chain of at most *num* words.

    The chain starts with *word*; each following word is looked up in
    *freq* (a mapping word -> next word). Generation stops early when the
    current word has no successor in *freq*, instead of failing.
    """
    words = []
    for _ in range(num):
        words.append(word)
        # Membership test (not indexing) so a defaultdict is never mutated
        # and a plain dict never raises KeyError at a dead end.
        if word not in freq:
            break
        word = freq[word]
    return ' '.join(words)


# Small sample text. A larger corpus can be downloaded instead, e.g.:
# txt = urllib.request.urlopen('http://squeeb1134.tripod.com/1984.txt').read().decode('utf8')
txt = 'danes je lep dan danes sije sonce danes sije dan ki je sonce'
print(text('danes', freq_following_word(txt), 5))