I've written a piece of code that essentially counts word frequencies and inserts them into an ARFF file for use with Weka. I'd like to alter it so that it can count bi-gram f
Here is a version generalized to n-grams with optional padding. It uses defaultdict(int)
for the frequencies so that it also works in Python 2.6 (which lacks collections.Counter):
from collections import defaultdict
def ngrams(words, n=2, padding=False):
    """Compute n-grams with optional padding.

    words   -- iterable of tokens; it is copied into a list, so tuples and
               generators are accepted as well as lists.
    n       -- size of each gram; must be at least 1.
    padding -- when true, n-1 None entries are added before and after the
               tokens, so the first and last tokens also show up in every
               position of a gram.

    Returns a generator yielding each n-gram as a tuple, in input order.
    Nothing is yielded when there are fewer than n (padded) tokens.
    Raises ValueError if n is less than 1.
    """
    if n < 1:
        # n == 0 would yield empty tuples and n < 0 arbitrary slices.
        raise ValueError("n must be at least 1, got %r" % (n,))
    pad = [None] * (n - 1) if padding else []
    # list(words) lets any iterable be concatenated with the padding lists.
    grams = pad + list(words) + pad
    return (tuple(grams[i:i + n]) for i in range(len(grams) - n + 1))
# grab n-grams
words = ['the','cat','sat','on','the','dog','on','the','cat']
for size, padding in ((3, 0), (4, 0), (2, 1)):
print '\n%d-grams padding=%d' % (size, padding)
print list(ngrams(words, size, padding))
# show frequency
counts = defaultdict(int)
for ng in ngrams(words, 2, False):
counts[ng] += 1
print '\nfrequencies of bigrams:'
for c, ng in sorted(((c, ng) for ng, c in counts.iteritems()), reverse=True):
print c, ng
Output:
3-grams padding=0
[('the', 'cat', 'sat'), ('cat', 'sat', 'on'), ('sat', 'on', 'the'),
('on', 'the', 'dog'), ('the', 'dog', 'on'), ('dog', 'on', 'the'),
('on', 'the', 'cat')]
4-grams padding=0
[('the', 'cat', 'sat', 'on'), ('cat', 'sat', 'on', 'the'),
('sat', 'on', 'the', 'dog'), ('on', 'the', 'dog', 'on'),
('the', 'dog', 'on', 'the'), ('dog', 'on', 'the', 'cat')]
2-grams padding=1
[(None, 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', 'on'),
('on', 'the'), ('the', 'dog'), ('dog', 'on'), ('on', 'the'),
('the', 'cat'), ('cat', None)]
frequencies of bigrams:
2 ('the', 'cat')
2 ('on', 'the')
1 ('the', 'dog')
1 ('sat', 'on')
1 ('dog', 'on')
1 ('cat', 'sat')