I needed to compute the Unigrams, BiGrams and Trigrams for a text file containing text like:
\"Cystic fibrosis affects 30,000 children and young adults in the US a
NLTK has native support for n-grams.
Here 'n' is the n-gram size, e.g. n=3 produces trigrams.
from nltk import ngrams
def ngramize(texts, n):
    """Return one flat list holding the n-grams of every item in ``texts``.

    Each item of ``texts`` is handed to ``ngrams`` together with the
    n-gram size ``n``; the results are concatenated in input order.
    """
    return [gram for item in texts for gram in ngrams(item, n)]
If efficiency is an issue and you have to build multiple different n-grams I would consider using the following code (building up on Franck's excellent answer):
from itertools import chain
def n_grams(seq, n=1):
    """Return a lazy iterator over the n-grams of ``seq`` as tuples.

    Args:
        seq: Any iterable of tokens. One-shot iterators and generators are
            supported as well as lists and strings.
        n: Size of each n-gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A ``zip`` iterator yielding tuples of ``n`` consecutive tokens.
        It is empty when ``n`` is below 1 or ``seq`` has fewer than ``n``
        tokens.

    To get strings instead of tuples, wrap the result:
    ``(" ".join(gram) for gram in n_grams(tokens, n))``.
    """
    from itertools import islice, tee

    if n < 1:
        # No shifted copies to zip together: keep returning an empty iterator.
        return zip()
    # tee() gives n independent views of the input, so a one-shot iterator
    # is not consumed by several shifted copies at once; islice() drops the
    # first `offset` tokens of each view without a per-element Python test.
    shifted_views = (
        islice(view, offset, None) for offset, view in enumerate(tee(seq, n))
    )
    return zip(*shifted_views)
def range_ngrams(list_tokens, ngram_range=(1,2)):
    """Return an iterator over all n-grams of ``list_tokens`` for several sizes.

    ``ngram_range`` is unpacked into ``range()``, so the upper bound is
    exclusive: ``(1, 4)`` yields unigrams, bigrams and trigrams, in that
    order. Each size is produced by ``n_grams`` and the results are chained.
    """
    sizes = range(*ngram_range)
    per_size_iterators = [n_grams(list_tokens, size) for size in sizes]
    return chain(*per_size_iterators)
Usage :
>>> input_list = 'test the ngrams generator'.split()
>>> list(range_ngrams(input_list, ngram_range=(1,4)))
[('test',), ('the',), ('ngrams',), ('generator',), ('test', 'the'), ('the', 'ngrams'), ('ngrams', 'generator'), ('test', 'the', 'ngrams'), ('the', 'ngrams', 'generator')]
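~Same speed as NLTK (note: n_grams returns a lazy iterator, and nltk.ngrams appears to as well, so the timings below mostly measure building the input string and creating the iterators rather than consuming the n-grams):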
import nltk
%%timeit
input_list = 'test the ngrams interator vs nltk '*10**6
nltk.ngrams(input_list,n=5)
# 7.02 ms ± 79 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
input_list = 'test the ngrams interator vs nltk '*10**6
n_grams(input_list,n=5)
# 7.01 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
input_list = 'test the ngrams interator vs nltk '*10**6
nltk.ngrams(input_list,n=1)
nltk.ngrams(input_list,n=2)
nltk.ngrams(input_list,n=3)
nltk.ngrams(input_list,n=4)
nltk.ngrams(input_list,n=5)
# 7.32 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
input_list = 'test the ngrams interator vs nltk '*10**6
range_ngrams(input_list, ngram_range=(1,6))
# 7.13 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)