I have a list of sentences:
text = ['cant railway station', 'citadel hotel', ' police stn'].
I need to form bigram pairs and store them in a variable.
from nltk import word_tokenize
from nltk.util import ngrams
text = ['cant railway station', 'citadel hotel', 'police stn']

# Accumulate the bigrams of every sentence. Each sentence is tokenized on its
# own, so no pair ever spans two sentences.
bigrams = []
for line in text:
    # `word_tokenize` was imported by name, so it is called directly; the
    # module name `nltk` is not bound by `from nltk import word_tokenize`.
    token = word_tokenize(line)
    # The 2 selects bigrams; change it to get n-grams of a different size.
    bigram = list(ngrams(token, 2))
    bigrams.extend(bigram)
>>> text = ['cant railway station','citadel hotel',' police stn']
>>> bigrams = [(ele, tex.split()[i+1]) for tex in text for i,ele in enumerate(tex.split()) if i < len(tex.split())-1]
>>> bigrams
[('cant', 'railway'), ('railway', 'station'), ('citadel', 'hotel'), ('police', 'stn')]
This uses the `enumerate` and `split` functions.
Rather than turning your text into lists of strings, start with each sentence separately as a string. I've also removed punctuation and stopwords; just remove these portions if they are irrelevant to you:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
def get_bigrams(myString):
    """Return stemmed single-token and bigram features for one sentence.

    The string is split with ``WordPunctTokenizer``. The bigrams that
    ``BigramCollocationFinder`` ranks best under the chi-square measure
    (at most 500) are joined with a single space and placed after the
    single tokens. A candidate is kept only if its lower-cased form is not
    in NLTK's English stopword list and it is longer than 8 characters;
    every word of a kept candidate is then stemmed and lower-cased.

    Args:
        myString: The sentence to extract features from.

    Returns:
        A list of feature strings: single-token features first, followed by
        bigram features in the order returned by ``nbest``.
    """
    tokens = WordPunctTokenizer().tokenize(myString)
    stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)

    # Candidates are the single tokens followed by "first second" strings.
    candidates = tokens + ["%s %s" % bigram_tuple for bigram_tuple in bigrams]
    if not candidates:
        # Nothing to filter, so the stopword corpus is not needed at all.
        return []

    # Load the stopword list once instead of once per candidate, and use a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    return [
        ' '.join(stemmer.stem(w).lower() for w in x.split())
        for x in candidates
        if x.lower() not in english_stopwords and len(x) > 8
    ]
To use it, do the following:
# NOTE(review): `sentence` is not defined in this snippet; it is assumed to be
# an iterable of sentence strings supplied by the caller.
for line in sentence:
    # Extract the feature strings for this one sentence.
    features = get_bigrams(line)
    # train set here
Note that this goes a little further and actually statistically scores the bigrams (which will come in handy in training the model).
Just fixing Dan's code:
def get_bigrams(myString):
    """Return stemmed single-token and bigram features for one sentence.

    The string is split with ``WordPunctTokenizer``. The bigrams that
    ``BigramCollocationFinder`` ranks best under the chi-square measure
    (at most 500) are joined with a single space and placed after the
    single tokens. A candidate is kept only if its lower-cased form is not
    in NLTK's English stopword list and it is longer than 8 characters;
    every word of a kept candidate is then stemmed and lower-cased.

    Args:
        myString: The sentence to extract features from.

    Returns:
        A list of feature strings: single-token features first, followed by
        bigram features in the order returned by ``nbest``.
    """
    tokens = WordPunctTokenizer().tokenize(myString)
    stemmer = PorterStemmer()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)

    # Candidates are the single tokens followed by "first second" strings.
    candidates = tokens + ["%s %s" % bigram_tuple for bigram_tuple in bigrams]
    if not candidates:
        # Nothing to filter, so the stopword corpus is not needed at all.
        return []

    # Load the stopword list once instead of once per candidate, and use a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    return [
        ' '.join(stemmer.stem(w).lower() for w in x.split())
        for x in candidates
        if x.lower() not in english_stopwords and len(x) > 8
    ]