问题
I was experimenting with python NLTK text classification. Here is the code example i am practicing: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
Here is code:
from nltk import bigrams
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
train_samples = {}
with file ('data/positive.txt', 'rt') as f:
for line in f.readlines():
train_samples[line] = 'pos'
with file ('data/negative.txt', 'rt') as d:
for line in d.readlines():
train_samples[line] = 'neg'
f = open("data/test.txt", "r")
test_samples = f.readlines()
# Error in this code
# def bigramReturner(text):
# tweetString = text.lower()
# bigramFeatureVector = {}
# for item in bigrams(tweetString.split()):
# bigramFeatureVector.append(' '.join(item))
# return bigramFeatureVector
# Updated the code from the stack overflow comment
def bigramReturner (tweetString):
tweetString = tweetString.lower()
#comment the line since the function is not defined
#tweetString = removePunctuation (tweetString)
bigramFeatureVector = []
for item in nltk.unigrams(tweetString.split()):
bigramFeatureVector.append(' '.join(item))
return bigramFeatureVector
def get_labeled_features(samples):
word_freqs = {}
for text, label in train_samples.items():
tokens = text.split()
for token in tokens:
if token not in word_freqs:
word_freqs[token] = {'pos': 0, 'neg': 0}
word_freqs[token][label] += 1
return word_freqs
def get_label_probdist(labeled_features):
label_fd = FreqDist()
for item, counts in labeled_features.items():
for label in ['neg', 'pos']:
if counts[label] > 0:
label_fd.inc(label)
label_probdist = ELEProbDist(label_fd)
return label_probdist
def get_feature_probdist(labeled_features):
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
for label in ['neg', 'pos']:
feature_freqdist[label, token].inc(True, count=counts[label])
feature_freqdist[label, token].inc(None, num_samples - counts[label])
feature_values[token].add(None)
feature_values[token].add(True)
for item in feature_freqdist.items():
print item[0], item[1]
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return feature_probdist
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
but when I run the code I get following error:
Traceback (most recent call last):
File "naive_bigram_1.py", line 87, in <module>
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
File "naive_bigram_1.py", line 30, in bigramReturner
tweetString = removePunctuation (tweetString)
NameError: global name 'removePunctuation' is not defined
I saw the similar question with other error, here I updated as well n-grams with Naive Bayes classifier
回答1:
You're calling a function removePunctuation
that hasn't been defined previously:
def bigramReturner (tweetString):
tweetString = tweetString.lower()
tweetString = removePunctuation (tweetString)
....
I also noticed that you put spaces between your functions' names and the parameters list. Avoid that as it's not really idiomatic Python and could even cause some problems (like your function being evaluated as an object instead of being called).
来源:https://stackoverflow.com/questions/19209895/n-grams-with-naive-bayes-classifier-error