Question
I got this TF-IDF script from yebrahim, and somehow my output document yields all 0s for every term. Is there any problem with the code? An example of the output:

hippo 0.0
hipper 0.0
hip 0.0
hint 0.0
hindsight 0.0
hill 0.0
hilarious 0.0

Thanks for the help.
# a list of (words-freq) pairs for each document
global_terms_in_doc = {}
# list to hold occurrences of terms across documents
global_term_freq = {}
num_docs = 0
lang = 'english'
lang_dictionary = {}
top_k = -1
supported_langs = ('english', 'french')
from django.utils.encoding import smart_str, smart_unicode
# support for custom language if needed
def loadLanguageLemmas(filePath):
    print('loading language from file: ' + filePath)
    f = open(filePath)
    for line in f:
        words = line.split()
        if words[1] == '=' or words[0] == words[1]:
            continue
        lang_dictionary[words[0]] = words[1]

def remove_diacritic(words):
    for i in range(len(words)):
        w = unicode(words[i], 'ISO-8859-1')
        w = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
        words[i] = w.lower()
    return words
# function to tokenize text, and put words back to their roots
def tokenize(text):
    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)

    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        #tokens[i] = tokens[i].strip("'")
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res

    # don't return any single letters
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens
def remove_stopwords(text):
    # remove punctuation
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}']
    for c in chars:
        text = smart_str(text.replace(c, ' '))
    text = text.split()

    import nltk
    if lang == 'english':
        stopwords = nltk.corpus.stopwords.words('english')
    else:
        stopwords = open(lang + '_stopwords.txt', 'r').read().split()
    content = [w for w in text if w.lower().strip() not in stopwords]
    return content
# __main__ execution
import sys, re, math, unicodedata
from optparse import OptionParser
parser = OptionParser(usage='usage: %prog [options] input_file')
parser.add_option('-l', '--language', dest='language',
                  help='language to use in tokenizing and lemmatizing. supported\
                        languages: {english, french}', metavar='LANGUAGE')
parser.add_option('-k', '--top-k', dest='top_k',
                  help='output only terms with score no less k')
parser.add_option('-m', '--mode', dest='mode',
                  help='display mode. can be either "both" or "term"')
(options, args) = parser.parse_args()

if options.language:
    if options.language not in supported_langs:
        print 'only ', supported_langs, ' are supported in this version.'
        quit()
    if options.language != 'english':
        lang = options.language
        loadLanguageLemmas(options.language + '_lemmas.txt')

if options.top_k:
    top_k = int(options.top_k)

display_mode = 'both'
if options.mode:
    if options.mode == 'both' or options.mode == 'term':
        display_mode = options.mode
    else:
        parser.print_help()

if not args:
    parser.print_help()
    quit()
reader = open(args[0])
all_files = reader.read().splitlines()
num_docs = len(all_files)
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktWordTokenizer
print('initializing..')
for f in all_files:

    # local term frequency map
    terms_in_doc = {}

    doc_words = open(f).read().lower()
    #print 'words:\n', doc_words

    doc_words = remove_stopwords(doc_words)
    #print 'after stopwords:\n', doc_words

    doc_words = tokenize(doc_words)
    #print 'after tokenize:\n', doc_words
    #quit()

    # increment local count
    for word in doc_words:
        if word in terms_in_doc:
            terms_in_doc[word] += 1
        else:
            terms_in_doc[word] = 1

    # increment global frequency
    for (word, freq) in terms_in_doc.items():
        if word in global_term_freq:
            global_term_freq[word] += 1
        else:
            global_term_freq[word] = 1

    global_terms_in_doc[f] = terms_in_doc
print('working through documents.. ')
for f in all_files:

    writer = open(f + '_final', 'w')

    result = []
    # iterate over terms in f, calculate their tf-idf, put in new list
    max_freq = 0
    for (term, freq) in global_terms_in_doc[f].items():
        if freq > max_freq:
            max_freq = freq
    for (term, freq) in global_terms_in_doc[f].items():
        idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
        tfidf = float(freq) / float(max_freq) * float(idf)
        result.append([tfidf, term])

    # sort result on tfidf and write them in descending order
    result = sorted(result, reverse=True)
    for (tfidf, term) in result[:top_k]:
        if display_mode == 'both':
            writer.write(term + '\t' + str(tfidf) + '\n')
        else:
            writer.write(term + '\n')
print('success, with ' + str(num_docs) + ' documents.')
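For context on where the numbers come from: each score is produced by the two lines `idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))` and `tfidf = float(freq) / float(max_freq) * float(idf)` in the last loop above. Below is a minimal, standalone sketch of that same formula on a made-up two-document corpus (the term counts are invented purely for illustration):

import math

# toy statistics, invented for illustration only
terms_in_doc     = {'hippo': 3, 'hill': 1}   # counts of each term in one document
global_term_freq = {'hippo': 1, 'hill': 2}   # number of documents containing each term
num_docs         = 2

max_freq = max(terms_in_doc.values())
for term, freq in terms_in_doc.items():
    # same smoothed idf as the script: log((1 + N) / (1 + df))
    idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
    tfidf = float(freq) / float(max_freq) * idf
    print(term + '\t' + str(tfidf))

Note that when num_docs is 1, or when a term occurs in every document, (1 + num_docs) / (1 + global_term_freq[term]) equals 1, so the idf (and therefore the tf-idf) comes out as exactly 0.0; that is one way this formula can produce an output full of zeros.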
Source: https://stackoverflow.com/questions/16129076/tf-idf-for-my-documents-yield-0