compute tf-idf with corpus

问题

I copied some source code that shows how to compute tf-idf, and here is the code:

    #module import
    from __future__ import division, unicode_literals
    import math
    import string
    import re
    import os

    from text.blob import TextBlob as tb
    # Module-level dict (not an array); no visible code reads or writes it.
    words = {} 
    def tf(word, blob):
        """Return the term frequency of ``word`` within ``blob``.

        The term frequency is the number of times ``word`` occurs in
        ``blob.words`` divided by the total number of words in ``blob.words``.

        Args:
            word: The token to count.
            blob: A document object exposing a ``words`` sequence.

        Returns:
            The fraction of the document's words equal to ``word``; ``0.0``
            when the document contains no words at all.
        """
        total_words = len(blob.words)
        if total_words == 0:
            # An empty document cannot contain the word; returning 0.0 avoids
            # a ZeroDivisionError for blank input files.
            return 0.0
        # True division relies on ``from __future__ import division`` at the
        # top of the file when run under Python 2.
        return blob.words.count(word) / total_words

    def n_containing(word, bloblist):
        """Return how many documents in ``bloblist`` contain ``word`` as a token.

        Args:
            word: The token to look for.
            bloblist: An iterable of document objects exposing a ``words``
                sequence.

        Returns:
            The number of documents whose ``words`` include ``word``.
        """
        # Test against the token list rather than the blob itself: ``in`` on a
        # TextBlob is a substring test on the raw text (so "cat" would match
        # "concatenate"), which disagrees with tf(), which counts tokens.
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(word, bloblist):
        """Return the inverse document frequency of ``word`` over ``bloblist``.

        Computed as the natural logarithm of the number of documents divided
        by one plus the number of documents that contain ``word``.
        """
        total_documents = len(bloblist)
        matching_documents = n_containing(word, bloblist)
        # The added 1 keeps the denominator non-zero when no document matches.
        ratio = total_documents / (1 + matching_documents)
        return math.log(ratio)

    def tfidf(word, blob, bloblist):
        """Return the tf-idf score of ``word`` for ``blob`` within ``bloblist``.

        The score is the product of the word's term frequency in ``blob`` and
        its inverse document frequency across ``bloblist``.
        """
        term_frequency = tf(word, blob)
        inverse_document_frequency = idf(word, bloblist)
        return term_frequency * inverse_document_frequency

    # Matches any single character from string.punctuation.
    regex = re.compile('[%s]' % re.escape(string.punctuation))


    def _load_document(path):
        """Read ``path`` and return its text as a normalised TextBlob.

        Punctuation is replaced by spaces and the text is lower-cased, so that
        every document goes through identical preprocessing before scoring.
        """
        # The with-block closes the file even if reading raises.
        with open(path, 'r') as handle:
            text = handle.read()
        text = regex.sub(' ', text)
        return tb(text.lower())


    document1 = _load_document('D:/article/sport/a.txt')
    document2 = _load_document('D:/article/food/b.txt')

    bloblist = [document1, document2]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        # Score every word of this document against the whole document list;
        # this must happen inside the loop so each document gets its own table.
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        # Show only the 50 highest-scoring words.
        for word, score in sorted_words[:50]:
            print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))

However, the problem is that I want to put all of the files in the sport folder into one corpus and all of the articles in the food folder into another corpus, so that the system gives a result for each corpus. Right now I can only compare individual files, but I want to compare corpora. I am very sorry for asking this question; any help will be appreciated.

Thanks

回答1:

As I understand it, you want to calculate the word frequencies of two files and store them in separate files so that you can compare them. You can do this from the terminal. Here is some simple code to calculate the word frequencies:

import string
import collections
import operator
# Collects words in the order wordFrequences appends them (descending count).
keywords = []
# No visible code reads or updates this value.
i=0
def removePunctuation(sentence):
    """Return ``sentence`` lower-cased with all punctuation characters removed.

    Args:
        sentence: The text to clean.

    Returns:
        A new string containing the lower-cased characters of ``sentence``
        that are not in ``string.punctuation``.
    """
    # A set gives constant-time membership tests, and join() builds the result
    # in one pass instead of growing a string one character at a time.
    punctuation = set(string.punctuation)
    return "".join(char for char in sentence.lower() if char not in punctuation)
 def wordFrequences(sentence):
    global i
    wordFreq = {}
    split_sentence = new_sentence.split()
    for word in split_sentence:
        wordFreq[word] = wordFreq.get(word,0) + 1
    wordFreq.items()
  # od = collections.OrderedDict(sorted(wordFreq.items(),reverse=True))
  # print od
    sorted_x= sorted(wordFreq.iteritems(), key=operator.itemgetter(1),reverse = True)
    print sorted_x
    for key, value in sorted_x:
        keywords.append(key)
    print keywords
# Read the whole input file; the with-block closes it even if reading raises.
with open('D:/article/sport/a.txt','r') as f:
    sentence = f.read()
# For a quick check without a file, assign a literal instead, e.g.:
# sentence = "The first test of the function some some some some"
new_sentence = removePunctuation(sentence)
wordFrequences(new_sentence)

You have to run this code twice, changing the path of your text file each time. Each time you run it from the console, redirect its output to a file with a command like this:

python abovecode.py > destinationfile.txt

like in your case

python abovecode.py > sportfolder/file1.txt
python abovecode.py > foodfolder/file2.txt

Important: if you want the words together with their frequencies, omit the line

print keywords

Important: if you only need the words ordered by their frequency, omit the line

print sorted_x

来源：https://stackoverflow.com/questions/22434092/compute-tf-idf-with-corpus

标签

python

tf-idf