I need to compare documents stored in a DB and come up with a similarity score between 0 and 1.
The method I need to use has to be very simple. Implementing a vanil
For our Information Retrieval Course, we use some code that is written by our professor in Java. Sorry, no python port. "It is being released for educational and research purposes only under the GNU General Public License."
You can check out the documentation http://userweb.cs.utexas.edu/~mooney/ir-course/doc/
But more specifically check out: http://userweb.cs.utexas.edu/users/mooney/ir-course/doc/ir/vsr/HashMapVector.html
You can download it http://userweb.cs.utexas.edu/users/mooney/ir-course/
Check out NLTK package: http://www.nltk.org it has everything what you need
For the cosine_similarity:
def cosine_distance(u, v):
"""
Returns the cosine of the angle between vectors v and u. This is equal to
u.v / |u||v|.
"""
return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
For ngrams:
def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
"""
A utility that produces a sequence of ngrams from a sequence of items.
For example:
>>> ngrams([1,2,3,4,5], 3)
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Use ingram for an iterator version of this function. Set pad_left
or pad_right to true in order to get additional ngrams:
>>> ngrams([1,2,3,4,5], 2, pad_right=True)
[(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
@param sequence: the source data to be converted into ngrams
@type sequence: C{sequence} or C{iterator}
@param n: the degree of the ngrams
@type n: C{int}
@param pad_left: whether the ngrams should be left-padded
@type pad_left: C{boolean}
@param pad_right: whether the ngrams should be right-padded
@type pad_right: C{boolean}
@param pad_symbol: the symbol to use for padding (default is None)
@type pad_symbol: C{any}
@return: The ngrams
@rtype: C{list} of C{tuple}s
"""
if pad_left:
sequence = chain((pad_symbol,) * (n-1), sequence)
if pad_right:
sequence = chain(sequence, (pad_symbol,) * (n-1))
sequence = list(sequence)
count = max(0, len(sequence) - n + 1)
return [tuple(sequence[i:i+n]) for i in range(count)]
for tf-idf you will have to compute distribution first, I am using Lucene to do that, but you may very well do something similar with NLTK, use FreqDist:
http://nltk.googlecode.com/svn/trunk/doc/book/ch01.html#frequency_distribution_index_term
if you like pylucene, this will tell you how to comute tf.idf
# reader = lucene.IndexReader(FSDirectory.open(index_loc))
docs = reader.numDocs()
for i in xrange(docs):
tfv = reader.getTermFreqVector(i, fieldname)
if tfv:
rec = {}
terms = tfv.getTerms()
frequencies = tfv.getTermFrequencies()
for (t,f,x) in zip(terms,frequencies,xrange(maxtokensperdoc)):
df= searcher.docFreq(Term(fieldname, t)) # number of docs with the given term
tmap.setdefault(t, len(tmap))
rec[t] = sim.tf(f) * sim.idf(df, max_doc) #compute TF.IDF
# and normalize the values using cosine normalization
if cosine_normalization:
denom = sum([x**2 for x in rec.values()])**0.5
for k,v in rec.items():
rec[k] = v / denom
If you are interested, I've done tutorial series (Part I and Part II) talking about tf-idf and using the Scikits.learn (sklearn) Python module.
Part 3 has cosine similarity.
Here's an answer with just python
+ numpy
, in short:
Cosine:
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
Ngrams:
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
TF-IDF (it's a little weird but it works):
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
Here's the long answer with the tests:
import numpy as np
from math import sqrt, log
from itertools import chain, product
from collections import defaultdict
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
def corpus2vectors(corpus):
def vectorize(sentence, vocab):
return [sentence.split().count(i) for i in vocab]
vectorized_corpus = []
vocab = sorted(set(chain(*[i.lower().split() for i in corpus])))
for i in corpus:
vectorized_corpus.append((i, vectorize(i, vocab)))
return vectorized_corpus, vocab
def create_test_corpus():
sent1 = "this is a foo bar"
sent2 = "foo bar bar black sheep"
sent3 = "this is a sentence"
all_sents = [sent1,sent2,sent3]
corpus, vocab = corpus2vectors(all_sents)
return corpus, vocab
def test_cosine():
corpus, vocab = create_test_corpus()
for sentx, senty in product(corpus, corpus):
print sentx[0]
print senty[0]
print "cosine =", cosine_sim(sentx[1], senty[1])
print
def test_ngrams():
corpus, vocab = create_test_corpus()
for sentx in corpus:
print sentx[0]
print ngrams(sentx[0],2)
print ngrams(sentx[0],3)
print
def test_tfidf():
corpus, vocab = create_test_corpus()
print corpus
print vocab
print tfidf(corpus, vocab)
print "Testing cosine..."
test_cosine()
print
print "Testing ngrams..."
test_ngrams()
print
print "Testing tfidf..."
test_tfidf()
print
[out]:
Testing cosine...
this is a foo bar
this is a foo bar
cosine = 1.0
this is a foo bar
foo bar bar black sheep
cosine = 0.507092552837
this is a foo bar
this is a sentence
cosine = 0.67082039325
foo bar bar black sheep
this is a foo bar
cosine = 0.507092552837
foo bar bar black sheep
foo bar bar black sheep
cosine = 1.0
foo bar bar black sheep
this is a sentence
cosine = 0.0
this is a sentence
this is a foo bar
cosine = 0.67082039325
this is a sentence
foo bar bar black sheep
cosine = 0.0
this is a sentence
this is a sentence
cosine = 1.0
Testing ngrams...
this is a foo bar
[('this', 'is'), ('is', 'a'), ('a', 'foo'), ('foo', 'bar')]
[('this', 'is', 'a'), ('is', 'a', 'foo'), ('a', 'foo', 'bar')]
foo bar bar black sheep
[('foo', 'bar'), ('bar', 'bar'), ('bar', 'black'), ('black', 'sheep')]
[('foo', 'bar', 'bar'), ('bar', 'bar', 'black'), ('bar', 'black', 'sheep')]
this is a sentence
[('this', 'is'), ('is', 'a'), ('a', 'sentence')]
[('this', 'is', 'a'), ('is', 'a', 'sentence')]
Testing tfidf...
[('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]), ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]), ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
['a', 'bar', 'black', 'foo', 'is', 'sentence', 'sheep', 'this']
[[0.30000000000000004, 0.30000000000000004, 0.0, 0.30000000000000004, 0.30000000000000004, 0.0, 0.0, 0.30000000000000004], [0.0, 0.6000000000000001, 0.6000000000000001, 0.30000000000000004, 0.0, 0.0, 0.6000000000000001, 0.0], [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
In case you're still interested in this problem, I've done something very similar using Lucene Java and Jython. Here's some snippets from my code.
Lucene preprocesses documents and queries using so-called analyzers. This one uses Lucene's built-in n-gram filter:
class NGramAnalyzer(Analyzer):
'''Analyzer that yields n-grams for minlength <= n <= maxlength'''
def __init__(self, minlength, maxlength):
self.minlength = minlength
self.maxlength = maxlength
def tokenStream(self, field, reader):
lower = ASCIIFoldingFilter(LowerCaseTokenizer(reader))
return NGramTokenFilter(lower, self.minlength, self.maxlength)
To turn a list of ngrams
into a Document
:
doc = Document()
doc.add(Field('n-grams', ' '.join(ngrams),
Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
To store a document in an index:
wr = IndexWriter(index_dir, NGramAnalyzer(), True,
IndexWriter.MaxFieldLength.LIMITED)
wr.addDocument(doc)
Building queries is a little bit more difficult as Lucene's QueryParser
expects a query language with special operators, quotes, etc., but it can be circumvented (as partly explained here).