1. Read the documents
2. Open each of the documents whose similarity will be computed
3. Put the documents into a common format to make the later computation easier
4. Count word frequencies
5. Filter out low-frequency words
6. Build a dictionary from the corpus
7. Load the document to compare against
8. Convert the comparison document into a sparse vector with doc2bow
9. Process the sparse vectors further to obtain a new corpus
10. Run the new corpus through TfidfModel to get tf-idf weights
11. Get the number of features from token2id
12. Build a sparse-matrix similarity index
13. Obtain the final similarity scores
from gensim import corpora, models, similarities
import jieba
from collections import defaultdict
dog = "C:/Users/xt/PycharmProjects/similar/dog.txt" # 1.读取文档
cat = "C:/Users/xt/PycharmProjects/similar/cat.txt"
d1 = open(dog).read() # 2.对要计算的多篇文档进行访问
c1 = open(cat).read()
data1 = jieba.cut(d1) # 3.将文档整理成指定格式,方便后续进行计算
data2 = jieba.cut(c1)
data11 = ''
for item in data1:
    data11 += item + ' '
data21 = ''
for item in data2:
    data21 += item + ' '
doc = [data11, data21]
print('doc:')
print(doc)
texts = [[word for word in do.split()] for do in doc]
print('texts:')
print(texts)
freq = defaultdict(int)  # 4. Count how often each word occurs
for text in texts:
    for token in text:
        freq[token] += 1
print('freq:')
print(freq)
texts = [[token for token in text if freq[token] > 1] for text in texts]  # 5. Filter out low-frequency words (keep only tokens seen more than once)
print('texts:')
print(texts)
diction = corpora.Dictionary(texts)  # 6. Build a dictionary from the corpus
print('diction:')
print(diction)
diction.save('C:/Users/xt/PycharmProjects/similar/dict.dict')
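# Note: Dictionary maps each unique token to an integer id (diction.token2id);
# the saved .dict file can be reloaded later with corpora.Dictionary.load().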
doc3 = "C:/Users/xt/PycharmProjects/similar/animal.txt" # 7.加载要对比的文档
d3 = open(doc3).read()
data3 = jieba.cut(d3)
data31 = ""
for item in data3:
    data31 += item + ' '
new_doc = data31
print('new_doc:')
print(new_doc)
new_vec = diction.doc2bow(new_doc.split())  # 8. Convert the comparison document into a sparse vector with doc2bow
print('new_vec:')
print(new_vec)
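# Note: doc2bow returns a list of (token_id, count) pairs; tokens that are not
# already in the dictionary (words unique to animal.txt) are simply ignored.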
corpus = [diction.doc2bow(text) for text in texts]  # 9. Convert the filtered texts into bag-of-words vectors to form the corpus
print('corpus:')
print(corpus)
tf_idf = models.TfidfModel(corpus)  # 10. Run the corpus through TfidfModel to get tf-idf weights
featureNum = len(diction.token2id.keys())  # 11. Number of features, taken from token2id
index = similarities.SparseMatrixSimilarity(tf_idf[corpus], num_features=featureNum)  # 12. Build the sparse-matrix similarity index
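# Note: TfidfModel only collects document-frequency statistics here; the tf-idf
# weighting is applied when a bag-of-words vector is passed through it, e.g. tf_idf[new_vec].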
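# Note: SparseMatrixSimilarity stores the tf-idf corpus as a sparse matrix and
# scores queries against it by cosine similarity; num_features must match the
# dictionary size.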
print('index:')
print(index)
sim = index[tf_idf[new_vec]]  # 13. Query the index to get the final similarity scores
print('sim:')
print(sim)
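The resulting sim array holds one cosine-similarity score per corpus document, in the same order as corpus (sim[0] for dog.txt, sim[1] for cat.txt). As a minimal sketch, assuming the variables defined above, the scores can be paired with the file names and ranked:

doc_names = ['dog.txt', 'cat.txt']  # order matches the doc / corpus lists built above
for doc_id, score in sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True):
    print(doc_names[doc_id], score)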
Source: CSDN
Author: 键盘侠Hyatt
Link: https://blog.csdn.net/qq_18149105/article/details/104594031