听闻SimHash很强,对海量文档相似度的计算有很高的效率。查了查文档,大致的流程如下:
大致流程就是:分词,配合词频计算哈希串(每个分出来的词最终都会计算出同样长度的哈希串),降维,计算海明距离。
#coding:utf8
import math
import jieba
import jieba.analyse
class SimHash(object):
    """Compute 64-bit SimHash fingerprints of text and compare them.

    A fingerprint is a string of ``"0"``/``"1"`` characters. Keywords are
    extracted with jieba, each keyword is hashed to a fixed-width bit string,
    the bits are summed column by column as signed weighted votes, and the
    sign of every column total becomes one bit of the fingerprint.
    """

    # Width, in bits, of every keyword hash and of the final fingerprint.
    _HASH_BITS = 64

    def __init__(self):
        pass

    def getBinStr(self, source):
        """Hash ``source`` into a 64-character binary string.

        Args:
            source: The keyword to hash.

        Returns:
            A string of exactly 64 ``"0"``/``"1"`` characters. The empty
            string hashes to all zeros so that the return type is always a
            bit string.
        """
        bits = self._HASH_BITS
        if source == "":
            return "0" * bits
        # Multiplication and XOR never carry information from high bits into
        # low bits, so masking to the output width at every step yields the
        # same low 64 bits as a wider intermediate mask would.
        mask = (1 << bits) - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * 1000003) ^ ord(c)) & mask
        x ^= len(source)
        return format(x & mask, "0{}b".format(bits))

    def getWeight(self, source):
        """Return a placeholder weight derived from the characters of ``source``.

        Args:
            source: A keyword of any length.

        Returns:
            The code point for a single-character keyword, otherwise the sum
            of the code points of all characters (``0`` for an empty string).
        """
        if len(source) == 1:
            return ord(source)
        return sum(ord(ch) for ch in source)

    def unwrap_weight(self, arr):
        """Turn a sequence of numeric votes into a bit string.

        Args:
            arr: An iterable whose items are accepted by ``int()``.

        Returns:
            A string with ``"1"`` for every item greater than zero and
            ``"0"`` for every other item.
        """
        return "".join("1" if int(item) > 0 else "0" for item in arr)

    def simHash(self, rawstr):
        """Compute the SimHash fingerprint of ``rawstr``.

        Args:
            rawstr: The text to fingerprint.

        Returns:
            A 64-character string of ``"0"``/``"1"``. Empty text, or text
            from which no keyword is extracted, yields all zeros.
        """
        if not rawstr:
            return "0" * self._HASH_BITS
        seg = jieba.cut(rawstr, cut_all=True)
        keywords = jieba.analyse.extract_tags("|".join(seg), topK=100, withWeight=True)
        return self._fingerprint(keywords)

    def _fingerprint(self, keywords):
        """Fold ``(keyword, weight)`` pairs into one fingerprint bit string."""
        totals = [0] * self._HASH_BITS
        for keyword, weight in keywords:
            # NOTE(review): math.ceil rounds fractional weights up to whole
            # numbers, so every weight in (0, 1] votes with strength 1. This
            # is kept as is so that fingerprints stay unchanged.
            vote = int(math.ceil(weight))
            for position, bit in enumerate(self.getBinStr(keyword)):
                totals[position] += vote if bit == "1" else -vote
        # "Dimension reduction": a column becomes 1 only when its total is
        # strictly positive.
        return "".join("1" if total > 0 else "0" for total in totals)

    def getDistince(self, hashstr1, hashstr2):
        """Return the Hamming distance between two bit strings.

        Args:
            hashstr1: The first fingerprint.
            hashstr2: The second fingerprint.

        Returns:
            The number of positions at which the strings differ. If the
            lengths differ, every position present in only one of the strings
            counts as a difference, so the result does not depend on the
            argument order.
        """
        mismatches = sum(a != b for a, b in zip(hashstr1, hashstr2))
        return mismatches + abs(len(hashstr1) - len(hashstr2))
if __name__ == "__main__":
    # Compare the documents stored in a.txt and b.txt and report whether
    # their fingerprints are close enough to be considered similar.
    # Hamming distances at or below this value are reported as similar.
    threshold = 5
    simhash = SimHash()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding; the `with` statement closes each file.
    with open("a.txt", "r", encoding="utf-8") as doc:
        s1 = doc.read()
    with open("b.txt", "r", encoding="utf-8") as doc:
        s2 = doc.read()
    hash1 = simhash.simHash(s1)
    hash2 = simhash.simHash(s2)
    distance = simhash.getDistince(hash1, hash2)
    print("海明距离:", distance, "判定距离:", threshold, "是否相似:", distance <= threshold)
经计算发现,该方法对大文本的相似度判定比较可靠;对短文本的相似度计算则略有偏差,得到的海明距离不够准确。
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/d0/d4zzr4n51m7_vj9ryfb633pc0000gn/T/jieba.cache
Loading model cost 0.764 seconds.
Prefix dict has been built succesfully.
海明距离: 1 判定距离: 5 是否相似: True
参考链接:
来源:CSDN
作者:郭 璞
链接:https://blog.csdn.net/Marksinoberg/article/details/82559134