Simhash 的完整实现

package com.mifan.wxrank.distinct;

import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NLPTokenizer;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

/**
 * SimHash fingerprint of a piece of (possibly HTML) text.
 *
 * <p>The text is stripped of HTML tags and whitespace noise, segmented with HanLP,
 * filtered through {@code CommonUtil.filterStopWords}, and every remaining term votes
 * on each bit of the fingerprint with a weight that depends on its part of speech.
 *
 * <p>Typical usage:
 * <pre>{@code
 * Simhash a = new Simhash(textA, 64);
 * Simhash b = new Simhash(textB, 64);
 * int distance = a.hammingDistance(b);
 * List<BigInteger> groups = a.subByDistance(b, 3);
 * }</pre>
 *
 * Created by LiuKai on 2018/9/17.
 */
public class Simhash {
    /**
     * Literal character sequences removed from the text before segmentation. The
     * backslash entries are the two-character escape sequences as they appear in
     * escaped text (a backslash followed by r, n or t), not control characters.
     */
    private static final String[] NOISE_SEQUENCES =
            {" ", "\n", "\r", "\t", "\\r", "\\n", "\\t", "&nbsp;"};

    /** Voting weight per part-of-speech tag; tags that are absent weigh 1. */
    private static final Map<String, Integer> WEIGHT_OF_NATURE = new HashMap<String, Integer>();

    static {
        WEIGHT_OF_NATURE.put("n", 2); // nouns count double
    }

    /** Input text; replaced by its cleaned form once {@link #simHash()} has run. */
    private String tokens;
    /** Fingerprint as an integer; {@code null} until {@link #simHash()} has run. */
    public BigInteger intSimHash;
    /**
     * Fingerprint as a string of '0'/'1' characters. The character at index {@code i}
     * is bit {@code i} of {@link #intSimHash}, so the least significant bit comes first.
     */
    public String strSimHash;
    /** Number of bits in the fingerprint. */
    private int hashbits = 64;

    /**
     * Creates a 64-bit instance without computing the fingerprint yet.
     *
     * @param tokens the text to fingerprint
     */
    public Simhash(String tokens) {
        this.tokens = tokens;
    }

    /**
     * Creates an instance without computing the fingerprint yet.
     *
     * @param hashbits number of fingerprint bits, must be positive
     * @param tokens   the text to fingerprint
     * @throws IllegalArgumentException if {@code hashbits} is not positive
     */
    public Simhash(int hashbits, String tokens) {
        this.hashbits = requirePositive(hashbits);
        this.tokens = tokens;
    }

    /**
     * Creates an instance and computes the fingerprint immediately.
     *
     * @param tokens   the text to fingerprint
     * @param hashbits number of fingerprint bits, must be positive
     * @throws IllegalArgumentException if {@code hashbits} is not positive
     */
    public Simhash(String tokens, int hashbits) {
        this.tokens = tokens;
        this.hashbits = requirePositive(hashbits);
        this.intSimHash = this.simHash();
    }

    /** Rejects fingerprint widths that cannot produce a usable bit vector. */
    private static int requirePositive(int hashbits) {
        if (hashbits <= 0) {
            throw new IllegalArgumentException("hashbits must be positive, got " + hashbits);
        }
        return hashbits;
    }

    /**
     * Strips HTML tags, lower-cases the text and removes whitespace noise.
     *
     * @param content raw text, possibly HTML; {@code null} is treated as empty
     * @return the cleaned text, never {@code null}
     */
    private String cleanResume(String content) {
        if (content == null) {
            return "";
        }
        // With an empty whitelist every HTML tag is dropped and only text remains.
        content = Jsoup.clean(content, Whitelist.none());
        content = StringUtils.lowerCase(content);
        for (String noise : NOISE_SEQUENCES) {
            // Literal replacement on purpose: with a regex, "\\r" would match a carriage
            // return again instead of the two-character sequence backslash + 'r'.
            content = content.replace(noise, "");
        }
        return content;
    }

    /**
     * Computes the fingerprint of the text and stores it in {@link #intSimHash} and
     * {@link #strSimHash}.
     *
     * @return the fingerprint, a non-negative integer below {@code 2^hashbits}
     */
    public BigInteger simHash() {
        tokens = cleanResume(tokens);
        int[] v = new int[this.hashbits]; // per-bit vote accumulator
        List<Term> termList = NLPTokenizer.segment(this.tokens);      // 1. segment with HanLP
        List<Term> stringList = CommonUtil.filterStopWords(termList); // 2. drop stop words
        for (Term term : stringList) {
            // 3. hash each term once, then let it vote on every bit: a set bit adds
            //    the term's weight, a clear bit subtracts it.
            Integer configured = WEIGHT_OF_NATURE.get(term.nature.toString());
            int weight = configured != null ? configured : 1;
            BigInteger wordHash = this.hash(term.word);
            for (int i = 0; i < this.hashbits; i++) {
                if (wordHash.testBit(i)) {
                    v[i] += weight;
                } else {
                    v[i] -= weight;
                }
            }
        }
        BigInteger fingerprint = BigInteger.ZERO;
        StringBuilder simHashBuffer = new StringBuilder(this.hashbits);
        for (int i = 0; i < this.hashbits; i++) {
            // 4. A non-negative tally yields a 1 bit, a negative tally a 0 bit.
            // NOTE(review): a tie (tally == 0) counts as 1, so text with no terms gives an
            // all-ones fingerprint. Kept as is so existing fingerprints stay comparable.
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
                simHashBuffer.append("1");
            } else {
                simHashBuffer.append("0");
            }
        }
        this.strSimHash = simHashBuffer.toString();
        this.intSimHash = fingerprint;
        System.out.println(this.strSimHash + " length " + this.strSimHash.length());
        return fingerprint;
    }

    /**
     * Returns the fingerprint of {@code target}, computing it first when it has not
     * been computed yet.
     */
    private static BigInteger fingerprintOf(Simhash target) {
        if (target.intSimHash == null) {
            target.intSimHash = target.simHash();
        }
        return target.intSimHash;
    }

    /**
     * Counts the bits in which this fingerprint differs from another one. A
     * fingerprint that has not been computed yet is computed on demand.
     *
     * @param other the instance to compare with, must not be {@code null}
     * @return the number of differing bits
     * @throws NullPointerException if {@code other} is {@code null}
     */
    public int hammingDistance(Simhash other) {
        if (other == null) {
            throw new NullPointerException("other");
        }
        return hammingDistance(fingerprintOf(this), fingerprintOf(other));
    }

    /**
     * Counts the bits in which two fingerprints differ.
     *
     * @param a first fingerprint
     * @param b second fingerprint
     * @return the number of set bits in {@code a XOR b}
     * @throws IllegalArgumentException if the operands have different signs, because
     *         their two's-complement representations then differ in infinitely many bits
     */
    public static int hammingDistance(BigInteger a, BigInteger b) {
        BigInteger diff = a.xor(b);
        if (diff.signum() < 0) {
            throw new IllegalArgumentException(
                    "fingerprints must have the same sign, got " + a + " and " + b);
        }
        return diff.bitCount();
    }

    /**
     * Splits the fingerprint of {@code simHash} into consecutive groups of
     * {@code hashbits / (distance + 1)} bits, starting at bit 0. Bits left over after
     * the last complete group are ignored.
     *
     * <p>Each group is rendered with its lowest bit as the first binary digit, so the
     * returned numbers are the bit-reversed group values. They are only meaningful when
     * compared with groups produced by this same method.
     *
     * @param simHash  the instance whose fingerprint is split
     * @param distance the Hamming distance the grouping is meant for; must satisfy
     *                 {@code 0 <= distance < hashbits}
     * @return the group values in order of increasing bit position
     * @throws IllegalArgumentException if {@code distance} is out of range
     */
    public List<BigInteger> subByDistance(Simhash simHash, int distance) {
        if (distance < 0 || distance >= this.hashbits) {
            throw new IllegalArgumentException(
                    "distance must be in [0, " + this.hashbits + "), got " + distance);
        }
        int numEach = this.hashbits / (distance + 1); // bits per group, at least 1
        BigInteger source = fingerprintOf(simHash);
        List<BigInteger> characters = new ArrayList<BigInteger>();
        StringBuilder buffer = new StringBuilder(numEach);

        // Walk the full fingerprint width: bitLength() would skip leading zero bits
        // and could silently drop the last groups.
        for (int i = 0; i < this.hashbits; i++) {
            buffer.append(source.testBit(i) ? '1' : '0');
            if ((i + 1) % numEach == 0) {
                BigInteger eachValue = new BigInteger(buffer.toString(), 2);
                System.out.println("----" + eachValue);
                buffer.setLength(0);
                characters.add(eachValue);
            }
        }
        return characters;
    }

    /**
     * Hashes one term to a non-negative integer: a multiply-and-xor rolling hash over
     * the term's UTF-16 code units, masked to {@code hashbits} bits on every step and
     * finally xor-ed with the term length.
     *
     * @param source the term to hash
     * @return the hash, or zero for a {@code null} or empty term
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        BigInteger multiplier = BigInteger.valueOf(1000003L);
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (int i = 0; i < source.length(); i++) {
            x = x.multiply(multiplier).xor(BigInteger.valueOf((long) source.charAt(i))).and(mask);
        }
        // Every operand above is non-negative, so the result can never be -1 and needs
        // no special-casing.
        return x.xor(BigInteger.valueOf((long) source.length()));
    }

}
使用 HanLP 进行分词并过滤停用词；同时移除了 HTML 标签，并按词性对词语进行加权处理（目前名词的权重为 2，其余词性为 1）。
网站服务于：服务网站参考工作网站
来源：CSDN
作者：庸医2048
链接：https://blog.csdn.net/qq_20120669/article/details/82866182
标签
string
SimHash