jieba分词/jieba-analysis(java版)

北战南征 提交于 2019-11-28 15:45:05

简介

支持分词模式
Search模式,用于对用户查询词分词
Index模式,用于对索引文档分词
特性
支持多种分词模式
全角统一转成半角
用户词典功能
conf 目录有整理的搜狗细胞词库
出于性能考虑,最新的快照版本去除了词性标注功能;欢迎通过 Pull Request 以更好的方式重新提供该功能。

简单使用

获取jieba-analysis

<dependency>
  <groupId>com.huaban</groupId>
  <artifactId>jieba-analysis</artifactId>
  <version>1.0.2</version>
</dependency>

案例

复制代码
/**
 * Runs the segmenter in INDEX mode over a fixed set of sample sentences and
 * prints the resulting token list of each sentence on its own line.
 */
@Test
public void testDemo() {
    JiebaSegmenter jieba = new JiebaSegmenter();
    String[] samples = {
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
        "我不喜欢日本和服。",
        "雷猴回归人间。",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "结果婚的和尚未结过婚的"
    };
    // Walk the samples in declaration order so the output order is unchanged.
    for (int i = 0; i < samples.length; i++) {
        String rendered = jieba.process(samples[i], SegMode.INDEX).toString();
        System.out.println(rendered);
    }
}
复制代码

原文链接:https://github.com/huaban/jieba-analysis

我的应用

复制代码
package com.analysis;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.List;
import java.util.UUID;

import org.junit.Before;
import org.junit.Test;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

/**
 * JUnit demo that stores jieba segmentation tokens in the MySQL table
 * {@code t_jieba} and looks them up again.
 *
 * <p>Every statement and result set is scoped with try-with-resources so it is
 * released even when a JDBC call throws.
 *
 * <p>NOTE(review): the connection opened in {@link #beforeDemo()} is not closed
 * by this class; consider adding an {@code @After} teardown that calls
 * {@code con.close()}.
 */
public class jiebaTest {

    private Connection con = null;

    /**
     * Opens the JDBC connection used by the test methods.
     *
     * @throws Exception if the driver class cannot be loaded or the connection fails
     */
    @Before
    public void beforeDemo() throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        String url = "jdbc:mysql://localhost:3306/test?user=root&password=root";
        con = DriverManager.getConnection(url);
    }

    /**
     * Token lookup test: reads one line from standard input, prints every
     * {@code t_jieba} row whose {@code name} equals that line, and writes back
     * the row's fifth column plus one as its {@code times} value.
     *
     * <p>Columns are read by position because the query is {@code select *};
     * presumably column 1 is {@code id} and column 5 is {@code times} — verify
     * against the table definition.
     *
     * @throws Exception if reading the input or any JDBC call fails, or if the
     *         fifth column is not a parsable integer
     */
    @Test
    public void getDemo() throws Exception {
        // System.in is intentionally not closed: it is owned by the JVM, not by this test.
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String str = br.readLine();

        String selectSql = "select * from t_jieba where name = ?";
        String updateSql = "update t_jieba set times = ? where id = ?";

        // Separate statements for the query and the update: the update is prepared
        // once and reused per row, and neither statement is lost by reassignment
        // while the result set is still being iterated.
        try (PreparedStatement select = con.prepareStatement(selectSql);
             PreparedStatement update = con.prepareStatement(updateSql)) {
            select.setString(1, str);
            try (ResultSet rs = select.executeQuery()) {
                while (rs.next()) {
                    // Read each column exactly once, in positional order.
                    int id = rs.getInt(1);
                    String col2 = rs.getString(2);
                    String col3 = rs.getString(3);
                    String col4 = rs.getString(4);
                    String times = rs.getString(5);
                    System.out.println(id + "--" + col2 + "--" + col3 + "--" + col4 + "--" + times);

                    update.setInt(1, 1 + Integer.parseInt(times));
                    update.setInt(2, id);
                    update.executeUpdate();
                }
            }
        }
    }

    /**
     * Token insert test: segments a fixed list of book titles in INDEX mode and
     * inserts each non-blank token into {@code t_jieba}, skipping tokens whose
     * {@code name} already exists (enforced by the {@code not EXISTS} sub-query).
     *
     * <p>All tokens of one sentence share a freshly generated, dash-free UUID as
     * their {@code cid}; the sentence itself is stored as {@code c_name} and
     * {@code times} starts at {@code "0"}.
     *
     * @throws Exception if any JDBC call fails
     */
    @Test
    public void addDemo() throws Exception {
        String sql = "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? from DUAL where not EXISTS(select name from t_jieba where name=?)";
        try (PreparedStatement insert = con.prepareStatement(sql)) {
            JiebaSegmenter segmenter = new JiebaSegmenter();
            String[] sentences = new String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃" };
            for (String sentence : sentences) {
                String uuid = UUID.randomUUID().toString().replace("-", "");
                List<SegToken> tokens = segmenter.process(sentence, SegMode.INDEX);
                for (SegToken segToken : tokens) {
                    // Guard before trimming so a null word is skipped instead of throwing.
                    if (segToken.word == null) {
                        continue;
                    }
                    String name = segToken.word.trim();
                    if (name.isEmpty()) {
                        continue;
                    }
                    // Store the trimmed token: it is the value that was validated above,
                    // and it keeps the duplicate check consistent with what is inserted.
                    insert.setString(1, name);
                    insert.setString(2, uuid);
                    insert.setString(3, sentence);
                    insert.setString(4, "0");
                    insert.setString(5, name);
                    insert.executeUpdate();
                    insert.clearParameters();
                }
            }
        }
        System.out.println("插入成功!");
    }

}
复制代码
原文地址:https://www.cnblogs.com/bky-lzw/p/7799238.html

简介

支持分词模式
Search模式,用于对用户查询词分词
Index模式,用于对索引文档分词
特性
支持多种分词模式
全角统一转成半角
用户词典功能
conf 目录有整理的搜狗细胞词库
出于性能考虑,最新的快照版本去除了词性标注功能;欢迎通过 Pull Request 以更好的方式重新提供该功能。

简单使用

获取jieba-analysis

<dependency>
  <groupId>com.huaban</groupId>
  <artifactId>jieba-analysis</artifactId>
  <version>1.0.2</version>
</dependency>

案例

复制代码
/**
 * Runs the segmenter in INDEX mode over a fixed set of sample sentences and
 * prints the resulting token list of each sentence on its own line.
 */
@Test
public void testDemo() {
    JiebaSegmenter jieba = new JiebaSegmenter();
    String[] samples = {
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
        "我不喜欢日本和服。",
        "雷猴回归人间。",
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "结果婚的和尚未结过婚的"
    };
    // Walk the samples in declaration order so the output order is unchanged.
    for (int i = 0; i < samples.length; i++) {
        String rendered = jieba.process(samples[i], SegMode.INDEX).toString();
        System.out.println(rendered);
    }
}
复制代码

原文链接:https://github.com/huaban/jieba-analysis

我的应用

复制代码
package com.analysis;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.List;
import java.util.UUID;

import org.junit.Before;
import org.junit.Test;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

/**
 * JUnit demo that stores jieba segmentation tokens in the MySQL table
 * {@code t_jieba} and looks them up again.
 *
 * <p>Every statement and result set is scoped with try-with-resources so it is
 * released even when a JDBC call throws.
 *
 * <p>NOTE(review): the connection opened in {@link #beforeDemo()} is not closed
 * by this class; consider adding an {@code @After} teardown that calls
 * {@code con.close()}.
 */
public class jiebaTest {

    private Connection con = null;

    /**
     * Opens the JDBC connection used by the test methods.
     *
     * @throws Exception if the driver class cannot be loaded or the connection fails
     */
    @Before
    public void beforeDemo() throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        String url = "jdbc:mysql://localhost:3306/test?user=root&password=root";
        con = DriverManager.getConnection(url);
    }

    /**
     * Token lookup test: reads one line from standard input, prints every
     * {@code t_jieba} row whose {@code name} equals that line, and writes back
     * the row's fifth column plus one as its {@code times} value.
     *
     * <p>Columns are read by position because the query is {@code select *};
     * presumably column 1 is {@code id} and column 5 is {@code times} — verify
     * against the table definition.
     *
     * @throws Exception if reading the input or any JDBC call fails, or if the
     *         fifth column is not a parsable integer
     */
    @Test
    public void getDemo() throws Exception {
        // System.in is intentionally not closed: it is owned by the JVM, not by this test.
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        String str = br.readLine();

        String selectSql = "select * from t_jieba where name = ?";
        String updateSql = "update t_jieba set times = ? where id = ?";

        // Separate statements for the query and the update: the update is prepared
        // once and reused per row, and neither statement is lost by reassignment
        // while the result set is still being iterated.
        try (PreparedStatement select = con.prepareStatement(selectSql);
             PreparedStatement update = con.prepareStatement(updateSql)) {
            select.setString(1, str);
            try (ResultSet rs = select.executeQuery()) {
                while (rs.next()) {
                    // Read each column exactly once, in positional order.
                    int id = rs.getInt(1);
                    String col2 = rs.getString(2);
                    String col3 = rs.getString(3);
                    String col4 = rs.getString(4);
                    String times = rs.getString(5);
                    System.out.println(id + "--" + col2 + "--" + col3 + "--" + col4 + "--" + times);

                    update.setInt(1, 1 + Integer.parseInt(times));
                    update.setInt(2, id);
                    update.executeUpdate();
                }
            }
        }
    }

    /**
     * Token insert test: segments a fixed list of book titles in INDEX mode and
     * inserts each non-blank token into {@code t_jieba}, skipping tokens whose
     * {@code name} already exists (enforced by the {@code not EXISTS} sub-query).
     *
     * <p>All tokens of one sentence share a freshly generated, dash-free UUID as
     * their {@code cid}; the sentence itself is stored as {@code c_name} and
     * {@code times} starts at {@code "0"}.
     *
     * @throws Exception if any JDBC call fails
     */
    @Test
    public void addDemo() throws Exception {
        String sql = "insert into t_jieba (name,cid,c_name,times) select ?,?,?,? from DUAL where not EXISTS(select name from t_jieba where name=?)";
        try (PreparedStatement insert = con.prepareStatement(sql)) {
            JiebaSegmenter segmenter = new JiebaSegmenter();
            String[] sentences = new String[] { "大话数据结构", "深入浅出设计模式", "JavaEE开发的颠覆者: Spring Boot实战", "java从入门到放弃" };
            for (String sentence : sentences) {
                String uuid = UUID.randomUUID().toString().replace("-", "");
                List<SegToken> tokens = segmenter.process(sentence, SegMode.INDEX);
                for (SegToken segToken : tokens) {
                    // Guard before trimming so a null word is skipped instead of throwing.
                    if (segToken.word == null) {
                        continue;
                    }
                    String name = segToken.word.trim();
                    if (name.isEmpty()) {
                        continue;
                    }
                    // Store the trimmed token: it is the value that was validated above,
                    // and it keeps the duplicate check consistent with what is inserted.
                    insert.setString(1, name);
                    insert.setString(2, uuid);
                    insert.setString(3, sentence);
                    insert.setString(4, "0");
                    insert.setString(5, name);
                    insert.executeUpdate();
                    insert.clearParameters();
                }
            }
        }
        System.out.println("插入成功!");
    }

}
复制代码
原文地址:https://www.cnblogs.com/bky-lzw/p/7799238.html
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!