package NaiveBayesClassify;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class ChineseSpliter {
    /**
     * Performs Chinese word segmentation on the given text.
     *
     * @param text
     *            the text to segment
     * @param splitToken
     *            the separator to insert between tokens, e.g. "|"
     * @return the segmented text
     * @throws IOException
     */
    public static String split(String text, String splitToken) throws IOException {
        StringBuilder result = new StringBuilder();
        // Create the segmenter; "true" enables IK's smart (coarse-grained) mode
        try (Analyzer analyzer = new IKAnalyzer(true);
                TokenStream ts = analyzer.tokenStream("", new StringReader(text))) {
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            // The stream must be reset before the first incrementToken() call
            ts.reset();
            // Iterate over the tokens, joining them with the separator
            while (ts.incrementToken()) {
                if (result.length() > 0) {
                    result.append(splitToken);
                }
                result.append(term.toString());
            }
            ts.end();
        }
        return result.toString();
    }
}
Source: oschina
Link: https://my.oschina.net/u/2510243/blog/637150