现在基本所有的网站都带有敏感词过滤功能。最近开发项目时正好需要实现这一功能,参考了很多博客之后,在此做一个总结,并附上自己编写的代码。
一、构建敏感词库
读取文件数据,并保存到HashMap中,构建一个DFA模型(字典树)
public class SensitiveWordInit {
public static Map sensitiveWordMap = null;
/**
* 加载敏感词库
* @return
* @throws Exception
*/
public Set<String> LoadSetitiveWord() throws Exception{
InputStream inputStream = null;
InputStreamReader inputStreamReader = null;
BufferedReader bufferedReader = null;
Set<String> set = new HashSet<String>();
try{
inputStream = getClass().getClassLoader().getResourceAsStream("CensorWords.txt");
inputStreamReader = new InputStreamReader(inputStream,"UTF-8");
bufferedReader = new BufferedReader(inputStreamReader);
String str = null;
while((str=bufferedReader.readLine())!=null){
set.add(str);
}
}catch (Exception e){
e.printStackTrace();
}finally {
if(bufferedReader!=null){
bufferedReader.close();
}
if(inputStreamReader!=null){
inputStreamReader.close();
}
if(inputStream!=null){
inputStream.close();
}
}
return set;
}
/**
* 把敏感词加入到HashMap
* @param keyWordSet
* @return
*/
public Map addSensitiveWordToHashMap(Set<String> keyWordSet){
sensitiveWordMap = new HashMap(keyWordSet.size());
Map nowMap = null;
Iterator iterator = keyWordSet.iterator();
while(iterator.hasNext()){
String str = (String) iterator.next();
nowMap = sensitiveWordMap;
for(int i = 0;i<str.length();i++){
char word = str.charAt(i);
Object wordMap = sensitiveWordMap.get(word);
if(wordMap!=null){
nowMap = (Map) wordMap;
}else{
Map<String,String> newMap = new HashMap<String,String>();
newMap.put("isEnd","0");
nowMap.put(word,newMap);
nowMap = newMap;
}
if(i==str.length()-1){
nowMap.put("isEnd","1");
}
}
}
return sensitiveWordMap;
}
}
二、敏感词的工具类
敏感词库构建好之后,利用java提供的replaceAll方法实现敏感词的替换。所以要先实现一个获取文本中敏感词集合的方法,如下:
/**
 * Detects and masks sensitive words in text, using the trie published in
 * {@code SensitiveWordInit.sensitiveWordMap}.
 *
 * <p>Trie nodes are {@code Map}s: a {@code Character} key leads to the child node
 * for that character and the {@code String} key {@code "isEnd"} holds {@code "1"}
 * on nodes where a word ends.
 */
public class SensitiveWordUtils {
    /**
     * Root of the dictionary used by the most recent lookup. It is refreshed from
     * {@code SensitiveWordInit.sensitiveWordMap} on every call to
     * {@link #checkSensitiveWordSum(String, int, int)}.
     */
    public static Map sensitiveWordMap = null;
    /**
     * Match type: stop at the first (shortest) dictionary word found.
     */
    public static int minMatchType = 1;
    /**
     * Match type: keep going and report the longest dictionary word found.
     */
    public static int maxMatchType = 2;

    /**
     * Returns the length of the sensitive word that starts at {@code beginIndex}.
     *
     * <p>With {@link #minMatchType} the scan stops at the first word end it meets;
     * with any other value it continues and reports the longest word. Matches
     * shorter than two characters are deliberately not reported.
     *
     * @param txt        the text to scan; {@code null} yields 0
     * @param beginIndex index of the first character to examine
     * @param matchType  {@link #minMatchType} or {@link #maxMatchType}
     * @return the matched length (at least 2), or 0 when no word starts here
     * @throws IllegalStateException if the dictionary has not been built yet
     */
    public static int checkSensitiveWordSum(String txt, int beginIndex, int matchType) throws Exception{
        Map root = SensitiveWordInit.sensitiveWordMap;
        if (root == null) {
            throw new IllegalStateException(
                    "Sensitive word dictionary is not initialised; build it with SensitiveWordInit first");
        }
        // Keep the field pointing at the root; the walk below uses a local cursor so
        // the shared static is never left null or pointing at a sub-node.
        sensitiveWordMap = root;
        if (txt == null) {
            return 0;
        }
        Map node = root;
        int matchedLength = 0; // length up to the last node marked as a word end
        for (int i = beginIndex; i < txt.length(); i++) {
            node = (Map) node.get(txt.charAt(i));
            if (node == null) {
                break;
            }
            if ("1".equals(node.get("isEnd"))) {
                // Record the length only at real word ends, so trailing characters
                // that merely prefix a longer word are not counted.
                matchedLength = i - beginIndex + 1;
                if (matchType == minMatchType) {
                    break;
                }
            }
        }
        return matchedLength < 2 ? 0 : matchedLength;
    }

    /**
     * Collects the distinct sensitive words found in {@code txt}, scanning left to
     * right and skipping past each match.
     *
     * @param txt       the text to scan; {@code null} yields an empty set
     * @param matchType {@link #minMatchType} or {@link #maxMatchType}
     * @return the sensitive words found; never {@code null}
     */
    public static Set<String> getSensitiveWord(String txt,int matchType) throws Exception{
        Set<String> found = new HashSet<String>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWordSum(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Returns {@code txt} with every detected sensitive word replaced by
     * {@code "***"}, regardless of the word's length.
     *
     * <p>The replacement is done in a single left-to-right pass. Words are therefore
     * treated literally (regex metacharacters in a word are harmless) and the result
     * does not depend on the order in which words are found.
     *
     * @param txt       the text to mask; {@code null} is returned unchanged
     * @param matchType {@link #minMatchType} or {@link #maxMatchType}
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, int matchType) throws Exception{
        if (txt == null) {
            return null;
        }
        String replaceTxt = "***"; // fixed mask, independent of the word length
        StringBuilder result = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWordSum(txt, i, matchType);
            if (length > 0) {
                result.append(replaceTxt);
                i += length;
            } else {
                result.append(txt.charAt(i));
                i++;
            }
        }
        return result.toString();
    }
}
注意:minMatchType表示从某个位置开始匹配到最短的敏感词即停止,maxMatchType表示继续向后匹配、取最长的敏感词;另外,长度为1的敏感词不会被替换(checkSensitiveWordSum对长度小于2的匹配返回0)。
来源:https://blog.csdn.net/weixin_44607960/article/details/99475610