到第六步为止,我们只是在理论上探讨了各个优化步骤。最后,我们把所有方案放在一起做一次对比测试;敏感词数量越多,优化效果越明显:
package test;
import static util.PrintUtil.print;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Sensitive-word detection implemented six ways, from a naive scan ({@link #test1})
 * to a prefix-table walk over sorted {@code char} arrays that skips separator
 * characters inline ({@link #test6}), plus a small benchmark in {@link #main}.
 *
 * <p>All state is static. {@link #init(String[])} must be called to build the
 * lookup tables before {@code test2}..{@code test6} can find anything; until then
 * the tables are empty and those methods return {@code null}.
 *
 * <p>Every {@code testN} method returns a keyword contained in the content, or
 * {@code null} when there is none. {@code test1}/{@code test2} return the first
 * matching keyword in keyword-array order; {@code test3}..{@code test6} return the
 * shortest keyword at the left-most matching start position, so the returned word
 * may differ between variants when several keywords occur.
 *
 * <p>Matching works on UTF-16 {@code char} units. The class is not thread-safe:
 * {@code init} mutates the shared tables in place.
 */
public class Test {
    /** Length of the longest keyword indexed by the latest {@link #init(String[])} call. */
    static int key_max = 0;

    /** Keywords scanned by {@link #test1}; replaced by {@link #init(String[])}. */
    static String[] keys = {"办证", "气枪出售", "裸聊", "裸表演", "土枪卖"};

    /** Content that {@link #main} runs the benchmark against. */
    static String tContent = "再办证顶";

    /** Distinct first characters of the indexed keywords, in insertion order. */
    static ArrayList<String> first = new ArrayList<String>();

    /** {@link #first} as a sorted array, for binary search in {@link #test4}. */
    static String[] sortFirst = new String[0];

    /** {@link #first} as a sorted {@code char} array, for {@link #test5} and {@link #test6}. */
    static char[] charFirst = new char[0];

    /** Proper keyword prefix -> the characters that may follow it in some keyword. */
    static HashMap<String, ArrayList<String>> map = new HashMap<String, ArrayList<String>>();

    /** Same relation as {@link #map}, with the followers stored as a sorted array. */
    static HashMap<String, String[]> sortMap = new HashMap<String, String[]>();

    /** Same relation as {@link #map}, with the followers stored as a sorted {@code char} array. */
    static HashMap<String, char[]> charMap = new HashMap<String, char[]>();

    /**
     * The indexed keywords themselves. The prefix tables alone cannot tell "this
     * prefix is a whole keyword" from "this prefix is only the start of one", so
     * the matchers consult this set after every character they consume.
     */
    private static final Set<String> complete = new HashSet<String>();

    /** Separator characters that {@link #test6} skips while matching; kept sorted. */
    static char[] filters = ",.~!@#$%^&*(){}[];':\"".toCharArray();
    static {
        Arrays.sort(filters); // binary search requires sorted input
    }

    /**
     * Regular expression {@link #main} uses to strip separators before calling
     * {@code test1}..{@code test5}.
     * NOTE(review): this covers fewer characters than {@link #filters}, so the
     * two code paths do not clean the content identically - confirm whether
     * that is intended.
     */
    static String regexp = ",|\\.|\\(|\\)|\\*|&|\\^|%|\\$";

    /**
     * Rebuilds every lookup table from the given keywords, discarding whatever a
     * previous call indexed. {@code null} and empty entries are ignored. The
     * cleaned keyword list is also stored in {@link #keys} so that
     * {@link #test1} and {@link #test2} search the same words as the table-driven
     * variants.
     *
     * @param keys keywords to index; {@code null} is treated as an empty list
     */
    static void init(String[] keys) {
        key_max = 0;
        first.clear();
        map.clear();
        sortMap.clear();
        charMap.clear();
        complete.clear();

        ArrayList<String> usable = new ArrayList<String>();
        if (keys != null) {
            for (String word : keys) {
                if (word == null || word.length() == 0) {
                    continue; // e.g. "a@@b".split("@") yields an empty entry
                }
                usable.add(word);
                complete.add(word);

                String head = word.substring(0, 1);
                if (!first.contains(head)) {
                    first.add(head);
                }

                int length = word.length();
                if (length > key_max) {
                    key_max = length;
                }

                // Record, for every proper prefix, the character that follows it.
                for (int i = 1; i < length; i++) {
                    String prefix = word.substring(0, i);
                    String next = word.substring(i, i + 1);
                    ArrayList<String> followers = map.get(prefix);
                    if (followers == null) {
                        followers = new ArrayList<String>();
                        map.put(prefix, followers);
                    }
                    if (!followers.contains(next)) {
                        followers.add(next);
                    }
                }
            }
        }
        Test.keys = usable.toArray(new String[usable.size()]);

        sortFirst = first.toArray(new String[first.size()]);
        Arrays.sort(sortFirst);

        charFirst = new char[first.size()];
        for (int i = 0; i < charFirst.length; i++) {
            charFirst[i] = first.get(i).charAt(0);
        }
        Arrays.sort(charFirst);

        // Derive the sorted-array views of the follower lists in a single pass.
        for (Map.Entry<String, ArrayList<String>> entry : map.entrySet()) {
            ArrayList<String> followers = entry.getValue();

            String[] sorted = followers.toArray(new String[followers.size()]);
            Arrays.sort(sorted);
            sortMap.put(entry.getKey(), sorted);

            char[] chars = new char[followers.size()];
            for (int i = 0; i < chars.length; i++) {
                chars[i] = followers.get(i).charAt(0);
            }
            Arrays.sort(chars);
            charMap.put(entry.getKey(), chars);
        }
    }

    /**
     * Baseline: tries every keyword in {@link #keys} with {@code indexOf}.
     *
     * @param content text to scan; {@code null} yields {@code null}
     * @return the first keyword (in array order) contained in the content, or {@code null}
     */
    public final static String test1(String content) {
        if (content == null) {
            return null;
        }
        for (String word : keys) {
            if (content.indexOf(word) > -1) {
                return word;
            }
        }
        return null;
    }

    /**
     * Optimisation 1: only falls back to {@link #test1} when the content contains
     * at least one character that starts some keyword.
     *
     * @param content text to scan; {@code null} yields {@code null}
     * @return same result as {@link #test1}, or {@code null} when the pre-check fails
     */
    public final static String test2(String content) {
        if (content == null) {
            return null;
        }
        for (int i = 0, n = content.length(); i < n; i++) {
            if (first.contains(content.substring(i, i + 1))) {
                return test1(content);
            }
        }
        return null;
    }

    /**
     * Optimisation 2: walks the prefix table {@link #map} from every position
     * whose character starts a keyword, using list lookups.
     *
     * @param content text to scan; {@code null} yields {@code null}
     * @return the shortest keyword at the left-most matching position, or {@code null}
     */
    public final static String test3(String content) {
        if (content == null) {
            return null;
        }
        int length = content.length();
        for (int start = 0; start < length; start++) {
            if (!first.contains(content.substring(start, start + 1))) {
                continue;
            }
            for (int end = start + 1; end <= length; end++) {
                String prefix = content.substring(start, end);
                if (complete.contains(prefix)) {
                    return prefix;
                }
                if (end == length) {
                    break; // content ended in the middle of a keyword: not a hit
                }
                ArrayList<String> followers = map.get(prefix);
                if (followers == null || !followers.contains(content.substring(end, end + 1))) {
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Optimisation 3: same walk as {@link #test3}, but membership tests are binary
     * searches over the sorted arrays in {@link #sortFirst} and {@link #sortMap}.
     *
     * @param content text to scan; {@code null} yields {@code null}
     * @return the shortest keyword at the left-most matching position, or {@code null}
     */
    public final static String test4(String content) {
        if (content == null) {
            return null;
        }
        int length = content.length();
        for (int start = 0; start < length; start++) {
            if (Arrays.binarySearch(sortFirst, content.substring(start, start + 1)) < 0) {
                continue;
            }
            for (int end = start + 1; end <= length; end++) {
                String prefix = content.substring(start, end);
                if (complete.contains(prefix)) {
                    return prefix;
                }
                if (end == length) {
                    break; // content ended in the middle of a keyword: not a hit
                }
                String[] followers = sortMap.get(prefix);
                if (followers == null
                        || Arrays.binarySearch(followers, content.substring(end, end + 1)) < 0) {
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Optimisation 4: same walk as {@link #test4}, but single characters are
     * compared as {@code char} values via {@link #charFirst} and {@link #charMap}
     * instead of one-character strings.
     *
     * @param content text to scan; {@code null} yields {@code null}
     * @return the shortest keyword at the left-most matching position, or {@code null}
     */
    public final static String test5(String content) {
        if (content == null) {
            return null;
        }
        int length = content.length();
        for (int start = 0; start < length; start++) {
            if (Arrays.binarySearch(charFirst, content.charAt(start)) < 0) {
                continue;
            }
            for (int end = start + 1; end <= length; end++) {
                String prefix = content.substring(start, end);
                if (complete.contains(prefix)) {
                    return prefix;
                }
                if (end == length) {
                    break; // content ended in the middle of a keyword: not a hit
                }
                char[] followers = charMap.get(prefix);
                if (followers == null || Arrays.binarySearch(followers, content.charAt(end)) < 0) {
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Optimisation 5: like {@link #test5}, but characters listed in
     * {@link #filters} are skipped while matching, so the caller does not need to
     * strip them with a regular expression first. The returned keyword never
     * contains the skipped characters.
     *
     * @param content raw text to scan; {@code null} yields {@code null}
     * @return the shortest keyword at the left-most matching position, or {@code null}
     */
    public final static String test6(String content) {
        if (content == null) {
            return null;
        }
        // Matched characters so far; a table walk never gets longer than the
        // longest keyword, so key_max slots are enough.
        char[] buffer = new char[key_max];
        int length = content.length();
        for (int start = 0; start < length; start++) {
            char head = content.charAt(start);
            if (Arrays.binarySearch(filters, head) >= 0) {
                continue;
            }
            if (Arrays.binarySearch(charFirst, head) < 0) {
                continue;
            }
            int size = 0;
            buffer[size++] = head;
            String prefix = String.valueOf(buffer, 0, size);
            if (complete.contains(prefix)) {
                return prefix;
            }
            for (int j = start + 1; j < length; j++) {
                char next = content.charAt(j);
                if (Arrays.binarySearch(filters, next) >= 0) {
                    continue; // separator inside a keyword: ignore it
                }
                char[] followers = charMap.get(prefix);
                if (followers == null || Arrays.binarySearch(followers, next) < 0) {
                    break;
                }
                buffer[size++] = next;
                prefix = String.valueOf(buffer, 0, size);
                if (complete.contains(prefix)) {
                    return prefix;
                }
            }
        }
        return null;
    }

    /**
     * Reads a whole text file into a buffer. Line terminators are dropped, so
     * consecutive lines are joined without a separator. The file is decoded
     * by {@link FileReader}, and the reader is always closed.
     *
     * @param file path of the file to read
     * @return the file content without line terminators
     * @throws IOException if the file cannot be opened or read
     */
    public static StringBuffer read(String file) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(file));
        try {
            StringBuffer buffer = new StringBuffer();
            String line;
            while ((line = in.readLine()) != null) {
                buffer.append(line);
            }
            return buffer;
        } finally {
            in.close();
        }
    }

    /**
     * Runs one matcher {@code rounds} times against {@link #tContent} and
     * returns the elapsed wall-clock milliseconds. Variants 1-5 strip separators
     * with {@link #regexp} inside the timed loop; variant 6 filters inline.
     */
    private static long elapsed(int variant, int rounds) {
        long start = System.currentTimeMillis();
        for (int i = 0; i < rounds; i++) {
            if (variant == 6) {
                test6(tContent);
                continue;
            }
            String cleaned = tContent.replaceAll(regexp, "");
            switch (variant) {
                case 1:
                    test1(cleaned);
                    break;
                case 2:
                    test2(cleaned);
                    break;
                case 3:
                    test3(cleaned);
                    break;
                case 4:
                    test4(cleaned);
                    break;
                case 5:
                    test5(cleaned);
                    break;
                default:
                    throw new IllegalArgumentException("unknown variant: " + variant);
            }
        }
        return System.currentTimeMillis() - start;
    }

    /**
     * Benchmark entry point: loads '@'-separated keywords from
     * {@code data/keyword1} and the test content from {@code data/test1},
     * indexes the keywords, then times 1000 runs of each matcher and prints one
     * line per variant to standard output.
     *
     * @param args unused
     * @throws IOException if either data file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String[] loaded = read("data/keyword1").toString().split("@");
        tContent = read("data/test1").toString();
        init(loaded); // also publishes the loaded words to test1/test2 via the keys field

        int max = 1000;
        for (int variant = 1; variant <= 6; variant++) {
            System.out.println("test" + variant + " time:" + elapsed(variant, max));
        }
    }
}
代码有不妥之处,欢迎指出^_^。
来源:oschina
链接:https://my.oschina.net/u/113867/blog/32389