【概述】做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等措施之外,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则引发投诉或纠纷,重则导致产品被勒令关闭停运。
废话少说,先看下代码,可以拿过去直接使用。
1 using Microsoft.VisualBasic;
2 using System;
3 using System.Collections.Generic;
4 using System.IO;
5 using System.Linq;
6 using System.Text;
7
8 namespace OpenCore.ContentSecurity
9 {
/// <summary>
/// Detects forbidden keywords in text and masks them with a replacement character.
/// The lexicon is held in static memory and indexed by the first character of each word.
/// </summary>
/// <remarks>
/// Background reading used during development: https://blog.csdn.net/u011966339/article/details/72832197
/// Change log:
/// 2020-4-15: dictionary loading became a one-time static step instead of a per-call load;
///            multiple dictionary files are supported; algorithm details were hardened.
/// The lexicon is shared by every instance. The result list (<see cref="IllegalWords"/>) is
/// per instance, so one instance must not be used by several threads at the same time.
/// </remarks>
public class SensitiveWordFilter
{
    /// <summary>
    /// Number of buckets needed to index by any UTF-16 code unit (0..65535).
    /// One more than <c>char.MaxValue</c>, otherwise U+FFFF would index past the end.
    /// </summary>
    private const int LexiconSize = char.MaxValue + 1;

    private const char FullWidthSpace = (char)12288;
    private const char HalfWidthSpace = (char)32;
    /// <summary>Distance between a full-width form (65281-65374) and its ASCII counterpart (33-126).</summary>
    private const int FullWidthOffset = 65248;

    private static string[] dictionaryPathList = null;

    /// <summary>
    /// In-memory lexicon: bucket <c>[c]</c> holds the tails (word minus first character) of all
    /// words starting with <c>c</c>. The array is never mutated after publication; a reload
    /// swaps in a completely new array so concurrent scans keep a consistent snapshot.
    /// </summary>
    private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[LexiconSize];

    private static readonly object lockObj = new object();

    private readonly List<string> illegalWords = new List<string>();

    /// <summary>
    /// Configures the dictionary files and (re)loads the shared lexicon.
    /// </summary>
    /// <param name="sDictionaryFileName">Paths of the word-list files, one word per line.</param>
    /// <remarks>
    /// If the list is null/empty or any path does not exist, the problem is logged and the
    /// previously loaded lexicon is left untouched. Read errors from the file system propagate.
    /// </remarks>
    public static void Init(string[] sDictionaryFileName)
    {
        // Serialize reloads so two concurrent Init calls cannot interleave.
        lock (lockObj)
        {
            dictionaryPathList = sDictionaryFileName;
            LoadDictionary();
        }
    }

    public SensitiveWordFilter()
    {
    }

    /// <summary>
    /// Forbidden words found by the most recent <see cref="getDataByFilter"/> call,
    /// exactly as they appeared in the input (including any skipped separator characters).
    /// </summary>
    public List<string> IllegalWords
    {
        get { return illegalWords; }
    }

    /// <summary>
    /// Tells whether the character is a CJK unified ideograph in the range U+4E00..U+9FA5.
    /// </summary>
    private static bool isCHS(char character)
    {
        return character >= 0x4e00 && character <= 0x9fa5;
    }

    /// <summary>
    /// Tells whether the character is an ASCII digit.
    /// </summary>
    private static bool isNum(char character)
    {
        return character >= '0' && character <= '9';
    }

    /// <summary>
    /// Tells whether the character is an ASCII letter.
    /// </summary>
    private static bool isAlphabet(char character)
    {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    }

    /// <summary>
    /// Tells whether the character is treated as noise that may be inserted between the
    /// characters of a forbidden word (anything that is not an ideograph, digit or letter).
    /// </summary>
    private static bool IsSeparator(char character)
    {
        return !isCHS(character) && !isNum(character) && !isAlphabet(character);
    }

    /// <summary>
    /// Normalizes text for matching: full-width forms become half-width and letters become lower case.
    /// </summary>
    /// <param name="input">Any string; null is treated as empty.</param>
    /// <returns>The normalized string, always the same length as <paramref name="input"/>.</returns>
    /// <remarks>
    /// Full-width space is 12288, half-width space is 32. The other half-width characters
    /// (33-126) and full-width characters (65281-65374) differ by exactly 65248.
    /// </remarks>
    private static string ToDBC(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }

        char[] chars = input.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            char ch = chars[i];
            if (ch == FullWidthSpace)
            {
                ch = HalfWidthSpace;
            }
            else if (ch > 65280 && ch < 65375)
            {
                ch = (char)(ch - FullWidthOffset);
            }

            // Per-character invariant lowering: independent of the current culture and
            // guaranteed to keep indexes aligned with the original text.
            chars[i] = char.ToLowerInvariant(ch);
        }
        return new string(chars);
    }

    /// <summary>
    /// Converts traditional Chinese characters to simplified Chinese.
    /// </summary>
    /// <param name="sInput">Text to convert.</param>
    /// <returns>
    /// The converted text; an empty string for null/empty input; the unchanged input when the
    /// conversion is not available.
    /// </returns>
    private static string ToSimplifiedChiniese(string sInput)
    {
        if (string.IsNullOrEmpty(sInput))
        {
            return string.Empty;
        }
        try
        {
            return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
        }
        catch (Exception)
        {
            // Deliberate best effort: the conversion is only an extra lexicon variant.
            // NOTE(review): StrConv presumably fails on platforms/locales without the
            // Chinese mapping tables - confirm against the StrConv documentation.
            return sInput;
        }
    }

    /// <summary>
    /// Appends a message to the daily log file under "SecurityLog" in the application base
    /// directory (intended for the non-cross-AppDomain scenario).
    /// </summary>
    /// <param name="Msg">Message text.</param>
    private static void SaveLog(string Msg)
    {
        try
        {
            string directory = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(directory);
            string file = Path.Combine(directory, DateTime.Now.ToString("yyyyMMdd") + ".log");
            File.AppendAllText(file, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
        }
        catch (Exception)
        {
            // Logging is best effort; a logging failure must never break content filtering.
        }
    }

    /// <summary>
    /// Registers one normalized word in the lexicon under construction, skipping duplicates.
    /// </summary>
    private static void AddWord(WordGroup[] lexicon, HashSet<string> seen, string word)
    {
        if (string.IsNullOrEmpty(word) || !seen.Add(word))
        {
            return;
        }

        WordGroup group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }
        // Only the tail is stored; the first character is implied by the bucket index.
        group.Add(word.Substring(1));
    }

    /// <summary>
    /// Builds a new lexicon from the configured dictionary files and publishes it.
    /// </summary>
    private static void LoadDictionary()
    {
        string[] paths = dictionaryPathList;
        if (paths == null || paths.Length == 0)
        {
            SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
            return;
        }
        foreach (string sFileName in paths)
        {
            if (!File.Exists(sFileName))
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                return;
            }
        }

        // Build into a private array so readers never observe a half-loaded lexicon.
        WordGroup[] lexicon = new WordGroup[LexiconSize];
        HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
        foreach (string sDictionaryFile in paths)
        {
            // NOTE(review): Encoding.Default is platform dependent (ANSI code page on
            // .NET Framework, UTF-8 on .NET Core and later) - confirm the files' encoding.
            string[] lines = File.ReadAllLines(sDictionaryFile, Encoding.Default);
            foreach (string line in lines)
            {
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                // Trim so stray spaces or tabs around an entry do not make it unmatchable.
                string key = ToDBC(line.Trim());
                AddWord(lexicon, seen, key);

                // Also register the simplified-Chinese form of traditional-Chinese entries.
                string simplified = ToSimplifiedChiniese(key);
                if (simplified != key)
                {
                    AddWord(lexicon, seen, simplified);
                }
            }
        }

        MEMORYLEXICON = lexicon;
    }

    /// <summary>
    /// Tries to match the tail of a forbidden word against the text following <paramref name="start"/>.
    /// </summary>
    /// <param name="text">Normalized text being scanned.</param>
    /// <param name="start">Index of the character that already matched the word's first character.</param>
    /// <param name="tail">The word without its first character.</param>
    /// <returns>
    /// The number of characters consumed after <paramref name="start"/> (matched characters plus
    /// skipped separators), or -1 when the whole tail does not match.
    /// </returns>
    private static int MatchTail(string text, int start, string tail)
    {
        int pos = start + 1;
        foreach (char expected in tail)
        {
            // Skip noise characters, unless the noise character is itself what the word expects
            // (allows lexicon entries that contain punctuation or other scripts).
            while (pos < text.Length && text[pos] != expected && IsSeparator(text[pos]))
            {
                pos++;
            }
            if (pos >= text.Length || text[pos] != expected)
            {
                return -1;
            }
            pos++;
        }
        return pos - start - 1;
    }

    /// <summary>
    /// Scans the input for forbidden words, masks every character of each hit (including
    /// separators embedded in it) and records the hits in <see cref="IllegalWords"/>.
    /// </summary>
    /// <param name="sSourceInput">Text to check; null or empty is returned unchanged.</param>
    /// <param name="replaceChar">Masking character, for example '*'.</param>
    /// <returns>The input with every detected word masked.</returns>
    public string getDataByFilter(string sSourceInput, char replaceChar)
    {
        // Always reset first so an empty input never reports hits from a previous call.
        illegalWords.Clear();

        if (string.IsNullOrEmpty(sSourceInput))
        {
            return sSourceInput;
        }

        WordGroup[] lexicon = MEMORYLEXICON;
        if (lexicon == null || lexicon.Length == 0)
        {
            SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
            return sSourceInput;
        }

        // Normalize once; indexes stay aligned with the original text.
        string normalized = ToDBC(sSourceInput);
        char[] masked = sSourceInput.ToCharArray();

        int cursor = 0;
        while (cursor < normalized.Length)
        {
            WordGroup group = lexicon[normalized[cursor]];

            // Longest match among all words sharing this first character; -1 means no match.
            int longest = -1;
            if (group != null)
            {
                int candidates = group.Count();
                for (int z = 0; z < candidates; z++)
                {
                    string tail = group.GetWord(z);
                    // An empty tail is a single-character word: it matches by itself.
                    int consumed = tail.Length == 0 ? 0 : MatchTail(normalized, cursor, tail);
                    if (consumed > longest)
                    {
                        longest = consumed;
                    }
                }
            }

            if (longest < 0)
            {
                cursor++;
                continue;
            }

            int hitLength = longest + 1;
            illegalWords.Add(sSourceInput.Substring(cursor, hitLength));
            for (int pos = cursor; pos < cursor + hitLength; pos++)
            {
                masked[pos] = replaceChar;
            }
            // Resume right after the hit so masked characters are not scanned again.
            cursor += hitLength;
        }

        return new string(masked);
    }
}
/// <summary>
/// Insertion-ordered collection of distinct words that belong to one lexicon bucket
/// (words sharing the same first character).
/// </summary>
public class WordGroup
{
    /// <summary>Words in insertion order; backs the index-based access.</summary>
    private readonly List<string> groupList = new List<string>();

    /// <summary>Same words as <see cref="groupList"/>, kept for constant-time duplicate checks.</summary>
    private readonly HashSet<string> knownWords = new HashSet<string>(StringComparer.Ordinal);

    public WordGroup()
    {
    }

    /// <summary>
    /// Adds a word unless it is null or already present (ordinal comparison).
    /// </summary>
    /// <param name="word">Word to add; may be an empty string.</param>
    public void Add(string word)
    {
        if (word == null)
        {
            // A null entry could only surface later as a null from GetWord; ignore it.
            return;
        }
        if (knownWords.Add(word))
        {
            groupList.Add(word);
        }
    }

    /// <summary>
    /// Gets the number of distinct words in the group.
    /// </summary>
    /// <returns>The word count.</returns>
    public int Count()
    {
        return groupList.Count;
    }

    /// <summary>
    /// Gets the word at the given position (insertion order).
    /// </summary>
    /// <param name="index">Zero-based index, less than <see cref="Count"/>.</param>
    /// <returns>The word stored at <paramref name="index"/>.</returns>
    public string GetWord(int index)
    {
        return groupList[index];
    }
}
373 }
上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:
// Global configuration: load the lexicon once for the whole process; no further setup is needed.
string[] lexiconFiles =
{
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
};
SensitiveWordFilter.Init(lexiconFiles);

// Filter objects can be created wherever they are needed; the author intends them to run concurrently.
SensitiveWordFilter wordFilter = new SensitiveWordFilter();

// A few sample inputs to see the effect (only the keys are used).
Dictionary<string, string> dictTestData = new Dictionary<string, string>
{
    ["杀^人游戏,有人找一夜q"] = "",
    ["数学学习课堂"] = "",
    ["打击法0功有,法0功毒害大众"] = ""
};

Dictionary<string, string> dictResult = new Dictionary<string, string>();
foreach (string sKey in dictTestData.Keys)
{
    // Filter first: IllegalWords reflects the call that has just completed.
    string filtered = wordFilter.getDataByFilter(sKey, '|');
    List<string> hits = wordFilter.IllegalWords ?? new List<string>();
    dictResult[sKey] = $"替换后:{filtered}, ------------检测违禁词:{string.Join(",", hits)}";
}

string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);
最后,给一下打印的结果:
"杀^人游戏,有人找一夜q": "替换后:杀^人游戏,有人找|||, ------------检测违禁词:一夜q",
"数学学习课堂": "替换后:数学学习课堂, ------------检测违禁词:",
"打击法0功有,法0功毒害大众": "替换后:打击|||有,|||毒害大众, ------------检测违禁词:法0功,法0功"
-------------附
词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master
来源:oschina
链接:https://my.oschina.net/u/4392508/blog/3236294