今天策划给我一个任务——实现检测昵称中是否含有敏感词的功能,然后丢给我两个压缩包,我解压一看:
有的txt文件是一行一个词:
有的txt文件是按逗号分隔开:
不管是什么格式的总之量非常多,把我这辈子脏话都囊括了
读取txt文件数据
然后我得先对这些txt文件进行处理,转换成我们能用的格式。一开始我直接用for循环逐个查找昵称中是否含有敏感词,后来查资料时看到了DFA(确定有限状态自动机)算法。
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

/// <summary>
/// Console utility that loads sensitive-word lists from txt files, prints every
/// entry as a quoted, comma-terminated literal, and runs a plain substring check
/// of a sample nickname against the line-separated list.
/// </summary>
public class program
{
    static void Main()
    {
        // Word list stored one entry per line.
        List<string> list = linefeed();
        // Word list stored as comma-separated entries.
        comma();
        string name = "假如这是敏感词";
        // Check whether the nickname contains a sensitive word.
        censortext(name, list);
        Console.Read();
    }

    /// <summary>
    /// Prints a warning if <paramref name="text"/> contains any entry of <paramref name="list"/>.
    /// </summary>
    /// <param name="text">Nickname to check.</param>
    /// <param name="list">Sensitive words to look for.</param>
    static void censortext(string text, List<string> list)
    {
        if (string.IsNullOrEmpty(text) || list == null)
        {
            return;
        }

        foreach (string word in list)
        {
            // An empty entry is a substring of every string and would flag every nickname.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            if (text.IndexOf(word, StringComparison.Ordinal) >= 0)
            {
                Console.WriteLine("昵称中存在无法使用的字符,请修改后再次确认");
                // One warning is enough; do not repeat it for every matching word.
                return;
            }
        }
    }

    /// <summary>
    /// Loads the line-separated word file, prints its entries as quoted literals,
    /// and returns them.
    /// </summary>
    /// <returns>The entries read from the file; empty if the file could not be read.</returns>
    static List<string> linefeed()
    {
        string filepath = "e:\\c#project\\pbz\\反动词库.txt"; // Replace with the path to your txt file.
        List<string> lines = readtxtfile(filepath);
        printquoted(lines);
        return lines;
    }

    /// <summary>
    /// Reads a text file and returns its non-blank lines, trimmed.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The non-blank lines; whatever was read so far if an error occurs.</returns>
    static List<string> readtxtfile(string filepath)
    {
        List<string> lines = new List<string>();
        try
        {
            using (StreamReader sr = new StreamReader(filepath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string trimmed = line.Trim();
                    // Blank lines carry no word and would later match every nickname.
                    if (trimmed.Length > 0)
                    {
                        lines.Add(trimmed);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: report the problem and return what was read.
            Console.WriteLine("读取文件时出现错误: " + e.Message);
        }
        return lines;
    }

    /// <summary>
    /// Loads the comma-separated word file and prints its entries as quoted literals.
    /// </summary>
    static void comma()
    {
        string filepath = "e:\\c#project\\pbz\\gfw补充词库.txt"; // Replace with the path to your txt file.
        List<string> elements = readtxtfile1(filepath);
        printquoted(elements);
    }

    /// <summary>
    /// Reads a comma-separated text file and returns its non-empty entries, trimmed.
    /// Every line of the file is processed, not only the first one.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The entries found; empty if the file could not be read.</returns>
    static List<string> readtxtfile1(string filepath)
    {
        List<string> elements = new List<string>();
        foreach (string line in readtxtfile(filepath))
        {
            string[] parts = line.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string part in parts)
            {
                string trimmed = part.Trim();
                if (trimmed.Length > 0)
                {
                    elements.Add(trimmed);
                }
            }
        }
        return elements;
    }

    /// <summary>
    /// Writes every word to the console as <c>"word",</c> on a single line.
    /// </summary>
    /// <param name="words">Words to print.</param>
    static void printquoted(List<string> words)
    {
        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        StringBuilder sb = new StringBuilder();
        foreach (string word in words)
        {
            sb.Append('"').Append(word).Append("\",");
        }
        Console.WriteLine(sb.ToString());
    }
}
这样处理过后的数据就是List<string>,也可以根据需要处理成数组或其他集合。
我把处理出来的数据放在hashset中
/// <summary>
/// Sensitive-word lexicon.
/// NOTE(review): the entries below look like placeholders ("..." included) —
/// replace them with the real word list before use.
/// </summary>
public static HashSet<string> maskword = new HashSet<string>
{
    "敏感词1", "敏感词2", "敏感词3", "..."
};
c#版dfa算法
然后通过C#版的DFA算法判断昵称中是否含有敏感词,返回bool值,放在工具类中使用:
/// <summary>
/// Checks whether <paramref name="text"/> contains any word from the
/// <c>maskword</c> lexicon as a contiguous substring.
/// </summary>
/// <remarks>
/// The lexicon is compiled into a character trie (state 0 is the root) and the
/// text is scanned from every start offset, so overlapping candidates such as
/// "ab" inside "aab" are found. The trie is rebuilt on every call.
/// </remarks>
/// <param name="text">Text to check; null or empty never matches.</param>
/// <returns><c>true</c> if a lexicon word occurs in the text; otherwise <c>false</c>.</returns>
public static bool checksensitivewords(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return false;
    }

    // transitions[s] maps a character to the next state; accepting[s] marks the
    // states that end a complete word. State ids are global list indices, so
    // two different states can never share an id.
    List<Dictionary<char, int>> transitions = new List<Dictionary<char, int>> { new Dictionary<char, int>() };
    List<bool> accepting = new List<bool> { false };

    foreach (string word in maskword)
    {
        // An empty word would mark the root as accepting and match everything.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        int state = 0;
        foreach (char c in word)
        {
            int next;
            if (!transitions[state].TryGetValue(c, out next))
            {
                next = transitions.Count;
                transitions.Add(new Dictionary<char, int>());
                accepting.Add(false);
                transitions[state][c] = next;
            }
            state = next;
        }

        // Only the state reached by the last character ends a word.
        accepting[state] = true;
    }

    for (int start = 0; start < text.Length; start++)
    {
        int state = 0;
        for (int i = start; i < text.Length; i++)
        {
            if (!transitions[state].TryGetValue(text[i], out state))
            {
                break; // No word continues with this character; try the next start offset.
            }

            if (accepting[state])
            {
                return true; // Sensitive word found.
            }
        }
    }

    return false; // No sensitive word found.
}
到此这篇关于基于C#检测敏感词功能的文章就介绍到这了,更多相关C#检测敏感词内容请搜索代码网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持代码网!
发表评论