基于 Java 语言开发的一个高效的敏感词过滤工具
经实测,敏感词数量为5000个,待检测文本长度为200时,此工具类可毫秒级高效检索敏感词
完整代码
package com.wgh.common.utils;
import com.google.common.collect.lists;
import java.util.*;
/**
 * Case-insensitive sensitive-word matcher built on an Aho-Corasick automaton
 * (a trie whose nodes carry failure links).
 *
 * <p>The automaton is built once by the constructor; no method of this class
 * modifies it afterwards. Matching is done on UTF-16 {@code char} units, with both the
 * dictionary and the scanned text folded through {@link Character#toLowerCase(char)}.
 * Words are reported in that lower-cased form.
 *
 * @author 王冠华
 * @date 2024-07-17
 */
public class SensitiveWordFilter {

    /** One state of the automaton. */
    private static final class TrieNode {
        /** Outgoing transitions keyed by (lower-cased) character. */
        final Map<Character, TrieNode> children = new HashMap<>();
        /** Failure link: the state for the longest proper suffix that is also a trie prefix. */
        TrieNode fail;
        /** Nearest state on the failure chain that ends a word, or {@code null} if none. */
        TrieNode dictLink;
        /** The complete (lower-cased) word ending at this state, or {@code null}. */
        String word;
    }

    private final TrieNode root = new TrieNode();

    /**
     * Builds the matcher from a dictionary of sensitive words.
     *
     * @param words the sensitive words; {@code null} and empty entries are ignored
     * @throws NullPointerException if {@code words} itself is {@code null}
     */
    public SensitiveWordFilter(Set<String> words) {
        Objects.requireNonNull(words, "words");
        for (String word : words) {
            // An empty word would mark the root as a match and fire on every character.
            if (word != null && !word.isEmpty()) {
                addWord(normalize(word));
            }
        }
        buildFailurePointers();
    }

    /**
     * Lower-cases a word char by char, i.e. exactly the way the scanned text is folded,
     * so that dictionary and text can never disagree on the folded form.
     */
    private static String normalize(String word) {
        char[] chars = word.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /** Inserts one already-normalized word into the trie. */
    private void addWord(String word) {
        TrieNode current = root;
        for (int i = 0; i < word.length(); i++) {
            current = current.children.computeIfAbsent(word.charAt(i), key -> new TrieNode());
        }
        current.word = word;
    }

    /**
     * Computes failure links and dictionary-suffix links breadth-first, so that a
     * parent's links are always final before its children are processed.
     */
    private void buildFailurePointers() {
        Deque<TrieNode> queue = new ArrayDeque<>();
        // Depth-1 states always fall back to the root.
        for (TrieNode child : root.children.values()) {
            child.fail = root;
            queue.add(child);
        }
        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();
            for (Map.Entry<Character, TrieNode> entry : current.children.entrySet()) {
                char c = entry.getKey();
                TrieNode child = entry.getValue();

                TrieNode fallback = current.fail;
                while (fallback != root && !fallback.children.containsKey(c)) {
                    fallback = fallback.fail;
                }
                TrieNode target = fallback.children.get(c);
                child.fail = (target != null) ? target : root;

                // Shortcut to the next word-ending state on the failure chain, so shorter
                // words that are suffixes of the current prefix are not missed.
                child.dictLink = (child.fail.word != null) ? child.fail : child.fail.dictLink;
                queue.add(child);
            }
        }
    }

    /** Follows failure links until {@code c} can be consumed, then consumes it. */
    private TrieNode advance(TrieNode state, char c) {
        while (state != root && !state.children.containsKey(c)) {
            state = state.fail;
        }
        TrieNode next = state.children.get(c);
        return (next != null) ? next : root;
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param text the text to scan; {@code null} is treated as containing nothing
     * @return {@code true} if any sensitive word occurs in {@code text}
     */
    public boolean containsSensitiveWord(String text) {
        if (text == null) {
            return false;
        }
        TrieNode state = root;
        for (int i = 0; i < text.length(); i++) {
            state = advance(state, Character.toLowerCase(text.charAt(i)));
            if (state.word != null || state.dictLink != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the left-most sensitive word in the text; if several words start at that
     * offset, the shortest one is returned.
     *
     * @param text the text to scan; {@code null} is treated as containing nothing
     * @return an entry of (lower-cased word, start offset in {@code text}), or
     *         {@code null} if no sensitive word occurs
     */
    public Map.Entry<String, Integer> findSensitiveWord(String text) {
        List<Map.Entry<String, Integer>> hits = findAllSensitiveWords(text);
        return hits.isEmpty() ? null : hits.get(0);
    }

    /**
     * Finds every occurrence of every sensitive word, including overlapping ones.
     *
     * @param text the text to scan; {@code null} is treated as containing nothing
     * @return entries of (lower-cased word, start offset in {@code text}), ordered by
     *         start offset and then by word length; empty if nothing matches
     */
    public List<Map.Entry<String, Integer>> findAllSensitiveWords(String text) {
        List<Map.Entry<String, Integer>> result = new ArrayList<>();
        if (text == null) {
            return result;
        }
        TrieNode state = root;
        for (int end = 0; end < text.length(); end++) {
            state = advance(state, Character.toLowerCase(text.charAt(end)));
            TrieNode hit = (state.word != null) ? state : state.dictLink;
            while (hit != null) {
                // The start offset comes from the word's own length: the automaton may have
                // taken failure jumps, so the scan position alone does not identify it.
                result.add(new AbstractMap.SimpleEntry<>(hit.word, end - hit.word.length() + 1));
                hit = hit.dictLink;
            }
        }
        // The scan discovers matches by end offset; callers get them by start offset.
        result.sort((a, b) -> {
            int byStart = Integer.compare(a.getValue(), b.getValue());
            return (byStart != 0) ? byStart : Integer.compare(a.getKey().length(), b.getKey().length());
        });
        return result;
    }

    /** Small demonstration of the three query methods. */
    public static void main(String[] args) {
        // Demo dictionary; the surrounding article reports measurements with about 5000 words.
        Set<String> set = new HashSet<>(Arrays.asList("敏感词1", "敏感词2", "敏感词3"));
        SensitiveWordFilter filter = new SensitiveWordFilter(set);

        String text1 = "位于固原市原州区开城镇的中庄水库,敏感词1是宁夏中南部城乡敏感词2饮水安全工程的主敏感词3调节水库";
        String text2 = "位于固原市原州区开城镇的中庄水库,是宁夏中南部城乡饮水安全工程的主调节水库";

        boolean containsSensitiveWord1 = filter.containsSensitiveWord(text1);
        boolean containsSensitiveWord2 = filter.containsSensitiveWord(text2);
        System.out.println("检测结果1: " + containsSensitiveWord1); // true
        System.out.println("检测结果2: " + containsSensitiveWord2); // false

        Map.Entry<String, Integer> sensitiveWord1 = filter.findSensitiveWord(text1);
        Map.Entry<String, Integer> sensitiveWord2 = filter.findSensitiveWord(text2);
        System.out.println("查找结果1: " + sensitiveWord1); // first hit, printed as word=offset
        System.out.println("查找结果2: " + sensitiveWord2); // null

        List<Map.Entry<String, Integer>> allSensitiveWords = filter.findAllSensitiveWords(text1);
        System.out.println("查找所有结果: " + allSensitiveWords);
    }
}
// Article section heading that followed this listing: "方法补充" (supplementary approaches).
除了上文的方法,小编还为大家整理了一些其他敏感词过滤方法,希望对大家有所帮助
方法一:Java 过滤器实现敏感词汇过滤
过滤器类
package web.filter;
import javax.servlet.*;
import javax.servlet.annotation.webfilter;
import java.io.bufferedreader;
import java.io.filereader;
import java.io.ioexception;
import java.lang.reflect.invocationhandler;
import java.lang.reflect.method;
import java.lang.reflect.proxy;
import java.util.arraylist;
import java.util.list;
/**
* 敏感词汇过滤器
*/
@webfilter("/*")
public class sensitivewordsfilter implements filter {
public void dofilter(servletrequest req, servletresponse resp, filterchain chain) throws servletexception, ioexception {
//1.创建代理对象,增强getparameter方法
servletrequest proxy_req = (servletrequest) proxy.newproxyinstance(req.getclass().getclassloader(), req.getclass().getinterfaces(), new invocationhandler() {
@override
public object invoke(object proxy, method method, object[] args) throws throwable {
//增强getparameter方法
//判断是否是getparameter方法
if(method.getname().equals("getparameter")){
//增强返回值
//获取返回值
string value = (string) method.invoke(req,args);
system.out.println("method:"+method.getname());
if(value != null){
for (string str : list) {
if(value.contains(str)){
value = value.replaceall(str,"***");
}
}
}
return value;
}
//判断方法名是否是 getparametermap
//判断方法名是否是 getparametervalue
return method.invoke(req,args);
}
});
//2.放行
chain.dofilter(proxy_req, resp);
}
private list<string> list = new arraylist<string>();//敏感词汇集合
public void init(filterconfig config) throws servletexception {
try{
//1.获取文件真实路径
servletcontext servletcontext = config.getservletcontext();
string realpath = servletcontext.getrealpath("/web-inf/classes/敏感词汇.txt");
//2.读取文件
bufferedreader br = new bufferedreader(new filereader(realpath));
//3.将文件的每一行数据添加到list中
string line = null;
while((line = br.readline())!=null){
list.add(line);
}
br.close();
system.out.println(list);
}catch (exception e){
e.printstacktrace();
}
}
public void destroy() {
}
}测试类
package web.servlet;
import javax.servlet.servletexception;
import javax.servlet.annotation.webservlet;
import javax.servlet.http.httpservlet;
import javax.servlet.http.httpservletrequest;
import javax.servlet.http.httpservletresponse;
import java.io.ioexception;
@webservlet("/testservlet")
public class testservlet extends httpservlet {
protected void dopost(httpservletrequest request, httpservletresponse response) throws servletexception, ioexception {
string name = request.getparameter("name");
string msg = request.getparameter("msg");
system.out.println(name+":"+msg);
}
protected void doget(httpservletrequest request, httpservletresponse response) throws servletexception, ioexception {
this.dopost(request, response);
}
}方法二:java敏感词过滤工具类
import java.util.*;
/**
 * Sensitive-word detection and masking over a character trie stored as nested maps.
 *
 * <p>Each trie node is a map from {@link Character} to the child node, plus the string
 * key {@code "isEnd"} whose value is {@code "1"} when a word ends at that node and
 * {@code "0"} otherwise. {@link #init(Set)} must be called before the query methods;
 * until then they report no matches.
 */
public class SensitiveWordUtil {

    /** Shortest-match rule: with words ["中国","中国人"], "我是中国人" matches 我是[中国]人. */
    public static final int MIN_MATCH_TYPE = 1;
    /** Longest-match rule: with words ["中国","中国人"], "我是中国人" matches 我是[中国人]. */
    public static final int MAX_MATCH_TYPE = 2;

    private static final String END_KEY = "isEnd";
    private static final String END_YES = "1";
    private static final String END_NO = "0";

    /** Root of the trie; {@code null} until {@link #init(Set)} has run. */
    public static HashMap<Object, Object> sensitiveWordMap;

    /**
     * Replaces the current dictionary with the given words.
     *
     * @param sensitiveWordSet the sensitive words; {@code null} and empty entries are ignored
     * @throws NullPointerException if {@code sensitiveWordSet} is {@code null}
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        initSensitiveWordMap(sensitiveWordSet);
    }

    /**
     * Builds the trie for the given words and then publishes it.
     *
     * @param sensitiveWordSet the sensitive words
     */
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        Objects.requireNonNull(sensitiveWordSet, "sensitiveWordSet");
        // Build into a local and assign at the end, so the public field never refers to a
        // half-populated trie.
        HashMap<Object, Object> root = new HashMap<>(sensitiveWordSet.size());
        for (String key : sensitiveWordSet) {
            if (key == null || key.isEmpty()) {
                continue;
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < key.length(); i++) {
                Character keyChar = key.charAt(i);
                Map<Object, Object> child = asNode(node.get(keyChar));
                if (child == null) {
                    child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(keyChar, child);
                }
                node = child;
            }
            node.put(END_KEY, END_YES);
        }
        sensitiveWordMap = root;
    }

    /** Views a trie value as a node; the cast is safe because only this class stores nodes. */
    @SuppressWarnings("unchecked")
    private static Map<Object, Object> asNode(Object value) {
        return (value instanceof Map) ? (Map<Object, Object>) value : null;
    }

    /**
     * Tells whether the text contains a sensitive word.
     *
     * @param txt       the text to scan; {@code null} is treated as empty
     * @param matchType {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return {@code true} if at least one sensitive word occurs
     */
    public static boolean contains(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains a sensitive word, using the longest-match rule.
     *
     * @param txt the text to scan; {@code null} is treated as empty
     * @return {@code true} if at least one sensitive word occurs
     */
    public static boolean contains(String txt) {
        return contains(txt, MAX_MATCH_TYPE);
    }

    /**
     * Collects the distinct sensitive words found by a left-to-right, non-overlapping scan.
     *
     * @param txt       the text to scan; {@code null} is treated as empty
     * @param matchType {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the matched substrings; empty if nothing matches
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length; // resume after the match
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Collects the distinct sensitive words in the text, using the longest-match rule.
     *
     * @param txt the text to scan; {@code null} is treated as empty
     * @return the matched substrings; empty if nothing matches
     */
    public static Set<String> getSensitiveWord(String txt) {
        return getSensitiveWord(txt, MAX_MATCH_TYPE);
    }

    /**
     * Masks each matched word with one {@code replaceChar} per matched character, e.g.
     * text 我爱中国人, word 中国人, char {@code *} gives 我爱***.
     *
     * <p>Matches are replaced at the positions where the scan found them, so the result
     * does not depend on the order in which words are processed.
     *
     * @param txt         the text to mask; {@code null} is returned unchanged
     * @param replaceChar the masking character
     * @param matchType   {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar, int matchType) {
        if (txt == null) {
            return null;
        }
        StringBuilder out = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                for (int k = 0; k < length; k++) {
                    out.append(replaceChar);
                }
                i += length;
            } else {
                out.append(txt.charAt(i));
                i++;
            }
        }
        return out.toString();
    }

    /**
     * Masks each matched word character by character, using the longest-match rule.
     *
     * @param txt         the text to mask; {@code null} is returned unchanged
     * @param replaceChar the masking character
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) {
        return replaceSensitiveWord(txt, replaceChar, MAX_MATCH_TYPE);
    }

    /**
     * Replaces each matched word with {@code replaceStr} taken literally, e.g. text
     * 我爱中国人, word 中国人, replacement [屏蔽] gives 我爱[屏蔽].
     *
     * @param txt        the text to mask; {@code null} is returned unchanged
     * @param replaceStr the replacement for every matched word
     * @param matchType  {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, String replaceStr, int matchType) {
        if (txt == null) {
            return null;
        }
        StringBuilder out = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                out.append(replaceStr);
                i += length;
            } else {
                out.append(txt.charAt(i));
                i++;
            }
        }
        return out.toString();
    }

    /**
     * Replaces each matched word with {@code replaceStr}, using the longest-match rule.
     *
     * @param txt        the text to mask; {@code null} is returned unchanged
     * @param replaceStr the replacement for every matched word
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) {
        return replaceSensitiveWord(txt, replaceStr, MAX_MATCH_TYPE);
    }

    /**
     * Measures the sensitive word that starts exactly at {@code beginIndex}.
     *
     * @param txt        the text being scanned
     * @param beginIndex the offset at which a word must start
     * @param matchType  {@link #MIN_MATCH_TYPE} stops at the first complete word;
     *                   any other value keeps going to find the longest one
     * @return the length of the matched word, or 0 if none starts at {@code beginIndex}
     */
    private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map<Object, Object> node = sensitiveWordMap;
        if (node == null) {
            return 0; // init(...) has not been called yet
        }
        // Length of the last complete word seen. Characters walked beyond it belong to a
        // longer word that may never complete, so they must not be counted.
        int matchedLength = 0;
        for (int i = beginIndex; i < txt.length(); i++) {
            node = asNode(node.get(txt.charAt(i)));
            if (node == null) {
                break;
            }
            if (END_YES.equals(node.get(END_KEY))) {
                matchedLength = i - beginIndex + 1;
                if (MIN_MATCH_TYPE == matchType) {
                    break;
                }
            }
        }
        return matchedLength;
    }

    /** Small demonstration of the query and masking methods. */
    public static void main(String[] args) {
        Set<String> sensitiveWordSet = new HashSet<>();
        sensitiveWordSet.add("成人电影");
        sensitiveWordSet.add("爱恋");
        sensitiveWordSet.add("静静");
        sensitiveWordSet.add("哈哈");
        sensitiveWordSet.add("啦啦");
        sensitiveWordSet.add("感动");
        sensitiveWordSet.add("发呆");
        SensitiveWordUtil.init(sensitiveWordSet);
        // The size printed here is the number of distinct first characters in the trie root.
        System.out.println("敏感词的数量:" + SensitiveWordUtil.sensitiveWordMap.size());
        String text = "成人太多的伤感情怀也许只局限于饲养基地 荧幕中的情节。"
                + "而后咱们的扮演的角色就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把本身的情感也附加于银幕情节中,而后感动就流泪,"
                + "难过就躺在某一我的的怀里尽情的阐述心扉或者手机卡复制器一个贱人一杯红酒一部电影在夜 深人静的晚上,关上电话静静的发呆着。";
        System.out.println("待检测语句字数:" + text.length());

        boolean result = SensitiveWordUtil.contains(text);
        System.out.println(result);
        result = SensitiveWordUtil.contains(text, SensitiveWordUtil.MIN_MATCH_TYPE);
        System.out.println(result);

        Set<String> set = SensitiveWordUtil.getSensitiveWord(text);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
        set = SensitiveWordUtil.getSensitiveWord(text, SensitiveWordUtil.MIN_MATCH_TYPE);
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);

        String filterStr = SensitiveWordUtil.replaceSensitiveWord(text, '*');
        System.out.println(filterStr);
        filterStr = SensitiveWordUtil.replaceSensitiveWord(text, '*', SensitiveWordUtil.MIN_MATCH_TYPE);
        System.out.println(filterStr);
        String filterStr2 = SensitiveWordUtil.replaceSensitiveWord(text, "[*敏感词*]");
        System.out.println(filterStr2);
        filterStr2 = SensitiveWordUtil.replaceSensitiveWord(text, "[*敏感词*]", SensitiveWordUtil.MIN_MATCH_TYPE);
        System.out.println(filterStr2);
    }
}
// Article footer that followed this listing (translated): this concludes the article on an
// efficient Java-based sensitive-word filtering tool; for more on sensitive-word filtering
// in Java, see the hosting site's other articles.
发表评论