基于ac自动机的内容审核
项目链接:https://github.com/spectacleCase/badDetection
本篇文章是学习内容过滤所记录,项目亦是如此,如有什么不妥,还请指出。读者可以根据自己业务拓展开,话不多说进入正题
背景
内容审核,社区评论的业务或多或少都会涉及到,基于此背景,内容审核这个的需求就出现了。
整个内容审核大致如下
+------------------+| 用户提交文本 |+--------+---------+|v+------------------------+| 文本预处理(清洗、统一)|+------------------------+|+---------+----------+| |v v
+---------------+ +------------------+
| DFA/AC 快速匹配 | | 正则/变种规则匹配 |
+-------+-------+ +---------+--------+| || 发现敏感词 |v |+------------------+ || 命中敏感词 -> 拦截 | <---------++------------------+|v(未命中)
+---------------------------+
| NLP扩展(拼音/谐音/断字) |
+-----------+---------------+|v+------------------------------+| 风险打分引擎(规则+策略树) |+-------------+----------------+|+------v--------+| 风险等级判断 |+------+---------+|+---------+----------+-----------+| | |v v v
+-----------+ +---------------+ +---------------------+
| 正常放行 | | 人工审核队列 | | 拦截/封禁(高风险) |
+-----------+ +---------------+ +---------------------+
AC自动机
AC自动机是一种高效的多模式字符串匹配算法,结合了Trie树和KMP算法的失败指针思想,能在**单次扫描**文本的同时匹配多个模式串。其核心是通过构建Trie树并添加失败指针,使得匹配失败时能快速跳转,避免重复检查。时间复杂度为O(n+m+z)(n为文本长度,m为模式串总长度,z为匹配次数),适用于敏感词过滤、病毒检测、生物序列分析等场景,尤其适合大规模关键词快速匹配需求。
package com.choose;import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;public class SensitiveWordMatcher {private final Trie trie;private final Map<String, Sensitive> wordScores = new HashMap<>();public SensitiveWordMatcher(String filePath) throws IOException {List<String> lines = Files.readAllLines(Paths.get(filePath));//// Trie.TrieBuilder builder = Trie.builder().onlyWholeWords();Trie.TrieBuilder builder = Trie.builder();for (String line : lines) {if (line.trim().isEmpty()) {continue;}String[] parts = line.split("\\|");String word = TextPreprocessor.preprocess(parts[0].trim());builder.addKeyword(word);try {int score = parts.length > 1 ? Integer.parseInt(parts[2].trim()) : 10;wordScores.put(word, new Sensitive(parts[0].trim(), parts[1].trim(), score));} catch (Exception ignored) {}}this.trie = builder.build();}public Collection<Emit> match(String text) {return trie.parseText(text);}public int getScore(String word) {Sensitive sensitive = wordScores.get(word);if (sensitive == null) {return 0;}return sensitive.getScorer();}public String getType(String word) {Sensitive sensitive = wordScores.get(word);if (sensitive == null) {return null;}return sensitive.getType();}
}
package com.choose;/*** <p>* 敏感词* </p>** @author 桌角的眼镜* @version 1.0* @since 2025/5/18 15:54*/
public class Sensitive {/*** 词*/private String word;/*** 分数*/private Integer scorer;/*** 类型*/private String type;public Sensitive(String word, String type, Integer scorer) {this.word = word;this.scorer = scorer;this.type = type;}public String getWord() {return word;}public void setWord(String word) {this.word = word;}public Integer getScorer() {return scorer;}public void setScorer(Integer scorer) {this.scorer = scorer;}public String getType() {return type;}public void setType(String type) {this.type = type;}
}
判定打分机制
本次所使用的判断打分机制是基于权重,结合策略链进行的打分
打分接口
// 策略接口
public interface RiskStrategy {int evaluate(List<TypedEmit> matches, Map<String, RiskRule> rules);
}
累加实现
// 示例策略:简单累加
public class SumStrategy implements RiskStrategy {@Overridepublic int evaluate(List<TypedEmit> matches, Map<String, RiskRule> rules) {return matches.stream().mapToInt(m -> rules.get(m.getKeywordType()).getBaseScore()).sum();}
}
核心打分
package com.choose.risk;import com.choose.SensitiveWordMatcher;import java.util.Collection;
import java.util.List;
import java.util.Map;/*** @author lizhentao*/
public class RiskScorer {private final SensitiveWordMatcher matcher;/*** 规则配置表*/private final Map<String, RiskRule> rules;/*** 策略链*/private final List<RiskStrategy> strategies;public RiskScorer(SensitiveWordMatcher matcher, Map<String, RiskRule> rules, List<RiskStrategy> strategies) {this.matcher = matcher;this.rules = rules;this.strategies = strategies;}public RiskResult score(Collection<TypedEmit> matches) {int rawScore = 0;String riskType = "NORMAL";// 依次执行策略链for (RiskStrategy strategy : strategies) {rawScore = strategy.evaluate((List<TypedEmit>) matches, rules);// 动态决策if (rawScore > 10) {riskType = "HIGH";break;}}// 返回结构化结果return new RiskResult(rawScore, riskType, matches);}}
项目结果
详细内容可以看源代码