trie树又叫字典树,前缀树,经常用来实现字符串检索、词频统计等功能,本文利用trie树实现敏感词屏蔽。
本文为敏感词屏蔽功能的粗略版,很多方面还未顾及,比如大小写、繁体简体等,当然空间与性能也非最优,仅当抛砖引玉。
trie树的优化版如Double-Array Trie是值得深入了解的。
(1)首先实现trie树节点
public class TrieNode { /// <summary> /// 是否为关键词的结尾 /// </summary> protected bool _isEnd; /// <summary> /// 子节点,key 为关键单字 /// </summary> protected Dictionary<char, TrieNode> _childTrieNodes; /// <summary> /// 关键词 /// </summary> protected string _keyWords; /// <summary> /// 构造函数 /// </summary> public TrieNode() { _childTrieNodes = new Dictionary<char, TrieNode>(); _keyWords = string.Empty; } /// <summary> /// 获取为C的子节点TrieNode /// </summary> /// <param name="c"></param> /// <param name="node"></param> /// <returns>找到的子节点</returns> public bool TryGetChildTrieNode(char c, out TrieNode node) { try { return _childTrieNodes.TryGetValue(c, out node); } catch { node = null; return false; } } /// <summary> /// 添加子节点 /// </summary> /// <param name="c"></param> /// <returns>新添加的子节点</returns> public TrieNode AddChildTrieNode(char c) { TrieNode node; if (TryGetChildTrieNode(c, out node)) { return node; } node = new TrieNode(); _childTrieNodes[c] = node; return node; } /// <summary> /// 如果是关键词最后一个字了,需要记录整个关键词 /// </summary> /// <param name="keywords"></param> public void SetKeywords(string keywords) { _isEnd = true; _keyWords = keywords; } /// <summary> /// 是否为尾节点 /// </summary> public bool IsEnd { get { return _isEnd; } } }
(2)然后实现trie树
public class TrieTree { /// <summary> /// 根节点 /// </summary> protected TrieNode _root = new TrieNode(); /// <summary> /// 所有关键词的首字集合 /// </summary> //protected TrieNode[] _childTrieNodes = new TrieNode[char.MaxValue + 1]; protected Dictionary<char, TrieNode> _childTrieNodes = new Dictionary<char, TrieNode>(); /// <summary> /// 根据关键词集合构造TrieTree /// </summary> /// <param name="keywords"></param> public TrieTree(List<string> keywords) { foreach (string keyword in keywords) { if (string.IsNullOrWhiteSpace(keyword)) { continue; } // 得到首字 char firstWord = keyword[0]; TrieNode node = null; try { node = _childTrieNodes[firstWord]; } catch{ } if (node == null) { node = _root.AddChildTrieNode(firstWord); _childTrieNodes[firstWord] = node; } for (int i = 1; i < keyword.Length; i++) { node = node.AddChildTrieNode(keyword[i]); } // 将关键词放到末尾结点里 node.SetKeywords(keyword); } } /// <summary> /// 获取根节点 /// </summary> /// <returns></returns> public TrieNode getRoot() { return _root; } /// <summary> /// TrieTree关键词匹配 /// </summary> /// <param name="keyword">关键词</param> /// <param name="lastMatchLocation">最后配置位置后一位,也即匹配长度</param> /// <param name="isMaxLongMatch">是否为最长关键词匹配</param> /// <returns>是否匹配成功</returns> public bool Match(string keyword, out int lastMatchLocation, bool isMaxLongMatch = false) { char firstWord = keyword[0]; TrieNode node = null; try { node = _childTrieNodes[firstWord]; } catch { } lastMatchLocation = -1; if (node == null) // 如果首字都没有匹配上,无需往下匹配 { return false; } bool isMatch = false; // 是否匹配上了 for (int i = 1; i < keyword.Length; i++) { if (!node.IsEnd) { if (node.TryGetChildTrieNode(keyword[i], out node)) { continue; } return isMatch; } // 程序运行到此处肯定已经有匹配上的了 lastMatchLocation = i; isMatch = true; if (isMaxLongMatch) // 如果是最长关键词匹配 { if (!node.TryGetChildTrieNode(keyword[i], out node)) { return isMatch; } continue; } return isMatch; } return false; } }
(3)敏感词搜索与替换
public class SensitiveWord { /// <summary> /// 敏感词TrieTree /// </summary> private TrieTree _trieTree; /// <summary> /// 构造敏感词TrieTree /// </summary> /// <param name="sensitiveWords"></param> public SensitiveWord(List<string> sensitiveWords) { _trieTree = new TrieTree(sensitiveWords); } /// <summary> /// 判断是否含有敏感词 /// </summary> /// <param name="text">检查的字符串</param> /// <returns>返回true代表含有敏感词</returns> public bool ContainsAny(string text) { int lastMatchLocation = -1; int i = 0; do { if (_trieTree.Match(text,out lastMatchLocation)) { return true; } i++; text = text.Substring(1, text.Length - 1); } while (text.Length > 1); return false; } /// <summary> /// 敏感词替换 /// </summary> /// <param name="text">原字符串</param> /// <param name="replaceChar">替换字符</param> /// <returns>替换后的字符串</returns> public string Replace(string text, char replaceChar = '*', bool isMaxLongMatch = false) { int lastMatchLocation = -1; StringBuilder sb = new StringBuilder(); do { if (_trieTree.Match(text, out lastMatchLocation, isMaxLongMatch)) { for (int c = 0; c < lastMatchLocation; c++) { sb.Append(replaceChar); } int len = text.Length - lastMatchLocation; if (len <= 0) { return sb.ToString(); } text = text.Substring(lastMatchLocation, len); } else { sb.Append(text[0]); text = text.Substring(1, text.Length - 1); } } while (text.Length > 0); return sb.ToString(); } }
(4)测试代码
class Program { static void Main(string[] args) { string path = "Files/SensitiveWords.txt"; string[] lines = File.ReadAllLines(path); string test = "我是好人我只是我想睡你二姨女儿还有我想睡你三姨罢了"; // 建立敏感词TrieTree SensitiveWord swords = new SensitiveWord(lines.ToList()); // 测试是否包含敏感词 bool b = swords.ContainsAny(test); Console.WriteLine(b); // 测试短敏感词匹配 string str = swords.Replace(test,'*',false); Console.WriteLine(str); // 测试最长敏感词匹配 string str2 = swords.Replace(test, '*', true); Console.WriteLine(str2); } }
(5)测试结果
(6)源码下载
转载请注明:二十画生 » 利用trie树实现敏感词屏蔽功能