你带酒来,我有故事

利用trie树实现敏感词屏蔽功能

:: 代码生涯 二十画生 1844℃ 0评论

trie树又叫字典树,前缀树,经常用来实现字符串检索、词频统计等功能,本文利用trie树实现敏感词屏蔽。

本文为敏感词屏蔽功能的粗略版,很多方面还未顾及,比如大小写、繁体简体等,当然空间与性能也非最优,仅当抛砖引玉。

trie树的优化版如Double-Array Trie是值得深入了解的。

(1)首先实现trie树节点

public class TrieNode
    {
        /// <summary>
        /// 是否为关键词的结尾
        /// </summary>
        protected bool _isEnd;

        /// <summary>
        /// 子节点,key 为关键单字
        /// </summary>
        protected Dictionary<char, TrieNode> _childTrieNodes;

        /// <summary>
        /// 关键词
        /// </summary>
        protected string _keyWords;


        /// <summary>
        /// 构造函数
        /// </summary>
        public TrieNode()
        {
            _childTrieNodes = new Dictionary<char, TrieNode>();
            _keyWords = string.Empty;
        }

        /// <summary>
        /// 获取为C的子节点TrieNode
        /// </summary>
        /// <param name="c"></param>
        /// <param name="node"></param>
        /// <returns>找到的子节点</returns>
        public bool TryGetChildTrieNode(char c, out TrieNode node)
        {
            try
            {
                return _childTrieNodes.TryGetValue(c, out node);
            }
            catch
            {
                node = null;
                return false;
            }
        }

        /// <summary>
        /// 添加子节点
        /// </summary>
        /// <param name="c"></param>
        /// <returns>新添加的子节点</returns>
        public TrieNode AddChildTrieNode(char c)
        {
            TrieNode node;
            if (TryGetChildTrieNode(c, out node))
            {
                return node;
            }

            node = new TrieNode();
            _childTrieNodes[c] = node;

            return node;
        }

        /// <summary>
        /// 如果是关键词最后一个字了,需要记录整个关键词
        /// </summary>
        /// <param name="keywords"></param>
        public void SetKeywords(string keywords)
        {
            _isEnd = true;
            _keyWords = keywords;
        }

        /// <summary>
        /// 是否为尾节点
        /// </summary>
        public bool IsEnd
        {
            get { return _isEnd; }
        }

    }

(2)然后实现trie树

 public class TrieTree
    {
        /// <summary>
        /// 根节点
        /// </summary>
        protected TrieNode _root = new TrieNode();

        /// <summary>
        /// 所有关键词的首字集合
        /// </summary>
        //protected TrieNode[] _childTrieNodes = new TrieNode[char.MaxValue + 1];
        protected Dictionary<char, TrieNode> _childTrieNodes = new Dictionary<char, TrieNode>();

        /// <summary>
        /// 根据关键词集合构造TrieTree
        /// </summary>
        /// <param name="keywords"></param>
        public TrieTree(List<string> keywords)
        {
            foreach (string keyword in keywords)
            {
                if (string.IsNullOrWhiteSpace(keyword))
                {
                    continue;
                }

                // 得到首字
                char firstWord = keyword[0];
                TrieNode node = null;
                try
                {
                    node = _childTrieNodes[firstWord];
                }
                catch{ }

                if (node == null)
                {
                    node = _root.AddChildTrieNode(firstWord);
                     _childTrieNodes[firstWord] = node;
                }

                for (int i = 1; i < keyword.Length; i++)
                {
                    node = node.AddChildTrieNode(keyword[i]);
                }

                // 将关键词放到末尾结点里
                node.SetKeywords(keyword);

            }
        }

        /// <summary>
        /// 获取根节点
        /// </summary>
        /// <returns></returns>
        public TrieNode getRoot()
        {
            return _root;
        }


        /// <summary>
        /// TrieTree关键词匹配
        /// </summary>
        /// <param name="keyword">关键词</param>
        /// <param name="lastMatchLocation">最后配置位置后一位,也即匹配长度</param>
        /// <param name="isMaxLongMatch">是否为最长关键词匹配</param>
        /// <returns>是否匹配成功</returns>
        public bool Match(string keyword, out int lastMatchLocation, bool isMaxLongMatch = false)
        {
            char firstWord = keyword[0];
            TrieNode node = null;
            try
            {
                node = _childTrieNodes[firstWord];
            }
            catch { }

            lastMatchLocation = -1;
            if (node == null) // 如果首字都没有匹配上,无需往下匹配
            {
                return false;
            }

            bool isMatch = false; // 是否匹配上了

            for (int i = 1; i < keyword.Length; i++)
            {
                if (!node.IsEnd)
                {
                    if (node.TryGetChildTrieNode(keyword[i], out node))
                    {
                        continue;
                    }

                    return isMatch;
                }

                // 程序运行到此处肯定已经有匹配上的了
                lastMatchLocation = i;
                isMatch = true;

                if (isMaxLongMatch) // 如果是最长关键词匹配
                {
                    if (!node.TryGetChildTrieNode(keyword[i], out node))
                    {
                        return isMatch;
                    }
                    
                   continue;
                }

                return isMatch;               
            }

            return false;
        }
        


    }

(3)敏感词搜索与替换

 public class SensitiveWord
    {
        /// <summary>
        /// 敏感词TrieTree
        /// </summary>
        private TrieTree _trieTree;

        /// <summary>
        /// 构造敏感词TrieTree
        /// </summary>
        /// <param name="sensitiveWords"></param>
        public SensitiveWord(List<string> sensitiveWords)
        {
            _trieTree = new TrieTree(sensitiveWords);
        }

        /// <summary>
        /// 判断是否含有敏感词
        /// </summary>
        /// <param name="text">检查的字符串</param>
        /// <returns>返回true代表含有敏感词</returns> 
        public bool ContainsAny(string text)
        {
            int lastMatchLocation = -1;

            int i = 0;
            do
            {
                if (_trieTree.Match(text,out lastMatchLocation))
                {
                    return true;
                }

                i++;
                text = text.Substring(1, text.Length - 1);
            }
            while (text.Length > 1);

            return false;
        }

        /// <summary>
        /// 敏感词替换
        /// </summary>
        /// <param name="text">原字符串</param>
        /// <param name="replaceChar">替换字符</param>
        /// <returns>替换后的字符串</returns>
        public string Replace(string text, char replaceChar = '*', bool isMaxLongMatch = false)
        {
            int lastMatchLocation = -1;
            StringBuilder sb = new StringBuilder();
            do
            {
                if (_trieTree.Match(text, out lastMatchLocation, isMaxLongMatch))
                {
                    for (int c = 0; c < lastMatchLocation; c++)
                    {
                        sb.Append(replaceChar);
                    }
                    int len = text.Length - lastMatchLocation;
                    if (len <= 0)
                    {
                        return sb.ToString();
                    }
                    text = text.Substring(lastMatchLocation, len);
                }
                else
                {
                    sb.Append(text[0]);
                    text = text.Substring(1, text.Length - 1);
                }
            }
            while (text.Length > 0);

            return sb.ToString();
        }




    }

(4)测试代码

 class Program
    {

        static void Main(string[] args)
        {
            string path = "Files/SensitiveWords.txt";
            string[] lines = File.ReadAllLines(path);
            string test = "我是好人我只是我想睡你二姨女儿还有我想睡你三姨罢了";


            // 建立敏感词TrieTree
            SensitiveWord swords = new SensitiveWord(lines.ToList());

            // 测试是否包含敏感词
            bool b = swords.ContainsAny(test);
            Console.WriteLine(b);

            // 测试短敏感词匹配
            string str = swords.Replace(test,'*',false);
            Console.WriteLine(str);

            // 测试最长敏感词匹配
            string str2 = swords.Replace(test, '*', true);
            Console.WriteLine(str2);

        }
    }

(5)测试结果

利用trie树实现敏感词屏蔽功能

(6)源码下载

源码下载

 

 

 

 

 

转载请注明:二十画生 » 利用trie树实现敏感词屏蔽功能

喜欢 (4)
发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
粤ICP备17031696号-1