diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3698302 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 bean-du + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
// Sensitive-word detection built on a DFA (a rune trie). The bad-word list,
// the set of ignorable characters and the replacement text can all be changed
// at run time.
//
// These declarations form package dfa of module github.com/bean-du/dfa
// (go 1.16, MIT licensed) and need the standard "strings" and "sync" packages.
//
// Install:
//
//	go get github.com/bean-du/dfa
//
// Example:
//
//	sensitive := []string{"王八蛋", "王八羔子"}
//
//	fda := NewFDA()
//	fda.AddBadWords(sensitive)
//
//	str := "你个王#八……羔子, 你就是个王*八/蛋"
//	fmt.Println(fda.Check(str))
//
// Output:
//
//	[王#八……羔子 王*八/蛋] true

const (
	// defaultInvalidWorlds holds the runes that are ignored while matching, so
	// that bad words padded with punctuation are still detected. Every rune of
	// the string is added to the set, which means the ',' separators are
	// ignorable too. The horizontal ellipsis '…' is part of the set because the
	// documented example ("王#八……羔子") relies on it being skipped.
	defaultInvalidWorlds = " ,~,!,@,#,$,%,^,&,*,(,),_,-,+,=,?,<,>,.,—,,,。,/,\\,|,《,》,?,;,:,:,',‘,;,“,…,"
	// defaultReplaceStr is what CheckAndReplace writes in place of each match
	// until SetReplaceStr is called.
	defaultReplaceStr = "****"
)

// DFA detects bad words in text. All methods take the same mutex, so a *DFA
// may be shared between goroutines. Create instances with NewDFA; the zero
// value has no trie and must not be used.
type DFA struct {
	l            sync.Mutex
	trie         *Trie
	replaceStr   string
	invalidWords map[rune]struct{}
}

// runeSet returns the set of runes that occur in chars.
func runeSet(chars string) map[rune]struct{} {
	set := make(map[rune]struct{}, len(chars))
	for _, r := range chars {
		set[r] = struct{}{}
	}
	return set
}

// NewDFA returns a detector with an empty word list, the default ignorable
// characters and the default replacement string "****".
func NewDFA() *DFA {
	return &DFA{
		trie:         NewTrie(),
		replaceStr:   defaultReplaceStr,
		invalidWords: runeSet(defaultInvalidWorlds),
	}
}

// NewFDA is the original, misspelled name of NewDFA and behaves identically.
//
// Deprecated: use NewDFA instead.
func NewFDA() *DFA {
	return NewDFA()
}

// AddBadWords adds words to the list of bad words. A nil or empty slice is a
// no-op. Matching skips ignorable runes in the checked text before consulting
// the trie, so a word that itself contains an ignorable rune can never match.
func (f *DFA) AddBadWords(words []string) {
	f.l.Lock()
	defer f.l.Unlock()

	for _, w := range words {
		f.trie.Insert(w)
	}
}

// SetInvalidChar replaces the set of ignorable runes with the runes of chars.
// Passing "" disables skipping altogether.
func (f *DFA) SetInvalidChar(chars string) {
	f.l.Lock()
	defer f.l.Unlock()

	f.invalidWords = runeSet(chars)
}

// SetReplaceStr sets the text CheckAndReplace substitutes for each match.
func (f *DFA) SetReplaceStr(str string) {
	f.l.Lock()
	defer f.l.Unlock()

	f.replaceStr = str
}

// Check reports the bad words found in txt, in order of appearance, and
// whether there was at least one. Each reported match is the exact span of
// txt that was matched, including any ignorable runes inside it.
func (f *DFA) Check(txt string) ([]string, bool) {
	_, found, ok := f.check(txt, false)
	return found, ok
}

// CheckAndReplace is like Check but additionally returns txt with every match
// replaced by the replacement string. Text without matches is returned
// unchanged.
func (f *DFA) CheckAndReplace(txt string) (string, []string, bool) {
	return f.check(txt, true)
}

// check scans txt from left to right for non-overlapping bad words. At each
// position it looks for the shortest bad word starting there; on failure it
// advances by one rune, so overlapping candidates such as "aab" inside "aaab"
// are still found. When replace is false the returned string is always "".
func (f *DFA) check(txt string, replace bool) (string, []string, bool) {
	// Decode once and remember every rune's byte offset, so that matches and
	// untouched text can be sliced straight out of txt (this also keeps any
	// invalid UTF-8 bytes outside of matches intact).
	runes := make([]rune, 0, len(txt))
	offsets := make([]int, 0, len(txt)+1)
	for off, r := range txt {
		runes = append(runes, r)
		offsets = append(offsets, off)
	}
	offsets = append(offsets, len(txt))

	f.l.Lock()
	defer f.l.Unlock()

	var (
		found  []string
		out    strings.Builder
		copied int // byte offset in txt up to which out is complete
	)
	for i := 0; i < len(runes); {
		end := f.matchAt(runes, i)
		if end < 0 {
			i++
			continue
		}
		from, to := offsets[i], offsets[end]
		found = append(found, txt[from:to])
		if replace {
			out.WriteString(txt[copied:from])
			out.WriteString(f.replaceStr)
			copied = to
		}
		i = end // matches never overlap
	}

	if !replace {
		return "", found, len(found) > 0
	}
	out.WriteString(txt[copied:])
	return out.String(), found, len(found) > 0
}

// matchAt returns the exclusive end index (in runes) of the shortest bad word
// that starts exactly at runes[start], or -1 if there is none. Ignorable runes
// inside the word are skipped, but a match never starts on one. The caller
// must hold f.l.
func (f *DFA) matchAt(runes []rune, start int) int {
	if _, skip := f.invalidWords[runes[start]]; skip {
		return -1
	}
	node := f.trie.root
	for i := start; i < len(runes); i++ {
		r := runes[i]
		if _, skip := f.invalidWords[r]; skip {
			continue
		}
		next, ok := node.Child[r]
		if !ok {
			return -1
		}
		if next.IsEnd {
			return i + 1
		}
		node = next
	}
	return -1
}

// Node is a single trie node.
type Node struct {
	IsEnd bool           // true when an inserted key ends at this node
	Value string         // the inserted key; set only when IsEnd is true
	Child map[rune]*Node // outgoing edges, keyed by the next rune
}

// newNode returns an empty node with an initialised child map.
func newNode() *Node {
	return &Node{Child: make(map[rune]*Node)}
}

// Trie is a rune-keyed prefix tree. Create instances with NewTrie. It does no
// locking of its own.
type Trie struct {
	root *Node
	size int // number of distinct keys inserted
}

// Root returns the root node of the trie.
func (t *Trie) Root() *Node {
	return t.root
}

// Insert adds key to the trie. Inserting a key that is already present leaves
// the trie unchanged.
func (t *Trie) Insert(key string) {
	cur := t.root
	for _, r := range key {
		next, ok := cur.Child[r]
		if !ok {
			next = newNode()
			cur.Child[r] = next
		}
		cur = next
	}

	if !cur.IsEnd {
		t.size++
		cur.IsEnd = true
	}
	cur.Value = key
}

// PrefixMatch returns every inserted key that starts with key, in no
// particular order, or nil if no inserted key has that prefix.
func (t *Trie) PrefixMatch(key string) []string {
	node, _ := t.findNode(key)
	if node == nil {
		return nil
	}
	return t.Walk(node)
}

// Walk returns the keys stored at node and below it. The order is unspecified
// because children are visited in map iteration order. A nil node yields nil.
func (t *Trie) Walk(node *Node) (ret []string) {
	if node == nil {
		return nil
	}
	if node.IsEnd {
		ret = append(ret, node.Value)
	}
	for _, child := range node.Child {
		ret = append(ret, t.Walk(child)...)
	}
	return ret
}

// findNode follows key rune by rune from the root. It returns the node reached
// after the last rune, or nil if the path does not exist. index is the byte
// length of the longest prefix of key that ends on an IsEnd node along the
// path that was walked (0 if there is none).
func (t *Trie) findNode(key string) (node *Node, index int) {
	cur := t.root
	prevWasEnd := false
	for off, r := range key {
		// off is the byte offset of r, i.e. the byte length of the prefix
		// that ended on the previous node.
		if prevWasEnd {
			index = off
			prevWasEnd = false
		}
		next := cur.Child[r]
		if next == nil {
			return nil, index
		}
		cur = next
		prevWasEnd = cur.IsEnd
	}

	if cur.IsEnd {
		index = len(key)
	}
	return cur, index
}

// Child returns the node reached by following key from the root, or nil if
// that path does not exist.
func (t *Trie) Child(key string) *Node {
	node, _ := t.findNode(key)
	return node
}

// NewTrie returns an empty trie.
func NewTrie() *Trie {
	return &Trie{root: newNode()}
}