241 lines
5.5 KiB
Plaintext
241 lines
5.5 KiB
Plaintext
|
|
package dfaExUtil
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
* 扩展DFA算法
|
|||
|
|
*
|
|||
|
|
* 一种二层树实现类DFA算法(DFA为多层树结构;go语言特性中的map结构过于“重量级”导致内存占用很大;此外还可能存在大量相同字符结点)
|
|||
|
|
*
|
|||
|
|
* 第一层map为所有字母/汉字作为key;value为第二层map
|
|||
|
|
* 第二层map为第一层冲突字母/汉字的自定义hash作为key;value指示是否为敏感词结束标识
|
|||
|
|
*
|
|||
|
|
* 测试结果:50万+的敏感词:
|
|||
|
|
* 构造树耗时稍优于原始DFA;
|
|||
|
|
* 内存使用为原DFA的不到1/4:原DFA占用495M内存,此算法使用111M;
|
|||
|
|
* 查询效率比原DFA低10%~20%左右;主要是多一次map查询和多一次hash计算;
|
|||
|
|
*
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
* 注意使用[]rune的问题(此问题已通过使用固定位数hash解决):
|
|||
|
|
* []rune中文使用的是unicode编码;若“中”编码为#4E2D;而#4E2D对应“N-”;
|
|||
|
|
* 即:"N-"与"中"unicode编码均为#4E2D,即会产生hash冲突
|
|||
|
|
*
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"fmt"
|
|||
|
|
"strings"
|
|||
|
|
|
|||
|
|
hash "goutil/dfaExUtil/hash64"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// hash使用的类型(uint64对应hash64函数;uint32对应hash32函数)
|
|||
|
|
type hashType = uint64
|
|||
|
|
|
|||
|
|
type DFAEx struct {
|
|||
|
|
// 忽略大小写;true-忽略大小写;false-大小写敏感
|
|||
|
|
ignoreCase bool
|
|||
|
|
// hash冲突个数
|
|||
|
|
hashCollisions int
|
|||
|
|
// 树根
|
|||
|
|
// 字符/hash/uint8(b10000000)(最高字符表示是否结束/低7位表示字符位置)
|
|||
|
|
root map[rune]map[hashType]uint8
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 新建敏感词对象
|
|||
|
|
// wordList - 敏感词列表
|
|||
|
|
// ignoreCase - [可选;默认false] 是否忽略大小写
|
|||
|
|
func NewDFAEx(wordList []string, ignoreCase ...bool) (dfaEx *DFAEx) {
|
|||
|
|
var iCase bool
|
|||
|
|
var mapSize int
|
|||
|
|
|
|||
|
|
if len(ignoreCase) > 0 {
|
|||
|
|
iCase = ignoreCase[0]
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
mapSize = len(wordList) * 10
|
|||
|
|
// 防止过小
|
|||
|
|
if mapSize < 1_000 {
|
|||
|
|
mapSize = 1_000
|
|||
|
|
}
|
|||
|
|
// 通常各语言的单rune非重复数不超过1万
|
|||
|
|
if mapSize > 10_000 {
|
|||
|
|
mapSize = 10_000
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
dfaEx = &DFAEx{
|
|||
|
|
ignoreCase: iCase,
|
|||
|
|
root: make(map[rune]map[hashType]uint8, mapSize),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for _, v := range wordList {
|
|||
|
|
word := v
|
|||
|
|
if iCase {
|
|||
|
|
// 忽略大小写;所有字母转大写
|
|||
|
|
word = strings.ToUpper(word)
|
|||
|
|
}
|
|||
|
|
wordRune := []rune(word)
|
|||
|
|
if len(wordRune) > 0 {
|
|||
|
|
dfaEx.InsertWord(wordRune)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return dfaEx
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 添加敏感词
|
|||
|
|
func (dfaEx *DFAEx) InsertWord(word []rune) {
|
|||
|
|
var hs hashType
|
|||
|
|
var lastWord rune
|
|||
|
|
var lastHash hashType
|
|||
|
|
for i, v := range word {
|
|||
|
|
lastWord = v
|
|||
|
|
lastHash = hs
|
|||
|
|
if wdInfo, ok := dfaEx.root[v]; ok {
|
|||
|
|
// "字"已存在
|
|||
|
|
if hsV, ok := wdInfo[hs]; !ok {
|
|||
|
|
// hash不存在,添加hash
|
|||
|
|
wdInfo[hs] = uint8(i & 0x7F) // 第i位
|
|||
|
|
} else {
|
|||
|
|
// hash已存在,检测是否冲突
|
|||
|
|
if (hsV & 0x7F) != uint8(i) {
|
|||
|
|
// hash冲突
|
|||
|
|
dfaEx.hashCollisions++
|
|||
|
|
// fmt.Printf("hash冲突 %s %016X %d %d\n", string(v), hs, i+1, hsV&0x7F+1)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
// "字"不存在,添加"字"和hash
|
|||
|
|
wdInfo = make(map[hashType]uint8)
|
|||
|
|
wdInfo[hs] = uint8(i & 0x7F) // 第i位
|
|||
|
|
dfaEx.root[v] = wdInfo
|
|||
|
|
}
|
|||
|
|
hs = hash.FastSumByRune2(v, hs) // hash更新
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 敏感词结束标志(uint8最高位置1)
|
|||
|
|
dfaEx.root[lastWord][lastHash] |= 0x80
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 字符串查找敏感词
|
|||
|
|
func (dfaEx *DFAEx) IsMatch(str string) bool {
|
|||
|
|
starts, _ := dfaEx.SearchSentence(str, true)
|
|||
|
|
return len(starts) > 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 指定字符替换敏感词
|
|||
|
|
func (dfaEx *DFAEx) HandleWord(str string, replace rune) string {
|
|||
|
|
starts, ends := dfaEx.SearchSentence(str)
|
|||
|
|
if len(starts) == 0 {
|
|||
|
|
return str
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
strRune := []rune(str)
|
|||
|
|
for i := 0; i < len(starts); i++ {
|
|||
|
|
for idx := starts[i]; idx <= ends[i]; idx++ {
|
|||
|
|
strRune[idx] = replace
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return string(strRune)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 字符串查找敏感词
|
|||
|
|
func (dfaEx *DFAEx) SearchSentence(str string, firstOpt ...bool) (starts, ends []int) {
|
|||
|
|
var first bool // 是否首次匹配就返回
|
|||
|
|
if len(firstOpt) > 0 {
|
|||
|
|
first = firstOpt[0]
|
|||
|
|
}
|
|||
|
|
strBak := str
|
|||
|
|
if dfaEx.ignoreCase {
|
|||
|
|
// 忽略大小写;所有字母转大写
|
|||
|
|
strBak = strings.ToUpper(str)
|
|||
|
|
}
|
|||
|
|
runeStr := []rune(strBak)
|
|||
|
|
for i := 0; i < len(runeStr); {
|
|||
|
|
end := dfaEx.searchByStart(i, runeStr)
|
|||
|
|
if end < 0 {
|
|||
|
|
// 继续下一个进行匹配
|
|||
|
|
i++
|
|||
|
|
} else {
|
|||
|
|
// 记录匹配位置;从匹配到的下一个位置继续
|
|||
|
|
starts = append(starts, i)
|
|||
|
|
ends = append(ends, end)
|
|||
|
|
if first {
|
|||
|
|
// 首次匹配就返回
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
i = end + 1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 从指定的开始位置搜索语句
|
|||
|
|
// start - 开始匹配的位置
|
|||
|
|
// str - 待检测字符串
|
|||
|
|
// 返回:匹配到的结束位置,未匹配到返回-1
|
|||
|
|
func (dfaEx *DFAEx) searchByStart(start int, runeStr []rune) (end int) {
|
|||
|
|
var hs hashType
|
|||
|
|
end = -1 // 未匹配到返回值
|
|||
|
|
|
|||
|
|
for i := start; i < len(runeStr); i++ {
|
|||
|
|
wd := runeStr[i]
|
|||
|
|
wdInfo, ok := dfaEx.root[wd]
|
|||
|
|
if !ok {
|
|||
|
|
// "字"不存在
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
hsV, ok := wdInfo[hs]
|
|||
|
|
if !ok {
|
|||
|
|
// hash不存在
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 检测是否句尾
|
|||
|
|
if (hsV & 0x80) != 0 {
|
|||
|
|
// 找到句尾,继续匹配,直到匹配到最长敏感词为止
|
|||
|
|
end = i
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
hs = hash.FastSumByRune2(wd, hs) // hash更新
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 获取hash冲突数
|
|||
|
|
func (dfaEx *DFAEx) GetHashCollisions() int {
|
|||
|
|
return dfaEx.hashCollisions
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 调试接口
|
|||
|
|
func (dfaEx *DFAEx) Print() {
|
|||
|
|
fmt.Println(dfaEx)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 调试接口
|
|||
|
|
func (dfaEx *DFAEx) PrintFmt(verbose bool) {
|
|||
|
|
var keys int
|
|||
|
|
var hashs int
|
|||
|
|
for k, v := range dfaEx.root {
|
|||
|
|
keys++
|
|||
|
|
if verbose {
|
|||
|
|
fmt.Println("---------------------------")
|
|||
|
|
fmt.Println(string(k))
|
|||
|
|
}
|
|||
|
|
for kk, vv := range v {
|
|||
|
|
hashs++
|
|||
|
|
if verbose {
|
|||
|
|
fmt.Printf("%016X %02X\n", kk, vv)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
fmt.Println("================================")
|
|||
|
|
fmt.Println("keys:", keys, "hashs", hashs, "map count", keys+1, "hashCollisions Count", dfaEx.hashCollisions)
|
|||
|
|
}
|