220 lines
5.2 KiB
Plaintext
220 lines
5.2 KiB
Plaintext
|
|
package dfaUtil
|
|||
|
|
|
|||
|
|
import "strings"
|
|||
|
|
|
|||
|
|
/*
DFA util is used to verify whether a sentence contains invalid words.
The underlying data structure is a trie:
https://en.wikipedia.org/wiki/Trie
*/
|
|||
|
|
|
|||
|
|
// dfa util
|
|||
|
|
type DFAUtil struct {
|
|||
|
|
// The root node
|
|||
|
|
root *trieNode
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 搜索语句
|
|||
|
|
// 由于go不支持tuple,所以为了避免定义多余的struct,特别使用两个list来分别返回匹配的索引的上界和下界
|
|||
|
|
// 在处理此方法的返回值时,需要两者配合使用
|
|||
|
|
// 参数:
|
|||
|
|
//
|
|||
|
|
// sentence:语句字符串
|
|||
|
|
//
|
|||
|
|
// 返回:
|
|||
|
|
//
|
|||
|
|
// 搜索到的开始位置列表
|
|||
|
|
// 搜索到的结束位置列表
|
|||
|
|
func (this *DFAUtil) SearchSentence(sentence string) (startIndexList, endIndexList []int) {
|
|||
|
|
sentenceRuneList := []rune(sentence)
|
|||
|
|
for i := 0; i < len(sentenceRuneList); {
|
|||
|
|
//按序匹配每个字
|
|||
|
|
end := this.searchSentenceByStart(i, sentenceRuneList)
|
|||
|
|
if end < 0 {
|
|||
|
|
//匹配失败,继续匹配下一个字
|
|||
|
|
i++
|
|||
|
|
} else {
|
|||
|
|
//匹配成功,记录索引位置
|
|||
|
|
startIndexList = append(startIndexList, i)
|
|||
|
|
endIndexList = append(endIndexList, end)
|
|||
|
|
|
|||
|
|
//从匹配到的字后面开始找
|
|||
|
|
i = end + 1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 从指定的开始位置搜索语句
|
|||
|
|
// 参数:
|
|||
|
|
//
|
|||
|
|
// start:开始匹配的位置
|
|||
|
|
// sentenceRuneList:语句字列表
|
|||
|
|
//
|
|||
|
|
// 返回:
|
|||
|
|
//
|
|||
|
|
// 匹配到的结束位置,未匹配到返回-1
|
|||
|
|
func (this *DFAUtil) searchSentenceByStart(start int, sentenceRuneList []rune) (endIndex int) {
|
|||
|
|
//当前节点,从根节点开始找
|
|||
|
|
currNode := this.root
|
|||
|
|
//是否匹配到
|
|||
|
|
var isMatched bool
|
|||
|
|
|
|||
|
|
//按顺序匹配字
|
|||
|
|
for i := start; i < len(sentenceRuneList); {
|
|||
|
|
child, exists := currNode.children[sentenceRuneList[i]]
|
|||
|
|
|
|||
|
|
//未匹配到则结束,跳出循环(可能匹配到过词结尾)
|
|||
|
|
if !exists {
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
//是否是词末尾,如果是则先记录下来,因为还可能匹配到更长的词
|
|||
|
|
//比如["金鳞"、"金鳞岂是池中物"] => 匹配"金鳞岂是池中物",匹配到"金鳞"不应该停下来,应继续匹配更长的词
|
|||
|
|
if child.isEndOfWord {
|
|||
|
|
endIndex = i
|
|||
|
|
isMatched = true
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
//是否已经到词末尾
|
|||
|
|
if len(child.children) == 0 {
|
|||
|
|
return endIndex
|
|||
|
|
} else {
|
|||
|
|
//继续与后面的字匹配
|
|||
|
|
currNode = child
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
//增加索引匹配下一个位置
|
|||
|
|
i++
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
//匹配结束,若曾经匹配到词末尾,则直接返回匹配到的位置
|
|||
|
|
if isMatched {
|
|||
|
|
return endIndex
|
|||
|
|
} else {
|
|||
|
|
//没有匹配到词末尾,则返回匹配失败
|
|||
|
|
return -1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Insert new word into object
|
|||
|
|
func (this *DFAUtil) InsertWord(word []rune) {
|
|||
|
|
currNode := this.root
|
|||
|
|
for _, c := range word {
|
|||
|
|
if cildNode, exist := currNode.children[c]; !exist {
|
|||
|
|
cildNode = newtrieNode()
|
|||
|
|
currNode.children[c] = cildNode
|
|||
|
|
currNode = cildNode
|
|||
|
|
} else {
|
|||
|
|
currNode = cildNode
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
currNode.isEndOfWord = true
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check if there is any word in the trie that starts with the given prefix.
|
|||
|
|
func (this *DFAUtil) StartsWith(prefix []rune) bool {
|
|||
|
|
currNode := this.root
|
|||
|
|
for _, c := range prefix {
|
|||
|
|
if childNode, exist := currNode.children[c]; !exist {
|
|||
|
|
return false
|
|||
|
|
} else {
|
|||
|
|
currNode = childNode
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return true
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Judge if input sentence contains some special caracter
|
|||
|
|
// Return:
|
|||
|
|
// Matc or not
|
|||
|
|
func (this *DFAUtil) IsMatch(sentence string) bool {
|
|||
|
|
startIndexList, _ := this.SearchSentence(sentence)
|
|||
|
|
return len(startIndexList) > 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Handle sentence. Use specified caracter to replace those sensitive caracters.
|
|||
|
|
// input: Input sentence
|
|||
|
|
// replaceCh: candidate
|
|||
|
|
// Return:
|
|||
|
|
// Sentence after manipulation
|
|||
|
|
func (this *DFAUtil) HandleWord(sentence string, replaceCh rune) string {
|
|||
|
|
startIndexList, endIndexList := this.SearchSentence(sentence)
|
|||
|
|
if len(startIndexList) == 0 {
|
|||
|
|
return sentence
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Manipulate
|
|||
|
|
sentenceList := []rune(sentence)
|
|||
|
|
for i := 0; i < len(startIndexList); i++ {
|
|||
|
|
for index := startIndexList[i]; index <= endIndexList[i]; index++ {
|
|||
|
|
sentenceList[index] = replaceCh
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return string(sentenceList)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Handle sentence. Use specified caracter to replace those sensitive caracters.
|
|||
|
|
// input: Input sentence
|
|||
|
|
// replaceCh: candidate
|
|||
|
|
// Return:
|
|||
|
|
// Sentence after manipulation
|
|||
|
|
func (this *DFAUtil) HandleWordUseStr(input string, replaceCh string) string {
|
|||
|
|
input2 := strings.ToUpper(input)
|
|||
|
|
|
|||
|
|
startIndexList, endIndexList := this.SearchSentence(input2)
|
|||
|
|
if len(startIndexList) == 0 {
|
|||
|
|
return input
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Manipulate
|
|||
|
|
inputRune := []rune(input)
|
|||
|
|
replaceChList := []rune(replaceCh)
|
|||
|
|
|
|||
|
|
//上一次替换掉的数量
|
|||
|
|
lastReplaceCount := 0
|
|||
|
|
|
|||
|
|
for i := 0; i < len(startIndexList); i++ {
|
|||
|
|
|
|||
|
|
//替换字的索引
|
|||
|
|
index := len(replaceChList)
|
|||
|
|
|
|||
|
|
//开始位置--加上替换的词的索引
|
|||
|
|
starIndex := startIndexList[i] + (i * index) - lastReplaceCount
|
|||
|
|
|
|||
|
|
//结束位置
|
|||
|
|
endIndex := endIndexList[i] + (i * index) - lastReplaceCount
|
|||
|
|
|
|||
|
|
//结束字符串
|
|||
|
|
sentenceAttr := string(inputRune[endIndex+1:])
|
|||
|
|
|
|||
|
|
//替换范围字符串
|
|||
|
|
inputRune = append(inputRune[:starIndex], replaceChList...)
|
|||
|
|
inputRune = append(inputRune, []rune(sentenceAttr)...)
|
|||
|
|
lastReplaceCount = endIndex + 1 - starIndex
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return string(inputRune)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Create new DfaUtil object
|
|||
|
|
// wordList:word list
|
|||
|
|
func NewDFAUtil(wordList []string) *DFAUtil {
|
|||
|
|
this := &DFAUtil{
|
|||
|
|
root: newtrieNode(),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for _, word := range wordList {
|
|||
|
|
wordRuneList := []rune(word)
|
|||
|
|
if len(wordRuneList) > 0 {
|
|||
|
|
this.InsertWord(wordRuneList)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return this
|
|||
|
|
}
|