func (s *State) DoThings(action rune, dfa *Lexical) { switch s.Func { case OutputIdentifier: dfa.OutputToken = dict.NewWordInfoDefault() s.getTextElse(dfa) dfa.OutputToken.WordType = dict.TEnglish case OutputSpace: dfa.OutputToken = dict.NewWordInfoDefault() s.getTextElse(dfa) dfa.OutputToken.WordType = dict.TSpace case OutputNumeric: dfa.OutputToken = dict.NewWordInfoDefault() s.getTextElse(dfa) dfa.OutputToken.WordType = dict.TNumeric case OutputChinese: dfa.OutputToken = dict.NewWordInfoDefault() s.getTextElse(dfa) dfa.OutputToken.WordType = dict.TSimplifiedChinese case Other: dfa.OutputToken = dict.NewWordInfoDefault() s.getText(dfa) dfa.OutputToken.WordType = dict.TSymbol } }
func (s *Segment) mergeEnglishSpecialWord(orginalText []rune, wordInfoList *list.List, current *list.Element) (bool, *list.Element) { cur := current cur = cur.Next() last := -1 for cur != nil { if cur.Value.(*dict.WordInfo).WordType == dict.TSymbol || cur.Value.(*dict.WordInfo).WordType == dict.TEnglish { last = cur.Value.(*dict.WordInfo).Position + utils.RuneLen(cur.Value.(*dict.WordInfo).Word) cur = cur.Next() } else { break } } if last >= 0 { first := current.Value.(*dict.WordInfo).Position newWord := orginalText[first:last] wa := s.wordDictionary.GetWordAttr(newWord) if wa == nil { return false, current } for current != cur { removeItem := current current = current.Next() wordInfoList.Remove(removeItem) } wi := dict.NewWordInfoDefault() wi.Word = string(newWord) wi.Pos = wa.Pos wi.Frequency = wa.Frequency wi.WordType = dict.TEnglish wi.Position = first wi.Rank = s.params.EnglishRank if current == nil { wordInfoList.PushBack(wi) } else { wordInfoList.InsertBefore(wi, current) } return true, current } return false, current }
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List { if m.options == nil { m.options = NewMatchOptions() } if m.params == nil { m.params = NewMatchParameter() } runes := utils.ToRunes(originalText) masks := make([]int, len(runes)) redundancy := m.params.Redundancy result := list.New() if len(posLenArr) == 0 { if m.options.UnknownWordIdentify { wi := dict.NewWordInfoDefault() wi.Word = originalText wi.Position = 0 wi.WordType = dict.TNone wi.Rank = 1 result.PushFront(wi) return result } else { position := 0 for _, r := range runes { wi := dict.NewWordInfoDefault() wi.Word = string(r) wi.Position = position wi.WordType = dict.TNone wi.Rank = 1 position++ result.PushBack(wi) } return result } } leafNodeArray := m.getLeafNodeArray(posLenArr, originalText) // 获取前TopRecord个单词序列 j := 0 for _, node := range leafNodeArray { if leafNodeArray[j] == nil { break } if j >= TopRecord || j >= len(leafNodeArray) { break } comb := make([]dict.PositionLength, node.AboveCount) i := node.AboveCount - 1 cur := node for i >= 0 { comb[i] = cur.PosLen cur = cur.Parent i-- } m.allCombinations = append(m.allCombinations, comb) j++ } // Force single word // 强制一元分词 if m.options.ForceSingleWord { comb := make([]dict.PositionLength, len(runes)) for i := 0; i < len(comb); i++ { pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0)) pl.Level = 3 comb[i] = pl } m.allCombinations = append(m.allCombinations, comb) } if len(m.allCombinations) > 0 { positionCollection := m.mergeAllCombinations(redundancy) curPc := positionCollection.Front() for curPc != nil { pl := curPc.Value.(dict.PositionLength) wi := dict.NewWordInfoDefault() wi.Word = string(runes[pl.Position:(pl.Position + pl.Length)]) wi.Pos = pl.WordAttri.Pos wi.Frequency = pl.WordAttri.Frequency wi.WordType = dict.TSimplifiedChinese wi.Position = pl.Position switch pl.Level { case 0: wi.Rank = m.params.BestRank case 1: wi.Rank = m.params.SecRank case 2: wi.Rank = m.params.ThirdRank case 3: wi.Rank = m.params.SingleRank default: wi.Rank = m.params.BestRank } result.PushBack(wi) if pl.Length > 1 { for k := pl.Position; k < pl.Position+pl.Length; k++ { masks[k] = 2 } } else { masks[pl.Position] = 1 } curPc = curPc.Next() } } // 合并未登录词 unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes) // 合并结果序列到对应位置中 if len(unknownWords) > 0 { cur := result.Front() if needRemoveSingleWord && !m.options.ForceSingleWord { // remove single word need be removed for cur != nil { if utils.RuneLen(cur.Value.(*dict.WordInfo).Word) == 1 { if masks[cur.Value.(*dict.WordInfo).Position] == 11 { removeItem := cur cur = cur.Next() result.Remove(removeItem) continue } } cur = cur.Next() } } cur = result.Front() j = 0 for cur != nil { if cur.Value.(*dict.WordInfo).Position >= unknownWords[j].Position { result.InsertBefore(unknownWords[j], cur) j++ if j >= len(unknownWords) { break } } if cur.Value.(*dict.WordInfo).Position < unknownWords[j].Position { cur = cur.Next() } } for j < len(unknownWords) { result.PushBack(unknownWords[j]) j++ } } return result }
func (m *ChsFullTextMatch) getUnknownWords(masks []int, orginalText []rune) (unknownWords []*dict.WordInfo, needRemoveSingleWord bool) { unknownWords = [](*dict.WordInfo){} // 找到所有未登录词 needRemoveSingleWord = false j := 0 begin := false beginPosition := 0 for j < len(masks) { if m.options.UnknownWordIdentify { if !begin { if m.isKnownSingleWord(masks, j, orginalText) { begin = true beginPosition = j } } else { mergeUnknownWord := true if !m.isKnownSingleWord(masks, j, orginalText) { if j-beginPosition <= 2 { for k := beginPosition; k < j; k++ { mergeUnknownWord = false if masks[k] != 1 { word := string(orginalText[k : k+1]) wi := dict.NewWordInfoDefault() wi.Word = word wi.Position = k wi.WordType = dict.TNone wi.Rank = m.params.UnknowRank unknownWords = append(unknownWords, wi) } } } else { for k := beginPosition; k < j; k++ { if masks[k] == 1 { masks[k] = 11 needRemoveSingleWord = true } } } begin = false if mergeUnknownWord { word := string(orginalText[beginPosition:j]) wi := dict.NewWordInfoDefault() wi.Word = word wi.Position = beginPosition wi.WordType = dict.TNone wi.Rank = m.params.UnknowRank unknownWords = append(unknownWords, wi) } } } } else { if m.isKnownSingleWord(masks, j, orginalText) { wi := dict.NewWordInfoDefault() wi.Word = string(orginalText[j]) wi.Position = j wi.WordType = dict.TNone wi.Rank = m.params.UnknowRank unknownWords = append(unknownWords, wi) } } j++ } if begin && m.options.UnknownWordIdentify { mergeUnknownWord := true if j-beginPosition <= 2 { for k := beginPosition; k < j; k++ { mergeUnknownWord = false if masks[k] != 1 { word := string(orginalText[k:(k + 1)]) wi := dict.NewWordInfoDefault() wi.Word = word wi.Position = k wi.WordType = dict.TNone wi.Rank = m.params.UnknowRank unknownWords = append(unknownWords, wi) } } } else { for k := beginPosition; k < j; k++ { if masks[k] == 1 { masks[k] = 11 needRemoveSingleWord = true } } } begin = false if mergeUnknownWord { word := string(orginalText[beginPosition:j]) wi := dict.NewWordInfoDefault() wi.Word = word wi.Position = beginPosition wi.WordType = dict.TNone wi.Rank = m.params.UnknowRank unknownWords = append(unknownWords, wi) } } return }