func (s *Stemmer) setto(str string) { l := utils.RuneLen(str) o := s.j + 1 sc := utils.ToRunes(str) for i := 0; i < l; i++ { s.b[o+i] = sc[i] } s.k = s.j + l }
func (s *Stemmer) ends(str string) bool { l := utils.RuneLen(str) o := s.k - l + 1 if o < 0 { return false } sc := utils.ToRunes(str) for i := 0; i < l; i++ { if s.b[o+i] != sc[i] { return false } } s.j = s.k - l return true }
func (s *Segment) mergeEnglishSpecialWord(orginalText []rune, wordInfoList *list.List, current *list.Element) (bool, *list.Element) { cur := current cur = cur.Next() last := -1 for cur != nil { if cur.Value.(*dict.WordInfo).WordType == dict.TSymbol || cur.Value.(*dict.WordInfo).WordType == dict.TEnglish { last = cur.Value.(*dict.WordInfo).Position + utils.RuneLen(cur.Value.(*dict.WordInfo).Word) cur = cur.Next() } else { break } } if last >= 0 { first := current.Value.(*dict.WordInfo).Position newWord := orginalText[first:last] wa := s.wordDictionary.GetWordAttr(newWord) if wa == nil { return false, current } for current != cur { removeItem := current current = current.Next() wordInfoList.Remove(removeItem) } wi := dict.NewWordInfoDefault() wi.Word = string(newWord) wi.Pos = wa.Pos wi.Frequency = wa.Frequency wi.WordType = dict.TEnglish wi.Position = first wi.Rank = s.params.EnglishRank if current == nil { wordInfoList.PushBack(wi) } else { wordInfoList.InsertBefore(wi, current) } return true, current } return false, current }
func (s *StopWord) IsStopWord(word string, filterEnglish bool, filterEnglishLength int, filterNumeric bool, filterNumbericLength int) bool { if len(word) == 0 { return false } r := utils.FirstRune(word) if r < 128 { slen := utils.RuneLen(word) if filterEnglish { if slen > filterEnglishLength && (r < '0' || r > '9') { return true } } if filterNumeric { if slen > filterNumbericLength && (r >= '0' && r <= '9') { return true } } return s.stopWordTbl[strings.ToLower(word)] } return s.stopWordTbl[word] }
func (s *Segment) preSegment(text string) *list.List { result := s.getInitSegment(text) runes := utils.ToRunes(text) cur := result.Front() for cur != nil { if s.options.IgnoreSpace { if cur.Value.(*dict.WordInfo).WordType == dict.TSpace { lst := cur cur = cur.Next() result.Remove(lst) continue } } switch cur.Value.(*dict.WordInfo).WordType { case dict.TSimplifiedChinese: inputText := cur.Value.(*dict.WordInfo).Word originalWordType := dict.TSimplifiedChinese pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify) chsMatch := match.NewChsFullTextMatch(s.wordDictionary) chsMatch.SetOptionParams(s.options, s.params) chsMatchWords := chsMatch.Match(pls, inputText) curChsMatch := chsMatchWords.Front() for curChsMatch != nil { wi := curChsMatch.Value.(*dict.WordInfo) wi.Position += cur.Value.(*dict.WordInfo).Position wi.OriginalWordType = originalWordType wi.WordType = originalWordType curChsMatch = curChsMatch.Next() } rcur := utils.InsertAfterList(result, chsMatchWords, cur) removeItem := cur cur = rcur.Next() result.Remove(removeItem) case dict.TEnglish: cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) if s.options.IgnoreCapital { cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word) } if s.options.EnglishSegment { lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word) if lower != cur.Value.(*dict.WordInfo).Word { result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur) } stem := s.getStem(lower) if len(stem) > 0 { if lower != stem { result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur) } } } if s.options.EnglishMultiDimensionality { needSplit := false for _, c := range cur.Value.(*dict.WordInfo).Word { if (c >= '0' && c <= '9') || (c == '_') { needSplit = true break } } if needSplit { output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1) if len(output) > 1 { position := cur.Value.(*dict.WordInfo).Position for _, splitWord := range output { if len(splitWord) == 0 { continue } var wi *dict.WordInfo r := utils.FirstRune(splitWord) if r >= '0' && r <= '9' { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1) wi.Position = position wi.Rank = s.params.NumericRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TNumeric } else { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1) wi.Position = position wi.Rank = s.params.EnglishRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TEnglish } result.InsertBefore(wi, cur) position += utils.RuneLen(splitWord) } } } } var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TNumeric: cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TSymbol: cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank cur = cur.Next() default: cur = cur.Next() } } return result }
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List { if m.options == nil { m.options = NewMatchOptions() } if m.params == nil { m.params = NewMatchParameter() } runes := utils.ToRunes(originalText) masks := make([]int, len(runes)) redundancy := m.params.Redundancy result := list.New() if len(posLenArr) == 0 { if m.options.UnknownWordIdentify { wi := dict.NewWordInfoDefault() wi.Word = originalText wi.Position = 0 wi.WordType = dict.TNone wi.Rank = 1 result.PushFront(wi) return result } else { position := 0 for _, r := range runes { wi := dict.NewWordInfoDefault() wi.Word = string(r) wi.Position = position wi.WordType = dict.TNone wi.Rank = 1 position++ result.PushBack(wi) } return result } } leafNodeArray := m.getLeafNodeArray(posLenArr, originalText) // 获取前TopRecord个单词序列 j := 0 for _, node := range leafNodeArray { if leafNodeArray[j] == nil { break } if j >= TopRecord || j >= len(leafNodeArray) { break } comb := make([]dict.PositionLength, node.AboveCount) i := node.AboveCount - 1 cur := node for i >= 0 { comb[i] = cur.PosLen cur = cur.Parent i-- } m.allCombinations = append(m.allCombinations, comb) j++ } // Force single word // 强制一元分词 if m.options.ForceSingleWord { comb := make([]dict.PositionLength, len(runes)) for i := 0; i < len(comb); i++ { pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0)) pl.Level = 3 comb[i] = pl } m.allCombinations = append(m.allCombinations, comb) } if len(m.allCombinations) > 0 { positionCollection := m.mergeAllCombinations(redundancy) curPc := positionCollection.Front() for curPc != nil { pl := curPc.Value.(dict.PositionLength) wi := dict.NewWordInfoDefault() wi.Word = string(runes[pl.Position:(pl.Position + pl.Length)]) wi.Pos = pl.WordAttri.Pos wi.Frequency = pl.WordAttri.Frequency wi.WordType = dict.TSimplifiedChinese wi.Position = pl.Position switch pl.Level { case 0: wi.Rank = m.params.BestRank case 1: wi.Rank = m.params.SecRank case 2: wi.Rank = m.params.ThirdRank case 3: wi.Rank = m.params.SingleRank default: wi.Rank = m.params.BestRank } result.PushBack(wi) if pl.Length > 1 { for k := pl.Position; k < pl.Position+pl.Length; k++ { masks[k] = 2 } } else { masks[pl.Position] = 1 } curPc = curPc.Next() } } // 合并未登录词 unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes) // 合并结果序列到对应位置中 if len(unknownWords) > 0 { cur := result.Front() if needRemoveSingleWord && !m.options.ForceSingleWord { // remove single word need be removed for cur != nil { if utils.RuneLen(cur.Value.(*dict.WordInfo).Word) == 1 { if masks[cur.Value.(*dict.WordInfo).Position] == 11 { removeItem := cur cur = cur.Next() result.Remove(removeItem) continue } } cur = cur.Next() } } cur = result.Front() j = 0 for cur != nil { if cur.Value.(*dict.WordInfo).Position >= unknownWords[j].Position { result.InsertBefore(unknownWords[j], cur) j++ if j >= len(unknownWords) { break } } if cur.Value.(*dict.WordInfo).Position < unknownWords[j].Position { cur = cur.Next() } } for j < len(unknownWords) { result.PushBack(unknownWords[j]) j++ } } return result }
func (d *WordDictionary) GetAllMatchs(text string, chineseNameIdentify bool) (result []PositionLength) { result = []PositionLength{} if len(text) == 0 { return } rtext := utils.ToRunes(text) keyText := rtext if rtext[0] < 128 { keyText = utils.ToRunes(strings.ToLower(text)) } for i := 0; i < len(rtext); i++ { fst := keyText[i] var chsNames []string = nil if chineseNameIdentify { chsNames = d.ChineseName.Match(rtext, i) for _, name := range chsNames { wa := NewWordAttr(name, POS_A_NR, 0) result = append(result, PositionLength{0, i, utils.RuneLen(name), wa}) } } if fwa, ok := d.firstCharDict[fst]; ok { result = append(result, PositionLength{0, i, 1, fwa}) } if i < len(keyText)-1 { doubleChar := keyText[i]*65536 + keyText[i+1] if fwa, ok := d.doubleCharDict[doubleChar]; ok { result = append(result, PositionLength{0, i, 2, fwa}) } } if i >= len(keyText)-2 { continue } tripleChar := int64(int32(keyText[i]))*0x100000000 + int64(int32(keyText[i+1]))*65536 + int64(int32(keyText[i+2])) if lenList, ok := d.tripleCharDict[tripleChar]; ok { for _, ilen := range *lenList { if ilen == 0 { break } if (i + int(ilen)) > len(keyText) { continue } key := string(keyText[i:(i + int(ilen))]) if wa, ok := d.wordDict[key]; ok { if chsNames != nil { find := false for _, name := range chsNames { if wa.Word == name { find = true break } } if find { continue } } result = append(result, PositionLength{0, i, int(ilen), wa}) } } } } return }