func NewChsName() *ChsName { c := &ChsName{} c.familyNameDict = make(map[rune]([]rune)) c.singleNameDict = make(map[rune]rune) c.doubleName1Dict = make(map[rune]rune) c.doubleName2Dict = make(map[rune]rune) for _, name := range FAMILY_NAMES { runes := utils.ToRunes(name) if len(runes) == 1 { if _, ok := c.familyNameDict[runes[0]]; !ok { c.familyNameDict[runes[0]] = nil } } else { if v, ok := c.familyNameDict[runes[0]]; ok { if v == nil { c.familyNameDict[runes[0]] = []rune{0} } c.familyNameDict[runes[0]] = append(c.familyNameDict[runes[0]], runes[1]) } else { c.familyNameDict[runes[0]] = []rune{runes[1]} } } } return c }
func (s *Segment) getInitSegment(text string) *list.List { result := list.New() runes := utils.ToRunes(text) lexical := framework.NewLexical(runes) var dfaResult int for i := 0; i < len(runes); i++ { dfaResult = lexical.Input(runes[i], i) switch dfaResult { case framework.Continue: continue case framework.Quit: result.PushBack(lexical.OutputToken) case framework.ElseQuit: result.PushBack(lexical.OutputToken) if lexical.OldState != 255 { i-- } } } dfaResult = lexical.Input(0, len(runes)) switch dfaResult { case framework.Continue: case framework.Quit: result.PushBack(lexical.OutputToken) case framework.ElseQuit: result.PushBack(lexical.OutputToken) } return result }
func (d *WordDictionary) Load(fileName string) (err error) { d.wordDict = make(map[string](*WordAttr)) d.firstCharDict = make(map[rune](*WordAttr)) d.doubleCharDict = make(map[int32](*WordAttr)) d.tripleCharDict = make(map[int64](*[]byte)) waList, err := d.loadFromTextFile(fileName) if err != nil { return err } for e := waList.Front(); e != nil; e = e.Next() { key := strings.ToLower(e.Value.(*WordAttr).Word) runes := utils.ToRunes(key) if len(runes) == 1 { d.firstCharDict[runes[0]] = e.Value.(*WordAttr) continue } if len(runes) == 2 { doubleChar := runes[0]*65536 + runes[1] d.doubleCharDict[doubleChar] = e.Value.(*WordAttr) continue } d.wordDict[key] = e.Value.(*WordAttr) tripleChar := int64(int32(runes[0]))*int64(0x100000000) + int64(int32(runes[1]))*int64(65536) + int64(int32(runes[2])) var wordLenArray []byte v, ok := d.tripleCharDict[tripleChar] if !ok { wordLenArray = make([]byte, 4) wordLenArray[0] = byte(len(runes)) d.tripleCharDict[tripleChar] = &wordLenArray } else { find := false i := 0 for i = 0; i < len(*v); i++ { if (*v)[i] == byte(len(runes)) { find = true break } if (*v)[i] == byte(0) { (*v)[i] = byte(len(runes)) find = true break } } if !find { var temp []byte = make([]byte, len(*v)*2) copy(temp, (*v)) temp[i] = byte(len(runes)) d.tripleCharDict[tripleChar] = &temp } } } return nil }
func (c *ChsName) loadNameDict(filePath string, dict map[rune]rune) (err error) { err = utils.EachLine(filePath, func(line string) { if len(line) > 0 { runes := utils.ToRunes(line) dict[runes[0]] = runes[0] } }) return }
func (s *Stemmer) setto(str string) { l := utils.RuneLen(str) o := s.j + 1 sc := utils.ToRunes(str) for i := 0; i < l; i++ { s.b[o+i] = sc[i] } s.k = s.j + l }
func (s *Stemmer) ends(str string) bool { l := utils.RuneLen(str) o := s.k - l + 1 if o < 0 { return false } sc := utils.ToRunes(str) for i := 0; i < l; i++ { if s.b[o+i] != sc[i] { return false } } s.j = s.k - l return true }
func (s *Segment) convertChineseCapicalToAsiic(text string) string { runes := utils.ToRunes(text) for i := 0; i < len(runes); i++ { if runes[i] >= '0' && runes[i] <= '9' { runes[i] -= '0' runes[i] += '0' } else if runes[i] >= 'a' && runes[i] <= 'z' { runes[i] -= 'a' runes[i] += 'a' } else if runes[i] >= 'A' && runes[i] <= 'Z' { runes[i] -= 'A' runes[i] += 'A' } } return string(runes) }
func (d *WordDictionary) GetAllMatchs(text string, chineseNameIdentify bool) (result []PositionLength) { result = []PositionLength{} if len(text) == 0 { return } rtext := utils.ToRunes(text) keyText := rtext if rtext[0] < 128 { keyText = utils.ToRunes(strings.ToLower(text)) } for i := 0; i < len(rtext); i++ { fst := keyText[i] var chsNames []string = nil if chineseNameIdentify { chsNames = d.ChineseName.Match(rtext, i) for _, name := range chsNames { wa := NewWordAttr(name, POS_A_NR, 0) result = append(result, PositionLength{0, i, utils.RuneLen(name), wa}) } } if fwa, ok := d.firstCharDict[fst]; ok { result = append(result, PositionLength{0, i, 1, fwa}) } if i < len(keyText)-1 { doubleChar := keyText[i]*65536 + keyText[i+1] if fwa, ok := d.doubleCharDict[doubleChar]; ok { result = append(result, PositionLength{0, i, 2, fwa}) } } if i >= len(keyText)-2 { continue } tripleChar := int64(int32(keyText[i]))*0x100000000 + int64(int32(keyText[i+1]))*65536 + int64(int32(keyText[i+2])) if lenList, ok := d.tripleCharDict[tripleChar]; ok { for _, ilen := range *lenList { if ilen == 0 { break } if (i + int(ilen)) > len(keyText) { continue } key := string(keyText[i:(i + int(ilen))]) if wa, ok := d.wordDict[key]; ok { if chsNames != nil { find := false for _, name := range chsNames { if wa.Word == name { find = true break } } if find { continue } } result = append(result, PositionLength{0, i, int(ilen), wa}) } } } } return }
func (s *Segment) preSegment(text string) *list.List { result := s.getInitSegment(text) runes := utils.ToRunes(text) cur := result.Front() for cur != nil { if s.options.IgnoreSpace { if cur.Value.(*dict.WordInfo).WordType == dict.TSpace { lst := cur cur = cur.Next() result.Remove(lst) continue } } switch cur.Value.(*dict.WordInfo).WordType { case dict.TSimplifiedChinese: inputText := cur.Value.(*dict.WordInfo).Word originalWordType := dict.TSimplifiedChinese pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify) chsMatch := match.NewChsFullTextMatch(s.wordDictionary) chsMatch.SetOptionParams(s.options, s.params) chsMatchWords := chsMatch.Match(pls, inputText) curChsMatch := chsMatchWords.Front() for curChsMatch != nil { wi := curChsMatch.Value.(*dict.WordInfo) wi.Position += cur.Value.(*dict.WordInfo).Position wi.OriginalWordType = originalWordType wi.WordType = originalWordType curChsMatch = curChsMatch.Next() } rcur := utils.InsertAfterList(result, chsMatchWords, cur) removeItem := cur cur = rcur.Next() result.Remove(removeItem) case dict.TEnglish: cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) if s.options.IgnoreCapital { cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word) } if s.options.EnglishSegment { lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word) if lower != cur.Value.(*dict.WordInfo).Word { result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur) } stem := s.getStem(lower) if len(stem) > 0 { if lower != stem { result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur) } } } if s.options.EnglishMultiDimensionality { needSplit := false for _, c := range cur.Value.(*dict.WordInfo).Word { if (c >= '0' && c <= '9') || (c == '_') { needSplit = true break } } if needSplit { output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1) if len(output) > 1 { position := cur.Value.(*dict.WordInfo).Position for _, splitWord := range output { if len(splitWord) == 0 { continue } var wi *dict.WordInfo r := utils.FirstRune(splitWord) if r >= '0' && r <= '9' { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1) wi.Position = position wi.Rank = s.params.NumericRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TNumeric } else { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1) wi.Position = position wi.Rank = s.params.EnglishRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TEnglish } result.InsertBefore(wi, cur) position += utils.RuneLen(splitWord) } } } } var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TNumeric: cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TSymbol: cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank cur = cur.Next() default: cur = cur.Next() } } return result }
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List { if m.options == nil { m.options = NewMatchOptions() } if m.params == nil { m.params = NewMatchParameter() } runes := utils.ToRunes(originalText) masks := make([]int, len(runes)) redundancy := m.params.Redundancy result := list.New() if len(posLenArr) == 0 { if m.options.UnknownWordIdentify { wi := dict.NewWordInfoDefault() wi.Word = originalText wi.Position = 0 wi.WordType = dict.TNone wi.Rank = 1 result.PushFront(wi) return result } else { position := 0 for _, r := range runes { wi := dict.NewWordInfoDefault() wi.Word = string(r) wi.Position = position wi.WordType = dict.TNone wi.Rank = 1 position++ result.PushBack(wi) } return result } } leafNodeArray := m.getLeafNodeArray(posLenArr, originalText) // 获取前TopRecord个单词序列 j := 0 for _, node := range leafNodeArray { if leafNodeArray[j] == nil { break } if j >= TopRecord || j >= len(leafNodeArray) { break } comb := make([]dict.PositionLength, node.AboveCount) i := node.AboveCount - 1 cur := node for i >= 0 { comb[i] = cur.PosLen cur = cur.Parent i-- } m.allCombinations = append(m.allCombinations, comb) j++ } // Force single word // 强制一元分词 if m.options.ForceSingleWord { comb := make([]dict.PositionLength, len(runes)) for i := 0; i < len(comb); i++ { pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0)) pl.Level = 3 comb[i] = pl } m.allCombinations = append(m.allCombinations, comb) } if len(m.allCombinations) > 0 { positionCollection := m.mergeAllCombinations(redundancy) curPc := positionCollection.Front() for curPc != nil { pl := curPc.Value.(dict.PositionLength) wi := dict.NewWordInfoDefault() wi.Word = string(runes[pl.Position:(pl.Position + pl.Length)]) wi.Pos = pl.WordAttri.Pos wi.Frequency = pl.WordAttri.Frequency wi.WordType = dict.TSimplifiedChinese wi.Position = pl.Position switch pl.Level { case 0: wi.Rank = m.params.BestRank case 1: wi.Rank = m.params.SecRank case 2: wi.Rank = m.params.ThirdRank case 3: wi.Rank = m.params.SingleRank default: wi.Rank = m.params.BestRank } result.PushBack(wi) if pl.Length > 1 { for k := pl.Position; k < pl.Position+pl.Length; k++ { masks[k] = 2 } } else { masks[pl.Position] = 1 } curPc = curPc.Next() } } // 合并未登录词 unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes) // 合并结果序列到对应位置中 if len(unknownWords) > 0 { cur := result.Front() if needRemoveSingleWord && !m.options.ForceSingleWord { // remove single word need be removed for cur != nil { if utils.RuneLen(cur.Value.(*dict.WordInfo).Word) == 1 { if masks[cur.Value.(*dict.WordInfo).Position] == 11 { removeItem := cur cur = cur.Next() result.Remove(removeItem) continue } } cur = cur.Next() } } cur = result.Front() j = 0 for cur != nil { if cur.Value.(*dict.WordInfo).Position >= unknownWords[j].Position { result.InsertBefore(unknownWords[j], cur) j++ if j >= len(unknownWords) { break } } if cur.Value.(*dict.WordInfo).Position < unknownWords[j].Position { cur = cur.Next() } } for j < len(unknownWords) { result.PushBack(unknownWords[j]) j++ } } return result }