func (s *StopWord) Load(file string) (err error) { err = utils.EachLine(file, func(line string) { if len(line) > 0 { if utils.FirstRune(line) < 128 { s.stopWordTbl[strings.ToLower(line)] = true } else { s.stopWordTbl[line] = true } } }) return }
func (s *StopWord) IsStopWord(word string, filterEnglish bool, filterEnglishLength int, filterNumeric bool, filterNumbericLength int) bool { if len(word) == 0 { return false } r := utils.FirstRune(word) if r < 128 { slen := utils.RuneLen(word) if filterEnglish { if slen > filterEnglishLength && (r < '0' || r > '9') { return true } } if filterNumeric { if slen > filterNumbericLength && (r >= '0' && r <= '9') { return true } } return s.stopWordTbl[strings.ToLower(word)] } return s.stopWordTbl[word] }
func (s *Segment) preSegment(text string) *list.List { result := s.getInitSegment(text) runes := utils.ToRunes(text) cur := result.Front() for cur != nil { if s.options.IgnoreSpace { if cur.Value.(*dict.WordInfo).WordType == dict.TSpace { lst := cur cur = cur.Next() result.Remove(lst) continue } } switch cur.Value.(*dict.WordInfo).WordType { case dict.TSimplifiedChinese: inputText := cur.Value.(*dict.WordInfo).Word originalWordType := dict.TSimplifiedChinese pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify) chsMatch := match.NewChsFullTextMatch(s.wordDictionary) chsMatch.SetOptionParams(s.options, s.params) chsMatchWords := chsMatch.Match(pls, inputText) curChsMatch := chsMatchWords.Front() for curChsMatch != nil { wi := curChsMatch.Value.(*dict.WordInfo) wi.Position += cur.Value.(*dict.WordInfo).Position wi.OriginalWordType = originalWordType wi.WordType = originalWordType curChsMatch = curChsMatch.Next() } rcur := utils.InsertAfterList(result, chsMatchWords, cur) removeItem := cur cur = rcur.Next() result.Remove(removeItem) case dict.TEnglish: cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) if s.options.IgnoreCapital { cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word) } if s.options.EnglishSegment { lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word) if lower != cur.Value.(*dict.WordInfo).Word { result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur) } stem := s.getStem(lower) if len(stem) > 0 { if lower != stem { result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur) } } } if s.options.EnglishMultiDimensionality { needSplit := false for _, c := range cur.Value.(*dict.WordInfo).Word { if (c >= '0' && c <= '9') || (c == '_') { needSplit = true break } } if needSplit { output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1) if len(output) > 1 { position := cur.Value.(*dict.WordInfo).Position for _, splitWord := range output { if len(splitWord) == 0 { continue } var wi *dict.WordInfo r := utils.FirstRune(splitWord) if r >= '0' && r <= '9' { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1) wi.Position = position wi.Rank = s.params.NumericRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TNumeric } else { wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1) wi.Position = position wi.Rank = s.params.EnglishRank wi.OriginalWordType = dict.TEnglish wi.WordType = dict.TEnglish } result.InsertBefore(wi, cur) position += utils.RuneLen(splitWord) } } } } var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TNumeric: cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word) cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank var ok bool if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok { cur = cur.Next() } case dict.TSymbol: cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank cur = cur.Next() default: cur = cur.Next() } } return result }