예제 #1
0
파일: stop_word.go 프로젝트: cluo/gosegment
func (s *StopWord) Load(file string) (err error) {
	err = utils.EachLine(file, func(line string) {
		if len(line) > 0 {
			if utils.FirstRune(line) < 128 {
				s.stopWordTbl[strings.ToLower(line)] = true
			} else {
				s.stopWordTbl[line] = true
			}
		}
	})
	return
}
예제 #2
0
파일: stop_word.go 프로젝트: cluo/gosegment
func (s *StopWord) IsStopWord(word string, filterEnglish bool, filterEnglishLength int, filterNumeric bool, filterNumbericLength int) bool {
	if len(word) == 0 {
		return false
	}

	r := utils.FirstRune(word)
	if r < 128 {
		slen := utils.RuneLen(word)
		if filterEnglish {
			if slen > filterEnglishLength && (r < '0' || r > '9') {
				return true
			}
		}
		if filterNumeric {
			if slen > filterNumbericLength && (r >= '0' && r <= '9') {
				return true
			}
		}
		return s.stopWordTbl[strings.ToLower(word)]
	}

	return s.stopWordTbl[word]
}
예제 #3
0
파일: segment.go 프로젝트: cluo/gosegment
func (s *Segment) preSegment(text string) *list.List {
	result := s.getInitSegment(text)
	runes := utils.ToRunes(text)
	cur := result.Front()
	for cur != nil {
		if s.options.IgnoreSpace {
			if cur.Value.(*dict.WordInfo).WordType == dict.TSpace {
				lst := cur
				cur = cur.Next()
				result.Remove(lst)
				continue
			}
		}
		switch cur.Value.(*dict.WordInfo).WordType {
		case dict.TSimplifiedChinese:
			inputText := cur.Value.(*dict.WordInfo).Word
			originalWordType := dict.TSimplifiedChinese
			pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify)
			chsMatch := match.NewChsFullTextMatch(s.wordDictionary)
			chsMatch.SetOptionParams(s.options, s.params)
			chsMatchWords := chsMatch.Match(pls, inputText)
			curChsMatch := chsMatchWords.Front()
			for curChsMatch != nil {
				wi := curChsMatch.Value.(*dict.WordInfo)
				wi.Position += cur.Value.(*dict.WordInfo).Position
				wi.OriginalWordType = originalWordType
				wi.WordType = originalWordType
				curChsMatch = curChsMatch.Next()
			}
			rcur := utils.InsertAfterList(result, chsMatchWords, cur)
			removeItem := cur
			cur = rcur.Next()
			result.Remove(removeItem)
		case dict.TEnglish:
			cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			if s.options.IgnoreCapital {
				cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word)
			}

			if s.options.EnglishSegment {
				lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word)
				if lower != cur.Value.(*dict.WordInfo).Word {
					result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur)
				}
				stem := s.getStem(lower)
				if len(stem) > 0 {
					if lower != stem {
						result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur)
					}
				}
			}

			if s.options.EnglishMultiDimensionality {
				needSplit := false
				for _, c := range cur.Value.(*dict.WordInfo).Word {
					if (c >= '0' && c <= '9') || (c == '_') {
						needSplit = true
						break
					}
				}
				if needSplit {
					output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1)
					if len(output) > 1 {
						position := cur.Value.(*dict.WordInfo).Position
						for _, splitWord := range output {
							if len(splitWord) == 0 {
								continue
							}

							var wi *dict.WordInfo
							r := utils.FirstRune(splitWord)
							if r >= '0' && r <= '9' {
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1)
								wi.Position = position
								wi.Rank = s.params.NumericRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TNumeric
							} else {
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1)
								wi.Position = position
								wi.Rank = s.params.EnglishRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TEnglish
							}

							result.InsertBefore(wi, cur)
							position += utils.RuneLen(splitWord)
						}
					}
				}
			}

			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}

		case dict.TNumeric:
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank
			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}
		case dict.TSymbol:
			cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank
			cur = cur.Next()
		default:
			cur = cur.Next()
		}
	}
	return result
}