Пример #1
0
// setto writes the runes of str into the buffer s.b, beginning at the slot
// directly after s.j, and then sets s.k to s.j plus the rune length of str
// (the index of the last rune written when str is non-empty).
func (s *Stemmer) setto(str string) {
	n := utils.RuneLen(str)
	replacement := utils.ToRunes(str)
	// Walk the source and destination indices in lock-step.
	for src, dst := 0, s.j+1; src < n; src, dst = src+1, dst+1 {
		s.b[dst] = replacement[src]
	}
	s.k = s.j + n
}
Пример #2
0
// ends reports whether the buffer contents up to and including index s.k
// finish with str. On a match s.j is set to the index just before the matched
// suffix (s.k minus the rune length of str); otherwise s.j is left untouched.
func (s *Stemmer) ends(str string) bool {
	n := utils.RuneLen(str)
	start := s.k + 1 - n
	if start < 0 {
		// str has more runes than b[0..k] holds, so it cannot be a suffix.
		return false
	}

	suffix := utils.ToRunes(str)
	for src, pos := 0, start; src < n; src, pos = src+1, pos+1 {
		if suffix[src] != s.b[pos] {
			return false
		}
	}

	s.j = s.k - n
	return true
}
Пример #3
0
// mergeEnglishSpecialWord tries to fuse the token at current with the run of
// symbol/English tokens that directly follows it into a single word.
//
// The candidate is the slice of orginalText from current's Position to the end
// of the last token of that run. It is only accepted when the word dictionary
// returns an attribute for it. On success every token from current up to the
// end of the run is removed from wordInfoList, one English WordInfo carrying
// the dictionary attributes is inserted in their place, and the method returns
// true together with the element that follows the merged word (nil when the
// run reached the end of the list). In every other case the list is left
// unchanged and (false, current) is returned.
func (s *Segment) mergeEnglishSpecialWord(orginalText []rune, wordInfoList *list.List, current *list.Element) (bool, *list.Element) {
	// Scan forward over the consecutive symbol/English tokens after current,
	// remembering where the last one ends. stop ends up on the first element
	// that is not part of the run (or nil).
	end := -1
	stop := current.Next()
	for ; stop != nil; stop = stop.Next() {
		info := stop.Value.(*dict.WordInfo)
		if info.WordType != dict.TSymbol && info.WordType != dict.TEnglish {
			break
		}
		end = info.Position + utils.RuneLen(info.Word)
	}
	if end < 0 {
		// Nothing mergeable follows current.
		return false, current
	}

	start := current.Value.(*dict.WordInfo).Position
	merged := orginalText[start:end]
	attr := s.wordDictionary.GetWordAttr(merged)
	if attr == nil {
		return false, current
	}

	// Drop every token in [current, stop). The successor is fetched before
	// the removal because a removed element no longer links to its neighbour.
	for current != stop {
		next := current.Next()
		wordInfoList.Remove(current)
		current = next
	}

	word := dict.NewWordInfoDefault()
	word.Word = string(merged)
	word.Pos = attr.Pos
	word.Frequency = attr.Frequency
	word.WordType = dict.TEnglish
	word.Position = start
	word.Rank = s.params.EnglishRank

	if current != nil {
		wordInfoList.InsertBefore(word, current)
	} else {
		wordInfoList.PushBack(word)
	}

	return true, current
}
Пример #4
0
// IsStopWord reports whether word should be discarded as a stop word.
//
// The empty string is never a stop word. A word whose first rune is outside
// the ASCII range is looked up in the stop-word table as is. For a word that
// starts with an ASCII rune:
//   - with filterEnglish set, it is a stop word when it does not start with a
//     digit and has more than filterEnglishLength runes;
//   - with filterNumeric set, it is a stop word when it starts with a digit
//     and has more than filterNumbericLength runes;
//   - otherwise the lower-cased word is looked up in the stop-word table.
func (s *StopWord) IsStopWord(word string, filterEnglish bool, filterEnglishLength int, filterNumeric bool, filterNumbericLength int) bool {
	if word == "" {
		return false
	}

	first := utils.FirstRune(word)
	if first >= 128 {
		return s.stopWordTbl[word]
	}

	runeCount := utils.RuneLen(word)
	startsWithDigit := first >= '0' && first <= '9'
	switch {
	case filterEnglish && !startsWithDigit && runeCount > filterEnglishLength:
		return true
	case filterNumeric && startsWithDigit && runeCount > filterNumbericLength:
		return true
	}

	return s.stopWordTbl[strings.ToLower(word)]
}
Пример #5
0
// preSegment builds the initial token list for text with getInitSegment and
// then walks it once, refining each token in place according to its WordType.
// The same list, after all insertions and removals, is returned.
//
//   - Space tokens are removed when options.IgnoreSpace is set.
//   - Simplified-Chinese tokens are replaced by the words a ChsFullTextMatch
//     produces from the token's dictionary matches.
//   - English tokens get their rank set and, depending on the options, gain
//     lower-case, stem and letter/digit split variants inserted before them.
//   - English and numeric tokens are then offered to mergeEnglishSpecialWord.
//   - Symbol tokens only get their rank set.
func (s *Segment) preSegment(text string) *list.List {
	result := s.getInitSegment(text)
	// Rune view of the input, used by mergeEnglishSpecialWord to slice words
	// by rune position.
	runes := utils.ToRunes(text)
	cur := result.Front()
	for cur != nil {
		if s.options.IgnoreSpace {
			if cur.Value.(*dict.WordInfo).WordType == dict.TSpace {
				// Advance first: the element cannot be used for traversal
				// once it has been removed.
				lst := cur
				cur = cur.Next()
				result.Remove(lst)
				continue
			}
		}
		switch cur.Value.(*dict.WordInfo).WordType {
		case dict.TSimplifiedChinese:
			inputText := cur.Value.(*dict.WordInfo).Word
			originalWordType := dict.TSimplifiedChinese
			pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify)
			chsMatch := match.NewChsFullTextMatch(s.wordDictionary)
			chsMatch.SetOptionParams(s.options, s.params)
			chsMatchWords := chsMatch.Match(pls, inputText)
			// Match reports positions relative to inputText; shift them by the
			// token's own position and stamp the word types.
			curChsMatch := chsMatchWords.Front()
			for curChsMatch != nil {
				wi := curChsMatch.Value.(*dict.WordInfo)
				wi.Position += cur.Value.(*dict.WordInfo).Position
				wi.OriginalWordType = originalWordType
				wi.WordType = originalWordType
				curChsMatch = curChsMatch.Next()
			}
			// Splice the matched words in after the token, then drop the token.
			// NOTE(review): rcur is presumably the last element inserted, so
			// rcur.Next() resumes after the new words — confirm against
			// utils.InsertAfterList.
			rcur := utils.InsertAfterList(result, chsMatchWords, cur)
			removeItem := cur
			cur = rcur.Next()
			result.Remove(removeItem)
		case dict.TEnglish:
			cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank
			// NOTE(review): judging by its name this normalises full-width
			// characters to ASCII — confirm against convertChineseCapicalToAsiic.
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			if s.options.IgnoreCapital {
				cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word)
			}

			if s.options.EnglishSegment {
				// Insert the lower-case form and the stem as extra tokens at
				// the same position, but only when they differ from what is
				// already present.
				lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word)
				if lower != cur.Value.(*dict.WordInfo).Word {
					result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur)
				}
				stem := s.getStem(lower)
				if len(stem) > 0 {
					if lower != stem {
						result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur)
					}
				}
			}

			if s.options.EnglishMultiDimensionality {
				// Only words containing a digit or an underscore are candidates
				// for splitting.
				needSplit := false
				for _, c := range cur.Value.(*dict.WordInfo).Word {
					if (c >= '0' && c <= '9') || (c == '_') {
						needSplit = true
						break
					}
				}
				if needSplit {
					output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1)
					if len(output) > 1 {
						// Each piece is inserted before the original token;
						// position advances by the rune length of every piece.
						position := cur.Value.(*dict.WordInfo).Position
						for _, splitWord := range output {
							if len(splitWord) == 0 {
								continue
							}

							var wi *dict.WordInfo
							r := utils.FirstRune(splitWord)
							if r >= '0' && r <= '9' {
								// Piece starting with a digit: numeric token.
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1)
								wi.Position = position
								wi.Rank = s.params.NumericRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TNumeric
							} else {
								// Any other piece: English token.
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1)
								wi.Position = position
								wi.Rank = s.params.EnglishRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TEnglish
							}

							result.InsertBefore(wi, cur)
							position += utils.RuneLen(splitWord)
						}
					}
				}
			}

			// On a successful merge the returned element is already the next
			// one to process; otherwise step forward manually.
			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}

		case dict.TNumeric:
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank
			// Same merge attempt and cursor handling as for English tokens.
			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}
		case dict.TSymbol:
			cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank
			cur = cur.Next()
		default:
			cur = cur.Next()
		}
	}
	return result
}
Пример #6
0
// Match turns the candidate dictionary hits in posLenArr into the word list
// for originalText and returns it as a list of *dict.WordInfo.
//
// Missing options or params are replaced with NewMatchOptions and
// NewMatchParameter. When posLenArr is empty the text is returned either as a
// single TNone word (UnknownWordIdentify set) or as one TNone word per rune.
// Otherwise the combinations derived from getLeafNodeArray (plus a
// one-rune-per-word combination when ForceSingleWord is set) are merged by
// mergeAllCombinations, ranked according to their Level, and finally
// interleaved with the unknown words reported by getUnknownWords.
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List {
	if m.options == nil {
		m.options = NewMatchOptions()
	}
	if m.params == nil {
		m.params = NewMatchParameter()
	}
	runes := utils.ToRunes(originalText)
	// One mask per rune: set to 2 below for runes covered by a multi-rune
	// word and to 1 for runes covered by a single-rune word.
	masks := make([]int, len(runes))
	redundancy := m.params.Redundancy

	result := list.New()
	if len(posLenArr) == 0 {
		// No dictionary candidates at all.
		if m.options.UnknownWordIdentify {
			// Return the whole text as one word.
			wi := dict.NewWordInfoDefault()
			wi.Word = originalText
			wi.Position = 0
			wi.WordType = dict.TNone
			wi.Rank = 1
			result.PushFront(wi)
			return result
		} else {
			// Return every rune as its own word.
			position := 0
			for _, r := range runes {
				wi := dict.NewWordInfoDefault()
				wi.Word = string(r)
				wi.Position = position
				wi.WordType = dict.TNone
				wi.Rank = 1
				position++
				result.PushBack(wi)
			}
			return result
		}
	}

	leafNodeArray := m.getLeafNodeArray(posLenArr, originalText)

	// Take the first TopRecord word sequences. j mirrors the range index;
	// iteration stops at the first nil entry or once TopRecord is reached.
	j := 0
	for _, node := range leafNodeArray {
		if leafNodeArray[j] == nil {
			break
		}
		if j >= TopRecord || j >= len(leafNodeArray) {
			break
		}
		// Rebuild the sequence by following Parent links from the leaf,
		// filling comb from its last slot to its first.
		comb := make([]dict.PositionLength, node.AboveCount)
		i := node.AboveCount - 1
		cur := node
		for i >= 0 {
			comb[i] = cur.PosLen
			cur = cur.Parent
			i--
		}
		m.allCombinations = append(m.allCombinations, comb)
		j++
	}

	// Force single word: add a combination in which every rune is its own
	// word, at Level 3.
	if m.options.ForceSingleWord {
		comb := make([]dict.PositionLength, len(runes))
		for i := 0; i < len(comb); i++ {
			pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0))
			pl.Level = 3
			comb[i] = pl
		}
		m.allCombinations = append(m.allCombinations, comb)
	}

	if len(m.allCombinations) > 0 {
		positionCollection := m.mergeAllCombinations(redundancy)
		curPc := positionCollection.Front()
		for curPc != nil {
			pl := curPc.Value.(dict.PositionLength)
			wi := dict.NewWordInfoDefault()
			wi.Word = string(runes[pl.Position:(pl.Position + pl.Length)])
			wi.Pos = pl.WordAttri.Pos
			wi.Frequency = pl.WordAttri.Frequency
			wi.WordType = dict.TSimplifiedChinese
			wi.Position = pl.Position
			// Map the combination level to a rank; unexpected levels fall
			// back to BestRank.
			switch pl.Level {
			case 0:
				wi.Rank = m.params.BestRank
			case 1:
				wi.Rank = m.params.SecRank
			case 2:
				wi.Rank = m.params.ThirdRank
			case 3:
				wi.Rank = m.params.SingleRank
			default:
				wi.Rank = m.params.BestRank
			}

			result.PushBack(wi)
			// Record which runes are covered and by what kind of word.
			if pl.Length > 1 {
				for k := pl.Position; k < pl.Position+pl.Length; k++ {
					masks[k] = 2
				}
			} else {
				masks[pl.Position] = 1
			}
			curPc = curPc.Next()
		}
	}

	// Collect the unknown (out-of-vocabulary) words.
	unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes)
	// Merge them into the result at the matching positions.
	if len(unknownWords) > 0 {
		cur := result.Front()
		if needRemoveSingleWord && !m.options.ForceSingleWord {
			// remove single word need be removed
			// NOTE(review): mask value 11 is not assigned in this function;
			// presumably getUnknownWords sets it on single-rune words that it
			// folded into an unknown word — confirm.
			for cur != nil {
				if utils.RuneLen(cur.Value.(*dict.WordInfo).Word) == 1 {
					if masks[cur.Value.(*dict.WordInfo).Position] == 11 {
						removeItem := cur
						cur = cur.Next()
						result.Remove(removeItem)
						continue
					}
				}
				cur = cur.Next()
			}
		}

		// Walk the result and the unknown words together, inserting each
		// unknown word before the first result word whose position is not
		// smaller than its own.
		cur = result.Front()
		j = 0
		for cur != nil {
			if cur.Value.(*dict.WordInfo).Position >= unknownWords[j].Position {
				result.InsertBefore(unknownWords[j], cur)
				j++
				if j >= len(unknownWords) {
					break
				}
			}

			// Only advance while the current word still precedes the next
			// unknown word; otherwise loop again to insert that one too.
			if cur.Value.(*dict.WordInfo).Position < unknownWords[j].Position {
				cur = cur.Next()
			}
		}

		// Unknown words beyond the last result word go at the end.
		for j < len(unknownWords) {
			result.PushBack(unknownWords[j])
			j++
		}
	}

	return result
}
Пример #7
0
// GetAllMatchs scans text rune by rune and returns every dictionary hit found
// at each position as a PositionLength (level 0, start position, rune length,
// word attribute). An empty text yields an empty, non-nil slice.
//
// Lookup keys are taken from the lower-cased text when the first rune is
// ASCII, otherwise from the text itself. At each position the method collects,
// in this order:
//   - the names reported by d.ChineseName.Match, when chineseNameIdentify is
//     set;
//   - the single-rune entry from firstCharDict;
//   - the two-rune entry from doubleCharDict;
//   - every word in wordDict whose length is listed in tripleCharDict for the
//     three runes starting here, skipping words already reported as a Chinese
//     name at this position.
func (d *WordDictionary) GetAllMatchs(text string, chineseNameIdentify bool) (result []PositionLength) {
	result = []PositionLength{}
	if text == "" {
		return result
	}

	original := utils.ToRunes(text)

	keys := original
	if original[0] < 128 {
		keys = utils.ToRunes(strings.ToLower(text))
	}
	keyCount := len(keys)

	for pos := 0; pos < len(original); pos++ {
		first := keys[pos]

		var names []string
		if chineseNameIdentify {
			names = d.ChineseName.Match(original, pos)
			for _, name := range names {
				attr := NewWordAttr(name, POS_A_NR, 0)
				result = append(result, PositionLength{0, pos, utils.RuneLen(name), attr})
			}
		}

		if attr, ok := d.firstCharDict[first]; ok {
			result = append(result, PositionLength{0, pos, 1, attr})
		}

		if pos+1 < keyCount {
			pair := first*65536 + keys[pos+1]
			if attr, ok := d.doubleCharDict[pair]; ok {
				result = append(result, PositionLength{0, pos, 2, attr})
			}
		}

		// Longer words need at least three runes from this position.
		if pos+2 >= keyCount {
			continue
		}

		triple := (int64(int32(first)) << 32) + (int64(int32(keys[pos+1])) << 16) + int64(int32(keys[pos+2]))
		lengths, ok := d.tripleCharDict[triple]
		if !ok {
			continue
		}

	candidates:
		for _, n := range *lengths {
			if n == 0 {
				// A zero length ends the list of candidate lengths.
				break
			}
			end := pos + int(n)
			if end > keyCount {
				continue
			}
			wa, found := d.wordDict[string(keys[pos:end])]
			if !found {
				continue
			}
			// Skip words already emitted above as a Chinese name.
			for _, name := range names {
				if wa.Word == name {
					continue candidates
				}
			}
			result = append(result, PositionLength{0, pos, int(n), wa})
		}
	}

	return result
}