コード例 #1
0
ファイル: lexical.go プロジェクト: cluo/gosegment
// DoThings performs the output action selected by s.Func: it installs a fresh
// default token in dfa.OutputToken, lets the state fill in the token text, and
// then tags the token with the word type that corresponds to the action.
//
// When s.Func matches none of the handled actions, dfa is left untouched.
// The action argument is accepted but not consulted here.
func (s *State) DoThings(action rune, dfa *Lexical) {
	// Snapshot the selector so both phases below dispatch on the same value.
	fn := s.Func

	// Phase 1: allocate the token and load its text. Other is the only action
	// that loads through getText; every remaining action uses getTextElse.
	switch fn {
	case OutputIdentifier, OutputSpace, OutputNumeric, OutputChinese:
		dfa.OutputToken = dict.NewWordInfoDefault()
		s.getTextElse(dfa)
	case Other:
		dfa.OutputToken = dict.NewWordInfoDefault()
		s.getText(dfa)
	default:
		return
	}

	// Phase 2: classify the token that was just produced.
	switch fn {
	case OutputIdentifier:
		dfa.OutputToken.WordType = dict.TEnglish
	case OutputSpace:
		dfa.OutputToken.WordType = dict.TSpace
	case OutputNumeric:
		dfa.OutputToken.WordType = dict.TNumeric
	case OutputChinese:
		dfa.OutputToken.WordType = dict.TSimplifiedChinese
	case Other:
		dfa.OutputToken.WordType = dict.TSymbol
	}
}
コード例 #2
0
ファイル: segment.go プロジェクト: cluo/gosegment
// mergeEnglishSpecialWord tries to fuse current and the run of symbol/English
// tokens that directly follows it into one dictionary word.
//
// The candidate text is orginalText[first:last], where first is the position
// of current and last is the end (position + rune length) of the final
// symbol/English token in the run. If the run is empty, or the dictionary has
// no entry for the candidate, nothing is modified and (false, current) is
// returned.
//
// On success every element from current up to the end of the run is removed
// from wordInfoList and replaced by a single TEnglish token carrying the
// dictionary's Pos and Frequency and the configured EnglishRank. The returned
// element is the one that now follows the merged token (nil when the merged
// token ended up at the back of the list).
func (s *Segment) mergeEnglishSpecialWord(orginalText []rune, wordInfoList *list.List, current *list.Element) (bool, *list.Element) {
	// Walk forward over the symbol/English run; stop ends up on the first
	// element past the run (or nil at the end of the list).
	end := -1
	stop := current.Next()
	for ; stop != nil; stop = stop.Next() {
		info := stop.Value.(*dict.WordInfo)
		if info.WordType != dict.TSymbol && info.WordType != dict.TEnglish {
			break
		}
		end = info.Position + utils.RuneLen(info.Word)
	}
	if end < 0 {
		// Nothing mergeable follows current.
		return false, current
	}

	start := current.Value.(*dict.WordInfo).Position
	candidate := orginalText[start:end]
	attr := s.wordDictionary.GetWordAttr(candidate)
	if attr == nil {
		return false, current
	}

	// Unlink the whole run. The successor must be fetched before removal,
	// because a removed element no longer links to its neighbours.
	for current != stop {
		doomed := current
		current = current.Next()
		wordInfoList.Remove(doomed)
	}

	merged := dict.NewWordInfoDefault()
	merged.Word = string(candidate)
	merged.Position = start
	merged.WordType = dict.TEnglish
	merged.Pos = attr.Pos
	merged.Frequency = attr.Frequency
	merged.Rank = s.params.EnglishRank

	// current now equals stop: insert in front of it, or append when the run
	// reached the end of the list.
	if current != nil {
		wordInfoList.InsertBefore(merged, current)
	} else {
		wordInfoList.PushBack(merged)
	}
	return true, current
}
コード例 #3
0
ファイル: chsfulltext_match.go プロジェクト: cluo/gosegment
// Match converts the dictionary hits in posLenArr into the list of
// *dict.WordInfo tokens for originalText.
//
// Missing options/params on m are replaced by their defaults first.
//
// With no hits at all the text is returned either as one TNone token (when
// UnknownWordIdentify is set) or as one TNone token per rune, all with rank 1.
//
// Otherwise up to TopRecord leaf nodes from getLeafNodeArray are unrolled into
// position/length combinations (plus a one-rune-per-position combination when
// ForceSingleWord is set), the combinations are merged via
// mergeAllCombinations, and each merged entry becomes a TSimplifiedChinese
// token ranked by its Level. Finally the tokens reported by getUnknownWords
// are spliced into the result by position.
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List {
	if m.options == nil {
		m.options = NewMatchOptions()
	}
	if m.params == nil {
		m.params = NewMatchParameter()
	}

	runes := utils.ToRunes(originalText)
	// masks[i] records how rune i was covered: 0 = not covered, 1 = by a
	// one-rune word, 2 = by a longer word. getUnknownWords may later rewrite
	// a 1 to 11 to flag a one-rune word for removal.
	masks := make([]int, len(runes))
	redundancy := m.params.Redundancy
	result := list.New()

	if len(posLenArr) == 0 {
		if m.options.UnknownWordIdentify {
			whole := dict.NewWordInfoDefault()
			whole.Word = originalText
			whole.Position = 0
			whole.WordType = dict.TNone
			whole.Rank = 1
			result.PushFront(whole)
			return result
		}
		for pos, r := range runes {
			single := dict.NewWordInfoDefault()
			single.Word = string(r)
			single.Position = pos
			single.WordType = dict.TNone
			single.Rank = 1
			result.PushBack(single)
		}
		return result
	}

	// Take the first TopRecord word sequences; a nil entry ends the list.
	leaves := m.getLeafNodeArray(posLenArr, originalText)
	for idx, leaf := range leaves {
		if leaf == nil || idx >= TopRecord {
			break
		}
		// Follow the parent links from the leaf, filling the combination
		// from its last slot back to its first.
		comb := make([]dict.PositionLength, leaf.AboveCount)
		for i, node := leaf.AboveCount-1, leaf; i >= 0; i, node = i-1, node.Parent {
			comb[i] = node.PosLen
		}
		m.allCombinations = append(m.allCombinations, comb)
	}

	// Force single-character (unigram) segmentation.
	if m.options.ForceSingleWord {
		singles := make([]dict.PositionLength, len(runes))
		for i := range singles {
			pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0))
			pl.Level = 3
			singles[i] = pl
		}
		m.allCombinations = append(m.allCombinations, singles)
	}

	if len(m.allCombinations) > 0 {
		merged := m.mergeAllCombinations(redundancy)
		for e := merged.Front(); e != nil; e = e.Next() {
			pl := e.Value.(dict.PositionLength)

			wi := dict.NewWordInfoDefault()
			wi.Word = string(runes[pl.Position:(pl.Position + pl.Length)])
			wi.Position = pl.Position
			wi.WordType = dict.TSimplifiedChinese
			wi.Pos = pl.WordAttri.Pos
			wi.Frequency = pl.WordAttri.Frequency
			switch pl.Level {
			case 1:
				wi.Rank = m.params.SecRank
			case 2:
				wi.Rank = m.params.ThirdRank
			case 3:
				wi.Rank = m.params.SingleRank
			default:
				// Level 0 and any unrecognised level get the best rank.
				wi.Rank = m.params.BestRank
			}
			result.PushBack(wi)

			if pl.Length <= 1 {
				masks[pl.Position] = 1
			} else {
				for k := pl.Position; k < pl.Position+pl.Length; k++ {
					masks[k] = 2
				}
			}
		}
	}

	// Collect the unknown (out-of-vocabulary) words.
	unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes)
	if len(unknownWords) == 0 {
		return result
	}

	// Drop the one-rune tokens that getUnknownWords flagged (mask value 11).
	if needRemoveSingleWord && !m.options.ForceSingleWord {
		for e := result.Front(); e != nil; {
			following := e.Next() // fetch before a possible removal
			wi := e.Value.(*dict.WordInfo)
			if utils.RuneLen(wi.Word) == 1 && masks[wi.Position] == 11 {
				result.Remove(e)
			}
			e = following
		}
	}

	// Splice the unknown words into the result at their positions: each one
	// goes in front of the first token whose position is not smaller.
	next := 0
	for e := result.Front(); e != nil; {
		pos := e.Value.(*dict.WordInfo).Position
		if pos >= unknownWords[next].Position {
			result.InsertBefore(unknownWords[next], e)
			next++
			if next >= len(unknownWords) {
				break
			}
		}
		// Advance only once the token sits strictly before the pending
		// unknown word; otherwise retry the same token with the new one.
		if pos < unknownWords[next].Position {
			e = e.Next()
		}
	}

	// Unknown words positioned after every existing token go to the back.
	for ; next < len(unknownWords); next++ {
		result.PushBack(unknownWords[next])
	}

	return result
}
コード例 #4
0
ファイル: chsfulltext_match.go プロジェクト: cluo/gosegment
// getUnknownWords scans masks/orginalText for unknown words and returns them
// as TNone tokens ranked with m.params.UnknowRank, in ascending position.
//
// A position is a candidate when isKnownSingleWord reports true for it.
//
// With UnknownWordIdentify off, every candidate position yields its own
// one-rune token.
//
// With UnknownWordIdentify on, a run starts at a candidate position and is
// closed at the first later position that is not a candidate (or at the end
// of the input):
//   - a run of at most two runes yields one one-rune token for each position
//     whose mask is not 1;
//   - a longer run yields a single token spanning the whole run, and every
//     position in it whose mask is 1 is rewritten to 11 in masks, in which
//     case needRemoveSingleWord is reported as true.
//
// The returned slice is never nil.
func (m *ChsFullTextMatch) getUnknownWords(masks []int, orginalText []rune) (unknownWords []*dict.WordInfo, needRemoveSingleWord bool) {
	unknownWords = []*dict.WordInfo{}

	// emit appends one unknown-word token for word at position.
	emit := func(word string, position int) {
		wi := dict.NewWordInfoDefault()
		wi.Word = word
		wi.Position = position
		wi.WordType = dict.TNone
		wi.Rank = m.params.UnknowRank
		unknownWords = append(unknownWords, wi)
	}

	// closeRun finishes the run [start, end). A run always holds at least one
	// rune, because it is only closed at a position after the one it began at.
	closeRun := func(start, end int) {
		if end-start <= 2 {
			// Short run: keep it as single runes, skipping positions that
			// already carry a one-rune word.
			for k := start; k < end; k++ {
				if masks[k] != 1 {
					emit(string(orginalText[k:k+1]), k)
				}
			}
			return
		}
		// Long run: flag the one-rune words inside it and emit the run as a
		// single merged token.
		for k := start; k < end; k++ {
			if masks[k] == 1 {
				masks[k] = 11
				needRemoveSingleWord = true
			}
		}
		emit(string(orginalText[start:end]), start)
	}

	// Find all unknown words.
	inRun := false
	runStart := 0
	for j := 0; j < len(masks); j++ {
		candidate := m.isKnownSingleWord(masks, j, orginalText)
		switch {
		case !m.options.UnknownWordIdentify:
			if candidate {
				emit(string(orginalText[j]), j)
			}
		case !inRun:
			if candidate {
				inRun = true
				runStart = j
			}
		case !candidate:
			closeRun(runStart, j)
			inRun = false
		}
	}

	// A run still open at the end of the input is closed there.
	if inRun && m.options.UnknownWordIdentify {
		closeRun(runStart, len(masks))
	}
	return unknownWords, needRemoveSingleWord
}