Beispiel #1
0
// NewChsName allocates a ChsName with empty name dictionaries and fills the
// family-name dictionary from FAMILY_NAMES.
//
// familyNameDict is keyed by the first rune of a family name. The stored value
// describes the names starting with that rune:
//   - nil: only the single-rune family name has been registered;
//   - a slice: the second runes of the registered multi-rune family names,
//     preceded by a 0 entry when the single-rune name was registered before
//     the first multi-rune name with the same first rune.
//
// Only the first two runes of a multi-rune entry are used.
func NewChsName() *ChsName {
	c := &ChsName{
		familyNameDict:  make(map[rune]([]rune)),
		singleNameDict:  make(map[rune]rune),
		doubleName1Dict: make(map[rune]rune),
		doubleName2Dict: make(map[rune]rune),
	}

	for _, familyName := range FAMILY_NAMES {
		runes := utils.ToRunes(familyName)
		head := runes[0]
		seconds, seen := c.familyNameDict[head]

		switch {
		case len(runes) == 1:
			// Never overwrite an existing entry with the bare marker.
			if !seen {
				c.familyNameDict[head] = nil
			}
		case !seen:
			c.familyNameDict[head] = []rune{runes[1]}
		case seconds == nil:
			// The single-rune name is already known: keep that fact as a
			// leading 0 before the first second-rune entry.
			c.familyNameDict[head] = []rune{0, runes[1]}
		default:
			c.familyNameDict[head] = append(seconds, runes[1])
		}
	}
	return c
}
Beispiel #2
0
// getInitSegment feeds text rune by rune into a framework lexical DFA and
// returns, in order, every token the DFA emits as a new list.
func (s *Segment) getInitSegment(text string) *list.List {
	tokens := list.New()
	runes := utils.ToRunes(text)
	lexer := framework.NewLexical(runes)

	for pos := 0; pos < len(runes); pos++ {
		switch lexer.Input(runes[pos], pos) {
		case framework.Continue:
			// Nothing emitted yet; move on to the next rune.
		case framework.Quit:
			tokens.PushBack(lexer.OutputToken)
		case framework.ElseQuit:
			tokens.PushBack(lexer.OutputToken)
			// Step back so the same rune is fed to the DFA again on the next
			// iteration, unless the previous state was 255.
			// NOTE(review): the meaning of state 255 is defined in framework - confirm.
			if lexer.OldState != 255 {
				pos--
			}
		}
	}

	// Feed a terminating 0 rune at the end position so that a token still
	// being accumulated gets emitted.
	switch lexer.Input(0, len(runes)) {
	case framework.Continue:
	case framework.Quit, framework.ElseQuit:
		tokens.PushBack(lexer.OutputToken)
	}
	return tokens
}
Beispiel #3
0
// Load resets all lookup tables of the dictionary and rebuilds them from the
// word list read out of fileName by loadFromTextFile.
//
// Words are indexed by their lower-cased form:
//   - one-rune words go to firstCharDict, keyed by that rune;
//   - two-rune words go to doubleCharDict, keyed by rune0*65536 + rune1;
//   - longer words go to wordDict (keyed by the lower-cased string), and
//     tripleCharDict records, per packed key of the first three runes, the
//     distinct word lengths (in runes, stored as bytes) seen for that prefix.
//     The length list is zero-terminated while it has free slots and doubles
//     in size when it is full.
//
// It returns the error reported by loadFromTextFile, or nil.
func (d *WordDictionary) Load(fileName string) (err error) {
	d.wordDict = make(map[string](*WordAttr))
	d.firstCharDict = make(map[rune](*WordAttr))
	d.doubleCharDict = make(map[int32](*WordAttr))
	d.tripleCharDict = make(map[int64](*[]byte))

	waList, err := d.loadFromTextFile(fileName)
	if err != nil {
		return err
	}

	for elem := waList.Front(); elem != nil; elem = elem.Next() {
		attr := elem.Value.(*WordAttr)
		key := strings.ToLower(attr.Word)
		runes := utils.ToRunes(key)

		switch len(runes) {
		case 1:
			d.firstCharDict[runes[0]] = attr
			continue
		case 2:
			d.doubleCharDict[runes[0]*65536+runes[1]] = attr
			continue
		}

		d.wordDict[key] = attr
		wordLen := byte(len(runes))
		tripleChar := int64(int32(runes[0]))*int64(0x100000000) + int64(int32(runes[1]))*int64(65536) + int64(int32(runes[2]))

		lengths, known := d.tripleCharDict[tripleChar]
		if !known {
			// First word with this three-rune prefix: start a 4-slot list.
			fresh := make([]byte, 4)
			fresh[0] = wordLen
			d.tripleCharDict[tripleChar] = &fresh
			continue
		}

		// Stop at the slot already holding this length, or at the first free
		// (zero) slot, whichever comes first.
		slot := 0
		for slot < len(*lengths) && (*lengths)[slot] != wordLen && (*lengths)[slot] != byte(0) {
			slot++
		}
		if slot < len(*lengths) {
			if (*lengths)[slot] != wordLen {
				(*lengths)[slot] = wordLen
			}
			continue
		}

		// The list is full and does not contain this length: double it and
		// put the new length into the first added slot.
		grown := make([]byte, len(*lengths)*2)
		copy(grown, *lengths)
		grown[slot] = wordLen
		d.tripleCharDict[tripleChar] = &grown
	}
	return nil
}
Beispiel #4
0
// loadNameDict reads filePath line by line through utils.EachLine and, for
// every non-empty line, stores the line's first rune in dict, mapped to
// itself. It returns whatever utils.EachLine returns.
func (c *ChsName) loadNameDict(filePath string, dict map[rune]rune) (err error) {
	return utils.EachLine(filePath, func(line string) {
		if len(line) == 0 {
			return
		}
		first := utils.ToRunes(line)[0]
		dict[first] = first
	})
}
Beispiel #5
0
// setto writes the runes of str into the buffer s.b starting at index s.j+1
// and then moves s.k to s.j plus the rune length of str.
func (s *Stemmer) setto(str string) {
	n := utils.RuneLen(str)
	start := s.j + 1
	replacement := utils.ToRunes(str)
	for idx := 0; idx < n; idx++ {
		s.b[start+idx] = replacement[idx]
	}
	s.k = s.j + n
}
Beispiel #6
0
// ends reports whether the buffer content s.b[..s.k] finishes with str.
// On a match it also sets s.j to the index just before the matched suffix
// (s.k minus the rune length of str); on a mismatch s.j is left untouched.
func (s *Stemmer) ends(str string) bool {
	n := utils.RuneLen(str)
	start := s.k - n + 1
	if start < 0 {
		// The suffix is longer than the buffered word.
		return false
	}

	suffix := utils.ToRunes(str)
	for idx := 0; idx < n; idx++ {
		if s.b[start+idx] != suffix[idx] {
			return false
		}
	}

	s.j = s.k - n
	return true
}
Beispiel #7
0
// convertChineseCapicalToAsiic returns text with every full-width digit
// (U+FF10..U+FF19), full-width upper-case Latin letter (U+FF21..U+FF3A) and
// full-width lower-case Latin letter (U+FF41..U+FF5A) replaced by its ASCII
// counterpart. All other runes, including plain ASCII, are left unchanged.
//
// The previous implementation tested the ASCII ranges and then subtracted and
// re-added the same ASCII base, which made the function a no-op: full-width
// input was never normalised.
func (s *Segment) convertChineseCapicalToAsiic(text string) string {
	// The full-width forms mirror printable ASCII at a constant distance, so
	// one offset covers digits and both letter cases.
	const fullWidthOffset = '\uFF10' - '0' // 0xFEE0

	runes := utils.ToRunes(text)
	for i, r := range runes {
		switch {
		case r >= '\uFF10' && r <= '\uFF19', // full-width digits
			r >= '\uFF21' && r <= '\uFF3A', // full-width A-Z
			r >= '\uFF41' && r <= '\uFF5A': // full-width a-z
			runes[i] = r - fullWidthOffset
		}
	}
	return string(runes)
}
Beispiel #8
0
// GetAllMatchs scans text rune by rune and returns every candidate word that
// starts at each position, in scan order.
//
// For each start position it appends, in this order:
//  1. when chineseNameIdentify is set, one entry per name returned by
//     d.ChineseName.Match, tagged POS_A_NR with frequency 0;
//  2. the one-rune dictionary word, if any;
//  3. the two-rune dictionary word, if any;
//  4. every dictionary word of three or more runes whose length is listed for
//     the three-rune prefix, skipping words already reported as a name at
//     this position.
//
// Dictionary lookups use a lower-cased copy of text when its first rune is
// ASCII (< 128); otherwise the text is used as is. The result is never nil;
// it is empty for empty text.
func (d *WordDictionary) GetAllMatchs(text string, chineseNameIdentify bool) (result []PositionLength) {
	result = []PositionLength{}
	if len(text) == 0 {
		return
	}

	original := utils.ToRunes(text)
	lookup := original
	if original[0] < 128 {
		lookup = utils.ToRunes(strings.ToLower(text))
	}

	for pos := range original {
		first := lookup[pos]

		var names []string
		if chineseNameIdentify {
			names = d.ChineseName.Match(original, pos)
			for _, name := range names {
				attr := NewWordAttr(name, POS_A_NR, 0)
				result = append(result, PositionLength{0, pos, utils.RuneLen(name), attr})
			}
		}

		if single, ok := d.firstCharDict[first]; ok {
			result = append(result, PositionLength{0, pos, 1, single})
		}

		remaining := len(lookup) - pos
		if remaining >= 2 {
			if double, ok := d.doubleCharDict[lookup[pos]*65536+lookup[pos+1]]; ok {
				result = append(result, PositionLength{0, pos, 2, double})
			}
		}

		// Fewer than three runes left: no longer word can start here.
		if remaining < 3 {
			continue
		}

		tripleChar := int64(int32(lookup[pos]))*0x100000000 + int64(int32(lookup[pos+1]))*65536 + int64(int32(lookup[pos+2]))
		lengths, ok := d.tripleCharDict[tripleChar]
		if !ok {
			continue
		}

		for _, wordLen := range *lengths {
			// A zero marks the end of the used part of the length list.
			if wordLen == 0 {
				break
			}
			end := pos + int(wordLen)
			if end > len(lookup) {
				continue
			}
			attr, hit := d.wordDict[string(lookup[pos:end])]
			if !hit {
				continue
			}

			// Do not report a word twice when it was already added as a name.
			alreadyName := false
			for _, name := range names {
				if attr.Word == name {
					alreadyName = true
					break
				}
			}
			if alreadyName {
				continue
			}
			result = append(result, PositionLength{0, pos, int(wordLen), attr})
		}
	}

	return
}
Beispiel #9
0
// preSegment builds the initial token list for text with getInitSegment and
// then refines it in place, token by token, according to the token's WordType:
//   - space tokens are dropped when options.IgnoreSpace is set;
//   - simplified-Chinese tokens are replaced by the words produced by the
//     dictionary-based ChsFullTextMatch;
//   - English tokens get their rank, are normalised, and may get additional
//     lower-case / stem / split variants inserted before them;
//   - numeric tokens are normalised and ranked;
//   - symbol tokens are ranked.
//
// The list is modified while it is traversed, so every branch is responsible
// for advancing cur itself. The refined list is returned.
func (s *Segment) preSegment(text string) *list.List {
	result := s.getInitSegment(text)
	runes := utils.ToRunes(text)
	cur := result.Front()
	for cur != nil {
		if s.options.IgnoreSpace {
			if cur.Value.(*dict.WordInfo).WordType == dict.TSpace {
				// Advance before removing so the traversal can continue.
				lst := cur
				cur = cur.Next()
				result.Remove(lst)
				continue
			}
		}
		switch cur.Value.(*dict.WordInfo).WordType {
		case dict.TSimplifiedChinese:
			// Re-segment the Chinese run with the dictionary and splice the
			// resulting words into the list in place of the original token.
			inputText := cur.Value.(*dict.WordInfo).Word
			originalWordType := dict.TSimplifiedChinese
			pls := s.wordDictionary.GetAllMatchs(inputText, s.options.ChineseNameIdentify)
			chsMatch := match.NewChsFullTextMatch(s.wordDictionary)
			chsMatch.SetOptionParams(s.options, s.params)
			chsMatchWords := chsMatch.Match(pls, inputText)
			curChsMatch := chsMatchWords.Front()
			for curChsMatch != nil {
				wi := curChsMatch.Value.(*dict.WordInfo)
				// Shift each matched word by the position of the token it
				// came from.
				wi.Position += cur.Value.(*dict.WordInfo).Position
				wi.OriginalWordType = originalWordType
				wi.WordType = originalWordType
				curChsMatch = curChsMatch.Next()
			}
			// NOTE(review): presumably InsertAfterList returns the last element
			// it inserted, so rcur.Next() is the first not-yet-processed token
			// - confirm against utils.
			rcur := utils.InsertAfterList(result, chsMatchWords, cur)
			removeItem := cur
			cur = rcur.Next()
			result.Remove(removeItem)
		case dict.TEnglish:
			cur.Value.(*dict.WordInfo).Rank = s.params.EnglishRank
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			if s.options.IgnoreCapital {
				cur.Value.(*dict.WordInfo).Word = strings.ToLower(cur.Value.(*dict.WordInfo).Word)
			}

			if s.options.EnglishSegment {
				// Insert the lower-cased form and the stem as extra tokens
				// before the current one, at the same position, whenever they
				// differ from what is already present.
				lower := strings.ToLower(cur.Value.(*dict.WordInfo).Word)
				if lower != cur.Value.(*dict.WordInfo).Word {
					result.InsertBefore(dict.NewWordInfo(lower, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishLowerRank, dict.TEnglish, dict.TEnglish), cur)
				}
				stem := s.getStem(lower)
				if len(stem) > 0 {
					if lower != stem {
						result.InsertBefore(dict.NewWordInfo(stem, cur.Value.(*dict.WordInfo).Position, dict.POS_A_NX, 1, s.params.EnglishStemRank, dict.TEnglish, dict.TEnglish), cur)
					}
				}
			}

			if s.options.EnglishMultiDimensionality {
				// Only words containing a digit or an underscore are candidates
				// for splitting.
				needSplit := false
				for _, c := range cur.Value.(*dict.WordInfo).Word {
					if (c >= '0' && c <= '9') || (c == '_') {
						needSplit = true
						break
					}
				}
				if needSplit {
					// NOTE(review): the pattern behind s.re is defined elsewhere;
					// it is expected to yield the individual parts of the word.
					output := s.re.FindAllString(cur.Value.(*dict.WordInfo).Word, -1)
					if len(output) > 1 {
						position := cur.Value.(*dict.WordInfo).Position
						for _, splitWord := range output {
							if len(splitWord) == 0 {
								continue
							}

							// A part starting with a digit becomes a numeric
							// token, anything else an English token.
							var wi *dict.WordInfo
							r := utils.FirstRune(splitWord)
							if r >= '0' && r <= '9' {
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_M, 1)
								wi.Position = position
								wi.Rank = s.params.NumericRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TNumeric
							} else {
								wi = dict.NewWordInfoSome(splitWord, dict.POS_A_NX, 1)
								wi.Position = position
								wi.Rank = s.params.EnglishRank
								wi.OriginalWordType = dict.TEnglish
								wi.WordType = dict.TEnglish
							}

							result.InsertBefore(wi, cur)
							// Positions are counted in runes, not bytes.
							position += utils.RuneLen(splitWord)
						}
					}
				}
			}

			// mergeEnglishSpecialWord hands back the element to continue from;
			// when it reports !ok the cursor is advanced here instead.
			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}

		case dict.TNumeric:
			cur.Value.(*dict.WordInfo).Word = s.convertChineseCapicalToAsiic(cur.Value.(*dict.WordInfo).Word)
			cur.Value.(*dict.WordInfo).Rank = s.params.NumericRank
			// Same cursor protocol as in the English branch.
			var ok bool
			if ok, cur = s.mergeEnglishSpecialWord(runes, result, cur); !ok {
				cur = cur.Next()
			}
		case dict.TSymbol:
			cur.Value.(*dict.WordInfo).Rank = s.params.SymbolRank
			cur = cur.Next()
		default:
			// All other token types are left untouched.
			cur = cur.Next()
		}
	}
	return result
}
Beispiel #10
0
// Match converts the dictionary hits in posLenArr into the list of
// *dict.WordInfo segments for originalText.
//
// Missing options / params are replaced by NewMatchOptions and
// NewMatchParameter. When posLenArr is empty the text is returned either as
// one TNone word (options.UnknownWordIdentify) or as one TNone word per rune.
// Otherwise the function:
//  1. collects up to TopRecord word sequences from the leaf nodes returned by
//     getLeafNodeArray (plus a per-rune sequence when options.ForceSingleWord
//     is set) into m.allCombinations;
//  2. merges them with mergeAllCombinations and emits one word per resulting
//     position, ranked by the position's Level;
//  3. asks getUnknownWords for the words not covered by the dictionary and
//     inserts them into the result by position.
//
// Positions in the returned words are rune offsets into originalText.
func (m *ChsFullTextMatch) Match(posLenArr []dict.PositionLength, originalText string) *list.List {
	if m.options == nil {
		m.options = NewMatchOptions()
	}
	if m.params == nil {
		m.params = NewMatchParameter()
	}
	runes := utils.ToRunes(originalText)
	masks := make([]int, len(runes))
	redundancy := m.params.Redundancy
	result := list.New()

	// No dictionary hit at all.
	if len(posLenArr) == 0 {
		if m.options.UnknownWordIdentify {
			whole := dict.NewWordInfoDefault()
			whole.Word = originalText
			whole.Position = 0
			whole.WordType = dict.TNone
			whole.Rank = 1
			result.PushFront(whole)
			return result
		}
		for pos, r := range runes {
			single := dict.NewWordInfoDefault()
			single.Word = string(r)
			single.Position = pos
			single.WordType = dict.TNone
			single.Rank = 1
			result.PushBack(single)
		}
		return result
	}

	leaves := m.getLeafNodeArray(posLenArr, originalText)

	// Take the first TopRecord word sequences; a nil leaf ends the used part
	// of the array.
	for idx, leaf := range leaves {
		if leaf == nil || idx >= TopRecord {
			break
		}
		// Walk from the leaf up through its parents, filling the sequence
		// from the back.
		comb := make([]dict.PositionLength, leaf.AboveCount)
		node := leaf
		for k := leaf.AboveCount - 1; k >= 0; k-- {
			comb[k] = node.PosLen
			node = node.Parent
		}
		m.allCombinations = append(m.allCombinations, comb)
	}

	// Force single-word (one rune per word) segmentation as an additional
	// combination.
	if m.options.ForceSingleWord {
		singles := make([]dict.PositionLength, len(runes))
		for i := range singles {
			pl := dict.NewPositionLength(i, 1, dict.NewWordAttr(string(runes[i]), dict.POS_UNK, 0.0))
			pl.Level = 3
			singles[i] = pl
		}
		m.allCombinations = append(m.allCombinations, singles)
	}

	if len(m.allCombinations) > 0 {
		merged := m.mergeAllCombinations(redundancy)
		for e := merged.Front(); e != nil; e = e.Next() {
			pl := e.Value.(dict.PositionLength)
			end := pl.Position + pl.Length

			wi := dict.NewWordInfoDefault()
			wi.Word = string(runes[pl.Position:end])
			wi.Pos = pl.WordAttri.Pos
			wi.Frequency = pl.WordAttri.Frequency
			wi.WordType = dict.TSimplifiedChinese
			wi.Position = pl.Position
			switch pl.Level {
			case 1:
				wi.Rank = m.params.SecRank
			case 2:
				wi.Rank = m.params.ThirdRank
			case 3:
				wi.Rank = m.params.SingleRank
			default:
				// Level 0 and any unexpected level use the best rank.
				wi.Rank = m.params.BestRank
			}
			result.PushBack(wi)

			// Record coverage: 2 for runes inside a multi-rune word, 1 for a
			// single-rune word.
			if pl.Length > 1 {
				for k := pl.Position; k < end; k++ {
					masks[k] = 2
				}
			} else {
				masks[pl.Position] = 1
			}
		}
	}

	// Merge unknown (out-of-vocabulary) words.
	unknownWords, needRemoveSingleWord := m.getUnknownWords(masks, runes)
	if len(unknownWords) == 0 {
		return result
	}

	// Remove the single-rune words whose mask is 11.
	// NOTE(review): the value 11 is presumably assigned by getUnknownWords - confirm.
	if needRemoveSingleWord && !m.options.ForceSingleWord {
		for cur := result.Front(); cur != nil; {
			following := cur.Next()
			wi := cur.Value.(*dict.WordInfo)
			if utils.RuneLen(wi.Word) == 1 && masks[wi.Position] == 11 {
				result.Remove(cur)
			}
			cur = following
		}
	}

	// Merge the unknown words into the result at their positions: each one is
	// inserted before the first existing word at or after its position.
	next := 0
	cur := result.Front()
	for cur != nil {
		curPos := cur.Value.(*dict.WordInfo).Position
		if curPos >= unknownWords[next].Position {
			result.InsertBefore(unknownWords[next], cur)
			next++
			if next >= len(unknownWords) {
				break
			}
		}

		if curPos < unknownWords[next].Position {
			cur = cur.Next()
		}
	}

	// Whatever is left lies behind the last existing word.
	for ; next < len(unknownWords); next++ {
		result.PushBack(unknownWords[next])
	}

	return result
}