Пример #1
0
// calculateScore walks text over the half-open range [sidx, eidx) and scores
// it against pattern with the same per-character criteria as the V2
// algorithm: every matched rune earns scoreMatch plus a positional bonus,
// every unmatched rune pays a gap penalty.
//
// When caseSensitive is false, input runes are lower-cased before comparison.
// When normalize is true, input runes are additionally passed through
// normalizeRune (the pattern is expected to be normalized already).
//
// The caller must pass a range in which pattern is consumed no earlier than
// the last rune of the range; pattern is indexed without a bounds guard.
//
// It returns the accumulated score and the position slice obtained from
// posArray(withPos, len(pattern)), to which matched indices are appended in
// ascending order when withPos is true.
func calculateScore(caseSensitive bool, normalize bool, text util.Chars, pattern []rune, sidx int, eidx int, withPos bool) (int, *[]int) {
	var (
		next      int   // index of the pattern rune we are looking for
		total     int   // running score
		gapOpen   bool  // true while inside a run of unmatched runes
		streak    int   // length of the current run of consecutive matches
		headBonus int16 // bonus remembered for the current consecutive run
	)
	positions := posArray(withPos, len(pattern))

	// Class of the rune just before the range; used for the first bonus.
	before := charNonWord
	if sidx > 0 {
		before = charClassOf(text.Get(sidx - 1))
	}

	for at := sidx; at < eidx; at++ {
		r := text.Get(at)
		// The class is taken from the raw rune, before any case folding.
		cls := charClassOf(r)

		if !caseSensitive {
			switch {
			case r >= 'A' && r <= 'Z':
				r += 32
			case r > unicode.MaxASCII:
				r = unicode.To(unicode.LowerCase, r)
			}
		}
		// pattern is already normalized
		if normalize {
			r = normalizeRune(r)
		}

		if r != pattern[next] {
			// Unmatched rune: open a gap or extend the current one, and
			// forget any consecutive-run state.
			if gapOpen {
				total += scoreGapExtention
			} else {
				total += scoreGapStart
			}
			gapOpen = true
			streak = 0
			headBonus = 0
			before = cls
			continue
		}

		if withPos {
			*positions = append(*positions, at)
		}
		total += scoreMatch

		b := bonusFor(before, cls)
		if streak == 0 {
			headBonus = b
		} else {
			// A boundary inside a run restarts the remembered bonus.
			if b == bonusBoundary {
				headBonus = b
			}
			b = util.Max16(util.Max16(b, headBonus), bonusConsecutive)
		}

		// The first pattern rune gets its bonus multiplied.
		if next == 0 {
			total += int(b * bonusFirstCharMultiplier)
		} else {
			total += int(b)
		}

		gapOpen = false
		streak++
		next++
		before = cls
	}
	return total, positions
}
Пример #2
0
// FuzzyMatchV2 matches pattern against input as an in-order (not necessarily
// contiguous) subsequence and scores the best alignment by filling a score
// matrix over the region between the first occurrence of pattern[0] and the
// last occurrence of pattern[M-1].
//
// Parameters:
//   - caseSensitive: when false, upper-case input runes are lower-cased
//     before comparison; the pattern is assumed to be lowercase already.
//   - normalize: when true, every input rune is passed through normalizeRune.
//   - forward: tie-break between end positions with equal score; true keeps
//     the first one encountered (smallest index), false keeps the last.
//   - input, pattern: text to search and the runes to look for.
//   - withPos: when true, a backtrace collects the index of each matched
//     rune; indices are appended from the last pattern rune to the first.
//   - slab: optional scratch storage. When non-nil and N*M exceeds
//     cap(slab.I16), the call is delegated to FuzzyMatchV1.
//
// Returns a Result built from (start, end, score) and the position slice
// from posArray. An empty pattern yields Result{0, 0, 0}; a single-rune
// pattern is delegated to ExactMatchNaive; no match yields
// Result{-1, -1, 0} with a nil position slice. The start value is the first
// occurrence of pattern[0] unless withPos is set, in which case it is the
// index where the backtrace stopped. The end value is one past the
// best-scoring end position.
func FuzzyMatchV2(caseSensitive bool, normalize bool, forward bool, input util.Chars, pattern []rune, withPos bool, slab *util.Slab) (Result, *[]int) {
	// Assume that pattern is given in lowercase if case-insensitive.
	// First check if there's a match and calculate bonus for each position.
	// If the input string is too long, consider finding the matching chars in
	// this phase as well (non-optimal alignment).
	N := input.Length()
	M := len(pattern)
	switch M {
	case 0:
		return Result{0, 0, 0}, posArray(withPos, M)
	case 1:
		return ExactMatchNaive(caseSensitive, normalize, forward, input, pattern[0:1], withPos, slab)
	}

	// Since O(nm) algorithm can be prohibitively expensive for large input,
	// we fall back to the greedy algorithm.
	if slab != nil && N*M > cap(slab.I16) {
		return FuzzyMatchV1(caseSensitive, normalize, forward, input, pattern, withPos, slab)
	}

	// Reuse pre-allocated integer slice to avoid unnecessary sweeping of garbages
	offset16 := 0
	offset32 := 0
	// Bonus point for each position
	offset16, B := alloc16(offset16, slab, N, false)
	// The first occurrence of each character in the pattern
	offset32, F := alloc32(offset32, slab, M, false)
	// Rune array
	offset32, T := alloc32(offset32, slab, N, false)

	// Phase 1. Check if there's a match and calculate bonus for each point
	pidx, lastIdx, prevClass := 0, 0, charNonWord
	for idx := 0; idx < N; idx++ {
		char := input.Get(idx)
		var class charClass
		if char <= unicode.MaxASCII {
			class = charClassOfAscii(char)
		} else {
			class = charClassOfNonAscii(char)
		}

		// The class is computed from the raw rune; only the copy stored in T
		// is case-folded.
		if !caseSensitive && class == charUpper {
			if char <= unicode.MaxASCII {
				char += 32
			} else {
				char = unicode.To(unicode.LowerCase, char)
			}
		}

		if normalize {
			char = normalizeRune(char)
		}

		T[idx] = char
		B[idx] = bonusFor(prevClass, class)
		prevClass = class

		if pidx < M {
			if char == pattern[pidx] {
				lastIdx = idx
				F[pidx] = int32(idx)
				pidx++
			}
		} else {
			// Whole pattern already seen once: keep extending the right edge
			// of the search region to the last occurrence of the final rune.
			if char == pattern[M-1] {
				lastIdx = idx
			}
		}
	}
	if pidx != M {
		return Result{-1, -1, 0}, nil
	}

	// Phase 2. Fill in score matrix (H)
	// Unlike the original algorithm, we do not allow omission.
	// Matrices are M rows by width columns; column 0 corresponds to input
	// index F[0].
	width := lastIdx - int(F[0]) + 1
	offset16, H := alloc16(offset16, slab, width*M, false)

	// Possible length of consecutive chunk at each position.
	offset16, C := alloc16(offset16, slab, width*M, false)

	maxScore, maxScorePos := int16(0), 0
	for i := 0; i < M; i++ {
		I := i * width
		inGap := false
		// Row i is only computed from the first occurrence of pattern[i].
		for j := int(F[i]); j <= lastIdx; j++ {
			j0 := j - int(F[0])
			// s1: score if T[j] is taken as the match for pattern[i].
			// s2: score if T[j] is skipped (gap after the cell to the left).
			var s1, s2, consecutive int16

			if j > int(F[i]) {
				if inGap {
					s2 = H[I+j0-1] + scoreGapExtention
				} else {
					s2 = H[I+j0-1] + scoreGapStart
				}
			}

			if pattern[i] == T[j] {
				var diag int16
				if i > 0 && j0 > 0 {
					diag = H[I-width+j0-1]
				}
				s1 = diag + scoreMatch
				b := B[j]
				if i > 0 {
					// j > 0 if i > 0
					consecutive = C[I-width+j0-1] + 1
					// Break consecutive chunk
					if b == bonusBoundary {
						consecutive = 1
					} else if consecutive > 1 {
						// Inside a run: use at least the consecutive bonus or
						// the bonus of the run's first rune.
						b = util.Max16(b, util.Max16(bonusConsecutive, B[j-int(consecutive)+1]))
					}
				} else {
					consecutive = 1
					b *= bonusFirstCharMultiplier
				}
				if s1+b < s2 {
					// Skipping wins: record the plain bonus and reset the run.
					s1 += B[j]
					consecutive = 0
				} else {
					s1 += b
				}
			}
			C[I+j0] = consecutive

			inGap = s1 < s2
			score := util.Max16(util.Max16(s1, s2), 0)
			// Only the last row yields complete matches; forward keeps the
			// first maximum, backward keeps the last.
			if i == M-1 && (forward && score > maxScore || !forward && score >= maxScore) {
				maxScore, maxScorePos = score, j
			}
			H[I+j0] = score
		}

		if DEBUG {
			if i == 0 {
				fmt.Print("  ")
				for j := int(F[i]); j <= lastIdx; j++ {
					// Print, not Printf: the rune comes from the input and
					// must not be interpreted as a format string (e.g. '%').
					fmt.Print(" " + string(input.Get(j)) + " ")
				}
				fmt.Println()
			}
			fmt.Print(string(pattern[i]) + " ")
			for idx := int(F[0]); idx < int(F[i]); idx++ {
				fmt.Print(" 0 ")
			}
			for idx := int(F[i]); idx <= lastIdx; idx++ {
				fmt.Printf("%2d ", H[i*width+idx-int(F[0])])
			}
			fmt.Println()

			fmt.Print("  ")
			for idx, p := range C[I : I+width] {
				if idx+int(F[0]) < int(F[i]) {
					p = 0
				}
				fmt.Printf("%2d ", p)
			}
			fmt.Println()
		}
	}

	// Phase 3. (Optional) Backtrace to find character positions
	pos := posArray(withPos, M)
	j := int(F[0])
	if withPos {
		// Walk from the best end cell towards the top-left, recording a
		// position each time the current cell is judged to be a match.
		i := M - 1
		j = maxScorePos
		preferMatch := true
		for {
			I := i * width
			j0 := j - int(F[0])
			s := H[I+j0]

			var s1, s2 int16
			if i > 0 && j >= int(F[i]) {
				s1 = H[I-width+j0-1]
			}
			if j > int(F[i]) {
				s2 = H[I+j0-1]
			}

			if s > s1 && (s > s2 || s == s2 && preferMatch) {
				*pos = append(*pos, j)
				if i == 0 {
					break
				}
				i--
			}
			// I still refers to the row used at the top of this iteration,
			// even if i was just decremented.
			preferMatch = C[I+j0] > 1 || I+width+j0+1 < len(C) && C[I+width+j0+1] > 0
			j--
		}
	}
	// Start offset we return here is only relevant when begin tiebreak is used.
	// However finding the accurate offset requires backtracking, and we don't
	// want to pay extra cost for the option that has lost its importance.
	return Result{j, maxScorePos + 1, int(maxScore)}, pos
}