Пример #1
0
func inner(t *testing.T, cases *[]TestCase, wantIdx int, opt ls_core.Options, sortIt bool) {

	for i, tc := range *cases {

		m := ls_core.New(WrapAsEqualer(tc.src, sortIt), WrapAsEqualer(tc.dst, sortIt), opt)
		got, relDist := m.Distance()
		_ = relDist
		// fmt.Printf("%v %v\n", got, relDist)

		ssrc := fmt.Sprintf("%v", tc.src)
		sdst := fmt.Sprintf("%v", tc.dst)
		if got != tc.distances[wantIdx] {
			t.Logf(
				"%2v: Distance between %20v and %20v should be %v - but got %v (sorted %v)",
				i, stringspb.Ellipsoider(ssrc, 8), stringspb.Ellipsoider(sdst, 8), tc.distances[wantIdx], got, sortIt)
			t.Fail()
		}

		m.Print()
		fmt.Printf("\n")

		es := m.EditScript()

		got2 := m.ApplyEditScript(es)
		if !m.CompareToCol(got2) {
			t.Logf("\nwnt %v \ngot %v ", WrapAsEqualer(tc.dst, sortIt), got2)
			t.Fail()
		}

		fmt.Printf("\n")
		fmt.Printf("\n")

	}

}
Пример #2
0
func similarTextifiedTrees2(src *TextifiedTree, mp map[string][]*TextifiedTree, skipPrefix map[string]bool) {

	// srcE := word.WrapAsEqualer(string(src.Text), true) // ssrc as Equaler
	srcE := wordb.WrapAsEqualer(src.Text, true)
	srcLen := float64(len(src.Text))

	for fnKey, tts := range mp {

		if fnKey == src.SourceID {
			pf("    to %v SKIP self\n", fnKey)
			continue
		}

		pf("    to %v\n", fnKey)

		cntr, br := 0, true
		for _, tt := range tts {
			// outl, text := tt.Outl, tt.Text

			if tt.Lvl > src.Lvl+levelsTolerance {
				break // since we are now sorted by lvl, we can this is safe
			}

			if tt.Lvl == src.Lvl ||
				(tt.Lvl > src.Lvl && tt.Lvl <= src.Lvl+levelsTolerance) {
				// proceed
			} else {
				continue
			}

			if src.NumTokens < 1 {
				continue
			}

			if src.NumTokens < 5 && tt.NumTokens > 7 {
				continue
			}

			if HistoBasedDistance(src, tt) > 0.51 {
				breakMapsTooDistinct++
				continue
			}

			relSize := srcLen / float64(util.Max(1, len(tt.Text)))
			if relSize < 0.33 || relSize > 3 {
				continue
			}

			absDist, relDist := 0, 0.0

			if tt.NumTokens == src.NumTokens &&
				len(tt.Text) == len(src.Text) &&
				bytes.Equal(tt.Text, src.Text) {
				absDist, relDist = 0, 0.0
				appliedCompare++
			} else {
				dstE := wordb.WrapAsEqualer(tt.Text, true) // destinations as Equaler
				m := levenshtein.New(srcE, dstE, opt)
				absDist, relDist = m.Distance()
				appliedLevenshtein++
			}

			//
			if relDist < 0.26 && absDist < 10 {
				if br {
					pf("\t")
				}

				sd := ""
				sd = string(tt.Text[:util.Min(2*excerptLen, len(tt.Text)-1)])
				sd = stringspb.ToLen(sd, 2*excerptLen+1)
				pf("%12v %v %4v %5.2v   ", tt.Outline, sd, absDist, relDist)

				cntr++
				br = false

				sim := Similar{}
				sim.SourceID = fnKey
				sim.Lvl = tt.Lvl
				sim.Outline = tt.Outline
				sim.AbsLevenshtein = absDist
				sim.RelLevenshtein = relDist
				sim.Text = tt.Text
				src.Similars = append(src.Similars, sim)
				src.SumAbsLevenshtein += absDist
				src.SumRelLevenshtein += relDist

				if cntr%2 == 0 || cntr > 20 {
					pf("\n")
					br = true
				}
				if cntr > 20 {
					break
				}
			}

		}
		if !br {
			pf("\n")
		}
	}

}