Esempio n. 1
0
/*
KullbackLeiblerDivergence computes and returns the divergence of two strings
based on their character probabilities.

Character counts for `a` and `b` are taken from
tekstus.CountAlnumDistribution. For every character counted in `a`:

  - probA is its count divided by the total count of `a`;
  - probB is its relative frequency in `b` scaled by gamma, or the back-off
    value epsilon when the character is not found in `b`;
  - the term (probA - probB) * ln(probA / probB) is added to the result.

epsilon is 0.001 times the smallest count found in either string, and gamma
is 1 - len(charsDiff)*epsilon, where charsDiff is the result of
tekstus.RunesDiff on the two character sets.

It returns 0 when either string yields a total count of zero, or when the
relative frequencies of either string do not sum to 1 within a small
tolerance.
*/
func KullbackLeiblerDivergence(a, b string) (divergence float64) {
	// Largest accepted deviation of a probability sum from 1.
	const sumTolerance = 0.000009

	aCharsd, aValuesd := tekstus.CountAlnumDistribution(a)
	bCharsd, bValuesd := tekstus.CountAlnumDistribution(b)

	sumValuesA := numerus.IntsSum(aValuesd)
	sumValuesB := numerus.IntsSum(bValuesd)

	// Nothing was counted on one side: there is no distribution to compare,
	// and dividing by a zero total below would only produce NaN.
	if sumValuesA == 0 || sumValuesB == 0 {
		return 0
	}

	charsDiff := tekstus.RunesDiff(aCharsd, bCharsd)

	aMin, _, _ := numerus.IntsFindMin(aValuesd)
	bMin, _, _ := numerus.IntsFindMin(bValuesd)

	// Named `lowest` rather than `min` so the builtin is not shadowed.
	lowest := aMin
	if bMin < aMin {
		lowest = bMin
	}

	epsilon := float64(lowest) * 0.001
	gamma := 1.0 - (float64(len(charsDiff)) * epsilon)

	// Check that the probabilities of a sum up to 1. The deviation is
	// measured on both sides of 1, so an overshoot is rejected as well.
	var sum float64

	for _, v := range aValuesd {
		sum += float64(v) / float64(sumValuesA)
	}

	if math.Abs(1-sum) > sumTolerance {
		return 0
	}

	// Same check for b.
	sum = 0
	for _, v := range bValuesd {
		sum += float64(v) / float64(sumValuesB)
	}

	if math.Abs(1-sum) > sumTolerance {
		return 0
	}

	for x, v := range aCharsd {
		probA := float64(aValuesd[x]) / float64(sumValuesA)

		// Back-off probability for a character that b does not contain.
		probB := epsilon

		contain, atIdx := tekstus.RunesContain(bCharsd, v)

		if contain {
			probB = gamma * (float64(bValuesd[atIdx]) /
				float64(sumValuesB))
		}

		divergence += (probA - probB) * math.Log(probA/probB)
	}

	return divergence
}
Esempio n. 2
0
// TestCountCharDistribution checks the characters and per-character counts
// that tekstus.CountAlnumDistribution reports for a single line of text.
//
// The expectations list every letter and digit of `line` once, in order of
// first appearance and case-sensitively, together with how many times it
// occurs. They are computed for the literal exactly as written below: the
// address between the angle brackets is masked, so it contributes no
// alphanumeric characters.
func TestCountCharDistribution(t *testing.T) {
	line := "// Copyright 2016 Mhd Sulhan <*****@*****.**>. All rights reserved."
	expchars := []rune{'C', 'o', 'p', 'y', 'r', 'i', 'g', 'h', 't',
		'2', '0', '1', '6',
		'M', 'd', 'S', 'u', 'l', 'a', 'n',
		'A', 's', 'e', 'v',
	}
	expvalues := []int{1, 1, 1, 1, 4, 2, 2, 4, 2,
		1, 1, 1, 1,
		1, 2, 1, 1, 3, 1, 1,
		1, 2, 3, 1,
	}

	gotchars, gotvalues := tekstus.CountAlnumDistribution(line)

	// NOTE(review): `assert` is a helper defined elsewhere in this package;
	// presumably the final `true` means "expected and got must be equal".
	assert(t, expchars, gotchars, true)
	assert(t, expvalues, gotvalues, true)
}