/* KullbackLeiblerDivergence comput and return the divergence of two string based on their character probabability. */ func KullbackLeiblerDivergence(a, b string) (divergence float64) { aCharsd, aValuesd := tekstus.CountAlnumDistribution(a) bCharsd, bValuesd := tekstus.CountAlnumDistribution(b) sumValuesA := numerus.IntsSum(aValuesd) sumValuesB := numerus.IntsSum(bValuesd) charsDiff := tekstus.RunesDiff(aCharsd, bCharsd) aMin, _, _ := numerus.IntsFindMin(aValuesd) bMin, _, _ := numerus.IntsFindMin(bValuesd) min := aMin if bMin < aMin { min = bMin } epsilon := float64(min) * 0.001 gamma := 1.0 - (float64(len(charsDiff)) * epsilon) // Check if sum of a up to 1. var sum float64 for _, v := range aValuesd { sum += float64(v) / float64(sumValuesA) } sumDiff := 1 - math.Abs(sum) if sumDiff > 0.000009 { return 0 } sum = 0 for _, v := range bValuesd { sum += float64(v) / float64(sumValuesB) } sumDiff = 1 - math.Abs(sum) if sumDiff > 0.000009 { return 0 } for x, v := range aCharsd { probA := float64(aValuesd[x]) / float64(sumValuesA) probB := epsilon contain, atIdx := tekstus.RunesContain(bCharsd, v) if contain { probB = gamma * (float64(bValuesd[atIdx]) / float64(sumValuesB)) } divergence += (probA - probB) * math.Log(probA/probB) } return divergence }
func TestCountCharDistribution(t *testing.T) { line := "// Copyright 2016 Mhd Sulhan <*****@*****.**>. All rights reserved." expchars := []rune{'C', 'o', 'p', 'y', 'r', 'i', 'g', 'h', 't', '2', '0', '1', '6', 'M', 'd', 'S', 'u', 'l', 'a', 'n', 'm', 's', 'k', 'b', 'f', 'A', 'e', 'v', } expvalues := []int{1, 2, 1, 1, 4, 5, 2, 4, 3, 1, 1, 1, 1, 1, 2, 1, 1, 4, 2, 2, 1, 3, 1, 1, 1, 1, 3, 1, } gotchars, gotvalues := tekstus.CountAlnumDistribution(line) assert(t, expchars, gotchars, true) assert(t, expvalues, gotvalues, true) }