func TestWordsCountsOf(t *testing.T) { data := []string{"A", "B", "A", "C"} class := []string{"A", "B"} exp := []int{2, 1} got := tekstus.WordsCountTokens(data, class, false) assert(t, exp, got, true) }
// // computePerfByProbs will compute classifier performance using probabilities // or score `probs`. // // This currently only work for two class problem. // func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface, actuals []string, probs []float64, ) { vs := samples.GetClassValueSpace() nactuals := numerus.IntsTo64(samples.Counts()) nclass := tekstus.WordsCountTokens(actuals, vs, false) pprev := math.Inf(-1) tp := int64(0) fp := int64(0) tpprev := int64(0) fpprev := int64(0) auc := float64(0) for x, p := range probs { if p != pprev { stat := Stat{} stat.SetTPRate(tp, nactuals[0]) stat.SetFPRate(fp, nactuals[1]) stat.SetPrecisionFromRate(nactuals[0], nactuals[1]) auc = auc + trapezoidArea(fp, fpprev, tp, tpprev) stat.SetAUC(auc) rt.perfs = append(rt.perfs, &stat) pprev = p tpprev = tp fpprev = fp } if actuals[x] == vs[0] { tp++ } else { fp++ } } stat := Stat{} stat.SetTPRate(tp, nactuals[0]) stat.SetFPRate(fp, nactuals[1]) stat.SetPrecisionFromRate(nactuals[0], nactuals[1]) auc = auc + trapezoidArea(fp, fpprev, tp, tpprev) auc = auc / float64(nclass[0]*nclass[1]) stat.SetAUC(auc) rt.perfs = append(rt.perfs, &stat) if len(rt.perfs) >= 2 { // Replace the first stat with second stat, because of NaN // value on the first precision. rt.perfs[0] = rt.perfs[1] } }
/* compute value for attribute T. Return Gini value in the form of, 1 - sum (probability of each classes in T) */ func (gini *Gini) compute(T *[]string, C *[]string) float64 { n := float64(len(*T)) if n == 0 { return 0 } classCount := tekstus.WordsCountTokens(*T, *C, true) var sump2 float64 for x, v := range classCount { p := float64(v) / n sump2 += (p * p) if DEBUG >= 3 { fmt.Printf("[gini] compute (%s): (%f/%f)^2 = %f\n", (*C)[x], v, n, p*p) } } return 1 - sump2 }