// // ClassifySet given a samples predict their class by running each sample in // forest, adn return their class prediction with confusion matrix. // `samples` is the sample that will be predicted, `sampleIds` is the index of // samples. // If `sampleIds` is not nil, then sample index will be checked in each tree, // if the sample is used for training, their vote is not counted. // // Algorithm, // // (0) Get value space (possible class values in dataset) // (1) For each row in test-set, // (1.1) collect votes in all trees, // (1.2) select majority class vote, and // (1.3) compute and save the actual class probabilities. // (2) Compute confusion matrix from predictions. // (3) Compute stat from confusion matrix. // (4) Write the stat to file only if sampleIds is empty, which mean its run // not from OOB set. // func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface, sampleIds []int, ) ( predicts []string, cm *classifier.CM, probs []float64, ) { stat := classifier.Stat{} stat.Start() if len(sampleIds) <= 0 { fmt.Println(tag, "Classify set:", samples) fmt.Println(tag, "Classify set sample (one row):", samples.GetRow(0)) } // (0) vs := samples.GetClassValueSpace() actuals := samples.GetClassAsStrings() sampleIdx := -1 // (1) rows := samples.GetRows() for x, row := range *rows { // (1.1) if len(sampleIds) > 0 { sampleIdx = sampleIds[x] } votes := forest.Votes(row, sampleIdx) // (1.2) classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false) _, idx, ok := numerus.Floats64FindMax(classProbs) if ok { predicts = append(predicts, vs[idx]) } // (1.3) probs = append(probs, classProbs[0]) } // (2) cm = forest.ComputeCM(sampleIds, vs, actuals, predicts) // (3) forest.ComputeStatFromCM(&stat, cm) stat.End() if len(sampleIds) <= 0 { fmt.Println(tag, "CM:", cm) fmt.Println(tag, "Classifying stat:", stat) _ = stat.Write(forest.StatFile) } return predicts, cm, probs }
// // computePerfByProbs will compute classifier performance using probabilities // or score `probs`. // // This currently only work for two class problem. // func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface, actuals []string, probs []float64, ) { vs := samples.GetClassValueSpace() nactuals := numerus.IntsTo64(samples.Counts()) nclass := tekstus.WordsCountTokens(actuals, vs, false) pprev := math.Inf(-1) tp := int64(0) fp := int64(0) tpprev := int64(0) fpprev := int64(0) auc := float64(0) for x, p := range probs { if p != pprev { stat := Stat{} stat.SetTPRate(tp, nactuals[0]) stat.SetFPRate(fp, nactuals[1]) stat.SetPrecisionFromRate(nactuals[0], nactuals[1]) auc = auc + trapezoidArea(fp, fpprev, tp, tpprev) stat.SetAUC(auc) rt.perfs = append(rt.perfs, &stat) pprev = p tpprev = tp fpprev = fp } if actuals[x] == vs[0] { tp++ } else { fp++ } } stat := Stat{} stat.SetTPRate(tp, nactuals[0]) stat.SetFPRate(fp, nactuals[1]) stat.SetPrecisionFromRate(nactuals[0], nactuals[1]) auc = auc + trapezoidArea(fp, fpprev, tp, tpprev) auc = auc / float64(nclass[0]*nclass[1]) stat.SetAUC(auc) rt.perfs = append(rt.perfs, &stat) if len(rt.perfs) >= 2 { // Replace the first stat with second stat, because of NaN // value on the first precision. rt.perfs[0] = rt.perfs[1] } }
/* computeGain calculate the gini index for each value in each attribute. */ func (runtime *Runtime) computeGain(D tabula.ClasetInterface) ( gains []gini.Gini, ) { switch runtime.SplitMethod { case SplitMethodGini: // create gains value for all attribute minus target class. gains = make([]gini.Gini, D.GetNColumn()) } runtime.SelectRandomFeature(D) classVS := D.GetClassValueSpace() classIdx := D.GetClassIndex() classType := D.GetClassType() for x, col := range *D.GetColumns() { // skip class attribute. if x == classIdx { continue } // skip column flagged with parent if (col.Flag & ColFlagParent) == ColFlagParent { gains[x].Skip = true continue } // ignore column flagged with skip if (col.Flag & ColFlagSkip) == ColFlagSkip { gains[x].Skip = true continue } // compute gain. if col.GetType() == tabula.TReal { attr := col.ToFloatSlice() if classType == tabula.TString { target := D.GetClassAsStrings() gains[x].ComputeContinu(&attr, &target, &classVS) } else { targetReal := D.GetClassAsReals() classVSReal := tekstus.StringsToFloat64( classVS) gains[x].ComputeContinuFloat(&attr, &targetReal, &classVSReal) } } else { attr := col.ToStringSlice() attrV := col.ValueSpace if DEBUG >= 2 { fmt.Println("[cart] attr :", attr) fmt.Println("[cart] attrV:", attrV) } target := D.GetClassAsStrings() gains[x].ComputeDiscrete(&attr, &attrV, &target, &classVS) } if DEBUG >= 2 { fmt.Println("[cart] gain :", gains[x]) } } return }
// // ClassifySetByWeight will classify each instance in samples by weight // with respect to its single performance. // // Algorithm, // (1) For each instance in samples, // (1.1) for each stage, // (1.1.1) collect votes for instance in current stage. // (1.1.2) Compute probabilities of each classes in votes. // // prob_class = count_of_class / total_votes // // (1.1.3) Compute total of probabilites times of stage weight. // // stage_prob = prob_class * stage_weight // // (1.2) Divide each class stage probabilites with // // stage_prob = stage_prob / // (sum_of_all_weights * number_of_tree_in_forest) // // (1.3) Select class label with highest probabilites. // (1.4) Save stage probabilities for positive class. // (2) Compute confusion matrix. // func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface, sampleIds []int, ) ( predicts []string, cm *classifier.CM, probs []float64, ) { stat := classifier.Stat{} stat.Start() vs := samples.GetClassValueSpace() stageProbs := make([]float64, len(vs)) stageSumProbs := make([]float64, len(vs)) sumWeights := numerus.Floats64Sum(crf.weights) // (1) rows := samples.GetDataAsRows() for _, row := range *rows { for y := range stageSumProbs { stageSumProbs[y] = 0 } // (1.1) for y, forest := range crf.forests { // (1.1.1) votes := forest.Votes(row, -1) // (1.1.2) probs := tekstus.WordsProbabilitiesOf(votes, vs, false) // (1.1.3) for z := range probs { stageSumProbs[z] += probs[z] stageProbs[z] += probs[z] * crf.weights[y] } } // (1.2) stageWeight := sumWeights * float64(crf.NTree) for x := range stageProbs { stageProbs[x] = stageProbs[x] / stageWeight } // (1.3) _, maxi, ok := numerus.Floats64FindMax(stageProbs) if ok { predicts = append(predicts, vs[maxi]) } probs = append(probs, stageSumProbs[0]/ float64(len(crf.forests))) } // (2) actuals := samples.GetClassAsStrings() cm = crf.ComputeCM(sampleIds, vs, actuals, predicts) crf.ComputeStatFromCM(&stat, cm) stat.End() _ = stat.Write(crf.StatFile) return predicts, cm, probs }