//
// deleteTrueNegative will remove the true-negative rows, indexed by the
// confusion matrix `cm`, from `samples`. On the first stage the removed rows
// are also pushed into the TN-set for later testing.
//
func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
	cm *classifier.CM,
) {
	var row *tabula.Row

	tnids := cm.TNIndices()
	sort.Ints(tnids)

	// (1)
	if len(crf.weights) <= 1 {
		for _, i := range tnids {
			crf.tnset.PushRow(samples.GetRow(i))
		}
	}

	// (2)
	c := 0
	for x, i := range tnids {
		row = samples.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
	}
}
//
// Build given a sample dataset, will build each stage of the cascade using
// random forest.
//
func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	if samples == nil {
		return ErrNoInput
	}

	e = crf.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training samples:", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Config:", crf)

	for x := 0; x < crf.NStage; x++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "Stage #", x)
		}

		forest, e := crf.createForest(samples)
		if e != nil {
			return e
		}

		e = crf.finalizeStage(forest)
		if e != nil {
			return e
		}
	}

	return crf.Finalize()
}
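// A minimal usage sketch for the cascade (not part of the original source).
// It assumes a hypothetical loadClaset helper that returns a populated
// tabula.ClasetInterface; fields left at zero are filled with defaults by
// Initialize.
func exampleCascadeBuild() {
	var samples tabula.ClasetInterface = loadClaset("train.dsv") // hypothetical loader

	cascade := &Runtime{
		NStage: 10,
		NTree:  5,
	}

	if e := cascade.Build(samples); e != nil {
		fmt.Println("build failed:", e)
	}
}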
//
// Initialize will check forest inputs and set them to default values if
// invalid.
//
// It will also calculate the number of random samples for each tree using,
//
//	number-of-sample * percentage-of-bootstrap
//
func (forest *Runtime) Initialize(samples tabula.ClasetInterface) error {
	if forest.NTree <= 0 {
		forest.NTree = DefNumTree
	}
	if forest.PercentBoot <= 0 {
		forest.PercentBoot = DefPercentBoot
	}
	if forest.NRandomFeature <= 0 {
		// Set default value to square-root of features.
		ncol := samples.GetNColumn() - 1
		forest.NRandomFeature = int(math.Sqrt(float64(ncol)))
	}
	if forest.OOBStatsFile == "" {
		forest.OOBStatsFile = DefOOBStatsFile
	}
	if forest.PerfFile == "" {
		forest.PerfFile = DefPerfFile
	}
	if forest.StatFile == "" {
		forest.StatFile = DefStatFile
	}

	forest.nSubsample = int(float32(samples.GetNRow()) *
		(float32(forest.PercentBoot) / 100.0))

	return forest.Runtime.Initialize()
}
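// A worked trace of the defaults above, with illustrative numbers (not from
// the source): a dataset with 10 columns (9 features plus the class), 150
// rows, and a bootstrap percentage of 66 (the real DefPercentBoot may
// differ).
func exampleForestDefaults() {
	ncol := 10 - 1                                  // features excluding the class column
	nRandomFeature := int(math.Sqrt(float64(ncol))) // sqrt(9) = 3

	nRow := 150
	percentBoot := 66
	nSubsample := int(float32(nRow) * (float32(percentBoot) / 100.0)) // 99

	fmt.Println(nRandomFeature, nSubsample)
}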
//
// Initialize will check crf inputs and set them to default values if they
// are invalid.
//
func (crf *Runtime) Initialize(samples tabula.ClasetInterface) error {
	if crf.NStage <= 0 {
		crf.NStage = DefStage
	}
	if crf.TPRate <= 0 || crf.TPRate >= 1 {
		crf.TPRate = DefTPRate
	}
	if crf.TNRate <= 0 || crf.TNRate >= 1 {
		crf.TNRate = DefTNRate
	}
	if crf.NTree <= 0 {
		crf.NTree = DefNumTree
	}
	if crf.PercentBoot <= 0 {
		crf.PercentBoot = DefPercentBoot
	}
	if crf.NRandomFeature <= 0 {
		// Set default value to square-root of features.
		ncol := samples.GetNColumn() - 1
		crf.NRandomFeature = int(math.Sqrt(float64(ncol)))
	}
	if crf.PerfFile == "" {
		crf.PerfFile = DefPerfFile
	}
	if crf.StatFile == "" {
		crf.StatFile = DefStatFile
	}

	crf.tnset = samples.Clone().(*tabula.Claset)

	return crf.Runtime.Initialize()
}
//
// computePerfByProbs will compute the classifier performance using the
// probabilities or scores `probs`.
//
// This currently only works for two-class problems.
//
func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface,
	actuals []string,
	probs []float64,
) {
	vs := samples.GetClassValueSpace()
	nactuals := numerus.IntsTo64(samples.Counts())
	nclass := tekstus.WordsCountTokens(actuals, vs, false)

	pprev := math.Inf(-1)
	tp := int64(0)
	fp := int64(0)
	tpprev := int64(0)
	fpprev := int64(0)

	auc := float64(0)

	for x, p := range probs {
		if p != pprev {
			stat := Stat{}
			stat.SetTPRate(tp, nactuals[0])
			stat.SetFPRate(fp, nactuals[1])
			stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

			auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
			stat.SetAUC(auc)

			rt.perfs = append(rt.perfs, &stat)
			pprev = p
			tpprev = tp
			fpprev = fp
		}

		if actuals[x] == vs[0] {
			tp++
		} else {
			fp++
		}
	}

	stat := Stat{}
	stat.SetTPRate(tp, nactuals[0])
	stat.SetFPRate(fp, nactuals[1])
	stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

	auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
	auc = auc / float64(nclass[0]*nclass[1])
	stat.SetAUC(auc)

	rt.perfs = append(rt.perfs, &stat)

	if len(rt.perfs) >= 2 {
		// Replace the first stat with second stat, because of NaN
		// value on the first precision.
		rt.perfs[0] = rt.perfs[1]
	}
}
//
// Init will initialize the LNSmote runtime by checking input values and
// setting them to defaults if they are not set or invalid.
//
func (in *Runtime) Init(dataset tabula.ClasetInterface) {
	in.Runtime.Init()

	in.NSynthetic = in.PercentOver / 100.0
	in.datasetRows = dataset.GetDataAsRows()

	in.minorset = tabula.SelectRowsWhere(dataset, in.ClassIndex,
		in.ClassMinor)

	in.outliers = make(tabula.Rows, 0)

	if DEBUG >= 1 {
		fmt.Println("[lnsmote] n:", in.NSynthetic)
		fmt.Println("[lnsmote] n minority:", in.minorset.Len())
	}
}
//
// ClassifySet given samples, predict their classes by running each sample
// through the forest, and return the class predictions with a confusion
// matrix.
// `samples` is the set of samples that will be predicted, `sampleIds` is the
// index of samples.
// If `sampleIds` is not nil, then the sample index will be checked in each
// tree; if the sample was used for training, its vote is not counted.
//
// Algorithm,
//
// (0) Get value space (possible class values in dataset)
// (1) For each row in test-set,
//	(1.1) collect votes in all trees,
//	(1.2) select majority class vote, and
//	(1.3) compute and save the actual class probabilities.
// (2) Compute confusion matrix from predictions.
// (3) Compute stat from confusion matrix.
// (4) Write the stat to file only if sampleIds is empty, which means it is
// not run from the OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false)
		_, idx, ok := numerus.Floats64FindMax(classProbs)
		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3)
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}
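// A minimal classification sketch (not from the original source). It assumes
// `forest` was already built with Build and `testset` is a non-empty
// tabula.ClasetInterface; passing nil sampleIds lets every tree vote on
// every row.
func exampleClassifySet(forest *Runtime, testset tabula.ClasetInterface) {
	predicts, cm, probs := forest.ClassifySet(testset, nil)

	fmt.Println("first prediction :", predicts[0])
	fmt.Println("confusion matrix :", cm)
	fmt.Println("first probability:", probs[0])
}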
// SelectRandomFeature if NRandomFeature is greater than zero, select and
// compute gain in n random features instead of in all features
func (runtime *Runtime) SelectRandomFeature(D tabula.ClasetInterface) {
	if runtime.NRandomFeature <= 0 {
		// all features selected
		return
	}

	ncols := D.GetNColumn()

	// count all features minus class
	nfeature := ncols - 1
	if runtime.NRandomFeature >= nfeature {
		// Do nothing if number of random feature equal or greater than
		// number of feature in dataset.
		return
	}

	// exclude class index and parent node index
	excludeIdx := []int{D.GetClassIndex()}

	cols := D.GetColumns()
	for x, col := range *cols {
		if (col.Flag & ColFlagParent) == ColFlagParent {
			excludeIdx = append(excludeIdx, x)
		} else {
			(*cols)[x].Flag |= ColFlagSkip
		}
	}

	// Select random features excluding feature in `excludeIdx`.
	var pickedIdx []int
	for x := 0; x < runtime.NRandomFeature; x++ {
		idx := numerus.IntPickRandPositive(ncols, false, pickedIdx,
			excludeIdx)
		pickedIdx = append(pickedIdx, idx)

		// Remove skip flag on selected column
		col := D.GetColumn(idx)
		col.Flag = col.Flag &^ ColFlagSkip
	}

	if DEBUG >= 1 {
		fmt.Println("[cart] selected random features:", pickedIdx)
		fmt.Println("[cart] selected columns        :",
			D.GetColumns())
	}
}
//
// Performance given the actual class labels and their probabilities, compute
// the performance statistics of the classifier.
//
// Algorithm,
// (1) Sort the probabilities in descending order.
// (2) Sort the actuals and predicts using the sorted index from probs.
// (3) Compute tpr, fpr, precision.
// (4) Write performance to file.
//
func (rt *Runtime) Performance(samples tabula.ClasetInterface,
	predicts []string,
	probs []float64,
) (
	perfs Stats,
) {
	// (1)
	actuals := samples.GetClassAsStrings()
	sortedIds := numerus.IntCreateSeq(0, len(probs)-1)
	numerus.Floats64InplaceMergesort(probs, sortedIds, 0, len(probs),
		false)

	// (2)
	tekstus.StringsSortByIndex(&actuals, sortedIds)
	tekstus.StringsSortByIndex(&predicts, sortedIds)

	// (3)
	rt.computePerfByProbs(samples, actuals, probs)

	return rt.perfs
}
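// A sketch chaining ClassifySet and Performance (not from the original
// source). It assumes the random-forest Runtime shown earlier, which embeds
// this classifier Runtime, so Performance is available as a promoted method;
// `forest` and `testset` are prepared as in the previous sketch.
func examplePerformance(forest *Runtime, testset tabula.ClasetInterface) {
	predicts, _, probs := forest.ClassifySet(testset, nil)

	perfs := forest.Performance(testset, predicts, probs)
	fmt.Println("number of performance points:", len(perfs))
}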
//
// refillWithFP will copy the false-positive rows from the TN-set `tnset`
// and append them to the training set `samples`.
//
func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
	cm *classifier.CM,
) {
	// Get and sort FP.
	fpids := cm.FPIndices()
	sort.Ints(fpids)

	// Move FP samples from TN-set to training set samples.
	for _, i := range fpids {
		samples.PushRow(tnset.GetRow(i))
	}

	// Delete FP from the TN-set.
	var row *tabula.Row
	c := 0
	for x, i := range fpids {
		row = tnset.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
	}
}
/*
Build the forest using the samples dataset.

Algorithm,

(0) Recheck input values: number of trees, percentage of bootstrap, etc; and
    open the statistic file output.
(1) For 0 to NTree,
	(1.1) Create a new tree, repeating until the tree has been
	successfully built.
(2) Compute and write the total statistic.
*/
func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	// check input samples
	if samples == nil {
		return ErrNoInput
	}

	// (0)
	e = forest.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training set :", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Forest config :", forest)

	// (1)
	for t := 0; t < forest.NTree; t++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "tree #", t)
		}

		// (1.1)
		for {
			_, _, e = forest.GrowTree(samples)
			if e == nil {
				break
			}

			fmt.Println(tag, "error:", e)
		}
	}

	// (2)
	return forest.Finalize()
}
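// A minimal usage sketch for the plain random forest (not from the original
// source), again assuming a hypothetical loadClaset helper; NTree,
// PercentBoot, and NRandomFeature fall back to defaults in Initialize when
// left at zero.
func exampleForestBuild() {
	samples := loadClaset("train.dsv") // hypothetical loader

	forest := &Runtime{
		NTree:       100,
		PercentBoot: 66,
	}

	if e := forest.Build(samples); e != nil {
		fmt.Println("build failed:", e)
	}
}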
/*
ClassifySet sets the class attribute based on the tree classification.
*/
func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) {
	nrow := data.GetNRow()
	targetAttr := data.GetClassColumn()

	for i := 0; i < nrow; i++ {
		class := runtime.Classify(data.GetRow(i))

		_ = (*targetAttr).Records[i].SetValue(class, tabula.TString)
	}

	return
}
/*
computeGain calculates the gini index for each value in each attribute.
*/
func (runtime *Runtime) computeGain(D tabula.ClasetInterface) (
	gains []gini.Gini,
) {
	switch runtime.SplitMethod {
	case SplitMethodGini:
		// create gains value for all attribute minus target class.
		gains = make([]gini.Gini, D.GetNColumn())
	}

	runtime.SelectRandomFeature(D)

	classVS := D.GetClassValueSpace()
	classIdx := D.GetClassIndex()
	classType := D.GetClassType()

	for x, col := range *D.GetColumns() {
		// skip class attribute.
		if x == classIdx {
			continue
		}

		// skip column flagged with parent
		if (col.Flag & ColFlagParent) == ColFlagParent {
			gains[x].Skip = true
			continue
		}

		// ignore column flagged with skip
		if (col.Flag & ColFlagSkip) == ColFlagSkip {
			gains[x].Skip = true
			continue
		}

		// compute gain.
		if col.GetType() == tabula.TReal {
			attr := col.ToFloatSlice()

			if classType == tabula.TString {
				target := D.GetClassAsStrings()
				gains[x].ComputeContinu(&attr, &target,
					&classVS)
			} else {
				targetReal := D.GetClassAsReals()
				classVSReal := tekstus.StringsToFloat64(
					classVS)

				gains[x].ComputeContinuFloat(&attr,
					&targetReal, &classVSReal)
			}
		} else {
			attr := col.ToStringSlice()
			attrV := col.ValueSpace

			if DEBUG >= 2 {
				fmt.Println("[cart] attr :", attr)
				fmt.Println("[cart] attrV:", attrV)
			}

			target := D.GetClassAsStrings()
			gains[x].ComputeDiscrete(&attr, &attrV, &target,
				&classVS)
		}

		if DEBUG >= 2 {
			fmt.Println("[cart] gain :", gains[x])
		}
	}
	return
}
/*
splitTreeByGain calculates the gain on the dataset and splits it into two
nodes: left and right.

Return the node with the split information.
*/
func (runtime *Runtime) splitTreeByGain(D tabula.ClasetInterface) (
	node *binary.BTNode,
	e error,
) {
	node = &binary.BTNode{}

	D.RecountMajorMinor()

	// if dataset is empty return node labeled with majority classes in
	// dataset.
	nrow := D.GetNRow()
	if nrow <= 0 {
		if DEBUG >= 2 {
			fmt.Printf("[cart] empty dataset (%s) : %v\n",
				D.MajorityClass(), D)
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  D.MajorityClass(),
			Size:   0,
		}
		return node, nil
	}

	// if all dataset is in the same class, return node as leaf with class
	// is set to that class.
	single, name := D.IsInSingleClass()
	if single {
		if DEBUG >= 2 {
			fmt.Printf("[cart] in single class (%s): %v\n", name,
				D.GetColumns())
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  name,
			Size:   nrow,
		}
		return node, nil
	}

	if DEBUG >= 2 {
		fmt.Println("[cart] D:", D)
	}

	// calculate the Gini gain for each attribute.
	gains := runtime.computeGain(D)

	// get attribute with maximum Gini gain.
	MaxGainIdx := gini.FindMaxGain(&gains)
	MaxGain := gains[MaxGainIdx]

	// if maxgain value is 0, use majority class as node and terminate
	// the process
	if MaxGain.GetMaxGainValue() == 0 {
		if DEBUG >= 2 {
			fmt.Println("[cart] max gain 0 with target",
				D.GetClassAsStrings(),
				" and majority class is ", D.MajorityClass())
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  D.MajorityClass(),
			Size:   0,
		}
		return node, nil
	}

	// using the sorted index in MaxGain, sort all field in dataset
	tabula.SortColumnsByIndex(D, MaxGain.SortedIndex)

	if DEBUG >= 2 {
		fmt.Println("[cart] maxgain:", MaxGain)
	}

	// Now that we have the attribute with max gain in MaxGainIdx, and its
	// gain and partition value in Gains[MaxGainIdx] and
	// GetMaxPartValue(), we split the dataset based on the type of the
	// max-gain attribute.
	// If its continuous, split the attribute using numeric value.
	// If its discrete, split the attribute using subset (partition) of
	// nominal values.
	var splitV interface{}

	if MaxGain.IsContinu {
		splitV = MaxGain.GetMaxPartGainValue()
	} else {
		attrPartV := MaxGain.GetMaxPartGainValue()
		attrSubV := attrPartV.(tekstus.ListStrings)
		splitV = attrSubV[0].Normalize()
	}

	if DEBUG >= 2 {
		fmt.Println("[cart] maxgainindex:", MaxGainIdx)
		fmt.Println("[cart] split v:", splitV)
	}

	node.Value = NodeValue{
		SplitAttrName: D.GetColumn(MaxGainIdx).GetName(),
		IsLeaf:        false,
		IsContinu:     MaxGain.IsContinu,
		Size:          nrow,
		SplitAttrIdx:  MaxGainIdx,
		SplitV:        splitV,
	}

	dsL, dsR, e := tabula.SplitRowsByValue(D, MaxGainIdx, splitV)
	if e != nil {
		return node, e
	}

	splitL := dsL.(tabula.ClasetInterface)
	splitR := dsR.(tabula.ClasetInterface)

	// Set the flag to parent in attribute referenced by MaxGainIdx, so it
	// will not be computed again in the next round.
	cols := splitL.GetColumns()
	for x := range *cols {
		if x == MaxGainIdx {
			(*cols)[x].Flag = ColFlagParent
		} else {
			(*cols)[x].Flag = 0
		}
	}

	cols = splitR.GetColumns()
	for x := range *cols {
		if x == MaxGainIdx {
			(*cols)[x].Flag = ColFlagParent
		} else {
			(*cols)[x].Flag = 0
		}
	}

	nodeLeft, e := runtime.splitTreeByGain(splitL)
	if e != nil {
		return node, e
	}

	nodeRight, e := runtime.splitTreeByGain(splitR)
	if e != nil {
		return node, e
	}

	node.SetLeft(nodeLeft)
	node.SetRight(nodeRight)

	return node, nil
}
//
// ClassifySetByWeight will classify each instance in samples by weighting
// each stage with respect to its individual performance.
//
// Algorithm,
//
// (1) For each instance in samples,
//	(1.1) for each stage,
//		(1.1.1) collect votes for the instance in the current stage.
//		(1.1.2) Compute probabilities of each class in the votes.
//
//			prob_class = count_of_class / total_votes
//
//		(1.1.3) Compute the sum of probabilities multiplied by the
//		stage weight.
//
//			stage_prob = prob_class * stage_weight
//
//	(1.2) Divide each class stage probability with
//
//		stage_prob = stage_prob /
//			(sum_of_all_weights * number_of_tree_in_forest)
//
//	(1.3) Select the class label with the highest probability.
//	(1.4) Save stage probabilities for the positive class.
// (2) Compute confusion matrix.
//
func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	vs := samples.GetClassValueSpace()
	stageProbs := make([]float64, len(vs))
	stageSumProbs := make([]float64, len(vs))
	sumWeights := numerus.Floats64Sum(crf.weights)

	// (1)
	rows := samples.GetDataAsRows()
	for _, row := range *rows {
		for y := range stageSumProbs {
			stageSumProbs[y] = 0
		}

		// (1.1)
		for y, forest := range crf.forests {
			// (1.1.1)
			votes := forest.Votes(row, -1)

			// (1.1.2)
			probs := tekstus.WordsProbabilitiesOf(votes, vs, false)

			// (1.1.3)
			for z := range probs {
				stageSumProbs[z] += probs[z]
				stageProbs[z] += probs[z] * crf.weights[y]
			}
		}

		// (1.2)
		stageWeight := sumWeights * float64(crf.NTree)

		for x := range stageProbs {
			stageProbs[x] = stageProbs[x] / stageWeight
		}

		// (1.3)
		_, maxi, ok := numerus.Floats64FindMax(stageProbs)
		if ok {
			predicts = append(predicts, vs[maxi])
		}

		// (1.4)
		probs = append(probs, stageSumProbs[0]/
			float64(len(crf.forests)))
	}

	// (2)
	actuals := samples.GetClassAsStrings()
	cm = crf.ComputeCM(sampleIds, vs, actuals, predicts)

	crf.ComputeStatFromCM(&stat, cm)
	stat.End()

	_ = stat.Write(crf.StatFile)

	return predicts, cm, probs
}
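// A self-contained numerical trace of steps (1.1.3) and (1.2) above, using
// made-up numbers (not from the original source): two stages with weights
// 0.9 and 0.8, NTree = 10, and per-stage probabilities of 0.7 and 0.6 for
// the positive class.
func exampleStageWeighting() {
	weights := []float64{0.9, 0.8}
	perStageProb := []float64{0.7, 0.6}
	nTree := 10.0

	sumWeights := 0.0
	stageProb := 0.0
	for y, w := range weights {
		sumWeights += w
		stageProb += perStageProb[y] * w // (1.1.3)
	}

	// (1.2): (0.7*0.9 + 0.6*0.8) / ((0.9+0.8) * 10) ≈ 0.0653
	stageProb = stageProb / (sumWeights * nTree)

	fmt.Println(stageProb)
}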
//
// createForest will create and return a forest and run the training `samples`
// on it.
//
// Algorithm,
// (1) Initialize forest.
// (2) For 0 to maximum number of tree in forest,
//	(2.1) grow one tree until success.
//	(2.2) If the tree tp-rate and tn-rate are greater than the
//	thresholds, stop growing.
// (3) Calculate weight.
// (4) TODO: Move true-negative from samples. The collection of true-negative
// will be used again to test the model, and after testing, the samples with
// FP will be moved back to the training samples.
// (5) Refill samples with false-positive.
//
func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
	forest *rf.Runtime, e error,
) {
	var cm *classifier.CM
	var stat *classifier.Stat

	fmt.Println(tag, "Forest samples:", samples)

	// (1)
	forest = &rf.Runtime{
		Runtime: classifier.Runtime{
			RunOOB: true,
		},
		NTree:          crf.NTree,
		NRandomFeature: crf.NRandomFeature,
	}

	e = forest.Initialize(samples)
	if e != nil {
		return nil, e
	}

	// (2)
	for t := 0; t < crf.NTree; t++ {
		if DEBUG >= 2 {
			fmt.Println(tag, "Tree #", t)
		}

		// (2.1)
		for {
			cm, stat, e = forest.GrowTree(samples)
			if e == nil {
				break
			}
		}

		// (2.2)
		if stat.TPRate > crf.TPRate &&
			stat.TNRate > crf.TNRate {
			break
		}
	}

	e = forest.Finalize()
	if e != nil {
		return nil, e
	}

	// (3)
	crf.computeWeight(stat)

	if DEBUG >= 1 {
		fmt.Println(tag, "Weight:", stat.FMeasure)
	}

	// (4)
	crf.deleteTrueNegative(samples, cm)

	// (5)
	crf.runTPSet(samples)

	samples.RecountMajorMinor()

	return forest, nil
}