// // refillWithFP will copy the false-positive data in training set `tnset` // and append it to `samples`. // func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface, cm *classifier.CM, ) { // Get and sort FP. fpids := cm.FPIndices() sort.Ints(fpids) // Move FP samples from TN-set to training set samples. for _, i := range fpids { samples.PushRow(tnset.GetRow(i)) } // Delete FP from training set. var row *tabula.Row c := 0 for x, i := range fpids { row = tnset.DeleteRow(i - x) if row != nil { c++ } } if DEBUG >= 1 { fmt.Println(tag, "# FP", len(fpids), "# refilled", c) } }
func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface, cm *classifier.CM, ) { var row *tabula.Row tnids := cm.TNIndices() sort.Ints(tnids) // (1) if len(crf.weights) <= 1 { for _, i := range tnids { crf.tnset.PushRow(samples.GetRow(i)) } } // (2) c := 0 for x, i := range tnids { row = samples.DeleteRow(i - x) if row != nil { c++ } } if DEBUG >= 1 { fmt.Println(tag, "# TN", len(tnids), "# deleted", c) } }
// // Build given a sample dataset, build the stage with randomforest. // func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) { if samples == nil { return ErrNoInput } e = crf.Initialize(samples) if e != nil { return } fmt.Println(tag, "Training samples:", samples) fmt.Println(tag, "Sample (one row):", samples.GetRow(0)) fmt.Println(tag, "Config:", crf) for x := 0; x < crf.NStage; x++ { if DEBUG >= 1 { fmt.Println(tag, "Stage #", x) } forest, e := crf.createForest(samples) if e != nil { return e } e = crf.finalizeStage(forest) if e != nil { return e } } return crf.Finalize() }
// // ClassifySet given a samples predict their class by running each sample in // forest, adn return their class prediction with confusion matrix. // `samples` is the sample that will be predicted, `sampleIds` is the index of // samples. // If `sampleIds` is not nil, then sample index will be checked in each tree, // if the sample is used for training, their vote is not counted. // // Algorithm, // // (0) Get value space (possible class values in dataset) // (1) For each row in test-set, // (1.1) collect votes in all trees, // (1.2) select majority class vote, and // (1.3) compute and save the actual class probabilities. // (2) Compute confusion matrix from predictions. // (3) Compute stat from confusion matrix. // (4) Write the stat to file only if sampleIds is empty, which mean its run // not from OOB set. // func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface, sampleIds []int, ) ( predicts []string, cm *classifier.CM, probs []float64, ) { stat := classifier.Stat{} stat.Start() if len(sampleIds) <= 0 { fmt.Println(tag, "Classify set:", samples) fmt.Println(tag, "Classify set sample (one row):", samples.GetRow(0)) } // (0) vs := samples.GetClassValueSpace() actuals := samples.GetClassAsStrings() sampleIdx := -1 // (1) rows := samples.GetRows() for x, row := range *rows { // (1.1) if len(sampleIds) > 0 { sampleIdx = sampleIds[x] } votes := forest.Votes(row, sampleIdx) // (1.2) classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false) _, idx, ok := numerus.Floats64FindMax(classProbs) if ok { predicts = append(predicts, vs[idx]) } // (1.3) probs = append(probs, classProbs[0]) } // (2) cm = forest.ComputeCM(sampleIds, vs, actuals, predicts) // (3) forest.ComputeStatFromCM(&stat, cm) stat.End() if len(sampleIds) <= 0 { fmt.Println(tag, "CM:", cm) fmt.Println(tag, "Classifying stat:", stat) _ = stat.Write(forest.StatFile) } return predicts, cm, probs }
/* ClassifySet set the class attribute based on tree classification. */ func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) { nrow := data.GetNRow() targetAttr := data.GetClassColumn() for i := 0; i < nrow; i++ { class := runtime.Classify(data.GetRow(i)) _ = (*targetAttr).Records[i].SetValue(class, tabula.TString) } return }
/* Build the forest using samples dataset. Algorithm, (0) Recheck input value: number of tree, percentage bootstrap, etc; and Open statistic file output. (1) For 0 to NTree, (1.1) Create new tree, repeat until all trees has been build. (2) Compute and write total statistic. */ func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) { // check input samples if samples == nil { return ErrNoInput } // (0) e = forest.Initialize(samples) if e != nil { return } fmt.Println(tag, "Training set :", samples) fmt.Println(tag, "Sample (one row):", samples.GetRow(0)) fmt.Println(tag, "Forest config :", forest) // (1) for t := 0; t < forest.NTree; t++ { if DEBUG >= 1 { fmt.Println(tag, "tree #", t) } // (1.1) for { _, _, e = forest.GrowTree(samples) if e == nil { break } fmt.Println(tag, "error:", e) } } // (2) return forest.Finalize() }