Example #1
//
// refillWithFP copies the false-positive rows from the TN-set `tnset` and
// appends them to the training set `samples`.
//
func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
	cm *classifier.CM,
) {
	// Get and sort FP.
	fpids := cm.FPIndices()
	sort.Ints(fpids)

	// Move FP samples from TN-set to training set samples.
	for _, i := range fpids {
		samples.PushRow(tnset.GetRow(i))
	}

	// Delete the FP rows from the TN-set. The indices are sorted, so each
	// earlier deletion shifts the remaining rows down by one; subtracting
	// the number of rows already deleted (x) keeps the index valid.
	var row *tabula.Row
	c := 0
	for x, i := range fpids {
		row = tnset.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
	}
}
Example #2
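//
// deleteTrueNegative removes the true-negative samples from `samples`.
//
// Algorithm,
//
// (1) If the runtime holds at most one weight, copy the TN rows into the
// TN-set first, so refillWithFP can move them back into the training set
// later.
// (2) Delete the TN rows from `samples`.
//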
func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
	cm *classifier.CM,
) {
	var row *tabula.Row

	tnids := cm.TNIndices()
	sort.Ints(tnids)

	// (1)
	if len(crf.weights) <= 1 {
		for _, i := range tnids {
			crf.tnset.PushRow(samples.GetRow(i))
		}
	}

	// (2) Delete the TN rows; `i - x` accounts for rows already removed.
	c := 0
	for x, i := range tnids {
		row = samples.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
	}
}
Example #3
//
// Build, given a sample dataset, builds each stage of the cascade with a
// random forest.
//
func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	if samples == nil {
		return ErrNoInput
	}

	e = crf.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training samples:", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Config:", crf)

	for x := 0; x < crf.NStage; x++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "Stage #", x)
		}

		forest, e := crf.createForest(samples)
		if e != nil {
			return e
		}

		e = crf.finalizeStage(forest)
		if e != nil {
			return e
		}
	}

	return crf.Finalize()
}
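A minimal usage sketch for the cascaded Build above. The import paths, the NStage value, and the way the class-set is loaded are assumptions; only the Build signature and the NStage field come from the example itself.

package main

import (
	"log"

	"github.com/shuLhan/go-mining/classifier/crf" // assumed import path
	"github.com/shuLhan/tabula"                   // assumed import path
)

func main() {
	// Loading a class-set is not shown in the examples; with a nil input
	// Build simply returns ErrNoInput.
	var samples tabula.ClasetInterface

	// NStage is the only configuration field read by Build above; the
	// value here is an arbitrary example.
	cascade := &crf.Runtime{
		NStage: 200,
	}

	if e := cascade.Build(samples); e != nil {
		log.Fatalln("crf build:", e)
	}
}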
Example #4
//
// ClassifySet, given samples, predicts their class by running each sample
// through the forest, and returns the class predictions together with the
// confusion matrix.
// `samples` is the data set to be classified and `sampleIds` is the list of
// sample indices.
// If `sampleIds` is not nil, the sample index is checked in each tree; if the
// sample was used to train that tree, its vote is not counted.
//
// Algorithm,
//
// (0) Get the value space (possible class values in the dataset).
// (1) For each row in the test set,
// (1.1) collect votes from all trees,
// (1.2) select the majority class vote, and
// (1.3) compute and save the class probabilities.
// (2) Compute the confusion matrix from the predictions.
// (3) Compute the statistics from the confusion matrix.
// (4) Write the statistics to file only if sampleIds is empty, which means
// this is not a run on the OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false)

		_, idx, ok := numerus.Floats64FindMax(classProbs)

		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3)
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}
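A sketch of how the returned values might be consumed. The rf, classifier, and tabula import paths are assumptions; the ClassifySet signature, the nil-sampleIds behaviour, and the StatFile field follow the example above.

package forestexample

import (
	"fmt"

	"github.com/shuLhan/go-mining/classifier"    // assumed import path
	"github.com/shuLhan/go-mining/classifier/rf" // assumed import path
	"github.com/shuLhan/tabula"                  // assumed import path
)

// evaluate classifies a test set with an already-built forest. Passing nil
// for sampleIds makes every tree vote on every row and writes the statistics
// to the forest's StatFile, as in the example above.
func evaluate(forest *rf.Runtime, testset tabula.ClasetInterface) *classifier.CM {
	predicts, cm, probs := forest.ClassifySet(testset, nil)

	if len(predicts) > 0 {
		fmt.Println("first prediction       :", predicts[0])
		fmt.Println("first class probability:", probs[0])
	}
	return cm
}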
Example #5
/*
ClassifySet sets the class attribute of each row in `data` based on the tree
classification.
*/
func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) {
	nrow := data.GetNRow()
	targetAttr := data.GetClassColumn()

	for i := 0; i < nrow; i++ {
		class := runtime.Classify(data.GetRow(i))

		_ = (*targetAttr).Records[i].SetValue(class, tabula.TString)
	}

	return
}
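A short sketch of using this in-place variant. The cart import path and the already-trained tree are assumptions; ClassifySet and GetClassAsStrings appear in the examples above.

package cartexample

import (
	"github.com/shuLhan/go-mining/classifier/cart" // assumed import path
	"github.com/shuLhan/tabula"                    // assumed import path
)

// labelWith overwrites the class column of `data` with the tree's
// predictions and returns them as strings.
func labelWith(tree *cart.Runtime, data tabula.ClasetInterface) ([]string, error) {
	if e := tree.ClassifySet(data); e != nil {
		return nil, e
	}
	// After ClassifySet the class column holds the predicted values.
	return data.GetClassAsStrings(), nil
}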
Example #6
/*
Build the forest using the samples dataset.

Algorithm,

(0) Recheck the input values (number of trees, bootstrap percentage, etc.) and
    open the statistics output file.
(1) For 0 to NTree,
(1.1) create a new tree, retrying on error until the tree has been built.
(2) Compute and write the total statistics.
*/
func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	// check input samples
	if samples == nil {
		return ErrNoInput
	}

	// (0)
	e = forest.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training set    :", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Forest config   :", forest)

	// (1)
	for t := 0; t < forest.NTree; t++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "tree #", t)
		}

		// (1.1)
		for {
			_, _, e = forest.GrowTree(samples)
			if e == nil {
				break
			}

			fmt.Println(tag, "error:", e)
		}
	}

	// (2)
	return forest.Finalize()
}
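A matching sketch for building the forest itself. The import paths and the NTree value are assumptions; only the Build signature, the NTree field, and ErrNoInput come from the example.

package main

import (
	"log"

	"github.com/shuLhan/go-mining/classifier/rf" // assumed import path
	"github.com/shuLhan/tabula"                  // assumed import path
)

func main() {
	// Loading the training class-set is not shown in the examples above;
	// with a nil input Build returns ErrNoInput.
	var samples tabula.ClasetInterface

	// NTree is the only field the build loop above reads; 100 is an
	// arbitrary example value.
	forest := &rf.Runtime{
		NTree: 100,
	}

	if e := forest.Build(samples); e != nil {
		log.Fatalln("rf build:", e)
	}
}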