Пример #1
0
//
// ClassifySet given a samples predict their class by running each sample in
// forest, adn return their class prediction with confusion matrix.
// `samples` is the sample that will be predicted, `sampleIds` is the index of
// samples.
// If `sampleIds` is not nil, then sample index will be checked in each tree,
// if the sample is used for training, their vote is not counted.
//
// Algorithm,
//
// (0) Get value space (possible class values in dataset)
// (1) For each row in test-set,
// (1.1) collect votes in all trees,
// (1.2) select majority class vote, and
// (1.3) compute and save the actual class probabilities.
// (2) Compute confusion matrix from predictions.
// (3) Compute stat from confusion matrix.
// (4) Write the stat to file only if sampleIds is empty, which mean its run
// not from OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false)

		_, idx, ok := numerus.Floats64FindMax(classProbs)

		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3)
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}