Ejemplo n.º 1
0
//
// ClassifySet given a samples predict their class by running each sample in
// forest, adn return their class prediction with confusion matrix.
// `samples` is the sample that will be predicted, `sampleIds` is the index of
// samples.
// If `sampleIds` is not nil, then sample index will be checked in each tree,
// if the sample is used for training, their vote is not counted.
//
// Algorithm,
//
// (0) Get value space (possible class values in dataset)
// (1) For each row in test-set,
// (1.1) collect votes in all trees,
// (1.2) select majority class vote, and
// (1.3) compute and save the actual class probabilities.
// (2) Compute confusion matrix from predictions.
// (3) Compute stat from confusion matrix.
// (4) Write the stat to file only if sampleIds is empty, which mean its run
// not from OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false)

		_, idx, ok := numerus.Floats64FindMax(classProbs)

		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3)
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}
Ejemplo n.º 2
0
//
// computePerfByProbs will compute classifier performance using probabilities
// or score `probs`.
//
// This currently only work for two class problem.
//
func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface,
	actuals []string, probs []float64,
) {
	vs := samples.GetClassValueSpace()
	nactuals := numerus.IntsTo64(samples.Counts())
	nclass := tekstus.WordsCountTokens(actuals, vs, false)

	pprev := math.Inf(-1)
	tp := int64(0)
	fp := int64(0)
	tpprev := int64(0)
	fpprev := int64(0)

	auc := float64(0)

	for x, p := range probs {
		if p != pprev {
			stat := Stat{}
			stat.SetTPRate(tp, nactuals[0])
			stat.SetFPRate(fp, nactuals[1])
			stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

			auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
			stat.SetAUC(auc)

			rt.perfs = append(rt.perfs, &stat)

			pprev = p
			tpprev = tp
			fpprev = fp
		}

		if actuals[x] == vs[0] {
			tp++
		} else {
			fp++
		}
	}

	stat := Stat{}
	stat.SetTPRate(tp, nactuals[0])
	stat.SetFPRate(fp, nactuals[1])
	stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

	auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
	auc = auc / float64(nclass[0]*nclass[1])
	stat.SetAUC(auc)

	rt.perfs = append(rt.perfs, &stat)

	if len(rt.perfs) >= 2 {
		// Replace the first stat with second stat, because of NaN
		// value on the first precision.
		rt.perfs[0] = rt.perfs[1]
	}
}
Ejemplo n.º 3
0
/*
computeGain calculate the gini index for each value in each attribute.
*/
func (runtime *Runtime) computeGain(D tabula.ClasetInterface) (
	gains []gini.Gini,
) {
	switch runtime.SplitMethod {
	case SplitMethodGini:
		// create gains value for all attribute minus target class.
		gains = make([]gini.Gini, D.GetNColumn())
	}

	runtime.SelectRandomFeature(D)

	classVS := D.GetClassValueSpace()
	classIdx := D.GetClassIndex()
	classType := D.GetClassType()

	for x, col := range *D.GetColumns() {
		// skip class attribute.
		if x == classIdx {
			continue
		}

		// skip column flagged with parent
		if (col.Flag & ColFlagParent) == ColFlagParent {
			gains[x].Skip = true
			continue
		}

		// ignore column flagged with skip
		if (col.Flag & ColFlagSkip) == ColFlagSkip {
			gains[x].Skip = true
			continue
		}

		// compute gain.
		if col.GetType() == tabula.TReal {
			attr := col.ToFloatSlice()

			if classType == tabula.TString {
				target := D.GetClassAsStrings()
				gains[x].ComputeContinu(&attr, &target,
					&classVS)
			} else {
				targetReal := D.GetClassAsReals()
				classVSReal := tekstus.StringsToFloat64(
					classVS)

				gains[x].ComputeContinuFloat(&attr,
					&targetReal, &classVSReal)
			}
		} else {
			attr := col.ToStringSlice()
			attrV := col.ValueSpace

			if DEBUG >= 2 {
				fmt.Println("[cart] attr :", attr)
				fmt.Println("[cart] attrV:", attrV)
			}

			target := D.GetClassAsStrings()
			gains[x].ComputeDiscrete(&attr, &attrV, &target,
				&classVS)
		}

		if DEBUG >= 2 {
			fmt.Println("[cart] gain :", gains[x])
		}
	}
	return
}
Ejemplo n.º 4
0
//
// ClassifySetByWeight will classify each instance in samples by weight
// with respect to its single performance.
//
// Algorithm,
// (1) For each instance in samples,
// (1.1) for each stage,
// (1.1.1) collect votes for instance in current stage.
// (1.1.2) Compute probabilities of each classes in votes.
//
//		prob_class = count_of_class / total_votes
//
// (1.1.3) Compute total of probabilites times of stage weight.
//
//		stage_prob = prob_class * stage_weight
//
// (1.2) Divide each class stage probabilites with
//
//		stage_prob = stage_prob /
//			(sum_of_all_weights * number_of_tree_in_forest)
//
// (1.3) Select class label with highest probabilites.
// (1.4) Save stage probabilities for positive class.
// (2) Compute confusion matrix.
//
func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	vs := samples.GetClassValueSpace()
	stageProbs := make([]float64, len(vs))
	stageSumProbs := make([]float64, len(vs))
	sumWeights := numerus.Floats64Sum(crf.weights)

	// (1)
	rows := samples.GetDataAsRows()
	for _, row := range *rows {
		for y := range stageSumProbs {
			stageSumProbs[y] = 0
		}

		// (1.1)
		for y, forest := range crf.forests {
			// (1.1.1)
			votes := forest.Votes(row, -1)

			// (1.1.2)
			probs := tekstus.WordsProbabilitiesOf(votes, vs, false)

			// (1.1.3)
			for z := range probs {
				stageSumProbs[z] += probs[z]
				stageProbs[z] += probs[z] * crf.weights[y]
			}
		}

		// (1.2)
		stageWeight := sumWeights * float64(crf.NTree)

		for x := range stageProbs {
			stageProbs[x] = stageProbs[x] / stageWeight
		}

		// (1.3)
		_, maxi, ok := numerus.Floats64FindMax(stageProbs)
		if ok {
			predicts = append(predicts, vs[maxi])
		}

		probs = append(probs, stageSumProbs[0]/
			float64(len(crf.forests)))
	}

	// (2)
	actuals := samples.GetClassAsStrings()
	cm = crf.ComputeCM(sampleIds, vs, actuals, predicts)

	crf.ComputeStatFromCM(&stat, cm)
	stat.End()

	_ = stat.Write(crf.StatFile)

	return predicts, cm, probs
}