Example #1
//
// deleteTrueNegative will delete the true-negative samples from `samples`.
//
// Algorithm,
// (1) If only one stage has been built so far, save the true-negative rows
// into the TN-set.
// (2) Delete the true-negative rows from samples.
//
func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
	cm *classifier.CM,
) {
	var row *tabula.Row

	tnids := cm.TNIndices()
	sort.Ints(tnids)

	// (1)
	if len(crf.weights) <= 1 {
		for _, i := range tnids {
			crf.tnset.PushRow(samples.GetRow(i))
		}
	}

	// (2)
	c := 0
	for x, i := range tnids {
		row = samples.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
	}
}
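
The `i - x` in the deletion loop above compensates for rows shifting left after each delete: because `tnids` is sorted ascending, after `x` earlier deletions the row that was originally at index `i` now sits at `i - x`. A minimal, self-contained sketch of the same pattern on a plain slice (the `deleteAt` helper is hypothetical, standing in for `DeleteRow`):

package main

import (
	"fmt"
	"sort"
)

// deleteAt removes the element at index i and returns the shortened slice.
func deleteAt(s []string, i int) []string {
	return append(s[:i], s[i+1:]...)
}

func main() {
	rows := []string{"a", "b", "c", "d", "e"}
	ids := []int{4, 1, 3} // indices to delete, in any order

	sort.Ints(ids) // ascending, so earlier deletes shift later indices left

	for x, i := range ids {
		// After x deletions, the row originally at i is now at i-x.
		rows = deleteAt(rows, i-x)
	}

	fmt.Println(rows) // [a c]
}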
Example #2
//
// Build, given a sample dataset, builds each stage using a random forest.
//
func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	if samples == nil {
		return ErrNoInput
	}

	e = crf.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training samples:", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Config:", crf)

	for x := 0; x < crf.NStage; x++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "Stage #", x)
		}

		forest, e := crf.createForest(samples)
		if e != nil {
			return e
		}

		e = crf.finalizeStage(forest)
		if e != nil {
			return e
		}
	}

	return crf.Finalize()
}
Example #3
//
// Initialize will check forest inputs and set them to default values if
// invalid.
//
// It will also calculate the number of random samples for each tree using,
//
//	number-of-sample * percentage-of-bootstrap
//
func (forest *Runtime) Initialize(samples tabula.ClasetInterface) error {
	if forest.NTree <= 0 {
		forest.NTree = DefNumTree
	}
	if forest.PercentBoot <= 0 {
		forest.PercentBoot = DefPercentBoot
	}
	if forest.NRandomFeature <= 0 {
		// Set default value to square-root of features.
		ncol := samples.GetNColumn() - 1
		forest.NRandomFeature = int(math.Sqrt(float64(ncol)))
	}
	if forest.OOBStatsFile == "" {
		forest.OOBStatsFile = DefOOBStatsFile
	}
	if forest.PerfFile == "" {
		forest.PerfFile = DefPerfFile
	}
	if forest.StatFile == "" {
		forest.StatFile = DefStatFile
	}

	forest.nSubsample = int(float32(samples.GetNRow()) *
		(float32(forest.PercentBoot) / 100.0))

	return forest.Runtime.Initialize()
}
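
A quick worked example of the two defaults computed above, using assumed numbers (150 rows, 26 columns, a bootstrap percentage of 66; none of these values come from the package):

package main

import (
	"fmt"
	"math"
)

func main() {
	// All numbers here are assumptions for illustration only.
	nrow := 150 // rows in the sample set
	ncol := 26  // columns, including the class column
	percentBoot := 66

	// Default NRandomFeature: square root of the number of features.
	nFeature := ncol - 1
	nRandomFeature := int(math.Sqrt(float64(nFeature)))

	// Bootstrap size per tree: number-of-sample * percentage-of-bootstrap.
	nSubsample := int(float32(nrow) * (float32(percentBoot) / 100.0))

	fmt.Println(nRandomFeature, nSubsample) // 5 99
}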
Example #4
//
// Initialize will check crf inputs and set them to default values if
// invalid.
//
func (crf *Runtime) Initialize(samples tabula.ClasetInterface) error {
	if crf.NStage <= 0 {
		crf.NStage = DefStage
	}
	if crf.TPRate <= 0 || crf.TPRate >= 1 {
		crf.TPRate = DefTPRate
	}
	if crf.TNRate <= 0 || crf.TNRate >= 1 {
		crf.TNRate = DefTNRate
	}
	if crf.NTree <= 0 {
		crf.NTree = DefNumTree
	}
	if crf.PercentBoot <= 0 {
		crf.PercentBoot = DefPercentBoot
	}
	if crf.NRandomFeature <= 0 {
		// Set default value to square-root of features.
		ncol := samples.GetNColumn() - 1
		crf.NRandomFeature = int(math.Sqrt(float64(ncol)))
	}
	if crf.PerfFile == "" {
		crf.PerfFile = DefPerfFile
	}
	if crf.StatFile == "" {
		crf.StatFile = DefStatFile
	}
	crf.tnset = samples.Clone().(*tabula.Claset)

	return crf.Runtime.Initialize()
}
Example #5
//
// computePerfByProbs will compute the classifier performance using the
// probabilities or scores `probs`.
//
// This currently only works for a two-class problem.
//
func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface,
	actuals []string, probs []float64,
) {
	vs := samples.GetClassValueSpace()
	nactuals := numerus.IntsTo64(samples.Counts())
	nclass := tekstus.WordsCountTokens(actuals, vs, false)

	pprev := math.Inf(-1)
	tp := int64(0)
	fp := int64(0)
	tpprev := int64(0)
	fpprev := int64(0)

	auc := float64(0)

	for x, p := range probs {
		if p != pprev {
			stat := Stat{}
			stat.SetTPRate(tp, nactuals[0])
			stat.SetFPRate(fp, nactuals[1])
			stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

			auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
			stat.SetAUC(auc)

			rt.perfs = append(rt.perfs, &stat)

			pprev = p
			tpprev = tp
			fpprev = fp
		}

		if actuals[x] == vs[0] {
			tp++
		} else {
			fp++
		}
	}

	stat := Stat{}
	stat.SetTPRate(tp, nactuals[0])
	stat.SetFPRate(fp, nactuals[1])
	stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

	auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
	auc = auc / float64(nclass[0]*nclass[1])
	stat.SetAUC(auc)

	rt.perfs = append(rt.perfs, &stat)

	if len(rt.perfs) >= 2 {
		// Replace the first stat with second stat, because of NaN
		// value on the first precision.
		rt.perfs[0] = rt.perfs[1]
	}
}
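
The AUC accumulation above follows the standard sorted-scores ROC construction: each time the score changes, the area between the previous and current un-normalized ROC points is added as a trapezoid, and the sum is normalized by the product of the class counts at the end. `trapezoidArea` itself is not shown in this excerpt; a plausible sketch, assuming it takes the raw counts and needs only the standard `math` package:

// trapezoidArea returns the area of the trapezoid between two consecutive
// un-normalized ROC points: the width |x1 - x2| times the average height
// (y1 + y2) / 2. This is an assumed reconstruction, not code from the
// excerpt above.
func trapezoidArea(x1, x2, y1, y2 int64) float64 {
	base := math.Abs(float64(x1) - float64(x2))
	height := (float64(y1) + float64(y2)) / 2.0
	return base * height
}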
Example #6
//
// Init will initialize the LNSmote runtime by checking the input values and
// setting them to defaults if not set or invalid.
//
func (in *Runtime) Init(dataset tabula.ClasetInterface) {
	in.Runtime.Init()

	in.NSynthetic = in.PercentOver / 100.0
	in.datasetRows = dataset.GetDataAsRows()

	in.minorset = tabula.SelectRowsWhere(dataset, in.ClassIndex,
		in.ClassMinor)

	in.outliers = make(tabula.Rows, 0)

	if DEBUG >= 1 {
		fmt.Println("[lnsmote] n:", in.NSynthetic)
		fmt.Println("[lnsmote] n minority:", in.minorset.Len())
	}
}
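
One subtlety in `in.PercentOver / 100.0`: if PercentOver is an int field (an assumption, its type is not shown here), the untyped constant 100.0 converts to int and the division truncates, so NSynthetic is the whole number of synthetic samples generated per minority instance:

package main

import "fmt"

func main() {
	percentOver := 200
	fmt.Println(percentOver / 100.0) // 2

	percentOver = 250
	fmt.Println(percentOver / 100.0) // still 2, the remainder is discarded
}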
Example #7
//
// ClassifySet, given samples, predicts their classes by running each sample
// through the forest, and returns the class predictions with a confusion
// matrix.
// `samples` is the set of samples that will be predicted, `sampleIds` is the
// index of each sample.
// If `sampleIds` is not nil, then the sample index will be checked in each
// tree; if the sample was used for training, that tree's vote is not counted.
//
// Algorithm,
//
// (0) Get the value space (possible class values in the dataset).
// (1) For each row in the test-set,
// (1.1) collect votes from all trees,
// (1.2) select the majority class vote, and
// (1.3) save the probability of the first class in the value space.
// (2) Compute the confusion matrix from the predictions.
// (3) Compute the stat from the confusion matrix.
// (4) Write the stat to file only if sampleIds is empty, which means the run
// is not from the OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := tekstus.WordsProbabilitiesOf(votes, vs, false)

		_, idx, ok := numerus.Floats64FindMax(classProbs)

		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3)
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}
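
Step (1.2) reduces each row's tree votes to class probabilities over the value space and picks the class with the highest probability. A self-contained sketch of that reduction; `probabilitiesOf` and the max scan below are stand-ins for tekstus.WordsProbabilitiesOf and numerus.Floats64FindMax, written here only for illustration:

package main

import "fmt"

// probabilitiesOf counts each class value in votes and divides by the total
// number of votes, in the order given by the value space vs.
func probabilitiesOf(votes, vs []string) []float64 {
	probs := make([]float64, len(vs))
	for _, v := range votes {
		for i, c := range vs {
			if v == c {
				probs[i]++
			}
		}
	}
	for i := range probs {
		probs[i] /= float64(len(votes))
	}
	return probs
}

func main() {
	vs := []string{"1", "0"} // class value space
	votes := []string{"1", "0", "1", "1"}

	probs := probabilitiesOf(votes, vs)

	// Pick the class index with the maximum probability.
	maxi := 0
	for i, p := range probs {
		if p > probs[maxi] {
			maxi = i
		}
	}
	fmt.Println(probs, "predicted:", vs[maxi]) // [0.75 0.25] predicted: 1
}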
Example #8
// SelectRandomFeature, if NRandomFeature is greater than zero, selects and
// computes the gain on n random features instead of on all features.
func (runtime *Runtime) SelectRandomFeature(D tabula.ClasetInterface) {
	if runtime.NRandomFeature <= 0 {
		// all features selected
		return
	}

	ncols := D.GetNColumn()

	// count all features minus class
	nfeature := ncols - 1
	if runtime.NRandomFeature >= nfeature {
		// Do nothing if the number of random features is equal to or
		// greater than the number of features in the dataset.
		return
	}

	// exclude class index and parent node index
	excludeIdx := []int{D.GetClassIndex()}
	cols := D.GetColumns()
	for x, col := range *cols {
		if (col.Flag & ColFlagParent) == ColFlagParent {
			excludeIdx = append(excludeIdx, x)
		} else {
			(*cols)[x].Flag |= ColFlagSkip
		}
	}

	// Select random features, excluding the features in `excludeIdx`.
	var pickedIdx []int
	for x := 0; x < runtime.NRandomFeature; x++ {
		idx := numerus.IntPickRandPositive(ncols, false, pickedIdx,
			excludeIdx)
		pickedIdx = append(pickedIdx, idx)

		// Remove skip flag on selected column
		col := D.GetColumn(idx)
		col.Flag = col.Flag &^ ColFlagSkip
	}

	if DEBUG >= 1 {
		fmt.Println("[cart] selected random features:", pickedIdx)
		fmt.Println("[cart] selected columns        :", D.GetColumns())
	}
}
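
The selection loop draws NRandomFeature distinct column indices while skipping both already-picked indices and the excluded ones (class and parent columns). A minimal sketch of that draw, with a hypothetical pickRand in place of numerus.IntPickRandPositive:

package main

import (
	"fmt"
	"math/rand"
)

// contains reports whether v is in s.
func contains(s []int, v int) bool {
	for _, x := range s {
		if x == v {
			return true
		}
	}
	return false
}

// pickRand draws a random index in [0, max) that is in neither list. The
// caller must guarantee that enough free indices remain.
func pickRand(max int, picked, excluded []int) int {
	for {
		idx := rand.Intn(max)
		if !contains(picked, idx) && !contains(excluded, idx) {
			return idx
		}
	}
}

func main() {
	ncols := 10
	excluded := []int{9} // e.g. the class column index

	var picked []int
	for x := 0; x < 3; x++ { // NRandomFeature = 3
		picked = append(picked, pickRand(ncols, picked, excluded))
	}
	fmt.Println("picked features:", picked)
}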
Example #9
//
// Performance, given the actual class labels and their probabilities,
// computes the performance statistics of the classifier.
//
// Algorithm,
// (1) Sort the probabilities in descending order.
// (2) Sort the actuals and predicts using the sorted indices from probs.
// (3) Compute the tpr, fpr, and precision.
// (4) Write the performance to file.
//
func (rt *Runtime) Performance(samples tabula.ClasetInterface,
	predicts []string, probs []float64,
) (
	perfs Stats,
) {
	// (1)
	actuals := samples.GetClassAsStrings()
	sortedIds := numerus.IntCreateSeq(0, len(probs)-1)
	numerus.Floats64InplaceMergesort(probs, sortedIds, 0, len(probs),
		false)

	// (2)
	tekstus.StringsSortByIndex(&actuals, sortedIds)
	tekstus.StringsSortByIndex(&predicts, sortedIds)

	// (3)
	rt.computePerfByProbs(samples, actuals, probs)

	return rt.perfs
}
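
Steps (1) and (2) sort the probabilities in descending order and then reorder the actual and predicted labels with the same permutation, so all three slices stay aligned. The same idea can be expressed with the standard library alone; the following is only an illustration of the technique, not the numerus/tekstus implementation:

package main

import (
	"fmt"
	"sort"
)

func main() {
	probs := []float64{0.2, 0.9, 0.5}
	actuals := []string{"neg", "pos", "pos"}
	predicts := []string{"neg", "pos", "neg"}

	// Sort a permutation of the indices by descending probability...
	ids := []int{0, 1, 2}
	sort.Slice(ids, func(a, b int) bool {
		return probs[ids[a]] > probs[ids[b]]
	})

	// ...then apply the same permutation to every parallel slice.
	reorder := func(s []string) []string {
		out := make([]string, len(s))
		for i, id := range ids {
			out[i] = s[id]
		}
		return out
	}
	sorted := make([]float64, len(probs))
	for i, id := range ids {
		sorted[i] = probs[id]
	}

	fmt.Println(sorted)            // [0.9 0.5 0.2]
	fmt.Println(reorder(actuals))  // [pos pos neg]
	fmt.Println(reorder(predicts)) // [pos neg neg]
}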
Example #10
//
// refillWithFP will copy the false-positive data in the true-negative set
// `tnset` and append it to the training set `samples`.
//
func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
	cm *classifier.CM,
) {
	// Get and sort FP.
	fpids := cm.FPIndices()
	sort.Ints(fpids)

	// Move FP samples from TN-set to training set samples.
	for _, i := range fpids {
		samples.PushRow(tnset.GetRow(i))
	}

	// Delete the FP rows from the TN-set.
	var row *tabula.Row
	c := 0
	for x, i := range fpids {
		row = tnset.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if DEBUG >= 1 {
		fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
	}
}
Example #11
/*
Build the forest using the samples dataset.

Algorithm,

(0) Recheck the input values: number of trees, bootstrap percentage, etc.;
    and open the statistic file output.
(1) For 0 to NTree,
(1.1) create a new tree, retrying until it has been built successfully.
(2) Compute and write the total statistics.
*/
func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	// check input samples
	if samples == nil {
		return ErrNoInput
	}

	// (0)
	e = forest.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training set    :", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Forest config   :", forest)

	// (1)
	for t := 0; t < forest.NTree; t++ {
		if DEBUG >= 1 {
			fmt.Println(tag, "tree #", t)
		}

		// (1.1)
		for {
			_, _, e = forest.GrowTree(samples)
			if e == nil {
				break
			}

			fmt.Println(tag, "error:", e)
		}
	}

	// (2)
	return forest.Finalize()
}
Example #12
/*
ClassifySet sets the class attribute based on the tree classification.
*/
func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) {
	nrow := data.GetNRow()
	targetAttr := data.GetClassColumn()

	for i := 0; i < nrow; i++ {
		class := runtime.Classify(data.GetRow(i))

		_ = (*targetAttr).Records[i].SetValue(class, tabula.TString)
	}

	return
}
Example #13
/*
computeGain calculates the Gini index for each value in each attribute.
*/
func (runtime *Runtime) computeGain(D tabula.ClasetInterface) (
	gains []gini.Gini,
) {
	switch runtime.SplitMethod {
	case SplitMethodGini:
		// Create gain values for all attributes minus the target class.
		gains = make([]gini.Gini, D.GetNColumn())
	}

	runtime.SelectRandomFeature(D)

	classVS := D.GetClassValueSpace()
	classIdx := D.GetClassIndex()
	classType := D.GetClassType()

	for x, col := range *D.GetColumns() {
		// skip class attribute.
		if x == classIdx {
			continue
		}

		// skip column flagged with parent
		if (col.Flag & ColFlagParent) == ColFlagParent {
			gains[x].Skip = true
			continue
		}

		// ignore column flagged with skip
		if (col.Flag & ColFlagSkip) == ColFlagSkip {
			gains[x].Skip = true
			continue
		}

		// compute gain.
		if col.GetType() == tabula.TReal {
			attr := col.ToFloatSlice()

			if classType == tabula.TString {
				target := D.GetClassAsStrings()
				gains[x].ComputeContinu(&attr, &target,
					&classVS)
			} else {
				targetReal := D.GetClassAsReals()
				classVSReal := tekstus.StringsToFloat64(
					classVS)

				gains[x].ComputeContinuFloat(&attr,
					&targetReal, &classVSReal)
			}
		} else {
			attr := col.ToStringSlice()
			attrV := col.ValueSpace

			if DEBUG >= 2 {
				fmt.Println("[cart] attr :", attr)
				fmt.Println("[cart] attrV:", attrV)
			}

			target := D.GetClassAsStrings()
			gains[x].ComputeDiscrete(&attr, &attrV, &target,
				&classVS)
		}

		if DEBUG >= 2 {
			fmt.Println("[cart] gain :", gains[x])
		}
	}
	return
}
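
For reference, the quantity behind gini.Gini: the Gini index of a set is one minus the sum of the squared class proportions, and a split's gain is the parent's index minus the size-weighted indices of the children. A small sketch of the arithmetic (not the gini package's actual code):

package main

import "fmt"

// giniIndex returns 1 - sum(p_i^2) for a set described by its class counts.
func giniIndex(counts []int) float64 {
	total := 0
	for _, c := range counts {
		total += c
	}
	if total == 0 {
		return 0
	}
	g := 1.0
	for _, c := range counts {
		p := float64(c) / float64(total)
		g -= p * p
	}
	return g
}

func main() {
	parent := []int{5, 5} // 5 positive, 5 negative rows
	left := []int{4, 1}   // left child after the split
	right := []int{1, 4}  // right child after the split

	gain := giniIndex(parent) -
		0.5*giniIndex(left) - // each child holds 5 of the 10 rows
		0.5*giniIndex(right)

	fmt.Printf("gain = %.2f\n", gain) // gain = 0.18
}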
Example #14
/*
splitTreeByGain calculates the gain on the whole dataset, and splits it into
two nodes: left and right.

Return the node with the split information.
*/
func (runtime *Runtime) splitTreeByGain(D tabula.ClasetInterface) (
	node *binary.BTNode,
	e error,
) {
	node = &binary.BTNode{}

	D.RecountMajorMinor()

	// If the dataset is empty, return a leaf node labeled with the
	// majority class in the dataset.
	nrow := D.GetNRow()

	if nrow <= 0 {
		if DEBUG >= 2 {
			fmt.Printf("[cart] empty dataset (%s) : %v\n",
				D.MajorityClass(), D)
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  D.MajorityClass(),
			Size:   0,
		}
		return node, nil
	}

	// If the whole dataset is in a single class, return a leaf node with
	// the class set to that class.
	single, name := D.IsInSingleClass()
	if single {
		if DEBUG >= 2 {
			fmt.Printf("[cart] in single class (%s): %v\n", name,
				D.GetColumns())
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  name,
			Size:   nrow,
		}
		return node, nil
	}

	if DEBUG >= 2 {
		fmt.Println("[cart] D:", D)
	}

	// calculate the Gini gain for each attribute.
	gains := runtime.computeGain(D)

	// get attribute with maximum Gini gain.
	MaxGainIdx := gini.FindMaxGain(&gains)
	MaxGain := gains[MaxGainIdx]

	// If the max gain value is 0, use the majority class as the node and
	// terminate the process.
	if MaxGain.GetMaxGainValue() == 0 {
		if DEBUG >= 2 {
			fmt.Println("[cart] max gain 0 with target",
				D.GetClassAsStrings(),
				" and majority class is ", D.MajorityClass())
		}

		node.Value = NodeValue{
			IsLeaf: true,
			Class:  D.MajorityClass(),
			Size:   0,
		}
		return node, nil
	}

	// Using the sorted index in MaxGain, sort all fields in the dataset.
	tabula.SortColumnsByIndex(D, MaxGain.SortedIndex)

	if DEBUG >= 2 {
		fmt.Println("[cart] maxgain:", MaxGain)
	}

	// Now that we have the attribute with the max gain in MaxGainIdx, and
	// its gain and partition value in gains[MaxGainIdx] and
	// GetMaxPartGainValue(), we split the dataset based on the type of
	// the max-gain attribute.
	// If it is continuous, split the attribute using a numeric value.
	// If it is discrete, split the attribute using a subset (partition)
	// of nominal values.
	var splitV interface{}

	if MaxGain.IsContinu {
		splitV = MaxGain.GetMaxPartGainValue()
	} else {
		attrPartV := MaxGain.GetMaxPartGainValue()
		attrSubV := attrPartV.(tekstus.ListStrings)
		splitV = attrSubV[0].Normalize()
	}

	if DEBUG >= 2 {
		fmt.Println("[cart] maxgainindex:", MaxGainIdx)
		fmt.Println("[cart] split v:", splitV)
	}

	node.Value = NodeValue{
		SplitAttrName: D.GetColumn(MaxGainIdx).GetName(),
		IsLeaf:        false,
		IsContinu:     MaxGain.IsContinu,
		Size:          nrow,
		SplitAttrIdx:  MaxGainIdx,
		SplitV:        splitV,
	}

	dsL, dsR, e := tabula.SplitRowsByValue(D, MaxGainIdx, splitV)

	if e != nil {
		return node, e
	}

	splitL := dsL.(tabula.ClasetInterface)
	splitR := dsR.(tabula.ClasetInterface)

	// Set the parent flag on the attribute referenced by MaxGainIdx, so
	// it will not be computed again in the next round.
	cols := splitL.GetColumns()
	for x := range *cols {
		if x == MaxGainIdx {
			(*cols)[x].Flag = ColFlagParent
		} else {
			(*cols)[x].Flag = 0
		}
	}

	cols = splitR.GetColumns()
	for x := range *cols {
		if x == MaxGainIdx {
			(*cols)[x].Flag = ColFlagParent
		} else {
			(*cols)[x].Flag = 0
		}
	}

	nodeLeft, e := runtime.splitTreeByGain(splitL)
	if e != nil {
		return node, e
	}

	nodeRight, e := runtime.splitTreeByGain(splitR)
	if e != nil {
		return node, e
	}

	node.SetLeft(nodeLeft)
	node.SetRight(nodeRight)

	return node, nil
}
Example #15
//
// ClassifySetByWeight will classify each instance in samples by weighting
// each stage's votes with that stage's individual performance.
//
// Algorithm,
// (1) For each instance in samples,
// (1.1) for each stage,
// (1.1.1) collect the votes for the instance in the current stage,
// (1.1.2) compute the probability of each class in the votes,
//
//		prob_class = count_of_class / total_votes
//
// (1.1.3) compute the total of the probabilities times the stage weight,
//
//		stage_prob = prob_class * stage_weight
//
// (1.2) Divide each class's stage probability by
//
//		stage_prob = stage_prob /
//			(sum_of_all_weights * number_of_tree_in_forest)
//
// (1.3) Select the class label with the highest probability.
// (1.4) Save the stage probability of the positive class.
// (2) Compute the confusion matrix.
//
func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	vs := samples.GetClassValueSpace()
	stageProbs := make([]float64, len(vs))
	stageSumProbs := make([]float64, len(vs))
	sumWeights := numerus.Floats64Sum(crf.weights)

	// (1)
	rows := samples.GetDataAsRows()
	for _, row := range *rows {
		// Reset the per-row accumulators.
		for y := range stageProbs {
			stageProbs[y] = 0
			stageSumProbs[y] = 0
		}

		// (1.1)
		for y, forest := range crf.forests {
			// (1.1.1)
			votes := forest.Votes(row, -1)

			// (1.1.2)
			probs := tekstus.WordsProbabilitiesOf(votes, vs, false)

			// (1.1.3)
			for z := range probs {
				stageSumProbs[z] += probs[z]
				stageProbs[z] += probs[z] * crf.weights[y]
			}
		}

		// (1.2)
		stageWeight := sumWeights * float64(crf.NTree)

		for x := range stageProbs {
			stageProbs[x] = stageProbs[x] / stageWeight
		}

		// (1.3)
		_, maxi, ok := numerus.Floats64FindMax(stageProbs)
		if ok {
			predicts = append(predicts, vs[maxi])
		}

		probs = append(probs, stageSumProbs[0]/
			float64(len(crf.forests)))
	}

	// (2)
	actuals := samples.GetClassAsStrings()
	cm = crf.ComputeCM(sampleIds, vs, actuals, predicts)

	crf.ComputeStatFromCM(&stat, cm)
	stat.End()

	_ = stat.Write(crf.StatFile)

	return predicts, cm, probs
}
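
A worked instance of the accumulation in (1.1.3) and the normalization in (1.2), using two stages with assumed weights and per-stage class probabilities (all numbers are illustrative only):

package main

import "fmt"

func main() {
	// Two stages with assumed weights, two classes, NTree = 10.
	weights := []float64{0.8, 0.6}
	stageClassProbs := [][]float64{{0.7, 0.3}, {0.4, 0.6}}
	nTree := 10.0

	sumWeights := weights[0] + weights[1]

	stageProbs := make([]float64, 2)
	for y, probs := range stageClassProbs {
		for z, p := range probs {
			stageProbs[z] += p * weights[y] // (1.1.3)
		}
	}
	for z := range stageProbs { // (1.2)
		stageProbs[z] /= sumWeights * nTree
	}

	// stageProbs[0] = 0.8/14 ~ 0.0571, stageProbs[1] = 0.6/14 ~ 0.0429,
	// so the class at index 0 wins the vote.
	fmt.Println(stageProbs)
}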
Example #16
//
// createForest will create and return a forest and run the training
// `samples` on it.
//
// Algorithm,
// (1) Initialize the forest.
// (2) For 0 to the maximum number of trees in the forest,
// (2.1) grow one tree until it succeeds,
// (2.2) if the tree's tp-rate and tn-rate are greater than the thresholds,
// stop growing.
// (3) Calculate the weight.
// (4) TODO: Move the true-negatives from samples. The collection of
// true-negatives will be used again to test the model, and after testing,
// the samples with FP will be moved back to the training samples.
// (5) Refill samples with the false-positives.
//
func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
	forest *rf.Runtime, e error,
) {
	var cm *classifier.CM
	var stat *classifier.Stat

	fmt.Println(tag, "Forest samples:", samples)

	// (1)
	forest = &rf.Runtime{
		Runtime: classifier.Runtime{
			RunOOB: true,
		},
		NTree:          crf.NTree,
		NRandomFeature: crf.NRandomFeature,
	}

	e = forest.Initialize(samples)
	if e != nil {
		return nil, e
	}

	// (2)
	for t := 0; t < crf.NTree; t++ {
		if DEBUG >= 2 {
			fmt.Println(tag, "Tree #", t)
		}

		// (2.1)
		for {
			cm, stat, e = forest.GrowTree(samples)
			if e == nil {
				break
			}
		}

		// (2.2)
		if stat.TPRate > crf.TPRate &&
			stat.TNRate > crf.TNRate {
			break
		}
	}

	e = forest.Finalize()
	if e != nil {
		return nil, e
	}

	// (3)
	crf.computeWeight(stat)

	if DEBUG >= 1 {
		fmt.Println(tag, "Weight:", stat.FMeasure)
	}

	// (4)
	crf.deleteTrueNegative(samples, cm)

	// (5)
	crf.runTPSet(samples)

	samples.RecountMajorMinor()

	return forest, nil
}
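
The debug line above prints stat.FMeasure as the weight, which suggests each stage's weight is its F-measure. For reference, the usual balanced F-score; this is a sketch, not the classifier package's code:

// fMeasure returns the balanced F-score (F1) of a precision/recall pair.
func fMeasure(precision, recall float64) float64 {
	if precision+recall == 0 {
		return 0
	}
	return 2 * precision * recall / (precision + recall)
}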