func (trainer *AdaboostClassifierTrainer) trainNextBaseEstimator(data_set *mlearn.DataSet, step int) ( float64, mlearn.BaseClassifier, float64) { base_estimator := trainer.baseModelTrainer.TrainClassifierWithWeights(data_set, trainer.weights) // embedded features ranking if trainer.options.EnableEmbeddedFeaturesRanking { for j := range trainer.EmbeddedFeaturesRank { trainer.EmbeddedFeaturesRank[j] += trainer.baseModelTrainer.GetFeaturesRank()[j] } } // save the predictions for later use sample := make([]float64, data_set.FeaturesNum) for i := range trainer.prediction { data_set.GetSampleInplace(i, sample) trainer.prediction[i] = base_estimator.PredictProbe(sample) } // calculate the weighted prediction error var err float64 for i := 0; i < data_set.SamplesNum; i++ { if trainer.prediction[i] != data_set.Classes[i] { err += trainer.weights[i] } } // check it to prevent math.Log domain violation (err == 0 means that there's nothing to boost) if err == 0 { return 1.0, base_estimator, err } b := math.Log((1.0-err)/err) + math.Log(float64(data_set.ClassesNum)-1.0) // update weights according to error const minimalWeight float64 = 10e-9 var weights_sum float64 for i := range trainer.weights { if trainer.prediction[i] != data_set.Classes[i] { trainer.weights[i] *= math.Exp(b) if trainer.weights[i] < minimalWeight { /* this is to deal with rounding errors */ trainer.weights[i] = 0 } } else { //trainer.weights[i] *= math.Exp(-b) } weights_sum += trainer.weights[i] } // normalize weights to 1.0 sum var normalized_sum float64 for i := range trainer.weights { trainer.weights[i] /= weights_sum normalized_sum += trainer.weights[i] } // dirty hack to guarantee 1.0 weights sum which is important to base model (regardless of rounding errors) trainer.weights[0] += 1.0 - normalized_sum return b, base_estimator, err }
func testEmbeddedFeaturesFiltering(numBaseModels, maxTreeHeight int, train_dataset, test_dataset *mlearn.DataSet) ( []testEmbeddedFeaturesFilteringResult, float64) { fmt.Println("training adaboost classifier with embedded features filtering ...") cartTrainer := adaboost.NewCARTClassifierTrainer(train_dataset, adaboost.CARTClassifierTrainOptions{MaxDepth: int(maxTreeHeight), MinElementsInLeaf: 10, EnableEmbeddedFeaturesRanking: true}) adaboostTrainer := adaboost.NewAdaboostClassifierTrainer(cartTrainer) classifier := adaboostTrainer.TrainClassifier(train_dataset, adaboost.AdaboostClassifierTrainOptions{MaxEstimators: numBaseModels, EnableEmbeddedFeaturesRanking: true}) ranked_features := adaboostTrainer.GetRankedFeatures() fmt.Println("predicting ...") predictions := make([]int, len(test_dataset.Classes)) for i := range predictions { predictions[i] = classifier.PredictProbe(test_dataset.GetSample(i)) } precision, recall, ref_f1 := mlearn.PrecisionRecallF1(predictions, test_dataset.Classes, test_dataset.ClassesNum) fmt.Printf("reference precision: %.3v recall: %.3v f1: %.3v \n\n", precision, recall, ref_f1) var test_results []testEmbeddedFeaturesFilteringResult for j := 10; j < test_dataset.FeaturesNum; j++ { fmt.Printf("now training again using only %v of selected features ...\n", j) selected_features := ranked_features[:j] fmt.Println("selected features are: ") fmt.Println(selected_features) train_data_subset := &mlearn.DataSet{} test_data_subset := &mlearn.DataSet{} *train_data_subset = *train_dataset *test_data_subset = *test_dataset train_data_subset.SubsetFeatures(selected_features) test_data_subset.SubsetFeatures(selected_features) fmt.Println() cartTrainer = adaboost.NewCARTClassifierTrainer(train_data_subset, adaboost.CARTClassifierTrainOptions{MaxDepth: int(maxTreeHeight), MinElementsInLeaf: 10}) adaboostTrainer = adaboost.NewAdaboostClassifierTrainer(cartTrainer) classifier = adaboostTrainer.TrainClassifier(train_data_subset, adaboost.AdaboostClassifierTrainOptions{MaxEstimators: numBaseModels}) fmt.Println("predicting ...") predictions = make([]int, len(test_data_subset.Classes)) for i := range predictions { predictions[i] = classifier.PredictProbe(test_data_subset.GetSample(i)) } precision, recall, f1 := mlearn.PrecisionRecallF1(predictions, test_dataset.Classes, test_dataset.ClassesNum) fmt.Printf("precision: %.3v recall: %.3v f1: %.3v \n", precision, recall, f1) test_results = append(test_results, testEmbeddedFeaturesFilteringResult{FeaturesCount: j, F1: f1}) } return test_results, ref_f1 }
func testCARTClassifier(train_dataset, test_dataset *mlearn.DataSet) { fmt.Println("testing CART classifier itself ...") cartTrainer := adaboost.NewCARTClassifierTrainer(train_dataset, adaboost.CARTClassifierTrainOptions{MaxDepth: 0, MinElementsInLeaf: 1}) classifier := cartTrainer.TrainClassifier(train_dataset) predictions := make([]int, len(test_dataset.Classes)) for i := range predictions { predictions[i] = classifier.PredictProbe(test_dataset.GetSample(i)) } precision, recall, f1 := mlearn.PrecisionRecallF1(predictions, test_dataset.Classes, test_dataset.ClassesNum) // here we expect f1 == 1.0 fmt.Printf("precision: %.3v recall: %.3v f1: %.3v \n\n", precision, recall, f1) }
func testAdaboostClassifier(numBaseModels, maxTreeHeight, minLeafItems int, train_dataset, test_dataset *mlearn.DataSet) { fmt.Println("training adaboost classifier over CART trees ...") cartTrainer := adaboost.NewCARTClassifierTrainer(train_dataset, adaboost.CARTClassifierTrainOptions{MaxDepth: maxTreeHeight, MinElementsInLeaf: minLeafItems}) adaboostTrainer := adaboost.NewAdaboostClassifierTrainer(cartTrainer) classifier := adaboostTrainer.TrainClassifier(train_dataset, adaboost.AdaboostClassifierTrainOptions{MaxEstimators: numBaseModels}) fmt.Println("predicting ...") predictions := make([]int, len(test_dataset.Classes)) for i := range predictions { predictions[i] = classifier.PredictProbe(test_dataset.GetSample(i)) } precision, recall, f1 := mlearn.PrecisionRecallF1(predictions, test_dataset.Classes, test_dataset.ClassesNum) fmt.Printf("\nprecision: %.3v recall: %.3v f1: %.3v \n", precision, recall, f1) }