func SplitFile(dataset *core.DataSet, total, part int) (*core.DataSet, *core.DataSet) { train := core.NewDataSet() test := core.NewDataSet() for i, sample := range dataset.Samples { if i%total == part { test.AddSample(sample) } else { train.AddSample(sample) } } return train, test }
func main() { train_path, _, _, method, params := hector.PrepareParams() global, _ := strconv.ParseInt(params["global"], 10, 64) profile, _ := params["profile"] dataset := core.NewDataSet() dataset.Load(train_path, global) cv, _ := strconv.ParseInt(params["cv"], 10, 32) total := int(cv) if profile != "" { fmt.Println(profile) f, err := os.Create(profile) if err != nil { fmt.Println("%v", err) log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } average_auc := 0.0 for part := 0; part < total; part++ { train, test := SplitFile(dataset, total, part) classifier := hector.GetClassifier(method) classifier.Init(params) auc, _ := hector.AlgorithmRunOnDataSet(classifier, train, test, "", params) fmt.Println("AUC:") fmt.Println(auc) average_auc += auc classifier = nil } fmt.Println(average_auc / float64(total)) }
func main() { train_path, _, _, method, params := hector.PrepareParams() global, _ := strconv.ParseInt(params["global"], 10, 64) profile, _ := params["profile"] dataset := core.NewDataSet() dataset.Load(train_path, global) cv, _ := strconv.ParseInt(params["cv"], 10, 32) total := int(cv) if profile != "" { f, err := os.Create(profile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } average_accuracy := 0.0 for part := 0; part < total; part++ { train, test := SplitFile(dataset, total, part) classifier := hector.GetMutliClassClassifier(method) classifier.Init(params) accuracy := hector.MultiClassRunOnDataSet(classifier, train, test, "", params) fmt.Println("accuracy : ", accuracy) average_accuracy += accuracy classifier = nil train = nil test = nil runtime.GC() } fmt.Println(average_accuracy / float64(total)) }
func MultiClassRun(classifier algo.MultiClassClassifier, train_path string, test_path string, pred_path string, params map[string]string) (float64, error) { global, _ := strconv.ParseInt(params["global"], 10, 64) train_dataset := core.NewDataSet() err := train_dataset.Load(train_path, global) if err != nil { return 0.5, err } test_dataset := core.NewDataSet() err = test_dataset.Load(test_path, global) if err != nil { return 0.5, err } classifier.Init(params) accuracy := MultiClassRunOnDataSet(classifier, train_dataset, test_dataset, pred_path, params) return accuracy, nil }
func AlgorithmRun(classifier algo.Classifier, train_path string, test_path string, pred_path string, params map[string]string) (float64, []*eval.LabelPrediction, error) { global, _ := strconv.ParseInt(params["global"], 10, 64) train_dataset := core.NewDataSet() err := train_dataset.Load(train_path, global) if err != nil { return 0.5, nil, err } test_dataset := core.NewDataSet() err = test_dataset.Load(test_path, global) if err != nil { return 0.5, nil, err } classifier.Init(params) auc, predictions := AlgorithmRunOnDataSet(classifier, train_dataset, test_dataset, pred_path, params) return auc, predictions, nil }
func constructTrainingData(records []creditRecord, borrower2iuser []int, protos []*hector.Sample) *hector.DataSet { data := hector.NewDataSet() for borrower, record := range records { for i := 0; i < record.borrowed; i++ { s := protos[borrower2iuser[borrower]].Clone() if i < record.returned { s.Label = 1 } else { s.Label = 0 } data.AddSample(s) } } return data }
func main() { path := flag.String("input", "", "path of dataset") flag.Parse() ds := core.NewDataSet() ds.Load(*path, -1) iv := core.InformationValue(ds) fs := make(FeatureValueList, 0, len(iv)) for f, v := range iv { fs = append(fs, FeatureValue{Name: ds.FeatureNameIdMap[f], Value: v}) } sort.Sort(fs) for _, f := range fs { fmt.Printf("%s\t%v\n", f.Name, f.Value) } }
func (c *L1VM) Train(dataset *core.DataSet) { c.sv = []*core.Vector{} kernel_dataset := core.NewDataSet() positive := []int{} negative := []int{} for i, si := range dataset.Samples { if si.Label > 0.0 { positive = append(positive, i) } else { negative = append(negative, i) } } perm_positive := rand.Perm(len(positive)) for i, k := range perm_positive { if i > c.count { break } c.sv = append(c.sv, dataset.Samples[positive[k]].GetFeatureVector()) } perm_negative := rand.Perm(len(negative)) for i, k := range perm_negative { if i > c.count { break } c.sv = append(c.sv, dataset.Samples[negative[k]].GetFeatureVector()) } for _, si := range dataset.Samples { xi := si.GetFeatureVector() tsample := core.NewSample() tsample.Label = si.Label for j, xj := range c.sv { tsample.AddFeature(core.Feature{Id: int64(j), Value: RBFKernel(xi, xj, c.radius)}) } kernel_dataset.AddSample(tsample) } c.ftrl.Train(kernel_dataset) }
func MultiClassTest(classifier algo.MultiClassClassifier, test_path string, pred_path string, params map[string]string) (float64, error) { global, _ := strconv.ParseInt(params["global"], 10, 64) model_path, _ := params["model"] classifier.Init(params) if model_path != "" { classifier.LoadModel(model_path) } else { return 0.0, nil } test_dataset := core.NewDataSet() err := test_dataset.Load(test_path, global) if err != nil { return 0.0, err } accuracy := MultiClassRunOnDataSet(classifier, nil, test_dataset, pred_path, params) return accuracy, nil }
func MultiClassTrain(classifier algo.MultiClassClassifier, train_path string, params map[string]string) error { global, _ := strconv.ParseInt(params["global"], 10, 64) train_dataset := core.NewDataSet() err := train_dataset.Load(train_path, global) if err != nil { return err } classifier.Init(params) classifier.Train(train_dataset) model_path, _ := params["model"] if model_path != "" { classifier.SaveModel(model_path) } return nil }
func AlgorithmTest(classifier algo.Classifier, test_path string, pred_path string, params map[string]string) (float64, []*eval.LabelPrediction, error) { global, _ := strconv.ParseInt(params["global"], 10, 64) model_path, _ := params["model"] classifier.Init(params) if model_path != "" { classifier.LoadModel(model_path) } else { return 0.0, nil, nil } test_dataset := core.NewDataSet() err := test_dataset.Load(test_path, global) if err != nil { return 0.0, nil, err } auc, predictions := AlgorithmRunOnDataSet(classifier, nil, test_dataset, pred_path, params) return auc, predictions, nil }
func EncodeLabelAction(e *core.LabelEncoder, data_path string) { dataset := core.NewDataSet() err := dataset.Load(data_path, -1) if err != nil { log.Fatal(err) return } encoded_label_dataset := e.TransformDataset(dataset) var output_file *os.File output_file, _ = os.Create(data_path + ".hector") for _, sample := range encoded_label_dataset.Samples { output_file.WriteString(string(sample.ToString(false)) + "\n") } if output_file != nil { defer output_file.Close() } }
func main() { train_path, test_path, pred_path, _, params := hector.PrepareParams() total := 5 methods := []string{"ftrl", "fm"} all_methods_predictions := [][]*eval.LabelPrediction{} all_methods_test_predictions := [][]*eval.LabelPrediction{} for _, method := range methods { fmt.Println(method) average_auc := 0.0 all_predictions := []*eval.LabelPrediction{} for part := 0; part < total; part++ { train, test, _ := SplitFile(train_path, total, part) classifier := hector.GetClassifier(method) auc, predictions, _ := hector.AlgorithmRun(classifier, train, test, "", params) fmt.Println("AUC:") fmt.Println(auc) average_auc += auc os.Remove(train) os.Remove(test) classifier = nil for _, pred := range predictions { all_predictions = append(all_predictions, pred) } } all_methods_predictions = append(all_methods_predictions, all_predictions) fmt.Println(average_auc / float64(total)) classifier := hector.GetClassifier(method) fmt.Println(test_path) _, test_predictions, _ := hector.AlgorithmRun(classifier, train_path, test_path, "", params) all_methods_test_predictions = append(all_methods_test_predictions, test_predictions) } var wait sync.WaitGroup wait.Add(2) dataset := core.NewDataSet() go func() { for i, _ := range all_methods_predictions[0] { sample := core.NewSample() sample.Label = all_methods_predictions[0][i].Label for j, _ := range all_methods_predictions { feature := core.Feature{Id: int64(j), Value: all_methods_predictions[j][i].Prediction} sample.AddFeature(feature) } dataset.Samples <- sample } close(dataset.Samples) wait.Done() }() ensembler := lr.LinearRegression{} go func() { ensembler.Init(params) ensembler.Train(dataset) wait.Done() }() wait.Wait() fmt.Println(ensembler.Model) wait.Add(2) test_dataset := hector.NewDataSet() go func() { for i, _ := range all_methods_test_predictions[0] { sample := hector.NewSample() sample.Label = all_methods_test_predictions[0][i].Prediction for j, _ := range all_methods_test_predictions { feature := hector.Feature{Id: int64(j), Value: all_methods_test_predictions[j][i].Prediction} sample.AddFeature(feature) } test_dataset.Samples <- sample } close(test_dataset.Samples) wait.Done() }() go func() { pred_file, _ := os.Create(test_path + ".out") for sample := range test_dataset.Samples { prediction := sample.Label //ensembler.Predict(sample) pred_file.WriteString(strconv.FormatFloat(prediction, 'g', 5, 64) + "\n") } defer pred_file.Close() wait.Done() }() wait.Wait() }