Ejemplo n.º 1
0
// main splits an AFM feature matrix into cross-validation folds and writes a
// training file and a testing file for every fold.
//
// Steps, in order:
//  1. Parse the command line flags and validate -folds.
//  2. Load the matrix named by -fm.
//  3. Drop features listed in the -blacklist file and features with more than
//     -maxcats categories.
//  4. Optionally impute missing values (-impute) and one hot encode (-onehot,
//     -num).
//  5. Assign case indices to folds, stratified by the categories of -target
//     unless the target is missing, is not categorical, or -unstratified is
//     set.
//  6. For every fold write the test/train split in the requested formats.
//
// An invalid -folds value, a failure to load the matrix, or a failure to read
// the blacklist terminates the program through the log package.
func main() {
	fm := flag.String("fm",
		"featurematrix.afm", "AFM formated feature matrix containing data.")

	blacklist := flag.String("blacklist",
		"", "A list of feature id's to exclude from the set of predictors.")

	targetname := flag.String("target",
		"", "The row header of the target in the feature matrix.")
	train := flag.String("train",
		"train_%v.fm", "Format string for training fms.")
	test := flag.String("test",
		"test_%v.fm", "Format string for testing fms.")

	// var zipoutput bool
	// flag.BoolVar(&zipoutput, "zip", false, "Output ziped files.")
	var unstratified bool
	flag.BoolVar(&unstratified, "unstratified", false, "Force unstratified sampeling of categorical target.")

	var writelibsvm bool
	flag.BoolVar(&writelibsvm, "writelibsvm", false, "Output libsvm.")

	var writearff bool
	flag.BoolVar(&writearff, "writearff", false, "Output arff.")

	var writeall bool
	flag.BoolVar(&writeall, "writeall", false, "Output all three formats.")

	var folds int
	flag.IntVar(&folds, "folds", 5, "Number of folds to generate.")

	var maxcats int
	flag.IntVar(&maxcats, "maxcats", -1, "Maximum number of categories to allow in a feature.")

	var impute bool
	flag.BoolVar(&impute, "impute", false, "Impute missing values to feature mean/mode.")

	var onehot bool
	flag.BoolVar(&onehot, "onehot", false, "Do one hot encoding of categorical features to boolean true false.")

	var num bool
	flag.BoolVar(&num, "num", false, "Do one hot encoding of categorical features to numerical features.")

	flag.Parse()

	// The fold count is used as a divisor and as a slice capacity below, so
	// zero would panic with a divide by zero and a negative value would panic
	// inside make. Reject both before doing any work.
	if folds < 1 {
		log.Fatalf("Number of folds must be at least 1, got %v.", folds)
	}

	//Parse Data
	data, err := CloudForest.LoadAFM(*fm)
	if err != nil {
		log.Fatal(err)
	}

	// blacklistis[i] marks feature i for removal; blacklisted counts the
	// distinct marked features so the filtered containers can be pre-sized.
	blacklisted := 0
	blacklistis := make([]bool, len(data.Data))
	if *blacklist != "" {
		fmt.Printf("Loading blacklist from: %v\n", *blacklist)
		blackfile, err := os.Open(*blacklist)
		if err != nil {
			log.Fatal(err)
		}
		// The blacklist is read as tab separated records; only the first
		// column (the feature id) is used.
		tsv := csv.NewReader(blackfile)
		tsv.Comma = '\t'
		for {
			id, err := tsv.Read()
			if err == io.EOF {
				break
			} else if err != nil {
				log.Fatal(err)
			}
			// The target is never blacklisted, even when it is listed.
			if id[0] == *targetname {
				continue
			}
			i, ok := data.Map[id[0]]
			if !ok {
				fmt.Printf("Ignoring blacklist feature not found in data: %v\n", id[0])
				continue
			}
			// Count each feature once even if the file repeats it.
			if !blacklistis[i] {
				blacklisted++
				blacklistis[i] = true
			}

		}
		blackfile.Close()

	}

	// Rebuild the feature slice and the name->index map without the
	// blacklisted features and without features exceeding -maxcats
	// (-1 disables the category limit).
	// NOTE(review): unlike the blacklist, the -maxcats filter does not exempt
	// the target; a target with too many categories is dropped here and the
	// run then falls back to unstratified sampling.
	newdata := make([]CloudForest.Feature, 0, len(data.Data)-blacklisted)
	newmap := make(map[string]int, len(data.Data)-blacklisted)

	for i, f := range data.Data {
		if !blacklistis[i] && (maxcats == -1 || f.NCats() <= maxcats) {
			newmap[f.GetName()] = len(newdata)
			newdata = append(newdata, f)
		}
	}

	data.Data = newdata
	data.Map = newmap

	if impute {
		fmt.Println("Imputing missing values to feature mean/mode.")
		data.ImputeMissing()
	}

	if onehot {
		fmt.Println("OneHot encoding.")
		data.OneHot()
	}

	if num {
		fmt.Println("Numerical OneHot encoding.")
		data = data.EncodeToNum()
	}

	// foldis[j] collects the case indices that form the test set of fold j.
	foldis := make([][]int, 0, folds)

	foldsize := len(data.CaseLabels) / folds
	fmt.Printf("%v cases, foldsize %v\n", len(data.CaseLabels), foldsize)
	for i := 0; i < folds; i++ {
		foldis = append(foldis, make([]int, 0, foldsize))
	}

	//find the target feature
	fmt.Printf("Target : %v\n", *targetname)
	targeti, ok := data.Map[*targetname]
	if !ok {
		fmt.Println("Target not found in data, doing unstratified sampeling.")
		unstratified = true
	}

	// Stratification needs a categorical target. Any other feature type
	// (numerical included) falls back to unstratified sampling instead of
	// failing a type assertion later on.
	var catTarget *CloudForest.DenseCatFeature
	if ok {
		switch t := data.Data[targeti].(type) {
		case *CloudForest.DenseCatFeature:
			catTarget = t
		default:
			unstratified = true
		}
	}

	if unstratified {
		ncases := len(data.CaseLabels)
		cases := make([]int, ncases)
		for i := range cases {
			cases[i] = i
		}
		// NOTE(review): presumably shuffles cases in place, matching the
		// "shuffle in place" use on the stratified path; confirm against
		// CloudForest.SampleFirstN.
		CloudForest.SampleFirstN(&cases, nil, len(cases), 0)
		for j := 0; j < folds; j++ {
			foldis[j] = append(foldis[j], cases[j*foldsize:(j+1)*foldsize]...)
		}
		// ncases is rarely an exact multiple of folds. Hand the leftover
		// cases out one per fold so that no case is silently left out of
		// every train and test file.
		for k := folds * foldsize; k < ncases; k++ {
			foldis[k%folds] = append(foldis[k%folds], cases[k])
		}

	} else {
		//sample folds stratified by case
		fmt.Printf("Stratifying by %v classes.\n", catTarget.NCats())
		bSampler := CloudForest.NewBalancedSampler(catTarget)

		fmt.Printf("Stratifying by %v classes.\n", len(bSampler.Cases))
		var samples []int
		// spare is the fold that receives the next leftover case. It keeps
		// rotating across classes so the leftovers of many classes do not
		// all pile up in the first folds.
		spare := 0
		for i := 0; i < len(bSampler.Cases); i++ {
			fmt.Printf("%v cases in class %v.\n", len(bSampler.Cases[i]), i)
			//shuffle in place
			CloudForest.SampleFirstN(&bSampler.Cases[i], &samples, len(bSampler.Cases[i]), 0)
			class := bSampler.Cases[i]
			stratFoldSize := len(class) / folds
			for j := 0; j < folds; j++ {
				foldis[j] = append(foldis[j], class[j*stratFoldSize:(j+1)*stratFoldSize]...)
			}
			// Leftover cases of this class (all of them when the class has
			// fewer cases than folds) would otherwise vanish from the output.
			for _, c := range class[folds*stratFoldSize:] {
				foldis[spare%folds] = append(foldis[spare%folds], c)
				spare++
			}

		}
	}

	// The libsvm writer is handed a numerically encoded copy whenever at
	// least one feature reports categories.
	encode := false
	for _, f := range data.Data {
		if f.NCats() > 0 {
			encode = true
			break
		}
	}

	encoded := data
	if encode && (writelibsvm || writeall) {
		encoded = data.EncodeToNum()
	}

	// trainis is reused for every fold: it is truncated and refilled with the
	// cases of all other folds.
	trainis := make([]int, 0, foldsize*(folds-1))
	//Write training and testing matrixes
	// NOTE(review): the writers returned by openfiles are never closed or
	// flushed in this function; confirm whether openfiles' return values
	// require it.
	for i := 0; i < folds; i++ {

		trainfn := fmt.Sprintf(*train, i)
		testfn := fmt.Sprintf(*test, i)

		trainis = trainis[0:0]
		for j := 0; j < folds; j++ {
			if i != j {
				trainis = append(trainis, foldis[j]...)
			}
		}

		if writearff || writeall {
			trainW, testW := openfiles(trainfn+".arff", testfn+".arff")
			CloudForest.WriteArffCases(data, foldis[i], *targetname, testW)
			CloudForest.WriteArffCases(data, trainis, *targetname, trainW)
		}

		// The plain feature matrix format is the default when no other
		// format was requested, and is also part of -writeall.
		if ((!writelibsvm) && (!writearff)) || writeall {
			trainW, testW := openfiles(trainfn, testfn)
			data.WriteCases(testW, foldis[i])
			data.WriteCases(trainW, trainis)
		}

		if writelibsvm || writeall {
			trainW, testW := openfiles(trainfn+".libsvm", testfn+".libsvm")
			CloudForest.WriteLibSvmCases(encoded, foldis[i], *targetname, testW)
			CloudForest.WriteLibSvmCases(encoded, trainis, *targetname, trainW)
		}

		fmt.Printf("Wrote fold %v. %v testing cases and %v training cases.\n", i, len(foldis[i]), len(trainis))
	}

}
Ejemplo n.º 2
0
// main splits an AFM feature matrix into cross-validation folds and writes a
// training file and a testing file for every fold.
//
// Steps, in order:
//  1. Parse the command line flags and validate -folds.
//  2. Load the matrix named by -fm.
//  3. Assign case indices to folds, stratified by the categories of -target
//     unless the target is missing, is not categorical, or -unstratified is
//     set.
//  4. For every fold write the test/train split in the requested formats.
//
// An invalid -folds value or a failure to load the matrix terminates the
// program through the log package.
func main() {
	fm := flag.String("fm",
		"featurematrix.afm", "AFM formated feature matrix containing data.")

	targetname := flag.String("target",
		"", "The row header of the target in the feature matrix.")
	train := flag.String("train",
		"train_%v.fm", "Format string for training fms.")
	test := flag.String("test",
		"test_%v.fm", "Format string for testing fms.")

	// var zipoutput bool
	// flag.BoolVar(&zipoutput, "zip", false, "Output ziped files.")
	var unstratified bool
	flag.BoolVar(&unstratified, "unstratified", false, "Force unstratified sampeling of categorical target.")

	var writelibsvm bool
	flag.BoolVar(&writelibsvm, "writelibsvm", false, "Output libsvm.")

	var writearff bool
	flag.BoolVar(&writearff, "writearff", false, "Output arff.")

	var writeall bool
	flag.BoolVar(&writeall, "writeall", false, "Output all three formats.")

	var folds int
	flag.IntVar(&folds, "folds", 5, "Number of folds to generate.")

	flag.Parse()

	// The fold count is used as a divisor and as a slice capacity below, so
	// zero would panic with a divide by zero and a negative value would panic
	// inside make. Reject both before doing any work.
	if folds < 1 {
		log.Fatalf("Number of folds must be at least 1, got %v.", folds)
	}

	//Parse Data
	data, err := CloudForest.LoadAFM(*fm)
	if err != nil {
		log.Fatal(err)
	}

	// foldis[j] collects the case indices that form the test set of fold j.
	foldis := make([][]int, 0, folds)

	foldsize := len(data.CaseLabels) / folds
	fmt.Printf("%v cases, foldsize %v\n", len(data.CaseLabels), foldsize)
	for i := 0; i < folds; i++ {
		foldis = append(foldis, make([]int, 0, foldsize))
	}

	//find the target feature
	fmt.Printf("Target : %v\n", *targetname)
	targeti, ok := data.Map[*targetname]
	if !ok {
		fmt.Println("Target not found in data, doing unstratified sampeling.")
		unstratified = true
	}

	// Stratification needs a categorical target. Any other feature type
	// (numerical included) falls back to unstratified sampling instead of
	// failing a type assertion later on.
	var catTarget *CloudForest.DenseCatFeature
	if ok {
		switch t := data.Data[targeti].(type) {
		case *CloudForest.DenseCatFeature:
			catTarget = t
		default:
			unstratified = true
		}
	}

	if unstratified {
		ncases := len(data.CaseLabels)
		cases := make([]int, ncases)
		for i := range cases {
			cases[i] = i
		}
		// NOTE(review): presumably shuffles cases in place, matching the
		// "shuffle in place" use on the stratified path; confirm against
		// CloudForest.SampleFirstN.
		CloudForest.SampleFirstN(&cases, nil, len(cases), 0)
		for j := 0; j < folds; j++ {
			foldis[j] = append(foldis[j], cases[j*foldsize:(j+1)*foldsize]...)
		}
		// ncases is rarely an exact multiple of folds. Hand the leftover
		// cases out one per fold so that no case is silently left out of
		// every train and test file.
		for k := folds * foldsize; k < ncases; k++ {
			foldis[k%folds] = append(foldis[k%folds], cases[k])
		}

	} else {
		//sample folds stratified by case
		fmt.Printf("Stratifying by %v classes.\n", catTarget.NCats())
		bSampler := CloudForest.NewBalancedSampler(catTarget)

		fmt.Printf("Stratifying by %v classes.\n", len(bSampler.Cases))
		var samples []int
		// spare is the fold that receives the next leftover case. It keeps
		// rotating across classes so the leftovers of many classes do not
		// all pile up in the first folds.
		spare := 0
		for i := 0; i < len(bSampler.Cases); i++ {
			fmt.Printf("%v cases in class %v.\n", len(bSampler.Cases[i]), i)
			//shuffle in place
			CloudForest.SampleFirstN(&bSampler.Cases[i], &samples, len(bSampler.Cases[i]), 0)
			class := bSampler.Cases[i]
			stratFoldSize := len(class) / folds
			for j := 0; j < folds; j++ {
				foldis[j] = append(foldis[j], class[j*stratFoldSize:(j+1)*stratFoldSize]...)
			}
			// Leftover cases of this class (all of them when the class has
			// fewer cases than folds) would otherwise vanish from the output.
			for _, c := range class[folds*stratFoldSize:] {
				foldis[spare%folds] = append(foldis[spare%folds], c)
				spare++
			}

		}
	}

	// trainis is reused for every fold: it is truncated and refilled with the
	// cases of all other folds.
	trainis := make([]int, 0, foldsize*(folds-1))
	//Write training and testing matrixes
	// NOTE(review): the writers returned by openfiles are never closed or
	// flushed in this function; confirm whether openfiles' return values
	// require it.
	for i := 0; i < folds; i++ {

		trainfn := fmt.Sprintf(*train, i)
		testfn := fmt.Sprintf(*test, i)

		trainis = trainis[0:0]
		for j := 0; j < folds; j++ {
			if i != j {
				trainis = append(trainis, foldis[j]...)
			}
		}

		if writearff || writeall {
			trainW, testW := openfiles(trainfn+".arff", testfn+".arff")
			CloudForest.WriteArffCases(data, foldis[i], *targetname, testW)
			CloudForest.WriteArffCases(data, trainis, *targetname, trainW)
		}

		// The plain feature matrix format is the default when no other
		// format was requested, and is also part of -writeall.
		if ((!writelibsvm) && (!writearff)) || writeall {
			trainW, testW := openfiles(trainfn, testfn)
			data.WriteCases(testW, foldis[i])
			data.WriteCases(trainW, trainis)
		}

		if writelibsvm || writeall {
			trainW, testW := openfiles(trainfn+".libsvm", testfn+".libsvm")
			CloudForest.WriteLibSvmCases(data, foldis[i], *targetname, testW)
			CloudForest.WriteLibSvmCases(data, trainis, *targetname, trainW)
		}

		fmt.Printf("Wrote fold %v. %v testing cases and %v training cases.\n", i, len(foldis[i]), len(trainis))
	}

}