// StartSelection runs attribute selection on instances and prepares as.output
// from the result.
//
// It stores instances as the input, records whether the input declares a
// class attribute, asks SelectAttributes for the attribute indices to keep,
// and then gathers the selected attributes and converts the input instances.
//
// It panics with "No selected attributes" when the selection comes back empty.
func (as *AttributeSelection) StartSelection(instances data.Instances) {
	// NOTE(review): the value assigned here is overwritten by the very next
	// statement, so this call only matters for any side effects it may have.
	as.input = data.NewInstancesWithInst(instances, len(instances.Attributes()))
	as.input = instances
	// A non-negative class index is treated as "the input has a class attribute".
	as.hasClass = as.input.ClassIndex() >= 0
	as.selectedAttributes = as.SelectAttributes(as.input)
	if len(as.selectedAttributes) == 0 {
		panic("No selected attributes")
	// Set output
	// Debug trace of the selected attribute indices.
	fmt.Println(as.selectedAttributes, "as.selectedAttributes")
	as.output = data.NewInstances()
	// Gather the selected attributes (dereferenced values) in selection order.
	attributes := make([]data.Attribute, 0)
	for i := range as.selectedAttributes {
		attributes = append(attributes, *as.input.Attribute(as.selectedAttributes[i]))
	// Debug trace of the gathered attributes.
	fmt.Println(attributes, "attributes")
	if as.hasClass {
		// The output class index is set to the last selected position.
		// NOTE(review): this presumes the class attribute is the final entry
		// of as.selectedAttributes — confirm against SelectAttributes.
		as.output.SetClassIndex(len(as.selectedAttributes) - 1)
	// Convert pending input instances
	tmpInst := make([]data.Instance, 0)
	for _, in := range as.input.Instances() {
		tmpInst = append(tmpInst, as.convertInstance(in))
// SetInputFormat builds r.outputFormat from instInfo: it gathers the
// attributes named by r.selectedAttributes and works out the position the
// class attribute takes among them.
func (r *Remove) SetInputFormat(instInfo data.Instances) {
	attributes := make([]data.Attribute, 0)
	// -1 is kept when none of the selected attributes is the input's class attribute.
	outputClass := -1
	for _, current := range r.selectedAttributes {
		if instInfo.ClassIndex() == current {
			// The class attribute is about to be appended, so its output
			// index equals the number of attributes gathered so far.
			outputClass = len(attributes)
		keep := *instInfo.Attribute(current)
		attributes = append(attributes, keep)
	// Debug trace of the attribute count and the remapped class index.
	fmt.Println(len(attributes), "attributes", "\n", outputClass, "outputClass")
	r.outputFormat = data.NewInstancesWithClassIndex(outputClass)
// selectAttributesCVSplit runs the configured search on one split of the data
// and accumulates the outcome into the statistics kept on as.
//
// as.rankResults holds four per-attribute accumulators, indexed as used below:
// [0] summed merit, [1] summed rank, [2] summed squared merit and [3] summed
// squared rank.
func (as *AttributeSelection) selectAttributesCVSplit(split data.Instances) {
	attributeRanking := make([][]float64, 0)
	// This is only helpful if this method is called from outside, not from an inner method of the object.
	//	if as.trainInstances.(nil)  {
	//		as.trainInstances =  split
	//	}

	// create space to hold statistics
	// Allocation happens only while both result slices are still nil; each
	// accumulator gets one slot per attribute of the split.
	if as.rankResults == nil && as.subSetResults == nil {
		as.subSetResults = make([]float64, len(split.Attributes()))
		as.rankResults = make([][]float64, 4)
		for i := range as.rankResults {
			as.rankResults[i] = make([]float64, len(split.Attributes()))

	// Do the search
	attributeSet := as.search.Search(as.evaluator, split)
	if as.doRank {
		// Each ranking entry is used as [attribute index, merit]; j is the
		// zero-based rank of the entry, so j+1 is accumulated as the rank.
		attributeRanking = as.search.rankedAttributes()
		for j := range attributeRanking {
			// merit
			as.rankResults[0][int(attributeRanking[j][0])] += attributeRanking[j][1]
			// squared merit
			as.rankResults[2][int(attributeRanking[j][0])] += (attributeRanking[j][1] * attributeRanking[j][1])
			// rank
			as.rankResults[1][int(attributeRanking[j][0])] += float64(j + 1)
			// squared rank
			as.rankResults[3][int(attributeRanking[j][0])] += float64((j + 1) * (j + 1))
	} else {
		// Without ranking, the attribute subset returned by the search is walked instead.
		for j := range attributeSet {
// determineDictionary builds the word dictionary from the string attributes
// of inst and stores the results on stwv.
//
// It tokenizes every non-missing string value on whitespace, keeps one
// dictionary per class value when stwv.perClass is set and a class index
// exists (otherwise a single dictionary), derives a minimum-frequency
// threshold per dictionary, merges the surviving words into one dictionary
// that maps each word to an attribute index, and finally sets
// stwv.docsCounts, stwv.dictionary, stwv.numInstances and stwv.outputFormat.
func (stwv *StringToWordVector) determineDictionary(inst *data.Instances) {
	/* TODO: see if use a stopwords list*/
	fmt.Println("Determing dictionary!")
	classInd := inst.ClassIndex()
	// Number of dictionaries: one, or one per class value in per-class mode.
	values := 1
	if stwv.perClass && (classInd != -1) {
		values = len(inst.Attributes()[classInd].Values())
	dicA := make([]*omap.Map, values)
	for i := 0; i < values; i++ {
		dicA[i] = omap.NewStringKeyed()
	// Tokenize all training text into an orderedMap of "words".
	// NOTE(review): if perdiodicPruningRate is an integer, the division by 100
	// truncates before the multiplication, so any rate below 100 yields a
	// pruneRate of 0 and disables periodic pruning — confirm the field type.
	pruneRate := int64((stwv.perdiodicPruningRate / 100) * len(inst.Instances()))
	for i, instance := range inst.Instances() {
		// vInd selects the dictionary: 0, or the instance's class value in per-class mode.
		vInd := int(0)
		if stwv.perClass && (classInd != -1) {
			vInd = int(instance.RealValues()[classInd])
		//Iterate through all relevant string attributes of the current instance
		// hashtable records the distinct words seen in this instance; only its keys are used.
		hashtable := make(map[string]int, 0)
		for j := 0; j < instance.NumAttributes(); j++ {
			if !instance.IsMissingValue(j) && inst.Attributes()[j].IsString() {
				// Iterate through tokens, perform stemming, and remove stopwords
				// (if required)
				// Tokens are whitespace-separated fields; no stemming or
				// stopword handling is visible here.
				words := strings.Fields(instance.Values()[j])
				for _, word := range words {
					_, present := hashtable[word]
					if !present {
						hashtable[word] = 0
					// First occurrence inserts Count{1, 0} (presumably Count=1,
					// DocCount=0 — confirm the field order of Count).
					if count, present := dicA[vInd].Find(word); !present {
						dicA[vInd].Insert(word, Count{1, 0})
					} else {
						// NOTE(review): the stored count is re-inserted as read;
						// no increment of count.Count is visible here — confirm.
						count, _ := count.(Count)
						dicA[vInd].Insert(word, count)
		//updating the docCount for the words that have occurred in this
		enumeration := make([]string, 0, len(hashtable))
		for word, _ := range hashtable { //only the words
			enumeration = append(enumeration, word)
		for _, word := range enumeration {
			if count, present := dicA[vInd].Find(word); present {
				// NOTE(review): as above, the entry is written back unchanged;
				// no increment of count.DocCount is visible here — confirm.
				count := count.(Count)
				//delete(dicA[vInd], word)
				dicA[vInd].Insert(word, count)
				//fmt.Println(word, " ",dicA[vInd][word])
			} else {
				panic("Check the code, there must be a word in the dictionary")

		// Periodic pruning: every pruneRate instances (but not at i == 0),
		// collect the words whose count is at most 1 in each dictionary.
		if pruneRate > 0 {
			if int64(i)%pruneRate == 0 && i > 0 {
				for z := 0; z < values; z++ {
					// NOTE(review): make([]string, 1000) creates a slice of
					// length 1000, so d starts with 1000 empty strings and the
					// appended words land after them; make([]string, 0, 1000)
					// was probably intended.
					d := make([]string, 1000)
					dicA[z].Do(func(key, value interface{}) {
						word, _ := key.(string)
						count, _ := value.(Count)
						if count.Count <= 1 {
							d = append(d, word)
					//					for word, _ := range dicA[z] {
					//						count := dicA[z][word]
					//						if count.Count <= 1 {
					//							d = append(d, word)
					//						}
					//					}
					// NOTE(review): the only removal visible in this loop is the
					// commented-out delete below — confirm words are actually pruned.
					for _, word := range d {
						//delete(dicA[z], word)
		//fmt.Println("new instance-----------------------------------------------------------")
	// Figure out the minimum required word frequency
	totalSize := int(0)
	// prune[z] is the minimum count a word needs to be kept from dictionary z.
	prune := make([]int, values)
	for z := 0; z < values; z++ {
		totalSize += dicA[z].Len()
		array := make([]int, dicA[z].Len())
		pos := int(0)
		// NOTE(review): pos is not advanced in the visible code (the
		// commented-out version below does pos++), so every count would be
		// written to array[0] — confirm.
		dicA[z].Do(func(key, value interface{}) {
			//_, _ := key.(string)
			count, _ := value.(Count)
			array[pos] = count.Count
		//		for word, _ := range dicA[z] {
		//			count := dicA[z][word]
		//			array[pos] = count.Count
		//			pos++
		//		}
		if len(array) < stwv.wordsToKeep {
			// if there aren't enough words, set the threshold to
			// minFreq
			prune[z] = int(stwv.minTermFreq)
		} else {
			// otherwise set it to be at least minFreq
			// NOTE(review): array[idx] is only the wordsToKeep-th largest
			// count if array is sorted ascending; no sort is visible here —
			// confirm.
			idx := len(array) - stwv.wordsToKeep
			prune[z] = int(math.Max(float64(stwv.minTermFreq), float64(array[idx])))
	// Convert the dictionary into an attribute index
	// and create one attribute per word
	attributes := make([]data.Attribute, 0, totalSize+len(inst.Attributes()))
	fmt.Println(totalSize+len(inst.Attributes()), "len(attributes)")
	// Add the non-converted attributes
	// classIndex tracks where the class attribute lands among the kept
	// (non-string) attributes; it stays -1 if the class is not among them.
	classIndex := int(-1)
	for i, attr := range stwv.inputFormat.Attributes() {
		if !attr.IsString() {
			if inst.ClassIndex() == i {
				classIndex = len(attributes)
			attributes = append(attributes, attr)
	// Add the word vector attributes (eliminating duplicates
	// that occur in multiple classes)
	newDic := omap.NewStringKeyed()
	// Word attributes are indexed starting right after the non-converted ones.
	index := len(attributes)
	for z := 0; z < values; z++ {
		dicA[z].Do(func(key, value interface{}) {
			word, _ := key.(string)
			count, _ := value.(Count)
			if count.Count >= prune[z] {
				if _, present := newDic.Find(word); !present {
					// NOTE(review): unlike the commented-out version below,
					// no index++ and no att.SetName/att.SetType are visible
					// here, so every word would map to the same index and get
					// an unnamed attribute — confirm.
					newDic.Insert(word, int(index))
					att := data.NewAttribute()
					attributes = append(attributes, att)

		//		for word, _ := range dicA[z] {
		//			count := dicA[z][word]
		//			//fmt.Println(count.Count, prune[z])
		//			if count.Count >= prune[z] {
		//				if _, present := newDic[word]; !present {
		//					newDic[word] = float64(index)
		//					index++
		//					att := data.NewAttribute()
		//					att.SetName(word)
		//					att.SetType(data.STRING)
		//					attributes = append(attributes, att)
		//					fmt.Println(index)
		//				}
		//			}
		//		}
	// Compute document frequencies
	// docsCounts has one slot per output attribute and is indexed by the
	// attribute index stored in newDic.
	stwv.docsCounts = make([]int, len(attributes))
	//idx := 0
	newDic.Do(func(key, value interface{}) {
		word, _ := key.(string)
		idx, _ := value.(int)
		// Sum the word's DocCount across all per-class dictionaries.
		docsCount := 0
		for j := 0; j < values; j++ {
			if count, present := dicA[j].Find(word); present {
				count := count.(Count)
				//fmt.Println(count.DocCount, "doccount newdic")
				docsCount += count.DocCount
		stwv.docsCounts[idx] = docsCount
	//	for word, idx := range newDic {
	//		docsCount := 0
	//		for j := 0; j < values; j++ {
	//			if count, present := dicA[j][word]; present {
	//				docsCount += count.DocCount
	//			}
	//		}
	//		stwv.docsCounts[int(idx)] = docsCount
	//		//idx++
	//	}
	fmt.Println("doc: ", stwv.docsCounts)
	stwv.dictionary = newDic
	////fmt.Println("numInst", len(inst.Instances()))
	stwv.numInstances = len(inst.Instances())
	// NOTE(review): attributes and classIndex computed above are not applied
	// to the new output format in the visible code — confirm.
	stwv.outputFormat = data.NewInstances()
Example #5
// BuildEvaluator computes an information-gain score for every non-class
// attribute of instances and stores the scores in ig.infoGains.
//
// For each non-class attribute k it builds a weighted contingency table
// counts[k][value][class]. The table has one extra row (index = number of
// attribute values) for instances whose attribute value is missing and one
// extra column (index numClasses) for instances whose class is missing. When
// ig.missingMerge is set, those missing counts are redistributed over the
// regular cells before the gains are computed.
func (ig *InfoGain) BuildEvaluator(instances data.Instances) {
	classIndex := instances.ClassIndex()
	numInstances := len(instances.Instances())

	if ig.binarize { //binarize instances
		//implement NumericToBinary function
		// NOTE(review): the filter's output replaces instances, but no call
		// feeding the input into ntb is visible here — confirm.
		ntb := NewNumericToBinary()
		instances = ntb.Output()
	} else { //discretize instances
		//implement Discretize function
	numClasses := instances.Attribute(classIndex).NumValues()
	// Reserve space and initialize counters
	counts := make([][][]float64, len(instances.Attributes())) //initialize first dimension
	for k := range instances.Attributes() {
		if k != classIndex {
			numValues := len(instances.Attributes()[k].Values())
			counts[k] = make([][]float64, numValues+1) //initialize second dimension
			for i := range counts[k] {
				counts[k][i] = make([]float64, numClasses+1) //initialize third dimension
	// Initialize counters
	fmt.Println(numClasses, "numclasses")
	// temp is the weighted class distribution of the whole dataset; the last
	// slot (index numClasses) collects instances with a missing class.
	temp := make([]float64, numClasses+1)
	for k := 0; k < numInstances; k++ {
		inst := instances.Instance(k)
		if inst.ClassMissing(classIndex) { //check that class if the class is missing /*implement method to do that*/
			temp[numClasses] += inst.Weight()
		} else {
			fmt.Println(int(inst.ClassValue(classIndex)), "classIndexes", inst.Weight(), "weights")
			temp[int(inst.ClassValue(classIndex))] += inst.Weight() //get the index of the value of the class
	// Seed row 0 of every non-class attribute with the full class
	// distribution; the loop below then moves weight out of row 0 into the
	// row of the value each instance actually stores.
	for k := range counts {
		if k != classIndex {
			for i := range temp {
				counts[k][0][i] = temp[i]
	// Get counts
	//inst.RealValues()[classIndex]) check this after finish, may contains errors, its have to be check if the classIndex exists if not return 0 /*see weka*/
	//implement the necessary methods to make easier this implementation and not bugs friendly
	//New methods already implemented!!!!!!!! Later check it's functioning
	for k := 0; k < numInstances; k++ {
		inst := instances.Instance(k)
		// i is a position in the instance's stored values; inst.Index(i) maps
		// it to the attribute index (presumably a sparse representation —
		// confirm).
		for i := range inst.RealValues() {
			if inst.Index(i) != classIndex {
				if inst.IsMissingValue(i) || inst.ClassMissing(classIndex) { //if is missing the real value and the class
					if !inst.IsMissingValue(i) {
						// Value present, class missing: count in the missing-class column.
						counts[inst.Index(i)][int(inst.ValueSparse(i))][numClasses] += inst.Weight()
						counts[inst.Index(i)][0][numClasses] -= inst.Weight()
					// NOTE(review): this branch tests IsMissingValue(classIndex)
					// while the rest of the function uses ClassMissing(classIndex),
					// and IsMissingValue is otherwise called with the stored
					// position i rather than an attribute index — confirm the
					// two are equivalent here.
					} else if !inst.IsMissingValue(classIndex) {
						// Value missing, class present: count in the missing-value row.
						counts[inst.Index(i)][instances.Attribute(inst.Index(i)).NumValues()][int(inst.ClassValue(classIndex))] += inst.Weight() //tongue twister, now its not
						counts[inst.Index(i)][0][int(inst.ClassValue(classIndex))] -= inst.Weight()
					} else {
						// Both missing: count in the missing-value row, missing-class column.
						counts[inst.Index(i)][instances.Attribute(inst.Index(i)).NumValues()][numClasses] += inst.Weight()
						counts[inst.Index(i)][0][numClasses] -= inst.Weight()
				} else {
					// Neither missing: count in the regular (value, class) cell.
					counts[inst.Index(i)][int(inst.ValueSparse(i))][int(inst.ClassValue(classIndex))] += inst.Weight()
					counts[inst.Index(i)][0][int(inst.ClassValue(classIndex))] -= inst.Weight()
	// distribute missing counts if required
	if ig.missingMerge {
		for k := range instances.Attributes() {
			if k != classIndex {
				numValues := len(instances.Attributes()[k].Values())
				// Compute marginals
				// Only the regular cells (no missing row/column) enter the marginals.
				rowSums := make([]float64, numValues)
				columnSums := make([]float64, numClasses)
				sum := 0.0
				for i := 0; i < numValues; i++ {
					for j := 0; j < numClasses; j++ {
						rowSums[i] += counts[k][i][j]
						columnSums[j] += counts[k][i][j]
					sum += rowSums[i]
				// Redistribute only when the regular cells carry weight
				// (utils.Gr is presumably "greater than" — confirm); this
				// also keeps the divisions by sum below well defined.
				if utils.Gr(sum, 0) {
					additions := make([][]float64, numValues) //initializes slices
					for i := range additions {
						additions[i] = make([]float64, numClasses)
					// Compute what needs to be added to each row
					// The missing-value row is shared out in proportion to the row sums.
					for i := range additions {
						for j := range additions[i] {
							additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j]
					// Compute what needs to be added to each column
					// The missing-class column is shared out in proportion to the column sums.
					for i := 0; i < numClasses; i++ {
						for j := 0; j < numValues; j++ {
							additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses]
					// Compute what needs to be added to each cell
					// The both-missing cell is shared out in proportion to each cell's count.
					for i := 0; i < numClasses; i++ {
						for j := 0; j < numValues; j++ {
							additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses]
					// Make new contingency table
					// The replacement table drops the missing row and column.
					newTable := make([][]float64, numValues) //initializes slices
					for i := range newTable {
						newTable[i] = make([]float64, numClasses)
					for i := range newTable {
						for j := range newTable[i] {
							newTable[i][j] = counts[k][i][j] + additions[i][j]
					counts[k] = newTable
	// Compute info gains
	// The gain of attribute i is the entropy over the table's columns minus
	// the entropy conditioned on its rows; the class attribute's slot keeps
	// its zero value.
	ig.infoGains = make([]float64, len(instances.Attributes()))
	for i := range instances.Attributes() {
		if i != classIndex {
			ig.infoGains[i] = entropyOverColumns(counts[i]) - entropyConditionedOnRows(counts[i])
	//fmt.Println(ig.infoGains, "infogain")