コード例 #1
0
// SetInputFormat derives the filter's output format from instInfo.
//
// It calls getSelectedAttributes with the number of input attributes, then
// copies every attribute listed in r.selectedAttributes into the output
// header, in that order. The output class index is the position at which the
// input class attribute was copied, or -1 when it is not among the selected
// attributes. The name of each kept attribute and a summary line are printed
// to standard output.
func (r *Remove) SetInputFormat(instInfo data.Instances) {
	r.getSelectedAttributes(len(instInfo.Attributes()))

	kept := []data.Attribute{}
	classPos := -1
	for _, idx := range r.selectedAttributes {
		// The class attribute ends up in the slot that is about to be filled.
		if idx == instInfo.ClassIndex() {
			classPos = len(kept)
		}
		attr := *instInfo.Attribute(idx)
		fmt.Println(attr.Name())
		kept = append(kept, attr)
	}
	fmt.Println(len(kept), "attributes", "\n", classPos, "outputClass")

	r.outputFormat = data.NewInstancesWithClassIndex(classPos)
	r.outputFormat.SetAttributes(kept)
}
コード例 #2
0
// NewStringToWordVectorInst returns a StringToWordVector populated with its
// default settings and bound to inputData.
//
// The filter starts with an empty string-keyed dictionary and an empty
// document-count slice, keeps 1000 words, uses a minimum term frequency of 1,
// and has per-class counting, normalization and both the TF and IDF
// transformations switched on. avgDocLength, numInstances and the periodic
// pruning rate start at -1. The output format is created with the class index
// of inputData.
func NewStringToWordVectorInst(inputData data.Instances) StringToWordVector {
	return StringToWordVector{
		dictionary:           omap.NewStringKeyed(),
		outputsCounts:        false,
		docsCounts:           make([]int, 0),
		avgDocLength:         -1,
		wordsToKeep:          1000,
		numInstances:         -1,
		perdiodicPruningRate: -1,
		minTermFreq:          1,
		perClass:             true,
		normalize:            true,
		inputFormat:          inputData,
		outputFormat:         data.NewInstancesWithClassIndex(inputData.ClassIndex()),
		firstTime:            true,
		tf_transformation:    true,
		idf_transformation:   true,
	}
}
コード例 #3
0
// determineDictionary builds the word dictionary and the output header from
// the string attributes of inst and stores the results on stwv.
//
// The work is done in these steps:
//  1. Every non-missing string attribute value is split on whitespace. For
//     each word the total number of occurrences (Count) and the number of
//     instances containing it (DocCount) are tracked, in one dictionary per
//     class value when stwv.perClass is set and inst has a class index, or in
//     a single dictionary otherwise.
//  2. If a positive pruning interval results from perdiodicPruningRate, words
//     with Count <= 1 are dropped from every dictionary each time that many
//     instances have been processed.
//  3. For each dictionary a minimum count is chosen so that roughly
//     stwv.wordsToKeep words survive, never going below stwv.minTermFreq.
//  4. The output attributes are the non-string attributes of
//     stwv.inputFormat followed by one numeric attribute per surviving word;
//     a word occurring in several class dictionaries is added once.
//  5. stwv.docsCounts, stwv.dictionary, stwv.numInstances and
//     stwv.outputFormat are updated.
//
// Progress and intermediate values are printed to standard output. The
// function panics if a word seen in an instance is missing from its
// dictionary when the document counts are updated.
func (stwv *StringToWordVector) determineDictionary(inst *data.Instances) {
	/* TODO: see if use a stopwords list*/
	fmt.Println("Determing dictionary!")
	classInd := inst.ClassIndex()
	// One dictionary per class value when counting per class, otherwise one.
	values := 1
	if stwv.perClass && (classInd != -1) {
		values = len(inst.Attributes()[classInd].Values())
	}
	dicA := make([]*omap.Map, values)
	for i := range dicA {
		dicA[i] = omap.NewStringKeyed()
	}

	// The pruning rate is a percentage of the number of instances. The
	// percentage is applied in floating point and rounded to the nearest
	// integer: dividing the integer rate by 100 first would truncate every
	// rate below 100 to zero and silently disable periodic pruning.
	scaled := float64(stwv.perdiodicPruningRate) / 100 * float64(len(inst.Instances()))
	pruneRate := int64(math.Floor(scaled + 0.5))

	// Tokenize all training text into an orderedMap of "words".
	for i, instance := range inst.Instances() {
		vInd := 0
		if stwv.perClass && (classInd != -1) {
			vInd = int(instance.RealValues()[classInd])
		}
		// Set of distinct words seen in this instance (document); used
		// afterwards to bump each word's document count exactly once.
		seen := make(map[string]struct{})
		// Iterate through all relevant string attributes of the current instance
		for j := 0; j < instance.NumAttributes(); j++ {
			if instance.IsMissingValue(j) || !inst.Attributes()[j].IsString() {
				continue
			}
			for _, word := range strings.Fields(instance.Values()[j]) {
				seen[word] = struct{}{}
				if count, present := dicA[vInd].Find(word); !present {
					dicA[vInd].Insert(word, Count{1, 0})
				} else {
					count, _ := count.(Count)
					count.Count++
					dicA[vInd].Insert(word, count)
				}
			}
		}
		// updating the docCount for the words that have occurred in this
		// instance(document).
		for word := range seen {
			found, present := dicA[vInd].Find(word)
			if !present {
				panic("Check the code, there must be a word in the dictionary")
			}
			count := found.(Count)
			count.DocCount++
			dicA[vInd].Insert(word, count)
			fmt.Println(dicA[vInd].Find(word))
		}

		// Periodic pruning: every pruneRate instances drop the words that
		// have occurred at most once so far.
		if pruneRate > 0 && i > 0 && int64(i)%pruneRate == 0 {
			for z := 0; z < values; z++ {
				// Collect first, delete afterwards, so the map is not
				// modified while it is being traversed.
				var rare []string
				dicA[z].Do(func(key, value interface{}) {
					word, _ := key.(string)
					count, _ := value.(Count)
					if count.Count <= 1 {
						rare = append(rare, word)
					}
				})
				for _, word := range rare {
					dicA[z].Delete(word)
				}
			}
		}
	}

	// Figure out the minimum required word frequency
	totalSize := 0
	prune := make([]int, values)
	for z := 0; z < values; z++ {
		totalSize += dicA[z].Len()
		array := make([]int, dicA[z].Len())
		pos := 0
		dicA[z].Do(func(key, value interface{}) {
			count, _ := value.(Count)
			array[pos] = count.Count
			pos++
		})
		sort.Ints(array)
		fmt.Println(array)
		if len(array) < stwv.wordsToKeep {
			// if there aren't enough words, set the threshold to
			// minFreq
			prune[z] = int(stwv.minTermFreq)
		} else {
			// otherwise set it to be at least minFreq
			idx := len(array) - stwv.wordsToKeep
			prune[z] = int(math.Max(float64(stwv.minTermFreq), float64(array[idx])))
		}
	}

	// Convert the dictionary into an attribute index
	// and create one attribute per word
	attributes := make([]data.Attribute, 0, totalSize+len(inst.Attributes()))
	fmt.Println(totalSize+len(inst.Attributes()), "len(attributes)")
	// Add the non-converted attributes
	classIndex := -1
	for i, attr := range stwv.inputFormat.Attributes() {
		if !attr.IsString() {
			if inst.ClassIndex() == i {
				classIndex = len(attributes)
			}
			attributes = append(attributes, attr)
		}
	}

	// Add the word vector attributes (eliminating duplicates
	// that occur in multiple classes)
	newDic := omap.NewStringKeyed()
	index := len(attributes)
	for z := 0; z < values; z++ {
		dicA[z].Do(func(key, value interface{}) {
			word, _ := key.(string)
			count, _ := value.(Count)
			if count.Count < prune[z] {
				return
			}
			if _, present := newDic.Find(word); present {
				return
			}
			// The stored value is the word's attribute index in the output.
			newDic.Insert(word, int(index))
			index++
			att := data.NewAttribute()
			att.SetName(word)
			att.SetType(data.NUMERIC)
			attributes = append(attributes, att)
		})
	}

	// Compute document frequencies: for each kept word, sum its document
	// counts over all class dictionaries.
	stwv.docsCounts = make([]int, len(attributes))
	newDic.Do(func(key, value interface{}) {
		word, _ := key.(string)
		idx, _ := value.(int)
		docsCount := 0
		for j := 0; j < values; j++ {
			if count, present := dicA[j].Find(word); present {
				count := count.(Count)
				docsCount += count.DocCount
			}
		}
		stwv.docsCounts[idx] = docsCount
	})
	fmt.Println("doc: ", stwv.docsCounts)

	stwv.dictionary = newDic
	stwv.numInstances = len(inst.Instances())
	stwv.outputFormat = data.NewInstances()
	stwv.outputFormat.SetAttributes(attributes)
	stwv.outputFormat.SetClassIndex(classIndex)
}
コード例 #4
0
ファイル: infogain.go プロジェクト: jamolinet/project-mac
// BuildEvaluator computes, for every non-class attribute of instances, the
// difference entropyOverColumns - entropyConditionedOnRows of its contingency
// table and stores it in ig.infoGains. The entry of the class attribute is
// left at 0.
//
// When ig.binarize is set, the instances are first run through a
// NumericToBinary filter; the discretizing alternative is not implemented.
// For each attribute a table counts[attribute][value][class] of instance
// weights is built, with one extra row for a missing attribute value and one
// extra column for a missing class. When ig.missingMerge is set, those
// missing weights are distributed over the regular cells and the table is
// shrunk to the regular rows and columns.
//
// Intermediate values are printed to standard output.
func (ig *InfoGain) BuildEvaluator(instances data.Instances) {
	classIndex := instances.ClassIndex()
	numInstances := len(instances.Instances())

	if ig.binarize {
		// Binarize the instances with the NumericToBinary filter.
		binarizer := NewNumericToBinary()
		binarizer.Exec(instances)
		instances = binarizer.Output()
		fmt.Println(instances.Instances())
	}
	// TODO: discretize the instances when ig.binarize is not set.

	numClasses := instances.Attribute(classIndex).NumValues()

	// Reserve space: one table per non-class attribute, with an extra row
	// and an extra column for missing values.
	counts := make([][][]float64, len(instances.Attributes()))
	for attr := range instances.Attributes() {
		if attr == classIndex {
			continue
		}
		rows := len(instances.Attributes()[attr].Values()) + 1
		counts[attr] = make([][]float64, rows)
		for row := range counts[attr] {
			counts[attr][row] = make([]float64, numClasses+1)
		}
	}

	// Total weight per class; the last slot collects instances whose class
	// is missing.
	fmt.Println(numClasses, "numclasses")
	classTotals := make([]float64, numClasses+1)
	for n := 0; n < numInstances; n++ {
		inst := instances.Instance(n)
		slot := numClasses
		if !inst.ClassMissing(classIndex) {
			fmt.Println(int(inst.ClassValue(classIndex)), "classIndexes", inst.Weight(), "weights")
			slot = int(inst.ClassValue(classIndex))
		}
		classTotals[slot] += inst.Weight()
	}
	fmt.Println(classTotals)

	// Start every table with all the weight in row 0; the loop below moves
	// weight out of row 0 for each value an instance actually stores.
	for attr := range counts {
		if attr != classIndex {
			copy(counts[attr][0], classTotals)
		}
	}

	// Get counts.
	for n := 0; n < numInstances; n++ {
		inst := instances.Instance(n)
		for pos := range inst.RealValues() {
			// pos is a position in the stored values; inst.Index(pos) gives
			// the attribute it belongs to (presumably a sparse layout —
			// TODO confirm).
			attr := inst.Index(pos)
			if attr == classIndex {
				continue
			}
			weight := inst.Weight()

			// Pick the target cell: the last row / last column stand for a
			// missing attribute value / missing class respectively.
			var row, col int
			valueMissing := inst.IsMissingValue(pos)
			switch {
			case !valueMissing && !inst.ClassMissing(classIndex):
				row = int(inst.ValueSparse(pos))
				col = int(inst.ClassValue(classIndex))
			case !valueMissing:
				// Value present, class missing.
				row = int(inst.ValueSparse(pos))
				col = numClasses
			// NOTE(review): this check uses IsMissingValue(classIndex) while
			// the rest of the function uses ClassMissing(classIndex) —
			// confirm both agree for the instance layout in use.
			case !inst.IsMissingValue(classIndex):
				row = instances.Attribute(attr).NumValues()
				col = int(inst.ClassValue(classIndex))
			default:
				row = instances.Attribute(attr).NumValues()
				col = numClasses
			}
			counts[attr][row][col] += weight
			counts[attr][0][col] -= weight
		}
	}

	// distribute missing counts if required
	if ig.missingMerge {
		for attr := range instances.Attributes() {
			if attr == classIndex {
				continue
			}
			numValues := len(instances.Attributes()[attr].Values())
			table := counts[attr]

			// Compute marginals over the regular (non-missing) cells.
			rowSums := make([]float64, numValues)
			columnSums := make([]float64, numClasses)
			sum := 0.0
			for i := range rowSums {
				for j := range columnSums {
					rowSums[i] += table[i][j]
					columnSums[j] += table[i][j]
				}
				sum += rowSums[i]
			}
			if !utils.Gr(sum, 0) {
				continue
			}

			additions := make([][]float64, numValues)
			for i := range additions {
				additions[i] = make([]float64, numClasses)
				for j := range additions[i] {
					// Share of the "attribute value missing" row.
					additions[i][j] = (rowSums[i] / sum) * table[numValues][j]
					// Share of the "class missing" column.
					additions[i][j] += (columnSums[j] / sum) * table[i][numClasses]
					// Share of the "both missing" cell.
					additions[i][j] += (table[i][j] / sum) * table[numValues][numClasses]
				}
			}

			// Make new contingency table without the missing row and column.
			merged := make([][]float64, numValues)
			for i := range merged {
				merged[i] = make([]float64, numClasses)
				for j := range merged[i] {
					merged[i][j] = table[i][j] + additions[i][j]
				}
			}
			counts[attr] = merged
		}
	}

	// Compute info gains
	ig.infoGains = make([]float64, len(instances.Attributes()))
	for attr := range instances.Attributes() {
		if attr == classIndex {
			continue
		}
		ig.infoGains[attr] = entropyOverColumns(counts[attr]) - entropyConditionedOnRows(counts[attr])
	}
}