func (r *Remove) SetInputFormat(instInfo data.Instances) { r.getSelectedAttributes(len(instInfo.Attributes())) attributes := make([]data.Attribute, 0) outputClass := -1 for _, current := range r.selectedAttributes { if instInfo.ClassIndex() == current { outputClass = len(attributes) } keep := *instInfo.Attribute(current) fmt.Println(keep.Name()) attributes = append(attributes, keep) } fmt.Println(len(attributes), "attributes", "\n", outputClass, "outputClass") r.outputFormat = data.NewInstancesWithClassIndex(outputClass) r.outputFormat.SetAttributes(attributes) }
//New StringToWordVector function with default values func NewStringToWordVectorInst(inputData data.Instances) StringToWordVector { var stwv StringToWordVector stwv.dictionary = omap.NewStringKeyed() stwv.outputsCounts = false stwv.docsCounts = make([]int, 0) stwv.avgDocLength = -1 stwv.wordsToKeep = 1000 stwv.numInstances = -1 stwv.perdiodicPruningRate = -1 stwv.minTermFreq = 1 stwv.perClass = true stwv.normalize = true stwv.inputFormat = inputData stwv.outputFormat = data.NewInstancesWithClassIndex(inputData.ClassIndex()) stwv.firstTime = true stwv.tf_transformation, stwv.idf_transformation = true, true return stwv }
func (stwv *StringToWordVector) determineDictionary(inst *data.Instances) { /* TODO: see if use a stopwords list*/ fmt.Println("Determing dictionary!") classInd := inst.ClassIndex() values := 1 if stwv.perClass && (classInd != -1) { values = len(inst.Attributes()[classInd].Values()) } dicA := make([]*omap.Map, values) for i := 0; i < values; i++ { dicA[i] = omap.NewStringKeyed() } // Tokenize all training text into an orderedMap of "words". pruneRate := int64((stwv.perdiodicPruningRate / 100) * len(inst.Instances())) for i, instance := range inst.Instances() { vInd := int(0) if stwv.perClass && (classInd != -1) { vInd = int(instance.RealValues()[classInd]) } //Iterate through all relevant string attributes of the current instance hashtable := make(map[string]int, 0) for j := 0; j < instance.NumAttributes(); j++ { if !instance.IsMissingValue(j) && inst.Attributes()[j].IsString() { // Iterate through tokens, perform stemming, and remove stopwords // (if required) //fmt.Println(instance.Values()) words := strings.Fields(instance.Values()[j]) for _, word := range words { _, present := hashtable[word] if !present { hashtable[word] = 0 } //fmt.Println(word) if count, present := dicA[vInd].Find(word); !present { dicA[vInd].Insert(word, Count{1, 0}) } else { count, _ := count.(Count) count.Count++ dicA[vInd].Insert(word, count) } //fmt.Println(dicA[vInd][word]) } } } //updating the docCount for the words that have occurred in this //instance(document). enumeration := make([]string, 0, len(hashtable)) for word, _ := range hashtable { //only the words enumeration = append(enumeration, word) } for _, word := range enumeration { if count, present := dicA[vInd].Find(word); present { count := count.(Count) count.DocCount++ //delete(dicA[vInd], word) dicA[vInd].Insert(word, count) //fmt.Println(word, " ",dicA[vInd][word]) } else { panic("Check the code, there must be a word in the dictionary") } fmt.Println(dicA[vInd].Find(word)) } if pruneRate > 0 { if int64(i)%pruneRate == 0 && i > 0 { for z := 0; z < values; z++ { d := make([]string, 1000) dicA[z].Do(func(key, value interface{}) { word, _ := key.(string) count, _ := value.(Count) if count.Count <= 1 { d = append(d, word) } }) // for word, _ := range dicA[z] { // count := dicA[z][word] // if count.Count <= 1 { // d = append(d, word) // } // } for _, word := range d { dicA[z].Delete(word) //delete(dicA[z], word) } } } } //fmt.Println("new instance-----------------------------------------------------------") } //fmt.Println(dicA) // Figure out the minimum required word frequency totalSize := int(0) prune := make([]int, values) for z := 0; z < values; z++ { totalSize += dicA[z].Len() array := make([]int, dicA[z].Len()) pos := int(0) dicA[z].Do(func(key, value interface{}) { //_, _ := key.(string) count, _ := value.(Count) array[pos] = count.Count pos++ }) // for word, _ := range dicA[z] { // count := dicA[z][word] // array[pos] = count.Count // pos++ // } sort.Ints(array) fmt.Println(array) if len(array) < stwv.wordsToKeep { // if there aren't enough words, set the threshold to // minFreq prune[z] = int(stwv.minTermFreq) } else { // otherwise set it to be at least minFreq idx := len(array) - stwv.wordsToKeep prune[z] = int(math.Max(float64(stwv.minTermFreq), float64(array[idx]))) } //fmt.Println(prune[z]) } // Convert the dictionary into an attribute index // and create one attribute per word attributes := make([]data.Attribute, 0, totalSize+len(inst.Attributes())) fmt.Println(totalSize+len(inst.Attributes()), "len(attributes)") // Add the non-converted attributes classIndex := int(-1) for i, attr := range stwv.inputFormat.Attributes() { if !attr.IsString() { if inst.ClassIndex() == i { classIndex = len(attributes) } //fmt.Println(attr) attributes = append(attributes, attr) } } // Add the word vector attributes (eliminating duplicates // that occur in multiple classes) newDic := omap.NewStringKeyed() index := len(attributes) for z := 0; z < values; z++ { dicA[z].Do(func(key, value interface{}) { word, _ := key.(string) count, _ := value.(Count) if count.Count >= prune[z] { if _, present := newDic.Find(word); !present { newDic.Insert(word, int(index)) index++ att := data.NewAttribute() att.SetName(word) att.SetType(data.NUMERIC) attributes = append(attributes, att) //fmt.Println(index) } } }) // for word, _ := range dicA[z] { // count := dicA[z][word] // //fmt.Println(count.Count, prune[z]) // if count.Count >= prune[z] { // if _, present := newDic[word]; !present { // newDic[word] = float64(index) // index++ // att := data.NewAttribute() // att.SetName(word) // att.SetType(data.STRING) // attributes = append(attributes, att) // fmt.Println(index) // } // } // } } //fmt.Println(newDic) // Compute document frequencies stwv.docsCounts = make([]int, len(attributes)) //idx := 0 newDic.Do(func(key, value interface{}) { word, _ := key.(string) idx, _ := value.(int) docsCount := 0 for j := 0; j < values; j++ { if count, present := dicA[j].Find(word); present { count := count.(Count) //fmt.Println(count.DocCount, "doccount newdic") docsCount += count.DocCount } } stwv.docsCounts[idx] = docsCount }) // for word, idx := range newDic { // docsCount := 0 // for j := 0; j < values; j++ { // if count, present := dicA[j][word]; present { // docsCount += count.DocCount // } // } // stwv.docsCounts[int(idx)] = docsCount // //idx++ // } fmt.Println("doc: ", stwv.docsCounts) stwv.dictionary = newDic ////fmt.Println("numInst", len(inst.Instances())) stwv.numInstances = len(inst.Instances()) stwv.outputFormat = data.NewInstances() stwv.outputFormat.SetAttributes(attributes) stwv.outputFormat.SetClassIndex(classIndex) }
func (ig *InfoGain) BuildEvaluator(instances data.Instances) { classIndex := instances.ClassIndex() numInstances := len(instances.Instances()) if ig.binarize { //binarize instances //implement NumericToBinary function ntb := NewNumericToBinary() ntb.Exec(instances) instances = ntb.Output() fmt.Println(instances.Instances()) } else { //discretize instances //implement Discretize function } numClasses := instances.Attribute(classIndex).NumValues() // Reserve space and initialize counters counts := make([][][]float64, len(instances.Attributes())) //initialize first dimension for k := range instances.Attributes() { //fmt.Println(k) if k != classIndex { numValues := len(instances.Attributes()[k].Values()) counts[k] = make([][]float64, numValues+1) //initialize second dimension for i := range counts[k] { counts[k][i] = make([]float64, numClasses+1) //initialize third dimension } } } // Initialize counters fmt.Println(numClasses, "numclasses") temp := make([]float64, numClasses+1) for k := 0; k < numInstances; k++ { inst := instances.Instance(k) if inst.ClassMissing(classIndex) { //check that class if the class is missing /*implement method to do that*/ temp[numClasses] += inst.Weight() } else { fmt.Println(int(inst.ClassValue(classIndex)), "classIndexes", inst.Weight(), "weights") temp[int(inst.ClassValue(classIndex))] += inst.Weight() //get the index of the value of the class } } fmt.Println(temp) for k := range counts { if k != classIndex { for i := range temp { counts[k][0][i] = temp[i] } } } // Get counts //inst.RealValues()[classIndex]) check this after finish, may contains errors, its have to be check if the classIndex exists if not return 0 /*see weka*/ //implement the necessary methods to make easier this implementation and not bugs friendly //New methods already implemented!!!!!!!! Later check it's functioning for k := 0; k < numInstances; k++ { inst := instances.Instance(k) for i := range inst.RealValues() { if inst.Index(i) != classIndex { if inst.IsMissingValue(i) || inst.ClassMissing(classIndex) { //if is missing the real value and the class if !inst.IsMissingValue(i) { counts[inst.Index(i)][int(inst.ValueSparse(i))][numClasses] += inst.Weight() counts[inst.Index(i)][0][numClasses] -= inst.Weight() } else if !inst.IsMissingValue(classIndex) { counts[inst.Index(i)][instances.Attribute(inst.Index(i)).NumValues()][int(inst.ClassValue(classIndex))] += inst.Weight() //tongue twister, now its not counts[inst.Index(i)][0][int(inst.ClassValue(classIndex))] -= inst.Weight() } else { counts[inst.Index(i)][instances.Attribute(inst.Index(i)).NumValues()][numClasses] += inst.Weight() counts[inst.Index(i)][0][numClasses] -= inst.Weight() } } else { counts[inst.Index(i)][int(inst.ValueSparse(i))][int(inst.ClassValue(classIndex))] += inst.Weight() counts[inst.Index(i)][0][int(inst.ClassValue(classIndex))] -= inst.Weight() } } } } // distribute missing counts if required if ig.missingMerge { for k := range instances.Attributes() { if k != classIndex { numValues := len(instances.Attributes()[k].Values()) // Compute marginals rowSums := make([]float64, numValues) columnSums := make([]float64, numClasses) sum := 0.0 for i := 0; i < numValues; i++ { for j := 0; j < numClasses; j++ { rowSums[i] += counts[k][i][j] columnSums[j] += counts[k][i][j] } sum += rowSums[i] } if utils.Gr(sum, 0) { additions := make([][]float64, numValues) //initializes slices for i := range additions { additions[i] = make([]float64, numClasses) } // Compute what needs to be added to each row for i := range additions { for j := range additions[i] { additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j] } } // Compute what needs to be added to each column for i := 0; i < numClasses; i++ { for j := 0; j < numValues; j++ { additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses] } } // Compute what needs to be added to each cell for i := 0; i < numClasses; i++ { for j := 0; j < numValues; j++ { additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses] } } // Make new contingency table newTable := make([][]float64, numValues) //initializes slices for i := range newTable { newTable[i] = make([]float64, numClasses) } for i := range newTable { for j := range newTable[i] { newTable[i][j] = counts[k][i][j] + additions[i][j] } } counts[k] = newTable } } } } // Compute info gains ig.infoGains = make([]float64, len(instances.Attributes())) for i := range instances.Attributes() { if i != classIndex { ig.infoGains[i] = entropyOverColumns(counts[i]) - entropyConditionedOnRows(counts[i]) } } //fmt.Println(ig.infoGains, "infogain") }