//Convert a single instance over func (as *AttributeSelection) convertInstance(inst data.Instance) data.Instance { newVasl := make([]float64, 0, len(as.output.Attributes())) for i, current := range as.selectedAttributes { fmt.Println(current, i, inst.RealValues()) newVasl = append(newVasl, inst.Value(current)) //newVasl[i] = inst.Value(current) //fmt.Println(newVasl[i], "newVasl[i]") } fmt.Println("----------------------------------------------") newInst := data.NewInstance() // newInst.SetNumAttributes(len(newVasl)) values_ := make([]float64, len(newVasl)) indices_ := make([]int, len(newVasl)) vals := 0 for i := 0; i < len(newVasl); i++ { if newVasl[i] != 0 { values_[vals] = newVasl[i] indices_[vals] = i vals++ } } values := make([]float64, vals) indices := make([]int, vals) copy(values, values_) copy(indices, indices_) // fmt.Println(values, "values") // fmt.Println(indices, "indices") for k, i := range indices { if as.output.Attribute(i).IsNominal() { newInst.AddValues(as.output.Attribute(i).Values()[int(values[k])]) } else { newInst.AddValues(as.output.Attribute(i).Name()) } } newInst.SetIndices(indices) newInst.SetRealValues(values) newInst.SetWeight(inst.Weight()) return newInst }
func (stwv *StringToWordVector) normalizeInstance(inst *data.Instance, firstCopy int) { //fmt.Println("firstcopy ", firstCopy) //fmt.Println("avgdoclength ", stwv.avgDocLength) docLength := float64(0) if stwv.avgDocLength < 0 { panic("Average document length not set.") } // fmt.Println("valores: ", inst.RealValues()) // Compute length of document vector for j := 0; j < len(inst.RealValues()); j++ { if inst.Indices()[j] >= firstCopy { docLength += inst.RealValues()[j] * inst.RealValues()[j] } } docLength = math.Sqrt(docLength) // Normalize document vector for j := 0; j < len(inst.RealValues()); j++ { if inst.Indices()[j] >= firstCopy { val := inst.RealValues()[j] * stwv.avgDocLength / docLength inst.AddRealValuesIndex(j, val) if val == 0 { fmt.Println("Setting value %d to zero", inst.Indices()[j]) j-- } } } }
func (stwv *StringToWordVector) convertInstancewoDocNorm(inst data.Instance) (int, data.Instance) { // Convert the instance into a sorted set of indexes contained := omap.NewIntKeyed() mapKeys := make([]float64, 0) // Copy all non-converted attributes from input to output firstCopy := 0 for i, _ := range stwv.inputFormat.Attributes() { //fmt.Println("input attrs: ", i) if !stwv.inputFormat.Attributes()[i].IsString() { // Add simple nominal and numeric attributes directly if inst.RealValues()[i] != 0 { contained.Insert(firstCopy, inst.RealValues()[i]) mapKeys = append(mapKeys, float64(firstCopy)) firstCopy++ } else { firstCopy++ } } else if inst.IsMissingValue(i) { //fmt.Println("print 1.2") contained.Insert(firstCopy, inst.MissingValue) mapKeys = append(mapKeys, float64(firstCopy)) firstCopy++ } else if stwv.inputFormat.Attributes()[i].IsString() { //if i have to implement the range selector then code this part } } //Copy the converted attributes //fmt.Println("print 2.0" , inst.NumAttributes()) for j := 0; j < inst.NumAttributes(); j++ { //fmt.Println("print 2.0.1" , stwv.inputFormat.Attributes()[1].IsString()) if stwv.inputFormat.Attributes()[j].IsString() && inst.IsMissingValue(j) == false { //fmt.Println("print 2") words := strings.Fields(inst.Values()[j]) //fmt.Println(stwv.dictionary) //fmt.Println("------------------------------------------------") for _, word := range words { //fmt.Println("print 3", idx) if index, present := stwv.dictionary.Find(word); present { if stwv.outputsCounts { if count, isthere := contained.Find(index); isthere { if count, ok := count.(float64); ok { //type assertion contained.Insert(int(index.(int)), count+1) mapKeys = append(mapKeys, float64(index.(int))) } } else { //fmt.Println(index) contained.Insert(int(index.(int)), float64(1)) mapKeys = append(mapKeys, float64(index.(int))) } } else { //fmt.Println(index) contained.Insert(int(index.(int)), float64(1)) mapKeys = append(mapKeys, float64(index.(int))) } } } } } //To calculate frequencies indexes := make([]int, contained.Len()) _values := make([]float64, contained.Len()) n := 0 contained.Do(func(key, value interface{}) { //fmt.Println(key, " <-->", value) index, _ := key.(int) _value, _ := value.(float64) indexes[n] = index _values[n] = _value n++ }) //------------ //TF_freq transform if stwv.tf_transformation { for i := 0; i < len(indexes); i++ { index := indexes[i] if index >= firstCopy { val := _values[i] val = math.Log(val + 1) contained.Insert(index, val) } } } indexes = make([]int, contained.Len()) _values = make([]float64, contained.Len()) n = 0 contained.Do(func(key, value interface{}) { //fmt.Println(key, " <-->", value) index, _ := key.(int) _value, _ := value.(float64) indexes[n] = index _values[n] = _value n++ }) //IDF_freq transform if stwv.idf_transformation { for i := 0; i < len(indexes); i++ { index := indexes[i] if index >= firstCopy { val := _values[i] val = val * math.Log(float64(stwv.numInstances)/float64(stwv.docsCounts[index])) contained.Insert(index, val) } } // contained.Do(func(key, value interface{}) { // k, _ := key.(int) // val, _ := value.(float64) // if k >= firstCopy { // val = val * math.Log(float64(stwv.numInstances)/float64(stwv.docsCounts[k])) // contained.Insert(k, val) // } // }) } //TF_IDF_freq transform // if stwv.transformation == TF_IDF { // for i:= 0; i < len(indexes); i++ { // index := indexes[i] // if index >= firstCopy { // val := _values[i] // val = (val * math.Log(float64(stwv.numInstances)/float64(stwv.docsCounts[index]))) * math.Log(val+1) // contained.Insert(index, val) // } // } // contained.Do(func(key, value interface{}) { // k, _ := key.(int) // val, _ := value.(float64) // if k >= firstCopy { // val = (val * math.Log(float64(stwv.numInstances)/float64(stwv.docsCounts[k]))) * math.Log(val+1) // contained.Insert(k, val) // } // }) // } // contained.Do(func(key, value interface{}) { // fmt.Println(key, " ", value) // }) // Convert the set to structures needed to create a sparse instance. values := make([]float64, contained.Len()) indices := make([]int, contained.Len()) i := 0 //fmt.Println(contained.Len()) contained.Do(func(key, value interface{}) { index, _ := key.(int) _value, _ := value.(float64) values[i] = _value indices[i] = index i++ }) instSparse := data.NewInstance() for k, i := range indices { if stwv.outputFormat.Attributes()[i].IsNominal() { instSparse.AddValues(stwv.outputFormat.Attributes()[i].Values()[int(values[k])]) } else if stwv.outputFormat.Attributes()[i].IsNominal() && !stwv.outputFormat.Attributes()[i].IsString() { instSparse.AddValues(stwv.outputFormat.Attributes()[i].Values()[i]) } else { instSparse.AddValues(stwv.outputFormat.Attributes()[i].Name()) } } instSparse.SetIndices(indices) instSparse.SetRealValues(values) instSparse.SetWeight(inst.Weight()) instSparse.SetNumAttributes(len(values)) return firstCopy, instSparse }
func (ntb *NumericToBinary) convertInstance(instance data.Instance) data.Instance { inst := data.NewInstance() vals := make([]float64, len(instance.RealValues())) newIndexes := make([]int, len(instance.RealValues())) for j := range instance.RealValues() { att := ntb.input.Attribute(instance.Index(j)) if att.Type() != data.NUMERIC || instance.Index(j) == ntb.input.ClassIndex() { //fmt.Println(ntb.input.ClassIndex()) vals[j] = instance.ValueSparse(j) } else { if instance.IsMissingValue(j) { //fmt.Println("DSAD") vals[j] = instance.ValueSparse(j) } else { //fmt.Println("DSAD---") vals[j] = 1 } } newIndexes[j] = instance.Index(j) } inst.SetWeight(instance.Weight()) inst.SetRealValues(vals) inst.SetIndices(newIndexes) return inst }