Exemplo n.º 1
0
// convertInstancewoDocNorm converts inst into a sparse word-vector instance.
//
// It works in three stages:
//
//  1. Every non-string input attribute, and every string attribute whose value
//     is missing, is assigned the next output slot (counted by firstCopy).
//     Non-zero numeric values and missing markers are stored at that slot.
//  2. Every non-missing string attribute is split on whitespace and each word
//     found in stwv.dictionary is stored at its dictionary index, either as a
//     running count (outputsCounts) or as a presence flag of 1.
//  3. The optional TF (log(1+f)) and IDF (f*log(numInstances/docsCount))
//     transforms are applied to entries whose index is >= firstCopy.
//
// It returns firstCopy (the number of slots counted in stage 1) together with
// the newly built instance, whose indices are in ascending key order as
// produced by the ordered map. Judging by the name, no document-length
// normalisation is performed here; none is visible in this method.
//
// The method panics if a dictionary entry holds a value that is not an int.
func (stwv *StringToWordVector) convertInstancewoDocNorm(inst data.Instance) (int, data.Instance) {
	// Ordered map from output index to value; keeps the indices sorted.
	contained := omap.NewIntKeyed()
	inAttrs := stwv.inputFormat.Attributes()

	// Stage 1: copy all non-converted attributes from input to output.
	firstCopy := 0
	for i := range inAttrs {
		switch {
		case !inAttrs[i].IsString():
			// Simple nominal and numeric attributes are copied directly;
			// zeros are left out because the result is sparse.
			if v := inst.RealValues()[i]; v != 0 {
				contained.Insert(firstCopy, v)
			}
			firstCopy++
		case inst.IsMissingValue(i):
			contained.Insert(firstCopy, inst.MissingValue)
			firstCopy++
		}
		// Present string attributes do not take a slot here; they are
		// expanded into dictionary words in stage 2.
	}

	// Stage 2: copy the converted (string) attributes.
	for j := 0; j < inst.NumAttributes(); j++ {
		if !inAttrs[j].IsString() || inst.IsMissingValue(j) {
			continue
		}
		for _, word := range strings.Fields(inst.Values()[j]) {
			index, present := stwv.dictionary.Find(word)
			if !present {
				continue
			}
			key := index.(int)
			if stwv.outputsCounts {
				if prev, seen := contained.Find(key); seen {
					// Only float64 counters can be incremented; any other
					// stored value is left untouched.
					if count, ok := prev.(float64); ok {
						contained.Insert(key, count+1)
					}
					continue
				}
			}
			contained.Insert(key, float64(1))
		}
	}

	// Flatten the ordered map into the parallel slices a sparse instance needs.
	// Values that are not float64 become 0, keys that are not int become 0.
	size := contained.Len()
	indices := make([]int, 0, size)
	values := make([]float64, 0, size)
	contained.Do(func(key, value interface{}) {
		idx, _ := key.(int)
		val, _ := value.(float64)
		indices = append(indices, idx)
		values = append(values, val)
	})

	// Stage 3: frequency transforms, applied only to word entries.
	if stwv.tf_transformation {
		for k, idx := range indices {
			if idx >= firstCopy {
				values[k] = math.Log(values[k] + 1)
			}
		}
	}
	if stwv.idf_transformation {
		// NOTE(review): a zero docsCounts entry makes the factor +Inf (or NaN
		// when numInstances is also zero) - confirm that every dictionary
		// index has a positive document count.
		for k, idx := range indices {
			if idx >= firstCopy {
				values[k] = values[k] * math.Log(float64(stwv.numInstances)/float64(stwv.docsCounts[idx]))
			}
		}
	}

	// Build the sparse instance.
	outAttrs := stwv.outputFormat.Attributes()
	instSparse := data.NewInstance()
	for k, idx := range indices {
		if outAttrs[idx].IsNominal() {
			// The stored value, truncated to int, selects the nominal label.
			instSparse.AddValues(outAttrs[idx].Values()[int(values[k])])
		} else {
			instSparse.AddValues(outAttrs[idx].Name())
		}
	}
	instSparse.SetIndices(indices)
	instSparse.SetRealValues(values)
	instSparse.SetWeight(inst.Weight())
	instSparse.SetNumAttributes(len(values))
	return firstCopy, instSparse
}