Пример #1
0
// NewStringToWordVectorInst returns a StringToWordVector initialised with its
// default option values for the given input data.
//
// inputData is stored as the filter's input format, and the output format
// starts out as a fresh instance set created with inputData's class index.
func NewStringToWordVectorInst(inputData data.Instances) StringToWordVector {
	return StringToWordVector{
		// Dictionary and counters start empty; -1 marks values that have
		// not been computed yet.
		dictionary:    omap.NewStringKeyed(),
		outputsCounts: false,
		docsCounts:    make([]int, 0),
		avgDocLength:  -1,
		numInstances:  -1,

		// Default option values.
		wordsToKeep:          1000,
		perdiodicPruningRate: -1,
		minTermFreq:          1,
		perClass:             true,
		normalize:            true,
		tf_transformation:    true,
		idf_transformation:   true,

		// Input/output formats.
		inputFormat:  inputData,
		outputFormat: data.NewInstancesWithClassIndex(inputData.ClassIndex()),
		firstTime:    true,
	}
}
Пример #2
0
// determineDictionary builds the word dictionary from the string attributes
// of inst and derives the filter's output format from it.
//
// The work is done in the following steps:
//
//  1. Every non-missing string attribute value is split with strings.Fields.
//     For each word an occurrence counter (Count.Count) and a counter of the
//     instances containing it (Count.DocCount) are maintained. When perClass
//     is set and inst has a class index, one dictionary per class value is
//     kept; otherwise a single dictionary is used.
//  2. If perdiodicPruningRate (a percentage of the number of instances)
//     yields a positive interval, words whose occurrence counter is <= 1 are
//     dropped from every dictionary each time that many instances have been
//     processed.
//  3. For each dictionary a minimum occurrence threshold is derived from
//     wordsToKeep and minTermFreq.
//  4. The non-string attributes of stwv.inputFormat are copied through and
//     one NUMERIC attribute is created per word that reaches the threshold
//     (each word once, even if it occurs for several class values).
//
// On return stwv.dictionary, stwv.docsCounts, stwv.numInstances and
// stwv.outputFormat have been replaced.
//
// The function panics if a word seen in an instance is missing from the
// dictionary, and it writes debugging output to standard output.
func (stwv *StringToWordVector) determineDictionary(inst *data.Instances) {
	/* TODO: see if use a stopwords list*/
	fmt.Println("Determing dictionary!")
	classInd := inst.ClassIndex()
	values := 1
	if stwv.perClass && (classInd != -1) {
		values = len(inst.Attributes()[classInd].Values())
	}
	dicA := make([]*omap.Map, values)
	for i := 0; i < values; i++ {
		dicA[i] = omap.NewStringKeyed()
	}

	// The pruning rate is a percentage of the number of instances. The
	// computation has to be done in floating point: an integer division by
	// 100 truncates every rate below 100 to zero, which silently disables
	// periodic pruning. The result is rounded to the nearest integer.
	numInst := len(inst.Instances())
	pruneRate := int64(math.Floor(float64(stwv.perdiodicPruningRate)/100*float64(numInst) + 0.5))

	// Tokenize all training text into an ordered map of "words".
	for i, instance := range inst.Instances() {
		vInd := 0
		if stwv.perClass && (classInd != -1) {
			vInd = int(instance.RealValues()[classInd])
		}

		// Set of distinct words that occur in the current instance.
		seen := make(map[string]struct{})

		// Iterate through all relevant string attributes of the current
		// instance.
		for j := 0; j < instance.NumAttributes(); j++ {
			if instance.IsMissingValue(j) || !inst.Attributes()[j].IsString() {
				continue
			}
			for _, word := range strings.Fields(instance.Values()[j]) {
				seen[word] = struct{}{}
				if entry, present := dicA[vInd].Find(word); !present {
					dicA[vInd].Insert(word, Count{1, 0})
				} else {
					count, _ := entry.(Count)
					count.Count++
					dicA[vInd].Insert(word, count)
				}
			}
		}

		// Update the document count of every word that occurred in this
		// instance (document); each word is counted once per instance.
		for word := range seen {
			entry, present := dicA[vInd].Find(word)
			if !present {
				panic("Check the code, there must be a word in the dictionary")
			}
			count := entry.(Count)
			count.DocCount++
			dicA[vInd].Insert(word, count)
			// NOTE(review): debugging output, emitted once per distinct
			// word of every instance.
			fmt.Println(dicA[vInd].Find(word))
		}

		// Periodic pruning: every pruneRate instances drop the words that
		// have been seen at most once so far.
		if pruneRate > 0 && i > 0 && int64(i)%pruneRate == 0 {
			for z := 0; z < values; z++ {
				// Collect first and delete afterwards so that the map is
				// not modified while it is being traversed.
				var rare []string
				dicA[z].Do(func(key, value interface{}) {
					word, _ := key.(string)
					count, _ := value.(Count)
					if count.Count <= 1 {
						rare = append(rare, word)
					}
				})
				for _, word := range rare {
					dicA[z].Delete(word)
				}
			}
		}
	}

	// Figure out the minimum required word frequency.
	totalSize := 0
	prune := make([]int, values)
	for z := 0; z < values; z++ {
		totalSize += dicA[z].Len()
		array := make([]int, dicA[z].Len())
		pos := 0
		dicA[z].Do(func(key, value interface{}) {
			count, _ := value.(Count)
			array[pos] = count.Count
			pos++
		})
		sort.Ints(array)
		fmt.Println(array)
		if len(array) < stwv.wordsToKeep {
			// If there aren't enough words, set the threshold to minFreq.
			prune[z] = int(stwv.minTermFreq)
		} else {
			// Otherwise use the count of the wordsToKeep-th most frequent
			// word, but at least minFreq.
			// NOTE(review): assumes wordsToKeep > 0; otherwise idx is out
			// of range.
			idx := len(array) - stwv.wordsToKeep
			prune[z] = int(math.Max(float64(stwv.minTermFreq), float64(array[idx])))
		}
	}

	// Convert the dictionary into an attribute index
	// and create one attribute per word.
	attributes := make([]data.Attribute, 0, totalSize+len(inst.Attributes()))
	fmt.Println(totalSize+len(inst.Attributes()), "len(attributes)")

	// Add the non-converted attributes and remember where the class
	// attribute ends up in the new attribute list.
	// NOTE(review): this loop reads stwv.inputFormat while the rest of the
	// function reads inst — presumably both describe the same data; confirm.
	classIndex := -1
	for i, attr := range stwv.inputFormat.Attributes() {
		if !attr.IsString() {
			if inst.ClassIndex() == i {
				classIndex = len(attributes)
			}
			attributes = append(attributes, attr)
		}
	}

	// Add the word vector attributes (eliminating duplicates
	// that occur in multiple classes).
	newDic := omap.NewStringKeyed()
	index := len(attributes)
	for z := 0; z < values; z++ {
		dicA[z].Do(func(key, value interface{}) {
			word, _ := key.(string)
			count, _ := value.(Count)
			if count.Count >= prune[z] {
				if _, present := newDic.Find(word); !present {
					// The dictionary maps each kept word to the index of
					// its attribute in the output format.
					newDic.Insert(word, int(index))
					index++
					att := data.NewAttribute()
					att.SetName(word)
					att.SetType(data.NUMERIC)
					attributes = append(attributes, att)
				}
			}
		})
	}

	// Compute document frequencies: for every kept word, sum its document
	// counts over all per-class dictionaries and store the total at the
	// word's attribute index.
	stwv.docsCounts = make([]int, len(attributes))
	newDic.Do(func(key, value interface{}) {
		word, _ := key.(string)
		idx, _ := value.(int)
		docsCount := 0
		for j := 0; j < values; j++ {
			if entry, present := dicA[j].Find(word); present {
				count := entry.(Count)
				docsCount += count.DocCount
			}
		}
		stwv.docsCounts[idx] = docsCount
	})
	fmt.Println("doc: ", stwv.docsCounts)

	// Publish the results on the filter.
	stwv.dictionary = newDic
	stwv.numInstances = len(inst.Instances())
	stwv.outputFormat = data.NewInstances()
	stwv.outputFormat.SetAttributes(attributes)
	stwv.outputFormat.SetClassIndex(classIndex)
}