//New StringToWordVector function with default values func NewStringToWordVectorInst(inputData data.Instances) StringToWordVector { var stwv StringToWordVector stwv.dictionary = omap.NewStringKeyed() stwv.outputsCounts = false stwv.docsCounts = make([]int, 0) stwv.avgDocLength = -1 stwv.wordsToKeep = 1000 stwv.numInstances = -1 stwv.perdiodicPruningRate = -1 stwv.minTermFreq = 1 stwv.perClass = true stwv.normalize = true stwv.inputFormat = inputData stwv.outputFormat = data.NewInstancesWithClassIndex(inputData.ClassIndex()) stwv.firstTime = true stwv.tf_transformation, stwv.idf_transformation = true, true return stwv }
func (stwv *StringToWordVector) determineDictionary(inst *data.Instances) { /* TODO: see if use a stopwords list*/ fmt.Println("Determing dictionary!") classInd := inst.ClassIndex() values := 1 if stwv.perClass && (classInd != -1) { values = len(inst.Attributes()[classInd].Values()) } dicA := make([]*omap.Map, values) for i := 0; i < values; i++ { dicA[i] = omap.NewStringKeyed() } // Tokenize all training text into an orderedMap of "words". pruneRate := int64((stwv.perdiodicPruningRate / 100) * len(inst.Instances())) for i, instance := range inst.Instances() { vInd := int(0) if stwv.perClass && (classInd != -1) { vInd = int(instance.RealValues()[classInd]) } //Iterate through all relevant string attributes of the current instance hashtable := make(map[string]int, 0) for j := 0; j < instance.NumAttributes(); j++ { if !instance.IsMissingValue(j) && inst.Attributes()[j].IsString() { // Iterate through tokens, perform stemming, and remove stopwords // (if required) //fmt.Println(instance.Values()) words := strings.Fields(instance.Values()[j]) for _, word := range words { _, present := hashtable[word] if !present { hashtable[word] = 0 } //fmt.Println(word) if count, present := dicA[vInd].Find(word); !present { dicA[vInd].Insert(word, Count{1, 0}) } else { count, _ := count.(Count) count.Count++ dicA[vInd].Insert(word, count) } //fmt.Println(dicA[vInd][word]) } } } //updating the docCount for the words that have occurred in this //instance(document). enumeration := make([]string, 0, len(hashtable)) for word, _ := range hashtable { //only the words enumeration = append(enumeration, word) } for _, word := range enumeration { if count, present := dicA[vInd].Find(word); present { count := count.(Count) count.DocCount++ //delete(dicA[vInd], word) dicA[vInd].Insert(word, count) //fmt.Println(word, " ",dicA[vInd][word]) } else { panic("Check the code, there must be a word in the dictionary") } fmt.Println(dicA[vInd].Find(word)) } if pruneRate > 0 { if int64(i)%pruneRate == 0 && i > 0 { for z := 0; z < values; z++ { d := make([]string, 1000) dicA[z].Do(func(key, value interface{}) { word, _ := key.(string) count, _ := value.(Count) if count.Count <= 1 { d = append(d, word) } }) // for word, _ := range dicA[z] { // count := dicA[z][word] // if count.Count <= 1 { // d = append(d, word) // } // } for _, word := range d { dicA[z].Delete(word) //delete(dicA[z], word) } } } } //fmt.Println("new instance-----------------------------------------------------------") } //fmt.Println(dicA) // Figure out the minimum required word frequency totalSize := int(0) prune := make([]int, values) for z := 0; z < values; z++ { totalSize += dicA[z].Len() array := make([]int, dicA[z].Len()) pos := int(0) dicA[z].Do(func(key, value interface{}) { //_, _ := key.(string) count, _ := value.(Count) array[pos] = count.Count pos++ }) // for word, _ := range dicA[z] { // count := dicA[z][word] // array[pos] = count.Count // pos++ // } sort.Ints(array) fmt.Println(array) if len(array) < stwv.wordsToKeep { // if there aren't enough words, set the threshold to // minFreq prune[z] = int(stwv.minTermFreq) } else { // otherwise set it to be at least minFreq idx := len(array) - stwv.wordsToKeep prune[z] = int(math.Max(float64(stwv.minTermFreq), float64(array[idx]))) } //fmt.Println(prune[z]) } // Convert the dictionary into an attribute index // and create one attribute per word attributes := make([]data.Attribute, 0, totalSize+len(inst.Attributes())) fmt.Println(totalSize+len(inst.Attributes()), "len(attributes)") // Add the non-converted attributes classIndex := int(-1) for i, attr := range stwv.inputFormat.Attributes() { if !attr.IsString() { if inst.ClassIndex() == i { classIndex = len(attributes) } //fmt.Println(attr) attributes = append(attributes, attr) } } // Add the word vector attributes (eliminating duplicates // that occur in multiple classes) newDic := omap.NewStringKeyed() index := len(attributes) for z := 0; z < values; z++ { dicA[z].Do(func(key, value interface{}) { word, _ := key.(string) count, _ := value.(Count) if count.Count >= prune[z] { if _, present := newDic.Find(word); !present { newDic.Insert(word, int(index)) index++ att := data.NewAttribute() att.SetName(word) att.SetType(data.NUMERIC) attributes = append(attributes, att) //fmt.Println(index) } } }) // for word, _ := range dicA[z] { // count := dicA[z][word] // //fmt.Println(count.Count, prune[z]) // if count.Count >= prune[z] { // if _, present := newDic[word]; !present { // newDic[word] = float64(index) // index++ // att := data.NewAttribute() // att.SetName(word) // att.SetType(data.STRING) // attributes = append(attributes, att) // fmt.Println(index) // } // } // } } //fmt.Println(newDic) // Compute document frequencies stwv.docsCounts = make([]int, len(attributes)) //idx := 0 newDic.Do(func(key, value interface{}) { word, _ := key.(string) idx, _ := value.(int) docsCount := 0 for j := 0; j < values; j++ { if count, present := dicA[j].Find(word); present { count := count.(Count) //fmt.Println(count.DocCount, "doccount newdic") docsCount += count.DocCount } } stwv.docsCounts[idx] = docsCount }) // for word, idx := range newDic { // docsCount := 0 // for j := 0; j < values; j++ { // if count, present := dicA[j][word]; present { // docsCount += count.DocCount // } // } // stwv.docsCounts[int(idx)] = docsCount // //idx++ // } fmt.Println("doc: ", stwv.docsCounts) stwv.dictionary = newDic ////fmt.Println("numInst", len(inst.Instances())) stwv.numInstances = len(inst.Instances()) stwv.outputFormat = data.NewInstances() stwv.outputFormat.SetAttributes(attributes) stwv.outputFormat.SetClassIndex(classIndex) }