// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
// the provided base.Instances.
func (b *BaggedModel) generateTrainingAttrs(model int, from *base.Instances) []base.Attribute {
	ret := make([]base.Attribute, 0)
	if b.RandomFeatures == 0 {
		for j := 0; j < from.Cols; j++ {
			attr := from.GetAttr(j)
			ret = append(ret, attr)
		}
	} else {
		for {
			if len(ret) >= b.RandomFeatures {
				break
			}
			attrIndex := rand.Intn(from.Cols)
			if attrIndex == from.ClassIndex {
				continue
			}
			attr := from.GetAttr(attrIndex)
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}
	ret = append(ret, from.GetClassAttr())
	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
// GetSplitAttributeFromSelection returns the Attribute amongst
// consideredAttributes which maximises the information gain.
//
// IMPORTANT: passing a zero-length consideredAttributes parameter will panic()
func (r *InformationGainRuleGenerator) GetSplitAttributeFromSelection(consideredAttributes []int, f *base.Instances) base.Attribute {
	// Next step is to compute the information gain at this node
	// for each randomly chosen attribute, and pick the one
	// which maximises it
	maxGain := math.Inf(-1)
	selectedAttribute := -1

	// Compute the base entropy
	classDist := f.GetClassDistribution()
	baseEntropy := getBaseEntropy(classDist)

	// Compute the information gain for each attribute
	for _, s := range consideredAttributes {
		proposedClassDist := f.GetClassDistributionAfterSplit(f.GetAttr(s))
		localEntropy := getSplitEntropy(proposedClassDist)
		informationGain := baseEntropy - localEntropy
		if informationGain > maxGain {
			maxGain = informationGain
			selectedAttribute = s
		}
	}

	// Pick the one which maximises IG
	return f.GetAttr(selectedAttribute)
}
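// Illustrative sketch (not part of the original source): one way a caller
// might drive GetSplitAttributeFromSelection. It assumes `inst` is a
// *base.Instances whose class attribute sits at inst.ClassIndex; the helper
// name and the choice of candidate indices are hypothetical.
func exampleSelectSplit(inst *base.Instances) base.Attribute {
	r := new(InformationGainRuleGenerator)
	// Consider every non-class attribute; a random-forest style caller
	// would instead pass a random subset of indices here.
	considered := make([]int, 0)
	for j := 0; j < inst.Cols; j++ {
		if j != inst.ClassIndex {
			considered = append(considered, j)
		}
	}
	// Panics if `considered` is empty, as documented above.
	return r.GetSplitAttributeFromSelection(considered, inst)
}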
// Run discretises the set of Instances `on'
//
// IMPORTANT: ChiMergeFilter discretises in place.
func (c *ChiMergeFilter) Run(on *base.Instances) {
	if !c._Trained {
		panic("Call Build() beforehand")
	}
	for attr := range c.Tables {
		table := c.Tables[attr]
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			dis := 0
			for j, k := range table {
				if k.Value < val {
					dis = j
					continue
				}
				break
			}
			on.Set(i, attr, float64(dis))
		}
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for _, k := range table {
			newAttribute.GetSysValFromString(fmt.Sprintf("%f", k.Value))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
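// Illustrative sketch (not part of the original source): a typical
// Build-then-Run sequence for the filter above. NewChiMergeFilter and
// AddAllNumericAttributes are assumed to exist with these rough signatures
// in the surrounding package; treat the exact constructor arguments
// (here, a 0.90 significance level) as placeholders.
func exampleChiMerge(inst *base.Instances) {
	filt := NewChiMergeFilter(inst, 0.90) // assumed constructor
	filt.AddAllNumericAttributes()        // assumed helper to select attributes
	filt.Build()                          // must precede Run, or Run panics
	filt.Run(inst)                        // discretises inst in place
}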
// Run applies a trained BinningFilter to a set of Instances,
// discretising any numeric attributes added.
//
// IMPORTANT: Run discretises in-place, so make sure to take
// a copy if the original instances are still needed
//
// IMPORTANT: This function panic()s if the filter has not been
// trained. Call Build() before running this function
//
// IMPORTANT: Call Build() after adding any additional attributes.
// Otherwise, the training structure will be out of date from
// the values expected and could cause a panic.
func (b *BinningFilter) Run(on *base.Instances) {
	if !b.trained {
		panic("Call Build() beforehand")
	}
	for attr := range b.Attributes {
		minVal := b.MinVals[attr]
		maxVal := b.MaxVals[attr]
		disc := 0
		// Casts to float32 to replicate a floating point precision error
		delta := float32(maxVal - minVal)
		delta /= float32(b.BinCount)
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			if val <= minVal {
				disc = 0
			} else {
				disc = int(math.Floor(float64(float32(val-minVal) / delta)))
				if disc >= b.BinCount {
					disc = b.BinCount - 1
				}
			}
			on.Set(i, attr, float64(disc))
		}
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for i := 0; i < b.BinCount; i++ {
			newAttribute.GetSysValFromString(fmt.Sprintf("%d", i))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
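// Illustrative sketch (not part of the original source): the bin-index
// arithmetic from Run above, isolated as a hypothetical standalone helper so
// the formula is easier to follow. The float32 casts mirror the ones in Run.
// With minVal=0, maxVal=10 and binCount=5, a value of 7.3 lands in bin 3,
// and values at or above maxVal are clamped into the last bin.
func exampleBinIndex(val, minVal, maxVal float64, binCount int) int {
	if val <= minVal {
		return 0
	}
	delta := float32(maxVal-minVal) / float32(binCount)
	disc := int(math.Floor(float64(float32(val-minVal) / delta)))
	if disc >= binCount {
		disc = binCount - 1
	}
	return disc
}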
// ChiMBuildFrequencyTable counts, for each distinct value of the given
// FloatAttribute, how often that value occurs with each class label,
// returning one FrequencyTableEntry per distinct value.
func ChiMBuildFrequencyTable(attr int, inst *base.Instances) []*FrequencyTableEntry {
	ret := make([]*FrequencyTableEntry, 0)
	var attribute *base.FloatAttribute
	attribute, ok := inst.GetAttr(attr).(*base.FloatAttribute)
	if !ok {
		panic("only use Chi-M on numeric stuff")
	}
	for i := 0; i < inst.Rows; i++ {
		value := inst.Get(i, attr)
		valueConv := attribute.GetUsrVal(value)
		class := inst.GetClass(i)
		// Search the frequency table for the value
		found := false
		for _, entry := range ret {
			if entry.Value == valueConv {
				found = true
				entry.Frequency[class] += 1
			}
		}
		if !found {
			newEntry := &FrequencyTableEntry{
				valueConv,
				make(map[string]int),
			}
			newEntry.Frequency[class] = 1
			ret = append(ret, newEntry)
		}
	}
	return ret
}
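// Illustrative sketch (not part of the original source): builds and prints a
// frequency table for attribute 0. Assumes `inst` is a *base.Instances whose
// attribute 0 is a *base.FloatAttribute (anything else panics above) and that
// "fmt" is imported; the function and variable names are illustrative only.
func exampleFrequencyTable(inst *base.Instances) {
	table := ChiMBuildFrequencyTable(0, inst)
	for _, entry := range table {
		// entry.Value is the attribute value; entry.Frequency maps each
		// class label to the number of rows with that value and class.
		fmt.Println(entry.Value, entry.Frequency)
	}
}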