// Prune eliminates branches which hurt accuracy
func (d *DecisionTreeNode) Prune(using *base.Instances) {
	// If you're a leaf, you're already pruned
	if d.Children == nil {
		return
	}
	if d.SplitAttr == nil {
		return
	}

	// Recursively prune children of this node
	sub := using.DecomposeOnAttributeValues(d.SplitAttr)
	for k := range d.Children {
		if sub[k] == nil {
			continue
		}
		d.Children[k].Prune(sub[k])
	}

	// Get a baseline accuracy
	baselineAccuracy := computeAccuracy(d.Predict(using), using)

	// Speculatively remove the children and re-evaluate
	tmpChildren := d.Children
	d.Children = nil
	newAccuracy := computeAccuracy(d.Predict(using), using)

	// Keep the children removed if accuracy improved; otherwise restore them
	if newAccuracy < baselineAccuracy {
		d.Children = tmpChildren
	}
}
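// A minimal usage sketch for Prune, assuming the caller already holds
// pre-split training and held-out validation sets (both parameter names are
// hypothetical), and assuming InformationGainRuleGenerator satisfies the
// RuleGenerator interface used by InferID3Tree later in this listing.
func examplePrune(train, validation *base.Instances) *DecisionTreeNode {
	// Grow the full tree on the training data...
	tree := InferID3Tree(train, new(InformationGainRuleGenerator))
	// ...then collapse any subtrees that hurt validation accuracy
	tree.Prune(validation)
	return tree
}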
// GenerateSplitAttribute returns the best attribute out of those randomly chosen
// which maximises Information Gain
func (r *RandomTreeRuleGenerator) GenerateSplitAttribute(f *base.Instances) base.Attribute {
	// First step is to generate the random attributes that we'll consider
	maximumAttribute := f.GetAttributeCount()
	// Note: the slice must start empty; make([]int, r.Attributes) would
	// pre-fill it with zeroes and terminate the selection loop immediately
	consideredAttributes := make([]int, 0, r.Attributes)
	for len(consideredAttributes) < r.Attributes {
		selectedAttribute := rand.Intn(maximumAttribute)
		// Never split on the class attribute
		if selectedAttribute == f.ClassIndex {
			continue
		}
		// Skip attributes we've already chosen
		matched := false
		for _, a := range consideredAttributes {
			if a == selectedAttribute {
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
	}
	return r.internalRule.GetSplitAttributeFromSelection(consideredAttributes, f)
}
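// An alternative sketch of the same selection step: rand.Perm yields each
// index at most once, so no rejection loop is needed. Illustrative only, not
// part of the generator above; it returns fewer than k indices if there are
// not enough non-class attributes.
func sketchChooseAttributes(attributeCount, classIndex, k int) []int {
	chosen := make([]int, 0, k)
	for _, idx := range rand.Perm(attributeCount) {
		if idx == classIndex {
			continue // never split on the class attribute
		}
		chosen = append(chosen, idx)
		if len(chosen) == k {
			break
		}
	}
	return chosen
}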
func convertInstancesToLabelVec(X *base.Instances) []float64 {
	labelVec := make([]float64, X.Rows)
	for i := 0; i < X.Rows; i++ {
		labelVec[i] = X.Get(i, X.ClassIndex)
	}
	return labelVec
}
func (KNN *KNNClassifier) Predict(what *base.Instances) *base.Instances {
	ret := what.GeneratePredictionVector()
	for i := 0; i < what.Rows; i++ {
		ret.SetAttrStr(i, 0, KNN.PredictOne(what.GetRowVectorWithoutClass(i)))
	}
	return ret
}
// Run discretises the set of Instances `on'
//
// IMPORTANT: ChiMergeFilter discretises in place.
func (c *ChiMergeFilter) Run(on *base.Instances) {
	if !c._Trained {
		panic("Call Build() beforehand")
	}
	for attr := range c.Tables {
		table := c.Tables[attr]
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			// Find the index of the last interval boundary below val
			dis := 0
			for j, k := range table {
				if k.Value < val {
					dis = j
					continue
				}
				break
			}
			on.Set(i, attr, float64(dis))
		}
		// Replace the numeric attribute with a categorical one whose
		// values are the interval boundaries
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for _, k := range table {
			newAttribute.GetSysValFromString(fmt.Sprintf("%f", k.Value))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
// GetSplitAttributeFromSelection returns the Attribute amongst
// consideredAttributes which maximises the information gain
//
// IMPORTANT: passing a zero-length consideredAttributes parameter will panic()
func (r *InformationGainRuleGenerator) GetSplitAttributeFromSelection(consideredAttributes []int, f *base.Instances) base.Attribute {
	// Compute the base entropy from the overall class distribution
	classDist := f.GetClassDistribution()
	baseEntropy := getBaseEntropy(classDist)

	// Compute the information gain for each considered attribute
	// and pick the one which maximises it
	maxGain := math.Inf(-1)
	selectedAttribute := -1
	for _, s := range consideredAttributes {
		proposedClassDist := f.GetClassDistributionAfterSplit(f.GetAttr(s))
		localEntropy := getSplitEntropy(proposedClassDist)
		informationGain := baseEntropy - localEntropy
		if informationGain > maxGain {
			maxGain = informationGain
			selectedAttribute = s
		}
	}

	// Return the attribute with the highest information gain
	return f.GetAttr(selectedAttribute)
}
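// For reference, a standalone sketch of the Shannon entropy that
// getBaseEntropy is expected to compute over a class-distribution map
// (label -> count). The real implementation elsewhere in the package may
// differ in detail.
func sketchBaseEntropy(classDist map[string]int) float64 {
	total := 0
	for _, count := range classDist {
		total += count
	}
	entropy := 0.0
	for _, count := range classDist {
		if count == 0 {
			continue
		}
		p := float64(count) / float64(total)
		entropy -= p * math.Log2(p) // H = -sum(p * log2(p))
	}
	return entropy
}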
// Run applies a trained BinningFilter to a set of Instances,
// discretising any numeric attributes added.
//
// IMPORTANT: Run discretises in-place, so make sure to take
// a copy if the original instances are still needed
//
// IMPORTANT: This function panic()s if the filter has not been
// trained. Call Build() before running this function
//
// IMPORTANT: Call Build() after adding any additional attributes.
// Otherwise, the training structure will be out of date from
// the values expected and could cause a panic.
func (b *BinningFilter) Run(on *base.Instances) {
	if !b.trained {
		panic("Call Build() beforehand")
	}
	for attr := range b.Attributes {
		minVal := b.MinVals[attr]
		maxVal := b.MaxVals[attr]
		disc := 0
		// Casts to float32 to replicate a floating point precision error
		delta := float32(maxVal - minVal)
		delta /= float32(b.BinCount)
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			if val <= minVal {
				disc = 0
			} else {
				disc = int(math.Floor(float64(float32(val-minVal) / delta)))
				if disc >= b.BinCount {
					disc = b.BinCount - 1
				}
			}
			on.Set(i, attr, float64(disc))
		}
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for i := 0; i < b.BinCount; i++ {
			newAttribute.GetSysValFromString(fmt.Sprintf("%d", i))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
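// A standalone sketch of the binning arithmetic above, in float64 throughout:
// a value maps to floor((val - min) / delta) with delta = (max - min) / bins,
// clamped to the final bin. Names here are illustrative only.
func sketchBinIndex(val, min, max float64, binCount int) int {
	if val <= min {
		return 0
	}
	delta := (max - min) / float64(binCount)
	bin := int(math.Floor((val - min) / delta))
	if bin >= binCount {
		bin = binCount - 1 // values at or above max land in the last bin
	}
	return bin
}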
func convertInstancesToProblemVec(X *base.Instances) [][]float64 {
	problemVec := make([][]float64, X.Rows)
	for i := 0; i < X.Rows; i++ {
		problemVecCounter := 0
		problemVec[i] = make([]float64, X.Cols-1)
		for j := 0; j < X.Cols; j++ {
			// Copy every column except the class attribute
			if j == X.ClassIndex {
				continue
			}
			problemVec[i][problemVecCounter] = X.Get(i, j)
			problemVecCounter++
		}
	}
	return problemVec
}
func (lr *LogisticRegression) Predict(X *base.Instances) *base.Instances {
	ret := X.GeneratePredictionVector()
	row := make([]float64, X.Cols-1)
	for i := 0; i < X.Rows; i++ {
		// Build the feature vector for this row, skipping the class column
		rowCounter := 0
		for j := 0; j < X.Cols; j++ {
			if j != X.ClassIndex {
				row[rowCounter] = X.Get(i, j)
				rowCounter++
			}
		}
		ret.Set(i, 0, Predict(lr.model, row))
	}
	return ret
}
func (lr *LinearRegression) Predict(X *base.Instances) (*base.Instances, error) {
	if !lr.fitted {
		return nil, NoTrainingDataError
	}
	ret := X.GeneratePredictionVector()
	for i := 0; i < X.Rows; i++ {
		// Start from the intercept term estimated during fitting
		var prediction float64 = lr.disturbance
		for j := 0; j < X.Cols; j++ {
			if j != X.ClassIndex {
				// Indexing coefficients by column assumes the class
				// attribute is the final column, mirroring Fit below
				prediction += X.Get(i, j) * lr.regressionCoefficients[j]
			}
		}
		ret.Set(i, 0, prediction)
	}
	return ret, nil
}
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref')
// and generated (`gen') Instances.
func GetConfusionMatrix(ref *base.Instances, gen *base.Instances) map[string]map[string]int {
	if ref.Rows != gen.Rows {
		panic("Row counts should match")
	}
	ret := make(map[string]map[string]int)
	for i := 0; i < ref.Rows; i++ {
		referenceClass := ref.GetClass(i)
		predictedClass := gen.GetClass(i)
		if _, ok := ret[referenceClass]; ok {
			ret[referenceClass][predictedClass]++
		} else {
			ret[referenceClass] = make(map[string]int)
			ret[referenceClass][predictedClass] = 1
		}
	}
	return ret
}
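// A minimal sketch of how the confusion matrix might be consumed: summing the
// diagonal (reference == predicted) against the total count gives the raw
// accuracy. This helper is hypothetical and not part of the code above.
func sketchAccuracyFromConfusionMatrix(cm map[string]map[string]int) float64 {
	correct, total := 0, 0
	for refClass, row := range cm {
		for predClass, count := range row {
			if refClass == predClass {
				correct += count
			}
			total += count
		}
	}
	if total == 0 {
		return 0
	}
	return float64(correct) / float64(total)
}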
// Predict outputs a base.Instances containing predictions from this tree
func (d *DecisionTreeNode) Predict(what *base.Instances) *base.Instances {
	outputAttrs := make([]base.Attribute, 1)
	outputAttrs[0] = what.GetClassAttr()
	predictions := base.NewInstances(outputAttrs, what.Rows)
	for i := 0; i < what.Rows; i++ {
		cur := d
		for {
			if cur.Children == nil {
				// Reached a leaf: emit its class
				predictions.SetAttrStr(i, 0, cur.Class)
				break
			}
			at := cur.SplitAttr
			j := what.GetAttrIndex(at)
			if j == -1 {
				// The split attribute is missing from the input;
				// fall back to this node's majority class
				predictions.SetAttrStr(i, 0, cur.Class)
				break
			}
			classVar := at.GetStringFromSysVal(what.Get(i, j))
			if next, ok := cur.Children[classVar]; ok {
				cur = next
			} else {
				// No child matches this value, so pick a substitute.
				// Note that Go map iteration order is not deterministic,
				// so the child chosen here is effectively arbitrary.
				var bestChild string
				for c := range cur.Children {
					bestChild = c
					if c > classVar {
						break
					}
				}
				cur = cur.Children[bestChild]
			}
		}
	}
	return predictions
}
func ChiMBuildFrequencyTable(attr int, inst *base.Instances) []*FrequencyTableEntry {
	ret := make([]*FrequencyTableEntry, 0)
	attribute, ok := inst.GetAttr(attr).(*base.FloatAttribute)
	if !ok {
		panic("only use Chi-M on numeric stuff")
	}
	for i := 0; i < inst.Rows; i++ {
		value := inst.Get(i, attr)
		valueConv := attribute.GetUsrVal(value)
		class := inst.GetClass(i)
		// Search the frequency table for the value
		found := false
		for _, entry := range ret {
			if entry.Value == valueConv {
				found = true
				entry.Frequency[class]++
				break // values are unique within the table
			}
		}
		if !found {
			newEntry := &FrequencyTableEntry{
				valueConv,
				make(map[string]int),
			}
			newEntry.Frequency[class] = 1
			ret = append(ret, newEntry)
		}
	}
	return ret
}
// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
// the provided base.Instances.
func (b *BaggedModel) generateTrainingAttrs(model int, from *base.Instances) []base.Attribute {
	ret := make([]base.Attribute, 0)
	if b.RandomFeatures == 0 {
		// Zero means use every attribute
		for j := 0; j < from.Cols; j++ {
			attr := from.GetAttr(j)
			ret = append(ret, attr)
		}
	} else {
		// Sample distinct non-class attributes until we have enough
		for len(ret) < b.RandomFeatures {
			attrIndex := rand.Intn(from.Cols)
			if attrIndex == from.ClassIndex {
				continue
			}
			attr := from.GetAttr(attrIndex)
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}
	ret = append(ret, from.GetClassAttr())
	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
// generateTrainingInstances generates RandomFeatures number of
// attributes and returns a modified version of base.Instances
// for training the model
func (b *BaggedModel) generateTrainingInstances(model int, from *base.Instances) *base.Instances {
	insts := from.SampleWithReplacement(from.Rows)
	selected := b.generateTrainingAttrs(model, from)
	return insts.SelectAttributes(selected)
}
// generatePredictionInstances returns a modified version of the
// requested base.Instances with only the base.Attributes selected
// for training the model.
func (b *BaggedModel) generatePredictionInstances(model int, from *base.Instances) *base.Instances {
	selected := b.selectedAttributes[model]
	return from.SelectAttributes(selected)
}
// Predict gathers predictions from all the classifiers
// and outputs the most common (majority) class
//
// IMPORTANT: in the event of a tie, the first class which
// achieved the tie value is output.
func (b *BaggedModel) Predict(from *base.Instances) *base.Instances {
	n := runtime.NumCPU()
	// Channel to receive the results as they come in
	votes := make(chan *base.Instances, n)
	// Count the votes for each class
	voting := make(map[int]map[string]int)

	// Create a goroutine to collect the votes
	var votingwait sync.WaitGroup
	votingwait.Add(1)
	go func() {
		for {
			incoming, ok := <-votes
			if !ok {
				votingwait.Done()
				break
			}
			// Step through each prediction
			for j := 0; j < incoming.Rows; j++ {
				// Create the vote entry if we haven't seen this row before
				if _, ok := voting[j]; !ok {
					voting[j] = make(map[string]int)
				}
				voting[j][incoming.GetClass(j)]++
			}
		}
	}()

	// Create workers to process the predictions
	processpipe := make(chan int, n)
	var processwait sync.WaitGroup
	for i := 0; i < n; i++ {
		processwait.Add(1)
		go func() {
			for {
				if m, ok := <-processpipe; ok {
					c := b.Models[m]
					l := b.generatePredictionInstances(m, from)
					votes <- c.Predict(l)
				} else {
					processwait.Done()
					break
				}
			}
		}()
	}

	// Send all the models to the workers for prediction
	for i := range b.Models {
		processpipe <- i
	}
	close(processpipe) // Finished sending models to be predicted
	processwait.Wait() // Predictors all finished processing
	close(votes)       // Close the vote channel and allow it to drain
	votingwait.Wait()  // All the votes are in

	// Generate the overall consensus
	ret := from.GeneratePredictionVector()
	for i := range voting {
		maxClass := ""
		maxCount := 0
		// Find the most popular class
		for c := range voting[i] {
			votes := voting[i][c]
			if votes > maxCount {
				maxClass = c
				maxCount = votes
			}
		}
		ret.SetAttrStr(i, 0, maxClass)
	}
	return ret
}
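// A standalone sketch of the consensus step above: given one predicted label
// per model for a single row, return the label with the most votes. With a
// strict '>' comparison over an ordered slice, the first label to reach the
// winning count is kept on ties.
func sketchMajorityVote(labels []string) string {
	counts := make(map[string]int)
	best, bestCount := "", 0
	for _, label := range labels {
		counts[label]++
		if counts[label] > bestCount {
			best, bestCount = label, counts[label]
		}
	}
	return best
}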
func (lr *LinearRegression) Fit(inst *base.Instances) error {
	if inst.Rows < inst.GetAttributeCount() {
		return NotEnoughDataError
	}

	// Split into two matrices, observed results (dependent variable y)
	// and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(inst.Rows, 1, nil)
	explVariables := mat64.NewDense(inst.Rows, inst.GetAttributeCount(), nil)

	for i := 0; i < inst.Rows; i++ {
		observed.Set(i, 0, inst.Get(i, inst.ClassIndex)) // Set observed data
		for j := 0; j < inst.GetAttributeCount(); j++ {
			if j == 0 {
				// Set intercepts to 1.0
				// Could / should be done better: http://www.theanalysisfactor.com/interpret-the-intercept/
				explVariables.Set(i, 0, 1.0)
			} else {
				// Note: this shifts every attribute one column left,
				// which assumes the class attribute is the final column
				explVariables.Set(i, j, inst.Get(i, j-1))
			}
		}
	}

	// Solve the least-squares problem via QR decomposition:
	// X = QR, so the coefficients satisfy R*beta = Q^T * y
	n := inst.GetAttributeCount()
	qr := mat64.QR(explVariables)
	q := qr.Q()
	reg := qr.R()

	var transposed, qty mat64.Dense
	transposed.TCopy(q)
	qty.Mul(&transposed, observed)

	// Back-substitute through the upper-triangular R to recover beta
	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true
	return nil
}
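// A standalone sketch of the back-substitution step above: solving R*x = b
// for an upper-triangular R, represented here as a plain [][]float64 purely
// for illustration (the code above works on mat64 matrices instead).
func sketchBackSubstitute(R [][]float64, b []float64) []float64 {
	n := len(b)
	x := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		x[i] = b[i]
		// Subtract the contributions of already-solved unknowns
		for j := i + 1; j < n; j++ {
			x[i] -= R[i][j] * x[j]
		}
		x[i] /= R[i][i]
	}
	return x
}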
// InferID3Tree builds a decision tree using a RuleGenerator
// from a set of Instances (implements the ID3 algorithm)
func InferID3Tree(from *base.Instances, with RuleGenerator) *DecisionTreeNode {
	// Count the number of classes at this node
	classes := from.CountClassValues()

	// If there's only one class, return a DecisionTreeLeaf with
	// the only class available
	if len(classes) == 1 {
		maxClass := ""
		for i := range classes {
			maxClass = i
		}
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	// Compute the majority class at this node
	maxVal := 0
	maxClass := ""
	for i := range classes {
		if classes[i] > maxVal {
			maxClass = i
			maxVal = classes[i]
		}
	}

	// If there are no more Attributes left to split on,
	// return a DecisionTreeLeaf with the majority class
	if from.GetAttributeCount() == 2 {
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	// Generate a return structure
	ret := &DecisionTreeNode{
		RuleNode,
		nil,
		nil,
		classes,
		maxClass,
		from.GetClassAttrPtr(),
	}

	// Generate the splitting attribute
	splitOnAttribute := with.GenerateSplitAttribute(from)
	if splitOnAttribute == nil {
		// Can't determine, just return what we have
		return ret
	}

	// Split the Instances on this attribute's values
	splitInstances := from.DecomposeOnAttributeValues(splitOnAttribute)

	// Create new children from these splits
	ret.Children = make(map[string]*DecisionTreeNode)
	for k := range splitInstances {
		newInstances := splitInstances[k]
		ret.Children[k] = InferID3Tree(newInstances, with)
	}
	ret.SplitAttr = splitOnAttribute
	return ret
}
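// A minimal end-to-end sketch tying the pieces in this listing together,
// assuming pre-split train/test Instances and that the functions above are
// visible from one scope (in practice they may live in separate packages).
func exampleID3(train, test *base.Instances) map[string]map[string]int {
	// Grow a tree using information gain as the split criterion
	tree := InferID3Tree(train, new(InformationGainRuleGenerator))
	// Predict the test rows and cross-tabulate against the references
	predictions := tree.Predict(test)
	return GetConfusionMatrix(test, predictions)
}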