// GenerateSplitAttribute returns the best attribute out of those
// randomly chosen which maximises Information Gain
func (r *RandomTreeRuleGenerator) GenerateSplitAttribute(f *base.Instances) base.Attribute {

	// First step is to generate the random attributes that we'll consider
	maximumAttribute := f.GetAttributeCount()
	// Zero length, capacity r.Attributes: with a non-zero length the loop
	// below would exit immediately and return an all-zero selection
	consideredAttributes := make([]int, 0, r.Attributes)

	for len(consideredAttributes) < r.Attributes {
		selectedAttribute := rand.Intn(maximumAttribute)
		if selectedAttribute == f.ClassIndex {
			continue // never split on the class attribute
		}
		matched := false
		for _, a := range consideredAttributes {
			if a == selectedAttribute {
				matched = true
				break
			}
		}
		if matched {
			continue // already chosen, draw again
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
	}

	return r.internalRule.GetSplitAttributeFromSelection(consideredAttributes, f)
}
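The make() fix above matters: the original allocated the slice with length r.Attributes, so the loop's exit condition was satisfied immediately and an all-zeros selection came back. A standalone sketch of the same sampling idea, using only the standard library (all names here are illustrative, not part of golearn):

package main

import (
	"fmt"
	"math/rand"
)

// sampleAttributes draws k distinct column indices in [0, total),
// skipping the class column. It assumes k <= total-1, otherwise
// the loop could never terminate.
func sampleAttributes(total, classIndex, k int) []int {
	chosen := make([]int, 0, k) // zero length, capacity k
	seen := make(map[int]bool)
	for len(chosen) < k {
		candidate := rand.Intn(total)
		if candidate == classIndex || seen[candidate] {
			continue
		}
		seen[candidate] = true
		chosen = append(chosen, candidate)
	}
	return chosen
}

func main() {
	fmt.Println(sampleAttributes(5, 4, 2)) // e.g. [1 3]
}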
func (lr *LinearRegression) Fit(inst *base.Instances) error {
	if inst.Rows < inst.GetAttributeCount() {
		return NotEnoughDataError
	}

	// Split into two matrices: the observed results (dependent variable y)
	// and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(inst.Rows, 1, nil)
	explVariables := mat64.NewDense(inst.Rows, inst.GetAttributeCount(), nil)

	for i := 0; i < inst.Rows; i++ {
		observed.Set(i, 0, inst.Get(i, inst.ClassIndex)) // Set observed data
		for j := 0; j < inst.GetAttributeCount(); j++ {
			if j == 0 {
				// Set the intercept column to 1.0
				// Could / should be done better: http://www.theanalysisfactor.com/interpret-the-intercept/
				explVariables.Set(i, 0, 1.0)
			} else {
				explVariables.Set(i, j, inst.Get(i, j-1))
			}
		}
	}

	// QR-decompose X, then solve R * coefficients = Q^T * y
	n := inst.GetAttributeCount()
	qr := mat64.QR(explVariables)
	q := qr.Q()
	reg := qr.R()

	var transposed, qty mat64.Dense
	transposed.TCopy(q)            // Q^T
	qty.Mul(&transposed, observed) // Q^T * y

	// Back substitution: R is upper-triangular, so work from the last row up
	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true
	return nil
}
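The final loop is a back substitution against the upper-triangular R. A minimal self-contained sketch of just that step, with R stored row-major in a plain slice (names are illustrative; this is not the mat64 API):

package main

import "fmt"

// solveUpperTriangular solves R*x = b by back substitution, where R is an
// n-by-n upper-triangular matrix stored row-major. It assumes the diagonal
// entries of R are non-zero (i.e. the design matrix had full column rank).
func solveUpperTriangular(r, b []float64, n int) []float64 {
	x := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		x[i] = b[i]
		for j := i + 1; j < n; j++ {
			x[i] -= r[i*n+j] * x[j]
		}
		x[i] /= r[i*n+i]
	}
	return x
}

func main() {
	// R = [2 1; 0 3], b = [4 6]  =>  x = [1 2]
	r := []float64{2, 1, 0, 3}
	b := []float64{4, 6}
	fmt.Println(solveUpperTriangular(r, b, 2)) // [1 2]
}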
// InferID3Tree builds a decision tree using a RuleGenerator
// from a set of Instances (implements the ID3 algorithm)
func InferID3Tree(from *base.Instances, with RuleGenerator) *DecisionTreeNode {
	// Count the number of classes at this node
	classes := from.CountClassValues()

	// If there's only one class, return a DecisionTreeLeaf with
	// the only class available
	if len(classes) == 1 {
		maxClass := ""
		for i := range classes {
			maxClass = i
		}
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	// Compute the majority class at this node
	maxVal := 0
	maxClass := ""
	for i := range classes {
		if classes[i] > maxVal {
			maxClass = i
			maxVal = classes[i]
		}
	}

	// If there are no more Attributes left to split on,
	// return a DecisionTreeLeaf with the majority class
	if from.GetAttributeCount() == 2 {
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	// Generate a return structure: this node will hold a splitting rule
	ret := &DecisionTreeNode{
		RuleNode,
		nil,
		nil,
		classes,
		maxClass,
		from.GetClassAttrPtr(),
	}

	// Generate the splitting attribute
	splitOnAttribute := with.GenerateSplitAttribute(from)
	if splitOnAttribute == nil {
		// Can't determine, just return what we have
		return ret
	}

	// Split the Instances on this attribute's values
	splitInstances := from.DecomposeOnAttributeValues(splitOnAttribute)

	// Create new children from these splits, recursing on each subset
	ret.Children = make(map[string]*DecisionTreeNode)
	for k := range splitInstances {
		newInstances := splitInstances[k]
		ret.Children[k] = InferID3Tree(newInstances, with)
	}
	ret.SplitAttr = splitOnAttribute
	return ret
}
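A minimal usage sketch, assuming the same golearn version as the code above: ParseCSVToInstances and InformationGainRuleGenerator are taken from golearn's base and trees packages, and the dataset path is a placeholder.

package main

import (
	"fmt"

	base "github.com/sjwhitworth/golearn/base"
	trees "github.com/sjwhitworth/golearn/trees"
)

func main() {
	// Load training data; the path is a placeholder
	inst, err := base.ParseCSVToInstances("datasets/tennis.csv", true)
	if err != nil {
		panic(err)
	}
	// Build the tree with plain Information Gain splits
	rule := new(trees.InformationGainRuleGenerator)
	root := trees.InferID3Tree(inst, rule)
	fmt.Println(root)
}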