// findBestSplit is an exploratory debugging routine: it walks every
// Attribute in the partition and prints the unpacked value of each row.
// delta is a placeholder for the best split quality found so far.
func findBestSplit(partition base.FixedDataGrid) {
	delta := math.Inf(-1) // lowest possible float64, not math.MinInt64

	attrs := partition.AllAttributes()
	classAttrs := partition.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	fmt.Println(delta)
	fmt.Println(classAttrs)
	fmt.Println(reflect.TypeOf(partition))
	fmt.Println(reflect.TypeOf(candidates))

	for i, n := range attrs {
		fmt.Println(i)
		fmt.Println(reflect.TypeOf(n))

		attributeSpec, _ := partition.GetAttribute(n)
		fmt.Println(partition.GetAttribute(n))

		_, rows := partition.Size()
		for j := 0; j < rows; j++ {
			data := partition.Get(attributeSpec, j)
			fmt.Println(base.UnpackBytesToFloat(data))
		}
	}
}
// Predict returns a prediction vector for X.
func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid {
	// Only support 1 class Attribute
	classAttrs := X.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic(fmt.Sprintf("%d Wrong number of classes", len(classAttrs)))
	}

	// Generate return structure
	ret := base.GeneratePredictionVector(X)
	classAttrSpecs := base.ResolveAttributes(ret, classAttrs)

	// Retrieve numeric non-class Attributes
	numericAttrs := base.NonClassFloatAttributes(X)
	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)

	// Allocate row storage
	row := make([]float64, len(numericAttrSpecs))
	X.MapOverRows(numericAttrSpecs, func(rowBytes [][]byte, rowNo int) (bool, error) {
		for i, r := range rowBytes {
			row[i] = base.UnpackBytesToFloat(r)
		}
		val := Predict(lr.model, row)
		vals := base.PackFloatToBytes(val)
		ret.Set(classAttrSpecs[0], rowNo, vals)
		return true, nil
	})

	return ret
}
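The body of Predict follows a pattern that recurs throughout this section: unpack each row's bytes into a float64 slice, run the model, and pack the result back into the prediction grid. Below is a minimal self-contained sketch of that pattern with plain slices standing in for the FixedDataGrid; scoreRow is a hypothetical stand-in for the underlying model call (Predict(lr.model, row) above), not a golearn API.

package main

import "fmt"

// scoreRow is a hypothetical stand-in for the model invocation;
// here it just sums the features.
func scoreRow(row []float64) float64 {
	var total float64
	for _, v := range row {
		total += v
	}
	return total
}

func main() {
	// Each inner slice plays the role of one unpacked grid row.
	rows := [][]float64{{1, 2}, {3, 4}}
	preds := make([]float64, len(rows))
	for i, row := range rows {
		preds[i] = scoreRow(row) // analogous to ret.Set(...) above
	}
	fmt.Println(preds) // [3 7]
}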
// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
// the provided base.Instances.
func (b *BaggedModel) generateTrainingAttrs(model int, from base.FixedDataGrid) []base.Attribute {
	ret := make([]base.Attribute, 0)
	attrs := base.NonClassAttributes(from)
	if b.RandomFeatures == 0 {
		ret = attrs
	} else {
		// Sample distinct Attributes by rejection until we have enough
		for len(ret) < b.RandomFeatures {
			attrIndex := rand.Intn(len(attrs))
			attr := attrs[attrIndex]
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}

	// Always include the class Attributes
	for _, a := range from.AllClassAttributes() {
		ret = append(ret, a)
	}

	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
// GenerateSplitRule returns the best attribute out of those randomly chosen
// which maximises Information Gain.
//
// Note: the selection loop never terminates if r.Attributes exceeds the
// number of candidate attributes.
func (r *RandomTreeRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {
	var consideredAttributes []base.Attribute

	// First step is to generate the random attributes that we'll consider
	allAttributes := base.AttributeDifferenceReferences(f.AllAttributes(), f.AllClassAttributes())
	maximumAttribute := len(allAttributes)

	for len(consideredAttributes) < r.Attributes {
		selectedAttrIndex := rand.Intn(maximumAttribute)
		selectedAttribute := allAttributes[selectedAttrIndex]
		matched := false
		for _, a := range consideredAttributes {
			if a.Equals(selectedAttribute) {
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
	}

	return r.internalRule.GetSplitRuleFromSelection(consideredAttributes, f)
}
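Both generateTrainingAttrs and GenerateSplitRule sample k distinct attributes by rejection: draw a random index, and redraw if it has already been taken. Here is a standalone sketch of the same loop over plain ints; sampleDistinct is an illustrative helper, not part of golearn.

package main

import (
	"fmt"
	"math/rand"
)

// sampleDistinct draws k distinct values from [0, n) by rejection,
// mirroring the attribute-selection loops above. It assumes k <= n;
// otherwise the loop would never terminate.
func sampleDistinct(k, n int) []int {
	seen := make(map[int]bool)
	out := make([]int, 0, k)
	for len(out) < k {
		idx := rand.Intn(n)
		if seen[idx] {
			continue // already taken: reject and redraw
		}
		seen[idx] = true
		out = append(out, idx)
	}
	return out
}

func main() {
	fmt.Println(sampleDistinct(3, 10)) // e.g. [7 1 4]
}

Using a map instead of the linear scan in the originals trades a little memory for O(1) duplicate checks; for the handful of attributes a tree split considers, either approach is fine.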
// GenerateSplitAttribute returns the non-class Attribute which maximises the
// information gain.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (r *InformationGainRuleGenerator) GenerateSplitAttribute(f base.FixedDataGrid) base.Attribute {
	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)
	return r.GetSplitAttributeFromSelection(candidates, f)
}
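For reference, information gain is the reduction in entropy produced by a split: IG = H(parent) - sum over children of (|child|/|parent|) * H(child). The following self-contained sketch computes it over class-count maps; entropy and informationGain are illustrative helpers under that definition, not the golearn implementation.

package main

import (
	"fmt"
	"math"
)

// entropy computes H = -sum(p * log2(p)) over a map of class counts.
func entropy(counts map[string]int) float64 {
	total := 0
	for _, c := range counts {
		total += c
	}
	h := 0.0
	for _, c := range counts {
		if c == 0 {
			continue
		}
		p := float64(c) / float64(total)
		h -= p * math.Log2(p)
	}
	return h
}

// informationGain is H(parent) minus the size-weighted entropy of the children.
func informationGain(parent map[string]int, children []map[string]int) float64 {
	parentTotal := 0
	for _, c := range parent {
		parentTotal += c
	}
	gain := entropy(parent)
	for _, child := range children {
		childTotal := 0
		for _, c := range child {
			childTotal += c
		}
		gain -= float64(childTotal) / float64(parentTotal) * entropy(child)
	}
	return gain
}

func main() {
	parent := map[string]int{"yes": 9, "no": 5}
	children := []map[string]int{{"yes": 6, "no": 2}, {"yes": 3, "no": 3}}
	fmt.Printf("%.3f\n", informationGain(parent, children)) // 0.048
}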
// GenerateSplitRule returns the non-class Attribute-based DecisionTreeRule
// which minimises the Gini impurity.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (g *GiniCoefficientRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {
	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)
	return g.GetSplitRuleFromSelection(candidates, f)
}
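Gini impurity for a node is 1 - sum(p_i^2) over the class proportions, and a split is scored by the size-weighted impurity of its children. A minimal sketch of the node-level calculation; giniImpurity is an illustrative helper, not the golearn implementation.

package main

import "fmt"

// giniImpurity computes 1 - sum(p_i^2) over a map of class counts.
func giniImpurity(counts map[string]int) float64 {
	total := 0
	for _, c := range counts {
		total += c
	}
	impurity := 1.0
	for _, c := range counts {
		p := float64(c) / float64(total)
		impurity -= p * p
	}
	return impurity
}

func main() {
	// A pure node scores 0; a 50/50 two-class node scores 0.5.
	fmt.Println(giniImpurity(map[string]int{"yes": 4}))          // 0
	fmt.Println(giniImpurity(map[string]int{"yes": 3, "no": 3})) // 0.5
}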
// generateClassWeightVectorFromFixed returns a uniform two-class weight
// vector ([1.0, 1.0]); it panics unless the grid has exactly one class
// Attribute and that Attribute is a FloatAttribute.
func generateClassWeightVectorFromFixed(X base.FixedDataGrid) []float64 {
	classAttrs := X.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Wrong number of class Attributes")
	}
	if _, ok := classAttrs[0].(*base.FloatAttribute); ok {
		ret := make([]float64, 2)
		for i := range ret {
			ret[i] = 1.0
		}
		return ret
	}
	panic("Must be a FloatAttribute")
}
// processData converts a FixedDataGrid into the package-internal
// instances representation, one instance per row.
func processData(x base.FixedDataGrid) instances {
	_, rows := x.Size()
	result := make(instances, rows)

	// Retrieve numeric non-class Attributes
	numericAttrs := base.NonClassFloatAttributes(x)
	numericAttrSpecs := base.ResolveAttributes(x, numericAttrs)

	// Retrieve class Attributes
	classAttrs := x.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Only one class Attribute supported!")
	}

	// Check that the class Attribute is categorical
	// (with two values) or binary
	classAttr := classAttrs[0]
	if attr, ok := classAttr.(*base.CategoricalAttribute); ok {
		if len(attr.GetValues()) != 2 {
			panic("Too many values for Attribute!")
		}
	} else if _, ok := classAttr.(*base.BinaryAttribute); ok {
		// Binary class Attributes are fine as-is
	} else {
		panic("Wrong class Attribute type!")
	}

	// Convert each row
	x.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		// Allocate a new row
		probRow := make([]float64, len(numericAttrSpecs))

		// Read out the row
		for i := range numericAttrSpecs {
			probRow[i] = base.UnpackBytesToFloat(row[i])
		}

		// Get the class for the values
		class := base.GetClass(x, rowNo)
		result[rowNo] = instance{class, probRow}
		return true, nil
	})
	return result
}
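processData relies on the package-internal instance and instances types, which are not shown in this section. From the way they are used (a string class from base.GetClass plus a []float64 feature row, constructed positionally), they look roughly like the following hypothetical reconstruction; the real definitions live elsewhere in the package.

package main

import "fmt"

// Hypothetical reconstruction of the internal types used by processData.
type instance struct {
	class    string    // label returned by base.GetClass
	features []float64 // unpacked numeric attribute values
}

type instances []instance

func main() {
	rows := instances{{class: "yes", features: []float64{0.2, 0.8}}}
	fmt.Println(rows[0].class, rows[0].features)
}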
// generateAttributes maps each Attribute in the input grid to the
// Attribute used for training: non-class Attributes map to themselves,
// while the class Attribute is replaced by a FloatAttribute of the
// same name.
func (m *OneVsAllModel) generateAttributes(from base.FixedDataGrid) map[base.Attribute]base.Attribute {
	attrs := from.AllAttributes()
	classAttrs := from.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Only 1 class Attribute is supported!")
	}
	ret := make(map[base.Attribute]base.Attribute)
	for _, a := range attrs {
		ret[a] = a
		for _, b := range classAttrs {
			if a.Equals(b) {
				cur := base.NewFloatAttribute(b.GetName())
				ret[a] = cur
			}
		}
	}
	return ret
}
// Fit creates n filtered datasets (where n is the number of values
// a CategoricalAttribute can take) and uses them to train the
// underlying classifiers.
func (m *OneVsAllModel) Fit(using base.FixedDataGrid) {
	var classAttr *base.CategoricalAttribute
	// Do some validation
	classAttrs := using.AllClassAttributes()
	for _, a := range classAttrs {
		c, ok := a.(*base.CategoricalAttribute)
		if !ok {
			panic("Unsupported ClassAttribute type")
		}
		classAttr = c
	}
	attrs := m.generateAttributes(using)

	// Find the highest stored value
	val := uint64(0)
	classVals := classAttr.GetValues()
	for _, s := range classVals {
		cur := base.UnpackBytesToU64(classAttr.GetSysValFromString(s))
		if cur > val {
			val = cur
		}
	}
	if val == 0 {
		panic("Must have more than one class!")
	}
	m.maxClassVal = val

	// Create individual filtered instances for training
	filters := make([]*oneVsAllFilter, val+1)
	classifiers := make([]base.Classifier, val+1)
	for i := uint64(0); i <= val; i++ {
		f := &oneVsAllFilter{
			attrs,
			classAttr,
			i,
		}
		filters[i] = f
		classifiers[i] = m.NewClassifierFunction(classVals[int(i)])
		classifiers[i].Fit(base.NewLazilyFilteredInstances(using, f))
	}

	m.filters = filters
	m.classifiers = classifiers
}
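Each oneVsAllFilter effectively turns the k-class problem into a binary one: for classifier i, rows of class i become positives and every other row becomes a negative. A standalone sketch of that relabelling; binarise is an illustrative helper, not golearn's filter.

package main

import "fmt"

// binarise relabels a multiclass label vector for a one-vs-all
// classifier: 1.0 for the target class, 0.0 for everything else.
func binarise(labels []string, target string) []float64 {
	out := make([]float64, len(labels))
	for i, l := range labels {
		if l == target {
			out[i] = 1.0
		}
	}
	return out
}

func main() {
	labels := []string{"setosa", "versicolor", "setosa", "virginica"}
	// One binary problem per class, as in OneVsAllModel.Fit.
	for _, target := range []string{"setosa", "versicolor", "virginica"} {
		fmt.Println(target, binarise(labels, target))
	}
}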
// convertInstancesToLabelVec extracts the single FloatAttribute class
// column of X into a flat []float64 label vector.
func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 {
	// Get the class Attributes
	classAttrs := X.AllClassAttributes()
	// Only support 1 class Attribute
	if len(classAttrs) != 1 {
		panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs)))
	}
	// ClassAttribute must be numeric
	if _, ok := classAttrs[0].(*base.FloatAttribute); !ok {
		panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0]))
	}
	// Allocate return structure
	_, rows := X.Size()
	labelVec := make([]float64, rows)
	// Resolve class Attribute specification
	classAttrSpecs := base.ResolveAttributes(X, classAttrs)
	X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		labelVec[rowNo] = base.UnpackBytesToFloat(row[0])
		return true, nil
	})
	return labelVec
}
// getClassAttr returns the first class Attribute of the grid. It
// assumes at least one class Attribute is present and panics otherwise.
func getClassAttr(from base.FixedDataGrid) base.Attribute {
	allClassAttrs := from.AllClassAttributes()
	return allClassAttrs[0]
}
// Fit fills in everything the Bernoulli Naive Bayes model needs: the
// per-class instance counts (for the prior probabilities) and the
// conditional probabilities p(f_i|c).
func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) {
	// Check that all feature Attributes are binary
	classAttrs := X.AllClassAttributes()
	allAttrs := X.AllAttributes()
	featAttrs := base.AttributeDifference(allAttrs, classAttrs)
	for i := range featAttrs {
		if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok {
			panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i]))
		}
	}
	featAttrSpecs := base.ResolveAttributes(X, featAttrs)

	// Check that only one class Attribute is defined
	if len(classAttrs) != 1 {
		panic("Only one class Attribute can be used")
	}

	// Number of features and instances in this training set
	_, nb.trainingInstances = X.Size()
	nb.attrs = featAttrs
	nb.features = len(featAttrs)

	// Number of instances in each class
	nb.classInstances = make(map[string]int)

	// Number of documents with given term (by class)
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't have this function, an iterative
	// version is used.
	X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) {
		class := base.GetClass(X, r)

		// Increment the number of instances in this class
		nb.classInstances[class]++

		for feat := 0; feat < len(docVector); feat++ {
			v := docVector[feat]
			// In Bernoulli Naive Bayes the presence and absence of
			// features are considered. All non-zero values are
			// treated as presence.
			if v[0] > 0 {
				// Update the number of times this feature appeared
				// within the given label.
				t, ok := docsContainingTerm[class]
				if !ok {
					t = make([]int, nb.features)
					docsContainingTerm[class] = t
				}
				t[feat]++
			}
		}
		return true, nil
	})

	// Pre-calculate conditional probabilities for each class
	for c := range nb.classInstances {
		nb.condProb[c] = make([]float64, nb.features)
		classTerms := docsContainingTerm[c]
		docsInClass := nb.classInstances[c]
		for feat := 0; feat < nb.features; feat++ {
			numDocs := classTerms[feat]
			// Calculate conditional probability with Laplace smoothing
			nb.condProb[c][feat] = float64(numDocs+1) / float64(docsInClass+1)
		}
	}
}
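To make the smoothing step concrete: with the formula used above, a feature seen in 2 of 3 documents of a class gets probability (2+1)/(3+1) = 0.75, and a feature never seen in the class still gets (0+1)/(3+1) = 0.25 rather than zero, which keeps unseen features from zeroing out the whole product at prediction time. A tiny sketch; laplace is an illustrative helper mirroring the expression in Fit.

package main

import "fmt"

// laplace mirrors the smoothing used in Fit above:
// (docs containing the term + 1) / (docs in the class + 1).
func laplace(numDocs, docsInClass int) float64 {
	return float64(numDocs+1) / float64(docsInClass+1)
}

func main() {
	fmt.Println(laplace(2, 3)) // 0.75: seen in 2 of 3 class documents
	fmt.Println(laplace(0, 3)) // 0.25: unseen features keep non-zero mass
}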
// Fit estimates the regression coefficients via a QR decomposition,
// solving the least-squares problem R*beta = Q^T * y.
func (lr *LinearRegression) Fit(inst base.FixedDataGrid) error {
	// Retrieve row size
	_, rows := inst.Size()

	// Validate class Attribute count
	classAttrs := inst.AllClassAttributes()
	if len(classAttrs) != 1 {
		return fmt.Errorf("Only 1 class variable is permitted")
	}
	classAttrSpecs := base.ResolveAttributes(inst, classAttrs)

	// Retrieve relevant Attributes
	allAttrs := base.NonClassAttributes(inst)
	attrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if _, ok := a.(*base.FloatAttribute); ok {
			attrs = append(attrs, a)
		}
	}

	cols := len(attrs) + 1
	if rows < cols {
		return NotEnoughDataError
	}

	// Retrieve relevant Attribute specifications
	attrSpecs := base.ResolveAttributes(inst, attrs)

	// Split into two matrices: observed results (dependent variable y)
	// and the explanatory variables (X) - see
	// http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(rows, 1, nil)
	explVariables := mat64.NewDense(rows, cols, nil)

	// Build the observed matrix
	inst.MapOverRows(classAttrSpecs, func(row [][]byte, i int) (bool, error) {
		val := base.UnpackBytesToFloat(row[0])
		observed.Set(i, 0, val)
		return true, nil
	})

	// Build the explanatory variables
	inst.MapOverRows(attrSpecs, func(row [][]byte, i int) (bool, error) {
		// Set intercepts to 1.0
		explVariables.Set(i, 0, 1.0)
		for j, r := range row {
			explVariables.Set(i, j+1, base.UnpackBytesToFloat(r))
		}
		return true, nil
	})

	n := cols
	qr := new(mat64.QR)
	qr.Factorize(explVariables)
	var q, reg mat64.Dense
	q.QFromQR(qr)
	reg.RFromQR(qr)

	var transposed, qty mat64.Dense
	transposed.Clone(q.T())
	qty.Mul(&transposed, observed)

	// Back-substitution: solve the upper-triangular system R*beta = Q^T*y
	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true
	lr.attrs = attrs
	lr.cls = classAttrs[0]
	return nil
}
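The final loop is plain back-substitution: because R is upper triangular, the last coefficient falls out directly and each earlier one follows from the already-solved ones. A self-contained sketch on a small 2x2 system; backSubstitute is an illustrative helper, not part of the library.

package main

import "fmt"

// backSubstitute solves R*x = b for upper-triangular R, mirroring the
// coefficient loop in LinearRegression.Fit.
func backSubstitute(R [][]float64, b []float64) []float64 {
	n := len(b)
	x := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		x[i] = b[i]
		for j := i + 1; j < n; j++ {
			x[i] -= x[j] * R[i][j]
		}
		x[i] /= R[i][i]
	}
	return x
}

func main() {
	// R = [2 1; 0 3], b = [5 6]: x2 = 6/3 = 2, x1 = (5 - 1*2)/2 = 1.5
	R := [][]float64{{2, 1}, {0, 3}}
	b := []float64{5, 6}
	fmt.Println(backSubstitute(R, b)) // [1.5 2]
}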