func ChiMBuildFrequencyTable(attr base.Attribute, inst base.FixedDataGrid) []*FrequencyTableEntry { ret := make([]*FrequencyTableEntry, 0) attribute := attr.(*base.FloatAttribute) attrSpec, err := inst.GetAttribute(attr) if err != nil { panic(err) } attrSpecs := []base.AttributeSpec{attrSpec} err = inst.MapOverRows(attrSpecs, func(row [][]byte, rowNo int) (bool, error) { value := row[0] valueConv := attribute.GetFloatFromSysVal(value) class := base.GetClass(inst, rowNo) // Search the frequency table for the value found := false for _, entry := range ret { if entry.Value == valueConv { found = true entry.Frequency[class] += 1 } } if !found { newEntry := &FrequencyTableEntry{ valueConv, make(map[string]int), } newEntry.Frequency[class] = 1 ret = append(ret, newEntry) } return true, nil }) return ret }
// generateTrainingAttrs selects RandomFeatures number of base.Attributes from // the provided base.Instances. func (b *BaggedModel) generateTrainingAttrs(model int, from base.FixedDataGrid) []base.Attribute { ret := make([]base.Attribute, 0) attrs := base.NonClassAttributes(from) if b.RandomFeatures == 0 { ret = attrs } else { for { if len(ret) >= b.RandomFeatures { break } attrIndex := rand.Intn(len(attrs)) attr := attrs[attrIndex] matched := false for _, a := range ret { if a.Equals(attr) { matched = true break } } if !matched { ret = append(ret, attr) } } } for _, a := range from.AllClassAttributes() { ret = append(ret, a) } b.lock.Lock() b.selectedAttributes[model] = ret b.lock.Unlock() return ret }
func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid { // Only support 1 class Attribute classAttrs := X.AllClassAttributes() if len(classAttrs) != 1 { panic(fmt.Sprintf("%d Wrong number of classes", len(classAttrs))) } // Generate return structure ret := base.GeneratePredictionVector(X) classAttrSpecs := base.ResolveAttributes(ret, classAttrs) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(X) numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) // Allocate row storage row := make([]float64, len(numericAttrSpecs)) X.MapOverRows(numericAttrSpecs, func(rowBytes [][]byte, rowNo int) (bool, error) { for i, r := range rowBytes { row[i] = base.UnpackBytesToFloat(r) } val := Predict(lr.model, row) vals := base.PackFloatToBytes(val) ret.Set(classAttrSpecs[0], rowNo, vals) return true, nil }) return ret }
// GenerateSplitRule returns the best attribute out of those randomly chosen // which maximises Information Gain func (r *RandomTreeRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule { var consideredAttributes []base.Attribute // First step is to generate the random attributes that we'll consider allAttributes := base.AttributeDifferenceReferences(f.AllAttributes(), f.AllClassAttributes()) maximumAttribute := len(allAttributes) attrCounter := 0 for { if len(consideredAttributes) >= r.Attributes { break } selectedAttrIndex := rand.Intn(maximumAttribute) selectedAttribute := allAttributes[selectedAttrIndex] matched := false for _, a := range consideredAttributes { if a.Equals(selectedAttribute) { matched = true break } } if matched { continue } consideredAttributes = append(consideredAttributes, selectedAttribute) attrCounter++ } return r.internalRule.GetSplitRuleFromSelection(consideredAttributes, f) }
// GenerateSplitRule returns the non-class Attribute-based DecisionTreeRule // which maximises the information gain. // // IMPORTANT: passing a base.Instances with no Attributes other than the class // variable will panic() func (g *GiniCoefficientRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule { attrs := f.AllAttributes() classAttrs := f.AllClassAttributes() candidates := base.AttributeDifferenceReferences(attrs, classAttrs) return g.GetSplitRuleFromSelection(candidates, f) }
// GenerateSplitAttribute returns the non-class Attribute which maximises the // information gain. // // IMPORTANT: passing a base.Instances with no Attributes other than the class // variable will panic() func (r *InformationGainRuleGenerator) GenerateSplitAttribute(f base.FixedDataGrid) base.Attribute { attrs := f.AllAttributes() classAttrs := f.AllClassAttributes() candidates := base.AttributeDifferenceReferences(attrs, classAttrs) return r.GetSplitAttributeFromSelection(candidates, f) }
// Predict outputs a base.Instances containing predictions from this tree func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) (base.FixedDataGrid, error) { predictions := base.GeneratePredictionVector(what) classAttr := getClassAttr(predictions) classAttrSpec, err := predictions.GetAttribute(classAttr) if err != nil { panic(err) } predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes()) predAttrSpecs := base.ResolveAttributes(what, predAttrs) what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { cur := d for { if cur.Children == nil { predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class)) break } else { splitVal := cur.SplitRule.SplitVal at := cur.SplitRule.SplitAttr ats, err := what.GetAttribute(at) if err != nil { //predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class)) //break panic(err) } var classVar string if _, ok := ats.GetAttribute().(*base.FloatAttribute); ok { // If it's a numeric Attribute (e.g. FloatAttribute) check that // the value of the current node is greater than the old one classVal := base.UnpackBytesToFloat(what.Get(ats, rowNo)) if classVal > splitVal { classVar = "1" } else { classVar = "0" } } else { classVar = ats.GetAttribute().GetStringFromSysVal(what.Get(ats, rowNo)) } if next, ok := cur.Children[classVar]; ok { cur = next } else { // Suspicious of this var bestChild string for c := range cur.Children { bestChild = c if c > classVar { break } } cur = cur.Children[bestChild] } } } return true, nil }) return predictions, nil }
func (m *MultiLayerNet) convertToFloatInsts(X base.FixedDataGrid) base.FixedDataGrid { // Make sure everything's a FloatAttribute fFilt := filters.NewFloatConvertFilter() for _, a := range X.AllAttributes() { fFilt.AddAttribute(a) } fFilt.Train() insts := base.NewLazilyFilteredInstances(X, fFilt) return insts }
// Predict is just a wrapper for the PredictOne function. // // IMPORTANT: Predict panics if Fit was not called or if the // document vector and train matrix have a different number of columns. func (nb *BernoulliNBClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid { // Generate return vector ret := base.GeneratePredictionVector(what) // Get the features featAttrSpecs := base.ResolveAttributes(what, nb.attrs) what.MapOverRows(featAttrSpecs, func(row [][]byte, i int) (bool, error) { base.SetClass(ret, i, nb.PredictOne(row)) return true, nil }) return ret }
// NewChiMergeFilter creates a ChiMergeFilter with some helpful intialisations. func NewChiMergeFilter(d base.FixedDataGrid, significance float64) *ChiMergeFilter { _, rows := d.Size() return &ChiMergeFilter{ AbstractDiscretizeFilter{ make(map[base.Attribute]bool), false, d, }, make(map[base.Attribute][]*FrequencyTableEntry), significance, 2, rows, } }
func generateClassWeightVectorFromFixed(X base.FixedDataGrid) []float64 { classAttrs := X.AllClassAttributes() if len(classAttrs) != 1 { panic("Wrong number of class Attributes") } if _, ok := classAttrs[0].(*base.FloatAttribute); ok { ret := make([]float64, 2) for i := range ret { ret[i] = 1.0 } return ret } else { panic("Must be a FloatAttribute") } }
func (m *OneVsAllModel) generateAttributes(from base.FixedDataGrid) map[base.Attribute]base.Attribute { attrs := from.AllAttributes() classAttrs := from.AllClassAttributes() if len(classAttrs) != 1 { panic("Only 1 class Attribute is supported!") } ret := make(map[base.Attribute]base.Attribute) for _, a := range attrs { ret[a] = a for _, b := range classAttrs { if a.Equals(b) { cur := base.NewFloatAttribute(b.GetName()) ret[a] = cur } } } return ret }
// Fit creates n filtered datasets (where n is the number of values // a CategoricalAttribute can take) and uses them to train the // underlying classifiers. func (m *OneVsAllModel) Fit(using base.FixedDataGrid) { var classAttr *base.CategoricalAttribute // Do some validation classAttrs := using.AllClassAttributes() for _, a := range classAttrs { if c, ok := a.(*base.CategoricalAttribute); !ok { panic("Unsupported ClassAttribute type") } else { classAttr = c } } attrs := m.generateAttributes(using) // Find the highest stored value val := uint64(0) classVals := classAttr.GetValues() for _, s := range classVals { cur := base.UnpackBytesToU64(classAttr.GetSysValFromString(s)) if cur > val { val = cur } } if val == 0 { panic("Must have more than one class!") } m.maxClassVal = val // Create individual filtered instances for training filters := make([]*oneVsAllFilter, val+1) classifiers := make([]base.Classifier, val+1) for i := uint64(0); i <= val; i++ { f := &oneVsAllFilter{ attrs, classAttr, i, } filters[i] = f classifiers[i] = m.NewClassifierFunction(classVals[int(i)]) classifiers[i].Fit(base.NewLazilyFilteredInstances(using, f)) } m.filters = filters m.classifiers = classifiers }
func computePairwiseDistances(inst base.FixedDataGrid, attrs []base.Attribute, metric pairwise.PairwiseDistanceFunc) (*mat64.Dense, error) { // Compute pair-wise distances // First convert everything to floats mats, err := base.ConvertAllRowsToMat64(attrs, inst) if err != nil { return nil, err } // Next, do an n^2 computation of all pairwise distances _, rows := inst.Size() dist := mat64.NewDense(rows, rows, nil) for i := 0; i < rows; i++ { for j := i + 1; j < rows; j++ { d := metric.Distance(mats[i], mats[j]) dist.Set(i, j, d) dist.Set(j, i, d) } } return dist, nil }
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref') // and generate (`gen') Instances. func GetConfusionMatrix(ref base.FixedDataGrid, gen base.FixedDataGrid) (map[string]map[string]int, error) { _, refRows := ref.Size() _, genRows := gen.Size() if refRows != genRows { return nil, errors.New(fmt.Sprintf("Row count mismatch: ref has %d rows, gen has %d rows", refRows, genRows)) } ret := make(map[string]map[string]int) for i := 0; i < int(refRows); i++ { referenceClass := base.GetClass(ref, i) predictedClass := base.GetClass(gen, i) if _, ok := ret[referenceClass]; ok { ret[referenceClass][predictedClass] += 1 } else { ret[referenceClass] = make(map[string]int) ret[referenceClass][predictedClass] = 1 } } return ret, nil }
func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { // Allocate problem array _, rows := X.Size() problemVec := make([][]float64, rows) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(X) numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) // Convert each row X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { // Allocate a new row probRow := make([]float64, len(numericAttrSpecs)) // Read out the row for i, _ := range numericAttrSpecs { probRow[i] = base.UnpackBytesToFloat(row[i]) } // Add the row problemVec[rowNo] = probRow return true, nil }) return problemVec }
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref') // and generate (`gen') Instances. func GetConfusionMatrix(ref base.FixedDataGrid, gen base.FixedDataGrid) map[string]map[string]int { _, refRows := ref.Size() _, genRows := gen.Size() if refRows != genRows { panic("Row counts should match") } ret := make(map[string]map[string]int) for i := 0; i < int(refRows); i++ { referenceClass := base.GetClass(ref, i) predictedClass := base.GetClass(gen, i) if _, ok := ret[referenceClass]; ok { ret[referenceClass][predictedClass] += 1 } else { ret[referenceClass] = make(map[string]int) ret[referenceClass][predictedClass] = 1 } } return ret }
func (lr *LinearRegression) Predict(X base.FixedDataGrid) (base.FixedDataGrid, error) { if !lr.fitted { return nil, NoTrainingDataError } ret := base.GeneratePredictionVector(X) attrSpecs := base.ResolveAttributes(X, lr.attrs) clsSpec, err := ret.GetAttribute(lr.cls) if err != nil { return nil, err } X.MapOverRows(attrSpecs, func(row [][]byte, i int) (bool, error) { var prediction float64 = lr.disturbance for j, r := range row { prediction += base.UnpackBytesToFloat(r) * lr.regressionCoefficients[j] } ret.Set(clsSpec, i, base.PackFloatToBytes(prediction)) return true, nil }) return ret, nil }
func getNumericAttributeEntropy(f base.FixedDataGrid, attr *base.FloatAttribute) (float64, float64) { // Resolve Attribute attrSpec, err := f.GetAttribute(attr) if err != nil { panic(err) } // Build sortable vector _, rows := f.Size() refs := make([]numericSplitRef, rows) f.MapOverRows([]base.AttributeSpec{attrSpec}, func(val [][]byte, row int) (bool, error) { cls := base.GetClass(f, row) v := base.UnpackBytesToFloat(val[0]) refs[row] = numericSplitRef{v, cls} return true, nil }) // Sort sort.Sort(splitVec(refs)) generateCandidateSplitDistribution := func(val float64) map[string]map[string]int { presplit := make(map[string]int) postplit := make(map[string]int) for _, i := range refs { if i.val < val { presplit[i.class]++ } else { postplit[i.class]++ } } ret := make(map[string]map[string]int) ret["0"] = presplit ret["1"] = postplit return ret } minSplitEntropy := math.Inf(1) minSplitVal := math.Inf(1) // Consider each possible function for i := 0; i < len(refs)-1; i++ { val := refs[i].val + refs[i+1].val val /= 2 splitDist := generateCandidateSplitDistribution(val) splitEntropy := getSplitEntropy(splitDist) if splitEntropy < minSplitEntropy { minSplitEntropy = splitEntropy minSplitVal = val } } return minSplitEntropy, minSplitVal }
// Predict outputs a base.Instances containing predictions from this tree func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) base.FixedDataGrid { predictions := base.GeneratePredictionVector(what) classAttr := getClassAttr(predictions) classAttrSpec, err := predictions.GetAttribute(classAttr) if err != nil { panic(err) } predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes()) predAttrSpecs := base.ResolveAttributes(what, predAttrs) what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { cur := d for { if cur.Children == nil { predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class)) break } else { at := cur.SplitAttr ats, err := what.GetAttribute(at) if err != nil { predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class)) break } classVar := ats.GetAttribute().GetStringFromSysVal(what.Get(ats, rowNo)) if next, ok := cur.Children[classVar]; ok { cur = next } else { var bestChild string for c := range cur.Children { bestChild = c if c > classVar { break } } cur = cur.Children[bestChild] } } } return true, nil }) return predictions }
// GenerateCrossFoldValidationConfusionMatrices divides the data into a number of folds // then trains and evaluates the classifier on each fold, producing a new ConfusionMatrix. func GenerateCrossFoldValidationConfusionMatrices(data base.FixedDataGrid, cls base.Classifier, folds int) ([]ConfusionMatrix, error) { _, rows := data.Size() // Assign each row to a fold foldMap := make([]int, rows) inverseFoldMap := make(map[int][]int) for i := 0; i < rows; i++ { fold := rand.Intn(folds) foldMap[i] = fold if _, ok := inverseFoldMap[fold]; !ok { inverseFoldMap[fold] = make([]int, 0) } inverseFoldMap[fold] = append(inverseFoldMap[fold], i) } ret := make([]ConfusionMatrix, folds) // Create training/test views for each fold for i := 0; i < folds; i++ { // Fold i is for testing testData := base.NewInstancesViewFromVisible(data, inverseFoldMap[i], data.AllAttributes()) otherRows := make([]int, 0) for j := 0; j < folds; j++ { if i == j { continue } otherRows = append(otherRows, inverseFoldMap[j]...) } trainData := base.NewInstancesViewFromVisible(data, otherRows, data.AllAttributes()) // Train err := cls.Fit(trainData) if err != nil { return nil, err } // Predict pred, err := cls.Predict(testData) if err != nil { return nil, err } // Evaluate cf, err := GetConfusionMatrix(testData, pred) if err != nil { return nil, err } ret[i] = cf } return ret, nil }
func processData(x base.FixedDataGrid) instances { _, rows := x.Size() result := make(instances, rows) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(x) numericAttrSpecs := base.ResolveAttributes(x, numericAttrs) // Retrieve class Attributes classAttrs := x.AllClassAttributes() if len(classAttrs) != 1 { panic("Only one classAttribute supported!") } // Check that the class Attribute is categorical // (with two values) or binary classAttr := classAttrs[0] if attr, ok := classAttr.(*base.CategoricalAttribute); ok { if len(attr.GetValues()) != 2 { panic("To many values for Attribute!") } } else if _, ok := classAttr.(*base.BinaryAttribute); ok { } else { panic("Wrong class Attribute type!") } // Convert each row x.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { // Allocate a new row probRow := make([]float64, len(numericAttrSpecs)) // Read out the row for i, _ := range numericAttrSpecs { probRow[i] = base.UnpackBytesToFloat(row[i]) } // Get the class for the values class := base.GetClass(x, rowNo) instance := instance{class, probRow} result[rowNo] = instance return true, nil }) return result }
func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 { // Get the class Attributes classAttrs := X.AllClassAttributes() // Only support 1 class Attribute if len(classAttrs) != 1 { panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) } // ClassAttribute must be numeric if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) } // Allocate return structure _, rows := X.Size() labelVec := make([]float64, rows) // Resolve class Attribute specification classAttrSpecs := base.ResolveAttributes(X, classAttrs) X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { labelVec[rowNo] = base.UnpackBytesToFloat(row[0]) return true, nil }) return labelVec }
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm. func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid { // Check what distance function we are using var distanceFunc pairwise.PairwiseDistanceFunc switch KNN.DistanceFunc { case "euclidean": distanceFunc = pairwise.NewEuclidean() case "manhattan": distanceFunc = pairwise.NewManhattan() default: panic("unsupported distance function") } // Check Compatibility allAttrs := base.CheckCompatible(what, KNN.TrainingData) if allAttrs == nil { // Don't have the same Attributes return nil } // Remove the Attributes which aren't numeric allNumericAttrs := make([]base.Attribute, 0) for _, a := range allAttrs { if fAttr, ok := a.(*base.FloatAttribute); ok { allNumericAttrs = append(allNumericAttrs, fAttr) } } // Generate return vector ret := base.GeneratePredictionVector(what) // Resolve Attribute specifications for both whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs) trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs) // Reserve storage for most the most similar items distances := make(map[int]float64) // Reserve storage for voting map maxmap := make(map[string]int) // Reserve storage for row computations trainRowBuf := make([]float64, len(allNumericAttrs)) predRowBuf := make([]float64, len(allNumericAttrs)) // Iterate over all outer rows what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) { // Read the float values out for i, _ := range allNumericAttrs { predRowBuf[i] = base.UnpackBytesToFloat(predRow[i]) } predMat := utilities.FloatsToMatrix(predRowBuf) // Find the closest match in the training data KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) { // Read the float values out for i, _ := range allNumericAttrs { trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i]) } // Compute the distance trainMat := utilities.FloatsToMatrix(trainRowBuf) distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat) return true, nil }) sorted := utilities.SortIntMap(distances) values := sorted[:KNN.NearestNeighbours] // Reset maxMap for a := range maxmap { maxmap[a] = 0 } // Refresh maxMap for _, elem := range values { label := base.GetClass(KNN.TrainingData, elem) if _, ok := maxmap[label]; ok { maxmap[label]++ } else { maxmap[label] = 1 } } // Sort the maxMap var maxClass string maxVal := -1 for a := range maxmap { if maxmap[a] > maxVal { maxVal = maxmap[a] maxClass = a } } base.SetClass(ret, predRowNo, maxClass) return true, nil }) return ret }
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm. func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid { // Check what distance function we are using var distanceFunc pairwise.PairwiseDistanceFunc switch KNN.DistanceFunc { case "euclidean": distanceFunc = pairwise.NewEuclidean() case "manhattan": distanceFunc = pairwise.NewManhattan() default: panic("unsupported distance function") } // Check Compatibility allAttrs := base.CheckCompatible(what, KNN.TrainingData) if allAttrs == nil { // Don't have the same Attributes return nil } // Use optimised version if permitted if KNN.AllowOptimisations { if KNN.DistanceFunc == "euclidean" { if KNN.canUseOptimisations(what) { return KNN.optimisedEuclideanPredict(what.(*base.DenseInstances)) } } } fmt.Println("Optimisations are switched off") // Remove the Attributes which aren't numeric allNumericAttrs := make([]base.Attribute, 0) for _, a := range allAttrs { if fAttr, ok := a.(*base.FloatAttribute); ok { allNumericAttrs = append(allNumericAttrs, fAttr) } } // Generate return vector ret := base.GeneratePredictionVector(what) // Resolve Attribute specifications for both whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs) trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs) // Reserve storage for most the most similar items distances := make(map[int]float64) // Reserve storage for voting map maxmap := make(map[string]int) // Reserve storage for row computations trainRowBuf := make([]float64, len(allNumericAttrs)) predRowBuf := make([]float64, len(allNumericAttrs)) _, maxRow := what.Size() curRow := 0 // Iterate over all outer rows what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) { if (curRow%1) == 0 && curRow > 0 { fmt.Printf("KNN: %.2f %% done\n", float64(curRow)*100.0/float64(maxRow)) } curRow++ // Read the float values out for i, _ := range allNumericAttrs { predRowBuf[i] = base.UnpackBytesToFloat(predRow[i]) } predMat := utilities.FloatsToMatrix(predRowBuf) // Find the closest match in the training data KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) { // Read the float values out for i, _ := range allNumericAttrs { trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i]) } // Compute the distance trainMat := utilities.FloatsToMatrix(trainRowBuf) distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat) return true, nil }) sorted := utilities.SortIntMap(distances) values := sorted[:KNN.NearestNeighbours] maxClass := KNN.vote(maxmap, values) base.SetClass(ret, predRowNo, maxClass) return true, nil }) return ret }
// generateTrainingInstances generates RandomFeatures number of // attributes and returns a modified version of base.Instances // for training the model func (b *BaggedModel) generateTrainingInstances(model int, from base.FixedDataGrid) base.FixedDataGrid { _, rows := from.Size() insts := base.SampleWithReplacement(from, rows) selected := b.generateTrainingAttrs(model, from) return base.NewInstancesViewFromAttrs(insts, selected) }
// Fill data matrix with Bernoulli Naive Bayes model. All values // necessary for calculating prior probability and p(f_i) func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) { // Check that all Attributes are binary classAttrs := X.AllClassAttributes() allAttrs := X.AllAttributes() featAttrs := base.AttributeDifference(allAttrs, classAttrs) for i := range featAttrs { if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok { panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i])) } } featAttrSpecs := base.ResolveAttributes(X, featAttrs) // Check that only one classAttribute is defined if len(classAttrs) != 1 { panic("Only one class Attribute can be used") } // Number of features and instances in this training set _, nb.trainingInstances = X.Size() nb.attrs = featAttrs nb.features = len(featAttrs) // Number of instances in class nb.classInstances = make(map[string]int) // Number of documents with given term (by class) docsContainingTerm := make(map[string][]int) // This algorithm could be vectorized after binarizing the data // matrix. Since mat64 doesn't have this function, a iterative // version is used. X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) { class := base.GetClass(X, r) // increment number of instances in class t, ok := nb.classInstances[class] if !ok { t = 0 } nb.classInstances[class] = t + 1 for feat := 0; feat < len(docVector); feat++ { v := docVector[feat] // In Bernoulli Naive Bayes the presence and absence of // features are considered. All non-zero values are // treated as presence. if v[0] > 0 { // Update number of times this feature appeared within // given label. t, ok := docsContainingTerm[class] if !ok { t = make([]int, nb.features) docsContainingTerm[class] = t } t[feat] += 1 } } return true, nil }) // Pre-calculate conditional probabilities for each class for c, _ := range nb.classInstances { nb.condProb[c] = make([]float64, nb.features) for feat := 0; feat < nb.features; feat++ { classTerms, _ := docsContainingTerm[c] numDocs := classTerms[feat] docsInClass, _ := nb.classInstances[c] classCondProb, _ := nb.condProb[c] // Calculate conditional probability with laplace smoothing classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1) } } }
func getClassAttr(from base.FixedDataGrid) base.Attribute { allClassAttrs := from.AllClassAttributes() return allClassAttrs[0] }
// InferID3Tree builds a decision tree using a RuleGenerator // from a set of Instances (implements the ID3 algorithm) func InferID3Tree(from base.FixedDataGrid, with RuleGenerator) *DecisionTreeNode { // Count the number of classes at this node classes := base.GetClassDistribution(from) // If there's only one class, return a DecisionTreeLeaf with // the only class available if len(classes) == 1 { maxClass := "" for i := range classes { maxClass = i } ret := &DecisionTreeNode{ LeafNode, nil, classes, maxClass, getClassAttr(from), &DecisionTreeRule{nil, 0.0}, } return ret } // Only have the class attribute maxVal := 0 maxClass := "" for i := range classes { if classes[i] > maxVal { maxClass = i maxVal = classes[i] } } // If there are no more Attributes left to split on, // return a DecisionTreeLeaf with the majority class cols, _ := from.Size() if cols == 2 { ret := &DecisionTreeNode{ LeafNode, nil, classes, maxClass, getClassAttr(from), &DecisionTreeRule{nil, 0.0}, } return ret } // Generate a return structure ret := &DecisionTreeNode{ RuleNode, nil, classes, maxClass, getClassAttr(from), nil, } // Generate the splitting rule splitRule := with.GenerateSplitRule(from) if splitRule == nil { // Can't determine, just return what we have return ret } // Split the attributes based on this attribute's value var splitInstances map[string]base.FixedDataGrid if _, ok := splitRule.SplitAttr.(*base.FloatAttribute); ok { splitInstances = base.DecomposeOnNumericAttributeThreshold(from, splitRule.SplitAttr, splitRule.SplitVal) } else { splitInstances = base.DecomposeOnAttributeValues(from, splitRule.SplitAttr) } // Create new children from these attributes ret.Children = make(map[string]*DecisionTreeNode) for k := range splitInstances { newInstances := splitInstances[k] ret.Children[k] = InferID3Tree(newInstances, with) } ret.SplitRule = splitRule return ret }
func findBestSplit(partition base.FixedDataGrid) { var delta float64 delta = math.MinInt64 attrs := partition.AllAttributes() classAttrs := partition.AllClassAttributes() candidates := base.AttributeDifferenceReferences(attrs, classAttrs) fmt.Println(delta) fmt.Println(classAttrs) fmt.Println(reflect.TypeOf(partition)) fmt.Println(reflect.TypeOf(candidates)) for i, n := range attrs { fmt.Println(i) //fmt.Println(partition) fmt.Println(reflect.TypeOf(n)) attributeSpec, _ := partition.GetAttribute(n) fmt.Println(partition.GetAttribute(n)) _, rows := partition.Size() for j := 0; j < rows; j++ { data := partition.Get(attributeSpec, j) fmt.Println(base.UnpackBytesToFloat(data)) } } }