func findBestSplit(partition base.FixedDataGrid) { var delta float64 delta = math.MinInt64 attrs := partition.AllAttributes() classAttrs := partition.AllClassAttributes() candidates := base.AttributeDifferenceReferences(attrs, classAttrs) fmt.Println(delta) fmt.Println(classAttrs) fmt.Println(reflect.TypeOf(partition)) fmt.Println(reflect.TypeOf(candidates)) for i, n := range attrs { fmt.Println(i) //fmt.Println(partition) fmt.Println(reflect.TypeOf(n)) attributeSpec, _ := partition.GetAttribute(n) fmt.Println(partition.GetAttribute(n)) _, rows := partition.Size() for j := 0; j < rows; j++ { data := partition.Get(attributeSpec, j) fmt.Println(base.UnpackBytesToFloat(data)) } } }
func getNumericAttributeEntropy(f base.FixedDataGrid, attr *base.FloatAttribute) (float64, float64) { // Resolve Attribute attrSpec, err := f.GetAttribute(attr) if err != nil { panic(err) } // Build sortable vector _, rows := f.Size() refs := make([]numericSplitRef, rows) f.MapOverRows([]base.AttributeSpec{attrSpec}, func(val [][]byte, row int) (bool, error) { cls := base.GetClass(f, row) v := base.UnpackBytesToFloat(val[0]) refs[row] = numericSplitRef{v, cls} return true, nil }) // Sort sort.Sort(splitVec(refs)) generateCandidateSplitDistribution := func(val float64) map[string]map[string]int { presplit := make(map[string]int) postplit := make(map[string]int) for _, i := range refs { if i.val < val { presplit[i.class]++ } else { postplit[i.class]++ } } ret := make(map[string]map[string]int) ret["0"] = presplit ret["1"] = postplit return ret } minSplitEntropy := math.Inf(1) minSplitVal := math.Inf(1) // Consider each possible function for i := 0; i < len(refs)-1; i++ { val := refs[i].val + refs[i+1].val val /= 2 splitDist := generateCandidateSplitDistribution(val) splitEntropy := getSplitEntropy(splitDist) if splitEntropy < minSplitEntropy { minSplitEntropy = splitEntropy minSplitVal = val } } return minSplitEntropy, minSplitVal }
// NewChiMergeFilter creates a ChiMergeFilter with some helpful intialisations. func NewChiMergeFilter(d base.FixedDataGrid, significance float64) *ChiMergeFilter { _, rows := d.Size() return &ChiMergeFilter{ AbstractDiscretizeFilter{ make(map[base.Attribute]bool), false, d, }, make(map[base.Attribute][]*FrequencyTableEntry), significance, 2, rows, } }
// GenerateCrossFoldValidationConfusionMatrices divides the data into a number of folds // then trains and evaluates the classifier on each fold, producing a new ConfusionMatrix. func GenerateCrossFoldValidationConfusionMatrices(data base.FixedDataGrid, cls base.Classifier, folds int) ([]ConfusionMatrix, error) { _, rows := data.Size() // Assign each row to a fold foldMap := make([]int, rows) inverseFoldMap := make(map[int][]int) for i := 0; i < rows; i++ { fold := rand.Intn(folds) foldMap[i] = fold if _, ok := inverseFoldMap[fold]; !ok { inverseFoldMap[fold] = make([]int, 0) } inverseFoldMap[fold] = append(inverseFoldMap[fold], i) } ret := make([]ConfusionMatrix, folds) // Create training/test views for each fold for i := 0; i < folds; i++ { // Fold i is for testing testData := base.NewInstancesViewFromVisible(data, inverseFoldMap[i], data.AllAttributes()) otherRows := make([]int, 0) for j := 0; j < folds; j++ { if i == j { continue } otherRows = append(otherRows, inverseFoldMap[j]...) } trainData := base.NewInstancesViewFromVisible(data, otherRows, data.AllAttributes()) // Train err := cls.Fit(trainData) if err != nil { return nil, err } // Predict pred, err := cls.Predict(testData) if err != nil { return nil, err } // Evaluate cf, err := GetConfusionMatrix(testData, pred) if err != nil { return nil, err } ret[i] = cf } return ret, nil }
func processData(x base.FixedDataGrid) instances { _, rows := x.Size() result := make(instances, rows) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(x) numericAttrSpecs := base.ResolveAttributes(x, numericAttrs) // Retrieve class Attributes classAttrs := x.AllClassAttributes() if len(classAttrs) != 1 { panic("Only one classAttribute supported!") } // Check that the class Attribute is categorical // (with two values) or binary classAttr := classAttrs[0] if attr, ok := classAttr.(*base.CategoricalAttribute); ok { if len(attr.GetValues()) != 2 { panic("To many values for Attribute!") } } else if _, ok := classAttr.(*base.BinaryAttribute); ok { } else { panic("Wrong class Attribute type!") } // Convert each row x.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { // Allocate a new row probRow := make([]float64, len(numericAttrSpecs)) // Read out the row for i, _ := range numericAttrSpecs { probRow[i] = base.UnpackBytesToFloat(row[i]) } // Get the class for the values class := base.GetClass(x, rowNo) instance := instance{class, probRow} result[rowNo] = instance return true, nil }) return result }
func computePairwiseDistances(inst base.FixedDataGrid, attrs []base.Attribute, metric pairwise.PairwiseDistanceFunc) (*mat64.Dense, error) { // Compute pair-wise distances // First convert everything to floats mats, err := base.ConvertAllRowsToMat64(attrs, inst) if err != nil { return nil, err } // Next, do an n^2 computation of all pairwise distances _, rows := inst.Size() dist := mat64.NewDense(rows, rows, nil) for i := 0; i < rows; i++ { for j := i + 1; j < rows; j++ { d := metric.Distance(mats[i], mats[j]) dist.Set(i, j, d) dist.Set(j, i, d) } } return dist, nil }
func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 { // Get the class Attributes classAttrs := X.AllClassAttributes() // Only support 1 class Attribute if len(classAttrs) != 1 { panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs))) } // ClassAttribute must be numeric if _, ok := classAttrs[0].(*base.FloatAttribute); !ok { panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0])) } // Allocate return structure _, rows := X.Size() labelVec := make([]float64, rows) // Resolve class Attribute specification classAttrSpecs := base.ResolveAttributes(X, classAttrs) X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { labelVec[rowNo] = base.UnpackBytesToFloat(row[0]) return true, nil }) return labelVec }
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref') // and generate (`gen') Instances. func GetConfusionMatrix(ref base.FixedDataGrid, gen base.FixedDataGrid) (map[string]map[string]int, error) { _, refRows := ref.Size() _, genRows := gen.Size() if refRows != genRows { return nil, errors.New(fmt.Sprintf("Row count mismatch: ref has %d rows, gen has %d rows", refRows, genRows)) } ret := make(map[string]map[string]int) for i := 0; i < int(refRows); i++ { referenceClass := base.GetClass(ref, i) predictedClass := base.GetClass(gen, i) if _, ok := ret[referenceClass]; ok { ret[referenceClass][predictedClass] += 1 } else { ret[referenceClass] = make(map[string]int) ret[referenceClass][predictedClass] = 1 } } return ret, nil }
func convertInstancesToProblemVec(X base.FixedDataGrid) [][]float64 { // Allocate problem array _, rows := X.Size() problemVec := make([][]float64, rows) // Retrieve numeric non-class Attributes numericAttrs := base.NonClassFloatAttributes(X) numericAttrSpecs := base.ResolveAttributes(X, numericAttrs) // Convert each row X.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) { // Allocate a new row probRow := make([]float64, len(numericAttrSpecs)) // Read out the row for i, _ := range numericAttrSpecs { probRow[i] = base.UnpackBytesToFloat(row[i]) } // Add the row problemVec[rowNo] = probRow return true, nil }) return problemVec }
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref') // and generate (`gen') Instances. func GetConfusionMatrix(ref base.FixedDataGrid, gen base.FixedDataGrid) map[string]map[string]int { _, refRows := ref.Size() _, genRows := gen.Size() if refRows != genRows { panic("Row counts should match") } ret := make(map[string]map[string]int) for i := 0; i < int(refRows); i++ { referenceClass := base.GetClass(ref, i) predictedClass := base.GetClass(gen, i) if _, ok := ret[referenceClass]; ok { ret[referenceClass][predictedClass] += 1 } else { ret[referenceClass] = make(map[string]int) ret[referenceClass][predictedClass] = 1 } } return ret }
// InferID3Tree builds a decision tree using a RuleGenerator // from a set of Instances (implements the ID3 algorithm) func InferID3Tree(from base.FixedDataGrid, with RuleGenerator) *DecisionTreeNode { // Count the number of classes at this node classes := base.GetClassDistribution(from) // If there's only one class, return a DecisionTreeLeaf with // the only class available if len(classes) == 1 { maxClass := "" for i := range classes { maxClass = i } ret := &DecisionTreeNode{ LeafNode, nil, classes, maxClass, getClassAttr(from), &DecisionTreeRule{nil, 0.0}, } return ret } // Only have the class attribute maxVal := 0 maxClass := "" for i := range classes { if classes[i] > maxVal { maxClass = i maxVal = classes[i] } } // If there are no more Attributes left to split on, // return a DecisionTreeLeaf with the majority class cols, _ := from.Size() if cols == 2 { ret := &DecisionTreeNode{ LeafNode, nil, classes, maxClass, getClassAttr(from), &DecisionTreeRule{nil, 0.0}, } return ret } // Generate a return structure ret := &DecisionTreeNode{ RuleNode, nil, classes, maxClass, getClassAttr(from), nil, } // Generate the splitting rule splitRule := with.GenerateSplitRule(from) if splitRule == nil { // Can't determine, just return what we have return ret } // Split the attributes based on this attribute's value var splitInstances map[string]base.FixedDataGrid if _, ok := splitRule.SplitAttr.(*base.FloatAttribute); ok { splitInstances = base.DecomposeOnNumericAttributeThreshold(from, splitRule.SplitAttr, splitRule.SplitVal) } else { splitInstances = base.DecomposeOnAttributeValues(from, splitRule.SplitAttr) } // Create new children from these attributes ret.Children = make(map[string]*DecisionTreeNode) for k := range splitInstances { newInstances := splitInstances[k] ret.Children[k] = InferID3Tree(newInstances, with) } ret.SplitRule = splitRule return ret }
// Fill data matrix with Bernoulli Naive Bayes model. All values // necessary for calculating prior probability and p(f_i) func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) { // Check that all Attributes are binary classAttrs := X.AllClassAttributes() allAttrs := X.AllAttributes() featAttrs := base.AttributeDifference(allAttrs, classAttrs) for i := range featAttrs { if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok { panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i])) } } featAttrSpecs := base.ResolveAttributes(X, featAttrs) // Check that only one classAttribute is defined if len(classAttrs) != 1 { panic("Only one class Attribute can be used") } // Number of features and instances in this training set _, nb.trainingInstances = X.Size() nb.attrs = featAttrs nb.features = len(featAttrs) // Number of instances in class nb.classInstances = make(map[string]int) // Number of documents with given term (by class) docsContainingTerm := make(map[string][]int) // This algorithm could be vectorized after binarizing the data // matrix. Since mat64 doesn't have this function, a iterative // version is used. X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) { class := base.GetClass(X, r) // increment number of instances in class t, ok := nb.classInstances[class] if !ok { t = 0 } nb.classInstances[class] = t + 1 for feat := 0; feat < len(docVector); feat++ { v := docVector[feat] // In Bernoulli Naive Bayes the presence and absence of // features are considered. All non-zero values are // treated as presence. if v[0] > 0 { // Update number of times this feature appeared within // given label. 
t, ok := docsContainingTerm[class] if !ok { t = make([]int, nb.features) docsContainingTerm[class] = t } t[feat] += 1 } } return true, nil }) // Pre-calculate conditional probabilities for each class for c, _ := range nb.classInstances { nb.condProb[c] = make([]float64, nb.features) for feat := 0; feat < nb.features; feat++ { classTerms, _ := docsContainingTerm[c] numDocs := classTerms[feat] docsInClass, _ := nb.classInstances[c] classCondProb, _ := nb.condProb[c] // Calculate conditional probability with laplace smoothing classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1) } } }
// generateTrainingInstances generates RandomFeatures number of // attributes and returns a modified version of base.Instances // for training the model func (b *BaggedModel) generateTrainingInstances(model int, from base.FixedDataGrid) base.FixedDataGrid { _, rows := from.Size() insts := base.SampleWithReplacement(from, rows) selected := b.generateTrainingAttrs(model, from) return base.NewInstancesViewFromAttrs(insts, selected) }
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm. func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid { // Check what distance function we are using var distanceFunc pairwise.PairwiseDistanceFunc switch KNN.DistanceFunc { case "euclidean": distanceFunc = pairwise.NewEuclidean() case "manhattan": distanceFunc = pairwise.NewManhattan() default: panic("unsupported distance function") } // Check Compatibility allAttrs := base.CheckCompatible(what, KNN.TrainingData) if allAttrs == nil { // Don't have the same Attributes return nil } // Use optimised version if permitted if KNN.AllowOptimisations { if KNN.DistanceFunc == "euclidean" { if KNN.canUseOptimisations(what) { return KNN.optimisedEuclideanPredict(what.(*base.DenseInstances)) } } } fmt.Println("Optimisations are switched off") // Remove the Attributes which aren't numeric allNumericAttrs := make([]base.Attribute, 0) for _, a := range allAttrs { if fAttr, ok := a.(*base.FloatAttribute); ok { allNumericAttrs = append(allNumericAttrs, fAttr) } } // Generate return vector ret := base.GeneratePredictionVector(what) // Resolve Attribute specifications for both whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs) trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs) // Reserve storage for most the most similar items distances := make(map[int]float64) // Reserve storage for voting map maxmap := make(map[string]int) // Reserve storage for row computations trainRowBuf := make([]float64, len(allNumericAttrs)) predRowBuf := make([]float64, len(allNumericAttrs)) _, maxRow := what.Size() curRow := 0 // Iterate over all outer rows what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) { if (curRow%1) == 0 && curRow > 0 { fmt.Printf("KNN: %.2f %% done\n", float64(curRow)*100.0/float64(maxRow)) } curRow++ // Read the float values out for i, _ := range allNumericAttrs { predRowBuf[i] = 
base.UnpackBytesToFloat(predRow[i]) } predMat := utilities.FloatsToMatrix(predRowBuf) // Find the closest match in the training data KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) { // Read the float values out for i, _ := range allNumericAttrs { trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i]) } // Compute the distance trainMat := utilities.FloatsToMatrix(trainRowBuf) distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat) return true, nil }) sorted := utilities.SortIntMap(distances) values := sorted[:KNN.NearestNeighbours] maxClass := KNN.vote(maxmap, values) base.SetClass(ret, predRowNo, maxClass) return true, nil }) return ret }
// DBSCAN clusters inst using the parameters passed in and produces a
// ClusterId->[RowId] map.
//
// The pairwise distance matrix over params.Attributes is computed first
// with params.Metric. Rows are then scanned in index order: a row whose
// neighbourhood (as reported by regionQuery for params.Eps) has fewer than
// params.MinCount members is treated as noise, otherwise it starts a new
// cluster which is grown by expandCluster. Cluster ids start at 1. Clusters
// which end up with fewer than params.MinCount rows are removed before
// returning.
//
// An error is returned only if the pairwise distances cannot be computed.
func DBSCAN(inst base.FixedDataGrid, params DBSCANParameters) (ClusterMap, error) {

	// Compute the distances between each possible point
	dist, err := computePairwiseDistances(inst, params.Attributes, params.Metric)
	if err != nil {
		return nil, err
	}

	_, rows := inst.Size()

	// clusterMap collects row ids per cluster id. visited and clustered are
	// bitsets indexed by row number.
	clusterMap := make(map[int][]int)
	visited := big.NewInt(0)
	clustered := big.NewInt(0)

	// expandCluster adds P to a cluster C, visiting any neighbours.
	// neighbours is a bitset of candidate rows; it may grow while the
	// loop below is running.
	expandCluster := func(p int, neighbours *big.Int, c int) {
		if clustered.Bit(p) == 1 {
			panic("Shouldn't happen!")
		}
		// Add this point to cluster C
		if _, ok := clusterMap[c]; !ok {
			clusterMap[c] = make([]int, 0)
		}
		clusterMap[c] = append(clusterMap[c], p)
		clustered.SetBit(clustered, p, 1)
		visited.SetBit(visited, p, 1)

		for i := 0; i < rows; i++ {
			// reset records whether this iteration enlarged the
			// neighbour set, in which case the scan is restarted.
			reset := false
			if neighbours.Bit(i) == 0 {
				// Not a neighbour, so skip
				continue
			}
			if visited.Bit(i) == 0 {
				// not yet visited
				visited = visited.SetBit(visited, i, 1) // Mark as visited
				newNeighbours := big.NewInt(0)
				newNeighbours = regionQuery(i, newNeighbours, dist, params.Eps)
				// Only rows with a large enough neighbourhood of
				// their own contribute further candidates.
				if BitCount(newNeighbours) >= params.MinCount {
					neighbours = neighbours.Or(neighbours, newNeighbours)
					reset = true
				}
			} else {
				// Rows visited earlier (including rows previously
				// treated as noise) are not considered again.
				continue
			}

			if clustered.Bit(i) == 0 {
				clusterMap[c] = append(clusterMap[c], i)
				clustered = clustered.SetBit(clustered, i, 1)
			}

			if reset {
				// NOTE(review): the loop's post statement still runs
				// after this assignment, so scanning resumes at
				// index 1 rather than 0 — confirm this is intended.
				i = 0
			}
		}
	}

	// c is the id of the most recently created cluster.
	c := 0
	for i := 0; i < rows; i++ {
		if visited.Bit(i) == 1 {
			continue // Already visited here
		}
		visited.SetBit(visited, i, 1)
		neighbours := big.NewInt(0)
		neighbours = regionQuery(i, neighbours, dist, params.Eps)
		if BitCount(neighbours) < params.MinCount {
			// Noise, cluster 0
			// NOTE(review): this marks every row in the neighbour
			// set as clustered without adding it to clusterMap —
			// confirm against the intended algorithm.
			clustered = clustered.Or(clustered, neighbours)
			continue
		}
		c = c + 1 // Increment cluster count
		expandCluster(i, neighbours, c)
	}

	// Remove anything from the map which doesn't make
	// minimum points. Keys are collected first and deleted afterwards.
	rmKeys := make([]int, 0)
	for id := range clusterMap {
		if len(clusterMap[id]) < params.MinCount {
			rmKeys = append(rmKeys, id)
		}
	}
	for _, r := range rmKeys {
		delete(clusterMap, r)
	}

	return ClusterMap(clusterMap), nil
}
func (lr *LinearRegression) Fit(inst base.FixedDataGrid) error { // Retrieve row size _, rows := inst.Size() // Validate class Attribute count classAttrs := inst.AllClassAttributes() if len(classAttrs) != 1 { return fmt.Errorf("Only 1 class variable is permitted") } classAttrSpecs := base.ResolveAttributes(inst, classAttrs) // Retrieve relevant Attributes allAttrs := base.NonClassAttributes(inst) attrs := make([]base.Attribute, 0) for _, a := range allAttrs { if _, ok := a.(*base.FloatAttribute); ok { attrs = append(attrs, a) } } cols := len(attrs) + 1 if rows < cols { return NotEnoughDataError } // Retrieve relevant Attribute specifications attrSpecs := base.ResolveAttributes(inst, attrs) // Split into two matrices, observed results (dependent variable y) // and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression observed := mat64.NewDense(rows, 1, nil) explVariables := mat64.NewDense(rows, cols, nil) // Build the observed matrix inst.MapOverRows(classAttrSpecs, func(row [][]byte, i int) (bool, error) { val := base.UnpackBytesToFloat(row[0]) observed.Set(i, 0, val) return true, nil }) // Build the explainatory variables inst.MapOverRows(attrSpecs, func(row [][]byte, i int) (bool, error) { // Set intercepts to 1.0 explVariables.Set(i, 0, 1.0) for j, r := range row { explVariables.Set(i, j+1, base.UnpackBytesToFloat(r)) } return true, nil }) n := cols qr := new(mat64.QR) qr.Factorize(explVariables) var q, reg mat64.Dense q.QFromQR(qr) reg.RFromQR(qr) var transposed, qty mat64.Dense transposed.Clone(q.T()) qty.Mul(&transposed, observed) regressionCoefficients := make([]float64, n) for i := n - 1; i >= 0; i-- { regressionCoefficients[i] = qty.At(i, 0) for j := i + 1; j < n; j++ { regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j) } regressionCoefficients[i] /= reg.At(i, i) } lr.disturbance = regressionCoefficients[0] lr.regressionCoefficients = regressionCoefficients[1:] lr.fitted = true lr.attrs = attrs lr.cls = 
classAttrs[0] return nil }