// Kmeansbi bisects a given cluster and determines which centroids give the lowest error. // Take the points in a cluster // While the number of cluster < k // for every cluster // measure total error // cacl kmeansp with k=2 on a given cluster // measure total error after kmeansp split // choose the cluster split with the lowest SSE // commit the chosen split // // N.B. We are using SSE until the BIC is completed. func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) { numRows, numCols := datapoints.GetSize() clusterAssignment = matrix.Zeros(numRows, numCols) matCentroidlist = matrix.Zeros(k, numCols) centroid0 := datapoints.MeanCols() centroidlist := []*matrix.DenseMatrix{centroid0} // Initially create one cluster. for j := 0; j < numRows; j++ { point := datapoints.GetRowVector(j) distJ, err := measurer.CalcDist(centroid0, point) if err != nil { return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err)) } clusterAssignment.Set(j, 1, math.Pow(distJ, 2)) } var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix var bestCentroidToSplit int // Find the best centroid configuration. for len(centroidlist) < k { lowestSSE := math.Inf(1) // Split cluster for i, _ := range centroidlist { // Get the points in this cluster pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0) if err != nil { return matCentroidlist, clusterAssignment, err } centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer) if err != nil { return matCentroidlist, clusterAssignment, err } /* centroids is a 2X2 matrix of the best centroids found by kmeans splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids where col1 cotains the squared error between a centroid and a point. The rows here correspond to the rows in ptsInCurrCluster. For example, if row 2 contains [1, 7.999] this means that centroid 1 has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance between centroid and point) is 7.999. */ // Calculate the sum of squared errors for each centroid. // This give a statistcal measurement of how good // the clustering is for this cluster. sseSplit := splitClusterAssignment.SumCol(1) // Calculate the SSE for the original cluster sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0) if err != nil { return matCentroidlist, clusterAssignment, err } sseNotSplit := sqerr.SumCol(1) // TODO: Pre-BCI is this the best way to evaluate? if sseSplit+sseNotSplit < lowestSSE { bestCentroidToSplit = 1 bestNewCentroids = matrix.MakeDenseCopy(centroids) bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment) } } // Applying the split overwrites the existing cluster assginments for the // cluster you have decided to split. Kmeansp() returned two clusters // labeled 0 and 1. Change these cluster numbers to the cluster number // you are splitting and the next cluster to be added. m, err := bestClusterAssignment.FiltColMap(1, 1, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range m { bestClusterAssignment.Set(i, 0, float64(len(centroidlist))) } n, err := bestClusterAssignment.FiltColMap(0, 0, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range n { bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit)) } fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit) r, _ := bestClusterAssignment.GetSize() fmt.Printf("The length of best cluster assesment is %f\n", r) // Replace a centroid with the two best centroids from the split. centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0) centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1)) // Reassign new clusters and SSE rows, _ := clusterAssignment.GetSize() for i, j := 0, 0; i < rows; i++ { if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) { clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0)) clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1)) j++ } } // make centroidlist into a matrix s := make([][]float64, len(centroidlist)) for i, mat := range centroidlist { s[i][0] = mat.Get(0, 0) s[i][1] = mat.Get(0, 1) } matCentroidlist = matrix.MakeDenseMatrixStacked(s) } return matCentroidlist, clusterAssignment, nil }