// Kmeansp returns means and distance squared of the coordinates for each // centroid using parallel computation. // // Input values // // datapoints - a kX2 matrix of R^2 coordinates // // centroids - a kX2 matrix of R^2 coordinates for centroids. // // measurer - anythng that implements the matutil.VectorMeasurer interface to // calculate the distance between a centroid and datapoint. (e.g., Euclidian // distance) // // Return values // // centroidMean - a kX2 matrix where the row number corresponds to the same // row in the centroid matrix and the two columns are the means of the // coordinates for that cluster. i.e., the best centroids that could // be determined. // // ____ ______ // | 12.29 32.94 | <-- The mean of coordinates for centroid 0 // | 4.6 29.22 | <-- The mean of coordinates for centroid 1 // |_____ ______| // // // centroidSqErr - a kX2 matrix where the first column contains a number // indicating the centroid and the second column contains the minimum // distance between centroid and point squared. (i.e., the squared error) // // ____ _______ // | 0 38.01 | <-- Centroid 0, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ //func Kmeansp(datapoints, centroids *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (centroidMean, func Kmeansp(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (centroidMean, centroidSqErr *matrix.DenseMatrix, err error) { //k, _ := centroids.GetSize() fp, _ := os.Create("/var/tmp/km.log") w := io.Writer(fp) log.SetOutput(w) centroids := cc.ChooseCentroids(datapoints, k) numRows, numCols := datapoints.GetSize() centroidSqErr = matrix.Zeros(numRows, numCols) centroidMean = matrix.Zeros(k, numCols) jobs := make(chan PairPointCentroidJob, numworkers) results := make(chan PairPointCentroidResult, minimum(1024, numRows)) done := make(chan int, numworkers) go addPairPointCentroidJobs(jobs, datapoints, centroidSqErr, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) processPairPointToCentroidResults(centroidSqErr, results) // This blocks so that all the results can be processed // Now that you have each data point grouped with a centroid, iterate // through the centroidSqErr martix and for each centroid retrieve the // original coordinates from datapoints and place the results in // pointsInCuster. for c := 0; c < k; c++ { // c is the index that identifies the current centroid. // d is the index that identifies a row in centroidSqErr and datapoints. // Select all the rows in centroidSqErr whose first col value == c. // Get the corresponding row vector from datapoints and place it in pointsInCluster. matches, err := centroidSqErr.FiltColMap(float64(c), float64(c), 0) //rows with c in column 0. if err != nil { return centroidMean, centroidSqErr, nil } // It is possible that some centroids will not have any points, so there // may not be any matches in the first column of centroidSqErr. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), 2) for d, rownum := range matches { pointsInCluster.Set(d, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(d, 1, datapoints.Get(int(rownum), 1)) } // pointsInCluster now contains all the data points for the current // centroid. Take the mean of each of the 2 cols in pointsInCluster. means := pointsInCluster.MeanCols() centroidMean.Set(c, 0, means.Get(0, 0)) centroidMean.Set(c, 1, means.Get(0, 1)) } return }
// assessClusters assigns the results to the CentPointDist matrix. func assessClusters(CentPointDist *matrix.DenseMatrix, results <-chan PairPointCentroidResult) bool { change := false for result := range results { if CentPointDist.Get(result.rowNum, 0) != result.centroidRowNum { change = true } CentPointDist.Set(result.rowNum, 0, result.centroidRowNum) CentPointDist.Set(result.rowNum, 1, result.distSquared) } return change }
// CalcDist finds the ManhattanDistance which is the sum of the aboslute // difference of the coordinates. Also known as rectilinear distance, // city block distance, or taxicab distance. func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) (dist float64, err error) { dist = float64(0) err = nil arows, acols := a.GetSize() brows, bcols := b.GetSize() if arows != 1 || brows != 1 { return dist, errors.New(fmt.Sprintf("matutil: Matrices must contain only 1 row. a has %d and b has %d.", arows, brows)) } else if arows != brows { return dist, errors.New(fmt.Sprintf("matutil: Matrices must have the same dimensions. a=%dX%d b=%dX%d", arows, acols, brows, bcols)) } dist = math.Abs(a.Get(0, 0)-b.Get(0, 0)) + math.Abs(a.Get(0, 1)-b.Get(0, 1)) return }
// boundaries returns the max and min x and y values for a dense matrix // of shape m x m. func boundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) { rows, _ := mat.GetSize() xmin, ymin = mat.Get(0, 0), mat.Get(0, 1) xmax, ymax = mat.Get(0, 0), mat.Get(0, 1) for i := 1; i < rows; i++ { xi, yi := mat.Get(i, 0), mat.Get(i, 1) if xi > xmax { xmax = xi } else if xi < xmin { xmin = xi } if yi > ymax { ymax = yi } else if yi < ymin { ymin = yi } } return }
// GetBoundaries returns the max and min x and y values for a dense matrix // of shape m x 2. func GetBoundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) { rows, cols := mat.GetSize() if cols != 2 { // TODO - should there be an err return, or should we panic here? } xmin, ymin = mat.Get(0, 0), mat.Get(0, 1) xmax, ymax = mat.Get(0, 0), mat.Get(0, 1) for i := 1; i < rows; i++ { xi, yi := mat.Get(i, 0), mat.Get(i, 1) if xi > xmax { xmax = xi } else if xi < xmin { xmin = xi } if yi > ymax { ymax = yi } else if yi < ymin { ymin = yi } } return }
// kmeans partitions datapoints into K clusters. This results in a partitioning of // the data space into Voronoi cells. The problem is NP-hard so here we attempt // to parallelize or make concurrent as many processes as possible to reduce the // running time. // // 1. Place K points into the space represented by the objects that are being clustered. // These points represent initial group centroids. // // 2. Assign each object to the group that has the closest centroid. // // 3. When all objects have been assigned, recalculate the positions of the K centroids // by calculating the mean of all cooridnates in a cluster and making that // the new centroid. // // 4. Repeat Steps 2 and 3 until the centroids no longer move. // // centroids is K x M matrix that cotains the coordinates for the centroids. // The centroids are indexed by the 0 based rows of this matrix. // ____ _________ // | 12.29 32.94 ... | <-- The coordinates for centroid 0 // | 4.6 29.22 ... | <-- The coordinates for centroid 1 // |_____ __________| // // // CentPointDist is ax R x M matrix. The rows have a 1:1 relationship to // the rows in datapoints. Column 0 contains the row number in centroids // that corresponds to the centroid for the datapoint in row i of this matrix. // Column 1 contains (x_i - mu(i))^2. // ____ _______ // | 3 38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ // func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model { /* datapoints CentPoinDist centroids ________________ ____ ____ __|__ ______ | ____ ___________ | ... | | ... | V | ... | | 3.0 5.1| <-- row i --> | 3 32.12 | row 3 | 3 38.1, ... | |____ ___| |____ ______| |___ __________ | */ R, M := datapoints.GetSize() CentPointDist := matrix.Zeros(R, 2) k, _ := centroids.GetSize() clusterChanged := true var clusters []cluster for clusterChanged == true { clusterChanged = false clusters = make([]cluster, 0) jobs := make(chan PairPointCentroidJob, 1024) results := make(chan PairPointCentroidResult, 1024) done := make(chan int, 1024) // Pair each point with its closest centroid. go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed // You have each data point grouped with a centroid, for idx, cent := 0, 0; cent < k; cent++ { // Select all the rows in CentPointDist whose first col value == cent. // Get the corresponding row vector from datapoints and place it in pointsInCluster. r, _ := CentPointDist.GetSize() matches := make([]int, 0) for i := 0; i < r; i++ { v := CentPointDist.Get(i, 0) if v == float64(cent) { matches = append(matches, i) } } // It is possible that some centroids may have zero points, so there // may not be any matches. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), M) i := 0 for _, rownum := range matches { pointsInCluster.Set(i, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(i, 1, datapoints.Get(int(rownum), 1)) i++ } // pointsInCluster now contains all the data points for the current // centroid. The mean of the coordinates for this cluster becomes // the new centroid for this cluster. mean := pointsInCluster.MeanCols() centroids.SetRowVector(mean, cent) clust := cluster{pointsInCluster, mean, 0} clust.Variance = variance(clust, measurer) clusters = append(clusters, clust) idx++ } } modelbic := calcbic(R, M, clusters) model := Model{modelbic, clusters} return model }
// CalcDist finds the ManhattanDistance which is the sum of the aboslute // difference of the coordinates. Also known as rectilinear distance, // city block distance, or taxicab distance. func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) float64 { return math.Abs(a.Get(0, 0)-b.Get(0, 0)) + math.Abs(a.Get(0, 1)-b.Get(0, 1)) }
// Kmeansbi bisects a given cluster and determines which centroids give the lowest error. // Take the points in a cluster // While the number of cluster < k // for every cluster // measure total error // cacl kmeansp with k=2 on a given cluster // measure total error after kmeansp split // choose the cluster split with the lowest SSE // commit the chosen split // // N.B. We are using SSE until the BIC is completed. func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) { numRows, numCols := datapoints.GetSize() clusterAssignment = matrix.Zeros(numRows, numCols) matCentroidlist = matrix.Zeros(k, numCols) centroid0 := datapoints.MeanCols() centroidlist := []*matrix.DenseMatrix{centroid0} // Initially create one cluster. for j := 0; j < numRows; j++ { point := datapoints.GetRowVector(j) distJ, err := measurer.CalcDist(centroid0, point) if err != nil { return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err)) } clusterAssignment.Set(j, 1, math.Pow(distJ, 2)) } var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix var bestCentroidToSplit int // Find the best centroid configuration. for len(centroidlist) < k { lowestSSE := math.Inf(1) // Split cluster for i, _ := range centroidlist { // Get the points in this cluster pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0) if err != nil { return matCentroidlist, clusterAssignment, err } centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer) if err != nil { return matCentroidlist, clusterAssignment, err } /* centroids is a 2X2 matrix of the best centroids found by kmeans splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids where col1 cotains the squared error between a centroid and a point. The rows here correspond to the rows in ptsInCurrCluster. For example, if row 2 contains [1, 7.999] this means that centroid 1 has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance between centroid and point) is 7.999. */ // Calculate the sum of squared errors for each centroid. // This give a statistcal measurement of how good // the clustering is for this cluster. sseSplit := splitClusterAssignment.SumCol(1) // Calculate the SSE for the original cluster sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0) if err != nil { return matCentroidlist, clusterAssignment, err } sseNotSplit := sqerr.SumCol(1) // TODO: Pre-BCI is this the best way to evaluate? if sseSplit+sseNotSplit < lowestSSE { bestCentroidToSplit = 1 bestNewCentroids = matrix.MakeDenseCopy(centroids) bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment) } } // Applying the split overwrites the existing cluster assginments for the // cluster you have decided to split. Kmeansp() returned two clusters // labeled 0 and 1. Change these cluster numbers to the cluster number // you are splitting and the next cluster to be added. m, err := bestClusterAssignment.FiltColMap(1, 1, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range m { bestClusterAssignment.Set(i, 0, float64(len(centroidlist))) } n, err := bestClusterAssignment.FiltColMap(0, 0, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range n { bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit)) } fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit) r, _ := bestClusterAssignment.GetSize() fmt.Printf("The length of best cluster assesment is %f\n", r) // Replace a centroid with the two best centroids from the split. centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0) centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1)) // Reassign new clusters and SSE rows, _ := clusterAssignment.GetSize() for i, j := 0, 0; i < rows; i++ { if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) { clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0)) clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1)) j++ } } // make centroidlist into a matrix s := make([][]float64, len(centroidlist)) for i, mat := range centroidlist { s[i][0] = mat.Get(0, 0) s[i][1] = mat.Get(0, 1) } matCentroidlist = matrix.MakeDenseMatrixStacked(s) } return matCentroidlist, clusterAssignment, nil }