// kmeans partitions datapoints into K clusters. This results in a partitioning of // the data space into Voronoi cells. The problem is NP-hard so here we attempt // to parallelize or make concurrent as many processes as possible to reduce the // running time. // // 1. Place K points into the space represented by the objects that are being clustered. // These points represent initial group centroids. // // 2. Assign each object to the group that has the closest centroid. // // 3. When all objects have been assigned, recalculate the positions of the K centroids // by calculating the mean of all cooridnates in a cluster and making that // the new centroid. // // 4. Repeat Steps 2 and 3 until the centroids no longer move. // // centroids is K x M matrix that cotains the coordinates for the centroids. // The centroids are indexed by the 0 based rows of this matrix. // ____ _________ // | 12.29 32.94 ... | <-- The coordinates for centroid 0 // | 4.6 29.22 ... | <-- The coordinates for centroid 1 // |_____ __________| // // // CentPointDist is ax R x M matrix. The rows have a 1:1 relationship to // the rows in datapoints. Column 0 contains the row number in centroids // that corresponds to the centroid for the datapoint in row i of this matrix. // Column 1 contains (x_i - mu(i))^2. // ____ _______ // | 3 38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ // func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model { /* datapoints CentPoinDist centroids ________________ ____ ____ __|__ ______ | ____ ___________ | ... | | ... | V | ... | | 3.0 5.1| <-- row i --> | 3 32.12 | row 3 | 3 38.1, ... | |____ ___| |____ ______| |___ __________ | */ R, M := datapoints.GetSize() CentPointDist := matrix.Zeros(R, 2) k, _ := centroids.GetSize() clusterChanged := true var clusters []cluster for clusterChanged == true { clusterChanged = false clusters = make([]cluster, 0) jobs := make(chan PairPointCentroidJob, 1024) results := make(chan PairPointCentroidResult, 1024) done := make(chan int, 1024) // Pair each point with its closest centroid. go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed // You have each data point grouped with a centroid, for idx, cent := 0, 0; cent < k; cent++ { // Select all the rows in CentPointDist whose first col value == cent. // Get the corresponding row vector from datapoints and place it in pointsInCluster. r, _ := CentPointDist.GetSize() matches := make([]int, 0) for i := 0; i < r; i++ { v := CentPointDist.Get(i, 0) if v == float64(cent) { matches = append(matches, i) } } // It is possible that some centroids may have zero points, so there // may not be any matches. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), M) i := 0 for _, rownum := range matches { pointsInCluster.Set(i, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(i, 1, datapoints.Get(int(rownum), 1)) i++ } // pointsInCluster now contains all the data points for the current // centroid. The mean of the coordinates for this cluster becomes // the new centroid for this cluster. mean := pointsInCluster.MeanCols() centroids.SetRowVector(mean, cent) clust := cluster{pointsInCluster, mean, 0} clust.Variance = variance(clust, measurer) clusters = append(clusters, clust) idx++ } } modelbic := calcbic(R, M, clusters) model := Model{modelbic, clusters} return model }