// chooseCentroids picks random centroids based on the min and max values in the matrix // and return a k by m matrix of the centroids. func (c randCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() centroids := matrix.Zeros(k, cols) for colnum := 0; colnum < cols; colnum++ { r := mat.ColSlice(colnum) minj := float64(0) // min value from column for _, val := range r { minj = math.Min(minj, val) } // max value from column maxj := float64(0) for _, val := range r { maxj = math.Max(maxj, val) } // create a slice of random centroids // based on maxj + minJ * random num to stay in range for h := 0; h < k; h++ { randInRange := ((maxj - minj) * rand.Float64()) + minj centroids.Set(h, colnum, randInRange) } } return centroids }
// variance calculates the unbiased variance based on the number of data points // and centroids (i.e., parameters). In our case, numcentroids should always be 1 // since each data point has been paired with one centroid. // // The points matrix contains the coordinates of the data points. // The centroids matrix is 1Xn that contains the centroid cooordinates. // variance = // 1 / (numpoints - numcentroids) * sum for all points (x_i - mean_(i))^2 func variance(points, centroid *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (float64, error) { crows, _ := centroid.GetSize() if crows > 1 { return float64(0), errors.New(fmt.Sprintf("variance: expected centroid matrix with 1 row, received matrix with %d rows.", crows)) } prows, _ := points.GetSize() // Term 1 t1 := float64(1 / float64((prows - 1))) // Mean of distance between all points and the centroid. mean := modelMean(points, centroid) // Term 2 // Sum over all points (point_i - mean(i))^2 t2 := float64(0) for i := 0; i < prows; i++ { p := points.GetRowVector(i) dist, err := measurer.CalcDist(p, mean) if err != nil { return float64(-1), errors.New(fmt.Sprintf("variance: CalcDist returned: %v", err)) } t2 += math.Pow(dist, 2) } variance := t1 * t2 return variance, nil }
// Xmeans runs k-means for k lower bound to k upper bound on a data set. // Once the k centroids have converged each cluster is bisected and the BIC // of the orginal cluster (parent = a model with one centroid) to the // the bisected model which consists of two centroids and whichever is greater // is committed to the set of clusters for this larger model k. // func Xmeans(datapoints, centroids *matrix.DenseMatrix, kmax int, cc, bisectcc CentroidChooser, measurer VectorMeasurer) ([]Model, map[string]error) { logname := "/var/tmp/xmeans.log" fp, err := os.OpenFile(logname, os.O_RDWR|os.O_APPEND, 0666) if err != nil { if os.IsNotExist(err) { fp, err = os.Create(logname) if err != nil { fmt.Printf("Xmeans: cannot open %s for logging.\n", logname) } } } log.SetOutput(io.Writer(fp)) k, _ := centroids.GetSize() log.Printf("Start k=%d kmax=%d\n", k, kmax) R, M := datapoints.GetSize() errs := make(map[string]error) runtime.GOMAXPROCS(numworkers) models := make([]Model, 0) for k <= kmax { log.Printf("kmeans started k=%d\n", k) model, err := kmeans(datapoints, centroids, measurer) if err != nil { errs[strconv.Itoa(k)] = err } // Bisect the returned clusters log.Println("bisect started") bimodel := bisect(model.Clusters, R, M, bisectcc, measurer) numCentroids := len(bimodel.Clusters) log.Printf("bisect returned %d clusters\n", numCentroids) models = append(models, model) var cent *matrix.DenseMatrix if numCentroids <= kmax { for rowexists := true; rowexists == true; { cent = cc.ChooseCentroids(datapoints, 1) rowexists = centroids.RowExists(cent) } centroids, err = centroids.AppendRow(cent) if err != nil { log.Printf("AppendRow: %v\n", err) errs["ApppendRow"] = err break } k++ } else { k = numCentroids } } log.Println("Finished") return models, errs }
// ComputeCentroids Needs comments. func ComputeCentroid(mat *matrix.DenseMatrix) (*matrix.DenseMatrix, error) { rows, _ := mat.GetSize() vectorSum := mat.SumCols() if rows == 0 { return vectorSum, errors.New("No points inputted") } vectorSum.Scale(1.0 / float64(rows)) return vectorSum, nil }
// addPairPointCentroidJobs adds a job to the jobs channel. func addPairPointCentroidJobs(jobs chan<- PairPointCentroidJob, datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer, results chan<- PairPointCentroidResult) { numRows, _ := datapoints.GetSize() for i := 0; i < numRows; i++ { point := datapoints.GetRowVector(i) jobs <- PairPointCentroidJob{point, centroids, results, i, measurer} } close(jobs) }
// Kmeansp returns means and distance squared of the coordinates for each // centroid using parallel computation. // // Input values // // datapoints - a kX2 matrix of R^2 coordinates // // centroids - a kX2 matrix of R^2 coordinates for centroids. // // measurer - anythng that implements the matutil.VectorMeasurer interface to // calculate the distance between a centroid and datapoint. (e.g., Euclidian // distance) // // Return values // // centroidMean - a kX2 matrix where the row number corresponds to the same // row in the centroid matrix and the two columns are the means of the // coordinates for that cluster. i.e., the best centroids that could // be determined. // // ____ ______ // | 12.29 32.94 | <-- The mean of coordinates for centroid 0 // | 4.6 29.22 | <-- The mean of coordinates for centroid 1 // |_____ ______| // // // centroidSqErr - a kX2 matrix where the first column contains a number // indicating the centroid and the second column contains the minimum // distance between centroid and point squared. (i.e., the squared error) // // ____ _______ // | 0 38.01 | <-- Centroid 0, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ //func Kmeansp(datapoints, centroids *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (centroidMean, func Kmeansp(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (centroidMean, centroidSqErr *matrix.DenseMatrix, err error) { //k, _ := centroids.GetSize() fp, _ := os.Create("/var/tmp/km.log") w := io.Writer(fp) log.SetOutput(w) centroids := cc.ChooseCentroids(datapoints, k) numRows, numCols := datapoints.GetSize() centroidSqErr = matrix.Zeros(numRows, numCols) centroidMean = matrix.Zeros(k, numCols) jobs := make(chan PairPointCentroidJob, numworkers) results := make(chan PairPointCentroidResult, minimum(1024, numRows)) done := make(chan int, numworkers) go addPairPointCentroidJobs(jobs, datapoints, centroidSqErr, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) processPairPointToCentroidResults(centroidSqErr, results) // This blocks so that all the results can be processed // Now that you have each data point grouped with a centroid, iterate // through the centroidSqErr martix and for each centroid retrieve the // original coordinates from datapoints and place the results in // pointsInCuster. for c := 0; c < k; c++ { // c is the index that identifies the current centroid. // d is the index that identifies a row in centroidSqErr and datapoints. // Select all the rows in centroidSqErr whose first col value == c. // Get the corresponding row vector from datapoints and place it in pointsInCluster. matches, err := centroidSqErr.FiltColMap(float64(c), float64(c), 0) //rows with c in column 0. if err != nil { return centroidMean, centroidSqErr, nil } // It is possible that some centroids will not have any points, so there // may not be any matches in the first column of centroidSqErr. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), 2) for d, rownum := range matches { pointsInCluster.Set(d, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(d, 1, datapoints.Get(int(rownum), 1)) } // pointsInCluster now contains all the data points for the current // centroid. Take the mean of each of the 2 cols in pointsInCluster. means := pointsInCluster.MeanCols() centroidMean.Set(c, 0, means.Get(0, 0)) centroidMean.Set(c, 1, means.Get(0, 1)) } return }
// modelMean calculates the mean between all points in a model and a centroid. func modelMean(points, centroid *matrix.DenseMatrix) *matrix.DenseMatrix { prows, pcols := points.GetSize() pdist := matrix.Zeros(prows, pcols) for i := 0; i < prows; i++ { diff := matrix.Difference(centroid, points.GetRowVector(i)) pdist.SetRowVector(diff, i) } return pdist.MeanCols() }
// CalcDist finds the ManhattanDistance which is the sum of the aboslute // difference of the coordinates. Also known as rectilinear distance, // city block distance, or taxicab distance. func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) (dist float64, err error) { dist = float64(0) err = nil arows, acols := a.GetSize() brows, bcols := b.GetSize() if arows != 1 || brows != 1 { return dist, errors.New(fmt.Sprintf("matutil: Matrices must contain only 1 row. a has %d and b has %d.", arows, brows)) } else if arows != brows { return dist, errors.New(fmt.Sprintf("matutil: Matrices must have the same dimensions. a=%dX%d b=%dX%d", arows, acols, brows, bcols)) } dist = math.Abs(a.Get(0, 0)-b.Get(0, 0)) + math.Abs(a.Get(0, 1)-b.Get(0, 1)) return }
// Needs comments func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() var xmin, xmax, ymin, ymax = matutil.GetBoundaries(mat) x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0 centroids := matrix.Zeros(k, cols) rx, ry := xmax-x0, ymax-y0 thetaInit := rand.Float64() * math.Pi for i := 0; i < k; i++ { centroids.Set(i, 0, rx*c.frac*math.Cos(thetaInit+float64(i)*math.Pi/float64(k))) centroids.Set(i, 1, ry*c.frac*math.Sin(thetaInit+float64(i)*math.Pi/float64(k))) } return centroids }
// EllipseCentroids lays out the initial centroids evenly along an elipse inscribed and centered within the boundaries of the dataset. // It is only defined for M=2 // * Frac: This must be a float between 0 and 1. It determines the scale of the inscribing ellipse relative to the dataset, // so Frac==1.0 produces an ellipse that spans the entire dataset, while Frac==0.5 produces an ellipse spanning half the dataset. func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() // TODO Cache boundaries call for each matrix so that it is not called on each bisect var xmin, xmax, ymin, ymax = boundaries(mat) x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0 centroids := matrix.Zeros(k, cols) rx, ry := xmax-x0, ymax-y0 thetaInit := rand.Float64() * math.Pi for i := 0; i < k; i++ { centroids.Set(i, 0, rx*c.Frac*math.Cos(thetaInit+float64(i)*2.0*math.Pi/float64(k))) centroids.Set(i, 1, ry*c.Frac*math.Sin(thetaInit+float64(i)*2.0*math.Pi/float64(k))) } return centroids }
// DataCentroids picks k distinct points from the dataset. If k is > points in // the matrix then k is set to the number of points. func (c DataCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { // first set up a map to keep track of which data points have already been chosen so we don't dupe rows, cols := mat.GetSize() centroids := matrix.Zeros(k, cols) if k > rows { k = rows } chosenIdxs := make(map[int]bool, k) for len(chosenIdxs) < k { index := rand.Intn(rows) chosenIdxs[index] = true } i := 0 for idx, _ := range chosenIdxs { centroids.SetRowVector(mat.GetRowVector(idx).Copy(), i) i += 1 } return centroids }
// boundaries returns the max and min x and y values for a dense matrix // of shape m x m. func boundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) { rows, _ := mat.GetSize() xmin, ymin = mat.Get(0, 0), mat.Get(0, 1) xmax, ymax = mat.Get(0, 0), mat.Get(0, 1) for i := 1; i < rows; i++ { xi, yi := mat.Get(i, 0), mat.Get(i, 1) if xi > xmax { xmax = xi } else if xi < xmin { xmin = xi } if yi > ymax { ymax = yi } else if yi < ymin { ymin = yi } } return }
func makeCentPointDist(datapoints, centroids *matrix.DenseMatrix) *matrix.DenseMatrix { r, c := datapoints.GetSize() CentPointDist := matrix.Zeros(r, c) done := make(chan int) jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) var ed EuclidDist go addPairPointCentroidJobs(jobs, datapoints, centroids, ed, results) for i := 0; i < r; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged := assessClusters(CentPointDist, results) if clusterChanged == true || clusterChanged == false { } return CentPointDist }
// GetBoundaries returns the max and min x and y values for a dense matrix // of shape m x 2. func GetBoundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) { rows, cols := mat.GetSize() if cols != 2 { // TODO - should there be an err return, or should we panic here? } xmin, ymin = mat.Get(0, 0), mat.Get(0, 1) xmax, ymax = mat.Get(0, 0), mat.Get(0, 1) for i := 1; i < rows; i++ { xi, yi := mat.Get(i, 0), mat.Get(i, 1) if xi > xmax { xmax = xi } else if xi < xmin { xmin = xi } if yi > ymax { ymax = yi } else if yi < ymin { ymin = yi } } return }
// kmeans partitions datapoints into K clusters. This results in a partitioning of // the data space into Voronoi cells. The problem is NP-hard so here we attempt // to parallelize or make concurrent as many processes as possible to reduce the // running time. // // 1. Place K points into the space represented by the objects that are being clustered. // These points represent initial group centroids. // // 2. Assign each object to the group that has the closest centroid. // // 3. When all objects have been assigned, recalculate the positions of the K centroids // by calculating the mean of all cooridnates in a cluster and making that // the new centroid. // // 4. Repeat Steps 2 and 3 until the centroids no longer move. // // centroids is K x M matrix that cotains the coordinates for the centroids. // The centroids are indexed by the 0 based rows of this matrix. // ____ _________ // | 12.29 32.94 ... | <-- The coordinates for centroid 0 // | 4.6 29.22 ... | <-- The coordinates for centroid 1 // |_____ __________| // // // CentPointDist is ax R x M matrix. The rows have a 1:1 relationship to // the rows in datapoints. Column 0 contains the row number in centroids // that corresponds to the centroid for the datapoint in row i of this matrix. // Column 1 contains (x_i - mu(i))^2. // ____ _______ // | 3 38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ // func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model { /* datapoints CentPoinDist centroids ________________ ____ ____ __|__ ______ | ____ ___________ | ... | | ... | V | ... | | 3.0 5.1| <-- row i --> | 3 32.12 | row 3 | 3 38.1, ... | |____ ___| |____ ______| |___ __________ | */ R, M := datapoints.GetSize() CentPointDist := matrix.Zeros(R, 2) k, _ := centroids.GetSize() clusterChanged := true var clusters []cluster for clusterChanged == true { clusterChanged = false clusters = make([]cluster, 0) jobs := make(chan PairPointCentroidJob, 1024) results := make(chan PairPointCentroidResult, 1024) done := make(chan int, 1024) // Pair each point with its closest centroid. go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed // You have each data point grouped with a centroid, for idx, cent := 0, 0; cent < k; cent++ { // Select all the rows in CentPointDist whose first col value == cent. // Get the corresponding row vector from datapoints and place it in pointsInCluster. r, _ := CentPointDist.GetSize() matches := make([]int, 0) for i := 0; i < r; i++ { v := CentPointDist.Get(i, 0) if v == float64(cent) { matches = append(matches, i) } } // It is possible that some centroids may have zero points, so there // may not be any matches. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), M) i := 0 for _, rownum := range matches { pointsInCluster.Set(i, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(i, 1, datapoints.Get(int(rownum), 1)) i++ } // pointsInCluster now contains all the data points for the current // centroid. The mean of the coordinates for this cluster becomes // the new centroid for this cluster. mean := pointsInCluster.MeanCols() centroids.SetRowVector(mean, cent) clust := cluster{pointsInCluster, mean, 0} clust.Variance = variance(clust, measurer) clusters = append(clusters, clust) idx++ } } modelbic := calcbic(R, M, clusters) model := Model{modelbic, clusters} return model }
// Kmeansbi bisects a given cluster and determines which centroids give the lowest error. // Take the points in a cluster // While the number of cluster < k // for every cluster // measure total error // cacl kmeansp with k=2 on a given cluster // measure total error after kmeansp split // choose the cluster split with the lowest SSE // commit the chosen split // // N.B. We are using SSE until the BIC is completed. func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) { numRows, numCols := datapoints.GetSize() clusterAssignment = matrix.Zeros(numRows, numCols) matCentroidlist = matrix.Zeros(k, numCols) centroid0 := datapoints.MeanCols() centroidlist := []*matrix.DenseMatrix{centroid0} // Initially create one cluster. for j := 0; j < numRows; j++ { point := datapoints.GetRowVector(j) distJ, err := measurer.CalcDist(centroid0, point) if err != nil { return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err)) } clusterAssignment.Set(j, 1, math.Pow(distJ, 2)) } var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix var bestCentroidToSplit int // Find the best centroid configuration. for len(centroidlist) < k { lowestSSE := math.Inf(1) // Split cluster for i, _ := range centroidlist { // Get the points in this cluster pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0) if err != nil { return matCentroidlist, clusterAssignment, err } centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer) if err != nil { return matCentroidlist, clusterAssignment, err } /* centroids is a 2X2 matrix of the best centroids found by kmeans splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids where col1 cotains the squared error between a centroid and a point. The rows here correspond to the rows in ptsInCurrCluster. For example, if row 2 contains [1, 7.999] this means that centroid 1 has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance between centroid and point) is 7.999. */ // Calculate the sum of squared errors for each centroid. // This give a statistcal measurement of how good // the clustering is for this cluster. sseSplit := splitClusterAssignment.SumCol(1) // Calculate the SSE for the original cluster sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0) if err != nil { return matCentroidlist, clusterAssignment, err } sseNotSplit := sqerr.SumCol(1) // TODO: Pre-BCI is this the best way to evaluate? if sseSplit+sseNotSplit < lowestSSE { bestCentroidToSplit = 1 bestNewCentroids = matrix.MakeDenseCopy(centroids) bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment) } } // Applying the split overwrites the existing cluster assginments for the // cluster you have decided to split. Kmeansp() returned two clusters // labeled 0 and 1. Change these cluster numbers to the cluster number // you are splitting and the next cluster to be added. m, err := bestClusterAssignment.FiltColMap(1, 1, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range m { bestClusterAssignment.Set(i, 0, float64(len(centroidlist))) } n, err := bestClusterAssignment.FiltColMap(0, 0, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range n { bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit)) } fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit) r, _ := bestClusterAssignment.GetSize() fmt.Printf("The length of best cluster assesment is %f\n", r) // Replace a centroid with the two best centroids from the split. centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0) centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1)) // Reassign new clusters and SSE rows, _ := clusterAssignment.GetSize() for i, j := 0, 0; i < rows; i++ { if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) { clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0)) clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1)) j++ } } // make centroidlist into a matrix s := make([][]float64, len(centroidlist)) for i, mat := range centroidlist { s[i][0] = mat.Get(0, 0) s[i][1] = mat.Get(0, 1) } matCentroidlist = matrix.MakeDenseMatrixStacked(s) } return matCentroidlist, clusterAssignment, nil }
// Xmeans runs k-means for k lower bound to k upper bound on a data set. // Once the k centroids have converged each cluster is bisected and the BIC // of the orginal cluster (parent = a model with one centroid) to the // the bisected model which consists of two centroids and whichever is greater // is committed to the set of clusters for this larger model k. // func Xmeans(datapoints, centroids *matrix.DenseMatrix, k, kmax int, cc, bisectcc CentroidChooser, measurer VectorMeasurer) ([]Model, map[string]error) { var err error // Uncomment logging code as well as the import statement above if you want simple logging to the elapsed // time between major events. /* logname := "/var/tmp/xmeans.log" fp, err := os.OpenFile(logname, os.O_RDWR|os.O_APPEND, 0666) if err != nil { if os.IsNotExist(err) { fp, err = os.Create(logname) if err != nil { fmt.Printf("Xmeans: cannot open %s for logging.\n", logname) } } } log.SetOutput(io.Writer(fp)) */ if k > kmax { m := make([]Model, 0) e := map[string]error{ "k": errors.New(fmt.Sprintf("k must be <= kmax. Received k=%d and kmax=%d.", k, kmax)), } return m, e } // log.Printf("Start k=%d kmax=%d\n", k, kmax) R, M := datapoints.GetSize() errs := make(map[string]error) runtime.GOMAXPROCS(numworkers) models := make([]Model, 0) for k <= kmax { // log.Printf("kmeans started k=%d\n", k) model := kmeans(datapoints, centroids, measurer) // Bisect the returned clusters // log.Println("bisect started") bimodel := bisect(model.Clusters, R, M, bisectcc, measurer) numCentroids := len(bimodel.Clusters) // log.Printf("bisect returned %d clusters\n", numCentroids) models = append(models, model) var cent *matrix.DenseMatrix if numCentroids <= kmax { for rowexists := true; rowexists == true; { cent = cc.ChooseCentroids(datapoints, 1) rowexists = centroids.RowExists(cent) } centroids, err = centroids.AppendRow(cent) if err != nil { errs["ApppendRow"] = err break } k++ } else { k = numCentroids } } // log.Println("Finished") return models, errs }