func TestDoPairPointCentroidJobs(t *testing.T) { r := 4 c := 2 dataPoints := matrix.Zeros(r, c) // centroidSqDist := matrix.Zeros(r, c) centroids := matrix.Zeros(r, c) done := make(chan int) jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) var md ManhattanDist go addPairPointCentroidJobs(jobs, dataPoints, centroids, md, results) for i := 0; i < r; i++ { go doPairPointCentroidJobs(done, jobs) } j := 0 for ; j < r; j++ { <-done } if j != r { t.Errorf("doPairPointToCentroidJobs jobs processed=%d. Should be %d", j, r) } }
// Kmeansp returns means and distance squared of the coordinates for each // centroid using parallel computation. // // Input values // // datapoints - a kX2 matrix of R^2 coordinates // // centroids - a kX2 matrix of R^2 coordinates for centroids. // // measurer - anythng that implements the matutil.VectorMeasurer interface to // calculate the distance between a centroid and datapoint. (e.g., Euclidian // distance) // // Return values // // centroidMean - a kX2 matrix where the row number corresponds to the same // row in the centroid matrix and the two columns are the means of the // coordinates for that cluster. i.e., the best centroids that could // be determined. // // ____ ______ // | 12.29 32.94 | <-- The mean of coordinates for centroid 0 // | 4.6 29.22 | <-- The mean of coordinates for centroid 1 // |_____ ______| // // // centroidSqErr - a kX2 matrix where the first column contains a number // indicating the centroid and the second column contains the minimum // distance between centroid and point squared. (i.e., the squared error) // // ____ _______ // | 0 38.01 | <-- Centroid 0, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ //func Kmeansp(datapoints, centroids *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (centroidMean, func Kmeansp(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (centroidMean, centroidSqErr *matrix.DenseMatrix, err error) { //k, _ := centroids.GetSize() fp, _ := os.Create("/var/tmp/km.log") w := io.Writer(fp) log.SetOutput(w) centroids := cc.ChooseCentroids(datapoints, k) numRows, numCols := datapoints.GetSize() centroidSqErr = matrix.Zeros(numRows, numCols) centroidMean = matrix.Zeros(k, numCols) jobs := make(chan PairPointCentroidJob, numworkers) results := make(chan PairPointCentroidResult, minimum(1024, numRows)) done := make(chan int, numworkers) go addPairPointCentroidJobs(jobs, datapoints, centroidSqErr, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) processPairPointToCentroidResults(centroidSqErr, results) // This blocks so that all the results can be processed // Now that you have each data point grouped with a centroid, iterate // through the centroidSqErr martix and for each centroid retrieve the // original coordinates from datapoints and place the results in // pointsInCuster. for c := 0; c < k; c++ { // c is the index that identifies the current centroid. // d is the index that identifies a row in centroidSqErr and datapoints. // Select all the rows in centroidSqErr whose first col value == c. // Get the corresponding row vector from datapoints and place it in pointsInCluster. matches, err := centroidSqErr.FiltColMap(float64(c), float64(c), 0) //rows with c in column 0. if err != nil { return centroidMean, centroidSqErr, nil } // It is possible that some centroids will not have any points, so there // may not be any matches in the first column of centroidSqErr. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), 2) for d, rownum := range matches { pointsInCluster.Set(d, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(d, 1, datapoints.Get(int(rownum), 1)) } // pointsInCluster now contains all the data points for the current // centroid. Take the mean of each of the 2 cols in pointsInCluster. means := pointsInCluster.MeanCols() centroidMean.Set(c, 0, means.Get(0, 0)) centroidMean.Set(c, 1, means.Get(0, 1)) } return }
func TestAssessClusters(t *testing.T) { r, c := DATAPOINTS.GetSize() CentPointDist := matrix.Zeros(r, c) done := make(chan int) jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) var md ManhattanDist go addPairPointCentroidJobs(jobs, DATAPOINTS, CENTROIDS, md, results) for i := 0; i < r; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged := assessClusters(CentPointDist, results) if clusterChanged != true { t.Errorf("TestAssessClusters: clusterChanged=%b and should be true.", clusterChanged) } if CentPointDist.Get(9, 0) != 0 || CentPointDist.Get(10, 0) != 1 { t.Errorf("TestAssessClusters: rows 9 and 10 should have 0 and 1 in column 0, but received %v", CentPointDist) } }
func TestComputeCentroid(t *testing.T) { empty := matrix.Zeros(0, 0) _, err := ComputeCentroid(empty) if err == nil { t.Errorf("Did not raise error on empty matrix") } twoByTwo := matrix.Ones(2, 2) centr, err := ComputeCentroid(twoByTwo) if err != nil { t.Errorf("Could not compute centroid, err=%v", err) } expected := matrix.MakeDenseMatrix([]float64{1.0, 1.0}, 1, 2) if !matrix.Equals(centr, expected) { t.Errorf("Incorrect centroid: was %v, should have been %v", expected, centr) } twoByTwo.Set(0, 0, 3.0) expected.Set(0, 0, 2.0) centr, err = ComputeCentroid(twoByTwo) if err != nil { t.Errorf("Could not compute centroid, err=%v", err) } if !matrix.Equals(centr, expected) { t.Errorf("Incorrect centroid: was %v, should have been %v", expected, centr) } }
// chooseCentroids picks random centroids based on the min and max values in the matrix // and return a k by m matrix of the centroids. func (c randCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() centroids := matrix.Zeros(k, cols) for colnum := 0; colnum < cols; colnum++ { r := mat.ColSlice(colnum) minj := float64(0) // min value from column for _, val := range r { minj = math.Min(minj, val) } // max value from column maxj := float64(0) for _, val := range r { maxj = math.Max(maxj, val) } // create a slice of random centroids // based on maxj + minJ * random num to stay in range for h := 0; h < k; h++ { randInRange := ((maxj - minj) * rand.Float64()) + minj centroids.Set(h, colnum, randInRange) } } return centroids }
// modelMean calculates the mean between all points in a model and a centroid. func modelMean(points, centroid *matrix.DenseMatrix) *matrix.DenseMatrix { prows, pcols := points.GetSize() pdist := matrix.Zeros(prows, pcols) for i := 0; i < prows; i++ { diff := matrix.Difference(centroid, points.GetRowVector(i)) pdist.SetRowVector(diff, i) } return pdist.MeanCols() }
func TestAddPairPointToCentroidJob(t *testing.T) { r := 4 c := 2 jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) dataPoints := matrix.Zeros(r, c) // centroidSqDist := matrix.Zeros(r, c) centroids := matrix.Zeros(r, c) var ed EuclidDist go addPairPointCentroidJobs(jobs, dataPoints, centroids, ed, results) i := 0 for ; i < r; i++ { <-jobs } if i != r { t.Errorf("addPairPointToCentroidJobs number of jobs=%d. Should be %d", i, r) } }
func TestProcessPairPointToCentroidResults(t *testing.T) { r := 4 c := 2 dataPoints := matrix.Zeros(r, c) centroidSqDist := matrix.Zeros(r, c) centroids := matrix.Zeros(r, c) done := make(chan int) jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) var md matutil.ManhattanDist go addPairPointCentroidJobs(jobs, dataPoints, centroids, centroidSqDist, md, results) for i := 0; i < r; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) //TODO check deterministic results of centroidDistSq processPairPointToCentroidResults(centroidSqDist, results) }
// Needs comments func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() var xmin, xmax, ymin, ymax = matutil.GetBoundaries(mat) x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0 centroids := matrix.Zeros(k, cols) rx, ry := xmax-x0, ymax-y0 thetaInit := rand.Float64() * math.Pi for i := 0; i < k; i++ { centroids.Set(i, 0, rx*c.frac*math.Cos(thetaInit+float64(i)*math.Pi/float64(k))) centroids.Set(i, 1, ry*c.frac*math.Sin(thetaInit+float64(i)*math.Pi/float64(k))) } return centroids }
// EllipseCentroids lays out the initial centroids evenly along an elipse inscribed and centered within the boundaries of the dataset. // It is only defined for M=2 // * Frac: This must be a float between 0 and 1. It determines the scale of the inscribing ellipse relative to the dataset, // so Frac==1.0 produces an ellipse that spans the entire dataset, while Frac==0.5 produces an ellipse spanning half the dataset. func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { _, cols := mat.GetSize() // TODO Cache boundaries call for each matrix so that it is not called on each bisect var xmin, xmax, ymin, ymax = boundaries(mat) x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0 centroids := matrix.Zeros(k, cols) rx, ry := xmax-x0, ymax-y0 thetaInit := rand.Float64() * math.Pi for i := 0; i < k; i++ { centroids.Set(i, 0, rx*c.Frac*math.Cos(thetaInit+float64(i)*2.0*math.Pi/float64(k))) centroids.Set(i, 1, ry*c.Frac*math.Sin(thetaInit+float64(i)*2.0*math.Pi/float64(k))) } return centroids }
// DataCentroids picks k distinct points from the dataset. If k is > points in // the matrix then k is set to the number of points. func (c DataCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { // first set up a map to keep track of which data points have already been chosen so we don't dupe rows, cols := mat.GetSize() centroids := matrix.Zeros(k, cols) if k > rows { k = rows } chosenIdxs := make(map[int]bool, k) for len(chosenIdxs) < k { index := rand.Intn(rows) chosenIdxs[index] = true } i := 0 for idx, _ := range chosenIdxs { centroids.SetRowVector(mat.GetRowVector(idx).Copy(), i) i += 1 } return centroids }
func makeCentPointDist(datapoints, centroids *matrix.DenseMatrix) *matrix.DenseMatrix { r, c := datapoints.GetSize() CentPointDist := matrix.Zeros(r, c) done := make(chan int) jobs := make(chan PairPointCentroidJob, r) results := make(chan PairPointCentroidResult, minimum(1024, r)) var ed EuclidDist go addPairPointCentroidJobs(jobs, datapoints, centroids, ed, results) for i := 0; i < r; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged := assessClusters(CentPointDist, results) if clusterChanged == true || clusterChanged == false { } return CentPointDist }
// Load loads a tab delimited text file of floats into a matrix. func Load(fname, sep string) (*matrix.DenseMatrix, error) { z := matrix.Zeros(1, 1) fp, err := os.Open(fname) if err != nil { return z, err } defer fp.Close() data := make([]float64, 0) cols := 0 r := bufio.NewReader(fp) linenum := 0 eof := false for !eof { var line string var buf []byte buf, _, err := r.ReadLine() line = string(buf) if err == io.EOF { err = nil eof = true break } else if err != nil { return z, errors.New(fmt.Sprintf("goxmean.Load: reading linenum %d: %v", linenum, err)) } l1 := strings.TrimRight(line, "\n") l := strings.Split(l1, sep) // If each line does not have the same number of columns then error if linenum == 0 { cols = len(l) } if len(l) != cols { return z, errors.New(fmt.Sprintf("Load(): linenum %d has %d columns. It should have %d columns.", linenum, len(line), cols)) } if len(l) < 2 { return z, errors.New(fmt.Sprintf("Load(): linenum %d has only %d elements", linenum, len(line))) } linenum++ // Convert the strings to float64 and build up the slice t by appending. t := make([]float64, 0) for _, v := range l { v = strings.TrimSpace(v) f, err := strconv.ParseFloat(v, 64) if err != nil { return z, errors.New(fmt.Sprintf("goxmeans.Load: cannot convert value %s to float64.", v)) } t = append(t, f) } data = append(data, t...) } mat := matrix.MakeDenseMatrix(data, linenum, cols) //fmt.Println(time.Now())n // flag for debugging return mat, nil }
// kmeans partitions datapoints into K clusters. This results in a partitioning of // the data space into Voronoi cells. The problem is NP-hard so here we attempt // to parallelize or make concurrent as many processes as possible to reduce the // running time. // // 1. Place K points into the space represented by the objects that are being clustered. // These points represent initial group centroids. // // 2. Assign each object to the group that has the closest centroid. // // 3. When all objects have been assigned, recalculate the positions of the K centroids // by calculating the mean of all cooridnates in a cluster and making that // the new centroid. // // 4. Repeat Steps 2 and 3 until the centroids no longer move. // // centroids is K x M matrix that cotains the coordinates for the centroids. // The centroids are indexed by the 0 based rows of this matrix. // ____ _________ // | 12.29 32.94 ... | <-- The coordinates for centroid 0 // | 4.6 29.22 ... | <-- The coordinates for centroid 1 // |_____ __________| // // // CentPointDist is ax R x M matrix. The rows have a 1:1 relationship to // the rows in datapoints. Column 0 contains the row number in centroids // that corresponds to the centroid for the datapoint in row i of this matrix. // Column 1 contains (x_i - mu(i))^2. // ____ _______ // | 3 38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints // | 1 23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints // | 0 14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints // _____ _______ // func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model { /* datapoints CentPoinDist centroids ________________ ____ ____ __|__ ______ | ____ ___________ | ... | | ... | V | ... | | 3.0 5.1| <-- row i --> | 3 32.12 | row 3 | 3 38.1, ... | |____ ___| |____ ______| |___ __________ | */ R, M := datapoints.GetSize() CentPointDist := matrix.Zeros(R, 2) k, _ := centroids.GetSize() clusterChanged := true var clusters []cluster for clusterChanged == true { clusterChanged = false clusters = make([]cluster, 0) jobs := make(chan PairPointCentroidJob, 1024) results := make(chan PairPointCentroidResult, 1024) done := make(chan int, 1024) // Pair each point with its closest centroid. go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results) for i := 0; i < numworkers; i++ { go doPairPointCentroidJobs(done, jobs) } go awaitPairPointCentroidCompletion(done, results) clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed // You have each data point grouped with a centroid, for idx, cent := 0, 0; cent < k; cent++ { // Select all the rows in CentPointDist whose first col value == cent. // Get the corresponding row vector from datapoints and place it in pointsInCluster. r, _ := CentPointDist.GetSize() matches := make([]int, 0) for i := 0; i < r; i++ { v := CentPointDist.Get(i, 0) if v == float64(cent) { matches = append(matches, i) } } // It is possible that some centroids may have zero points, so there // may not be any matches. if len(matches) == 0 { continue } pointsInCluster := matrix.Zeros(len(matches), M) i := 0 for _, rownum := range matches { pointsInCluster.Set(i, 0, datapoints.Get(int(rownum), 0)) pointsInCluster.Set(i, 1, datapoints.Get(int(rownum), 1)) i++ } // pointsInCluster now contains all the data points for the current // centroid. The mean of the coordinates for this cluster becomes // the new centroid for this cluster. mean := pointsInCluster.MeanCols() centroids.SetRowVector(mean, cent) clust := cluster{pointsInCluster, mean, 0} clust.Variance = variance(clust, measurer) clusters = append(clusters, clust) idx++ } } modelbic := calcbic(R, M, clusters) model := Model{modelbic, clusters} return model }
// Load loads a tab delimited text file of floats into a slice. // Assume last column is the target. // For now, we limit ourselves to two columns func Load(fname string) (*matrix.DenseMatrix, error) { datamatrix := matrix.Zeros(1, 1) data := make([]float64, 2048) idx := 0 fp, err := os.Open(fname) if err != nil { return datamatrix, err } defer fp.Close() r := bufio.NewReader(fp) linenum := 1 eof := false for !eof { var line string var buf []byte // line, err := r.ReadString('\n') buf, _, err := r.ReadLine() line = string(buf) // fmt.Printf("linenum=%d buf=%v line=%v\n",linenum,buf, line) if err == io.EOF { err = nil eof = true break } else if err != nil { return datamatrix, errors.New(fmt.Sprintf("means.Load: reading linenum %d: %v", linenum, err)) } linenum++ l1 := strings.TrimRight(line, "\n") l := strings.Split(l1, "\t") if len(l) < 2 { return datamatrix, errors.New(fmt.Sprintf("means.Load: linenum %d has only %d elements", linenum, len(line))) } // for now assume 2 dimensions only f0, err := Atof64(string(l[0])) if err != nil { return datamatrix, errors.New(fmt.Sprintf("means.Load: cannot convert f0 %s to float64.", l[0])) } f1, err := Atof64(string(l[1])) if err != nil { return datamatrix, errors.New(fmt.Sprintf("means.Load: cannot convert f1 %s to float64.", l[1])) } if linenum >= len(data) { data = append(data, f0, f1) } else { data[idx] = f0 idx++ data[idx] = f1 idx++ } } numcols := 2 datamatrix = matrix.MakeDenseMatrix(data, linenum-1, numcols) return datamatrix, nil }
// Kmeansbi bisects a given cluster and determines which centroids give the lowest error. // Take the points in a cluster // While the number of cluster < k // for every cluster // measure total error // cacl kmeansp with k=2 on a given cluster // measure total error after kmeansp split // choose the cluster split with the lowest SSE // commit the chosen split // // N.B. We are using SSE until the BIC is completed. func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) { numRows, numCols := datapoints.GetSize() clusterAssignment = matrix.Zeros(numRows, numCols) matCentroidlist = matrix.Zeros(k, numCols) centroid0 := datapoints.MeanCols() centroidlist := []*matrix.DenseMatrix{centroid0} // Initially create one cluster. for j := 0; j < numRows; j++ { point := datapoints.GetRowVector(j) distJ, err := measurer.CalcDist(centroid0, point) if err != nil { return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err)) } clusterAssignment.Set(j, 1, math.Pow(distJ, 2)) } var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix var bestCentroidToSplit int // Find the best centroid configuration. for len(centroidlist) < k { lowestSSE := math.Inf(1) // Split cluster for i, _ := range centroidlist { // Get the points in this cluster pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0) if err != nil { return matCentroidlist, clusterAssignment, err } centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer) if err != nil { return matCentroidlist, clusterAssignment, err } /* centroids is a 2X2 matrix of the best centroids found by kmeans splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids where col1 cotains the squared error between a centroid and a point. The rows here correspond to the rows in ptsInCurrCluster. For example, if row 2 contains [1, 7.999] this means that centroid 1 has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance between centroid and point) is 7.999. */ // Calculate the sum of squared errors for each centroid. // This give a statistcal measurement of how good // the clustering is for this cluster. sseSplit := splitClusterAssignment.SumCol(1) // Calculate the SSE for the original cluster sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0) if err != nil { return matCentroidlist, clusterAssignment, err } sseNotSplit := sqerr.SumCol(1) // TODO: Pre-BCI is this the best way to evaluate? if sseSplit+sseNotSplit < lowestSSE { bestCentroidToSplit = 1 bestNewCentroids = matrix.MakeDenseCopy(centroids) bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment) } } // Applying the split overwrites the existing cluster assginments for the // cluster you have decided to split. Kmeansp() returned two clusters // labeled 0 and 1. Change these cluster numbers to the cluster number // you are splitting and the next cluster to be added. m, err := bestClusterAssignment.FiltColMap(1, 1, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range m { bestClusterAssignment.Set(i, 0, float64(len(centroidlist))) } n, err := bestClusterAssignment.FiltColMap(0, 0, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range n { bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit)) } fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit) r, _ := bestClusterAssignment.GetSize() fmt.Printf("The length of best cluster assesment is %f\n", r) // Replace a centroid with the two best centroids from the split. centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0) centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1)) // Reassign new clusters and SSE rows, _ := clusterAssignment.GetSize() for i, j := 0, 0; i < rows; i++ { if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) { clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0)) clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1)) j++ } } // make centroidlist into a matrix s := make([][]float64, len(centroidlist)) for i, mat := range centroidlist { s[i][0] = mat.Get(0, 0) s[i][1] = mat.Get(0, 1) } matCentroidlist = matrix.MakeDenseMatrixStacked(s) } return matCentroidlist, clusterAssignment, nil }