// variance calculates the unbiased variance based on the number of data points // and centroids (i.e., parameters). In our case, numcentroids should always be 1 // since each data point has been paired with one centroid. // // The points matrix contains the coordinates of the data points. // The centroids matrix is 1Xn that contains the centroid cooordinates. // variance = // 1 / (numpoints - numcentroids) * sum for all points (x_i - mean_(i))^2 func variance(points, centroid *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (float64, error) { crows, _ := centroid.GetSize() if crows > 1 { return float64(0), errors.New(fmt.Sprintf("variance: expected centroid matrix with 1 row, received matrix with %d rows.", crows)) } prows, _ := points.GetSize() // Term 1 t1 := float64(1 / float64((prows - 1))) // Mean of distance between all points and the centroid. mean := modelMean(points, centroid) // Term 2 // Sum over all points (point_i - mean(i))^2 t2 := float64(0) for i := 0; i < prows; i++ { p := points.GetRowVector(i) dist, err := measurer.CalcDist(p, mean) if err != nil { return float64(-1), errors.New(fmt.Sprintf("variance: CalcDist returned: %v", err)) } t2 += math.Pow(dist, 2) } variance := t1 * t2 return variance, nil }
// addPairPointCentroidJobs adds a job to the jobs channel. func addPairPointCentroidJobs(jobs chan<- PairPointCentroidJob, datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer, results chan<- PairPointCentroidResult) { numRows, _ := datapoints.GetSize() for i := 0; i < numRows; i++ { point := datapoints.GetRowVector(i) jobs <- PairPointCentroidJob{point, centroids, results, i, measurer} } close(jobs) }
// modelMean calculates the mean between all points in a model and a centroid. func modelMean(points, centroid *matrix.DenseMatrix) *matrix.DenseMatrix { prows, pcols := points.GetSize() pdist := matrix.Zeros(prows, pcols) for i := 0; i < prows; i++ { diff := matrix.Difference(centroid, points.GetRowVector(i)) pdist.SetRowVector(diff, i) } return pdist.MeanCols() }
// DataCentroids picks k distinct points from the dataset. If k is > points in // the matrix then k is set to the number of points. func (c DataCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix { // first set up a map to keep track of which data points have already been chosen so we don't dupe rows, cols := mat.GetSize() centroids := matrix.Zeros(k, cols) if k > rows { k = rows } chosenIdxs := make(map[int]bool, k) for len(chosenIdxs) < k { index := rand.Intn(rows) chosenIdxs[index] = true } i := 0 for idx, _ := range chosenIdxs { centroids.SetRowVector(mat.GetRowVector(idx).Copy(), i) i += 1 } return centroids }
// Kmeansbi bisects a given cluster and determines which centroids give the lowest error. // Take the points in a cluster // While the number of cluster < k // for every cluster // measure total error // cacl kmeansp with k=2 on a given cluster // measure total error after kmeansp split // choose the cluster split with the lowest SSE // commit the chosen split // // N.B. We are using SSE until the BIC is completed. func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) { numRows, numCols := datapoints.GetSize() clusterAssignment = matrix.Zeros(numRows, numCols) matCentroidlist = matrix.Zeros(k, numCols) centroid0 := datapoints.MeanCols() centroidlist := []*matrix.DenseMatrix{centroid0} // Initially create one cluster. for j := 0; j < numRows; j++ { point := datapoints.GetRowVector(j) distJ, err := measurer.CalcDist(centroid0, point) if err != nil { return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err)) } clusterAssignment.Set(j, 1, math.Pow(distJ, 2)) } var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix var bestCentroidToSplit int // Find the best centroid configuration. for len(centroidlist) < k { lowestSSE := math.Inf(1) // Split cluster for i, _ := range centroidlist { // Get the points in this cluster pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0) if err != nil { return matCentroidlist, clusterAssignment, err } centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer) if err != nil { return matCentroidlist, clusterAssignment, err } /* centroids is a 2X2 matrix of the best centroids found by kmeans splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids where col1 cotains the squared error between a centroid and a point. The rows here correspond to the rows in ptsInCurrCluster. For example, if row 2 contains [1, 7.999] this means that centroid 1 has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance between centroid and point) is 7.999. */ // Calculate the sum of squared errors for each centroid. // This give a statistcal measurement of how good // the clustering is for this cluster. sseSplit := splitClusterAssignment.SumCol(1) // Calculate the SSE for the original cluster sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0) if err != nil { return matCentroidlist, clusterAssignment, err } sseNotSplit := sqerr.SumCol(1) // TODO: Pre-BCI is this the best way to evaluate? if sseSplit+sseNotSplit < lowestSSE { bestCentroidToSplit = 1 bestNewCentroids = matrix.MakeDenseCopy(centroids) bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment) } } // Applying the split overwrites the existing cluster assginments for the // cluster you have decided to split. Kmeansp() returned two clusters // labeled 0 and 1. Change these cluster numbers to the cluster number // you are splitting and the next cluster to be added. m, err := bestClusterAssignment.FiltColMap(1, 1, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range m { bestClusterAssignment.Set(i, 0, float64(len(centroidlist))) } n, err := bestClusterAssignment.FiltColMap(0, 0, 0) if err != nil { return matCentroidlist, clusterAssignment, err } for i, _ := range n { bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit)) } fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit) r, _ := bestClusterAssignment.GetSize() fmt.Printf("The length of best cluster assesment is %f\n", r) // Replace a centroid with the two best centroids from the split. centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0) centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1)) // Reassign new clusters and SSE rows, _ := clusterAssignment.GetSize() for i, j := 0, 0; i < rows; i++ { if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) { clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0)) clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1)) j++ } } // make centroidlist into a matrix s := make([][]float64, len(centroidlist)) for i, mat := range centroidlist { s[i][0] = mat.Get(0, 0) s[i][1] = mat.Get(0, 1) } matCentroidlist = matrix.MakeDenseMatrixStacked(s) } return matCentroidlist, clusterAssignment, nil }