示例#1
0
// variance returns the unbiased sample variance of points around the model
// mean.
//
// points holds one data point per row.  centroid must be a matrix with exactly
// one row containing the coordinates of the centroid the points were paired
// with, so the number of fitted parameters (centroids) is always 1.  The value
// computed is
//
//	1 / (numpoints - 1) * sum over all points of dist(point_i, mean)^2
//
// where dist is measurer.CalcDist and mean is the value returned by
// modelMean(points, centroid), which is defined elsewhere in the package.
//
// An error is returned when centroid does not have exactly one row, when fewer
// than two points are supplied (the 1/(numpoints-1) factor is undefined), or
// when measurer.CalcDist fails.  The float64 result is 0 whenever the error is
// non-nil.
func variance(points, centroid *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (float64, error) {
	crows, _ := centroid.GetSize()
	if crows != 1 {
		return 0, fmt.Errorf("variance: expected centroid matrix with 1 row, received matrix with %d rows.", crows)
	}

	prows, _ := points.GetSize()
	// With fewer than two points the denominator (prows - 1) is zero or
	// negative, which would yield +Inf, NaN or a negative variance.
	if prows < 2 {
		return 0, errors.New("variance: at least 2 points are required")
	}

	// Reference value every point is measured against.
	mean := modelMean(points, centroid)

	// Sum over all points of the squared distance to the mean.
	sumSq := float64(0)
	for i := 0; i < prows; i++ {
		dist, err := measurer.CalcDist(points.GetRowVector(i), mean)
		if err != nil {
			return 0, fmt.Errorf("variance: CalcDist returned: %v", err)
		}
		sumSq += math.Pow(dist, 2)
	}

	return sumSq / float64(prows-1), nil
}
示例#2
0
// Kmeansbi clusters datapoints into k clusters by repeated bisection.
//
// It starts with a single cluster whose centroid is the column mean of
// datapoints.  While there are fewer than k clusters it tries to split every
// existing cluster in two with Kmeansp and commits the split that gives the
// lowest total sum of squared errors (SSE):
//
//	while the number of clusters < k
//	    for every cluster
//	        run Kmeansp with k=2 on the points of that cluster
//	        total SSE = SSE of the split + SSE of all other clusters
//	    commit the split with the lowest total SSE
//
// N.B. SSE is the selection criterion until the BIC is completed.
//
// datapoints holds one point per row.  cc and measurer are handed to Kmeansp;
// measurer is also used for the initial single-cluster distances.
//
// On success matCentroidlist has one row per centroid and clusterAssignment
// has one row per data point, where column 0 is the index of the centroid the
// point is assigned to and column 1 is the squared distance to that centroid.
//
// An error is returned when k < 1, when a distance calculation or Kmeansp
// fails, or when no remaining cluster has at least two points to split.
func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) {
	if k < 1 {
		return nil, nil, errors.New("Kmeansbi: k must be at least 1")
	}

	numRows, numCols := datapoints.GetSize()
	// Column 0: cluster index, column 1: squared error.  Two columns are
	// needed regardless of the dimensionality of the data.
	clusterAssignment = matrix.Zeros(numRows, 2)
	matCentroidlist = matrix.Zeros(k, numCols)
	centroid0 := datapoints.MeanCols()
	centroidlist := []*matrix.DenseMatrix{centroid0}

	// Initially every point belongs to cluster 0 (column 0 is already zero).
	for j := 0; j < numRows; j++ {
		distJ, err := measurer.CalcDist(centroid0, datapoints.GetRowVector(j))
		if err != nil {
			return matCentroidlist, clusterAssignment, fmt.Errorf("Kmeansbi: CalcDist returned err=%v", err)
		}
		clusterAssignment.Set(j, 1, math.Pow(distJ, 2))
	}

	for len(centroidlist) < k {
		// Reset the best candidate for every round so that a stale split
		// from a previous round can never be committed twice.
		var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix
		bestCentroidToSplit := -1
		lowestSSE := math.Inf(1)
		totalSSE := clusterAssignment.SumCol(1)

		for i := range centroidlist {
			rows, clusterSSE := pointsInCluster(datapoints, clusterAssignment, i)
			// A cluster with fewer than two points cannot be bisected.
			if len(rows) < 2 {
				continue
			}
			pointsCurCluster := matrix.MakeDenseMatrixStacked(rows)

			// centroids has one row per new centroid.  splitClusterAssignment
			// has one row per row of pointsCurCluster: column 0 is 0 or 1 and
			// refers to a row of centroids, column 1 is the squared error
			// between that centroid and the point.
			centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer)
			if err != nil {
				return matCentroidlist, clusterAssignment, err
			}

			// SSE of the two new clusters plus the SSE of every cluster that
			// is left untouched by this candidate split.
			sseSplit := splitClusterAssignment.SumCol(1)
			sseNotSplit := totalSSE - clusterSSE

			// TODO: Pre-BCI is this the best way to evaluate?
			if candidate := sseSplit + sseNotSplit; candidate < lowestSSE {
				lowestSSE = candidate
				bestCentroidToSplit = i
				bestNewCentroids = matrix.MakeDenseCopy(centroids)
				bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment)
			}
		}

		if bestNewCentroids == nil {
			return matCentroidlist, clusterAssignment, fmt.Errorf("Kmeansbi: unable to split any of the %d clusters to reach k=%d", len(centroidlist), k)
		}

		fmt.Printf("Best centroid to split %d\n", bestCentroidToSplit)
		r, _ := bestClusterAssignment.GetSize()
		fmt.Printf("The length of best cluster assesment is %d\n", r)

		// Kmeansp labelled the two halves 0 and 1.  Half 0 keeps the index of
		// the cluster being split, half 1 becomes the next new cluster.  The
		// rows of bestClusterAssignment are in the same order as the rows of
		// the split cluster inside clusterAssignment.
		keepLabel := float64(bestCentroidToSplit)
		newLabel := float64(len(centroidlist))
		for row, j := 0, 0; row < numRows && j < r; row++ {
			if clusterAssignment.Get(row, 0) != keepLabel {
				continue
			}
			if bestClusterAssignment.Get(j, 0) == 1 {
				clusterAssignment.Set(row, 0, newLabel)
			}
			clusterAssignment.Set(row, 1, bestClusterAssignment.Get(j, 1))
			j++
		}

		// Replace the split centroid with the two centroids from the split.
		centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0)
		centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1))
	}

	// Stack the centroids (each a single-row matrix) into one matrix.
	stacked := make([][]float64, len(centroidlist))
	for i, c := range centroidlist {
		row := make([]float64, numCols)
		for col := range row {
			row[col] = c.Get(0, col)
		}
		stacked[i] = row
	}
	matCentroidlist = matrix.MakeDenseMatrixStacked(stacked)

	return matCentroidlist, clusterAssignment, nil
}

// pointsInCluster collects, in row order, the rows of datapoints whose entry in
// column 0 of clusterAssignment equals cluster.  It returns those rows together
// with the sum of their squared errors (column 1 of clusterAssignment).
func pointsInCluster(datapoints, clusterAssignment *matrix.DenseMatrix, cluster int) ([][]float64, float64) {
	numRows, numCols := datapoints.GetSize()
	label := float64(cluster)

	var rows [][]float64
	sse := float64(0)
	for r := 0; r < numRows; r++ {
		if clusterAssignment.Get(r, 0) != label {
			continue
		}
		row := make([]float64, numCols)
		for c := range row {
			row[c] = datapoints.Get(r, c)
		}
		rows = append(rows, row)
		sse += clusterAssignment.Get(r, 1)
	}
	return rows, sse
}