Пример #1
0
// Kmeansp returns means and distance squared of the coordinates for each
// centroid using parallel computation.
//
// Input values
//
// datapoints - a kX2 matrix of R^2 coordinates
//
// centroids - a kX2 matrix of R^2 coordinates for centroids.
//
// measurer - anythng that implements the matutil.VectorMeasurer interface to
// calculate the distance between a centroid and datapoint. (e.g., Euclidian
// distance)
//
// Return values
//
// centroidMean - a kX2 matrix where the row number corresponds to the same
// row in the centroid matrix and the two columns are the means of the
// coordinates for that cluster.  i.e., the best centroids that could
// be determined.
//
//  ____      ______
//  | 12.29   32.94 | <-- The mean of coordinates for centroid 0
//  | 4.6     29.22 | <-- The mean of coordinates for centroid 1
//  |_____    ______|
//
//
// centroidSqErr - a kX2 matrix where the first column contains a number
// indicating the centroid and the second column contains the minimum
// distance between centroid and point squared.  (i.e., the squared error)
//
//  ____      _______
//  | 0        38.01 | <-- Centroid 0, squared error for the coordinates in row 0 of datapoints
//  | 1        23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints
//  | 0        14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints
//  _____     _______
//func Kmeansp(datapoints, centroids *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (centroidMean,
func Kmeansp(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (centroidMean,
	centroidSqErr *matrix.DenseMatrix, err error) {
	//k, _ := centroids.GetSize()
	fp, _ := os.Create("/var/tmp/km.log")
	w := io.Writer(fp)
	log.SetOutput(w)

	centroids := cc.ChooseCentroids(datapoints, k)
	numRows, numCols := datapoints.GetSize()
	centroidSqErr = matrix.Zeros(numRows, numCols)
	centroidMean = matrix.Zeros(k, numCols)

	jobs := make(chan PairPointCentroidJob, numworkers)
	results := make(chan PairPointCentroidResult, minimum(1024, numRows))
	done := make(chan int, numworkers)

	go addPairPointCentroidJobs(jobs, datapoints, centroidSqErr, centroids, measurer, results)
	for i := 0; i < numworkers; i++ {
		go doPairPointCentroidJobs(done, jobs)
	}
	go awaitPairPointCentroidCompletion(done, results)
	processPairPointToCentroidResults(centroidSqErr, results) // This blocks so that all the results can be processed

	// Now that you have each data point grouped with a centroid, iterate
	// through the  centroidSqErr martix and for each centroid retrieve the
	// original coordinates from datapoints and place the results in
	// pointsInCuster.
	for c := 0; c < k; c++ {
		// c is the index that identifies the current centroid.
		// d is the index that identifies a row in centroidSqErr and datapoints.
		// Select all the rows in centroidSqErr whose first col value == c.
		// Get the corresponding row vector from datapoints and place it in pointsInCluster.
		matches, err := centroidSqErr.FiltColMap(float64(c), float64(c), 0) //rows with c in column 0.
		if err != nil {
			return centroidMean, centroidSqErr, nil
		}
		// It is possible that some centroids will not have any points, so there
		// may not be any matches in the first column of centroidSqErr.
		if len(matches) == 0 {
			continue
		}

		pointsInCluster := matrix.Zeros(len(matches), 2)
		for d, rownum := range matches {
			pointsInCluster.Set(d, 0, datapoints.Get(int(rownum), 0))
			pointsInCluster.Set(d, 1, datapoints.Get(int(rownum), 1))
		}

		// pointsInCluster now contains all the data points for the current
		// centroid.  Take the mean of each of the 2 cols in pointsInCluster.
		means := pointsInCluster.MeanCols()
		centroidMean.Set(c, 0, means.Get(0, 0))
		centroidMean.Set(c, 1, means.Get(0, 1))
	}
	return
}
Пример #2
0
// assessClusters assigns the results to the CentPointDist matrix.
func assessClusters(CentPointDist *matrix.DenseMatrix, results <-chan PairPointCentroidResult) bool {
	change := false
	for result := range results {
		if CentPointDist.Get(result.rowNum, 0) != result.centroidRowNum {
			change = true
		}
		CentPointDist.Set(result.rowNum, 0, result.centroidRowNum)
		CentPointDist.Set(result.rowNum, 1, result.distSquared)
	}
	return change
}
Пример #3
0
// CalcDist finds the ManhattanDistance which is the sum of the aboslute
// difference of the coordinates.   Also known as rectilinear distance,
// city block distance, or taxicab distance.
func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) (dist float64, err error) {
	dist = float64(0)
	err = nil
	arows, acols := a.GetSize()
	brows, bcols := b.GetSize()

	if arows != 1 || brows != 1 {
		return dist, errors.New(fmt.Sprintf("matutil: Matrices must contain only 1 row.  a has %d and b has %d.", arows, brows))
	} else if arows != brows {
		return dist, errors.New(fmt.Sprintf("matutil: Matrices must have the same dimensions.  a=%dX%d b=%dX%d", arows, acols, brows, bcols))
	}
	dist = math.Abs(a.Get(0, 0)-b.Get(0, 0)) + math.Abs(a.Get(0, 1)-b.Get(0, 1))
	return
}
Пример #4
0
// boundaries returns the max and min x and y values for a dense matrix
// of shape m x m.
func boundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) {
	rows, _ := mat.GetSize()
	xmin, ymin = mat.Get(0, 0), mat.Get(0, 1)
	xmax, ymax = mat.Get(0, 0), mat.Get(0, 1)

	for i := 1; i < rows; i++ {
		xi, yi := mat.Get(i, 0), mat.Get(i, 1)

		if xi > xmax {
			xmax = xi
		} else if xi < xmin {
			xmin = xi
		}

		if yi > ymax {
			ymax = yi
		} else if yi < ymin {
			ymin = yi
		}
	}
	return
}
Пример #5
0
// GetBoundaries returns the max and min x and y values for a dense matrix
// of shape m x 2.
func GetBoundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) {
	rows, cols := mat.GetSize()
	if cols != 2 {
		// TODO - should there be an err return, or should we panic here?
	}
	xmin, ymin = mat.Get(0, 0), mat.Get(0, 1)
	xmax, ymax = mat.Get(0, 0), mat.Get(0, 1)
	for i := 1; i < rows; i++ {
		xi, yi := mat.Get(i, 0), mat.Get(i, 1)

		if xi > xmax {
			xmax = xi
		} else if xi < xmin {
			xmin = xi
		}

		if yi > ymax {
			ymax = yi
		} else if yi < ymin {
			ymin = yi
		}
	}
	return
}
Пример #6
0
// kmeans partitions datapoints into K clusters.  This results in a partitioning of
// the data space into Voronoi cells.  The problem is NP-hard so here we attempt
// to parallelize or make concurrent as many processes as possible to reduce the
// running time.
//
// 1. Place K points into the space represented by the objects that are being clustered.
// These points represent initial group centroids.
//
// 2. Assign each object to the group that has the closest centroid.
//
// 3. When all objects have been assigned, recalculate the positions of the K centroids
// by calculating the mean of all cooridnates in a cluster and making that
// the new centroid.
//
// 4. Repeat Steps 2 and 3 until the centroids no longer move.
//
// centroids is K x M matrix that cotains the coordinates for the centroids.
// The centroids are indexed by the 0 based rows of this matrix.
//  ____      _________
//  | 12.29   32.94 ... | <-- The coordinates for centroid 0
//  | 4.6     29.22 ... | <-- The coordinates for centroid 1
//  |_____    __________|
//
//
// CentPointDist is ax R x M matrix.  The rows have a 1:1 relationship to
// the rows in datapoints.  Column 0 contains the row number in centroids
// that corresponds to the centroid for the datapoint in row i of this matrix.
// Column 1 contains (x_i - mu(i))^2.
//  ____      _______
//  | 3        38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints
//  | 1        23 .21| <-- Centroid 1, squared error for the coordinates in row 1 of datapoints
//  | 0        14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints
//  _____     _______
//
func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model {
	/*  datapoints				  CentPoinDist            centroids
	                                  ________________
	    ____	  ____				  __|__	  ______	 |	  ____	___________
	    | ...	 |				 |	...			|	 V	 | ...		       |
	    | 3.0  5.1| <-- row i --> |	3	  32.12 |  row 3 | 3	 38.1, ... |
	    |____  ___|				 |____	  ______|	     |___	__________ |
	*/
	R, M := datapoints.GetSize()
	CentPointDist := matrix.Zeros(R, 2)
	k, _ := centroids.GetSize()

	clusterChanged := true
	var clusters []cluster

	for clusterChanged == true {
		clusterChanged = false
		clusters = make([]cluster, 0)

		jobs := make(chan PairPointCentroidJob, 1024)
		results := make(chan PairPointCentroidResult, 1024)
		done := make(chan int, 1024)

		// Pair each point with its closest centroid.
		go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results)
		for i := 0; i < numworkers; i++ {
			go doPairPointCentroidJobs(done, jobs)
		}
		go awaitPairPointCentroidCompletion(done, results)

		clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed

		// You have each data point grouped with a centroid,
		for idx, cent := 0, 0; cent < k; cent++ {
			// Select all the rows in CentPointDist whose first col value == cent.
			// Get the corresponding row vector from datapoints and place it in pointsInCluster.
			r, _ := CentPointDist.GetSize()
			matches := make([]int, 0)

			for i := 0; i < r; i++ {
				v := CentPointDist.Get(i, 0)
				if v == float64(cent) {
					matches = append(matches, i)
				}
			}

			// It is possible that some centroids may have zero points, so there
			// may not be any matches.
			if len(matches) == 0 {
				continue
			}

			pointsInCluster := matrix.Zeros(len(matches), M)
			i := 0

			for _, rownum := range matches {
				pointsInCluster.Set(i, 0, datapoints.Get(int(rownum), 0))
				pointsInCluster.Set(i, 1, datapoints.Get(int(rownum), 1))
				i++
			}

			// pointsInCluster now contains all the data points for the current
			// centroid.  The mean of the coordinates for this cluster becomes
			// the new centroid for this cluster.
			mean := pointsInCluster.MeanCols()
			centroids.SetRowVector(mean, cent)

			clust := cluster{pointsInCluster, mean, 0}
			clust.Variance = variance(clust, measurer)
			clusters = append(clusters, clust)
			idx++
		}
	}
	modelbic := calcbic(R, M, clusters)
	model := Model{modelbic, clusters}
	return model
}
Пример #7
0
// CalcDist finds the ManhattanDistance which is the sum of the aboslute
// difference of the coordinates.   Also known as rectilinear distance,
// city block distance, or taxicab distance.
func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) float64 {
	return math.Abs(a.Get(0, 0)-b.Get(0, 0)) + math.Abs(a.Get(0, 1)-b.Get(0, 1))
}
Пример #8
0
// Kmeansbi bisects a given cluster and determines which centroids give the lowest error.
// Take the points in a cluster
// While the number of cluster < k
//    for every cluster
//        measure total error
//        cacl kmeansp with k=2 on a given cluster
//        measure total error after kmeansp split
//    choose the cluster split with the lowest SSE
//    commit the chosen split
//
// N.B. We are using SSE until the BIC is completed.
func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) {
	numRows, numCols := datapoints.GetSize()
	clusterAssignment = matrix.Zeros(numRows, numCols)
	matCentroidlist = matrix.Zeros(k, numCols)
	centroid0 := datapoints.MeanCols()
	centroidlist := []*matrix.DenseMatrix{centroid0}

	// Initially create one cluster.
	for j := 0; j < numRows; j++ {
		point := datapoints.GetRowVector(j)
		distJ, err := measurer.CalcDist(centroid0, point)
		if err != nil {
			return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err))
		}
		clusterAssignment.Set(j, 1, math.Pow(distJ, 2))
	}

	var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix
	var bestCentroidToSplit int

	// Find the best centroid configuration.
	for len(centroidlist) < k {
		lowestSSE := math.Inf(1)
		// Split cluster
		for i, _ := range centroidlist {
			// Get the points in this cluster
			pointsCurCluster, err := clusterAssignment.FiltCol(float64(i), float64(i), 0)
			if err != nil {
				return matCentroidlist, clusterAssignment, err
			}

			centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer)
			if err != nil {
				return matCentroidlist, clusterAssignment, err
			}

			/* centroids is a 2X2 matrix of the best centroids found by kmeans

			   splitClustAssignment is a mX2 matrix where col0 is either 0 or 1 and refers to the rows in centroids
			   where col1 cotains the squared error between a centroid and a point.  The rows here correspond to
			   the rows in ptsInCurrCluster.  For example, if row 2 contains [1, 7.999] this means that centroid 1
			   has been paired with the point in row 2 of splitClustAssignment and that the squared error (distance
			   between centroid and point) is 7.999.
			*/

			// Calculate the sum of squared errors for each centroid.
			// This give a statistcal measurement of how good
			// the clustering is for this cluster.
			sseSplit := splitClusterAssignment.SumCol(1)
			// Calculate the SSE for the original cluster
			sqerr, err := clusterAssignment.FiltCol(float64(0), math.Inf(1), 0)
			if err != nil {
				return matCentroidlist, clusterAssignment, err
			}
			sseNotSplit := sqerr.SumCol(1)

			// TODO: Pre-BCI is this the best way to evaluate?
			if sseSplit+sseNotSplit < lowestSSE {
				bestCentroidToSplit = 1
				bestNewCentroids = matrix.MakeDenseCopy(centroids)
				bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment)
			}
		}

		// Applying the split overwrites the existing cluster assginments for the
		// cluster you have decided to split.  Kmeansp() returned two clusters
		// labeled 0 and 1. Change these cluster numbers to the cluster number
		// you are splitting and the next cluster to be added.
		m, err := bestClusterAssignment.FiltColMap(1, 1, 0)
		if err != nil {
			return matCentroidlist, clusterAssignment, err
		}
		for i, _ := range m {
			bestClusterAssignment.Set(i, 0, float64(len(centroidlist)))
		}

		n, err := bestClusterAssignment.FiltColMap(0, 0, 0)
		if err != nil {
			return matCentroidlist, clusterAssignment, err
		}
		for i, _ := range n {
			bestClusterAssignment.Set(i, 1, float64(bestCentroidToSplit))
		}

		fmt.Printf("Best centroid to split %f\n", bestCentroidToSplit)
		r, _ := bestClusterAssignment.GetSize()
		fmt.Printf("The length of best cluster assesment is %f\n", r)

		// Replace a centroid with the two best centroids from the split.
		centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0)
		centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1))

		// Reassign new clusters and SSE
		rows, _ := clusterAssignment.GetSize()
		for i, j := 0, 0; i < rows; i++ {
			if clusterAssignment.Get(i, 0) == float64(bestCentroidToSplit) {
				clusterAssignment.Set(i, 0, bestClusterAssignment.Get(j, 0))
				clusterAssignment.Set(i, 1, bestClusterAssignment.Get(j, 1))
				j++
			}
		}

		// make centroidlist into a matrix
		s := make([][]float64, len(centroidlist))
		for i, mat := range centroidlist {
			s[i][0] = mat.Get(0, 0)
			s[i][1] = mat.Get(0, 1)
		}
		matCentroidlist = matrix.MakeDenseMatrixStacked(s)
	}
	return matCentroidlist, clusterAssignment, nil
}