Ejemplo n.º 1
0
// kmeans partitions datapoints into K clusters.  This results in a partitioning of
// the data space into Voronoi cells.  The problem is NP-hard, so here we attempt
// to parallelize or make concurrent as many processes as possible to reduce the
// running time.
//
// 1. Place K points into the space represented by the objects that are being clustered.
// These points represent initial group centroids.
//
// 2. Assign each object to the group that has the closest centroid.
//
// 3. When all objects have been assigned, recalculate the positions of the K centroids
// by calculating the mean of all coordinates in a cluster and making that
// the new centroid.
//
// 4. Repeat Steps 2 and 3 until the centroids no longer move.
//
// datapoints is an R x M matrix with one data point per row.
//
// centroids is a K x M matrix that contains the coordinates for the centroids.
// The centroids are indexed by the 0-based rows of this matrix.  It is updated
// in place: after every assignment pass, the row of each centroid that has at
// least one point is overwritten with the mean of its points.
//
//	| 12.29   32.94 ... | <-- The coordinates for centroid 0
//	| 4.6     29.22 ... | <-- The coordinates for centroid 1
//
// measurer is handed to the pairing jobs and to variance.
//
// Internally, centPointDist is an R x 2 matrix.  Its rows have a 1:1 relationship
// to the rows in datapoints.  Column 0 contains the row number in centroids
// that corresponds to the centroid for the datapoint in row i of this matrix.
// Column 1 contains (x_i - mu(i))^2.
//
//	| 3        38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints
//	| 1        23.21 | <-- Centroid 1, squared error for the coordinates in row 1 of datapoints
//	| 0        14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints
//
// The returned Model holds the value computed by calcbic and the clusters from
// the final pass.  A centroid that ends up with no points is skipped: its row in
// centroids is left unchanged and it contributes no cluster to the model.
func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model {
	numPoints, numDims := datapoints.GetSize()
	centPointDist := matrix.Zeros(numPoints, 2)
	k, _ := centroids.GetSize()

	var clusters []cluster

	for clusterChanged := true; clusterChanged; {
		// Each pass rebuilds the cluster list from scratch; only the list
		// from the final (stable) pass is used for the model.
		clusters = make([]cluster, 0, k)

		jobs := make(chan PairPointCentroidJob, 1024)
		results := make(chan PairPointCentroidResult, 1024)
		done := make(chan int, 1024)

		// Pair each point with its closest centroid.
		go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results)
		for w := 0; w < numworkers; w++ {
			go doPairPointCentroidJobs(done, jobs)
		}
		go awaitPairPointCentroidCompletion(done, results)

		// This blocks so that all the results can be processed.
		clusterChanged = assessClusters(centPointDist, results)

		// Every data point is now grouped with a centroid; recompute each
		// centroid from the points assigned to it.
		for cent := 0; cent < k; cent++ {
			// Collect the rows of centPointDist whose first column == cent.
			// centPointDist has exactly numPoints rows by construction.
			matches := make([]int, 0, numPoints)
			for row := 0; row < numPoints; row++ {
				if centPointDist.Get(row, 0) == float64(cent) {
					matches = append(matches, row)
				}
			}

			// It is possible that some centroids may have zero points, so there
			// may not be any matches.
			if len(matches) == 0 {
				continue
			}

			// Copy every coordinate of each matching data point.  Copying only
			// the first two columns would leave the remaining dimensions at
			// zero and skew the mean whenever numDims > 2.
			pointsInCluster := matrix.Zeros(len(matches), numDims)
			for i, row := range matches {
				for col := 0; col < numDims; col++ {
					pointsInCluster.Set(i, col, datapoints.Get(row, col))
				}
			}

			// pointsInCluster now contains all the data points for the current
			// centroid.  The mean of the coordinates for this cluster becomes
			// the new centroid for this cluster.
			mean := pointsInCluster.MeanCols()
			centroids.SetRowVector(mean, cent)

			clust := cluster{pointsInCluster, mean, 0}
			clust.Variance = variance(clust, measurer)
			clusters = append(clusters, clust)
		}
	}

	modelbic := calcbic(numPoints, numDims, clusters)
	return Model{modelbic, clusters}
}