Ejemplo n.º 1
0
// ChooseCentroids picks k random centroids and returns them as a k by cols
// matrix, where cols is the number of columns in mat.
//
// For every column the smallest and largest value present in that column are
// determined, and each centroid coordinate is drawn uniformly from that range.
// The bounds are seeded from the first element of the column rather than from
// zero; seeding from zero would widen the range to include 0 whenever all
// values in the column share the same sign.
func (c randCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix {
	_, cols := mat.GetSize()
	centroids := matrix.Zeros(k, cols)

	for colnum := 0; colnum < cols; colnum++ {
		r := mat.ColSlice(colnum)
		if len(r) == 0 {
			// No data in this column: the coordinates stay at the zero value
			// that matrix.Zeros already put there.
			continue
		}

		// Single pass over the column to find both bounds.
		minj, maxj := r[0], r[0]
		for _, val := range r[1:] {
			minj = math.Min(minj, val)
			maxj = math.Max(maxj, val)
		}

		// minj + (maxj-minj)*u with u in [0, 1) stays inside [minj, maxj).
		for h := 0; h < k; h++ {
			randInRange := ((maxj - minj) * rand.Float64()) + minj
			centroids.Set(h, colnum, randInRange)
		}
	}
	return centroids
}
Ejemplo n.º 2
0
// ComputeCentroid returns mat.SumCols() scaled by 1/rows, where rows is the
// row count reported by mat.GetSize().
//
// When mat has no rows the unscaled SumCols result is returned together with
// a non-nil error.
//
// NOTE(review): presumably SumCols yields the per-column sums and Scale works
// in place, which makes the result the column-wise mean of the points -
// confirm against the matrix package.
func ComputeCentroid(mat *matrix.DenseMatrix) (*matrix.DenseMatrix, error) {
	centroid := mat.SumCols()
	numPoints, _ := mat.GetSize()
	if numPoints == 0 {
		return centroid, errors.New("No points inputted")
	}

	weight := 1.0 / float64(numPoints)
	centroid.Scale(weight)
	return centroid, nil
}
Ejemplo n.º 3
0
// addPairPointCentroidJobs queues one PairPointCentroidJob per row of
// datapoints on the jobs channel and closes the channel once every row has
// been sent.
//
// Each job is built from the row vector, the centroids matrix, the results
// channel, the row index and the measurer, in that order.
func addPairPointCentroidJobs(jobs chan<- PairPointCentroidJob, datapoints,
	centroids *matrix.DenseMatrix, measurer VectorMeasurer, results chan<- PairPointCentroidResult) {
	total, _ := datapoints.GetSize()
	for row := 0; row < total; row++ {
		jobs <- PairPointCentroidJob{datapoints.GetRowVector(row), centroids, results, row, measurer}
	}
	// Closing tells the consumers ranging over jobs that no more work follows.
	close(jobs)
}
Ejemplo n.º 4
0
// Kmeansp returns means and distance squared of the coordinates for each
// centroid using parallel computation.
//
// Input values
//
// datapoints - an nX2 matrix of R^2 coordinates, one point per row.
//
// k - the number of centroids to choose.
//
// cc - the CentroidChooser whose ChooseCentroids(datapoints, k) supplies the
// initial centroids.
//
// measurer - anything that implements the matutil.VectorMeasurer interface to
// calculate the distance between a centroid and datapoint. (e.g., Euclidean
// distance)
//
// Return values
//
// centroidMean - a matrix with k rows where the row number corresponds to the
// same row in the centroid matrix and columns 0 and 1 are the means of the
// coordinates for that cluster, i.e., the best centroids that could be
// determined.  Rows of centroids that received no points stay zero.
//
//  ____      ______
//  | 12.29   32.94 | <-- The mean of coordinates for centroid 0
//  | 4.6     29.22 | <-- The mean of coordinates for centroid 1
//  |_____    ______|
//
// centroidSqErr - a matrix with one row per datapoint where the first column
// contains a number indicating the centroid and the second column contains
// the minimum distance between centroid and point squared (i.e., the squared
// error).
//
//  ____      _______
//  | 0        38.01 | <-- Centroid 0, squared error for the coordinates in row 0 of datapoints
//  | 1        23.21 | <-- Centroid 1, squared error for the coordinates in row 1 of datapoints
//  | 0        14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints
//  _____     _______
//
// err - non-nil when filtering centroidSqErr for a centroid fails; the
// partially filled matrices are returned alongside it.
//
// Side effect: when /var/tmp/km.log can be created, the standard logger is
// redirected to it and stays redirected after the call.
func Kmeansp(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (centroidMean,
	centroidSqErr *matrix.DenseMatrix, err error) {
	// Only redirect the logger when the file could really be created;
	// installing a nil *os.File as the log writer would silently drop output.
	if fp, createErr := os.Create("/var/tmp/km.log"); createErr == nil {
		// The file is deliberately left open: the logger keeps using it after
		// this function returns.
		log.SetOutput(io.Writer(fp))
	}

	centroids := cc.ChooseCentroids(datapoints, k)
	numRows, numCols := datapoints.GetSize()
	centroidSqErr = matrix.Zeros(numRows, numCols)
	centroidMean = matrix.Zeros(k, numCols)

	jobs := make(chan PairPointCentroidJob, numworkers)
	results := make(chan PairPointCentroidResult, minimum(1024, numRows))
	done := make(chan int, numworkers)

	go addPairPointCentroidJobs(jobs, datapoints, centroidSqErr, centroids, measurer, results)
	for i := 0; i < numworkers; i++ {
		go doPairPointCentroidJobs(done, jobs)
	}
	go awaitPairPointCentroidCompletion(done, results)
	processPairPointToCentroidResults(centroidSqErr, results) // This blocks so that all the results can be processed

	// Now that each data point is grouped with a centroid, walk through the
	// centroids and, for each one, collect the original coordinates of its
	// points from datapoints into pointsInCluster.
	for c := 0; c < k; c++ {
		// c is the index that identifies the current centroid.
		// Select all the rows in centroidSqErr whose first col value == c.
		matches, filtErr := centroidSqErr.FiltColMap(float64(c), float64(c), 0) // rows with c in column 0.
		if filtErr != nil {
			// Report the failure instead of returning a nil error.
			return centroidMean, centroidSqErr, filtErr
		}
		// It is possible that some centroids will not have any points, so there
		// may not be any matches in the first column of centroidSqErr.
		if len(matches) == 0 {
			continue
		}

		// NOTE(review): this treats each key of matches as a dense index and
		// each value as a row number of datapoints - confirm against FiltColMap.
		pointsInCluster := matrix.Zeros(len(matches), 2)
		for d, rownum := range matches {
			pointsInCluster.Set(d, 0, datapoints.Get(int(rownum), 0))
			pointsInCluster.Set(d, 1, datapoints.Get(int(rownum), 1))
		}

		// pointsInCluster now contains all the data points for the current
		// centroid.  Take the mean of each of the 2 cols in pointsInCluster.
		means := pointsInCluster.MeanCols()
		centroidMean.Set(c, 0, means.Get(0, 0))
		centroidMean.Set(c, 1, means.Get(0, 1))
	}
	return centroidMean, centroidSqErr, nil
}
Ejemplo n.º 5
0
// modelMean builds a matrix with the same shape as points whose i-th row is
// matrix.Difference(centroid, row i of points), and returns the MeanCols of
// that matrix.
func modelMean(points, centroid *matrix.DenseMatrix) *matrix.DenseMatrix {
	numPoints, dims := points.GetSize()
	offsets := matrix.Zeros(numPoints, dims)

	for row := 0; row < numPoints; row++ {
		offsets.SetRowVector(matrix.Difference(centroid, points.GetRowVector(row)), row)
	}
	return offsets.MeanCols()
}
Ejemplo n.º 6
0
// ChooseCentroids lays out k initial centroids evenly along an ellipse that
// is centred on the midpoint of the bounding box reported by
// matutil.GetBoundaries(mat).  It only fills columns 0 and 1, so it is only
// meaningful for two-dimensional data.
//
// The semi-axes of the ellipse are half the bounding box extents multiplied
// by c.frac.  The first centroid is placed at a random angle and the rest
// follow at steps of 2*Pi/k so that they cover the whole ellipse.
//
// The result is a k by cols matrix, where cols is the column count of mat.
func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix {
	_, cols := mat.GetSize()
	var xmin, xmax, ymin, ymax = matutil.GetBoundaries(mat)
	// (x0, y0) is the centre of the bounding box; rx and ry are its half extents.
	x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0
	centroids := matrix.Zeros(k, cols)
	rx, ry := xmax-x0, ymax-y0
	thetaInit := rand.Float64() * math.Pi
	step := 2.0 * math.Pi / float64(k)

	for i := 0; i < k; i++ {
		theta := thetaInit + float64(i)*step
		// Offset by the centre so the ellipse sits inside the data rather
		// than around the origin.
		centroids.Set(i, 0, x0+rx*c.frac*math.Cos(theta))
		centroids.Set(i, 1, y0+ry*c.frac*math.Sin(theta))
	}
	return centroids
}
Ejemplo n.º 7
0
// Xmeans runs k-means for every k from the number of rows in centroids up to
// kmax on a data set.  After each k-means run the resulting clusters are
// handed to bisect, and the number of clusters in the bisected model decides
// how k advances: while it does not exceed kmax one new, not yet present
// centroid chosen by cc is appended and k grows by one; otherwise k jumps to
// that number, which ends the loop.
//
// It returns the model produced by every k-means run and a map of the errors
// that occurred.  k-means errors are keyed by the decimal value of k; a
// failure to append a centroid is stored under "ApppendRow" and stops the
// search.
//
// Side effects: the standard logger is redirected to /var/tmp/xmeans.log when
// that file can be opened, and runtime.GOMAXPROCS is set to numworkers.
func Xmeans(datapoints, centroids *matrix.DenseMatrix, kmax int, cc, bisectcc CentroidChooser, measurer VectorMeasurer) ([]Model, map[string]error) {
	logname := "/var/tmp/xmeans.log"
	// O_CREATE opens or creates the file in a single call.  The logger is only
	// redirected when the open succeeded, so it never ends up writing to a
	// nil file.
	if fp, err := os.OpenFile(logname, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0666); err != nil {
		fmt.Printf("Xmeans: cannot open %s for logging.\n", logname)
	} else {
		// The file stays open on purpose: the logger keeps using it.
		log.SetOutput(io.Writer(fp))
	}

	k, _ := centroids.GetSize()
	log.Printf("Start k=%d kmax=%d\n", k, kmax)

	R, M := datapoints.GetSize()
	errs := make(map[string]error)
	runtime.GOMAXPROCS(numworkers)
	models := make([]Model, 0)

	for k <= kmax {
		log.Printf("kmeans started k=%d\n", k)
		model, err := kmeans(datapoints, centroids, measurer)
		if err != nil {
			// NOTE(review): the error is recorded but the model is still
			// bisected and stored below - confirm that this is intended.
			errs[strconv.Itoa(k)] = err
		}

		// Bisect the returned clusters
		log.Println("bisect started")
		bimodel := bisect(model.Clusters, R, M, bisectcc, measurer)
		numCentroids := len(bimodel.Clusters)
		log.Printf("bisect returned %d clusters\n", numCentroids)
		models = append(models, model)

		if numCentroids > kmax {
			// Pushes k past kmax, which terminates the loop.
			k = numCentroids
			continue
		}

		// Keep drawing until the candidate is not already one of the centroids.
		var cent *matrix.DenseMatrix
		for {
			cent = cc.ChooseCentroids(datapoints, 1)
			if !centroids.RowExists(cent) {
				break
			}
		}

		centroids, err = centroids.AppendRow(cent)
		if err != nil {
			log.Printf("AppendRow: %v\n", err)
			errs["ApppendRow"] = err
			break
		}
		k++
	}

	log.Println("Finished")
	return models, errs
}
Ejemplo n.º 8
0
// ChooseCentroids lays out the k initial centroids evenly along an ellipse
// inscribed and centered within the boundaries of the dataset.
// It is only defined for M=2: only columns 0 and 1 of the result are filled.
//
// Frac must be a float between 0 and 1.  It determines the scale of the
// inscribing ellipse relative to the dataset, so Frac==1.0 produces an
// ellipse that spans the entire dataset, while Frac==0.5 produces an ellipse
// spanning half the dataset.
//
// The result is a k by cols matrix, where cols is the column count of mat.
func (c EllipseCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix {
	_, cols := mat.GetSize()
	// TODO Cache boundaries call for each matrix so that it is not called on each bisect
	var xmin, xmax, ymin, ymax = boundaries(mat)

	// (x0, y0) is the centre of the bounding box; rx and ry are its half extents.
	x0, y0 := xmin+(xmax-xmin)/2.0, ymin+(ymax-ymin)/2.0
	centroids := matrix.Zeros(k, cols)
	rx, ry := xmax-x0, ymax-y0
	thetaInit := rand.Float64() * math.Pi

	for i := 0; i < k; i++ {
		theta := thetaInit + float64(i)*2.0*math.Pi/float64(k)
		// Offset by the centre so the ellipse sits inside the data rather
		// than around the origin.
		centroids.Set(i, 0, x0+rx*c.Frac*math.Cos(theta))
		centroids.Set(i, 1, y0+ry*c.Frac*math.Sin(theta))
	}
	return centroids
}
Ejemplo n.º 9
0
// ChooseCentroids picks k distinct points from the dataset and returns copies
// of them as the rows of a new matrix.  If k is greater than the number of
// points in the matrix then k is set to the number of points, so the result
// never contains rows that do not correspond to a data point.
func (c DataCentroids) ChooseCentroids(mat *matrix.DenseMatrix, k int) *matrix.DenseMatrix {
	rows, cols := mat.GetSize()
	// Clamp before allocating; otherwise the surplus rows would stay all-zero
	// and be returned as if they were centroids.
	if k > rows {
		k = rows
	}
	centroids := matrix.Zeros(k, cols)

	// The map keeps track of which data points have already been chosen so
	// that no point is picked twice.
	chosenIdxs := make(map[int]bool, k)
	for len(chosenIdxs) < k {
		chosenIdxs[rand.Intn(rows)] = true
	}

	i := 0
	for idx := range chosenIdxs {
		centroids.SetRowVector(mat.GetRowVector(idx).Copy(), i)
		i++
	}
	return centroids
}
Ejemplo n.º 10
0
// variance calculates the unbiased variance based on the number of data points
// and centroids (i.e., parameters).  In our case, numcentroids should always be 1
// since each data point has been paired with one centroid.
//
// The points matrix contains the coordinates of the data points.
// The centroid matrix is 1Xn and contains the centroid coordinates.
//
//	variance = 1 / (numpoints - numcentroids) * sum for all points (x_i - centroid)^2
//
// The distance between a point and the centroid is obtained from measurer.
//
// An error is returned when centroid does not have exactly one row, when
// there are fewer than two points (the estimate is undefined because the
// divisor numpoints - 1 would be zero or negative), or when measurer fails.
func variance(points, centroid *matrix.DenseMatrix, measurer matutil.VectorMeasurer) (float64, error) {
	crows, _ := centroid.GetSize()
	if crows != 1 {
		return float64(0), fmt.Errorf("variance: expected centroid matrix with 1 row, received matrix with %d rows.", crows)
	}
	prows, _ := points.GetSize()
	if prows < 2 {
		return float64(0), errors.New("variance: at least two points are required")
	}

	// Term 1: 1 / (numpoints - numcentroids)
	t1 := 1 / float64(prows-1)

	// Term 2: sum over all points of the squared distance to the centroid.
	// The distance is taken to the centroid itself, as the formula requires,
	// not to the mean of the per-point differences.
	t2 := float64(0)
	for i := 0; i < prows; i++ {
		p := points.GetRowVector(i)
		dist, err := measurer.CalcDist(p, centroid)
		if err != nil {
			return float64(-1), fmt.Errorf("variance: CalcDist returned: %v", err)
		}
		t2 += math.Pow(dist, 2)
	}

	return t1 * t2, nil
}
Ejemplo n.º 11
0
// makeCentPointDist pairs every row of datapoints with a centroid using
// EuclidDist as the measurer and returns the matrix that assessClusters
// filled from the results.  The returned matrix has the same shape as
// datapoints.
//
// NOTE(review): one worker goroutine is started per data row and done is
// unbuffered - confirm that this matches what
// awaitPairPointCentroidCompletion expects.
func makeCentPointDist(datapoints, centroids *matrix.DenseMatrix) *matrix.DenseMatrix {
	numPoints, dims := datapoints.GetSize()
	CentPointDist := matrix.Zeros(numPoints, dims)

	var ed EuclidDist
	jobs := make(chan PairPointCentroidJob, numPoints)
	results := make(chan PairPointCentroidResult, minimum(1024, numPoints))
	done := make(chan int)

	go addPairPointCentroidJobs(jobs, datapoints, centroids, ed, results)
	for w := 0; w < numPoints; w++ {
		go doPairPointCentroidJobs(done, jobs)
	}
	go awaitPairPointCentroidCompletion(done, results)

	// Only the filled matrix is of interest here; the "changed" flag reported
	// by assessClusters is deliberately discarded.
	_ = assessClusters(CentPointDist, results)
	return CentPointDist
}
Ejemplo n.º 12
0
// boundaries returns the min and max x and y values for a dense matrix
// of shape m x 2, where column 0 holds the x values and column 1 the y
// values.  Only those two columns are read.
//
// For a matrix without rows all four results are zero.
func boundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) {
	rows, _ := mat.GetSize()
	if rows == 0 {
		// Nothing to scan; avoid reading row 0 of an empty matrix.
		return 0, 0, 0, 0
	}

	// Seed both bounds with the first point, then widen them row by row.
	xmin, ymin = mat.Get(0, 0), mat.Get(0, 1)
	xmax, ymax = xmin, ymin

	for i := 1; i < rows; i++ {
		xi, yi := mat.Get(i, 0), mat.Get(i, 1)

		switch {
		case xi > xmax:
			xmax = xi
		case xi < xmin:
			xmin = xi
		}

		switch {
		case yi > ymax:
			ymax = yi
		case yi < ymin:
			ymin = yi
		}
	}
	return xmin, xmax, ymin, ymax
}
Ejemplo n.º 13
0
// GetBoundaries returns the min and max x and y values for a dense matrix
// of shape m x 2, where column 0 holds the x values and column 1 the y
// values.  Only those two columns are read.
//
// For a matrix without rows all four results are zero.
//
// TODO - the column count is not validated; should there be an err return,
// or should we panic when it is not 2?
func GetBoundaries(mat *matrix.DenseMatrix) (xmin, xmax, ymin, ymax float64) {
	rows, _ := mat.GetSize()
	if rows == 0 {
		// Nothing to scan; avoid reading row 0 of an empty matrix.
		return 0, 0, 0, 0
	}

	// Seed both bounds with the first point, then widen them row by row.
	xmin, ymin = mat.Get(0, 0), mat.Get(0, 1)
	xmax, ymax = xmin, ymin

	for i := 1; i < rows; i++ {
		xi, yi := mat.Get(i, 0), mat.Get(i, 1)

		switch {
		case xi > xmax:
			xmax = xi
		case xi < xmin:
			xmin = xi
		}

		switch {
		case yi > ymax:
			ymax = yi
		case yi < ymin:
			ymin = yi
		}
	}
	return xmin, xmax, ymin, ymax
}
Ejemplo n.º 14
0
// assessClusters drains the results channel and stores every result in the
// CentPointDist matrix: column 0 of row result.rowNum receives the centroid
// row number and column 1 the squared distance.
//
// It returns true when at least one stored centroid row number differs from
// the value that was in column 0 before it was overwritten.
func assessClusters(CentPointDist *matrix.DenseMatrix, results <-chan PairPointCentroidResult) bool {
	var changed bool
	for res := range results {
		previous := CentPointDist.Get(res.rowNum, 0)
		if previous != res.centroidRowNum {
			changed = true
		}
		CentPointDist.Set(res.rowNum, 0, res.centroidRowNum)
		CentPointDist.Set(res.rowNum, 1, res.distSquared)
	}
	return changed
}
Ejemplo n.º 15
0
// processPairPointToCentroidResults drains the results channel into the
// centroidSqErr matrix.  For every result, column 0 of row rowNum is set to
// the centroid row number and column 1 to the squared distance.  The call
// returns once the channel has been closed and emptied.
func processPairPointToCentroidResults(centroidSqErr *matrix.DenseMatrix, results <-chan PairPointCentroidResult) {
	for res := range results {
		row := res.rowNum
		centroidSqErr.Set(row, 0, res.centroidRowNum)
		centroidSqErr.Set(row, 1, res.distSquared)
	}
}
Ejemplo n.º 16
0
// kmeans partitions datapoints into K clusters.  This results in a partitioning of
// the data space into Voronoi cells.  The problem is NP-hard so here we attempt
// to parallelize or make concurrent as many processes as possible to reduce the
// running time.
//
// 1. Place K points into the space represented by the objects that are being clustered.
// These points represent initial group centroids.
//
// 2. Assign each object to the group that has the closest centroid.
//
// 3. When all objects have been assigned, recalculate the positions of the K centroids
// by calculating the mean of all coordinates in a cluster and making that
// the new centroid.
//
// 4. Repeat Steps 2 and 3 until the centroids no longer move.
//
// centroids is a K x M matrix that contains the coordinates for the centroids.
// The centroids are indexed by the 0 based rows of this matrix.  The matrix
// is updated in place with the new centroid of every non-empty cluster.
//  ____      _________
//  | 12.29   32.94 ... | <-- The coordinates for centroid 0
//  | 4.6     29.22 ... | <-- The coordinates for centroid 1
//  |_____    __________|
//
//
// CentPointDist is an R x 2 matrix.  The rows have a 1:1 relationship to
// the rows in datapoints.  Column 0 contains the row number in centroids
// that corresponds to the centroid for the datapoint in row i of this matrix.
// Column 1 contains (x_i - mu(i))^2.
//  ____      _______
//  | 3        38.01 | <-- Centroid 3, squared error for the coordinates in row 0 of datapoints
//  | 1        23.21 | <-- Centroid 1, squared error for the coordinates in row 1 of datapoints
//  | 0        14.12 | <-- Centroid 0, squared error for the coordinates in row 2 of datapoints
//  _____     _______
//
// The returned Model holds the clusters of the final iteration (centroids
// without points are left out) and the value calcbic computes for them.
func kmeans(datapoints, centroids *matrix.DenseMatrix, measurer VectorMeasurer) Model {
	R, M := datapoints.GetSize()
	CentPointDist := matrix.Zeros(R, 2)
	k, _ := centroids.GetSize()

	var clusters []cluster

	for clusterChanged := true; clusterChanged; {
		clusters = make([]cluster, 0)

		jobs := make(chan PairPointCentroidJob, 1024)
		results := make(chan PairPointCentroidResult, 1024)
		done := make(chan int, 1024)

		// Pair each point with its closest centroid.
		go addPairPointCentroidJobs(jobs, datapoints, centroids, measurer, results)
		for i := 0; i < numworkers; i++ {
			go doPairPointCentroidJobs(done, jobs)
		}
		go awaitPairPointCentroidCompletion(done, results)

		clusterChanged = assessClusters(CentPointDist, results) // This blocks so that all the results can be processed

		// Each data point is now grouped with a centroid.
		for cent := 0; cent < k; cent++ {
			// Select all the rows in CentPointDist whose first col value == cent.
			matches := make([]int, 0)
			for i := 0; i < R; i++ {
				if CentPointDist.Get(i, 0) == float64(cent) {
					matches = append(matches, i)
				}
			}

			// It is possible that some centroids may have zero points, so there
			// may not be any matches.
			if len(matches) == 0 {
				continue
			}

			// Copy every coordinate of the matching points, not only the first
			// two, so that the mean is right for any number of dimensions.
			pointsInCluster := matrix.Zeros(len(matches), M)
			for i, rownum := range matches {
				for col := 0; col < M; col++ {
					pointsInCluster.Set(i, col, datapoints.Get(rownum, col))
				}
			}

			// pointsInCluster now contains all the data points for the current
			// centroid.  The mean of the coordinates for this cluster becomes
			// the new centroid for this cluster.
			mean := pointsInCluster.MeanCols()
			centroids.SetRowVector(mean, cent)

			clust := cluster{pointsInCluster, mean, 0}
			clust.Variance = variance(clust, measurer)
			clusters = append(clusters, clust)
		}
	}

	modelbic := calcbic(R, M, clusters)
	return Model{modelbic, clusters}
}
Ejemplo n.º 17
0
// Xmeans runs k-means repeatedly while k <= kmax.  After each k-means run the
// resulting clusters are handed to bisect, and the number of clusters in the
// bisected model decides how k advances: when it exceeds kmax, k jumps to that
// number (which ends the loop); otherwise one new centroid that is not yet
// part of centroids is chosen by cc, appended, and k grows by one.
//
// It returns the model of every k-means run together with a map of errors.
// A k greater than kmax is rejected up front with an error stored under "k";
// a failure to append a centroid is stored under "ApppendRow" and stops the
// search.
//
// Side effect: runtime.GOMAXPROCS is set to numworkers.
func Xmeans(datapoints, centroids *matrix.DenseMatrix, k, kmax int, cc, bisectcc CentroidChooser, measurer VectorMeasurer) ([]Model, map[string]error) {
	// Simple logging of the elapsed time between major events is disabled.
	// To enable it, restore the snippet below together with the matching
	// import statements and the log calls in the loop.
	/*	logname := "/var/tmp/xmeans.log"
		fp, err := os.OpenFile(logname, os.O_RDWR|os.O_APPEND, 0666)
		if err != nil {
			if os.IsNotExist(err) {
				fp, err = os.Create(logname)
				if err != nil {
					fmt.Printf("Xmeans: cannot open %s for logging.\n", logname)
				}
			}
		}

		log.SetOutput(io.Writer(fp))
	*/
	if k > kmax {
		return []Model{}, map[string]error{
			"k": errors.New(fmt.Sprintf("k must be <= kmax.  Received k=%d and kmax=%d.", k, kmax)),
		}
	}

	numPoints, dims := datapoints.GetSize()
	failures := make(map[string]error)
	runtime.GOMAXPROCS(numworkers)
	fitted := make([]Model, 0)

	for k <= kmax {
		current := kmeans(datapoints, centroids, measurer)

		// Bisect the clusters k-means came back with.
		split := bisect(current.Clusters, numPoints, dims, bisectcc, measurer)
		splitCount := len(split.Clusters)
		fitted = append(fitted, current)

		if splitCount > kmax {
			// Moves k beyond kmax, so the loop condition fails next time round.
			k = splitCount
			continue
		}

		// Draw candidates until one is found that is not already a centroid.
		var candidate *matrix.DenseMatrix
		for {
			candidate = cc.ChooseCentroids(datapoints, 1)
			if !centroids.RowExists(candidate) {
				break
			}
		}

		var appendErr error
		centroids, appendErr = centroids.AppendRow(candidate)
		if appendErr != nil {
			failures["ApppendRow"] = appendErr
			break
		}
		k++
	}

	return fitted, failures
}
Ejemplo n.º 18
0
// CalcDist returns the Manhattan distance between the first two coordinates
// of row 0 of a and row 0 of b, i.e. the sum of the absolute differences of
// columns 0 and 1.  It is also known as rectilinear distance, city block
// distance, or taxicab distance.
func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) float64 {
	dx := a.Get(0, 0) - b.Get(0, 0)
	dy := a.Get(0, 1) - b.Get(0, 1)
	return math.Abs(dx) + math.Abs(dy)
}
Ejemplo n.º 19
0
// Kmeansbi repeatedly bisects clusters until k clusters exist, each time
// committing the split that gives the lowest total error.
//
//	Start with all points in one cluster
//	While the number of clusters < k
//	    for every cluster
//	        run Kmeansp with k=2 on the points of that cluster
//	        measure the total error after that split
//	    choose the cluster split with the lowest SSE
//	    commit the chosen split
//
// N.B. We are using SSE until the BIC is completed.
//
// Return values
//
// matCentroidlist - a matrix with k rows holding the centroids that were
// found, one per row.
//
// clusterAssignment - a matrix with one row per datapoint where column 0 is
// the row of matCentroidlist the point belongs to and column 1 is the squared
// distance between the point and its centroid.
//
// err - non-nil when measurer or Kmeansp fails, or when no cluster could be
// split; the matrices returned alongside it are only partially filled.
func Kmeansbi(datapoints *matrix.DenseMatrix, k int, cc CentroidChooser, measurer matutil.VectorMeasurer) (matCentroidlist, clusterAssignment *matrix.DenseMatrix, err error) {
	numRows, numCols := datapoints.GetSize()
	clusterAssignment = matrix.Zeros(numRows, numCols)
	matCentroidlist = matrix.Zeros(k, numCols)
	centroid0 := datapoints.MeanCols()
	centroidlist := []*matrix.DenseMatrix{centroid0}

	// Initially create one cluster: column 0 is already 0 for every row, so
	// only the squared error against the overall mean has to be filled in.
	for j := 0; j < numRows; j++ {
		point := datapoints.GetRowVector(j)
		distJ, err := measurer.CalcDist(centroid0, point)
		if err != nil {
			return matCentroidlist, clusterAssignment, errors.New(fmt.Sprintf("Kmeansbi: CalcDist returned err=%v", err))
		}
		clusterAssignment.Set(j, 1, math.Pow(distJ, 2))
	}

	// Find the best centroid configuration.
	for len(centroidlist) < k {
		lowestSSE := math.Inf(1)
		bestCentroidToSplit := -1
		var bestClusterAssignment, bestNewCentroids *matrix.DenseMatrix
		var bestMembers []int

		// Try splitting every existing cluster.
		for i := range centroidlist {
			// members are the rows of datapoints in cluster i; sseNotSplit is
			// the error contributed by all the other clusters.
			members, sseNotSplit := clusterMembership(clusterAssignment, i)
			if len(members) == 0 {
				continue
			}

			// Gather the actual coordinates of the points in this cluster.
			pointsCurCluster := matrix.Zeros(len(members), numCols)
			for r, row := range members {
				for col := 0; col < numCols; col++ {
					pointsCurCluster.Set(r, col, datapoints.Get(row, col))
				}
			}

			// centroids holds the two centroids found for this cluster.
			// splitClusterAssignment has one row per row of pointsCurCluster:
			// column 0 is 0 or 1 and refers to a row of centroids, column 1
			// is the squared error between that centroid and the point.
			centroids, splitClusterAssignment, err := Kmeansp(pointsCurCluster, 2, cc, measurer)
			if err != nil {
				return matCentroidlist, clusterAssignment, err
			}

			// The total error of this candidate is the error inside the two
			// new clusters plus the error of the clusters left untouched.
			sseSplit := splitClusterAssignment.SumCol(1)

			// TODO: Pre-BIC is this the best way to evaluate?
			if total := sseSplit + sseNotSplit; total < lowestSSE {
				lowestSSE = total
				bestCentroidToSplit = i
				bestMembers = members
				bestNewCentroids = matrix.MakeDenseCopy(centroids)
				bestClusterAssignment = matrix.MakeDenseCopy(splitClusterAssignment)
			}
		}

		if bestCentroidToSplit < 0 {
			// Every cluster was empty or produced an unusable error value.
			return matCentroidlist, clusterAssignment, errors.New("Kmeansbi: no cluster could be split")
		}

		fmt.Printf("Best centroid to split %d\n", bestCentroidToSplit)
		r, _ := bestClusterAssignment.GetSize()
		fmt.Printf("The length of best cluster assesment is %d\n", r)

		// Commit the split.  Kmeansp labelled the two halves 0 and 1: half 0
		// keeps the number of the cluster that was split and half 1 becomes
		// the next new cluster.  Row j of bestClusterAssignment belongs to
		// row bestMembers[j] of datapoints.
		newLabel := float64(len(centroidlist))
		for j, row := range bestMembers {
			label := float64(bestCentroidToSplit)
			if bestClusterAssignment.Get(j, 0) == 1 {
				label = newLabel
			}
			clusterAssignment.Set(row, 0, label)
			clusterAssignment.Set(row, 1, bestClusterAssignment.Get(j, 1))
		}

		// Replace a centroid with the two best centroids from the split.
		centroidlist[bestCentroidToSplit] = bestNewCentroids.GetRowVector(0)
		centroidlist = append(centroidlist, bestNewCentroids.GetRowVector(1))
	}

	// Make centroidlist into a matrix.  This also covers k == 1, where no
	// split happens and the only centroid is the overall mean.
	for i, cent := range centroidlist {
		if i >= k {
			break
		}
		for col := 0; col < numCols; col++ {
			matCentroidlist.Set(i, col, cent.Get(0, col))
		}
	}
	return matCentroidlist, clusterAssignment, nil
}

// clusterMembership scans assignment once and returns the row numbers whose
// column 0 equals id, together with the sum of column 1 over all other rows.
func clusterMembership(assignment *matrix.DenseMatrix, id int) (members []int, sseOthers float64) {
	rows, _ := assignment.GetSize()
	for r := 0; r < rows; r++ {
		if assignment.Get(r, 0) == float64(id) {
			members = append(members, r)
		} else {
			sseOthers += assignment.Get(r, 1)
		}
	}
	return members, sseOthers
}
Ejemplo n.º 20
0
// CalcDist finds the Manhattan distance, which is the sum of the absolute
// differences of the coordinates.  Also known as rectilinear distance,
// city block distance, or taxicab distance.
//
// a and b must both be row vectors (exactly one row) with the same number of
// columns; otherwise dist is 0 and err describes the mismatch.  Every column
// takes part in the sum.
func (md ManhattanDist) CalcDist(a, b *matrix.DenseMatrix) (dist float64, err error) {
	arows, acols := a.GetSize()
	brows, bcols := b.GetSize()

	if arows != 1 || brows != 1 {
		return 0, errors.New(fmt.Sprintf("matutil: Matrices must contain only 1 row.  a has %d and b has %d.", arows, brows))
	}
	// Both are row vectors here, so only the column counts can still differ.
	if acols != bcols {
		return 0, errors.New(fmt.Sprintf("matutil: Matrices must have the same dimensions.  a=%dX%d b=%dX%d", arows, acols, brows, bcols))
	}

	for j := 0; j < acols; j++ {
		dist += math.Abs(a.Get(0, j) - b.Get(0, j))
	}
	return dist, nil
}