Пример #1
0
// TestComputeCentroid checks that ComputeCentroid returns an error for an
// empty matrix and that, for 2 x 2 inputs, the result equals the expected
// 1 x 2 row holding the mean of each column.
func TestComputeCentroid(t *testing.T) {
	// A 0 x 0 matrix must be rejected with an error.
	empty := matrix.Zeros(0, 0)
	if _, err := ComputeCentroid(empty); err == nil {
		t.Errorf("Did not raise error on empty matrix")
	}

	// All-ones input: the mean of every column is 1.
	twoByTwo := matrix.Ones(2, 2)
	centr, err := ComputeCentroid(twoByTwo)
	if err != nil {
		// Stop here: the returned matrix is meaningless once err is non-nil,
		// so comparing it below would only produce noise (or a panic).
		t.Fatalf("Could not compute centroid, err=%v", err)
	}
	expected := matrix.MakeDenseMatrix([]float64{1.0, 1.0}, 1, 2)
	if !matrix.Equals(centr, expected) {
		// "was" is the value actually computed, "should have been" the expectation.
		t.Errorf("Incorrect centroid: was %v, should have been %v", centr, expected)
	}

	// Column 0 now holds {3, 1}, so its mean (and the expected entry) is 2.
	twoByTwo.Set(0, 0, 3.0)
	expected.Set(0, 0, 2.0)
	centr, err = ComputeCentroid(twoByTwo)
	if err != nil {
		t.Fatalf("Could not compute centroid, err=%v", err)
	}
	if !matrix.Equals(centr, expected) {
		t.Errorf("Incorrect centroid: was %v, should have been %v", centr, expected)
	}
}
Пример #2
0
// variance returns the maximum likelihood estimate (MLE) of the variance of
// cluster c, under the identical spherical Gaussian assumption.
//
// Row i of c.Points is taken as the i-th data point and row 0 of c.Centroid
// as the centroid mu that every point is measured against.  measurer supplies
// the distance between the centroid row and a point row.
//
// With R = c.Numpoints() and K = c.Numcentroids() the value computed is
//
//	   1        __                  2
//	-------  *  \    dist(mu, x )
//	 R - K      /__ i          i
//
// where i indexes the individual points.
//
// N.B. mu denotes the coordinates of the centroid the points are compared
// with, not a mean recomputed from the points inside this function.
//
// Special cases:
//   - If c.Points and c.Centroid compare equal under matrix.Equals the result
//     is 0.
//   - If R == K the denominator is zero; the floating-point division then
//     yields a non-finite value (+Inf or NaN) rather than a panic.
//
// TODO would it be more efficient to calculate it in one pass instead of
// pre-calculating the mean?  Or will we always have to pre-calc to fill the
// cluster?
//
//	  1    __  2       1    / __    \2
//	----- \   x  - -------- |\   x  |
//	R - K /__  i          2 \/__  i /
//	               (R - K)
func variance(c cluster, measurer VectorMeasurer) float64 {
	if matrix.Equals(c.Points, c.Centroid) {
		return 0.0
	}

	n := c.Numpoints()
	denom := float64(n - c.Numcentroids())

	var sum float64
	if n > 0 {
		// Every point is compared with the same centroid row, so fetch it
		// once.  The n > 0 guard keeps the zero-point path from touching the
		// centroid at all, exactly as a loop that never runs would.
		mu := c.Centroid.GetRowVector(0)
		for i := 0; i < n; i++ {
			dist := measurer.CalcDist(mu, c.Points.GetRowVector(i))
			sum += dist * dist
		}
	}

	// Kept as (1/denom)*sum rather than sum/denom so rounding is unchanged.
	return (1.0 / denom) * sum
}