func TestComputeCentroid(t *testing.T) { empty := matrix.Zeros(0, 0) _, err := ComputeCentroid(empty) if err == nil { t.Errorf("Did not raise error on empty matrix") } twoByTwo := matrix.Ones(2, 2) centr, err := ComputeCentroid(twoByTwo) if err != nil { t.Errorf("Could not compute centroid, err=%v", err) } expected := matrix.MakeDenseMatrix([]float64{1.0, 1.0}, 1, 2) if !matrix.Equals(centr, expected) { t.Errorf("Incorrect centroid: was %v, should have been %v", expected, centr) } twoByTwo.Set(0, 0, 3.0) expected.Set(0, 0, 2.0) centr, err = ComputeCentroid(twoByTwo) if err != nil { t.Errorf("Could not compute centroid, err=%v", err) } if !matrix.Equals(centr, expected) { t.Errorf("Incorrect centroid: was %v, should have been %v", expected, centr) } }
// variance is the maximum likelihood estimate (MLE) for the variance, under // the identical spherical Gaussian assumption. // // points = an R x M matrix of all point coordinates. // // CentPointDist = R x M+1 matrix. Column 0 contains the index {0...K-1} of // a centroid. Column 1 contains (datapoint_i - mu(i))^2 // // centroids = K x M+1 matrix. Column 0 continas the centroid index {0...K}. // Columns 1...M contain the centroid coordinates. (See kmeans() for an example.) // // 1 __ 2 // ------ * \ (x - mu ) // R - K /__ i i (i) // // where i indexes the individual points. // // N.B. mu_(i) denotes the coordinates of the centroid closest to the i-th data point. Not // the mean of the entire cluster. // // TODO would it be more efficient to calculate it in one pass instead of pre-calculating the // mean? Or will we always have to pre-calc to fill the cluster? // // 1 __ 2 1 / __ \2 // ----- \ x - -------- |\ x | // R - K /__ i 2 \/__ i / // (R - K) // func variance(c cluster, measurer VectorMeasurer) float64 { if matrix.Equals(c.Points, c.Centroid) == true { return 0.0 } sum := float64(0) denom := float64(c.Numpoints() - c.Numcentroids()) for i := 0; i < c.Numpoints(); i++ { p := c.Points.GetRowVector(i) mu_i := c.Centroid.GetRowVector(0) dist := measurer.CalcDist(mu_i, p) sum += dist * dist } v := (1.0 / denom) * sum return v }