func TestCentroidsAndTraces(t *testing.T) { centroids, traces, f, reverse := CentroidsAndTraces(paramset, traceparams, 2) assert.Equal(t, 2, len(centroids)) assert.Equal(t, 4, len(traces)) var tr4 *Trace for _, clusterable := range traces { tr4 = clusterable.(*Trace) if tr4.ID == "tr4" { break } } assert.Equal(t, 0, tr4.Params[1], "Missing param should map to 0.") assert.Equal(t, 3, tr4.Params[0], "The value gpu should sort to last of the three params.") assert.Equal(t, traceparams["tr4"], reverse(tr4)) kmeansCentroid := f([]kmeans.Clusterable{tr4, tr4}) centroid := kmeansCentroid.(*Centroid) assert.Equal(t, 0.0, centroid.Distance(tr4)) var tr1 *Trace for _, clusterable := range traces { tr1 = clusterable.(*Trace) if tr1.ID == "tr1" { break } } assert.Equal(t, 2.0, centroid.Distance(tr1)) // Now run the k-means algorithm, which is deterministic for a fixed set of // starting centroids, so pick our centroids explicitly so we always get the // same answer. obs := make([]kmeans.Clusterable, len(traces)) for i, tr := range traces { obs[i] = tr } cent := []kmeans.Centroid{ f([]kmeans.Clusterable{tr1}), f([]kmeans.Clusterable{tr4}), } kmCentroids, kmClusters := kmeans.KMeans(obs, cent, 2, 10, f) assert.Equal(t, 2, len(kmCentroids)) assert.Equal(t, 2, len(kmClusters)) assert.Equal(t, 3, len(kmClusters[0])) assert.Equal(t, 1, len(kmClusters[1])) assert.Equal(t, "tr4", kmClusters[1][0].(*Trace).ID, "tr4 should be the singe member of the second cluster.") assert.InDelta(t, 2.7748, kmeans.TotalError(obs, kmCentroids), 0.01) // Run k-means again but with just one centroid and show the total error gets // larger. kmCentroids, kmClusters = kmeans.KMeans(obs, cent[:1], 2, 10, f) assert.Equal(t, 1, len(kmCentroids)) assert.Equal(t, 1, len(kmClusters)) assert.Equal(t, 4, len(kmClusters[0])) assert.InDelta(t, 4.42496, kmeans.TotalError(obs, kmCentroids), 0.01) }
// CalculateClusterSummaries runs k-means clustering over the trace shapes. func CalculateClusterSummaries(tile *tiling.Tile, k int, stddevThreshhold float64, filter Filter) (*ClusterSummaries, error) { lastCommitIndex := tile.LastCommitIndex() observations := make([]kmeans.Clusterable, 0, len(tile.Traces)) for key, trace := range tile.Traces { if filter(key, trace.(*types.PerfTrace)) { observations = append(observations, ctrace.NewFullTrace(string(key), trace.(*types.PerfTrace).Values[:lastCommitIndex+1], trace.Params(), stddevThreshhold)) } } if len(observations) == 0 { return nil, fmt.Errorf("Zero traces matched.") } // Create K starting centroids. centroids := chooseK(observations, k) // TODO(jcgregorio) Keep iterating until the total error stops changing. lastTotalError := 0.0 for i := 0; i < MAX_KMEANS_ITERATIONS; i++ { centroids = kmeans.Do(observations, centroids, ctrace.CalculateCentroid) totalError := kmeans.TotalError(observations, centroids) glog.Infof("Total Error: %f\n", totalError) if math.Abs(totalError-lastTotalError) < KMEAN_EPSILON { break } lastTotalError = totalError } clusterSummaries := GetClusterSummaries(observations, centroids, tile.Commits) clusterSummaries.K = k clusterSummaries.StdDevThreshhold = stddevThreshhold return clusterSummaries, nil }