func TestCentroidsAndTraces(t *testing.T) { centroids, traces, f, reverse := CentroidsAndTraces(paramset, traceparams, 2) assert.Equal(t, 2, len(centroids)) assert.Equal(t, 4, len(traces)) var tr4 *Trace for _, clusterable := range traces { tr4 = clusterable.(*Trace) if tr4.ID == "tr4" { break } } assert.Equal(t, 0, tr4.Params[1], "Missing param should map to 0.") assert.Equal(t, 3, tr4.Params[0], "The value gpu should sort to last of the three params.") assert.Equal(t, traceparams["tr4"], reverse(tr4)) kmeansCentroid := f([]kmeans.Clusterable{tr4, tr4}) centroid := kmeansCentroid.(*Centroid) assert.Equal(t, 0.0, centroid.Distance(tr4)) var tr1 *Trace for _, clusterable := range traces { tr1 = clusterable.(*Trace) if tr1.ID == "tr1" { break } } assert.Equal(t, 2.0, centroid.Distance(tr1)) // Now run the k-means algorithm, which is deterministic for a fixed set of // starting centroids, so pick our centroids explicitly so we always get the // same answer. obs := make([]kmeans.Clusterable, len(traces)) for i, tr := range traces { obs[i] = tr } cent := []kmeans.Centroid{ f([]kmeans.Clusterable{tr1}), f([]kmeans.Clusterable{tr4}), } kmCentroids, kmClusters := kmeans.KMeans(obs, cent, 2, 10, f) assert.Equal(t, 2, len(kmCentroids)) assert.Equal(t, 2, len(kmClusters)) assert.Equal(t, 3, len(kmClusters[0])) assert.Equal(t, 1, len(kmClusters[1])) assert.Equal(t, "tr4", kmClusters[1][0].(*Trace).ID, "tr4 should be the singe member of the second cluster.") assert.InDelta(t, 2.7748, kmeans.TotalError(obs, kmCentroids), 0.01) // Run k-means again but with just one centroid and show the total error gets // larger. kmCentroids, kmClusters = kmeans.KMeans(obs, cent[:1], 2, 10, f) assert.Equal(t, 1, len(kmCentroids)) assert.Equal(t, 1, len(kmClusters)) assert.Equal(t, 4, len(kmClusters[0])) assert.InDelta(t, 4.42496, kmeans.TotalError(obs, kmCentroids), 0.01) }
// ClusterAndDescribe takes all the params from a set of traces, as passed in // via traceparams, and does k-means clustering on the parameters and returns // the results of the clustering in a Description. // // The paramset is needed so we know the total number of values for each // parameter. The value of total is the total number of traces being analyzed, // of which traceparams contains a subset. func ClusterAndDescribe(paramSet map[string][]string, traceparams map[string]map[string]string, total int) Description { if len(traceparams) == 0 { return Description{ Centers: []*Center{}, Percent: 0, } } // A good guess for k is sqrt(n)/2. k := int(math.Sqrt(float64(len(traceparams))) / 2.0) // But never go below 5 clusters. if k < 5 { k = 5 } cent, obs, f, reverse := CentroidsAndTraces(paramSet, traceparams, k) _, clusters := kmeans.KMeans(obs, cent, k, 100, f) // Now that clustering is complete build of the slice of Center's for each // cluster found. centers := []*Center{} for _, cl := range clusters { size := len(cl) params := []map[string]string{} for _, tr := range cl { params = append(params, reverse(tr.(*Trace))) } if len(cl) > 20 { cl = cl[:20] } ids := make([]string, len(cl)) for i, tr := range cl { ids[i] = tr.(*Trace).ID } centers = append(centers, &Center{ IDs: ids, WordCloud: valueweight.FromParams(params), Size: size, }) } return Description{ Centers: centers, Percent: float64(len(traceparams)) / float64(total), } }