// TestCentroidsAndTraces checks the centroids, traces, centroid-calculating
// function and reverse lookup returned by CentroidsAndTraces, and then runs
// kmeans.KMeans over those traces from explicitly chosen starting centroids.
func TestCentroidsAndTraces(t *testing.T) {
	centroids, traces, f, reverse := CentroidsAndTraces(paramset, traceparams, 2)
	assert.Equal(t, 2, len(centroids))
	assert.Equal(t, 4, len(traces))

	// findTrace returns the trace with the given ID. It stops the test when no
	// such trace exists, so later assertions never run against the wrong trace
	// or dereference a nil pointer.
	findTrace := func(id string) *Trace {
		for _, clusterable := range traces {
			if tr, ok := clusterable.(*Trace); ok && tr.ID == id {
				return tr
			}
		}
		t.Fatalf("trace %q not found in the traces returned by CentroidsAndTraces", id)
		return nil
	}

	tr4 := findTrace("tr4")
	assert.Equal(t, 0, tr4.Params[1], "Missing param should map to 0.")
	assert.Equal(t, 3, tr4.Params[0], "The value gpu should sort to last of the three params.")
	assert.Equal(t, traceparams["tr4"], reverse(tr4))

	// A centroid built only from copies of tr4 must sit at distance 0 from tr4.
	kmeansCentroid := f([]kmeans.Clusterable{tr4, tr4})
	centroid := kmeansCentroid.(*Centroid)
	assert.Equal(t, 0.0, centroid.Distance(tr4))

	tr1 := findTrace("tr1")
	assert.Equal(t, 2.0, centroid.Distance(tr1))

	// Now run the k-means algorithm, which is deterministic for a fixed set of
	// starting centroids, so pick our centroids explicitly so we always get the
	// same answer.
	obs := make([]kmeans.Clusterable, len(traces))
	for i, tr := range traces {
		obs[i] = tr
	}
	cent := []kmeans.Centroid{
		f([]kmeans.Clusterable{tr1}),
		f([]kmeans.Clusterable{tr4}),
	}
	kmCentroids, kmClusters := kmeans.KMeans(obs, cent, 2, 10, f)
	assert.Equal(t, 2, len(kmCentroids))
	// The clusters are indexed below, so stop here instead of panicking when
	// the shape of the result is not what we expect.
	if !assert.Equal(t, 2, len(kmClusters)) {
		t.FailNow()
	}
	assert.Equal(t, 3, len(kmClusters[0]))
	if !assert.Equal(t, 1, len(kmClusters[1])) {
		t.FailNow()
	}
	assert.Equal(t, "tr4", kmClusters[1][0].(*Trace).ID, "tr4 should be the singe member of the second cluster.")
	assert.InDelta(t, 2.7748, kmeans.TotalError(obs, kmCentroids), 0.01)

	// Run k-means again but with just one centroid and show the total error gets
	// larger.
	kmCentroids, kmClusters = kmeans.KMeans(obs, cent[:1], 2, 10, f)
	assert.Equal(t, 1, len(kmCentroids))
	if !assert.Equal(t, 1, len(kmClusters)) {
		t.FailNow()
	}
	assert.Equal(t, 4, len(kmClusters[0]))
	assert.InDelta(t, 4.42496, kmeans.TotalError(obs, kmCentroids), 0.01)
}
Example #2
0
// ClusterAndDescribe takes all the params from a set of traces, as passed in
// via traceparams, does k-means clustering on the parameters and returns the
// results of the clustering in a Description.
//
// The paramSet is needed so we know the total number of values for each
// parameter. The value of total is the total number of traces being analyzed,
// of which traceparams contains a subset.
//
// An empty traceparams yields a Description with no Centers and a Percent of
// 0. Otherwise Percent is len(traceparams)/total, reported as 0 when total is
// not positive because the ratio is then not a meaningful fraction.
//
// Each Center records the full size of its cluster and a WordCloud built from
// the params of every trace in the cluster, but lists at most 20 trace IDs.
func ClusterAndDescribe(paramSet map[string][]string, traceparams map[string]map[string]string, total int) Description {
	// maxIDs caps how many trace IDs are listed per Center.
	const maxIDs = 20

	if len(traceparams) == 0 {
		return Description{
			Centers: []*Center{},
			Percent: 0,
		}
	}

	// A good guess for k is sqrt(n)/2.
	k := int(math.Sqrt(float64(len(traceparams))) / 2.0)
	// But never go below 5 clusters.
	if k < 5 {
		k = 5
	}
	cent, obs, f, reverse := CentroidsAndTraces(paramSet, traceparams, k)
	_, clusters := kmeans.KMeans(obs, cent, k, 100, f)

	// Now that clustering is complete build up the slice of Centers, one for
	// each cluster found.
	centers := make([]*Center, 0, len(clusters))
	for _, cl := range clusters {
		// The word cloud is computed over every member of the cluster.
		params := make([]map[string]string, 0, len(cl))
		for _, tr := range cl {
			params = append(params, reverse(tr.(*Trace)))
		}

		// Only the first maxIDs members are listed by ID; Size still reports
		// the full cluster size.
		listed := cl
		if len(listed) > maxIDs {
			listed = listed[:maxIDs]
		}
		ids := make([]string, len(listed))
		for i, tr := range listed {
			ids[i] = tr.(*Trace).ID
		}

		centers = append(centers, &Center{
			IDs:       ids,
			WordCloud: valueweight.FromParams(params),
			Size:      len(cl),
		})
	}

	// Guard the division: with a zero total the floating-point quotient would
	// not be a finite number.
	// NOTE(review): presumably Description gets serialized (e.g. to JSON), where
	// a non-finite Percent would be rejected - confirm against callers.
	percent := 0.0
	if total > 0 {
		percent = float64(len(traceparams)) / float64(total)
	}

	return Description{
		Centers: centers,
		Percent: percent,
	}
}