func TestKMeansOnNumImagesData(t *testing.T) { numClusters := 10 lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt") if err != nil { panic(err) } data := utils.StringArrayToFloatArray(lines) start := time.Now() clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() result := clusterer.GetCentroids() time := time.Since(start) totalSqDist := float64(0) for _, vector := range data { _, dist := utils.FindNearestDistance(vector, result) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/float64(len(data))) t.Log("Runtime(seconds): ", time.Seconds()) if len(result) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }
func TestSimpleLeastDistanceVsKmeans(t *testing.T) { //Create fake data var numClusters = 5 var numRows = 400 var dimensionality = 1000 data := make([][]float64, numRows, numRows) for i := 0; i < numRows; i++ { row := make([]float64, dimensionality, dimensionality) for j := 0; j < dimensionality; j++ { row[j] = rand.Float64() } data[i] = row } start := time.Now() //Test RPHash with Fake Object RPHashObject := reader.NewSimpleArray(data, numClusters) simpleObject := simple.NewSimple(RPHashObject) simpleObject.Run() if len(RPHashObject.GetCentroids()) != numClusters { t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(RPHashObject.GetCentroids())) } rpHashResult := RPHashObject.GetCentroids() fmt.Println("RPHash: ", time.Since(start)) //Find clusters using KMeans start = time.Now() clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() kMeansResult := clusterer.GetCentroids() fmt.Println("kMeans: ", time.Since(start)) var kMeansAssignment = 0 var rpHashAssignment = 0 var matchingAssignmentCount = 0 var kMeansTotalDist = float64(0) var rpHashTotalDist = float64(0) for _, vector := range data { rpHashAssignment, _ = utils.FindNearestDistance(vector, rpHashResult) kMeansAssignment, _ = utils.FindNearestDistance(vector, kMeansResult) kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment]) rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment]) //t.Log(rpHashAssignments[i], kMeansAssignments[i]); if rpHashAssignment == kMeansAssignment { matchingAssignmentCount += 1 } } t.Log("RPHash:", rpHashTotalDist) t.Log("KMeans:", kMeansTotalDist) t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist) }
func BenchmarkKMeans(b *testing.B) { var numClusters = 5 var numRows = 4000 var dimensionality = 1000 data := make([][]float64, numRows, numRows) for i := 0; i < numRows; i++ { row := make([]float64, dimensionality, dimensionality) for j := 0; j < dimensionality; j++ { row[j] = rand.Float64() } data[i] = row } for i := 0; i < b.N; i++ { clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() clusterer.GetCentroids() } }
func TestClustererUniformVectors(t *testing.T) { //initilize data var numClusters = 2 var numDataPoints = 8 var dimensionality = 4 data := make([][]float64, numDataPoints) for i := 0; i < numDataPoints; i++ { data[i] = make([]float64, dimensionality) for j := 0; j < dimensionality; j++ { data[i][j] = float64(i) } } //run test clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() var result = clusterer.GetCentroids() //Test Results if len(result) != numClusters { t.Errorf("Clusterer created %v clusters. When %v was input for k.", len(result), numClusters) } if len(result[0]) != dimensionality { t.Errorf("Cluster dimensionalioty of %v does not match the dimensionality of the input data, %v.", len(result[0]), dimensionality) } expectedResults := make([]float64, numClusters) expectedResults[0] = 1.5 // (0+1+2+3)/4 = 1.5 expectedResults[1] = 5.5 // (4+5+6+7)/4 = 5.5 for i := 0; i < numClusters; i++ { for j := 0; j < dimensionality; j++ { if result[i][j] != expectedResults[i] { t.Errorf("Data did not cluster as expected. Data: %v, Clusters: %v. Failure at %v, %v.", data, result, i, j) } } } }