func TestSimpleLeastDistanceVsKmeans(t *testing.T) { //Create fake data var numClusters = 5 var numRows = 400 var dimensionality = 1000 data := make([][]float64, numRows, numRows) for i := 0; i < numRows; i++ { row := make([]float64, dimensionality, dimensionality) for j := 0; j < dimensionality; j++ { row[j] = rand.Float64() } data[i] = row } start := time.Now() //Test RPHash with Fake Object RPHashObject := reader.NewSimpleArray(data, numClusters) simpleObject := simple.NewSimple(RPHashObject) simpleObject.Run() if len(RPHashObject.GetCentroids()) != numClusters { t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(RPHashObject.GetCentroids())) } rpHashResult := RPHashObject.GetCentroids() fmt.Println("RPHash: ", time.Since(start)) //Find clusters using KMeans start = time.Now() clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() kMeansResult := clusterer.GetCentroids() fmt.Println("kMeans: ", time.Since(start)) var kMeansAssignment = 0 var rpHashAssignment = 0 var matchingAssignmentCount = 0 var kMeansTotalDist = float64(0) var rpHashTotalDist = float64(0) for _, vector := range data { rpHashAssignment, _ = utils.FindNearestDistance(vector, rpHashResult) kMeansAssignment, _ = utils.FindNearestDistance(vector, kMeansResult) kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment]) rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment]) //t.Log(rpHashAssignments[i], kMeansAssignments[i]); if rpHashAssignment == kMeansAssignment { matchingAssignmentCount += 1 } } t.Log("RPHash:", rpHashTotalDist) t.Log("KMeans:", kMeansTotalDist) t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist) }
func TestStreamingKMeansOnNumImagesData(t *testing.T) { numClusters := 10 lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt") if err != nil { panic(err) } dimensionality := len(lines[0]) data := utils.StringArrayToFloatArray(lines) start := time.Now() kmeansStream := clusterer.NewKMeansStream(numClusters, 10, dimensionality) for _, vector := range data { kmeansStream.AddDataPoint(vector) } result := kmeansStream.GetCentroids() time := time.Since(start) totalSqDist := float64(0) for _, vector := range data { _, dist := utils.FindNearestDistance(vector, result) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/float64(len(data))) t.Log("Runtime(seconds): ", time.Seconds()) if len(result) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }
func TestRPHashSimpleOnNumImagesData(t *testing.T) { numClusters := 10 lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt") if err != nil { panic(err) } data := utils.StringArrayToFloatArray(lines) start := time.Now() RPHashObject := reader.NewSimpleArray(data, numClusters) simpleObject := simple.NewSimple(RPHashObject) simpleObject.Run() result := RPHashObject.GetCentroids() time := time.Since(start) totalSqDist := float64(0) for _, vector := range data { _, dist := utils.FindNearestDistance(vector, result) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/float64(len(data))) t.Log("Runtime(seconds): ", time.Seconds()) if len(result) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }
func (this *KMeans) AssignClusters(data [][]float64) int { swaps := 0 newClusters := [][]int{} for j := 0; j < this.k; j++ { newClusterList := []int{} newClusters = append(newClusters, newClusterList) } for clusterid := 0; clusterid < this.k; clusterid++ { for _, member := range this.clusters[clusterid] { nearest, _ := utils.FindNearestDistance(data[member], this.means) newClusters[nearest] = append(newClusters[nearest], member) if nearest != clusterid { swaps++ } } } this.clusters = newClusters return swaps }
func TestStreamingRPHashOnRandomData(t *testing.T) { filereader := utils.NewDataFileReader(filePath) start := time.Now() rphashObject := reader.NewStreamObject(dimensionality, numClusters) rphashStream := stream.NewStream(rphashObject) elapsedtime := time.Since(start) for { vector := filereader.Next() if vector == nil { break } start := time.Now() rphashStream.AppendVector(vector) elapsedtime = elapsedtime + time.Since(start) } start = time.Now() result := rphashStream.GetCentroids() elapsedtime = elapsedtime + time.Since(start) totalSqDist := float64(0) filereader = utils.NewDataFileReader(filePath) for { vector := filereader.Next() if vector == nil { break } _, dist := utils.FindNearestDistance(vector, result) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/numDataPoints) t.Log("Runtime(seconds): ", elapsedtime.Seconds()) if len(result) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }