Пример #1
0
// TestSimpleLeastDistanceVsKmeans clusters the same randomly generated data
// with RPHash Simple and with the simple k-means clusterer. It prints the
// wall-clock time of each run and logs, for each method, the summed distance
// from every vector to its nearest centroid, plus the ratio of the two sums.
// The test fails only if RPHash Simple returns a number of centroids other
// than the requested one.
func TestSimpleLeastDistanceVsKmeans(t *testing.T) {
	const (
		numClusters    = 5
		numRows        = 400
		dimensionality = 1000
	)

	// Create fake data: numRows vectors filled with rand.Float64 values.
	data := make([][]float64, numRows)
	for i := range data {
		row := make([]float64, dimensionality)
		for j := range row {
			row[j] = rand.Float64()
		}
		data[i] = row
	}

	// Cluster with RPHash Simple; the centroids are read back from the
	// reader object once and reused for both the check and the comparison.
	start := time.Now()
	rpHashObject := reader.NewSimpleArray(data, numClusters)
	simpleObject := simple.NewSimple(rpHashObject)
	simpleObject.Run()
	rpHashResult := rpHashObject.GetCentroids()
	if len(rpHashResult) != numClusters {
		t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(rpHashResult))
	}
	fmt.Println("RPHash: ", time.Since(start))

	// Cluster with k-means. The variable is deliberately not named
	// "clusterer" so that it does not shadow the clusterer package.
	start = time.Now()
	kMeans := clusterer.NewKMeansSimple(numClusters, data)
	kMeans.Run()
	kMeansResult := kMeans.GetCentroids()
	fmt.Println("kMeans: ", time.Since(start))

	// Compare both centroid sets against every input vector.
	matchingAssignmentCount := 0
	kMeansTotalDist := float64(0)
	rpHashTotalDist := float64(0)
	for _, vector := range data {
		rpHashAssignment, _ := utils.FindNearestDistance(vector, rpHashResult)
		kMeansAssignment, _ := utils.FindNearestDistance(vector, kMeansResult)
		kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment])
		rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment])
		if rpHashAssignment == kMeansAssignment {
			matchingAssignmentCount++
		}
	}

	t.Log("RPHash:", rpHashTotalDist)
	t.Log("KMeans:", kMeansTotalDist)
	t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist)
	// NOTE(review): the two algorithms are not known to order their
	// centroids alike, so equal indices are only indicative — confirm before
	// relying on this number.
	t.Log("Matching assignments:", matchingAssignmentCount, "of", len(data))
}
Пример #2
0
// TestStreamingKMeansOnNumImagesData loads the vectors from
// MNISTnumImages5000.txt, feeds them one at a time into the streaming k-means
// clusterer, and logs the total and average squared distance from each vector
// to its nearest centroid together with the clustering runtime. It fails if
// the data cannot be read, is empty, or if the number of returned centroids
// differs from numClusters.
func TestStreamingKMeansOnNumImagesData(t *testing.T) {
	numClusters := 10
	lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt")
	if err != nil {
		// Fail through the test framework instead of panicking so the
		// failure is attributed to this test and reported cleanly.
		t.Fatalf("reading data file: %v", err)
	}
	if len(lines) == 0 {
		// Guard the lines[0] access below.
		t.Fatal("data file contains no rows")
	}
	dimensionality := len(lines[0])
	data := utils.StringArrayToFloatArray(lines)

	// Timed region: clusterer construction, every insertion, and the final
	// centroid retrieval.
	start := time.Now()
	kmeansStream := clusterer.NewKMeansStream(numClusters, 10, dimensionality)
	for _, vector := range data {
		kmeansStream.AddDataPoint(vector)
	}
	result := kmeansStream.GetCentroids()
	// Named "elapsed" rather than "time" to avoid shadowing the time package.
	elapsed := time.Since(start)

	totalSqDist := float64(0)
	for _, vector := range data {
		_, dist := utils.FindNearestDistance(vector, result)
		totalSqDist += dist * dist
	}

	t.Log("Total Square Distance: ", totalSqDist)
	t.Log("Average Square Distance: ", totalSqDist/float64(len(data)))
	t.Log("Runtime(seconds): ", elapsed.Seconds())

	if len(result) != numClusters {
		t.Errorf("KMeans Stream did not present the correct number of clusters: got %d, want %d", len(result), numClusters)
	}
}
Пример #3
0
// TestRPHashSimpleOnNumImagesData loads the vectors from
// MNISTnumImages5000.txt, clusters them with RPHash Simple, and logs the total
// and average squared distance from each vector to its nearest centroid
// together with the clustering runtime. It fails if the data cannot be read
// or if the number of returned centroids differs from numClusters.
func TestRPHashSimpleOnNumImagesData(t *testing.T) {
	numClusters := 10
	lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt")
	if err != nil {
		// Fail through the test framework instead of panicking so the
		// failure is attributed to this test and reported cleanly.
		t.Fatalf("reading data file: %v", err)
	}
	data := utils.StringArrayToFloatArray(lines)

	// Timed region: reader/algorithm construction, the run itself, and the
	// centroid retrieval.
	start := time.Now()
	RPHashObject := reader.NewSimpleArray(data, numClusters)
	simpleObject := simple.NewSimple(RPHashObject)
	simpleObject.Run()
	result := RPHashObject.GetCentroids()
	// Named "elapsed" rather than "time" to avoid shadowing the time package.
	elapsed := time.Since(start)

	totalSqDist := float64(0)
	for _, vector := range data {
		_, dist := utils.FindNearestDistance(vector, result)
		totalSqDist += dist * dist
	}

	t.Log("Total Square Distance: ", totalSqDist)
	t.Log("Average Square Distance: ", totalSqDist/float64(len(data)))
	t.Log("Runtime(seconds): ", elapsed.Seconds())

	if len(result) != numClusters {
		t.Errorf("RPHash Simple did not present the correct number of clusters: got %d, want %d", len(result), numClusters)
	}
}
Пример #4
0
// AssignClusters rebuilds the cluster membership lists by moving every member
// currently held in this.clusters to the cluster whose mean is nearest, as
// reported by utils.FindNearestDistance against this.means. Members are
// indices into data. The new membership replaces this.clusters, and the
// return value is the number of members that ended up in a different cluster
// than the one they started in.
func (this *KMeans) AssignClusters(data [][]float64) int {
	// Start from this.k empty (non-nil) membership lists.
	reassigned := [][]int{}
	for len(reassigned) < this.k {
		reassigned = append(reassigned, []int{})
	}

	moved := 0
	for current := 0; current < this.k; current++ {
		for _, idx := range this.clusters[current] {
			closest, _ := utils.FindNearestDistance(data[idx], this.means)
			if closest != current {
				moved++
			}
			reassigned[closest] = append(reassigned[closest], idx)
		}
	}

	this.clusters = reassigned
	return moved
}
Пример #5
0
// TestStreamingRPHashOnRandomData streams every vector from the file at
// filePath into the RPHash stream clusterer, then re-reads the file to sum the
// squared distance from each vector to its nearest centroid. It logs the
// total and average squared distance and the accumulated clustering time, and
// fails if the number of centroids differs from numClusters.
func TestStreamingRPHashOnRandomData(t *testing.T) {
	source := utils.NewDataFileReader(filePath)

	// Only clustering work is timed: construction, each append, and the
	// final centroid retrieval. Reading vectors from the file is excluded.
	began := time.Now()
	streamObject := reader.NewStreamObject(dimensionality, numClusters)
	rphash := stream.NewStream(streamObject)
	spent := time.Since(began)

	// The reader signals the end of the data by returning nil.
	for vector := source.Next(); vector != nil; vector = source.Next() {
		tick := time.Now()
		rphash.AppendVector(vector)
		spent += time.Since(tick)
	}

	began = time.Now()
	centroids := rphash.GetCentroids()
	spent += time.Since(began)

	// Second pass over the same file to score the centroids.
	source = utils.NewDataFileReader(filePath)
	sumSq := float64(0)
	for vector := source.Next(); vector != nil; vector = source.Next() {
		_, d := utils.FindNearestDistance(vector, centroids)
		sumSq += d * d
	}

	t.Log("Total Square Distance: ", sumSq)
	t.Log("Average Square Distance: ", sumSq/numDataPoints)
	t.Log("Runtime(seconds): ", spent.Seconds())

	if len(centroids) != numClusters {
		t.Errorf("RPHash Stream did not present the correct number of clusters.")
	}
}