Example #1
func TestSimpleLeastDistanceVsKmeans(t *testing.T) {

	//Create fake data
	var numClusters = 5
	var numRows = 400
	var dimensionality = 1000
	data := make([][]float64, numRows)
	for i := 0; i < numRows; i++ {
		row := make([]float64, dimensionality)
		for j := 0; j < dimensionality; j++ {
			row[j] = rand.Float64()
		}
		data[i] = row
	}

	start := time.Now()
	//Test RPHash with Fake Object
	RPHashObject := reader.NewSimpleArray(data, numClusters)
	simpleObject := simple.NewSimple(RPHashObject)
	simpleObject.Run()

	if len(RPHashObject.GetCentroids()) != numClusters {
		t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(RPHashObject.GetCentroids()))
	}
	rpHashResult := RPHashObject.GetCentroids()
	fmt.Println("RPHash: ", time.Since(start))
	//Find clusters using KMeans
	start = time.Now()
	kMeansClusterer := clusterer.NewKMeansSimple(numClusters, data)
	kMeansClusterer.Run()

	kMeansResult := kMeansClusterer.GetCentroids()
	fmt.Println("kMeans: ", time.Since(start))

	var kMeansAssignment = 0
	var rpHashAssignment = 0
	var matchingAssignmentCount = 0
	var kMeansTotalDist = float64(0)
	var rpHashTotalDist = float64(0)
	for _, vector := range data {
		rpHashAssignment, _ = utils.FindNearestDistance(vector, rpHashResult)
		kMeansAssignment, _ = utils.FindNearestDistance(vector, kMeansResult)
		kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment])
		rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment])
		//t.Log(rpHashAssignment, kMeansAssignment)
		if rpHashAssignment == kMeansAssignment {
			matchingAssignmentCount++
		}
	}
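	// Report how often RPHash and k-means agree on the nearest-centroid assignment.
	t.Log("Matching assignments:", matchingAssignmentCount, "of", numRows)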
	t.Log("RPHash:", rpHashTotalDist)
	t.Log("KMeans:", kMeansTotalDist)
	t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist)
}
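The comparison above leans on two library helpers, utils.FindNearestDistance and utils.Distance, to score each clustering by its total point-to-nearest-centroid distance; a lower total means tighter clusters, which is why the test reports the kMeans/RPHash ratio. A minimal, self-contained sketch of that scoring step, assuming Euclidean distance (euclidean and nearest below are illustrative stand-ins, not the library's API):

package main

import (
	"fmt"
	"math"
)

// euclidean is an illustrative stand-in for utils.Distance: the Euclidean
// distance between two equal-length vectors.
func euclidean(a, b []float64) float64 {
	sum := 0.0
	for i := range a {
		d := a[i] - b[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// nearest plays the role of utils.FindNearestDistance: it returns the index
// of the closest centroid and the distance to it.
func nearest(v []float64, centroids [][]float64) (int, float64) {
	bestIndex, bestDist := 0, math.Inf(1)
	for i, c := range centroids {
		if d := euclidean(v, c); d < bestDist {
			bestIndex, bestDist = i, d
		}
	}
	return bestIndex, bestDist
}

func main() {
	centroids := [][]float64{{0, 0}, {10, 10}}
	data := [][]float64{{1, 1}, {9, 8}, {0.5, 0}}
	total := 0.0
	for _, v := range data {
		idx, d := nearest(v, centroids)
		total += d
		fmt.Printf("point %v -> centroid %d (distance %.2f)\n", v, idx, d)
	}
	fmt.Printf("total within-cluster distance: %.2f\n", total)
}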
Example #2
func TestSpherical(t *testing.T) {
	var dimension, k, l, iterations int = 64, 6, 4, 10000
	sphere := decoder.NewSpherical(dimension, k, l)
	var collisions int = 0
	var distavg float64 = 0.0
	for j := 0; j < iterations; j++ {
		p1, p2 := make([]float64, dimension), make([]float64, dimension)
		for k := 0; k < dimension; k++ {
			p1[k] = rand.Float64()*2 - 1
			p2[k] = rand.Float64()*2 - 1
		}
		/* Accumulate the distance between the two vectors. */
		distavg += utils.Distance(p1, p2)
		mh := hash.NewMurmur(1<<63 - 1)
		/* Decode from 64 dimensions -> a 1-dimensional integer. */
		hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2))
		/* Hash the decoded integers down into a smaller space. */
		hash1, hash2 := mh.Hash(hp1), mh.Hash(hp2)
		if hash1 == hash2 {
			collisions++
		}
	}
	if collisions > (iterations / 100) {
		t.Errorf("More than 1 percent of the iterations resulted in collisions. %v collisions in %v iterations.",
			collisions, iterations)
	}
	t.Log("Average Distance: ", distavg/float64(iterations))
	t.Log("Percent collisions : ", float64(collisions)/float64(iterations))
	t.Log("√ Spherical Decoder test complete")
}
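The spherical decoder operates on points of the unit sphere, which is why each random vector is passed through utils.Normalize before hashing. A minimal sketch of that normalization step, assuming plain Euclidean (L2) normalization (normalize below is an illustrative stand-in, not the library's implementation):

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// normalize scales v to unit Euclidean length, projecting it onto the unit
// sphere; the zero vector is returned unchanged to avoid dividing by zero.
func normalize(v []float64) []float64 {
	norm := 0.0
	for _, x := range v {
		norm += x * x
	}
	norm = math.Sqrt(norm)
	out := make([]float64, len(v))
	if norm == 0 {
		return out
	}
	for i, x := range v {
		out[i] = x / norm
	}
	return out
}

func main() {
	v := make([]float64, 64)
	for i := range v {
		v[i] = rand.Float64()*2 - 1
	}
	u := normalize(v)
	length := 0.0
	for _, x := range u {
		length += x * x
	}
	fmt.Printf("length of normalized vector: %.6f\n", math.Sqrt(length))
}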
Example #3
// Add a new, weighted data point to the stream.
func (this *KMeansStream) addDataPointWeighted(data []float64, weight int64) {
	if len(data) != this.dimensionality {
		return
		// panic("The input data does not have the correct dimenstionality")
	}
	minIndex := 0
	minDist := 0.0
	for i, centroid := range this.candidateClusters {
		currDist := utils.Distance(data, centroid.Centroid())
		if i == 0 || minDist > currDist {
			minDist = currDist
			minIndex = i
		}
	}
	minDistSquared := minDist * minDist
	if len(this.candidateClusters) < this.k || rand.Float64() < float64(weight)*(minDistSquared/this.frequency) {
		this.candidateClusters = append(this.candidateClusters, *itemset.NewCentroidWeighted(data, weight))
	} else {
		this.candidateClusters[minIndex].UpdateVector(data)
	}
	if len(this.candidateClusters) > this.maxCandidateClusters {
		this.reduceCandidateClusters()
	}
}
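The branch above resembles the facility-location style rule used by streaming k-means algorithms: until k candidates exist every point opens a new candidate cluster, and after that a point opens one with probability proportional to its weight times the squared distance to the nearest existing candidate (scaled here by this.frequency); otherwise it is merged into that nearest candidate via UpdateVector. A self-contained sketch of just that acceptance rule (shouldOpenNewCluster and facilityCost are illustrative names, not the library's API):

package main

import (
	"fmt"
	"math/rand"
)

// shouldOpenNewCluster mirrors the probabilistic test in addDataPointWeighted:
// heavier points and points far from every existing candidate are more likely
// to become a new candidate centre.
func shouldOpenNewCluster(weight int64, distToNearest, facilityCost float64) bool {
	p := float64(weight) * (distToNearest * distToNearest) / facilityCost
	return rand.Float64() < p
}

func main() {
	facilityCost := 100.0
	for _, dist := range []float64{0.5, 2.0, 8.0} {
		opened := 0
		for i := 0; i < 10000; i++ {
			if shouldOpenNewCluster(1, dist, facilityCost) {
				opened++
			}
		}
		fmt.Printf("distance %.1f: opened a new cluster %d of 10000 times\n", dist, opened)
	}
}

Far-away and heavily weighted points are therefore much more likely to seed new candidates, and once the candidate set grows past maxCandidateClusters it is pruned by reduceCandidateClusters.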