func TestSimpleLeastDistanceVsKmeans(t *testing.T) { //Create fake data var numClusters = 5 var numRows = 400 var dimensionality = 1000 data := make([][]float64, numRows, numRows) for i := 0; i < numRows; i++ { row := make([]float64, dimensionality, dimensionality) for j := 0; j < dimensionality; j++ { row[j] = rand.Float64() } data[i] = row } start := time.Now() //Test RPHash with Fake Object RPHashObject := reader.NewSimpleArray(data, numClusters) simpleObject := simple.NewSimple(RPHashObject) simpleObject.Run() if len(RPHashObject.GetCentroids()) != numClusters { t.Errorf("Requested %v centriods. But RPHashSimple returned %v.", numClusters, len(RPHashObject.GetCentroids())) } rpHashResult := RPHashObject.GetCentroids() fmt.Println("RPHash: ", time.Since(start)) //Find clusters using KMeans start = time.Now() clusterer := clusterer.NewKMeansSimple(numClusters, data) clusterer.Run() kMeansResult := clusterer.GetCentroids() fmt.Println("kMeans: ", time.Since(start)) var kMeansAssignment = 0 var rpHashAssignment = 0 var matchingAssignmentCount = 0 var kMeansTotalDist = float64(0) var rpHashTotalDist = float64(0) for _, vector := range data { rpHashAssignment, _ = utils.FindNearestDistance(vector, rpHashResult) kMeansAssignment, _ = utils.FindNearestDistance(vector, kMeansResult) kMeansTotalDist += utils.Distance(vector, kMeansResult[kMeansAssignment]) rpHashTotalDist += utils.Distance(vector, rpHashResult[rpHashAssignment]) //t.Log(rpHashAssignments[i], kMeansAssignments[i]); if rpHashAssignment == kMeansAssignment { matchingAssignmentCount += 1 } } t.Log("RPHash:", rpHashTotalDist) t.Log("KMeans:", kMeansTotalDist) t.Log("Ratio: ", kMeansTotalDist/rpHashTotalDist) }
func TestSpherical(t *testing.T) { var dimension, k, l, iterations int = 64, 6, 4, 10000 sphere := decoder.NewSpherical(dimension, k, l) var collisions int = 0 var distavg float64 = 0.0 for j := 0; j < iterations; j++ { p1, p2 := make([]float64, dimension), make([]float64, dimension) for k := 0; k < dimension; k++ { p1[k] = rand.Float64()*2 - 1 p2[k] = rand.Float64()*2 - 1 } /* Get the distance of each vector from eachother. */ distavg += utils.Distance(p1, p2) mh := hash.NewMurmur(1<<63 - 1) /* Decode from 24-dimensions -> 1-dimensional integer */ hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2)) /* Blurring the integers into a smaller space. */ hash1, hash2 := mh.Hash(hp1), mh.Hash(hp2) if hash1 == hash2 { collisions++ } } if collisions > (iterations / 100) { t.Errorf("More than 1 percent of the iterations resulted in collisions. %v collisions in %v iterations.", collisions, iterations) } t.Log("Average Distance: ", distavg/float64(iterations)) t.Log("Percent collisions : ", float64(collisions)/float64(iterations)) t.Log("√ Spherical Decoder test complete") }
//Add a new data point to the stream func (this *KMeansStream) addDataPointWeighted(data []float64, weight int64) { if len(data) != this.dimensionality { return // panic("The input data does not have the correct dimenstionality") } minIndex := 0 minDist := 0.0 for i, centriod := range this.candidateClusters { currDist := utils.Distance(data, centriod.Centroid()) if i == 0 || minDist > currDist { minDist = currDist minIndex = i } } minDistSquared := minDist * minDist if len(this.candidateClusters) < this.k || rand.Float64() < float64(weight)*(minDistSquared/this.frequency) { this.candidateClusters = append(this.candidateClusters, *itemset.NewCentroidWeighted(data, weight)) } else { this.candidateClusters[minIndex].UpdateVector(data) } if len(this.candidateClusters) > this.maxCandidateClusters { this.reduceCandidateClusters() } }