func TestLSHStream(t *testing.T) { var seed int64 = 0 var d, k, l int = 64, 6, 4 data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} var inDimensions, outDimentions int = 10, 2 hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(d, k, l) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) lsh := lsh.NewLSH(hash, decoder, projector) lsh.LSHHashStream(data, 1) t.Log("√ LSH Stream test complete") }
func BenchmarkDBFriendlyProjection(b *testing.B) { var inDimensions, outDimentions int = 10, 2 for i := 0; i < b.N; i++ { b.StopTimer() var randomGen = rand.New(rand.NewSource(int64(time.Now().Nanosecond()))) data := make([]float64, inDimensions) for i := 0; i < inDimensions; i++ { data[i] = randomGen.Float64() } b.StartTimer() var seed int64 = int64(time.Now().Nanosecond()) RP := projector.NewDBFriendly(inDimensions, outDimentions, seed) RP.Project(data) } }
func BenchmarkStream(b *testing.B) { var seed int64 = 0 var d, k, l int = 64, 6, 4 data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} var inDimensions, outDimentions int = 10, 2 hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(d, k, l) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) for i := 0; i < b.N; i++ { lsh := lsh.NewLSH(hash, decoder, projector) b.StopTimer() lsh.LSHHashStream(data, 1) b.StartTimer() } }
func TestDBFriendly(t *testing.T) { //there is probably a better way to test this than hard coding. data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} expectedResult := []float64{1.224744871391589, 13.472193585307478} var inDimensions, outDimentions int = 10, 2 //Use a uniform seed for testing var seed int64 = 0 RP := projector.NewDBFriendly(inDimensions, outDimentions, seed) result := RP.Project(data) if len(result) != len(expectedResult) { t.Error("The result and expected result are not the same length.") } for i := 0; i < len(result); i++ { if result[i] != expectedResult[i] { t.Error(fmt.Sprintf("The result at index %d: %f did not match the expected result: %f", i, result[i], expectedResult[i])) } } }
func (this *KMeans) Run() { //This is a condition to avoid infinite Run.. maxiters := 10000 swaps := 3 fulldata := this.data data := make([][]float64, 0) var p types.Projector = nil if this.projdim != 0 { p = projector.NewDBFriendly(len(fulldata[0]), this.projdim, rand.Int63()) } for _, v := range fulldata { if p != nil { data = append(data, p.Project(v)) } else { data = append(data, v) } } this.n = len(data) this.means = make([][]float64, this.k) for i := 0; i < this.k; i++ { this.means[i] = data[i*(this.n/this.k)] } this.clusters = make([][]int, this.k) //initilize cluster lists to be evenly diveded sequentailly for i := 0; i < this.k; i++ { cluster := make([]int, this.n/this.k) clusterStart := i * (this.n / this.k) for j := 0; j < this.n/this.k; j++ { cluster[j] = j + clusterStart } this.clusters[i] = cluster } for swaps > 2 && maxiters > 0 { maxiters-- this.UpdateMeans(data) swaps = this.AssignClusters(data) } if maxiters == 0 { fmt.Println("Warning: Max Iterations Reached") } data = fulldata this.UpdateMeans(data) }
// The datapoints are seeded in so that the first two data points are near eachother in euclidian geometery and the 3rd and 4th datapoint are // near eachother in euclidian geometery. So the result1Cluster1 and result2Cluster1 should be closer together than the other two points. //The same is true for the points in cluster two vs either point in cluster one. func TestLSHSimple(t *testing.T) { var seed int64 = 0 var d, k, l int = 10, 6, 4 dataPoint1Cluster1 := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} dataPoint2Cluster1 := []float64{2.0, 3.0, 2.0, 6.0, 5.5, 2.0, 8.0, 3.1, 2.0, 0.0} dataPoint1Cluster2 := []float64{100.0, -120.0, 6.0, 18.0, 209.0, 0.0, -2.0, 1036.0, 15.0, 123.0} dataPoint2Cluster2 := []float64{99.0, -119.0, 2.0, 18.0, 208.5, 0.0, -3.0, 1048.0, 13.0, 122.0} var inDimensions, outDimentions int = 10, 2 hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(d, k, l) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) lsh := lsh.NewLSH(hash, decoder, projector) result1Cluster1 := lsh.LSHHashSimple(dataPoint1Cluster1) result2Cluster1 := lsh.LSHHashSimple(dataPoint2Cluster1) result1Cluster2 := lsh.LSHHashSimple(dataPoint1Cluster2) result2Cluster2 := lsh.LSHHashSimple(dataPoint2Cluster2) // Assert that results are still localy sensetive based on the original euclidian geometry if math.Abs(float64(result1Cluster1-result2Cluster1)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { t.Errorf("\nThe first datapoint in cluster two is closer to the first data point in cluster one than the second data point in cluster one"+ "\ndatapoint cluster one datapoint one: %d, \ndatapoint cluster one datapoint two: %d, \ndatapoint cluster two datapoint one: %d", result1Cluster1, result2Cluster1, result1Cluster2) } if math.Abs(float64(result1Cluster1-result2Cluster1)) > math.Abs(float64(result1Cluster1-result2Cluster2)) { t.Errorf("\nThe second datapoint in cluster two is closer to the first data point in cluster one than 
the second data point in cluster one"+ "\nCluster one datapoint one: %d, \nCluster one datapoint two: %d, \nCluster two datapoint two: %d", result1Cluster1, result2Cluster1, result2Cluster2) } if math.Abs(float64(result1Cluster2-result2Cluster2)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { t.Errorf("\nThe first datapoint in cluster one is closer to the first data point in cluster two than the second data point in cluster two"+ "\nCluster one datapoint one: %d, \nCluster two datapoint one: %d, \nCluster two datapoint two: %d", result1Cluster1, result1Cluster2, result2Cluster2) } t.Log("√ LSH Simple test complete") }
// NewProjector builds the projector handed out by this package. It forwards
// n, t and randomseed unchanged to projector.NewDBFriendly and returns the
// result as a types.Projector.
func NewProjector(n, t int, randomseed int64) types.Projector {
	dbFriendly := projector.NewDBFriendly(n, t, randomseed)
	return dbFriendly
}