func NewSimpleArray(inData [][]float64, k int) *SimpleArray { randomSeed := rand.Int63() data := utils.NewIterator(inData) numDataPoints := len(inData) dimension := 2 // As the number of rotations increases, the distance increases. // Increases the noise. numberOfRotations := 6 numberOfSearches := 1 numberOfProjections := 2 numberOfBlurs := 2 if data != nil { // Get the first vector in the data set's length. dimension = len(inData[0]) } hashModulus := int64(math.MaxInt64) // Set the target dimension much lower. targetDimension := int(math.Floor(float64(dimension / 2))) decoder := decoder.NewSpherical(targetDimension, numberOfRotations, numberOfSearches) centroids := [][]float64{} topIDs := []int64{} return &SimpleArray{ numDataPoints: numDataPoints, data: data, dimension: dimension, numberOfProjections: numberOfProjections, randomSeed: randomSeed, hashModulus: hashModulus, k: k, numberOfBlurs: numberOfBlurs, decoder: decoder, centroids: centroids, topIDs: topIDs, } }
func TestSpherical(t *testing.T) { var dimension, k, l, iterations int = 64, 6, 4, 10000 sphere := decoder.NewSpherical(dimension, k, l) var collisions int = 0 var distavg float64 = 0.0 for j := 0; j < iterations; j++ { p1, p2 := make([]float64, dimension), make([]float64, dimension) for k := 0; k < dimension; k++ { p1[k] = rand.Float64()*2 - 1 p2[k] = rand.Float64()*2 - 1 } /* Get the distance of each vector from eachother. */ distavg += utils.Distance(p1, p2) mh := hash.NewMurmur(1<<63 - 1) /* Decode from 24-dimensions -> 1-dimensional integer */ hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2)) /* Blurring the integers into a smaller space. */ hash1, hash2 := mh.Hash(hp1), mh.Hash(hp2) if hash1 == hash2 { collisions++ } } if collisions > (iterations / 100) { t.Errorf("More than 1 percent of the iterations resulted in collisions. %v collisions in %v iterations.", collisions, iterations) } t.Log("Average Distance: ", distavg/float64(iterations)) t.Log("Percent collisions : ", float64(collisions)/float64(iterations)) t.Log("√ Spherical Decoder test complete") }
func TestLSHStream(t *testing.T) { var seed int64 = 0 var d, k, l int = 64, 6, 4 data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} var inDimensions, outDimentions int = 10, 2 hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(d, k, l) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) lsh := lsh.NewLSH(hash, decoder, projector) lsh.LSHHashStream(data, 1) t.Log("√ LSH Stream test complete") }
func BenchmarkStreamLSH(b *testing.B) { var seed int64 = 0 var d, k, l int = 64, 6, 4 data := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} var inDimensions, outDimentions int = 10, 2 hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(d, k, l) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) for i := 0; i < b.N; i++ { lsh := lsh.NewLSH(hash, decoder, projector) b.StopTimer() lsh.LSHHashStream(data, 1) b.StartTimer() } }
func BenchmarkSpherical(b *testing.B) { b.StopTimer() randomSeed := rand.New(rand.NewSource(time.Now().UnixNano())) var d, k, l int = 64, 6, 4 sphere := decoder.NewSpherical(d, k, l) p1, p2 := make([]float64, d), make([]float64, d) for i := 0; i < b.N; i++ { for j := 0; j < d; j++ { p1[j], p2[j] = randomSeed.NormFloat64(), randomSeed.NormFloat64() } b.StartTimer() hp1, hp2 := sphere.Hash(utils.Normalize(p1)), sphere.Hash(utils.Normalize(p2)) b.StopTimer() if hp1 == nil || hp2 == nil { b.Error("Spherical hashes are null") } } }
func NewStreamObject(dimension, k int) *StreamObject { var centroids [][]float64 var topIDs []int64 numberOfRotations := 2 numberOfSearches := 2 targetDimension := 24 decoder := decoder.NewSpherical(targetDimension, numberOfRotations, numberOfSearches) data := utils.NewIterator([][]float64{}) return &StreamObject{ decoder: decoder, data: data, dimension: dimension, randomSeed: int64(0), hashModulus: int64(int(0 >> 1)), numberOfProjections: 1, numberOfBlurs: 1, k: k, topIDs: topIDs, centroids: centroids, numDataPoints: 0, } }
// The datapoints are seeded in so that the first two data points are near eachother in euclidian geometery and the 3rd and 4th datapoint are // near eachother in euclidian geometery. So the result1Cluster1 and result2Cluster1 should be closer together than the other two points. // The same is true for the points in cluster two vs either point in cluster one. func TestLSHSimple(t *testing.T) { var seed int64 = 0 // We want to limit the dimension reduction because it causes a lot of noise. var inDimensions, outDimentions, numberOfClusters, numberOfSearches int = 10, 5, 3, 1 dataPoint1Cluster1 := []float64{1.0, 0.0, 2.0, 7.0, 4.0, 0.0, 8.0, 3.0, 2.0, 1.0} dataPoint2Cluster1 := []float64{2.0, 3.0, 2.0, 6.0, 5.5, 2.0, 8.0, 3.1, 2.0, 0.0} dataPoint1Cluster2 := []float64{100.0, -120.0, 6.0, 18.0, 209.0, 0.0, -2.0, 1036.0, 15.0, 123.0} dataPoint2Cluster2 := []float64{99.0, -119.0, 2.0, 18.0, 208.5, 0.0, -3.0, 1048.0, 13.0, 122.0} hash := hash.NewMurmur(1<<63 - 1) decoder := decoder.NewSpherical(inDimensions, numberOfClusters, numberOfSearches) projector := projector.NewDBFriendly(inDimensions, outDimentions, seed) lsh := lsh.NewLSH(hash, decoder, projector) result1Cluster1 := lsh.LSHHashSimple(dataPoint1Cluster1) result2Cluster1 := lsh.LSHHashSimple(dataPoint2Cluster1) result1Cluster2 := lsh.LSHHashSimple(dataPoint1Cluster2) result2Cluster2 := lsh.LSHHashSimple(dataPoint2Cluster2) // Assert that results are still localy sensetive based on the original euclidian geometry if math.Abs(float64(result1Cluster1-result2Cluster1)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { t.Errorf("\nThe first datapoint in cluster two is closer to the first data point in cluster one than the second data point in cluster one"+ "\ndatapoint cluster one datapoint one: %d, \ndatapoint cluster one datapoint two: %d, \ndatapoint cluster two datapoint one: %d", result1Cluster1, result2Cluster1, result1Cluster2) } if math.Abs(float64(result1Cluster1-result2Cluster1)) > 
math.Abs(float64(result1Cluster1-result2Cluster2)) { t.Errorf("\nThe second datapoint in cluster two is closer to the first data point in cluster one than the second data point in cluster one"+ "\nCluster one datapoint one: %d, \nCluster one datapoint two: %d, \nCluster two datapoint two: %d", result1Cluster1, result2Cluster1, result2Cluster2) } if math.Abs(float64(result1Cluster2-result2Cluster2)) > math.Abs(float64(result1Cluster1-result1Cluster2)) { t.Errorf("\nThe first datapoint in cluster one is closer to the first data point in cluster two than the second data point in cluster two"+ "\nCluster one datapoint one: %d, \nCluster two datapoint one: %d, \nCluster two datapoint two: %d", result1Cluster1, result1Cluster2, result2Cluster2) } t.Log("√ LSH Simple test complete") }
// NewDecoder creates a spherical decoder from the given dimension, number of
// rotations and number of searches, and returns it as a types.Decoder.
func NewDecoder(dimension, rotations, numberOfSearches int) types.Decoder {
	spherical := decoder.NewSpherical(dimension, rotations, numberOfSearches)
	return spherical
}