func NewStream(rphashObject types.RPHashObject) *Stream { randomSeedGenerator := rand.New(rand.NewSource(rphashObject.GetRandomSeed())) hash := defaults.NewHash(rphashObject.GetHashModulus()) decoder := rphashObject.GetDecoderType() varianceTracker := defaults.NewStatTest(0.01) projections := rphashObject.GetNumberOfProjections() k := rphashObject.GetK() * projections CentroidCounter := defaults.NewCentroidCounter(k) lshGroup := make([]types.LSH, projections) lshChannel := make(chan *itemset.Centroid, 10000) var projector types.Projector for i := 0; i < projections; i++ { projector = defaults.NewProjector(rphashObject.GetDimensions(), decoder.GetDimensionality(), randomSeedGenerator.Int63()) lshGroup[i] = defaults.NewLSH(hash, decoder, projector) } return &Stream{ counts: nil, centroids: nil, variance: 0, processedCount: 0, vectorCount: 0, CentroidCounter: CentroidCounter, randomSeedGenerator: randomSeedGenerator, rphashObject: rphashObject, lshGroup: lshGroup, hash: hash, decoder: decoder, projector: projector, varianceTracker: varianceTracker, lshChannel: lshChannel, } }
// Map is doing the count. func (this *Simple) Map() *Simple { runtime.GOMAXPROCS(runtime.NumCPU()) vecs := this.rphashObject.GetVectorIterator() //var hashResult int64; targetDimension := int(math.Floor(float64(this.rphashObject.GetDimensions() / 2))) numberOfRotations := 6 numberOfSearches := 1 vec := vecs.Next() hash := defaults.NewHash(this.rphashObject.GetHashModulus()) decoder := defaults.NewDecoder(targetDimension, numberOfRotations, numberOfSearches) projector := defaults.NewProjector(this.rphashObject.GetDimensions(), decoder.GetDimensionality(), this.rphashObject.GetRandomSeed()) LSH := defaults.NewLSH(hash, decoder, projector) // k := int(float64(this.rphashObject.GetK()) * math.Log(float64(this.rphashObject.GetK()))); CountMinSketch := defaults.NewCountMinSketch(this.rphashObject.GetK()) var vecCount = 0 //1000 is an arbitrary comprise between speed and size should be tweeked later. hashChannel := make(chan int64, this.rphashObject.NumDataPoints()) hashValues := make([]int64, this.rphashObject.NumDataPoints(), this.rphashObject.NumDataPoints()) for vecs.HasNext() { go func(vec []float64, index int) { // Project the Vector to lower dimension. // Decode the new vector for meaningful integers // Hash the new vector into a 64 bit int. value := LSH.LSHHashSimple(vec) hashValues[index] = value hashChannel <- value //hashResult = LSH.LSHHashSimple(vec); // Add it to the count min sketch to update frequencies. }(vec, vecCount) vecCount++ vec = vecs.Next() } vecs.StoreLSHValues(hashValues) //TODO should we Paralelize this? slowest loop but also have to wait for LSH Loops for i := 0; i < vecCount; i++ { hashResult := <-hashChannel CountMinSketch.Add(hashResult) } this.rphashObject.SetPreviousTopID(CountMinSketch.GetTop()) vecs.Reset() return this }