Beispiel #1
0
// NewStream constructs a Stream configured from rphashObject.
//
// A random source is seeded from rphashObject.GetRandomSeed() and used to draw
// one seed per projection. For every projection a fresh projector is created
// and wrapped, together with the shared hash and decoder, in an LSH that is
// stored in the stream's LSH group. The centroid counter is sized to
// GetK() * GetNumberOfProjections().
//
// The returned Stream keeps the most recently created projector in its
// projector field (it stays unset when there are no projections). The fields
// counts, centroids, variance, processedCount and vectorCount are left at
// their zero values.
func NewStream(rphashObject types.RPHashObject) *Stream {
	rng := rand.New(rand.NewSource(rphashObject.GetRandomSeed()))
	hasher := defaults.NewHash(rphashObject.GetHashModulus())
	dec := rphashObject.GetDecoderType()
	tracker := defaults.NewStatTest(0.01)
	numProjections := rphashObject.GetNumberOfProjections()
	counter := defaults.NewCentroidCounter(rphashObject.GetK() * numProjections)

	group := make([]types.LSH, numProjections)
	centroidCh := make(chan *itemset.Centroid, 10000)

	// Each LSH gets its own projector, seeded from the shared random source.
	var lastProjector types.Projector
	for i := range group {
		lastProjector = defaults.NewProjector(
			rphashObject.GetDimensions(),
			dec.GetDimensionality(),
			rng.Int63(),
		)
		group[i] = defaults.NewLSH(hasher, dec, lastProjector)
	}

	stream := &Stream{
		CentroidCounter:     counter,
		randomSeedGenerator: rng,
		rphashObject:        rphashObject,
		lshGroup:            group,
		hash:                hasher,
		decoder:             dec,
		projector:           lastProjector,
		varianceTracker:     tracker,
		lshChannel:          centroidCh,
	}
	return stream
}
Beispiel #2
0
// Map runs the counting pass over the vectors supplied by the RPHash object's
// vector iterator.
//
// Every vector is hashed with LSHHashSimple in its own goroutine. Once all
// hashes have been produced they are
//   - added to a count-min sketch, whose GetTop() result is recorded through
//     SetPreviousTopID, and
//   - handed to the iterator, indexed by vector position, via StoreLSHValues.
//
// The iterator is reset before returning. Map returns its receiver so calls
// can be chained.
func (this *Simple) Map() *Simple {
	runtime.GOMAXPROCS(runtime.NumCPU())
	vecs := this.rphashObject.GetVectorIterator()
	targetDimension := int(math.Floor(float64(this.rphashObject.GetDimensions() / 2)))
	numberOfRotations := 6
	numberOfSearches := 1
	// The first vector is fetched up front; the loop below fetches the next
	// one at the end of every iteration.
	vec := vecs.Next()
	hash := defaults.NewHash(this.rphashObject.GetHashModulus())
	decoder := defaults.NewDecoder(targetDimension, numberOfRotations, numberOfSearches)
	projector := defaults.NewProjector(this.rphashObject.GetDimensions(), decoder.GetDimensionality(), this.rphashObject.GetRandomSeed())
	lsh := defaults.NewLSH(hash, decoder, projector)
	sketch := defaults.NewCountMinSketch(this.rphashObject.GetK())

	// The channel is buffered for one entry per data point so that worker
	// goroutines never block on the send.
	numPoints := this.rphashObject.NumDataPoints()
	hashChannel := make(chan int64, numPoints)
	hashValues := make([]int64, numPoints)

	// NOTE(review): a vector returned by Next() is only processed while
	// HasNext() still reports true afterwards. Confirm this matches the
	// iterator's contract; with a conventional iterator the final vector
	// would be skipped.
	vecCount := 0
	for vecs.HasNext() {
		go func(vec []float64, index int) {
			// LSHHashSimple reduces the vector to a single int64 hash.
			value := lsh.LSHHashSimple(vec)
			// Write the slot before sending: the receive below then
			// guarantees the write is visible to the receiving goroutine.
			hashValues[index] = value
			hashChannel <- value
		}(vec, vecCount)
		vecCount++
		vec = vecs.Next()
	}

	// Receiving exactly one value per launched goroutine both feeds the
	// sketch and acts as the barrier that waits for all workers to finish.
	for i := 0; i < vecCount; i++ {
		sketch.Add(<-hashChannel)
	}

	// Only publish hashValues once every worker has written its slot.
	// Doing this before the drain loop would race with the workers and
	// could store a partially filled slice.
	vecs.StoreLSHValues(hashValues)

	this.rphashObject.SetPreviousTopID(sketch.GetTop())
	vecs.Reset()
	return this
}