// TestStreamingRPHashOnNumImagesData feeds every vector read from
// ../demo/data/MNISTnumImages5000.txt through an RPHash stream, logs the
// total and average squared distance of each vector to its nearest centroid
// together with the runtime, and fails when the number of centroids returned
// differs from the requested number of clusters.
func TestStreamingRPHashOnNumImagesData(t *testing.T) {
	numClusters := 10
	lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt")
	if err != nil {
		// Report through the test framework rather than panicking so the
		// failure is attributed to this test.
		t.Fatalf("reading image data: %v", err)
	}
	if len(lines) == 0 {
		t.Fatal("image data file contains no records")
	}
	// The length of the first record is used as the stream dimensionality.
	dimensionality := len(lines[0])
	data := utils.StringArrayToFloatArray(lines)

	// Only stream construction, ingestion and centroid extraction are timed.
	start := time.Now()
	rphashObject := reader.NewStreamObject(dimensionality, numClusters)
	rphashStream := stream.NewStream(rphashObject)
	for _, vector := range data {
		rphashStream.AppendVector(vector)
	}
	rpHashResult := rphashStream.GetCentroids()
	// Named elapsed (not time) so the time package is not shadowed.
	elapsed := time.Since(start)

	totalSqDist := float64(0)
	for _, vector := range data {
		_, dist := utils.FindNearestDistance(vector, rpHashResult)
		totalSqDist += dist * dist
	}

	t.Log("Total Square Distance: ", totalSqDist)
	t.Log("Average Square Distance: ", totalSqDist/float64(len(data)))
	t.Log("Runtime(seconds): ", elapsed.Seconds())

	if len(rpHashResult) != numClusters {
		t.Errorf("RPHash Stream did not present the correct number of clusters: got %d, want %d", len(rpHashResult), numClusters)
	}
}
Exemple #2
0
// main reads the records of ./dataset.csv, pushes them through a flow that
// maps every record to a centroid with the stream's online step, adds the
// produced centroids to the stream's centroid counter, and finally writes the
// de-normalized centroids plus the elapsed time to ./results.txt.
//
// It panics if the data set is empty, if the results file cannot be opened,
// or if writing to it fails.
func main() {
	gob.Register(Centroid{})
	gob.Register(itemset.Centroid{})
	gob.Register(utils.Hash64Set{})
	flag.Parse()

	t1 := time.Now()
	records := utils.ReadCSV("./dataset.csv")
	if len(records) == 0 {
		// The first record is needed to determine the dimensionality.
		panic("./dataset.csv contains no records")
	}

	Object := reader.NewStreamObject(len(records[0]), numClusters)
	Stream := stream.NewStream(Object)

	outChannel := make(chan Centroid)

	ch := make(chan []float64)

	source := f.Channel(ch)

	f1 := source.Map(func(record []float64) Centroid {
		return Centroid{C: Stream.AddVectorOnlineStep(record)}
	}).AddOutput(outChannel)

	flow.Ready()

	var wg sync.WaitGroup

	// Run the flow in the background.
	goStart(&wg, func() {
		f1.Run()
	})

	// Drain the flow's output into the centroid counter.
	goStart(&wg, func() {
		for out := range outChannel {
			Stream.CentroidCounter.Add(out.C)
		}
	})

	for _, record := range records {
		ch <- record
	}

	// Closing the input channel signals that no more records will be sent;
	// wait for both goroutines before reading the final centroids.
	close(ch)
	wg.Wait()

	normalizedResults := Stream.GetCentroids()
	ts := time.Since(t1)

	// O_TRUNC discards any previous contents so a shorter result set does not
	// leave stale bytes from an earlier run at the end of the file.
	file, err := os.OpenFile("./results.txt", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		panic(err)
	}
	defer file.Close()
	for _, result := range normalizedResults {
		for _, dimension := range result {
			if _, err := fmt.Fprintf(file, "%f ", parse.DeNormalize(dimension)); err != nil {
				panic(err)
			}
		}
		if _, err := file.WriteString("\n"); err != nil {
			panic(err)
		}
	}
	if _, err := file.WriteString("Time: " + ts.String()); err != nil {
		panic(err)
	}
}
// TestStreamingRPHashOnRandomData streams every vector produced by a data
// file reader over filePath into an RPHash stream, then re-reads the file to
// accumulate the squared distance of each vector to its nearest centroid.
// Only stream construction, AppendVector calls and GetCentroids are counted
// in the reported runtime. The test fails when the number of centroids is not
// numClusters.
func TestStreamingRPHashOnRandomData(t *testing.T) {
	source := utils.NewDataFileReader(filePath)

	// Time the construction of the stream.
	setupBegin := time.Now()
	streamObject := reader.NewStreamObject(dimensionality, numClusters)
	rphash := stream.NewStream(streamObject)
	elapsed := time.Since(setupBegin)

	// Time each append individually so that reading the file is excluded.
	for vec := source.Next(); vec != nil; vec = source.Next() {
		appendBegin := time.Now()
		rphash.AppendVector(vec)
		elapsed += time.Since(appendBegin)
	}

	// Time the centroid extraction.
	extractBegin := time.Now()
	centroids := rphash.GetCentroids()
	elapsed += time.Since(extractBegin)

	// Second pass over the data: accumulate the squared nearest distances.
	var sumSqDist float64
	source = utils.NewDataFileReader(filePath)
	for vec := source.Next(); vec != nil; vec = source.Next() {
		_, nearest := utils.FindNearestDistance(vec, centroids)
		sumSqDist += nearest * nearest
	}

	t.Log("Total Square Distance: ", sumSqDist)
	t.Log("Average Square Distance: ", sumSqDist/numDataPoints)
	t.Log("Runtime(seconds): ", elapsed.Seconds())

	if numClusters != len(centroids) {
		t.Errorf("RPHash Stream did not present the correct number of clusters.")
	}
}
Exemple #4
0
// main reads the records of dataFilePath inside a flow source, normalizes
// every entry and emits one Vector per record. Each vector is mapped to a
// centroid with the stream's online step; the collected centroids are then
// added to the stream's centroid counter. The resulting centroids are
// de-normalized, painted and plotted.
//
// It panics when the data file cannot be read or an entry is not a valid
// float, and exits through log.Fatal when the data file yields no records.
func main() {
	var rphashObject *reader.StreamObject
	var rphashStream *stream.Stream
	var centroids []types.Centroid
	t1 := time.Now()
	// Split the data into shards and send them to the Agents to work on.
	f.Source(func(out chan Vector) {
		records, err := utils.ReadLines(dataFilePath)
		if err != nil {
			panic(err)
		}
		// Convert the record to standard floating points.
		for i, record := range records {
			if i == 0 {
				// Create a new RPHash stream, sized from the first record.
				rphashObject = reader.NewStreamObject(len(record), numClusters)
				rphashStream = stream.NewStream(rphashObject)
				rphashStream.RunCount = 1
			}
			data := make([]float64, len(record))
			for j, entry := range record {
				// Check the parse error before the value is used, and avoid
				// shadowing the outer f identifier.
				value, err := strconv.ParseFloat(entry, 64)
				if err != nil {
					panic(err)
				}
				data[j] = parse.Normalize(value)
			}
			out <- Vector{Data: data}
		}
	}, numShards).Map(func(vec Vector) {
		// NOTE(review): if the flow runs this function concurrently across
		// shards, the append to the shared centroids slice is unsynchronized
		// - confirm against the flow library's execution model.
		centroids = append(centroids, rphashStream.AddVectorOnlineStep(vec.Data))
	}).Run()

	// The stream is only created when the first record is seen; without any
	// record there is nothing to cluster and the pointer is still nil.
	if rphashStream == nil {
		log.Fatal("no records were read from the data file")
	}

	for _, cent := range centroids {
		rphashStream.CentroidCounter.Add(cent)
	}
	normalizedResults := rphashStream.GetCentroids()
	log.Println("Time: ", time.Since(t1))

	denormalizedResults := make([][]float64, len(normalizedResults))
	for i, result := range normalizedResults {
		row := make([]float64, len(result))
		for j, dimension := range result {
			row[j] = parse.DeNormalize(dimension)
		}
		denormalizedResults[i] = row
	}
	labels := make([]string, len(denormalizedResults))
	xPlotValues := make([][]float64, len(denormalizedResults))
	yPlotValues := make([][]float64, len(denormalizedResults))
	for i, result := range denormalizedResults {
		// x is the dimension index, y the de-normalized centroid value.
		xPlotValues[i] = make([]float64, len(result))
		yPlotValues[i] = make([]float64, len(result))
		for j, val := range result {
			xPlotValues[i][j] = float64(j)
			yPlotValues[i][j] = val
		}
		Paint(result, i)
		// NOTE(review): the index is formatted in base 16, which matches
		// decimal only for indexes 0-9 - confirm this is intended.
		sI := strconv.FormatInt(int64(i), 16)
		labels[i] = "Digit " + sI + " (by Classifier Centroid)"
	}
	GeneratePlots(xPlotValues, yPlotValues, "High Dimension Handwritting Digits 0-9 Classification", "Dimension", "Strength of Visual Pixel Recognition (0-1000)", "plots/centroid-dimensions-", labels)
}
Exemple #5
0
// NewRPHashObject creates a stream object for the given dimension and k via
// reader.NewStreamObject and returns it as a types.RPHashObject.
func NewRPHashObject(dimension, k int) types.RPHashObject {
	var object types.RPHashObject = reader.NewStreamObject(dimension, k)
	return object
}
// TestStreamObject exercises the getters and setters of the object returned
// by reader.NewStreamObject: K, dimensionality, vector iterator, blurs,
// variance, decoder type, projections, hash modulus, centroids and previous
// top IDs.
func TestStreamObject(t *testing.T) {
	k := 4
	dimensionality := 100
	numBlurs := 2
	numProjections := 2
	numDataPoints := 8
	origVariance := 1.0
	var testDecoderType types.Decoder
	newNumProjections := 4
	newHashModulus := rand.Int63()
	//var newRandomSeed int64 = rand.Int63()

	// Row i of both fixtures is filled with the constant value i.
	newVarianceSample, newCentroidList := make([][]float64, numDataPoints), make([][]float64, numDataPoints)
	for i := 0; i < numDataPoints; i++ {
		newVarianceSample[i], newCentroidList[i] = make([]float64, dimensionality), make([]float64, dimensionality)
		for j := 0; j < dimensionality; j++ {
			newVarianceSample[i][j], newCentroidList[i][j] = float64(i), float64(i)
		}
	}

	newCentroid := make([]float64, dimensionality)
	for i := 0; i < dimensionality; i++ {
		newCentroid[i] = float64(i)
	}

	newTopID := make([]int64, dimensionality)
	for i := 0; i < dimensionality; i++ {
		newTopID[i] = int64(i)
	}

	rphashObject := reader.NewStreamObject(dimensionality, k)

	// K.
	assert.Equal(t, k, rphashObject.GetK(), "Expected K equal to Stream K.")

	// Dimensionality.
	assert.Equal(t, dimensionality, rphashObject.GetDimensions(), "Expected dimensionality equal to Stream dimensionality.")

	// Iterator.
	assert.False(t, rphashObject.GetVectorIterator().HasNext(), "Vector iterator should be initially empty.")

	// Blurs.
	assert.Equal(t, numBlurs, rphashObject.GetNumberOfBlurs(), "Number of blurs should be initially 2.")

	// Variance.
	assert.Equal(t, origVariance, rphashObject.GetVariance(), "Variance should be initially 1.")
	rphashObject.SetVariance(newVarianceSample)
	newVariance := utils.VarianceSample(newVarianceSample, 0.01)
	assert.Equal(t, newVariance, rphashObject.GetVariance(), "Variance should be equal to the new variance value.")

	// Decoders.
	origDecoderType := rphashObject.GetDecoderType()
	assert.NotNil(t, origDecoderType)
	// Compares the static types of the two variables (taken through their
	// addresses), not the dynamic types of the values they hold.
	assert.Equal(t, reflect.ValueOf(&testDecoderType).Elem().Type(), reflect.ValueOf(&origDecoderType).Elem().Type(), "Decoder should implement the Decoder interface.")
	rphashObject.SetDecoderType(testDecoderType)
	assert.Equal(t, testDecoderType, rphashObject.GetDecoderType(), "Decoder should be set to a new Decoder.")

	// Projections.
	assert.Equal(t, numProjections, rphashObject.GetNumberOfProjections(), "Number of projections should be initially 2.")
	rphashObject.SetNumberOfProjections(newNumProjections)
	assert.Equal(t, newNumProjections, rphashObject.GetNumberOfProjections(), "Number of projections should be equal to the new number of projections.")

	// Hash modulus.
	rphashObject.SetHashModulus(newHashModulus)
	assert.Equal(t, newHashModulus, rphashObject.GetHashModulus(), "Hash modulus should be equal to the new hash modulus.")

	// Centroids.
	assert.Empty(t, rphashObject.GetCentroids(), "Centroids should initially be empty.")
	rphashObject.AddCentroid(newCentroid)
	assert.Equal(t, newCentroid, rphashObject.GetCentroids()[0], "First centroid should be the new centroid.")
	rphashObject.SetCentroids(newCentroidList)
	assert.Equal(t, newCentroidList, rphashObject.GetCentroids(), "Centroids should be equal to the new centroid list.")

	// Top IDs
	assert.Empty(t, rphashObject.GetPreviousTopID(), "Previous top ID should initially be empty.")
	rphashObject.SetPreviousTopID(newTopID)
	assert.Equal(t, newTopID, rphashObject.GetPreviousTopID(), "Previous top ID should be equal to the new top centroid.")
}