func TestStreamingRPHashOnNumImagesData(t *testing.T) { numClusters := 10 lines, err := utils.ReadLines("../demo/data/MNISTnumImages5000.txt") if err != nil { panic(err) } dimensionality := len(lines[0]) data := utils.StringArrayToFloatArray(lines) start := time.Now() rphashObject := reader.NewStreamObject(dimensionality, numClusters) rphashStream := stream.NewStream(rphashObject) for _, vector := range data { rphashStream.AppendVector(vector) } rpHashResult := rphashStream.GetCentroids() time := time.Since(start) totalSqDist := float64(0) for _, vector := range data { _, dist := utils.FindNearestDistance(vector, rpHashResult) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/float64(len(data))) t.Log("Runtime(seconds): ", time.Seconds()) if len(rpHashResult) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }
func main() { gob.Register(Centroid{}) gob.Register(itemset.Centroid{}) gob.Register(utils.Hash64Set{}) flag.Parse() t1 := time.Now() records := utils.ReadCSV("./dataset.csv") Object := reader.NewStreamObject(len(records[0]), numClusters) Stream := stream.NewStream(Object) outChannel := make(chan Centroid) ch := make(chan []float64) source := f.Channel(ch) f1 := source.Map(func(record []float64) Centroid { return Centroid{C: Stream.AddVectorOnlineStep(record)} }).AddOutput(outChannel) flow.Ready() var wg sync.WaitGroup goStart(&wg, func() { f1.Run() }) goStart(&wg, func() { for out := range outChannel { Stream.CentroidCounter.Add(out.C) } }) for _, record := range records { ch <- record } close(ch) wg.Wait() normalizedResults := Stream.GetCentroids() ts := time.Since(t1) file, err := os.OpenFile("./results.txt", os.O_WRONLY|os.O_CREATE, 0644) if err != nil { panic(err) } defer file.Close() for _, result := range normalizedResults { for _, dimension := range result { file.WriteString(fmt.Sprintf("%f ", parse.DeNormalize(dimension))) } file.WriteString("\n") } file.WriteString("Time: " + ts.String()) }
func TestStreamingRPHashOnRandomData(t *testing.T) { filereader := utils.NewDataFileReader(filePath) start := time.Now() rphashObject := reader.NewStreamObject(dimensionality, numClusters) rphashStream := stream.NewStream(rphashObject) elapsedtime := time.Since(start) for { vector := filereader.Next() if vector == nil { break } start := time.Now() rphashStream.AppendVector(vector) elapsedtime = elapsedtime + time.Since(start) } start = time.Now() result := rphashStream.GetCentroids() elapsedtime = elapsedtime + time.Since(start) totalSqDist := float64(0) filereader = utils.NewDataFileReader(filePath) for { vector := filereader.Next() if vector == nil { break } _, dist := utils.FindNearestDistance(vector, result) totalSqDist += dist * dist } t.Log("Total Square Distance: ", totalSqDist) t.Log("Average Square Distance: ", totalSqDist/numDataPoints) t.Log("Runtime(seconds): ", elapsedtime.Seconds()) if len(result) != numClusters { t.Errorf("RPHash Stream did not present the correct number of clusters.") } }
func main() { var rphashObject *reader.StreamObject var rphashStream *stream.Stream var centroids []types.Centroid t1 := time.Now() // Split the data into shards and send them to the Agents to work on. f.Source(func(out chan Vector) { records, err := utils.ReadLines(dataFilePath) if err != nil { panic(err) } // Convert the record to standard floating points. for i, record := range records { if i == 0 { // Create a new RPHash stream. rphashObject = reader.NewStreamObject(len(record), numClusters) rphashStream = stream.NewStream(rphashObject) rphashStream.RunCount = 1 } data := make([]float64, len(record)) for j, entry := range record { f, err := strconv.ParseFloat(entry, 64) f = parse.Normalize(f) if err != nil { panic(err) } data[j] = f } out <- Vector{Data: data} } }, numShards).Map(func(vec Vector) { centroids = append(centroids, rphashStream.AddVectorOnlineStep(vec.Data)) }).Run() for _, cent := range centroids { rphashStream.CentroidCounter.Add(cent) } normalizedResults := rphashStream.GetCentroids() t2 := time.Now() log.Println("Time: ", t2.Sub(t1)) denormalizedResults := make([][]float64, len(normalizedResults)) for i, result := range normalizedResults { row := make([]float64, len(result)) for j, dimension := range result { row[j] = parse.DeNormalize(dimension) } denormalizedResults[i] = row } labels := make([]string, len(denormalizedResults)) xPlotValues := make([][]float64, len(denormalizedResults)) yPlotValues := make([][]float64, len(denormalizedResults)) for i, result := range denormalizedResults { xPlotValues[i] = make([]float64, len(result)) yPlotValues[i] = make([]float64, len(result)) for j, val := range result { xPlotValues[i][j] = float64(j) yPlotValues[i][j] = val } Paint(result, i) sI := strconv.FormatInt(int64(i), 16) labels[i] = "Digit " + sI + " (by Classifier Centroid)" } GeneratePlots(xPlotValues, yPlotValues, "High Dimension Handwritting Digits 0-9 Classification", "Dimension", "Strength of Visual Pixel Recognition (0-1000)", 
"plots/centroid-dimensions-", labels) }
// NewRPHashObject constructs a streaming RPHashObject for vectors of the given
// dimension, targeting k clusters.
func NewRPHashObject(dimension, k int) types.RPHashObject {
	obj := reader.NewStreamObject(dimension, k)
	return obj
}
// TestStreamObject exercises the accessors and mutators of a freshly built
// reader.StreamObject: K, dimensionality, vector iterator, blurs, variance,
// decoder type, projection count, hash modulus, centroid list, and previous
// top-ID list. Each setter is verified through its matching getter.
func TestStreamObject(t *testing.T) {
	// Fixture values; the num* constants mirror the stream's documented
	// defaults asserted below.
	var k = 4
	var dimensionality = 100
	var numBlurs = 2
	var numProjections = 2
	var numDataPoints = 8
	var origVariance float64 = 1
	var testDecoderType types.Decoder
	var newNumProjections = 4
	var newHashModulus int64 = rand.Int63()
	//var newRandomSeed int64 = rand.Int63()

	// Build numDataPoints x dimensionality matrices where every entry in row i
	// equals i; used both as a variance sample and as a centroid list.
	newVarianceSample, newCentroidList := make([][]float64, numDataPoints), make([][]float64, numDataPoints)
	for i := 0; i < numDataPoints; i++ {
		newVarianceSample[i], newCentroidList[i] = make([]float64, dimensionality), make([]float64, dimensionality)
		for j := 0; j < dimensionality; j++ {
			newVarianceSample[i][j], newCentroidList[i][j] = float64(i), float64(i)
		}
	}
	// Single centroid 0..dimensionality-1.
	newCentroid := make([]float64, dimensionality)
	for i := 0; i < dimensionality; i++ {
		newCentroid[i] = float64(i)
	}
	// Top-ID list 0..dimensionality-1.
	newTopId := make([]int64, dimensionality)
	for i := 0; i < dimensionality; i++ {
		newTopId[i] = int64(i)
	}
	RPHashObject := reader.NewStreamObject(dimensionality, k)
	// K.
	assert.Equal(t, k, RPHashObject.GetK(), "Expected K equal to Stream K.")
	// Dimensionality.
	assert.Equal(t, dimensionality, RPHashObject.GetDimensions(), "Expected dimensionality equal to Stream dimensionality.")
	// Iterator.
	assert.Equal(t, RPHashObject.GetVectorIterator().HasNext(), false, "Vector iterator should be initially empty.")
	// Blurs.
	assert.Equal(t, numBlurs, RPHashObject.GetNumberOfBlurs(), "Number of blurs should be initially 2.")
	// Variance.
	assert.Equal(t, origVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.")
	RPHashObject.SetVariance(newVarianceSample)
	// Expected value recomputed independently with the same sample and epsilon.
	newVariance := utils.VarianceSample(newVarianceSample, 0.01)
	assert.Equal(t, newVariance, RPHashObject.GetVariance(), "Variance should be equal to the new variance value.")
	// Decoders.
	origDecoderType := RPHashObject.GetDecoderType()
	assert.NotNil(t, origDecoderType)
	// Reflection check: the default decoder satisfies the types.Decoder interface.
	assert.Equal(t, reflect.ValueOf(&testDecoderType).Elem().Type(), reflect.ValueOf(&origDecoderType).Elem().Type(), "Decoder should implement the Decoder interface.")
	RPHashObject.SetDecoderType(testDecoderType)
	assert.Equal(t, testDecoderType, RPHashObject.GetDecoderType(), "Decoder should be set to a new Decoder.")
	// Projections.
	assert.Equal(t, numProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be initially 2.")
	RPHashObject.SetNumberOfProjections(newNumProjections)
	assert.Equal(t, newNumProjections, RPHashObject.GetNumberOfProjections(), "Number of projections should be equal to the new number of projections.")
	// Hash modulus.
	RPHashObject.SetHashModulus(newHashModulus)
	assert.Equal(t, newHashModulus, RPHashObject.GetHashModulus(), "Hash modulus should be equal to the new hash modulus.")
	// Centroids.
	assert.Empty(t, RPHashObject.GetCentroids(), "Centroids should initially be empty.")
	RPHashObject.AddCentroid(newCentroid)
	assert.Equal(t, newCentroid, RPHashObject.GetCentroids()[0], "First centroid should be the new centroid.")
	// SetCentroids replaces the list wholesale (including the one just added).
	RPHashObject.SetCentroids(newCentroidList)
	assert.Equal(t, newCentroidList, RPHashObject.GetCentroids(), "Centroids should be equal to the new centroid list.")
	// Top IDs
	assert.Empty(t, RPHashObject.GetPreviousTopID(), "Previous top ID should initially be empty.")
	RPHashObject.SetPreviousTopID(newTopId)
	assert.Equal(t, newTopId, RPHashObject.GetPreviousTopID(), "Previous top ID should be equal to the new top centroid.")
}