// Returns a classification for the vector, based on a vector input, using the KNN algorithm.
// See http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm.
func (KNN *KNNClassifier) PredictOne(vector []float64) string {

	rows := KNN.TrainingData.Rows
	rownumbers := make(map[int]float64)
	labels := make([]string, 0)
	maxmap := make(map[string]int)

	convertedVector := util.FloatsToMatrix(vector)

	// Check what distance function we are using
	switch KNN.DistanceFunc {
	case "euclidean":
		{
			euclidean := pairwiseMetrics.NewEuclidean()
			for i := 0; i < rows; i++ {
				row := KNN.TrainingData.GetRowVectorWithoutClass(i)
				rowMat := util.FloatsToMatrix(row)
				distance := euclidean.Distance(rowMat, convertedVector)
				rownumbers[i] = distance
			}
		}
	case "manhattan":
		{
			manhattan := pairwiseMetrics.NewManhattan()
			for i := 0; i < rows; i++ {
				row := KNN.TrainingData.GetRowVectorWithoutClass(i)
				rowMat := util.FloatsToMatrix(row)
				distance := manhattan.Distance(rowMat, convertedVector)
				rownumbers[i] = distance
			}
		}
	}

	sorted := util.SortIntMap(rownumbers)
	values := sorted[:KNN.NearestNeighbours]

	for _, elem := range values {
		label := KNN.TrainingData.GetClass(elem)
		labels = append(labels, label)

		if _, ok := maxmap[label]; ok {
			maxmap[label] += 1
		} else {
			maxmap[label] = 1
		}
	}

	sortedlabels := util.SortStringMap(maxmap)
	label := sortedlabels[0]

	return label
}
// Returns a classification for the vector, based on a vector input, using the KNN algorithm.
// See http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm.
func (KNN *KNNClassifier) Predict(vector []float64, K int) string {

	convertedVector := util.FloatsToMatrix(vector)

	// Get the number of rows
	rows, _ := KNN.Data.Dims()
	rownumbers := make(map[int]float64)
	labels := make([]string, 0)
	maxmap := make(map[string]int)

	// Check what distance function we are using
	switch KNN.DistanceFunc {
	case "euclidean":
		{
			euclidean := pairwiseMetrics.NewEuclidean()
			for i := 0; i < rows; i++ {
				row := KNN.Data.RowView(i)
				rowMat := util.FloatsToMatrix(row)
				distance := euclidean.Distance(rowMat, convertedVector)
				rownumbers[i] = distance
			}
		}
	case "manhattan":
		{
			manhattan := pairwiseMetrics.NewManhattan()
			for i := 0; i < rows; i++ {
				row := KNN.Data.RowView(i)
				rowMat := util.FloatsToMatrix(row)
				distance := manhattan.Distance(rowMat, convertedVector)
				rownumbers[i] = distance
			}
		}
	}

	sorted := util.SortIntMap(rownumbers)
	values := sorted[:K]

	for _, elem := range values {
		// Tally a vote for this neighbour's label
		labels = append(labels, KNN.Labels[elem])

		if _, ok := maxmap[KNN.Labels[elem]]; ok {
			maxmap[KNN.Labels[elem]] += 1
		} else {
			maxmap[KNN.Labels[elem]] = 1
		}
	}

	sortedlabels := util.SortStringMap(maxmap)
	label := sortedlabels[0]

	return label
}
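Both of these early Predict variants boil down to the same three steps: compute a distance from the query to every training row, keep the K smallest, and take a majority vote over their labels. The following is a minimal, standalone sketch of that logic using only the standard library; the euclidean and nearestLabel helpers are illustrative names, not part of golearn.

package main

import (
	"fmt"
	"math"
	"sort"
)

// euclidean returns the Euclidean distance between two equal-length vectors.
func euclidean(a, b []float64) float64 {
	var sum float64
	for i := range a {
		d := a[i] - b[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// nearestLabel ranks every training row by distance to the query vector,
// keeps the K closest, and returns the majority label among them.
// (Illustrative helper only; not a golearn API.)
func nearestLabel(train [][]float64, labels []string, query []float64, K int) string {
	type neighbour struct {
		row  int
		dist float64
	}
	neighbours := make([]neighbour, len(train))
	for i, row := range train {
		neighbours[i] = neighbour{i, euclidean(row, query)}
	}
	sort.Slice(neighbours, func(i, j int) bool { return neighbours[i].dist < neighbours[j].dist })

	votes := make(map[string]int)
	best, bestCount := "", 0
	for _, n := range neighbours[:K] {
		label := labels[n.row]
		votes[label]++
		if votes[label] > bestCount {
			best, bestCount = label, votes[label]
		}
	}
	return best
}

func main() {
	train := [][]float64{{0, 0}, {0, 1}, {5, 5}, {6, 5}}
	labels := []string{"a", "a", "b", "b"}
	fmt.Println(nearestLabel(train, labels, []float64{0.2, 0.4}, 3)) // prints "a"
}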
func TestDBSCANSynthetic(t *testing.T) {
	Convey("Synthetic DBSCAN test should work...", t, func() {
		inst, err := base.ParseCSVToInstances("synthetic.csv", false)
		So(err, ShouldBeNil)

		p := DBSCANParameters{
			ClusterParameters{
				inst.AllAttributes(),
				pairwise.NewEuclidean(),
			},
			1,
			1,
		}
		m, err := DBSCAN(inst, p)
		So(err, ShouldBeNil)
		So(len(m), ShouldEqual, 2)

		So(m[1], ShouldContain, 0)
		So(m[1], ShouldContain, 1)
		So(m[1], ShouldContain, 2)
		So(m[1], ShouldContain, 3)
	})
}
func TestDBSCANDistanceQuery(t *testing.T) {
	Convey("Should be able to determine which points are in range...", t, func() {
		// Read in the synthetic test data
		inst, err := base.ParseCSVToInstances("synthetic.csv", false)
		So(err, ShouldBeNil)

		// Create a neighbours vector
		neighbours := big.NewInt(0)

		// Compute pairwise distances
		dist, err := computePairwiseDistances(inst, inst.AllAttributes(), pairwise.NewEuclidean())
		So(dist.At(0, 0), ShouldAlmostEqual, 0)
		So(dist.At(0, 1), ShouldAlmostEqual, 1)
		So(dist.At(1, 0), ShouldAlmostEqual, 1)
		So(dist.At(0, 2), ShouldAlmostEqual, math.Sqrt(5))
		So(dist.At(2, 0), ShouldAlmostEqual, math.Sqrt(5))
		So(err, ShouldBeNil)

		// Do the region query
		neighbours = regionQuery(0, neighbours, dist, 1)
		So(neighbours.Bit(0), ShouldEqual, 1)
		So(neighbours.Bit(1), ShouldEqual, 1)
		So(neighbours.Bit(2), ShouldEqual, 0)
		So(neighbours.Bit(3), ShouldEqual, 0)
		So(neighbours.Bit(4), ShouldEqual, 0)
	})
}
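The region query in this test represents a point's neighbourhood as a big.Int used as a bitset: bit i is set when point i lies within eps of the query point according to a precomputed distance matrix. Below is a rough, self-contained illustration of that idea; the regionQuerySketch helper and the hard-coded distance matrix are made up for the example, and golearn's real regionQuery works against a mat64 distance matrix instead.

package main

import (
	"fmt"
	"math/big"
)

// regionQuerySketch marks every point whose distance to point p is at most eps.
func regionQuerySketch(p int, dist [][]float64, eps float64) *big.Int {
	neighbours := big.NewInt(0)
	for i, d := range dist[p] {
		if d <= eps {
			neighbours.SetBit(neighbours, i, 1)
		}
	}
	return neighbours
}

func main() {
	// Pairwise distances for three points, laid out roughly like the synthetic test data.
	dist := [][]float64{
		{0, 1, 2.236},
		{1, 0, 1.414},
		{2.236, 1.414, 0},
	}
	n := regionQuerySketch(0, dist, 1)
	fmt.Println(n.Bit(0), n.Bit(1), n.Bit(2)) // prints "1 1 0"
}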
func TestDBSCAN(t *testing.T) {
	Convey("Loading some data and labels...", t, func() {
		inst, err := base.ParseCSVToInstances("dbscan.csv", false)
		So(err, ShouldBeNil)

		file, err := os.Open("dbscan_labels.csv")
		defer file.Close()
		So(err, ShouldBeNil)

		clusterMap := ClusterMap(make(map[int][]int))

		scanner := bufio.NewScanner(file)
		line := -1
		for scanner.Scan() {
			line = line + 1
			v, err := strconv.ParseInt(scanner.Text(), 10, 64)
			if err != nil {
				panic(err)
			}
			v = v + 1 // -1 are noise in scikit-learn's DBSCAN
			c := int(v)
			if c == 0 {
				continue // Skip noise points
			}
			if _, ok := clusterMap[c]; !ok {
				clusterMap[c] = make([]int, 0)
			}
			clusterMap[c] = append(clusterMap[c], line)
		}

		Convey("Our DBSCAN implementation should match...", func() {
			p := DBSCANParameters{
				ClusterParameters{
					inst.AllAttributes(),
					pairwise.NewEuclidean(),
				},
				0.3, // Eps
				10,  // MinPts
			}
			m, err := DBSCAN(inst, p)
			Convey("There should be nothing in the result that's smaller than MinPts", func() {
				for id := range m {
					So(len(m[id]), ShouldBeGreaterThanOrEqualTo, 10)
				}
			})
			So(err, ShouldBeNil)
			eq, err := clusterMap.Equals(m)
			So(err, ShouldBeNil)
			So(eq, ShouldBeTrue)
		})
	})
}
// Returns an average of the K nearest labels/variables, based on a vector input.
func (KNN *KNNRegressor) Predict(vector *mat64.Dense, K int) float64 {

	// Get the number of rows
	rows, _ := KNN.Data.Dims()
	rownumbers := make(map[int]float64)
	labels := make([]float64, 0)

	// Check what distance function we are using
	switch KNN.DistanceFunc {
	case "euclidean":
		{
			euclidean := pairwiseMetrics.NewEuclidean()
			for i := 0; i < rows; i++ {
				row := KNN.Data.RowView(i)
				rowMat := util.FloatsToMatrix(row)
				distance := euclidean.Distance(rowMat, vector)
				rownumbers[i] = distance
			}
		}
	case "manhattan":
		{
			manhattan := pairwiseMetrics.NewManhattan()
			for i := 0; i < rows; i++ {
				row := KNN.Data.RowView(i)
				rowMat := util.FloatsToMatrix(row)
				distance := manhattan.Distance(rowMat, vector)
				rownumbers[i] = distance
			}
		}
	}

	sorted := util.SortIntMap(rownumbers)
	values := sorted[:K]

	var sum float64
	for _, elem := range values {
		value := KNN.Values[elem]
		labels = append(labels, value)
		sum += value
	}

	average := sum / float64(K)
	return average
}
func TestDBSCANDistanceMetric(t *testing.T) {
	Convey("Check the distance function is sane...", t, func() {
		d1 := mat64.NewDense(1, 2, nil)
		d2 := mat64.NewDense(1, 2, nil)

		d1.Set(0, 0, 0.494260967249)
		d1.Set(0, 1, 1.45106696541)

		d2.Set(0, 0, -1.42808099324)
		d2.Set(0, 1, -0.83706376669)

		e := pairwise.NewEuclidean()
		So(e.Distance(d1, d2), ShouldAlmostEqual, 2.9882, 0.001)
	})
}
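The expected constant in that assertion is easy to verify by hand: the straight-line distance between the two points is sqrt(dx*dx + dy*dy). A few lines of standard-library Go confirm it, independently of the pairwise package:

package main

import (
	"fmt"
	"math"
)

func main() {
	dx := 0.494260967249 - (-1.42808099324)
	dy := 1.45106696541 - (-0.83706376669)
	// Prints roughly 2.9885, which lands inside the test's 2.9882 +/- 0.001 tolerance.
	fmt.Println(math.Sqrt(dx*dx + dy*dy))
}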
func (KNN *KNNRegressor) Predict(vector *mat64.Dense, K int) float64 {
	// Get the number of rows
	rows, _ := KNN.Data.Dims()
	rownumbers := make(map[int]float64)
	labels := make([]float64, 0)

	// Check what distance function we are using
	var distanceFunc pairwise.PairwiseDistanceFunc
	switch KNN.DistanceFunc {
	case "euclidean":
		distanceFunc = pairwise.NewEuclidean()
	case "manhattan":
		distanceFunc = pairwise.NewManhattan()
	default:
		panic("unsupported distance function")
	}

	for i := 0; i < rows; i++ {
		row := KNN.Data.RowView(i)
		distance := distanceFunc.Distance(utilities.VectorToMatrix(row), vector)
		rownumbers[i] = distance
	}

	sorted := utilities.SortIntMap(rownumbers)
	values := sorted[:K]

	var sum float64
	for _, elem := range values {
		value := KNN.Values[elem]
		labels = append(labels, value)
		sum += value
	}

	average := sum / float64(K)
	return average
}
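The regressor differs from the classifier only in its last step: instead of voting, it averages the target values of the K nearest rows. A standalone sketch of that step, with illustrative helpers (kNearestAverage and euclidean are not golearn APIs):

package main

import (
	"fmt"
	"math"
	"sort"
)

// euclidean returns the Euclidean distance between two equal-length vectors.
func euclidean(a, b []float64) float64 {
	var sum float64
	for i := range a {
		d := a[i] - b[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// kNearestAverage returns the mean target value over the K training rows closest to query.
func kNearestAverage(train [][]float64, values []float64, query []float64, K int) float64 {
	idx := make([]int, len(train))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(i, j int) bool {
		return euclidean(train[idx[i]], query) < euclidean(train[idx[j]], query)
	})
	var sum float64
	for _, i := range idx[:K] {
		sum += values[i]
	}
	return sum / float64(K)
}

func main() {
	train := [][]float64{{0}, {1}, {10}}
	values := []float64{1.0, 3.0, 100.0}
	fmt.Println(kNearestAverage(train, values, []float64{0.4}, 2)) // prints 2
}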
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm.
func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	// Check what distance function we are using
	var distanceFunc pairwise.PairwiseDistanceFunc
	switch KNN.DistanceFunc {
	case "euclidean":
		distanceFunc = pairwise.NewEuclidean()
	case "manhattan":
		distanceFunc = pairwise.NewManhattan()
	default:
		panic("unsupported distance function")
	}

	// Check Compatibility
	allAttrs := base.CheckCompatible(what, KNN.TrainingData)
	if allAttrs == nil {
		// Don't have the same Attributes
		return nil
	}

	// Remove the Attributes which aren't numeric
	allNumericAttrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if fAttr, ok := a.(*base.FloatAttribute); ok {
			allNumericAttrs = append(allNumericAttrs, fAttr)
		}
	}

	// Generate return vector
	ret := base.GeneratePredictionVector(what)

	// Resolve Attribute specifications for both
	whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
	trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)

	// Reserve storage for the most similar items
	distances := make(map[int]float64)

	// Reserve storage for voting map
	maxmap := make(map[string]int)

	// Reserve storage for row computations
	trainRowBuf := make([]float64, len(allNumericAttrs))
	predRowBuf := make([]float64, len(allNumericAttrs))

	// Iterate over all outer rows
	what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) {
		// Read the float values out
		for i := range allNumericAttrs {
			predRowBuf[i] = base.UnpackBytesToFloat(predRow[i])
		}
		predMat := utilities.FloatsToMatrix(predRowBuf)

		// Find the closest match in the training data
		KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) {
			// Read the float values out
			for i := range allNumericAttrs {
				trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i])
			}
			// Compute the distance
			trainMat := utilities.FloatsToMatrix(trainRowBuf)
			distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat)
			return true, nil
		})

		sorted := utilities.SortIntMap(distances)
		values := sorted[:KNN.NearestNeighbours]

		// Reset maxmap from the previous row
		for a := range maxmap {
			maxmap[a] = 0
		}

		// Refresh maxmap with this row's neighbours
		for _, elem := range values {
			label := base.GetClass(KNN.TrainingData, elem)
			if _, ok := maxmap[label]; ok {
				maxmap[label]++
			} else {
				maxmap[label] = 1
			}
		}

		// Find the most popular class
		var maxClass string
		maxVal := -1
		for a := range maxmap {
			if maxmap[a] > maxVal {
				maxVal = maxmap[a]
				maxClass = a
			}
		}
		base.SetClass(ret, predRowNo, maxClass)
		return true, nil
	})

	return ret
}
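The later version of Predict (next listing) folds the reset/count/argmax block into a KNN.vote helper. The standalone sketch below shows the shape of that logic over plain label strings; the vote signature here is illustrative and differs from the method golearn actually calls, which works off training-row indices.

package main

import "fmt"

// vote resets the reusable count map, tallies one vote per label, and returns
// the label with the most votes; ties go to whichever label reaches the top first.
func vote(counts map[string]int, labels []string) string {
	for k := range counts {
		counts[k] = 0
	}
	best, bestCount := "", -1
	for _, l := range labels {
		counts[l]++
		if counts[l] > bestCount {
			best, bestCount = l, counts[l]
		}
	}
	return best
}

func main() {
	counts := make(map[string]int)
	fmt.Println(vote(counts, []string{"a", "b", "a"})) // prints "a"
	fmt.Println(vote(counts, []string{"b", "b", "a"})) // prints "b" (counts were reset)
}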
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm.
func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	// Check what distance function we are using
	var distanceFunc pairwise.PairwiseDistanceFunc
	switch KNN.DistanceFunc {
	case "euclidean":
		distanceFunc = pairwise.NewEuclidean()
	case "manhattan":
		distanceFunc = pairwise.NewManhattan()
	default:
		panic("unsupported distance function")
	}

	// Check Compatibility
	allAttrs := base.CheckCompatible(what, KNN.TrainingData)
	if allAttrs == nil {
		// Don't have the same Attributes
		return nil
	}

	// Use optimised version if permitted
	if KNN.AllowOptimisations {
		if KNN.DistanceFunc == "euclidean" {
			if KNN.canUseOptimisations(what) {
				return KNN.optimisedEuclideanPredict(what.(*base.DenseInstances))
			}
		}
	}
	fmt.Println("Optimisations are switched off")

	// Remove the Attributes which aren't numeric
	allNumericAttrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if fAttr, ok := a.(*base.FloatAttribute); ok {
			allNumericAttrs = append(allNumericAttrs, fAttr)
		}
	}

	// Generate return vector
	ret := base.GeneratePredictionVector(what)

	// Resolve Attribute specifications for both
	whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
	trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)

	// Reserve storage for the most similar items
	distances := make(map[int]float64)

	// Reserve storage for voting map
	maxmap := make(map[string]int)

	// Reserve storage for row computations
	trainRowBuf := make([]float64, len(allNumericAttrs))
	predRowBuf := make([]float64, len(allNumericAttrs))

	_, maxRow := what.Size()
	curRow := 0

	// Iterate over all outer rows
	what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) {
		// Report progress (the modulus of 1 means this prints for every row)
		if (curRow%1) == 0 && curRow > 0 {
			fmt.Printf("KNN: %.2f %% done\n", float64(curRow)*100.0/float64(maxRow))
		}
		curRow++

		// Read the float values out
		for i := range allNumericAttrs {
			predRowBuf[i] = base.UnpackBytesToFloat(predRow[i])
		}
		predMat := utilities.FloatsToMatrix(predRowBuf)

		// Find the closest match in the training data
		KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) {
			// Read the float values out
			for i := range allNumericAttrs {
				trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i])
			}
			// Compute the distance
			trainMat := utilities.FloatsToMatrix(trainRowBuf)
			distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat)
			return true, nil
		})

		sorted := utilities.SortIntMap(distances)
		values := sorted[:KNN.NearestNeighbours]

		maxClass := KNN.vote(maxmap, values)
		base.SetClass(ret, predRowNo, maxClass)
		return true, nil
	})

	return ret
}