// Predict is just a wrapper for the PredictOne function.
//
// IMPORTANT: Predict panics if Fit was not called or if the
// document vector and train matrix have a different number of columns.
func (nb *BernoulliNBClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	// Generate return vector
	ret := base.GeneratePredictionVector(what)

	// Get the features
	featAttrSpecs := base.ResolveAttributes(what, nb.attrs)

	what.MapOverRows(featAttrSpecs, func(row [][]byte, i int) (bool, error) {
		base.SetClass(ret, i, nb.PredictOne(row))
		return true, nil
	})

	return ret
}
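// A minimal usage sketch (an assumption for illustration, not part of the
// library source): Fit must be called before Predict, and both grids must
// share the same binary feature columns, since BernoulliNB models boolean
// term occurrence. Assumes golearn's base and naive packages are imported
// at the call site.
func exampleBernoulliNB(binaryTrain, binaryTest base.FixedDataGrid) base.FixedDataGrid {
	nb := naive.NewBernoulliNBClassifier()
	nb.Fit(binaryTrain) // Predict panics if this step is skipped
	return nb.Predict(binaryTest)
}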
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm.
func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	// Check which distance function we are using
	var distanceFunc pairwise.PairwiseDistanceFunc
	switch KNN.DistanceFunc {
	case "euclidean":
		distanceFunc = pairwise.NewEuclidean()
	case "manhattan":
		distanceFunc = pairwise.NewManhattan()
	default:
		panic("unsupported distance function")
	}

	// Check compatibility
	allAttrs := base.CheckCompatible(what, KNN.TrainingData)
	if allAttrs == nil {
		// Don't have the same Attributes
		return nil
	}

	// Remove the Attributes which aren't numeric
	allNumericAttrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if fAttr, ok := a.(*base.FloatAttribute); ok {
			allNumericAttrs = append(allNumericAttrs, fAttr)
		}
	}

	// Generate return vector
	ret := base.GeneratePredictionVector(what)

	// Resolve Attribute specifications for both
	whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
	trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)

	// Reserve storage for the most similar items
	distances := make(map[int]float64)

	// Reserve storage for voting map
	maxmap := make(map[string]int)

	// Reserve storage for row computations
	trainRowBuf := make([]float64, len(allNumericAttrs))
	predRowBuf := make([]float64, len(allNumericAttrs))

	// Iterate over all outer rows
	what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) {
		// Read the float values out
		for i := range allNumericAttrs {
			predRowBuf[i] = base.UnpackBytesToFloat(predRow[i])
		}
		predMat := utilities.FloatsToMatrix(predRowBuf)

		// Compute the distance to every row in the training data
		KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) {
			// Read the float values out
			for i := range allNumericAttrs {
				trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i])
			}
			// Compute the distance
			trainMat := utilities.FloatsToMatrix(trainRowBuf)
			distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat)
			return true, nil
		})

		// Keep only the nearest neighbours
		sorted := utilities.SortIntMap(distances)
		values := sorted[:KNN.NearestNeighbours]

		// Reset the vote counts
		for a := range maxmap {
			maxmap[a] = 0
		}

		// Count the votes among the nearest neighbours
		for _, elem := range values {
			label := base.GetClass(KNN.TrainingData, elem)
			maxmap[label]++
		}

		// Pick the class with the most votes
		var maxClass string
		maxVal := -1
		for a := range maxmap {
			if maxmap[a] > maxVal {
				maxVal = maxmap[a]
				maxClass = a
			}
		}

		base.SetClass(ret, predRowNo, maxClass)
		return true, nil
	})

	return ret
}
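// A minimal usage sketch (an assumption, not library source): split a
// dataset, fit, and predict. The constructor arguments ("euclidean", 2)
// match the distance functions handled by the switch above; the 0.50 split
// ratio is an arbitrary example value, and depending on the golearn version
// NewKnnClassifier may also take a kernel argument before the neighbour
// count. Assumes golearn's base and knn packages are imported at the call site.
func exampleKNN(rawData base.FixedDataGrid) base.FixedDataGrid {
	trainData, testData := base.InstancesTrainTestSplit(rawData, 0.50)
	cls := knn.NewKnnClassifier("euclidean", 2)
	cls.Fit(trainData)
	return cls.Predict(testData)
}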
// Predict gathers predictions from all the classifiers
// and outputs the most common (majority) class
//
// IMPORTANT: in the event of a tie, the first class which
// achieved the tie value is output.
func (b *BaggedModel) Predict(from base.FixedDataGrid) base.FixedDataGrid {
	n := runtime.NumCPU()
	// Channel to receive the results as they come in
	votes := make(chan base.DataGrid, n)
	// Count the votes for each class, per row
	voting := make(map[int]map[string]int)

	// Create a goroutine to collect the votes
	var votingwait sync.WaitGroup
	votingwait.Add(1)
	go func() {
		for {
			// Collect each classifier's predictions and tally the votes
			incoming, ok := <-votes
			if ok {
				cSpecs := base.ResolveAttributes(incoming, incoming.AllClassAttributes())
				incoming.MapOverRows(cSpecs, func(row [][]byte, predRow int) (bool, error) {
					// Check if we've seen this row before...
					if _, ok := voting[predRow]; !ok {
						// If we haven't, create an entry
						voting[predRow] = make(map[string]int)
					}
					voting[predRow][base.GetClass(incoming, predRow)]++
					return true, nil
				})
			} else {
				votingwait.Done()
				break
			}
		}
	}()

	// Create workers to process the predictions
	processpipe := make(chan int, n)
	var processwait sync.WaitGroup
	for i := 0; i < n; i++ {
		processwait.Add(1)
		go func() {
			for {
				if i, ok := <-processpipe; ok {
					c := b.Models[i]
					l := b.generatePredictionInstances(i, from)
					votes <- c.Predict(l)
				} else {
					processwait.Done()
					break
				}
			}
		}()
	}

	// Send all the models to the workers for prediction
	for i := range b.Models {
		processpipe <- i
	}
	close(processpipe) // Finished sending models to be predicted
	processwait.Wait() // Predictors all finished processing
	close(votes)       // Close the vote channel and allow it to drain
	votingwait.Wait()  // All the votes are in

	// Generate the overall consensus
	ret := base.GeneratePredictionVector(from)
	for i := range voting {
		maxClass := ""
		maxCount := 0
		// Find the most popular class
		for c := range voting[i] {
			votes := voting[i][c]
			if votes > maxCount {
				maxClass = c
				maxCount = votes
			}
		}
		base.SetClass(ret, i, maxClass)
	}
	return ret
}
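// A minimal usage sketch (an assumption, not library source): any
// base.Classifier registered with AddModel takes part in the majority vote
// above; ten two-attribute random trees are an arbitrary illustrative
// choice. Assumes golearn's base, ensemble and trees packages are imported
// at the call site.
func exampleBagging(trainData, testData base.FixedDataGrid) base.FixedDataGrid {
	bag := new(ensemble.BaggedModel)
	for i := 0; i < 10; i++ {
		bag.AddModel(trees.NewRandomTree(2))
	}
	bag.Fit(trainData)
	return bag.Predict(testData)
}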
// Predict returns a classification for the vector, based on a vector input, using the KNN algorithm.
func (KNN *KNNClassifier) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	// Check which distance function we are using
	var distanceFunc pairwise.PairwiseDistanceFunc
	switch KNN.DistanceFunc {
	case "euclidean":
		distanceFunc = pairwise.NewEuclidean()
	case "manhattan":
		distanceFunc = pairwise.NewManhattan()
	default:
		panic("unsupported distance function")
	}

	// Check compatibility
	allAttrs := base.CheckCompatible(what, KNN.TrainingData)
	if allAttrs == nil {
		// Don't have the same Attributes
		return nil
	}

	// Use the optimised version if permitted
	if KNN.AllowOptimisations {
		if KNN.DistanceFunc == "euclidean" {
			if KNN.canUseOptimisations(what) {
				return KNN.optimisedEuclideanPredict(what.(*base.DenseInstances))
			}
		}
	}
	fmt.Println("Optimisations are switched off")

	// Remove the Attributes which aren't numeric
	allNumericAttrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if fAttr, ok := a.(*base.FloatAttribute); ok {
			allNumericAttrs = append(allNumericAttrs, fAttr)
		}
	}

	// Generate return vector
	ret := base.GeneratePredictionVector(what)

	// Resolve Attribute specifications for both
	whatAttrSpecs := base.ResolveAttributes(what, allNumericAttrs)
	trainAttrSpecs := base.ResolveAttributes(KNN.TrainingData, allNumericAttrs)

	// Reserve storage for the most similar items
	distances := make(map[int]float64)

	// Reserve storage for voting map
	maxmap := make(map[string]int)

	// Reserve storage for row computations
	trainRowBuf := make([]float64, len(allNumericAttrs))
	predRowBuf := make([]float64, len(allNumericAttrs))

	_, maxRow := what.Size()
	curRow := 0

	// Iterate over all outer rows
	what.MapOverRows(whatAttrSpecs, func(predRow [][]byte, predRowNo int) (bool, error) {
		// Report progress every 100 rows
		if curRow%100 == 0 && curRow > 0 {
			fmt.Printf("KNN: %.2f %% done\n", float64(curRow)*100.0/float64(maxRow))
		}
		curRow++

		// Read the float values out
		for i := range allNumericAttrs {
			predRowBuf[i] = base.UnpackBytesToFloat(predRow[i])
		}
		predMat := utilities.FloatsToMatrix(predRowBuf)

		// Compute the distance to every row in the training data
		KNN.TrainingData.MapOverRows(trainAttrSpecs, func(trainRow [][]byte, srcRowNo int) (bool, error) {
			// Read the float values out
			for i := range allNumericAttrs {
				trainRowBuf[i] = base.UnpackBytesToFloat(trainRow[i])
			}
			// Compute the distance
			trainMat := utilities.FloatsToMatrix(trainRowBuf)
			distances[srcRowNo] = distanceFunc.Distance(predMat, trainMat)
			return true, nil
		})

		// Keep only the nearest neighbours, then vote
		sorted := utilities.SortIntMap(distances)
		values := sorted[:KNN.NearestNeighbours]

		maxClass := KNN.vote(maxmap, values)
		base.SetClass(ret, predRowNo, maxClass)
		return true, nil
	})

	return ret
}
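// A sketch (an assumption, not library source) of opting in to the optimised
// path above: AllowOptimisations is the field Predict checks, and the cgo
// euclidean branch is only taken for DenseInstances with the "euclidean"
// distance; otherwise Predict falls back to the generic loop and prints
// "Optimisations are switched off". Assumes golearn's base and knn packages
// are imported at the call site.
func exampleOptimisedKNN(rawData *base.DenseInstances) base.FixedDataGrid {
	cls := knn.NewKnnClassifier("euclidean", 2)
	cls.AllowOptimisations = true // opt in to the cgo euclidean path
	cls.Fit(rawData)
	return cls.Predict(rawData)
}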
func (KNN *KNNClassifier) optimisedEuclideanPredict(d *base.DenseInstances) base.FixedDataGrid {
	// Create return vector
	ret := base.GeneratePredictionVector(d)

	// Type-assert training data
	tr := KNN.TrainingData.(*base.DenseInstances)

	// Enumeration of AttributeGroups
	agPos := make(map[string]int)
	agTrain := tr.AllAttributeGroups()
	agPred := d.AllAttributeGroups()
	classAttrs := tr.AllClassAttributes()
	counter := 0
	for ag := range agTrain {
		// Detect whether the AttributeGroup has any classes in it
		attrs := agTrain[ag].Attributes()
		if len(base.AttributeIntersect(classAttrs, attrs)) == 0 {
			agPos[ag] = counter
		}
		counter++
	}

	// Pointers to the start of each prediction row
	rowPointers := make([]*C.double, len(agPred))
	trainPointers := make([]*C.double, len(agPred))
	rowSizes := make([]int, len(agPred))
	for ag := range agPred {
		if ap, ok := agPos[ag]; ok {
			rowPointers[ap] = (*C.double)(unsafe.Pointer(&(agPred[ag].Storage()[0])))
			trainPointers[ap] = (*C.double)(unsafe.Pointer(&(agTrain[ag].Storage()[0])))
			rowSizes[ap] = agPred[ag].RowSizeInBytes() / 8
		}
	}

	_, predRows := d.Size()
	_, trainRows := tr.Size()

	// Create the distance vector
	distanceVec := distanceRecs(make([]_Ctype_struct_dist, trainRows))

	// Additional data structures
	voteVec := make([]int, KNN.NearestNeighbours)
	maxMap := make(map[string]int)

	for row := 0; row < predRows; row++ {
		// Zero the accumulated distances for this prediction row
		for i := 0; i < trainRows; i++ {
			distanceVec[i].dist = 0
		}
		// Accumulate the squared euclidean distance, group by group, in C
		for ag := range agPred {
			if ap, ok := agPos[ag]; ok {
				C.euclidean_distance(
					&(distanceVec[0]),
					C.int(trainRows),
					C.int(len(agPred[ag].Attributes())),
					C.int(row),
					trainPointers[ap],
					rowPointers[ap],
				)
			}
		}
		// Sort by distance, then vote among the nearest neighbours
		sort.Sort(distanceVec)
		votes := distanceVec[:KNN.NearestNeighbours]
		for i, v := range votes {
			voteVec[i] = int(v.p)
		}
		maxClass := KNN.vote(maxMap, voteVec)
		base.SetClass(ret, row, maxClass)
	}
	return ret
}
func main() {
	// Instances can be read using ParseCSVToInstances
	rawData, err := base.ParseCSVToInstances("../datasets/iris_headers.csv", true)
	if err != nil {
		panic(err)
	}

	// Instances can be printed, and you'll see a human-readable summary
	// if you do so. The first section is a line like
	//	Instances with 150 row(s) and 5 attribute(s)
	//
	// It next prints all the attributes
	//	FloatAttribute(Sepal length)
	//	FloatAttribute(Sepal width)
	//	FloatAttribute(Petal length)
	//	FloatAttribute(Petal width)
	//	CategoricalAttribute([Iris-setosa Iris-versicolor Iris-virginica])
	//
	// The final attribute has an asterisk (*) printed before it,
	// meaning that it is the class variable. It then prints out up to
	// 30 rows which correspond to those attributes.
	//	5.10 3.50 1.40 0.20 Iris-setosa
	//	4.90 3.00 1.40 0.20 Iris-setosa
	fmt.Println(rawData)

	// If two decimal places isn't enough, you can update the
	// Precision field on any FloatAttribute
	if attr, ok := rawData.AllAttributes()[0].(*base.FloatAttribute); !ok {
		panic("Invalid cast")
	} else {
		attr.Precision = 4
	}
	// Now the first column has more precision
	fmt.Println(rawData)

	// We can update the set of Instances, although the API
	// for doing so is not very sophisticated.

	// First, we have to resolve Attribute Specifications
	as := base.ResolveAttributes(rawData, rawData.AllAttributes())

	// Attribute Specifications describe where a given column lives
	rawData.Set(as[0], 0, as[0].GetAttribute().GetSysValFromString("1.00"))

	// A SetClass method exists as a shortcut
	base.SetClass(rawData, 0, "Iris-unusual")
	fmt.Println(rawData)

	// There is a way of creating new Instances from scratch.
	// Inside an Instance, everything's stored as float64
	newData := make([]float64, 2)
	newData[0] = 1.0
	newData[1] = 0.0

	// Let's create some attributes
	attrs := make([]base.Attribute, 2)
	attrs[0] = base.NewFloatAttribute("Arbitrary Float Quantity")
	attrs[1] = new(base.CategoricalAttribute)
	attrs[1].SetName("Class")
	// Insert a standard class
	attrs[1].GetSysValFromString("A")

	// Now let's create the final instances set
	newInst := base.NewDenseInstances()

	// Add the attributes
	newSpecs := make([]base.AttributeSpec, len(attrs))
	for i, a := range attrs {
		newSpecs[i] = newInst.AddAttribute(a)
	}

	// Allocate space
	newInst.Extend(1)

	// Write the data
	newInst.Set(newSpecs[0], 0, newSpecs[0].GetAttribute().GetSysValFromString("1.0"))
	newInst.Set(newSpecs[1], 0, newSpecs[1].GetAttribute().GetSysValFromString("A"))

	fmt.Println(newInst)
}