// generateTrainingAttrs selects RandomFeatures non-class Attributes at random
// from the provided base.FixedDataGrid (all class Attributes are always
// included) and records the selection for the given model.
func (b *BaggedModel) generateTrainingAttrs(model int, from base.FixedDataGrid) []base.Attribute {
	ret := make([]base.Attribute, 0)
	attrs := base.NonClassAttributes(from)
	if b.RandomFeatures == 0 {
		ret = attrs
	} else {
		// Sample without replacement by rejecting duplicates
		for len(ret) < b.RandomFeatures {
			attrIndex := rand.Intn(len(attrs))
			attr := attrs[attrIndex]
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}
	// Always append the class Attributes
	for _, a := range from.AllClassAttributes() {
		ret = append(ret, a)
	}
	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
// convertToBinary wraps src so that every non-class Attribute is
// converted to one or more BinaryAttributes.
func convertToBinary(src base.FixedDataGrid) base.FixedDataGrid {
	// Convert to binary
	b := filters.NewBinaryConvertFilter()
	attrs := base.NonClassAttributes(src)
	for _, a := range attrs {
		b.AddAttribute(a)
	}
	b.Train()
	ret := base.NewLazilyFilteredInstances(src, b)
	return ret
}
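For orientation, a minimal sketch of how convertToBinary might be driven. The CSV path and the fmt printing are illustrative only; the AddAttribute/Train/NewLazilyFilteredInstances pattern it wraps is the same one exercised by TestBinaryFilter below.

func exampleConvertToBinary() {
	// Load a dataset with a header row (path is an assumption)
	inst, err := base.ParseCSVToInstances("dataset.csv", true)
	if err != nil {
		panic(err)
	}
	// Wrap the instances; non-class Attributes now read back as binary
	instF := convertToBinary(inst)
	fmt.Println(instF)
}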
// canUseOptimisations returns true if the optimised prediction path can be
// used: what must share the training data's exact layout, both must be
// DenseInstances, no class Attributes may be mixed into the data
// AttributeGroups, and all non-class Attributes must be numeric.
func (KNN *KNNClassifier) canUseOptimisations(what base.FixedDataGrid) bool {
	// Check that the two have exactly the same layout
	if !base.CheckStrictlyCompatible(what, KNN.TrainingData) {
		return false
	}
	// Check that the two are DenseInstances
	whatd, ok1 := what.(*base.DenseInstances)
	_, ok2 := KNN.TrainingData.(*base.DenseInstances)
	if !ok1 || !ok2 {
		return false
	}
	// Check that no class Attributes are mixed in with the data
	classAttrs := whatd.AllClassAttributes()
	normalAttrs := base.NonClassAttributes(whatd)
	// Retrieve all the AttributeGroups
	ags := whatd.AllAttributeGroups()
	classAttrGroups := make([]base.AttributeGroup, 0)
	for agName := range ags {
		ag := ags[agName]
		attrs := ag.Attributes()
		matched := false
		for _, a := range attrs {
			for _, c := range classAttrs {
				if a.Equals(c) {
					matched = true
					break
				}
			}
			if matched {
				break
			}
		}
		if matched {
			classAttrGroups = append(classAttrGroups, ag)
		}
	}
	for _, cag := range classAttrGroups {
		attrs := cag.Attributes()
		common := base.AttributeIntersect(normalAttrs, attrs)
		if len(common) != 0 {
			return false
		}
	}
	// Check that all of the Attributes are numeric
	for _, a := range normalAttrs {
		if _, ok := a.(*base.FloatAttribute); !ok {
			return false
		}
	}
	// If all checks pass, the optimised path can be used
	return true
}
func TestRandomForest(t *testing.T) {
	Convey("Given a valid CSV file", t, func() {
		inst, err := base.ParseCSVToInstances("../examples/datasets/iris_headers.csv", true)
		So(err, ShouldBeNil)

		Convey("When Chi-Merge filtering the data", func() {
			filt := filters.NewChiMergeFilter(inst, 0.90)
			for _, a := range base.NonClassFloatAttributes(inst) {
				filt.AddAttribute(a)
			}
			filt.Train()
			instf := base.NewLazilyFilteredInstances(inst, filt)

			Convey("Splitting the data into test and training sets", func() {
				trainData, testData := base.InstancesTrainTestSplit(instf, 0.60)

				Convey("Fitting and predicting with a Random Forest", func() {
					rf := NewRandomForest(10, 3)
					err = rf.Fit(trainData)
					So(err, ShouldBeNil)

					predictions, err := rf.Predict(testData)
					So(err, ShouldBeNil)

					confusionMat, err := evaluation.GetConfusionMatrix(testData, predictions)
					So(err, ShouldBeNil)

					Convey("Predictions should be somewhat accurate", func() {
						So(evaluation.GetAccuracy(confusionMat), ShouldBeGreaterThan, 0.35)
					})
				})
			})
		})

		Convey("Fitting with a Random Forest with too many features compared to the data", func() {
			rf := NewRandomForest(10, len(base.NonClassAttributes(inst))+1)
			err = rf.Fit(inst)
			Convey("Should return an error", func() {
				So(err, ShouldNotBeNil)
			})
		})
	})
}
// Fit builds the RandomForest on the specified instances
func (f *RandomForest) Fit(on base.FixedDataGrid) error {
	numNonClassAttributes := len(base.NonClassAttributes(on))
	if numNonClassAttributes < f.Features {
		return fmt.Errorf(
			"Random forest with %d features cannot fit data grid with %d non-class attributes",
			f.Features,
			numNonClassAttributes,
		)
	}

	f.Model = new(meta.BaggedModel)
	f.Model.RandomFeatures = f.Features
	for i := 0; i < f.ForestSize; i++ {
		tree := trees.NewID3DecisionTree(0.00)
		f.Model.AddModel(tree)
	}

	f.Model.Fit(on)
	return nil
}
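A minimal end-to-end sketch of how Fit is typically driven, mirroring the calls exercised in TestRandomForest above. The dataset path and the package qualifiers (golearn's usual ensemble/base/evaluation layout) are assumptions; treat this as a sketch rather than canonical usage.

package main

import (
	"fmt"

	"github.com/sjwhitworth/golearn/base"
	"github.com/sjwhitworth/golearn/ensemble"
	"github.com/sjwhitworth/golearn/evaluation"
)

func main() {
	// Load the iris dataset (path assumed)
	inst, err := base.ParseCSVToInstances("iris_headers.csv", true)
	if err != nil {
		panic(err)
	}
	// 60/40 train/test split
	trainData, testData := base.InstancesTrainTestSplit(inst, 0.60)

	// 10 trees, 3 randomly-selected features per tree
	rf := ensemble.NewRandomForest(10, 3)
	if err := rf.Fit(trainData); err != nil {
		panic(err)
	}
	predictions, err := rf.Predict(testData)
	if err != nil {
		panic(err)
	}
	cf, err := evaluation.GetConfusionMatrix(testData, predictions)
	if err != nil {
		panic(err)
	}
	fmt.Println(evaluation.GetAccuracy(cf))
}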
func TestBinaryFilter(t *testing.T) {
	Convey("Given a contrived dataset...", t, func() {

		// Read the contrived dataset
		inst, err := base.ParseCSVToInstances("./binary_test.csv", true)
		So(err, ShouldEqual, nil)

		// Add Attributes to the filter
		bFilt := NewBinaryConvertFilter()
		bAttrs := base.NonClassAttributes(inst)
		for _, a := range bAttrs {
			bFilt.AddAttribute(a)
		}
		bFilt.Train()

		// Construct a LazilyFilteredInstances to handle it
		instF := base.NewLazilyFilteredInstances(inst, bFilt)

		Convey("All the non-class Attributes should be binary...", func() {
			// Check that all the Attributes are the right type
			for _, a := range base.NonClassAttributes(instF) {
				_, ok := a.(*base.BinaryAttribute)
				So(ok, ShouldEqual, true)
			}
		})

		// Check that all the class Attributes made it
		Convey("All the class Attributes should have survived...", func() {
			origClassAttrs := inst.AllClassAttributes()
			newClassAttrs := instF.AllClassAttributes()
			intersectClassAttrs := base.AttributeIntersect(origClassAttrs, newClassAttrs)
			So(len(intersectClassAttrs), ShouldEqual, len(origClassAttrs))
		})

		// Check that the Attributes have the right names
		Convey("Attribute names should be correct...", func() {
			origNames := []string{"floatAttr", "shouldBe1Binary",
				"shouldBe3Binary_stoicism", "shouldBe3Binary_heroism",
				"shouldBe3Binary_romanticism", "arbitraryClass"}
			origMap := make(map[string]bool)
			for _, a := range origNames {
				origMap[a] = false
			}
			for _, a := range instF.AllAttributes() {
				name := a.GetName()
				_, ok := origMap[name]
				if !ok {
					t.Errorf("Weird: %s", name)
				}
				origMap[name] = true
			}
			for a := range origMap {
				So(origMap[a], ShouldEqual, true)
			}
		})

		// Check that the Attributes have been discretised correctly
		Convey("Discretisation should have worked", func() {
			// Build Attribute map
			attrMap := make(map[string]base.Attribute)
			for _, a := range instF.AllAttributes() {
				attrMap[a.GetName()] = a
			}
			// For each attribute
			for name := range attrMap {
				attr := attrMap[name]
				// Retrieve the AttributeSpec
				as, err := instF.GetAttribute(attr)
				So(err, ShouldEqual, nil)
				if name == "floatAttr" {
					So(instF.Get(as, 0), ShouldResemble, []byte{1})
					So(instF.Get(as, 1), ShouldResemble, []byte{1})
					So(instF.Get(as, 2), ShouldResemble, []byte{0})
				} else if name == "shouldBe1Binary" {
					So(instF.Get(as, 0), ShouldResemble, []byte{0})
					So(instF.Get(as, 1), ShouldResemble, []byte{1})
					So(instF.Get(as, 2), ShouldResemble, []byte{1})
				} else if name == "shouldBe3Binary_stoicism" {
					So(instF.Get(as, 0), ShouldResemble, []byte{1})
					So(instF.Get(as, 1), ShouldResemble, []byte{0})
					So(instF.Get(as, 2), ShouldResemble, []byte{0})
				} else if name == "shouldBe3Binary_heroism" {
					So(instF.Get(as, 0), ShouldResemble, []byte{0})
					So(instF.Get(as, 1), ShouldResemble, []byte{1})
					So(instF.Get(as, 2), ShouldResemble, []byte{0})
				} else if name == "shouldBe3Binary_romanticism" {
					So(instF.Get(as, 0), ShouldResemble, []byte{0})
					So(instF.Get(as, 1), ShouldResemble, []byte{0})
					So(instF.Get(as, 2), ShouldResemble, []byte{1})
				} else if name == "arbitraryClass" {
					// The class Attribute is not converted
				} else {
					t.Errorf("Shouldn't have %s", name)
				}
			}
		})
	})
}
// Predict uses the underlying network to produce predictions for the
// class variables of X.
//
// Can only predict one CategoricalAttribute at a time, or up to n
// FloatAttributes. Set or unset ClassAttributes to work around this
// limitation.
func (m *MultiLayerNet) Predict(X base.FixedDataGrid) base.FixedDataGrid {

	// Create the return vector
	ret := base.GeneratePredictionVector(X)

	// Make sure everything's a FloatAttribute
	insts := m.convertToFloatInsts(X)

	// Get the input/output Attributes
	inputAttrs := base.NonClassAttributes(insts)
	outputAttrs := ret.AllClassAttributes()

	// Compute the total number of layers
	layers := 2 + len(m.layers)

	// Check that we're operating in a singular mode
	floatMode := 0
	categoricalMode := 0
	for _, a := range outputAttrs {
		if _, ok := a.(*base.CategoricalAttribute); ok {
			categoricalMode++
		} else if _, ok := a.(*base.FloatAttribute); ok {
			floatMode++
		} else {
			panic("Unsupported output Attribute type!")
		}
	}

	if floatMode > 0 && categoricalMode > 0 {
		panic("Can't predict a mix of float and categorical Attributes")
	} else if categoricalMode > 1 {
		panic("Can't predict more than one categorical class Attribute")
	}

	// Create the activation vector
	a := mat64.NewDense(m.network.size, 1, make([]float64, m.network.size))

	// Resolve the input AttributeSpecs
	inputAs := base.ResolveAttributes(insts, inputAttrs)

	// Resolve the output AttributeSpecs
	outputAs := base.ResolveAttributes(ret, outputAttrs)

	// Map over each input row
	insts.MapOverRows(inputAs, func(row [][]byte, rc int) (bool, error) {
		// Clear the activation vector
		for i := 0; i < m.network.size; i++ {
			a.Set(i, 0, 0.0)
		}
		// Build the activation vector
		for i, vb := range row {
			if cIndex, ok := m.attrs[inputAs[i].GetAttribute()]; !ok {
				panic("Can't resolve the Attribute!")
			} else {
				a.Set(cIndex, 0, base.UnpackBytesToFloat(vb))
			}
		}
		// Robots, activate!
		m.network.Activate(a, layers)

		// Decide which class to set
		if floatMode > 0 {
			for _, as := range outputAs {
				cIndex := m.attrs[as.GetAttribute()]
				ret.Set(as, rc, base.PackFloatToBytes(a.At(cIndex, 0)))
			}
		} else {
			// Argmax over the output nodes selects the categorical class
			maxIndex := 0
			maxVal := 0.0
			for i := m.classAttrOffset; i < m.classAttrOffset+m.classAttrCount; i++ {
				val := a.At(i, 0)
				if val > maxVal {
					maxIndex = i
					maxVal = val
				}
			}
			maxIndex -= m.classAttrOffset
			ret.Set(outputAs[0], rc, base.PackU64ToBytes(uint64(maxIndex)))
		}
		return true, nil
	})

	return ret
}
// Fit trains the neural network on the given fixed datagrid.
//
// Training stops when the mean-squared error achieved is less
// than the Convergence value, or when back-propagation has occurred
// more times than the value set by MaxIterations.
func (m *MultiLayerNet) Fit(X base.FixedDataGrid) {

	// Make sure everything's a FloatAttribute
	insts := m.convertToFloatInsts(X)

	// The size of the first layer is the number of things
	// in the revised instances which aren't class Attributes
	inputAttrsVec := base.NonClassAttributes(insts)

	// The size of the output layer is the number of things
	// in the revised instances which are class Attributes
	classAttrsVec := insts.AllClassAttributes()

	// The total number of layers is input layer + output layer
	// plus the number of hidden layers specified
	totalLayers := 2 + len(m.layers)

	// The size is then augmented by the number of nodes
	// in the centre
	size := len(inputAttrsVec)
	size += len(classAttrsVec)
	hiddenSize := 0
	for _, a := range m.layers {
		size += a
		hiddenSize += a
	}

	// Enumerate the Attributes
	trainingAttrs := make(map[base.Attribute]int)
	classAttrs := make(map[base.Attribute]int)
	attrCounter := 0
	for i, a := range inputAttrsVec {
		attrCounter = i
		m.attrs[a] = attrCounter
		trainingAttrs[a] = attrCounter
	}
	m.classAttrOffset = attrCounter + 1
	for _, a := range classAttrsVec {
		attrCounter++
		m.attrs[a] = attrCounter + hiddenSize
		classAttrs[a] = attrCounter + hiddenSize
		m.classAttrCount++
	}

	// Create the underlying Network
	m.network = NewNetwork(size, len(inputAttrsVec), Sigmoid)

	// Initialise inter-hidden-layer weights to small random values
	layerOffset := len(inputAttrsVec)
	for i := 0; i < len(m.layers)-1; i++ {
		// Get the size of this layer
		thisLayerSize := m.layers[i]
		// Next layer size
		nextLayerSize := m.layers[i+1]
		// For every node in this layer
		for j := 1; j <= thisLayerSize; j++ {
			// Compute the offset
			nodeOffset1 := layerOffset + j
			// For every node in the next layer
			for k := 1; k <= nextLayerSize; k++ {
				// Compute offset
				nodeOffset2 := layerOffset + thisLayerSize + k
				// Set weight randomly
				m.network.SetWeight(nodeOffset1, nodeOffset2, rand.NormFloat64()*0.1)
			}
		}
		layerOffset += thisLayerSize
	}

	// Initialise biases for each hidden layer
	layerOffset = len(inputAttrsVec)
	for _, l := range m.layers {
		for j := 1; j <= l; j++ {
			nodeOffset := layerOffset + j
			m.network.SetBias(nodeOffset, rand.NormFloat64()*0.1)
		}
		layerOffset += l
	}

	// Initialise biases for the output layer
	for i := 0; i < len(classAttrsVec); i++ {
		nodeOffset := layerOffset + i
		m.network.SetBias(nodeOffset, rand.NormFloat64()*0.1)
	}

	// Connect the final hidden layer with the output layer
	layerOffset = len(inputAttrsVec)
	for i, l := range m.layers {
		if i == len(m.layers)-1 {
			for j := 1; j <= l; j++ {
				nodeOffset1 := layerOffset + j
				for k := 1; k <= len(classAttrsVec); k++ {
					nodeOffset2 := layerOffset + l + k
					m.network.SetWeight(nodeOffset1, nodeOffset2, rand.NormFloat64()*0.1)
				}
			}
		}
		layerOffset += l
	}

	// Connect the input layer with the first hidden layer
	// (or the output layer, if there are no hidden layers)
	for i := 1; i <= len(inputAttrsVec); i++ {
		nextLayerLen := 0
		if len(m.layers) > 0 {
			nextLayerLen = m.layers[0]
		} else {
			nextLayerLen = len(classAttrsVec)
		}
		for j := 1; j <= nextLayerLen; j++ {
			nodeOffset := len(inputAttrsVec) + j
			v := rand.NormFloat64() * 0.1
			m.network.SetWeight(i, nodeOffset, v)
		}
	}

	// Create the training activation vector
	trainVec := mat64.NewDense(size, 1, make([]float64, size))
	// Create the error vector
	errVec := mat64.NewDense(size, 1, make([]float64, size))

	// Resolve training AttributeSpecs
	trainAs := base.ResolveAllAttributes(insts)

	// Feed-forward, compute error and update for each training example
	// until convergence (or until MaxIterations is reached)
	for iteration := 0; iteration < m.MaxIterations; iteration++ {
		totalError := 0.0
		maxRow := 0
		insts.MapOverRows(trainAs, func(row [][]byte, i int) (bool, error) {

			maxRow = i
			// Clear the vectors
			for i := 0; i < size; i++ {
				trainVec.Set(i, 0, 0.0)
				errVec.Set(i, 0, 0.0)
			}

			// Build the vectors
			for i, vb := range row {
				v := base.UnpackBytesToFloat(vb)
				if attrIndex, ok := trainingAttrs[trainAs[i].GetAttribute()]; ok {
					// Add to the activation vector
					trainVec.Set(attrIndex, 0, v)
				} else if attrIndex, ok := classAttrs[trainAs[i].GetAttribute()]; ok {
					// Set in the error vector
					errVec.Set(attrIndex, 0, v)
				} else {
					panic("Should be able to find this Attribute!")
				}
			}

			// Activate the network
			m.network.Activate(trainVec, totalLayers-1)

			// Compute the error
			for a := range classAttrs {
				cIndex := classAttrs[a]
				errVec.Set(cIndex, 0, errVec.At(cIndex, 0)-trainVec.At(cIndex, 0))
			}

			// Update the total error
			totalError += math.Abs(errVec.Sum())

			// Back-propagate the error
			b := m.network.Error(trainVec, errVec, totalLayers)

			// Update the weights
			m.network.UpdateWeights(trainVec, b, m.LearningRate)

			// Update the biases
			m.network.UpdateBias(b, m.LearningRate)

			return true, nil
		})

		totalError /= float64(maxRow)
		// If we've converged, no need to carry on
		if totalError < m.Convergence {
			break
		}
	}
}
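A hedged usage sketch tying Fit and Predict together. Fit, Predict, and the exported MaxIterations/LearningRate/Convergence fields appear above; the NewMultiLayerNet constructor (taking the hidden-layer sizes that populate m.layers) and the neural package qualifier are assumptions about the surrounding library.

func exampleMultiLayerNet(trainData, testData base.FixedDataGrid) base.FixedDataGrid {
	// One hidden layer of 3 nodes (constructor name and signature assumed)
	net := neural.NewMultiLayerNet([]int{3})

	// These exported knobs are read by Fit (see above)
	net.MaxIterations = 2000
	net.LearningRate = 0.90
	net.Convergence = 0.001

	net.Fit(trainData)
	return net.Predict(testData)
}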
func TestFloatFilter(t *testing.T) {
	Convey("Given a contrived dataset...", t, func() {

		// Read the contrived dataset
		inst, err := base.ParseCSVToInstances("./binary_test.csv", true)
		So(err, ShouldEqual, nil)

		// Add Attributes to the filter
		bFilt := NewFloatConvertFilter()
		bAttrs := base.NonClassAttributes(inst)
		for _, a := range bAttrs {
			bFilt.AddAttribute(a)
		}
		bFilt.Train()

		// Construct a LazilyFilteredInstances to handle it
		instF := base.NewLazilyFilteredInstances(inst, bFilt)

		Convey("All the non-class Attributes should be floats...", func() {
			// Check that all the Attributes are the right type
			for _, a := range base.NonClassAttributes(instF) {
				_, ok := a.(*base.FloatAttribute)
				So(ok, ShouldEqual, true)
			}
		})

		// Check that all the class Attributes made it
		Convey("All the class Attributes should have survived...", func() {
			origClassAttrs := inst.AllClassAttributes()
			newClassAttrs := instF.AllClassAttributes()
			intersectClassAttrs := base.AttributeIntersect(origClassAttrs, newClassAttrs)
			So(len(intersectClassAttrs), ShouldEqual, len(origClassAttrs))
		})

		// Check that the Attributes have the right names
		Convey("Attribute names should be correct...", func() {
			origNames := []string{"floatAttr", "shouldBe1Binary",
				"shouldBe3Binary_stoicism", "shouldBe3Binary_heroism",
				"shouldBe3Binary_romanticism", "arbitraryClass"}
			origMap := make(map[string]bool)
			for _, a := range origNames {
				origMap[a] = false
			}
			for _, a := range instF.AllAttributes() {
				name := a.GetName()
				_, ok := origMap[name]
				So(ok, ShouldBeTrue)
				origMap[name] = true
			}
			for a := range origMap {
				So(origMap[a], ShouldEqual, true)
			}
		})

		Convey("All Attributes should be the correct type...", func() {
			for _, a := range instF.AllAttributes() {
				if a.GetName() == "arbitraryClass" {
					_, ok := a.(*base.CategoricalAttribute)
					So(ok, ShouldEqual, true)
				} else {
					_, ok := a.(*base.FloatAttribute)
					So(ok, ShouldEqual, true)
				}
			}
		})

		// Check that the Attributes have been converted correctly
		Convey("FloatConversion should have worked", func() {
			// Build Attribute map
			attrMap := make(map[string]base.Attribute)
			for _, a := range instF.AllAttributes() {
				attrMap[a.GetName()] = a
			}
			// For each attribute
			for name := range attrMap {
				So(name, ShouldBeIn, []string{
					"floatAttr",
					"shouldBe1Binary",
					"shouldBe3Binary_stoicism",
					"shouldBe3Binary_heroism",
					"shouldBe3Binary_romanticism",
					"arbitraryClass",
				})
				attr := attrMap[name]
				as, err := instF.GetAttribute(attr)
				So(err, ShouldEqual, nil)
				if name == "floatAttr" {
					So(instF.Get(as, 0), ShouldResemble, base.PackFloatToBytes(1.0))
					So(instF.Get(as, 1), ShouldResemble, base.PackFloatToBytes(1.0))
					So(instF.Get(as, 2), ShouldResemble, base.PackFloatToBytes(0.0))
				} else if name == "shouldBe1Binary" {
					So(instF.Get(as, 0), ShouldResemble, base.PackFloatToBytes(0.0))
					So(instF.Get(as, 1), ShouldResemble, base.PackFloatToBytes(1.0))
					So(instF.Get(as, 2), ShouldResemble, base.PackFloatToBytes(1.0))
				} else if name == "shouldBe3Binary_stoicism" {
					So(instF.Get(as, 0), ShouldResemble, base.PackFloatToBytes(1.0))
					So(instF.Get(as, 1), ShouldResemble, base.PackFloatToBytes(0.0))
					So(instF.Get(as, 2), ShouldResemble, base.PackFloatToBytes(0.0))
				} else if name == "shouldBe3Binary_heroism" {
					So(instF.Get(as, 0), ShouldResemble, base.PackFloatToBytes(0.0))
					So(instF.Get(as, 1), ShouldResemble, base.PackFloatToBytes(1.0))
					So(instF.Get(as, 2), ShouldResemble, base.PackFloatToBytes(0.0))
				} else if name == "shouldBe3Binary_romanticism" {
					So(instF.Get(as, 0), ShouldResemble, base.PackFloatToBytes(0.0))
					So(instF.Get(as, 1), ShouldResemble, base.PackFloatToBytes(0.0))
					So(instF.Get(as, 2), ShouldResemble, base.PackFloatToBytes(1.0))
				} else if name == "arbitraryClass" {
					// The class Attribute is not converted
				}
			}
		})
	})
}
// Fit fits the LinearRegression model to the given FixedDataGrid by
// ordinary least squares, solved via QR decomposition.
func (lr *LinearRegression) Fit(inst base.FixedDataGrid) error {

	// Retrieve the row count
	_, rows := inst.Size()

	// Validate class Attribute count
	classAttrs := inst.AllClassAttributes()
	if len(classAttrs) != 1 {
		return fmt.Errorf("Only 1 class variable is permitted")
	}
	classAttrSpecs := base.ResolveAttributes(inst, classAttrs)

	// Retrieve relevant Attributes
	allAttrs := base.NonClassAttributes(inst)
	attrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if _, ok := a.(*base.FloatAttribute); ok {
			attrs = append(attrs, a)
		}
	}

	cols := len(attrs) + 1
	if rows < cols {
		return NotEnoughDataError
	}

	// Retrieve relevant Attribute specifications
	attrSpecs := base.ResolveAttributes(inst, attrs)

	// Split into two matrices: the observed results (dependent variable y)
	// and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(rows, 1, nil)
	explVariables := mat64.NewDense(rows, cols, nil)

	// Build the observed matrix
	inst.MapOverRows(classAttrSpecs, func(row [][]byte, i int) (bool, error) {
		val := base.UnpackBytesToFloat(row[0])
		observed.Set(i, 0, val)
		return true, nil
	})

	// Build the explanatory variables
	inst.MapOverRows(attrSpecs, func(row [][]byte, i int) (bool, error) {
		// Set the intercept column to 1.0
		explVariables.Set(i, 0, 1.0)
		for j, r := range row {
			explVariables.Set(i, j+1, base.UnpackBytesToFloat(r))
		}
		return true, nil
	})

	n := cols
	qr := new(mat64.QR)
	qr.Factorize(explVariables)
	var q, reg mat64.Dense
	q.QFromQR(qr)
	reg.RFromQR(qr)

	var transposed, qty mat64.Dense
	transposed.Clone(q.T())
	qty.Mul(&transposed, observed)

	// Back-substitution: solve R * beta = Q^T * y, exploiting the
	// upper-triangular structure of R
	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true
	lr.attrs = attrs
	lr.cls = classAttrs[0]
	return nil
}
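A hedged usage sketch for LinearRegression. Only Fit is shown above; the NewLinearRegression constructor and the Predict signature are assumptions about the rest of the linear_models package.

func exampleLinearRegression(trainData, testData base.FixedDataGrid) {
	lr := NewLinearRegression() // constructor assumed
	if err := lr.Fit(trainData); err != nil {
		panic(err)
	}
	predictions, err := lr.Predict(testData) // Predict signature assumed
	if err != nil {
		panic(err)
	}
	fmt.Println(predictions)
}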