Beispiel #1
0
// GenerateSplitRule returns the best attribute out of those randomly chosen
// which maximises Information Gain
func (r *RandomTreeRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {

	var consideredAttributes []base.Attribute

	// First step is to generate the random attributes that we'll consider
	allAttributes := base.AttributeDifferenceReferences(f.AllAttributes(), f.AllClassAttributes())
	maximumAttribute := len(allAttributes)

	attrCounter := 0
	for {
		if len(consideredAttributes) >= r.Attributes {
			break
		}
		selectedAttrIndex := rand.Intn(maximumAttribute)
		selectedAttribute := allAttributes[selectedAttrIndex]
		matched := false
		for _, a := range consideredAttributes {
			if a.Equals(selectedAttribute) {
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
		attrCounter++
	}

	return r.internalRule.GetSplitRuleFromSelection(consideredAttributes, f)
}
Beispiel #2
0
func findBestSplit(partition base.FixedDataGrid) {
	var delta float64
	delta = math.MinInt64

	attrs := partition.AllAttributes()
	classAttrs := partition.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	fmt.Println(delta)
	fmt.Println(classAttrs)
	fmt.Println(reflect.TypeOf(partition))
	fmt.Println(reflect.TypeOf(candidates))

	for i, n := range attrs {
		fmt.Println(i)
		//fmt.Println(partition)
		fmt.Println(reflect.TypeOf(n))
		attributeSpec, _ := partition.GetAttribute(n)

		fmt.Println(partition.GetAttribute(n))
		_, rows := partition.Size()
		for j := 0; j < rows; j++ {
			data := partition.Get(attributeSpec, j)
			fmt.Println(base.UnpackBytesToFloat(data))
		}

	}
}
Beispiel #3
0
// GenerateSplitAttribute returns the non-class Attribute which maximises the
// information gain.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (r *InformationGainRuleGenerator) GenerateSplitAttribute(f base.FixedDataGrid) base.Attribute {

	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	return r.GetSplitAttributeFromSelection(candidates, f)
}
Beispiel #4
0
// GenerateSplitRule returns the non-class Attribute-based DecisionTreeRule
// which maximises the information gain.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (g *GiniCoefficientRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {

	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	return g.GetSplitRuleFromSelection(candidates, f)
}
Beispiel #5
0
// Predict outputs a base.Instances containing predictions from this tree
func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) (base.FixedDataGrid, error) {
	predictions := base.GeneratePredictionVector(what)
	classAttr := getClassAttr(predictions)
	classAttrSpec, err := predictions.GetAttribute(classAttr)
	if err != nil {
		panic(err)
	}
	predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes())
	predAttrSpecs := base.ResolveAttributes(what, predAttrs)
	what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		cur := d
		for {
			if cur.Children == nil {
				predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class))
				break
			} else {
				splitVal := cur.SplitRule.SplitVal
				at := cur.SplitRule.SplitAttr
				ats, err := what.GetAttribute(at)
				if err != nil {
					//predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class))
					//break
					panic(err)
				}

				var classVar string
				if _, ok := ats.GetAttribute().(*base.FloatAttribute); ok {
					// If it's a numeric Attribute (e.g. FloatAttribute) check that
					// the value of the current node is greater than the old one
					classVal := base.UnpackBytesToFloat(what.Get(ats, rowNo))
					if classVal > splitVal {
						classVar = "1"
					} else {
						classVar = "0"
					}
				} else {
					classVar = ats.GetAttribute().GetStringFromSysVal(what.Get(ats, rowNo))
				}
				if next, ok := cur.Children[classVar]; ok {
					cur = next
				} else {
					// Suspicious of this
					var bestChild string
					for c := range cur.Children {
						bestChild = c
						if c > classVar {
							break
						}
					}
					cur = cur.Children[bestChild]
				}
			}
		}
		return true, nil
	})
	return predictions, nil
}
Beispiel #6
0
func (m *MultiLayerNet) convertToFloatInsts(X base.FixedDataGrid) base.FixedDataGrid {

	// Make sure everything's a FloatAttribute
	fFilt := filters.NewFloatConvertFilter()
	for _, a := range X.AllAttributes() {
		fFilt.AddAttribute(a)
	}
	fFilt.Train()
	insts := base.NewLazilyFilteredInstances(X, fFilt)
	return insts
}
Beispiel #7
0
// GenerateCrossFoldValidationConfusionMatrices divides the data into a number of folds
// then trains and evaluates the classifier on each fold, producing a new ConfusionMatrix.
func GenerateCrossFoldValidationConfusionMatrices(data base.FixedDataGrid, cls base.Classifier, folds int) ([]ConfusionMatrix, error) {
	_, rows := data.Size()

	// Assign each row to a fold
	foldMap := make([]int, rows)
	inverseFoldMap := make(map[int][]int)
	for i := 0; i < rows; i++ {
		fold := rand.Intn(folds)
		foldMap[i] = fold
		if _, ok := inverseFoldMap[fold]; !ok {
			inverseFoldMap[fold] = make([]int, 0)
		}
		inverseFoldMap[fold] = append(inverseFoldMap[fold], i)
	}

	ret := make([]ConfusionMatrix, folds)

	// Create training/test views for each fold
	for i := 0; i < folds; i++ {
		// Fold i is for testing
		testData := base.NewInstancesViewFromVisible(data, inverseFoldMap[i], data.AllAttributes())
		otherRows := make([]int, 0)
		for j := 0; j < folds; j++ {
			if i == j {
				continue
			}
			otherRows = append(otherRows, inverseFoldMap[j]...)
		}
		trainData := base.NewInstancesViewFromVisible(data, otherRows, data.AllAttributes())
		// Train
		err := cls.Fit(trainData)
		if err != nil {
			return nil, err
		}
		// Predict
		pred, err := cls.Predict(testData)
		if err != nil {
			return nil, err
		}
		// Evaluate
		cf, err := GetConfusionMatrix(testData, pred)
		if err != nil {
			return nil, err
		}
		ret[i] = cf
	}
	return ret, nil

}
Beispiel #8
0
func (m *OneVsAllModel) generateAttributes(from base.FixedDataGrid) map[base.Attribute]base.Attribute {
	attrs := from.AllAttributes()
	classAttrs := from.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Only 1 class Attribute is supported!")
	}
	ret := make(map[base.Attribute]base.Attribute)
	for _, a := range attrs {
		ret[a] = a
		for _, b := range classAttrs {
			if a.Equals(b) {
				cur := base.NewFloatAttribute(b.GetName())
				ret[a] = cur
			}
		}
	}
	return ret
}
Beispiel #9
0
// Predict outputs a base.Instances containing predictions from this tree
func (d *DecisionTreeNode) Predict(what base.FixedDataGrid) base.FixedDataGrid {
	predictions := base.GeneratePredictionVector(what)
	classAttr := getClassAttr(predictions)
	classAttrSpec, err := predictions.GetAttribute(classAttr)
	if err != nil {
		panic(err)
	}
	predAttrs := base.AttributeDifferenceReferences(what.AllAttributes(), predictions.AllClassAttributes())
	predAttrSpecs := base.ResolveAttributes(what, predAttrs)
	what.MapOverRows(predAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		cur := d
		for {
			if cur.Children == nil {
				predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class))
				break
			} else {
				at := cur.SplitAttr
				ats, err := what.GetAttribute(at)
				if err != nil {
					predictions.Set(classAttrSpec, rowNo, classAttr.GetSysValFromString(cur.Class))
					break
				}

				classVar := ats.GetAttribute().GetStringFromSysVal(what.Get(ats, rowNo))
				if next, ok := cur.Children[classVar]; ok {
					cur = next
				} else {
					var bestChild string
					for c := range cur.Children {
						bestChild = c
						if c > classVar {
							break
						}
					}
					cur = cur.Children[bestChild]
				}
			}
		}
		return true, nil
	})
	return predictions
}
Beispiel #10
0
// Fill data matrix with Bernoulli Naive Bayes model. All values
// necessary for calculating prior probability and p(f_i)
func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) {

	// Check that all Attributes are binary
	classAttrs := X.AllClassAttributes()
	allAttrs := X.AllAttributes()
	featAttrs := base.AttributeDifference(allAttrs, classAttrs)
	for i := range featAttrs {
		if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok {
			panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i]))
		}
	}
	featAttrSpecs := base.ResolveAttributes(X, featAttrs)

	// Check that only one classAttribute is defined
	if len(classAttrs) != 1 {
		panic("Only one class Attribute can be used")
	}

	// Number of features and instances in this training set
	_, nb.trainingInstances = X.Size()
	nb.attrs = featAttrs
	nb.features = len(featAttrs)

	// Number of instances in class
	nb.classInstances = make(map[string]int)

	// Number of documents with given term (by class)
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't have this function, a iterative
	// version is used.
	X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) {
		class := base.GetClass(X, r)

		// increment number of instances in class
		t, ok := nb.classInstances[class]
		if !ok {
			t = 0
		}
		nb.classInstances[class] = t + 1

		for feat := 0; feat < len(docVector); feat++ {
			v := docVector[feat]
			// In Bernoulli Naive Bayes the presence and absence of
			// features are considered. All non-zero values are
			// treated as presence.
			if v[0] > 0 {
				// Update number of times this feature appeared within
				// given label.
				t, ok := docsContainingTerm[class]
				if !ok {
					t = make([]int, nb.features)
					docsContainingTerm[class] = t
				}
				t[feat] += 1
			}
		}
		return true, nil
	})

	// Pre-calculate conditional probabilities for each class
	for c, _ := range nb.classInstances {
		nb.condProb[c] = make([]float64, nb.features)
		for feat := 0; feat < nb.features; feat++ {
			classTerms, _ := docsContainingTerm[c]
			numDocs := classTerms[feat]
			docsInClass, _ := nb.classInstances[c]

			classCondProb, _ := nb.condProb[c]
			// Calculate conditional probability with laplace smoothing
			classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1)
		}
	}
}