Ejemplo n.º 1
0
func findBestSplit(partition base.FixedDataGrid) {
	var delta float64
	delta = math.MinInt64

	attrs := partition.AllAttributes()
	classAttrs := partition.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	fmt.Println(delta)
	fmt.Println(classAttrs)
	fmt.Println(reflect.TypeOf(partition))
	fmt.Println(reflect.TypeOf(candidates))

	for i, n := range attrs {
		fmt.Println(i)
		//fmt.Println(partition)
		fmt.Println(reflect.TypeOf(n))
		attributeSpec, _ := partition.GetAttribute(n)

		fmt.Println(partition.GetAttribute(n))
		_, rows := partition.Size()
		for j := 0; j < rows; j++ {
			data := partition.Get(attributeSpec, j)
			fmt.Println(base.UnpackBytesToFloat(data))
		}

	}
}
Ejemplo n.º 2
0
func (lr *LogisticRegression) Predict(X base.FixedDataGrid) base.FixedDataGrid {

	// Only support 1 class Attribute
	classAttrs := X.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic(fmt.Sprintf("%d Wrong number of classes", len(classAttrs)))
	}
	// Generate return structure
	ret := base.GeneratePredictionVector(X)
	classAttrSpecs := base.ResolveAttributes(ret, classAttrs)
	// Retrieve numeric non-class Attributes
	numericAttrs := base.NonClassFloatAttributes(X)
	numericAttrSpecs := base.ResolveAttributes(X, numericAttrs)

	// Allocate row storage
	row := make([]float64, len(numericAttrSpecs))
	X.MapOverRows(numericAttrSpecs, func(rowBytes [][]byte, rowNo int) (bool, error) {
		for i, r := range rowBytes {
			row[i] = base.UnpackBytesToFloat(r)
		}
		val := Predict(lr.model, row)
		vals := base.PackFloatToBytes(val)
		ret.Set(classAttrSpecs[0], rowNo, vals)
		return true, nil
	})

	return ret
}
Ejemplo n.º 3
0
// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
// the provided base.Instances.
func (b *BaggedModel) generateTrainingAttrs(model int, from base.FixedDataGrid) []base.Attribute {
	ret := make([]base.Attribute, 0)
	attrs := base.NonClassAttributes(from)
	if b.RandomFeatures == 0 {
		ret = attrs
	} else {
		for {
			if len(ret) >= b.RandomFeatures {
				break
			}
			attrIndex := rand.Intn(len(attrs))
			attr := attrs[attrIndex]
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}
	for _, a := range from.AllClassAttributes() {
		ret = append(ret, a)
	}
	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
Ejemplo n.º 4
0
// GenerateSplitRule returns the best attribute out of those randomly chosen
// which maximises Information Gain
func (r *RandomTreeRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {

	var consideredAttributes []base.Attribute

	// First step is to generate the random attributes that we'll consider
	allAttributes := base.AttributeDifferenceReferences(f.AllAttributes(), f.AllClassAttributes())
	maximumAttribute := len(allAttributes)

	attrCounter := 0
	for {
		if len(consideredAttributes) >= r.Attributes {
			break
		}
		selectedAttrIndex := rand.Intn(maximumAttribute)
		selectedAttribute := allAttributes[selectedAttrIndex]
		matched := false
		for _, a := range consideredAttributes {
			if a.Equals(selectedAttribute) {
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
		attrCounter++
	}

	return r.internalRule.GetSplitRuleFromSelection(consideredAttributes, f)
}
Ejemplo n.º 5
0
// GenerateSplitAttribute returns the non-class Attribute which maximises the
// information gain.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (r *InformationGainRuleGenerator) GenerateSplitAttribute(f base.FixedDataGrid) base.Attribute {

	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	return r.GetSplitAttributeFromSelection(candidates, f)
}
Ejemplo n.º 6
0
// GenerateSplitRule returns the non-class Attribute-based DecisionTreeRule
// which maximises the information gain.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (g *GiniCoefficientRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {

	attrs := f.AllAttributes()
	classAttrs := f.AllClassAttributes()
	candidates := base.AttributeDifferenceReferences(attrs, classAttrs)

	return g.GetSplitRuleFromSelection(candidates, f)
}
Ejemplo n.º 7
0
func generateClassWeightVectorFromFixed(X base.FixedDataGrid) []float64 {
	classAttrs := X.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Wrong number of class Attributes")
	}
	if _, ok := classAttrs[0].(*base.FloatAttribute); ok {
		ret := make([]float64, 2)
		for i := range ret {
			ret[i] = 1.0
		}
		return ret
	} else {
		panic("Must be a FloatAttribute")
	}
}
Ejemplo n.º 8
0
func processData(x base.FixedDataGrid) instances {
	_, rows := x.Size()

	result := make(instances, rows)

	// Retrieve numeric non-class Attributes
	numericAttrs := base.NonClassFloatAttributes(x)
	numericAttrSpecs := base.ResolveAttributes(x, numericAttrs)

	// Retrieve class Attributes
	classAttrs := x.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Only one classAttribute supported!")
	}

	// Check that the class Attribute is categorical
	// (with two values) or binary
	classAttr := classAttrs[0]
	if attr, ok := classAttr.(*base.CategoricalAttribute); ok {
		if len(attr.GetValues()) != 2 {
			panic("To many values for Attribute!")
		}
	} else if _, ok := classAttr.(*base.BinaryAttribute); ok {
	} else {
		panic("Wrong class Attribute type!")
	}

	// Convert each row
	x.MapOverRows(numericAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		// Allocate a new row
		probRow := make([]float64, len(numericAttrSpecs))

		// Read out the row
		for i, _ := range numericAttrSpecs {
			probRow[i] = base.UnpackBytesToFloat(row[i])
		}

		// Get the class for the values
		class := base.GetClass(x, rowNo)
		instance := instance{class, probRow}
		result[rowNo] = instance
		return true, nil
	})
	return result
}
Ejemplo n.º 9
0
func (m *OneVsAllModel) generateAttributes(from base.FixedDataGrid) map[base.Attribute]base.Attribute {
	attrs := from.AllAttributes()
	classAttrs := from.AllClassAttributes()
	if len(classAttrs) != 1 {
		panic("Only 1 class Attribute is supported!")
	}
	ret := make(map[base.Attribute]base.Attribute)
	for _, a := range attrs {
		ret[a] = a
		for _, b := range classAttrs {
			if a.Equals(b) {
				cur := base.NewFloatAttribute(b.GetName())
				ret[a] = cur
			}
		}
	}
	return ret
}
Ejemplo n.º 10
0
// Fit creates n filtered datasets (where n is the number of values
// a CategoricalAttribute can take) and uses them to train the
// underlying classifiers.
func (m *OneVsAllModel) Fit(using base.FixedDataGrid) {
	var classAttr *base.CategoricalAttribute
	// Do some validation
	classAttrs := using.AllClassAttributes()
	for _, a := range classAttrs {
		if c, ok := a.(*base.CategoricalAttribute); !ok {
			panic("Unsupported ClassAttribute type")
		} else {
			classAttr = c
		}
	}
	attrs := m.generateAttributes(using)

	// Find the highest stored value
	val := uint64(0)
	classVals := classAttr.GetValues()
	for _, s := range classVals {
		cur := base.UnpackBytesToU64(classAttr.GetSysValFromString(s))
		if cur > val {
			val = cur
		}
	}
	if val == 0 {
		panic("Must have more than one class!")
	}
	m.maxClassVal = val

	// Create individual filtered instances for training
	filters := make([]*oneVsAllFilter, val+1)
	classifiers := make([]base.Classifier, val+1)
	for i := uint64(0); i <= val; i++ {
		f := &oneVsAllFilter{
			attrs,
			classAttr,
			i,
		}
		filters[i] = f
		classifiers[i] = m.NewClassifierFunction(classVals[int(i)])
		classifiers[i].Fit(base.NewLazilyFilteredInstances(using, f))
	}

	m.filters = filters
	m.classifiers = classifiers
}
Ejemplo n.º 11
0
func convertInstancesToLabelVec(X base.FixedDataGrid) []float64 {
	// Get the class Attributes
	classAttrs := X.AllClassAttributes()
	// Only support 1 class Attribute
	if len(classAttrs) != 1 {
		panic(fmt.Sprintf("%d ClassAttributes (1 expected)", len(classAttrs)))
	}
	// ClassAttribute must be numeric
	if _, ok := classAttrs[0].(*base.FloatAttribute); !ok {
		panic(fmt.Sprintf("%s: ClassAttribute must be a FloatAttribute", classAttrs[0]))
	}
	// Allocate return structure
	_, rows := X.Size()
	labelVec := make([]float64, rows)
	// Resolve class Attribute specification
	classAttrSpecs := base.ResolveAttributes(X, classAttrs)
	X.MapOverRows(classAttrSpecs, func(row [][]byte, rowNo int) (bool, error) {
		labelVec[rowNo] = base.UnpackBytesToFloat(row[0])
		return true, nil
	})
	return labelVec
}
Ejemplo n.º 12
0
func getClassAttr(from base.FixedDataGrid) base.Attribute {
	allClassAttrs := from.AllClassAttributes()
	return allClassAttrs[0]
}
Ejemplo n.º 13
0
// Fill data matrix with Bernoulli Naive Bayes model. All values
// necessary for calculating prior probability and p(f_i)
func (nb *BernoulliNBClassifier) Fit(X base.FixedDataGrid) {

	// Check that all Attributes are binary
	classAttrs := X.AllClassAttributes()
	allAttrs := X.AllAttributes()
	featAttrs := base.AttributeDifference(allAttrs, classAttrs)
	for i := range featAttrs {
		if _, ok := featAttrs[i].(*base.BinaryAttribute); !ok {
			panic(fmt.Sprintf("%v: Should be BinaryAttribute", featAttrs[i]))
		}
	}
	featAttrSpecs := base.ResolveAttributes(X, featAttrs)

	// Check that only one classAttribute is defined
	if len(classAttrs) != 1 {
		panic("Only one class Attribute can be used")
	}

	// Number of features and instances in this training set
	_, nb.trainingInstances = X.Size()
	nb.attrs = featAttrs
	nb.features = len(featAttrs)

	// Number of instances in class
	nb.classInstances = make(map[string]int)

	// Number of documents with given term (by class)
	docsContainingTerm := make(map[string][]int)

	// This algorithm could be vectorized after binarizing the data
	// matrix. Since mat64 doesn't have this function, a iterative
	// version is used.
	X.MapOverRows(featAttrSpecs, func(docVector [][]byte, r int) (bool, error) {
		class := base.GetClass(X, r)

		// increment number of instances in class
		t, ok := nb.classInstances[class]
		if !ok {
			t = 0
		}
		nb.classInstances[class] = t + 1

		for feat := 0; feat < len(docVector); feat++ {
			v := docVector[feat]
			// In Bernoulli Naive Bayes the presence and absence of
			// features are considered. All non-zero values are
			// treated as presence.
			if v[0] > 0 {
				// Update number of times this feature appeared within
				// given label.
				t, ok := docsContainingTerm[class]
				if !ok {
					t = make([]int, nb.features)
					docsContainingTerm[class] = t
				}
				t[feat] += 1
			}
		}
		return true, nil
	})

	// Pre-calculate conditional probabilities for each class
	for c, _ := range nb.classInstances {
		nb.condProb[c] = make([]float64, nb.features)
		for feat := 0; feat < nb.features; feat++ {
			classTerms, _ := docsContainingTerm[c]
			numDocs := classTerms[feat]
			docsInClass, _ := nb.classInstances[c]

			classCondProb, _ := nb.condProb[c]
			// Calculate conditional probability with laplace smoothing
			classCondProb[feat] = float64(numDocs+1) / float64(docsInClass+1)
		}
	}
}
Ejemplo n.º 14
0
func (lr *LinearRegression) Fit(inst base.FixedDataGrid) error {

	// Retrieve row size
	_, rows := inst.Size()

	// Validate class Attribute count
	classAttrs := inst.AllClassAttributes()
	if len(classAttrs) != 1 {
		return fmt.Errorf("Only 1 class variable is permitted")
	}
	classAttrSpecs := base.ResolveAttributes(inst, classAttrs)

	// Retrieve relevant Attributes
	allAttrs := base.NonClassAttributes(inst)
	attrs := make([]base.Attribute, 0)
	for _, a := range allAttrs {
		if _, ok := a.(*base.FloatAttribute); ok {
			attrs = append(attrs, a)
		}
	}

	cols := len(attrs) + 1

	if rows < cols {
		return NotEnoughDataError
	}

	// Retrieve relevant Attribute specifications
	attrSpecs := base.ResolveAttributes(inst, attrs)

	// Split into two matrices, observed results (dependent variable y)
	// and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(rows, 1, nil)
	explVariables := mat64.NewDense(rows, cols, nil)

	// Build the observed matrix
	inst.MapOverRows(classAttrSpecs, func(row [][]byte, i int) (bool, error) {
		val := base.UnpackBytesToFloat(row[0])
		observed.Set(i, 0, val)
		return true, nil
	})

	// Build the explainatory variables
	inst.MapOverRows(attrSpecs, func(row [][]byte, i int) (bool, error) {
		// Set intercepts to 1.0
		explVariables.Set(i, 0, 1.0)
		for j, r := range row {
			explVariables.Set(i, j+1, base.UnpackBytesToFloat(r))
		}
		return true, nil
	})

	n := cols
	qr := new(mat64.QR)
	qr.Factorize(explVariables)
	var q, reg mat64.Dense
	q.QFromQR(qr)
	reg.RFromQR(qr)

	var transposed, qty mat64.Dense
	transposed.Clone(q.T())
	qty.Mul(&transposed, observed)

	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true
	lr.attrs = attrs
	lr.cls = classAttrs[0]
	return nil
}