Example #1
// Prune eliminates branches which hurt accuracy
func (d *DecisionTreeNode) Prune(using *base.Instances) {
	// If you're a leaf, you're already pruned
	if d.Children == nil {
		return
	}
	if d.SplitAttr == nil {
		return
	}
	// Recursively prune the children of this node
	sub := using.DecomposeOnAttributeValues(d.SplitAttr)
	for k := range d.Children {
		if sub[k] == nil {
			continue
		}
		d.Children[k].Prune(sub[k])
	}

	// Get a baseline accuracy
	baselineAccuracy := computeAccuracy(d.Predict(using), using)

	// Speculatively remove the children and re-evaluate
	tmpChildren := d.Children
	d.Children = nil
	newAccuracy := computeAccuracy(d.Predict(using), using)

	// Keep the children removed if better, else restore
	if newAccuracy < baselineAccuracy {
		d.Children = tmpChildren
	}
}
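A minimal usage sketch, assuming trainData and holdOut are *base.Instances and rules implements RuleGenerator; InferID3Tree and Predict appear in later examples:

tree := InferID3Tree(trainData, rules)
tree.Prune(holdOut) // prune against data the tree was not trained on
predictions := tree.Predict(holdOut)
fmt.Println(predictions)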
Example #2
// GenerateSplitAttribute returns the best attribute out of those randomly chosen
// which maximises Information Gain
func (r *RandomTreeRuleGenerator) GenerateSplitAttribute(f *base.Instances) base.Attribute {

	// First step is to generate the random attributes that we'll consider
	maximumAttribute := f.GetAttributeCount()
	consideredAttributes := make([]int, 0, r.Attributes)
	// Note: assumes r.Attributes does not exceed the number of
	// non-class attributes, otherwise this loop never terminates
	for len(consideredAttributes) < r.Attributes {
		selectedAttribute := rand.Intn(maximumAttribute)
		if selectedAttribute == f.ClassIndex {
			continue
		}
		// Skip attributes which have already been selected
		matched := false
		for _, a := range consideredAttributes {
			if a == selectedAttribute {
				matched = true
				break
			}
		}
		if matched {
			continue
		}
		consideredAttributes = append(consideredAttributes, selectedAttribute)
	}

	return r.internalRule.GetSplitAttributeFromSelection(consideredAttributes, f)
}
Example #3
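// convertInstancesToLabelVec copies the class column of X into a
// []float64 with one entry per row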
func convertInstancesToLabelVec(X *base.Instances) []float64 {
	labelVec := make([]float64, X.Rows)
	for i := 0; i < X.Rows; i++ {
		labelVec[i] = X.Get(i, X.ClassIndex)
	}
	return labelVec
}
Example #4
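// Predict classifies each row of `what' using PredictOne and returns
// the predicted labels as a new set of Instances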
func (KNN *KNNClassifier) Predict(what *base.Instances) *base.Instances {
	ret := what.GeneratePredictionVector()
	for i := 0; i < what.Rows; i++ {
		ret.SetAttrStr(i, 0, KNN.PredictOne(what.GetRowVectorWithoutClass(i)))
	}
	return ret
}
Example #5
// Run discretises the set of Instances `on'
//
// IMPORTANT: ChiMergeFilter discretises in place.
func (c *ChiMergeFilter) Run(on *base.Instances) {
	if !c._Trained {
		panic("Call Build() beforehand")
	}
	for attr := range c.Tables {
		table := c.Tables[attr]
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			dis := 0
			// Find the last interval whose lower bound lies below val
			for j, k := range table {
				if k.Value < val {
					dis = j
					continue
				}
				break
			}
			on.Set(i, attr, float64(dis))
		}
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for _, k := range table {
			newAttribute.GetSysValFromString(fmt.Sprintf("%f", k.Value))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
Example #6
// GetSplitAttributeFromSelection returns the Attribute amongst
// consideredAttributes which maximises the information gain
//
// IMPORTANT: passing a zero-length consideredAttributes parameter will panic()
func (r *InformationGainRuleGenerator) GetSplitAttributeFromSelection(consideredAttributes []int, f *base.Instances) base.Attribute {

	// Compute the information gain at this node for each
	// considered attribute, and pick the one which
	// maximises it
	maxGain := math.Inf(-1)
	selectedAttribute := -1

	// Compute the base entropy
	classDist := f.GetClassDistribution()
	baseEntropy := getBaseEntropy(classDist)

	// Compute the information gain for each attribute
	for _, s := range consideredAttributes {
		proposedClassDist := f.GetClassDistributionAfterSplit(f.GetAttr(s))
		localEntropy := getSplitEntropy(proposedClassDist)
		informationGain := baseEntropy - localEntropy
		if informationGain > maxGain {
			maxGain = informationGain
			selectedAttribute = s
		}
	}

	// Return the attribute with the highest information gain
	return f.GetAttr(selectedAttribute)
}
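For reference, a minimal sketch of the Shannon entropy this rule relies on. The entropy helper below is hypothetical and stands in for the package's getBaseEntropy, whose definition is not shown here (getSplitEntropy would typically also weight each branch's entropy by its size); it uses the standard math package:

// entropy computes -sum(p * log2(p)) over a class distribution
func entropy(classDist map[string]int) float64 {
	total := 0
	for _, count := range classDist {
		total += count
	}
	result := 0.0
	for _, count := range classDist {
		if count == 0 {
			continue
		}
		p := float64(count) / float64(total)
		result -= p * math.Log2(p)
	}
	return result
}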
Example #7
// Run applies a trained BinningFilter to a set of Instances,
// discretising any numeric attributes added.
//
// IMPORTANT: Run discretises in-place, so make sure to take
// a copy if the original instances are still needed
//
// IMPORTANT: This function panic()s if the filter has not been
// trained. Call Build() before running this function
//
// IMPORTANT: Call Build() after adding any additional attributes.
// Otherwise, the training structure will be out of date from
// the values expected and could cause a panic.
func (b *BinningFilter) Run(on *base.Instances) {
	if !b.trained {
		panic("Call Build() beforehand")
	}
	for attr := range b.Attributes {
		minVal := b.MinVals[attr]
		maxVal := b.MaxVals[attr]
		disc := 0
		// Casts to float32 to replicate a floating point precision error
		delta := float32(maxVal - minVal)
		delta /= float32(b.BinCount)
		for i := 0; i < on.Rows; i++ {
			val := on.Get(i, attr)
			if val <= minVal {
				disc = 0
			} else {
				disc = int(math.Floor(float64(float32(val-minVal) / delta)))
				if disc >= b.BinCount {
					disc = b.BinCount - 1
				}
			}
			on.Set(i, attr, float64(disc))
		}
		newAttribute := new(base.CategoricalAttribute)
		newAttribute.SetName(on.GetAttr(attr).GetName())
		for i := 0; i < b.BinCount; i++ {
			newAttribute.GetSysValFromString(fmt.Sprintf("%d", i))
		}
		on.ReplaceAttr(attr, newAttribute)
	}
}
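To make the bin arithmetic concrete, a worked example with hypothetical values:

// With minVal = 0.0, maxVal = 10.0 and BinCount = 5, delta = 2.0:
//   val = 0.0  -> bin 0 (values at or below minVal map to bin 0)
//   val = 7.3  -> floor(7.3 / 2.0)  = bin 3
//   val = 10.0 -> floor(10.0 / 2.0) = 5, clamped to BinCount-1 = 4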
Example #8
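// convertInstancesToProblemVec copies every non-class column of X
// into a row-major [][]float64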
func convertInstancesToProblemVec(X *base.Instances) [][]float64 {
	problemVec := make([][]float64, X.Rows)
	for i := 0; i < X.Rows; i++ {
		problemVecCounter := 0
		problemVec[i] = make([]float64, X.Cols-1)
		for j := 0; j < X.Cols; j++ {
			if j == X.ClassIndex {
				continue
			}
			problemVec[i][problemVecCounter] = X.Get(i, j)
			problemVecCounter++
		}
	}
	return problemVec
}
Example #9
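// Predict runs the trained model over each row of X and returns the
// predictions as a new set of Instances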
func (lr *LogisticRegression) Predict(X *base.Instances) *base.Instances {
	ret := X.GeneratePredictionVector()
	row := make([]float64, X.Cols-1)
	for i := 0; i < X.Rows; i++ {
		rowCounter := 0
		for j := 0; j < X.Cols; j++ {
			if j != X.ClassIndex {
				row[rowCounter] = X.Get(i, j)
				rowCounter++
			}
		}
		ret.Set(i, 0, Predict(lr.model, row))
	}
	return ret
}
Example #10
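// Predict returns the fitted model's prediction for each row of X.
// Indexing regressionCoefficients by column assumes the class
// attribute is the final column (matching Fit, shown in a later
// example).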
func (lr *LinearRegression) Predict(X *base.Instances) (*base.Instances, error) {
	if !lr.fitted {
		return nil, NoTrainingDataError
	}

	ret := X.GeneratePredictionVector()
	for i := 0; i < X.Rows; i++ {
		prediction := lr.disturbance
		for j := 0; j < X.Cols; j++ {
			if j != X.ClassIndex {
				prediction += X.Get(i, j) * lr.regressionCoefficients[j]
			}
		}
		ret.Set(i, 0, prediction)
	}

	return ret, nil
}
Example #11
// GetConfusionMatrix builds a ConfusionMatrix from a set of reference (`ref')
// and generated (`gen') Instances.
func GetConfusionMatrix(ref *base.Instances, gen *base.Instances) map[string]map[string]int {

	if ref.Rows != gen.Rows {
		panic("Row counts should match")
	}

	ret := make(map[string]map[string]int)

	for i := 0; i < ref.Rows; i++ {
		referenceClass := ref.GetClass(i)
		predictedClass := gen.GetClass(i)
		if _, ok := ret[referenceClass]; !ok {
			ret[referenceClass] = make(map[string]int)
		}
		ret[referenceClass][predictedClass]++
	}
	return ret
}
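A follow-on sketch: overall accuracy can be read off the matrix this function returns. The accuracyFromConfusion helper below is hypothetical, not part of the library:

func accuracyFromConfusion(cm map[string]map[string]int) float64 {
	correct, total := 0, 0
	for ref, row := range cm {
		for pred, count := range row {
			total += count
			if ref == pred {
				// Diagonal entries are correct predictions
				correct += count
			}
		}
	}
	if total == 0 {
		return 0
	}
	return float64(correct) / float64(total)
}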
Example #12
// Predict outputs a base.Instances containing predictions from this tree
func (d *DecisionTreeNode) Predict(what *base.Instances) *base.Instances {
	outputAttrs := make([]base.Attribute, 1)
	outputAttrs[0] = what.GetClassAttr()
	predictions := base.NewInstances(outputAttrs, what.Rows)
	for i := 0; i < what.Rows; i++ {
		cur := d
		for {
			if cur.Children == nil {
				predictions.SetAttrStr(i, 0, cur.Class)
				break
			} else {
				at := cur.SplitAttr
				j := what.GetAttrIndex(at)
				if j == -1 {
					predictions.SetAttrStr(i, 0, cur.Class)
					break
				}
				classVar := at.GetStringFromSysVal(what.Get(i, j))
				if next, ok := cur.Children[classVar]; ok {
					cur = next
				} else {
					// No child matches the observed value, and Go's
					// map iteration order is random, so choose the
					// fallback deterministically: the first key in
					// sorted order greater than the value, or else
					// the largest key (uses the standard sort package)
					keys := make([]string, 0, len(cur.Children))
					for c := range cur.Children {
						keys = append(keys, c)
					}
					sort.Strings(keys)
					bestChild := keys[len(keys)-1]
					for _, c := range keys {
						if c > classVar {
							bestChild = c
							break
						}
					}
					cur = cur.Children[bestChild]
				}
			}
		}
	}
	return predictions
}
Example #13
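// ChiMBuildFrequencyTable counts, for each distinct value of the given
// numeric attribute, how often each class label occurs alongside it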
func ChiMBuildFrequencyTable(attr int, inst *base.Instances) []*FrequencyTableEntry {
	ret := make([]*FrequencyTableEntry, 0)
	var attribute *base.FloatAttribute
	attribute, ok := inst.GetAttr(attr).(*base.FloatAttribute)
	if !ok {
		panic("only use Chi-M on numeric stuff")
	}
	for i := 0; i < inst.Rows; i++ {
		value := inst.Get(i, attr)
		valueConv := attribute.GetUsrVal(value)
		class := inst.GetClass(i)
		// Search the frequency table for the value
		found := false
		for _, entry := range ret {
			if entry.Value == valueConv {
				found = true
				entry.Frequency[class]++
				break
			}
		}
		if !found {
			newEntry := &FrequencyTableEntry{
				valueConv,
				make(map[string]int),
			}
			newEntry.Frequency[class] = 1
			ret = append(ret, newEntry)
		}
	}

	return ret
}
Example #14
// generateTrainingAttrs selects RandomFeatures number of base.Attributes from
// the provided base.Instances.
func (b *BaggedModel) generateTrainingAttrs(model int, from *base.Instances) []base.Attribute {
	ret := make([]base.Attribute, 0)
	if b.RandomFeatures == 0 {
		// Use every non-class attribute (the class attribute
		// is appended exactly once, below)
		for j := 0; j < from.Cols; j++ {
			if j == from.ClassIndex {
				continue
			}
			ret = append(ret, from.GetAttr(j))
		}
	} else {
		for len(ret) < b.RandomFeatures {
			attrIndex := rand.Intn(from.Cols)
			if attrIndex == from.ClassIndex {
				continue
			}
			attr := from.GetAttr(attrIndex)
			matched := false
			for _, a := range ret {
				if a.Equals(attr) {
					matched = true
					break
				}
			}
			if !matched {
				ret = append(ret, attr)
			}
		}
	}
	ret = append(ret, from.GetClassAttr())
	b.lock.Lock()
	b.selectedAttributes[model] = ret
	b.lock.Unlock()
	return ret
}
Example #15
// generateTrainingInstances samples the rows of `from' with
// replacement and restricts the result to the attributes
// selected for training this model
func (b *BaggedModel) generateTrainingInstances(model int, from *base.Instances) *base.Instances {
	insts := from.SampleWithReplacement(from.Rows)
	selected := b.generateTrainingAttrs(model, from)
	return insts.SelectAttributes(selected)
}
Example #16
// generatePredictionInstances returns a modified version of the
// requested base.Instances with only the base.Attributes selected
// for training the model.
func (b *BaggedModel) generatePredictionInstances(model int, from *base.Instances) *base.Instances {
	selected := b.selectedAttributes[model]
	return from.SelectAttributes(selected)
}
Example #17
// Predict gathers predictions from all the classifiers
// and outputs the most common (majority) class
//
// IMPORTANT: in the event of a tie, the first class which
// achieved the tie value is output.
func (b *BaggedModel) Predict(from *base.Instances) *base.Instances {
	n := runtime.NumCPU()
	// Channel to receive the results as they come in
	votes := make(chan *base.Instances, n)
	// Count the votes for each class
	voting := make(map[int](map[string]int))

	// Create a goroutine to collect the votes
	var votingwait sync.WaitGroup
	votingwait.Add(1)
	go func() {
		for incoming := range votes {
			// Step through each prediction
			for j := 0; j < incoming.Rows; j++ {
				// Allocate this row's vote counts on first sight
				if _, ok := voting[j]; !ok {
					voting[j] = make(map[string]int)
				}
				voting[j][incoming.GetClass(j)]++
			}
		}
		votingwait.Done()
	}()

	// Create workers to process the predictions
	processpipe := make(chan int, n)
	var processwait sync.WaitGroup
	for i := 0; i < n; i++ {
		processwait.Add(1)
		go func() {
			for i := range processpipe {
				c := b.Models[i]
				l := b.generatePredictionInstances(i, from)
				votes <- c.Predict(l)
			}
			processwait.Done()
		}()
	}

	// Send all the models to the workers for prediction
	for i := range b.Models {
		processpipe <- i
	}
	close(processpipe) // Finished sending models to be predicted
	processwait.Wait() // Predictors all finished processing
	close(votes)       // Close the vote channel and allow it to drain
	votingwait.Wait()  // All the votes are in

	// Generate the overall consensus
	ret := from.GeneratePredictionVector()
	for i := range voting {
		maxClass := ""
		maxCount := 0
		// Find the most popular class
		for c := range voting[i] {
			votes := voting[i][c]
			if votes > maxCount {
				maxClass = c
				maxCount = votes
			}
		}
		ret.SetAttrStr(i, 0, maxClass)
	}
	return ret
}
Example #18
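// Fit estimates the model's coefficients by ordinary least squares,
// using a QR decomposition followed by back-substitution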
func (lr *LinearRegression) Fit(inst *base.Instances) error {
	if inst.Rows < inst.GetAttributeCount() {
		return NotEnoughDataError
	}

	// Split into two matrices, observed results (dependent variable y)
	// and the explanatory variables (X) - see http://en.wikipedia.org/wiki/Linear_regression
	observed := mat64.NewDense(inst.Rows, 1, nil)
	explVariables := mat64.NewDense(inst.Rows, inst.GetAttributeCount(), nil)

	for i := 0; i < inst.Rows; i++ {
		observed.Set(i, 0, inst.Get(i, inst.ClassIndex)) // Set observed data

		for j := 0; j < inst.GetAttributeCount(); j++ {
			if j == 0 {
				// Set intercepts to 1.0
				// Could / should be done better: http://www.theanalysisfactor.com/interpret-the-intercept/
				explVariables.Set(i, 0, 1.0)
			} else {
				explVariables.Set(i, j, inst.Get(i, j-1))
			}
		}
	}

	n := inst.GetAttributeCount()
	qr := mat64.QR(explVariables)
	q := qr.Q()
	reg := qr.R()

	var transposed, qty mat64.Dense
	transposed.TCopy(q)
	qty.Mul(&transposed, observed)

	regressionCoefficients := make([]float64, n)
	for i := n - 1; i >= 0; i-- {
		regressionCoefficients[i] = qty.At(i, 0)
		for j := i + 1; j < n; j++ {
			regressionCoefficients[i] -= regressionCoefficients[j] * reg.At(i, j)
		}
		regressionCoefficients[i] /= reg.At(i, i)
	}

	lr.disturbance = regressionCoefficients[0]
	lr.regressionCoefficients = regressionCoefficients[1:]
	lr.fitted = true

	return nil
}
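A minimal usage sketch from within the same package, assuming train and test are *base.Instances (the zero value of LinearRegression is assumed to be a usable starting point here):

lr := new(LinearRegression)
if err := lr.Fit(train); err != nil {
	panic(err) // NotEnoughDataError: fewer rows than attributes
}
predictions, err := lr.Predict(test)
if err != nil {
	panic(err) // NoTrainingDataError if Fit was never called
}
fmt.Println(predictions)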
Example #19
// InferID3Tree builds a decision tree using a RuleGenerator
// from a set of Instances (implements the ID3 algorithm)
func InferID3Tree(from *base.Instances, with RuleGenerator) *DecisionTreeNode {
	// Count the number of classes at this node
	classes := from.CountClassValues()
	// If there's only one class, return a DecisionTreeLeaf with
	// the only class available
	if len(classes) == 1 {
		maxClass := ""
		for i := range classes {
			maxClass = i
		}
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	// Otherwise, compute the majority class at this node
	maxVal := 0
	maxClass := ""
	for i := range classes {
		if classes[i] > maxVal {
			maxClass = i
			maxVal = classes[i]
		}
	}

	// If there are no more Attributes left to split on,
	// return a DecisionTreeLeaf with the majority class
	if from.GetAttributeCount() == 2 {
		ret := &DecisionTreeNode{
			LeafNode,
			nil,
			nil,
			classes,
			maxClass,
			from.GetClassAttrPtr(),
		}
		return ret
	}

	ret := &DecisionTreeNode{
		RuleNode,
		nil,
		nil,
		classes,
		maxClass,
		from.GetClassAttrPtr(),
	}

	// Generate the splitting attribute
	splitOnAttribute := with.GenerateSplitAttribute(from)
	if splitOnAttribute == nil {
		// Can't determine, just return what we have
		return ret
	}
	// Split the attributes based on this attribute's value
	splitInstances := from.DecomposeOnAttributeValues(splitOnAttribute)
	// Create new children from these attributes
	ret.Children = make(map[string]*DecisionTreeNode)
	for k := range splitInstances {
		newInstances := splitInstances[k]
		ret.Children[k] = InferID3Tree(newInstances, with)
	}
	ret.SplitAttr = splitOnAttribute
	return ret
}