Example 1
// SingleTreeBuild grows one CART tree breadth-first. With bootstrap set, the
// root is seeded with a with-replacement resample of samples; otherwise every
// sample is used once. feature_select_prob controls how often a feature is
// considered during split search.
func (dt *CART) SingleTreeBuild(samples []*core.MapBasedSample, feature_select_prob float64, bootstrap bool) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}

	if !bootstrap {
		for i, sample := range samples {
			root.AddSample(i)
			root.prediction.AddValue(sample.Label, 1.0)
		}
	} else {
		for i := 0; i < len(samples); i++ {
			k := rand.Intn(len(samples))
			root.AddSample(k)
			root.prediction.AddValue(samples[k].Label, 1.0)
		}
	}
	root.sample_count = len(root.samples)
	root.prediction.Scale(1.0 / root.prediction.Sum())

	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		nodes := DTGetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}

		for _, node := range nodes {
			dt.AppendNodeToTree(samples, node, queue, &tree, feature_select_prob)
		}
	}
	return tree
}
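For context, a typical driver for SingleTreeBuild grows several bootstrapped trees and keeps them as an ensemble. A minimal sketch in the same vein (BuildForest is a hypothetical helper; the tree count and feature probability are illustrative, not values taken from the library):

// BuildForest is a hypothetical helper: it bags n trees, each grown on a
// bootstrap resample of samples with feature selection probability prob.
func BuildForest(dt *CART, samples []*core.MapBasedSample, n int, prob float64) []Tree {
	trees := make([]Tree, 0, n)
	for i := 0; i < n; i++ {
		trees = append(trees, dt.SingleTreeBuild(samples, prob, true))
	}
	return trees
}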
Example 2
// SingleTreeBuild grows one random decision tree, always seeding the root
// with a with-replacement bootstrap resample of samples.
func (rdt *RandomDecisionTree) SingleTreeBuild(samples []*core.MapBasedSample) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
	
	for i := 0; i < len(samples); i++ {
		k := rand.Intn(len(samples))
		root.AddSample(k)
		root.prediction.AddValue(samples[k].Label, 1.0)
	}
	root.sample_count = len(root.samples)
	root.prediction.Scale(1.0 / root.prediction.Sum())

	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		nodes := DTGetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}
		
		for _, node := range nodes {
			rdt.AppendNodeToTree(samples, node, queue, &tree)
		}
	}
	return tree
}
Example 3
// SingleTreeBuild grows one regression tree over samples, restricting split
// search to the features enabled in select_features. The root prediction is
// the mean of the samples' Prediction values.
func (dt *RegressionTree) SingleTreeBuild(samples []*core.MapBasedSample, select_features map[int64]bool) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
	total := 0.0
	positive := 0.0
	for i, sample := range samples {
		root.AddSample(i)
		total += 1.0
		positive += sample.Prediction
	}
	root.sample_count = len(root.samples)
	root.prediction.SetValue(0, positive / total)

	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		nodes := dt.GetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}
		
		for _, node := range nodes {
			dt.AppendNodeToTree(samples, node, queue, &tree, select_features)
		}
	}
	return tree
}
Example 4
// AppendNodeToTree splits node (unless it has reached MaxDepth or no useful
// split exists), partitions its samples between two children, and enqueues
// any child larger than MinLeafSize for further splitting.
func (dt *RegressionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, select_features map[int64]bool) {
	if node.depth >= dt.params.MaxDepth {
		return
	}
	
	dt.FindBestSplit(samples, node, select_features)

	if node.feature_split.Id < 0 {
		return
	}
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}

	left_positive := 0.0
	left_total := 0.0
	right_positive := 0.0
	right_total := 0.0
	for _, k := range node.samples {
		if dt.GoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
			left_positive += samples[k].Prediction
			left_total += 1.0
		} else {
			right_node.samples = append(right_node.samples, k)
			right_positive += samples[k].Prediction
			right_total += 1.0
		}
	}
	node.samples = nil
	
	if len(left_node.samples) > dt.params.MinLeafSize {
		left_node.sample_count = len(left_node.samples)
		left_node.prediction.SetValue(0, left_positive / left_total)
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}

	if len(right_node.samples) > dt.params.MinLeafSize {
		right_node.sample_count = len(right_node.samples)
		right_node.prediction.SetValue(0, right_positive / right_total)
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
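AppendNodeToTree records each child's position as len(tree.nodes) immediately before appending it, so left and right are indices into tree.nodes rather than pointers, with -1 marking a missing child. A traversal consistent with that layout is sketched below; PredictBySingleTree is referenced in Example 6 but not shown here, so walkTree is a hypothetical stand-in that assumes the root sits at index 0, as the builders above arrange, and takes the split test (dt.GoLeft / DTGoLeft in these examples) as a parameter.

// walkTree descends the index-linked tree until no matching child exists,
// returning the leaf node whose prediction would be used for sample.
func walkTree(tree *Tree, sample *core.MapBasedSample,
	goLeft func(*core.MapBasedSample, core.Feature) bool) *TreeNode {
	node := tree.nodes[0]
	for {
		next := node.right
		if goLeft(sample, node.feature_split) {
			next = node.left
		}
		if next < 0 {
			return node
		}
		node = tree.nodes[next]
	}
}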
Example 5
// AppendNodeToTree splits node with the continuous- or binary-feature search,
// depending on dt.continuous_features, then partitions its samples between
// two children and enqueues those larger than MinLeafSize.
func (dt *CART) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, feature_select_prob float64) {
	if node.depth >= dt.params.MaxDepth {
		return
	}

	if dt.continuous_features {
		dt.FindBestSplitOfContinusousFeature(samples, node, feature_select_prob)
	} else {
		dt.FindBestSplitOfBinaryFeature(samples, node, feature_select_prob)
	}
	if node.feature_split.Id < 0 {
		return
	}
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}

	for _, k := range node.samples {
		if DTGoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
			left_node.prediction.AddValue(samples[k].Label, 1.0)
		} else {
			right_node.samples = append(right_node.samples, k)
			right_node.prediction.AddValue(samples[k].Label, 1.0)
		}
	}
	node.samples = nil

	if len(left_node.samples) > dt.params.MinLeafSize {
		left_node.sample_count = len(left_node.samples)
		left_node.prediction.Scale(1.0 / left_node.prediction.Sum())
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}

	if len(right_node.samples) > dt.params.MinLeafSize {
		right_node.sample_count = len(right_node.samples)
		right_node.prediction.Scale(1.0 / right_node.prediction.Sum())
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
Example 6
// PredictMultiClass averages the leaf distributions reached in every tree of
// the ensemble, returning a normalized class-probability vector.
func (rdt *RandomDecisionTree) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	msample := sample.ToMapBasedSample()
	predictions := core.NewArrayVector()
	total := 0.0
	for _, tree := range rdt.trees {
		node, _ := PredictBySingleTree(tree, msample)
		predictions.AddVector(node.prediction, 1.0)
		total += 1.0
	}
	predictions.Scale(1.0 / total)
	return predictions
}
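The averaged vector holds one score per class; a hard label is its argmax. A minimal sketch, assuming core.ArrayVector exposes a GetValue(int) float64 accessor mirroring the SetValue call in Example 3 (both the accessor and the maxLabel bound are assumptions):

// ArgMaxLabel returns the label with the highest averaged score, scanning
// labels 0..maxLabel inclusive.
func ArgMaxLabel(pred *core.ArrayVector, maxLabel int) int {
	best := 0
	bestScore := pred.GetValue(0)
	for i := 1; i <= maxLabel; i++ {
		if s := pred.GetValue(i); s > bestScore {
			best, bestScore = i, s
		}
	}
	return best
}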
Example 7
// AppendNodeToTree fills in node's label distribution, picks a split feature
// at random from one of the node's samples, and appends any sufficiently
// large children to the tree and the work queue.
func (rdt *RandomDecisionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree) {
	node.prediction = core.NewArrayVector()
	for _, k := range node.samples {
		node.prediction.AddValue(samples[k].Label, 1.0)
	}
	node.prediction.Scale(1.0 / node.prediction.Sum())

	// Pick the split feature from one random sample owned by this node: the
	// first feature seen wins, and each later one replaces it with
	// probability 1/len(Features) (roughly uniform; see the sketch below).
	random_sample := samples[node.samples[rand.Intn(len(node.samples))]]

	split := core.Feature{Id: -1, Value: -1.0}
	for fid, fvalue := range random_sample.Features {
		if split.Id < 0 || rand.Intn(len(random_sample.Features)) == 0 {
			split.Id = fid
			split.Value = fvalue
		}
	}

	if split.Id < 0 || node.depth > rdt.params.MaxDepth {
		return
	}

	node.feature_split = split
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}

	for _, k := range node.samples {
		if DTGoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
		} else {
			right_node.samples = append(right_node.samples, k)
		}
	}
	node.samples = nil

	if len(left_node.samples) == 0 || len(right_node.samples) == 0 {
		return
	}

	if len(left_node.samples) > rdt.params.MinLeafSize {
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}

	if len(right_node.samples) > rdt.params.MinLeafSize {
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
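Note the split selection above: each feature after the first replaces the current pick with probability 1/len(Features), which is only approximately uniform even over Go's randomized map iteration order. A textbook size-one reservoir sample, for comparison (pickSplit is a hypothetical helper; the map type matches how random_sample.Features is ranged over above):

// pickSplit draws one (id, value) pair uniformly at random from a feature
// map: the i-th element seen (0-based) survives with probability 1/(i+1).
func pickSplit(features map[int64]float64) core.Feature {
	split := core.Feature{Id: -1, Value: -1.0}
	i := 0
	for fid, fvalue := range features {
		if rand.Intn(i+1) == 0 {
			split.Id = fid
			split.Value = fvalue
		}
		i++
	}
	return split
}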
Example 8
// FindBestSplitOfBinaryFeature scores every candidate binary feature by the
// Gini impurity of the present/absent partition it induces and stores the
// best one on node; an Id of -1 means no split beat GiniThreshold.
func (dt *CART) FindBestSplitOfBinaryFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
	feature_right_dis := make(map[int64]*core.ArrayVector)
	total_dis := core.NewArrayVector()
	for i, k := range node.samples {
		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
			continue
		}
		total_dis.AddValue(samples[k].Label, 1.0)
		for fid := range samples[k].Features {
			if dt.RandByFeatureId(fid) > feature_select_prob {
				continue
			}
			if _, ok := feature_right_dis[fid]; !ok {
				feature_right_dis[fid] = core.NewArrayVector()
			}
			feature_right_dis[fid].AddValue(samples[k].Label, 1.0)
		}
	}

	min_gini := 1.0
	node.feature_split = core.Feature{Id: -1, Value: 0}
	for fid, right_dis := range feature_right_dis {
		left_dis := total_dis.Copy()
		left_dis.AddVector(right_dis, -1.0)
		gini := core.Gini(left_dis, right_dis)
		if min_gini > gini {
			min_gini = gini
			node.feature_split.Id = fid
			node.feature_split.Value = 1.0
		}
	}
	if min_gini > dt.params.GiniThreshold {
		node.feature_split.Id = -1
		node.feature_split.Value = 0.0
	}
}
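The score minimized above is the size-weighted Gini impurity of the two child label distributions, delegated to core.Gini. A stand-alone version of that computation over plain count slices (the function name and types are illustrative, not the library's API):

// giniOfSplit computes the size-weighted Gini impurity of a two-way split
// from per-class counts in each child; 0 means both children are pure.
func giniOfSplit(left, right []float64) float64 {
	impurity := func(counts []float64) (float64, float64) {
		total := 0.0
		for _, c := range counts {
			total += c
		}
		if total == 0 {
			return 0.0, 0.0
		}
		sumSq := 0.0
		for _, c := range counts {
			p := c / total
			sumSq += p * p
		}
		return 1.0 - sumSq, total
	}
	gl, nl := impurity(left)
	gr, nr := impurity(right)
	if nl+nr == 0 {
		return 0.0
	}
	return (nl*gl + nr*gr) / (nl + nr)
}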
Example 9
// PredictMultiClass runs one forward pass: a sigmoid hidden layer (plus a
// constant bias unit), a linear output layer, and a softmax normalization.
func (algo *NeuralNetwork) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	y := core.NewVector()
	z := core.NewArrayVector()
	for i := int64(0); i < algo.Params.Hidden; i++ {
		sum := float64(0)
		for _, f := range sample.Features {
			sum += f.Value * algo.Model.L1.Data[i].GetValue(f.Id)
		}
		y.Data[i] = util.Sigmoid(sum)
	}
	y.Data[algo.Params.Hidden] = 1
	for i := 0; i <= int(algo.MaxLabel); i++ {
		sum := float64(0)
		for j := int64(0); j <= algo.Params.Hidden; j++ {
			sum += y.GetValue(j) * algo.Model.L2.GetValue(j, int64(i))
		}
		z.SetValue(i, sum)
	}
	z = z.SoftMaxNorm()
	return z
}
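The final SoftMaxNorm turns the raw output sums z_i into probabilities exp(z_i) / sum_j exp(z_j). A stand-alone equivalent over a plain slice, for reference (this illustrates the formula, not the library's implementation):

// softMax maps raw scores to a distribution summing to 1. Subtracting the
// maximum score first keeps math.Exp from overflowing on large inputs.
func softMax(z []float64) []float64 {
	maxZ := z[0]
	for _, v := range z[1:] {
		if v > maxZ {
			maxZ = v
		}
	}
	out := make([]float64, len(z))
	sum := 0.0
	for i, v := range z {
		out[i] = math.Exp(v - maxZ)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}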
Example 10
// PredictMultiClass scores every stored support vector against the sample
// with the configured kernel and lets the k most similar ones vote.
func (c *KNN) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	x := sample.GetFeatureVector()
	predictions := []*eval.LabelPrediction{}
	for i, s := range c.sv {
		predictions = append(predictions, &eval.LabelPrediction{Label: c.labels[i], Prediction: c.Kernel(s, x)})
	}

	compare := func(p1, p2 *eval.LabelPrediction) bool {
		return p1.Prediction > p2.Prediction
	}

	eval.By(compare).Sort(predictions)

	ret := core.NewArrayVector()
	for i, pred := range predictions {
		if i >= c.k { // vote with exactly the k most similar neighbors
			break
		}
		ret.AddValue(pred.Label, 1.0)
	}
	return ret
}
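Unlike the other PredictMultiClass implementations, the KNN version returns raw votes rather than a distribution: each of the k nearest support vectors contributes one unit to its label. Normalizing is a one-liner with the Scale and Sum methods seen in the tree examples (normalizeVotes is a hypothetical helper):

// normalizeVotes rescales raw neighbor counts so they sum to 1, matching
// the convention of the tree-based PredictMultiClass methods.
func normalizeVotes(votes *core.ArrayVector) *core.ArrayVector {
	if votes.Sum() > 0 {
		votes.Scale(1.0 / votes.Sum())
	}
	return votes
}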
Example 11
// FromString rebuilds the tree from its serialized form: a node count on the
// first line, then one tab-separated node per line (index, left, right,
// depth, prediction, sample_count, split feature id, split value).
func (t *Tree) FromString(buf string) {
	lines := strings.Split(buf, "\n")
	size, _ := strconv.Atoi(lines[0])
	t.nodes = make([]*TreeNode, size+1)
	for _, line := range lines[1:] {
		if len(line) == 0 {
			break
		}
		tks := strings.Split(line, "\t")
		node := TreeNode{}
		i, _ := strconv.Atoi(tks[0])
		node.left, _ = strconv.Atoi(tks[1])
		node.right, _ = strconv.Atoi(tks[2])
		node.depth, _ = strconv.Atoi(tks[3])
		node.prediction = core.NewArrayVector()
		node.prediction.FromString(tks[4])
		node.sample_count, _ = strconv.Atoi(tks[5])
		node.feature_split = core.Feature{}
		node.feature_split.Id, _ = strconv.ParseInt(tks[6], 10, 64)
		node.feature_split.Value, _ = strconv.ParseFloat(tks[7], 64)
		t.nodes[i] = &node
	}
}
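FromString expects the output of a matching serializer: a node count on the first line, then one tab-separated line per node. A sketch of the inverse for a single node, consistent with the parser above (the source does not show the serializer, so nodeLine and the prediction.ToString call are assumptions about the format, not the library's actual method):

// nodeLine renders one node in the layout FromString parses: index, left,
// right, depth, prediction, sample_count, feature id, feature value.
func nodeLine(i int, n *TreeNode) string {
	return fmt.Sprintf("%d\t%d\t%d\t%d\t%s\t%d\t%d\t%g",
		i, n.left, n.right, n.depth,
		n.prediction.ToString(), n.sample_count,
		n.feature_split.Id, n.feature_split.Value)
}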
Example 12
// FindBestSplitOfContinusousFeature (the misspelling matches its call sites)
// collects per-feature label distributions over feature values, asks each for
// its best Gini split, and stores the overall winner on node; an Id of -1
// means no split beat GiniThreshold.
func (dt *CART) FindBestSplitOfContinusousFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
	feature_weight_labels := make(map[int64]*core.FeatureLabelDistribution)
	total_dis := core.NewArrayVector()
	for i, k := range node.samples {
		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
			continue
		}
		total_dis.AddValue(samples[k].Label, 1.0)
		for fid, fvalue := range samples[k].Features {
			if dt.RandByFeatureId(fid) > feature_select_prob {
				continue
			}
			if _, ok := feature_weight_labels[fid]; !ok {
				feature_weight_labels[fid] = core.NewFeatureLabelDistribution()
			}
			feature_weight_labels[fid].AddWeightLabel(fvalue, samples[k].Label)
		}
	}

	min_gini := 1.0
	node.feature_split = core.Feature{Id: -1, Value: 0}
	for fid, distribution := range feature_weight_labels {
		sort.Sort(distribution)
		split, gini := distribution.BestSplitByGini(total_dis)
		if min_gini > gini {
			min_gini = gini
			node.feature_split.Id = fid
			node.feature_split.Value = split
		}
	}
	if min_gini > dt.params.GiniThreshold {
		node.feature_split.Id = -1
		node.feature_split.Value = 0.0
	}
}