// SingleTreeBuild grows one random decision tree from a bootstrap resample of
// the training set, expanding nodes breadth-first via a queue.
func (rdt *RandomDecisionTree) SingleTreeBuild(samples []*core.MapBasedSample) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
	// Bootstrap: draw len(samples) indices with replacement.
	for i := 0; i < len(samples); i++ {
		k := rand.Intn(len(samples))
		root.AddSample(k)
		root.prediction.AddValue(samples[k].Label, 1.0)
	}
	root.sample_count = len(root.samples)
	// Normalize label counts into a probability distribution.
	root.prediction.Scale(1.0 / root.prediction.Sum())
	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		// Expand up to 10 queued nodes per batch until the queue drains.
		nodes := DTGetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}
		for _, node := range nodes {
			rdt.AppendNodeToTree(samples, node, queue, &tree)
		}
	}
	return tree
}
// SingleTreeBuild grows one CART tree. When bootstrap is true the root is
// seeded with a resample drawn with replacement; otherwise every sample is
// used exactly once. feature_select_prob controls per-feature subsampling
// during split search.
func (dt *CART) SingleTreeBuild(samples []*core.MapBasedSample, feature_select_prob float64, bootstrap bool) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
	if !bootstrap {
		for i, sample := range samples {
			root.AddSample(i)
			root.prediction.AddValue(sample.Label, 1.0)
		}
	} else {
		for i := 0; i < len(samples); i++ {
			k := rand.Intn(len(samples))
			root.AddSample(k)
			root.prediction.AddValue(samples[k].Label, 1.0)
		}
	}
	root.sample_count = len(root.samples)
	root.prediction.Scale(1.0 / root.prediction.Sum())
	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		nodes := DTGetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}
		for _, node := range nodes {
			dt.AppendNodeToTree(samples, node, queue, &tree, feature_select_prob)
		}
	}
	return tree
}
// SingleTreeBuild grows one regression tree. The root prediction is the mean
// target value over all samples; select_features restricts which feature ids
// are eligible for splits.
func (dt *RegressionTree) SingleTreeBuild(samples []*core.MapBasedSample, select_features map[int64]bool) Tree {
	tree := Tree{}
	queue := list.New()
	root := TreeNode{depth: 0, left: -1, right: -1, prediction: core.NewArrayVector(), samples: []int{}}
	total := 0.0
	positive := 0.0
	for i, sample := range samples {
		root.AddSample(i)
		total += 1.0
		positive += sample.Prediction
	}
	root.sample_count = len(root.samples)
	root.prediction.SetValue(0, positive/total)
	queue.PushBack(&root)
	tree.AddTreeNode(&root)
	for {
		nodes := dt.GetElementFromQueue(queue, 10)
		if len(nodes) == 0 {
			break
		}
		for _, node := range nodes {
			dt.AppendNodeToTree(samples, node, queue, &tree, select_features)
		}
	}
	return tree
}
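// Usage sketch (not part of the library): the three builders above share the
// same breadth-first pattern, so a caller can grow an ensemble by invoking
// SingleTreeBuild in a loop. This function, its treeCount parameter, and the
// 0.5 feature-selection probability are illustrative assumptions, not an
// existing API.
func buildCARTForestSketch(dt *CART, samples []*core.MapBasedSample, treeCount int) []Tree {
	trees := make([]Tree, 0, treeCount)
	for i := 0; i < treeCount; i++ {
		// Bootstrap each tree and consider roughly half the features per split.
		trees = append(trees, dt.SingleTreeBuild(samples, 0.5, true))
	}
	return trees
}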
// AppendNodeToTree splits a regression-tree node: it finds the best split,
// partitions the node's samples, and enqueues each child whose size exceeds
// MinLeafSize. Child predictions are the mean target value of their samples.
func (dt *RegressionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, select_features map[int64]bool) {
	if node.depth >= dt.params.MaxDepth {
		return
	}
	dt.FindBestSplit(samples, node, select_features)
	if node.feature_split.Id < 0 {
		return // no acceptable split found; the node stays a leaf
	}
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	left_positive := 0.0
	left_total := 0.0
	right_positive := 0.0
	right_total := 0.0
	for _, k := range node.samples {
		if dt.GoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
			left_positive += samples[k].Prediction
			left_total += 1.0
		} else {
			right_node.samples = append(right_node.samples, k)
			right_positive += samples[k].Prediction
			right_total += 1.0
		}
	}
	node.samples = nil // release the parent's sample list once partitioned
	if len(left_node.samples) > dt.params.MinLeafSize {
		left_node.sample_count = len(left_node.samples)
		left_node.prediction.SetValue(0, left_positive/left_total)
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}
	if len(right_node.samples) > dt.params.MinLeafSize {
		right_node.sample_count = len(right_node.samples)
		right_node.prediction.SetValue(0, right_positive/right_total)
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
// AppendNodeToTree splits a CART node: it picks the best split for the active
// feature type, partitions the samples, and enqueues each child whose size
// exceeds MinLeafSize. Child predictions are normalized label distributions.
func (dt *CART) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree, feature_select_prob float64) {
	if node.depth >= dt.params.MaxDepth {
		return
	}
	if dt.continuous_features {
		dt.FindBestSplitOfContinusousFeature(samples, node, feature_select_prob)
	} else {
		dt.FindBestSplitOfBinaryFeature(samples, node, feature_select_prob)
	}
	if node.feature_split.Id < 0 {
		return // no acceptable split found; the node stays a leaf
	}
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: core.NewArrayVector(), sample_count: 0, samples: []int{}}
	for _, k := range node.samples {
		if DTGoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
			left_node.prediction.AddValue(samples[k].Label, 1.0)
		} else {
			right_node.samples = append(right_node.samples, k)
			right_node.prediction.AddValue(samples[k].Label, 1.0)
		}
	}
	node.samples = nil
	if len(left_node.samples) > dt.params.MinLeafSize {
		left_node.sample_count = len(left_node.samples)
		left_node.prediction.Scale(1.0 / left_node.prediction.Sum())
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}
	if len(right_node.samples) > dt.params.MinLeafSize {
		right_node.sample_count = len(right_node.samples)
		right_node.prediction.Scale(1.0 / right_node.prediction.Sum())
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
func (rdt *RandomDecisionTree) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	msample := sample.ToMapBasedSample()
	predictions := core.NewArrayVector()
	total := 0.0
	for _, tree := range rdt.trees {
		node, _ := PredictBySingleTree(tree, msample)
		predictions.AddVector(node.prediction, 1.0)
		total += 1.0
	}
	predictions.Scale(1.0 / total)
	return predictions
}
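// Helper sketch: turning the averaged distribution into a single class label.
// It assumes core.ArrayVector exposes GetValue(int) float64, mirroring the
// int-indexed SetValue used elsewhere in this file; adjust if the actual API
// differs. maxLabel is a caller-supplied assumption, not a library field.
func argmaxLabelSketch(dist *core.ArrayVector, maxLabel int) int {
	best, bestScore := 0, dist.GetValue(0)
	for label := 1; label <= maxLabel; label++ {
		if score := dist.GetValue(label); score > bestScore {
			best, bestScore = label, score
		}
	}
	return best
}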
// AppendNodeToTree expands one random-tree node: it recomputes the node's
// label distribution, picks a split feature at random from one randomly
// chosen sample, and partitions the samples into two children.
func (rdt *RandomDecisionTree) AppendNodeToTree(samples []*core.MapBasedSample, node *TreeNode, queue *list.List, tree *Tree) {
	node.prediction = core.NewArrayVector()
	for _, k := range node.samples {
		node.prediction.AddValue(samples[k].Label, 1.0)
	}
	node.prediction.Scale(1.0 / node.prediction.Sum())
	// Pick a random sample, then pick one of its features: the first feature
	// visited is always taken, and each later one replaces it with a fixed
	// probability of 1/n, where n is the sample's feature count.
	random_sample := samples[node.samples[rand.Intn(len(node.samples))]]
	split := core.Feature{Id: -1, Value: -1.0}
	for fid, fvalue := range random_sample.Features {
		if split.Id < 0 || rand.Intn(len(random_sample.Features)) == 0 {
			split.Id = fid
			split.Value = fvalue
		}
	}
	if split.Id < 0 || node.depth > rdt.params.MaxDepth {
		return
	}
	node.feature_split = split
	left_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
	right_node := TreeNode{depth: node.depth + 1, left: -1, right: -1, prediction: nil, sample_count: 0, samples: []int{}}
	for _, k := range node.samples {
		if DTGoLeft(samples[k], node.feature_split) {
			left_node.samples = append(left_node.samples, k)
		} else {
			right_node.samples = append(right_node.samples, k)
		}
	}
	node.samples = nil
	if len(left_node.samples) == 0 || len(right_node.samples) == 0 {
		return // degenerate split: keep the node as a leaf
	}
	if len(left_node.samples) > rdt.params.MinLeafSize {
		queue.PushBack(&left_node)
		node.left = len(tree.nodes)
		tree.AddTreeNode(&left_node)
	}
	if len(right_node.samples) > rdt.params.MinLeafSize {
		queue.PushBack(&right_node)
		node.right = len(tree.nodes)
		tree.AddTreeNode(&right_node)
	}
}
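// For comparison, a textbook reservoir-sampling pick of one random feature:
// replacing the i-th visited key with probability 1/i yields an exactly
// uniform choice regardless of map iteration order. This sketch is
// illustrative only and assumes the Features map is map[int64]float64, as
// the core.Feature fields suggest.
func pickRandomFeatureSketch(features map[int64]float64) core.Feature {
	split := core.Feature{Id: -1, Value: -1.0}
	seen := 0
	for fid, fvalue := range features {
		seen++
		if rand.Intn(seen) == 0 { // probability 1/seen of replacing the pick
			split.Id = fid
			split.Value = fvalue
		}
	}
	return split
}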
// FindBestSplitOfBinaryFeature chooses the binary feature whose
// presence/absence split minimizes Gini impurity. Only the "right" (feature
// present) label counts are accumulated per feature; the left side is derived
// as total minus right.
func (dt *CART) FindBestSplitOfBinaryFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
	feature_right_dis := make(map[int64]*core.ArrayVector)
	total_dis := core.NewArrayVector()
	for i, k := range node.samples {
		// After the first 10 samples, subsample by SamplingRatio to bound cost.
		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
			continue
		}
		total_dis.AddValue(samples[k].Label, 1.0)
		for fid := range samples[k].Features {
			if dt.RandByFeatureId(fid) > feature_select_prob {
				continue
			}
			if _, ok := feature_right_dis[fid]; !ok {
				feature_right_dis[fid] = core.NewArrayVector()
			}
			feature_right_dis[fid].AddValue(samples[k].Label, 1.0)
		}
	}
	min_gini := 1.0
	node.feature_split = core.Feature{Id: -1, Value: 0}
	for fid, right_dis := range feature_right_dis {
		left_dis := total_dis.Copy()
		left_dis.AddVector(right_dis, -1.0)
		gini := core.Gini(left_dis, right_dis)
		if min_gini > gini {
			min_gini = gini
			node.feature_split.Id = fid
			node.feature_split.Value = 1.0
		}
	}
	// Reject the split if even the best impurity is above the threshold.
	if min_gini > dt.params.GiniThreshold {
		node.feature_split.Id = -1
		node.feature_split.Value = 0.0
	}
}
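// Sketch of the impurity the split search minimizes (our reading of what
// core.Gini computes, not its actual implementation): the size-weighted
// average of 1 - sum(p_i^2) over the two child label-count vectors.
func giniOfSplitSketch(left, right []float64) float64 {
	impurity := func(counts []float64) (float64, float64) {
		total := 0.0
		for _, c := range counts {
			total += c
		}
		if total == 0 {
			return 0.0, 0.0
		}
		sumSq := 0.0
		for _, c := range counts {
			p := c / total
			sumSq += p * p
		}
		return 1.0 - sumSq, total
	}
	gl, nl := impurity(left)
	gr, nr := impurity(right)
	if nl+nr == 0 {
		return 0.0
	}
	// Weight each child's impurity by its share of the samples.
	return (nl*gl + nr*gr) / (nl + nr)
}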
// PredictMultiClass runs the network forward: a sigmoid hidden layer with a
// constant bias unit, a linear output layer, and a softmax over the labels.
func (algo *NeuralNetwork) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	y := core.NewVector()
	z := core.NewArrayVector()
	for i := int64(0); i < algo.Params.Hidden; i++ {
		sum := float64(0)
		for _, f := range sample.Features {
			sum += f.Value * algo.Model.L1.Data[i].GetValue(f.Id)
		}
		y.Data[i] = util.Sigmoid(sum)
	}
	y.Data[algo.Params.Hidden] = 1 // bias unit feeding the output layer
	for i := 0; i <= int(algo.MaxLabel); i++ {
		sum := float64(0)
		for j := int64(0); j <= algo.Params.Hidden; j++ {
			sum += y.GetValue(j) * algo.Model.L2.GetValue(j, int64(i))
		}
		z.SetValue(i, sum)
	}
	z = z.SoftMaxNorm()
	return z
}
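// The forward pass above depends on two scalar transforms. util.Sigmoid and
// ArrayVector.SoftMaxNorm are the library's versions; the sketches below are
// assumptions about their math (with a max-shift added for numerical
// stability in the softmax) and assume "math" is imported.
func sigmoidSketch(x float64) float64 {
	return 1.0 / (1.0 + math.Exp(-x))
}

func softmaxSketch(z []float64) []float64 {
	maxZ := z[0]
	for _, v := range z {
		if v > maxZ {
			maxZ = v
		}
	}
	out := make([]float64, len(z))
	sum := 0.0
	for i, v := range z {
		out[i] = math.Exp(v - maxZ) // shift by the max to avoid overflow
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}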
// PredictMultiClass scores every stored sample against the query, sorts by
// kernel similarity, and lets the k most similar neighbors vote.
func (c *KNN) PredictMultiClass(sample *core.Sample) *core.ArrayVector {
	x := sample.GetFeatureVector()
	predictions := []*eval.LabelPrediction{}
	for i, s := range c.sv {
		predictions = append(predictions, &eval.LabelPrediction{Label: c.labels[i], Prediction: c.Kernel(s, x)})
	}
	compare := func(p1, p2 *eval.LabelPrediction) bool {
		return p1.Prediction > p2.Prediction
	}
	eval.By(compare).Sort(predictions)
	ret := core.NewArrayVector()
	for i, pred := range predictions {
		if i >= c.k { // take exactly the k nearest; ">" would count k+1 votes
			break
		}
		ret.AddValue(pred.Label, 1.0)
	}
	return ret
}
// FromString deserializes a tree. The first line holds the node count; each
// following tab-separated line is: index, left child, right child, depth,
// prediction, sample_count, split feature id, split feature value.
func (t *Tree) FromString(buf string) {
	lines := strings.Split(buf, "\n")
	size, _ := strconv.Atoi(lines[0])
	t.nodes = make([]*TreeNode, size+1)
	for _, line := range lines[1:] {
		if len(line) == 0 {
			break
		}
		tks := strings.Split(line, "\t")
		node := TreeNode{}
		i, _ := strconv.Atoi(tks[0])
		node.left, _ = strconv.Atoi(tks[1])
		node.right, _ = strconv.Atoi(tks[2])
		node.depth, _ = strconv.Atoi(tks[3])
		node.prediction = core.NewArrayVector()
		node.prediction.FromString(tks[4])
		node.sample_count, _ = strconv.Atoi(tks[5])
		node.feature_split = core.Feature{}
		node.feature_split.Id, _ = strconv.ParseInt(tks[6], 10, 64)
		node.feature_split.Value, _ = strconv.ParseFloat(tks[7], 64)
		t.nodes[i] = &node
	}
}
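// Example of the serialized form FromString expects (the numbers are made up
// for illustration). The first line is the node count; each node line is
// tab-separated in the field order parsed above:
//
//	3
//	0	1	2	0	<prediction>	100	5	0.73
//	1	-1	-1	1	<prediction>	60	-1	0.00
//	2	-1	-1	1	<prediction>	40	-1	0.00
//
// <prediction> stands for whatever encoding core.ArrayVector's FromString
// counterpart emits, which is not shown in this file.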
// FindBestSplitOfContinusousFeature chooses, for continuous features, the
// (feature, threshold) pair with minimum Gini impurity. Per-feature
// (value, label) pairs are collected, sorted by value, and scanned for the
// best threshold.
func (dt *CART) FindBestSplitOfContinusousFeature(samples []*core.MapBasedSample, node *TreeNode, feature_select_prob float64) {
	feature_weight_labels := make(map[int64]*core.FeatureLabelDistribution)
	total_dis := core.NewArrayVector()
	for i, k := range node.samples {
		// After the first 10 samples, subsample by SamplingRatio to bound cost.
		if i > 10 && rand.Float64() > dt.params.SamplingRatio {
			continue
		}
		total_dis.AddValue(samples[k].Label, 1.0)
		for fid, fvalue := range samples[k].Features {
			if dt.RandByFeatureId(fid) > feature_select_prob {
				continue
			}
			if _, ok := feature_weight_labels[fid]; !ok {
				feature_weight_labels[fid] = core.NewFeatureLabelDistribution()
			}
			feature_weight_labels[fid].AddWeightLabel(fvalue, samples[k].Label)
		}
	}
	min_gini := 1.0
	node.feature_split = core.Feature{Id: -1, Value: 0}
	for fid, distribution := range feature_weight_labels {
		sort.Sort(distribution)
		split, gini := distribution.BestSplitByGini(total_dis)
		if min_gini > gini {
			min_gini = gini
			node.feature_split.Id = fid
			node.feature_split.Value = split
		}
	}
	// Reject the split if even the best impurity is above the threshold.
	if min_gini > dt.params.GiniThreshold {
		node.feature_split.Id = -1
		node.feature_split.Value = 0.0
	}
}
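// Sketch of the threshold scan that BestSplitByGini presumably performs (an
// assumption about its internals, shown for clarity): with (value, label)
// pairs sorted ascending by value, sweep left to right, moving each pair's
// label count from the right distribution to the left, and score only the
// boundaries between distinct values. Reuses giniOfSplitSketch from above;
// the weightLabelSketch type and numLabels parameter are illustrative.
type weightLabelSketch struct {
	value float64
	label int
}

func bestThresholdSketch(pairs []weightLabelSketch, numLabels int) (float64, float64) {
	left := make([]float64, numLabels)
	right := make([]float64, numLabels)
	for _, p := range pairs {
		right[p.label] += 1.0
	}
	bestSplit, bestGini := 0.0, 1.0
	for i := 0; i+1 < len(pairs); i++ {
		left[pairs[i].label] += 1.0
		right[pairs[i].label] -= 1.0
		if pairs[i].value == pairs[i+1].value {
			continue // equal values cannot be separated by a threshold
		}
		if g := giniOfSplitSketch(left, right); g < bestGini {
			bestGini = g
			bestSplit = (pairs[i].value + pairs[i+1].value) / 2.0
		}
	}
	return bestSplit, bestGini
}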