func MaxEntComputeInstanceDerivative(
    weights *util.Matrix, instance *data.Instance, instanceDerivative *util.Matrix) {
    // Feature vector of this instance
    features := instance.Features

    // Get the dimension information
    numLabels := weights.NumLabels() + 1

    // Compute z = 1 + sum(exp(sum(w_i * x_i)))
    label := instance.Output.Label
    z := ComputeZ(weights, features, label, instanceDerivative)
    inverseZ := float64(1) / z

    for iLabel := 1; iLabel < numLabels; iLabel++ {
        vec := instanceDerivative.GetValues(iLabel - 1)
        if label == 0 || label != iLabel {
            vec.Multiply(inverseZ, 0, features)
        } else {
            vec.Multiply(inverseZ, -1, features)
        }
    }
}
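// A minimal standalone sketch (not part of this package) of the per-class
// gradient that MaxEntComputeInstanceDerivative stores in instanceDerivative.
// Given the class probabilities p[j] = exp(w_j . x) / z (see ComputeZ below),
// the gradient of the negative log-likelihood for class j >= 1 is
//   (p[j] - 1{j == goldLabel}) * x
// Dense slices only; all names here are illustrative, not part of the API.
func maxEntGradientSketch(probs []float64, x []float64, goldLabel int) [][]float64 {
    // probs[j] is P(label j | x) for j = 0..numLabels-1; row j-1 of the result
    // corresponds to label j, matching instanceDerivative.GetValues(j - 1).
    grad := make([][]float64, len(probs)-1)
    for j := 1; j < len(probs); j++ {
        row := make([]float64, len(x))
        indicator := 0.0
        if j == goldLabel {
            indicator = 1.0
        }
        for i := range x {
            row[i] = (probs[j] - indicator) * x[i]
        }
        grad[j-1] = row
    }
    return grad
}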
// Compute z = 1 + sum(exp(sum(w_i * x_i)))
//
// Saves exp(sum(w_i * x_i)) for each label in temp
func ComputeZ(weights *util.Matrix, features *util.Vector, label int,
    temp *util.Matrix) float64 {
    result := float64(1.0)
    numLabels := weights.NumLabels() + 1

    for iLabel := 1; iLabel < numLabels; iLabel++ {
        exp := math.Exp(util.VecDotProduct(features, weights.GetValues(iLabel-1)))
        result += exp

        tempVec := temp.GetValues(iLabel - 1)
        if tempVec.IsSparse() {
            for _, k := range features.Keys() {
                tempVec.Set(k, exp)
            }
        } else {
            tempVec.SetAll(exp)
        }
    }
    return result
}
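// A small standalone sketch (not part of this package) of how z normalizes the
// class probabilities: label 0 is the reference class with weights fixed at
// zero, which contributes the constant 1 in z, so
//   P(0 | x) = 1 / z,   P(j | x) = exp(w_j . x) / z   for j >= 1.
// Dense slices only; the function name is illustrative.
func maxEntProbabilitiesSketch(weights [][]float64, x []float64) []float64 {
    probs := make([]float64, len(weights)+1)
    z := 1.0
    probs[0] = 1.0
    for j := range weights {
        dot := 0.0
        for i := range x {
            dot += weights[j][i] * x[i]
        }
        probs[j+1] = math.Exp(dot)
        z += probs[j+1]
    }
    for j := range probs {
        probs[j] /= z
    }
    return probs
}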
func (opt *gdOptimizer) OptimizeWeights(
    weights *util.Matrix, derivative_func ComputeInstanceDerivativeFunc, set data.Dataset) {
    // Gradient (partial-derivative) vector
    derivative := weights.Populate()

    // Learning rate calculator
    learningRate := NewLearningRate(opt.options)

    // Optimization loop
    iterator := set.CreateIterator()
    step := 0
    var learning_rate float64
    convergingSteps := 0
    oldWeights := weights.Populate()
    weightsDelta := weights.Populate()
    instanceDerivative := weights.Populate()
    log.Print("Starting gradient descent optimization")
    for {
        if opt.options.MaxIterations > 0 && step >= opt.options.MaxIterations {
            break
        }
        step++

        // Zero the gradient vector before each pass over the instances
        derivative.Clear()

        // Iterate over all instances, computing and accumulating the gradient
        iterator.Start()
        instancesProcessed := 0
        for !iterator.End() {
            instance := iterator.GetInstance()
            derivative_func(weights, instance, instanceDerivative)
            derivative.Increment(instanceDerivative, 1.0/float64(set.NumInstances()))
            iterator.Next()
            instancesProcessed++

            if opt.options.GDBatchSize > 0 && instancesProcessed >= opt.options.GDBatchSize {
                // Add the regularization term
                derivative.Increment(ComputeRegularization(weights, opt.options),
                    float64(instancesProcessed)/(float64(set.NumInstances())*float64(set.NumInstances())))

                // Compute the increment to the feature weights
                delta := opt.GetDeltaX(weights, derivative)

                // Update the weights using the learning rate
                learning_rate = learningRate.ComputeLearningRate(delta)
                weights.Increment(delta, learning_rate)

                // Reset
                derivative.Clear()
                instancesProcessed = 0
            }
        }

        if instancesProcessed > 0 {
            // Process the remaining instances (the last partial batch)
            derivative.Increment(ComputeRegularization(weights, opt.options),
                float64(instancesProcessed)/(float64(set.NumInstances())*float64(set.NumInstances())))
            delta := opt.GetDeltaX(weights, derivative)
            learning_rate = learningRate.ComputeLearningRate(delta)
            weights.Increment(delta, learning_rate)
        }

        weightsDelta.WeightedSum(weights, oldWeights, 1, -1)
        oldWeights.DeepCopy(weights)
        weightsNorm := weights.Norm()
        weightsDeltaNorm := weightsDelta.Norm()
        log.Printf("#%d |w|=%1.3g |dw|/|w|=%1.3g lr=%1.3g",
            step, weightsNorm, weightsDeltaNorm/weightsNorm, learning_rate)

        // Check for numeric overflow
        if math.IsNaN(weightsNorm) {
            log.Fatal("Optimization failed: did not converge")
        }

        // Check for convergence
        if weightsDeltaNorm/weightsNorm < opt.options.ConvergingDeltaWeight {
            convergingSteps++
            if convergingSteps > opt.options.ConvergingSteps {
                log.Printf("Converged")
                break
            }
        }
    }
}
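// A compact standalone sketch (not part of this package) of the mini-batch
// update pattern used by gdOptimizer.OptimizeWeights: accumulate the averaged
// gradient over a batch, flush it into the weights scaled by the learning
// rate, then clear the accumulator; any trailing instances form a final
// partial batch. Plain float64 slices; gradOf and lr are hypothetical
// stand-ins for the per-instance derivative and the learning-rate calculator.
func miniBatchGDSketch(w []float64, instances [][]float64, batchSize int,
    gradOf func(w, x []float64) []float64, lr float64) {
    n := float64(len(instances))
    acc := make([]float64, len(w))
    processed := 0
    flush := func() {
        for i := range w {
            w[i] -= lr * acc[i] // delta = -gradient, as in gdOptimizer.GetDeltaX
            acc[i] = 0
        }
        processed = 0
    }
    for _, x := range instances {
        g := gradOf(w, x)
        for i := range acc {
            acc[i] += g[i] / n // average over the whole dataset, as above
        }
        processed++
        if batchSize > 0 && processed >= batchSize {
            flush()
        }
    }
    if processed > 0 {
        flush() // remaining instances form the last partial batch
    }
}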
// Given x_k and g_k, return the increment to apply to x: d_k = -g_k
func (opt *gdOptimizer) GetDeltaX(x, g *util.Matrix) *util.Matrix {
    return g.Opposite()
}
// Compute the regularization term to add to the gradient vector, according to
// the configured regularization scheme
func ComputeRegularization(weights *util.Matrix, options OptimizerOptions) *util.Matrix {
    reg := weights.Populate()

    if options.RegularizationScheme == 1 {
        // L1 regularization
        for iLabel := 0; iLabel < weights.NumLabels(); iLabel++ {
            for _, k := range weights.GetValues(iLabel).Keys() {
                if weights.Get(iLabel, k) > 0 {
                    reg.Set(iLabel, k, options.RegularizationFactor)
                } else {
                    reg.Set(iLabel, k, -options.RegularizationFactor)
                }
            }
        }
    } else if options.RegularizationScheme == 2 {
        // L2 regularization
        for iLabel := 0; iLabel < weights.NumLabels(); iLabel++ {
            for _, k := range weights.GetValues(iLabel).Keys() {
                reg.Set(iLabel, k,
                    options.RegularizationFactor*float64(2)*weights.Get(iLabel, k))
            }
        }
    }

    return reg
}
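// A standalone note (not part of this package) on what ComputeRegularization
// returns per weight w:
//   L1: d/dw of factor*|w|  = factor * sign(w)   (a subgradient at w = 0)
//   L2: d/dw of factor*w*w  = 2 * factor * w
// The sketch below mirrors that for a single dense weight slice; the name and
// parameters are illustrative only.
func regularizationGradientSketch(w []float64, scheme int, factor float64) []float64 {
    reg := make([]float64, len(w))
    for i, v := range w {
        switch scheme {
        case 1: // L1: subgradient of factor*|w|
            if v > 0 {
                reg[i] = factor
            } else {
                reg[i] = -factor
            }
        case 2: // L2: derivative of factor*w^2
            reg[i] = 2 * factor * v
        }
    }
    return reg
}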
func (opt *lbfgsOptimizer) OptimizeWeights(
    weights *util.Matrix, derivative_func ComputeInstanceDerivativeFunc, set data.Dataset) {
    // Learning rate calculator
    learningRate := NewLearningRate(opt.options)

    // Gradient (partial-derivative) vector
    derivative := weights.Populate()

    // Optimization loop
    step := 0
    convergingSteps := 0
    oldWeights := weights.Populate()
    weightsDelta := weights.Populate()

    // Allocate scratch resources for each worker goroutine
    numLbfgsThreads := *lbfgs_threads
    if numLbfgsThreads == 0 {
        numLbfgsThreads = runtime.NumCPU()
    }
    workerSet := make([]data.Dataset, numLbfgsThreads)
    workerDerivative := make([]*util.Matrix, numLbfgsThreads)
    workerInstanceDerivative := make([]*util.Matrix, numLbfgsThreads)
    for iWorker := 0; iWorker < numLbfgsThreads; iWorker++ {
        workerBuckets := []data.SkipBucket{
            {true, iWorker},
            {false, 1},
            {true, numLbfgsThreads - 1 - iWorker},
        }
        workerSet[iWorker] = data.NewSkipDataset(set, workerBuckets)
        workerDerivative[iWorker] = weights.Populate()
        workerInstanceDerivative[iWorker] = weights.Populate()
    }

    log.Print("Starting L-BFGS optimization")
    for {
        if opt.options.MaxIterations > 0 && step >= opt.options.MaxIterations {
            break
        }
        step++

        // Launch the worker goroutines
        workerChannel := make(chan int, numLbfgsThreads)
        for iWorker := 0; iWorker < numLbfgsThreads; iWorker++ {
            go func(iw int) {
                workerDerivative[iw].Clear()
                iterator := workerSet[iw].CreateIterator()
                iterator.Start()
                for !iterator.End() {
                    instance := iterator.GetInstance()
                    derivative_func(weights, instance, workerInstanceDerivative[iw])
                    workerDerivative[iw].Increment(
                        workerInstanceDerivative[iw], float64(1)/float64(set.NumInstances()))
                    iterator.Next()
                }
                workerChannel <- iw
            }(iWorker)
        }

        derivative.Clear()

        // Wait for the workers to finish, then merge their partial gradients
        for iWorker := 0; iWorker < numLbfgsThreads; iWorker++ {
            <-workerChannel
        }
        for iWorker := 0; iWorker < numLbfgsThreads; iWorker++ {
            derivative.Increment(workerDerivative[iWorker], 1)
        }

        // Add the regularization term
        derivative.Increment(ComputeRegularization(weights, opt.options),
            1.0/float64(set.NumInstances()))

        // Compute the increment to the feature weights
        delta := opt.GetDeltaX(weights, derivative)

        // Update the weights using the learning rate
        learning_rate := learningRate.ComputeLearningRate(delta)
        weights.Increment(delta, learning_rate)

        weightsDelta.WeightedSum(weights, oldWeights, 1, -1)
        oldWeights.DeepCopy(weights)
        weightsNorm := weights.Norm()
        weightsDeltaNorm := weightsDelta.Norm()
        log.Printf("#%d |dw|/|w|=%f |w|=%f lr=%1.3g",
            step, weightsDeltaNorm/weightsNorm, weightsNorm, learning_rate)

        // Check for numeric overflow
        if math.IsNaN(weightsNorm) {
            log.Fatal("Optimization failed: did not converge")
        }

        // Check for convergence
        if weightsDeltaNorm/weightsNorm < opt.options.ConvergingDeltaWeight {
            convergingSteps++
            if convergingSteps > opt.options.ConvergingSteps {
                log.Printf("Converged")
                break
            }
        } else {
            convergingSteps = 0
        }
    }
}
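// A standalone sketch (not part of this package) of the fan-out/fan-in pattern
// used above: each worker goroutine computes a partial gradient over its own
// shard of the data, signals completion on a channel, and the partial results
// are summed in the main goroutine, so no locking is needed. Plain slices;
// the sharding and gradOf function are hypothetical stand-ins.
func parallelGradientSketch(shards [][][]float64, w []float64,
    gradOf func(w, x []float64) []float64) []float64 {
    partials := make([][]float64, len(shards))
    done := make(chan int, len(shards))
    for s := range shards {
        go func(s int) {
            partial := make([]float64, len(w))
            for _, x := range shards[s] {
                g := gradOf(w, x)
                for i := range partial {
                    partial[i] += g[i]
                }
            }
            partials[s] = partial // each worker writes only its own slot
            done <- s
        }(s)
    }
    // Wait for all workers, then merge.
    for range shards {
        <-done
    }
    total := make([]float64, len(w))
    for _, partial := range partials {
        for i := range total {
            total[i] += partial[i]
        }
    }
    return total
}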
// Given x_k and g_k, return the increment to apply to x: d_k = -H_k * g_k
func (opt *lbfgsOptimizer) GetDeltaX(x, g *util.Matrix) *util.Matrix {
    if x.NumLabels() != g.NumLabels() {
        log.Fatal("x and g have inconsistent dimensions")
    }

    // Allocate memory on the first call
    if opt.k == 0 {
        if x.IsSparse() {
            opt.initStruct(x.NumLabels(), 0, x.IsSparse())
        } else {
            opt.initStruct(x.NumLabels(), x.NumValues(), x.IsSparse())
        }
    }

    currIndex := util.Mod(opt.k, *lbfgs_history_size)

    // Update x_k
    opt.x[currIndex].DeepCopy(x)

    // Update g_k
    opt.g[currIndex].DeepCopy(g)

    // On step 0, fall back to plain gradient descent
    if opt.k == 0 {
        opt.k++
        return g.Opposite()
    }

    prevIndex := util.Mod(opt.k-1, *lbfgs_history_size)

    // Update s_(k-1)
    opt.s[prevIndex].WeightedSum(opt.x[currIndex], opt.x[prevIndex], 1, -1)

    // Update y_(k-1)
    opt.y[prevIndex].WeightedSum(opt.g[currIndex], opt.g[prevIndex], 1, -1)

    // Update ro_(k-1)
    opt.ro.Set(prevIndex, 1.0/util.MatrixDotProduct(opt.y[prevIndex], opt.s[prevIndex]))

    // Lower bound for the two loops
    lowerBound := opt.k - *lbfgs_history_size
    if lowerBound < 0 {
        lowerBound = 0
    }

    // First loop
    opt.q.DeepCopy(g)
    for i := opt.k - 1; i >= lowerBound; i-- {
        currIndex := util.Mod(i, *lbfgs_history_size)
        opt.alpha.Set(currIndex,
            opt.ro.Get(currIndex)*util.MatrixDotProduct(opt.s[currIndex], opt.q))
        opt.q.Increment(opt.y[currIndex], -opt.alpha.Get(currIndex))
    }

    // Second loop
    opt.z.DeepCopy(opt.q)
    for i := lowerBound; i <= opt.k-1; i++ {
        currIndex := util.Mod(i, *lbfgs_history_size)
        opt.beta.Set(currIndex,
            opt.ro.Get(currIndex)*util.MatrixDotProduct(opt.y[currIndex], opt.z))
        opt.z.Increment(opt.s[currIndex], opt.alpha.Get(currIndex)-opt.beta.Get(currIndex))
    }

    // Advance k
    opt.k++

    return opt.z.Opposite()
}
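// A standalone sketch (not part of this package) of the classic L-BFGS
// two-loop recursion that GetDeltaX implements with a ring buffer: given the
// stored differences s_i = x_(i+1) - x_i and y_i = g_(i+1) - g_i, with
// rho_i = 1/(y_i . s_i), it turns the current gradient g into an approximate
// Newton direction -H*g. As in the code above, the initial Hessian is taken
// as the identity (no gamma scaling). Plain slices, no history ring; the
// function name is illustrative only.
func twoLoopRecursionSketch(g []float64, s, y [][]float64, rho []float64) []float64 {
    q := append([]float64(nil), g...)
    alpha := make([]float64, len(s))
    dot := func(a, b []float64) float64 {
        sum := 0.0
        for i := range a {
            sum += a[i] * b[i]
        }
        return sum
    }
    // First loop: newest to oldest history entry.
    for i := len(s) - 1; i >= 0; i-- {
        alpha[i] = rho[i] * dot(s[i], q)
        for j := range q {
            q[j] -= alpha[i] * y[i][j]
        }
    }
    // Second loop: oldest to newest history entry.
    z := q
    for i := 0; i < len(s); i++ {
        beta := rho[i] * dot(y[i], z)
        for j := range z {
            z[j] += (alpha[i] - beta) * s[i][j]
        }
    }
    // The update direction is the negative of z, as in opt.z.Opposite().
    for j := range z {
        z[j] = -z[j]
    }
    return z
}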