func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	if len(loc.X) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(loc.Gradient) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(dir) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}

	// Compute the gradient difference in the last step
	// y = g_{k+1} - g_{k}
	floats.SubTo(b.y, loc.Gradient, b.grad)

	// Compute the step difference
	// s = x_{k+1} - x_{k}
	floats.SubTo(b.s, loc.X, b.x)

	sDotY := floats.Dot(b.s, b.y)
	sDotYSquared := sDotY * sDotY

	if b.first {
		// Rescale the initial Hessian.
		// From: Numerical Optimization, Nocedal and Wright, page 143, eq. 6.20 (second edition).
		yDotY := floats.Dot(b.y, b.y)
		scale := sDotY / yDotY
		for i := 0; i < len(loc.X); i++ {
			for j := 0; j < len(loc.X); j++ {
				if i == j {
					b.invHess.SetSym(i, i, scale)
				} else {
					b.invHess.SetSym(i, j, 0)
				}
			}
		}
		b.first = false
	}

	// Compute the update rule
	//     B_{k+1}^-1
	// First term is just the existing inverse Hessian.
	// Second term is
	//     (s_k^T y_k + y_k^T B_k^-1 y_k)(s_k s_k^T) / (s_k^T y_k)^2
	// Third term is
	//     (B_k^-1 y_k s_k^T + s_k y_k^T B_k^-1) / (s_k^T y_k)
	//
	// y_k^T B_k^-1 y_k is a scalar, and the third term is a rank-two update
	// where B_k^-1 y_k is one vector and s_k is the other. Compute the update
	// values then actually perform the rank updates.
	yBy := mat64.Inner(b.yVec, b.invHess, b.yVec)
	firstTermConst := (sDotY + yBy) / sDotYSquared
	b.tmpVec.MulVec(b.invHess, b.yVec)
	b.invHess.RankTwo(b.invHess, -1/sDotY, b.tmpVec, b.sVec)
	b.invHess.SymRankOne(b.invHess, firstTermConst, b.sVec)

	// Update the stored BFGS data to the new iteration.
	copy(b.x, loc.X)
	copy(b.grad, loc.Gradient)

	// Compute the new search direction.
	d := mat64.NewVector(b.dim, dir)
	g := mat64.NewVector(b.dim, loc.Gradient)

	d.MulVec(b.invHess, g) // new direction stored in place
	floats.Scale(-1, dir)
	return 1
}
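For reference, the rank-one/rank-two structure of this update can be written out explicitly. The following is a minimal, self-contained sketch that applies the same inverse-Hessian update to a plain row-major slice; the function and variable names are illustrative only and are not part of the package above.

package main

import "fmt"

// bfgsInvHessUpdate applies the BFGS inverse-Hessian update
//
//	H_{k+1} = H_k + (s^T y + y^T H_k y)/(s^T y)^2 * s s^T
//	              - (H_k y s^T + s y^T H_k)/(s^T y)
//
// to a dense n×n matrix h stored row-major. Illustrative sketch with plain
// slices, not the gonum-based code above.
func bfgsInvHessUpdate(h, s, y []float64) {
	n := len(s)
	sDotY := dot(s, y)

	// hy = H_k y
	hy := make([]float64, n)
	for i := 0; i < n; i++ {
		for j := 0; j < n; j++ {
			hy[i] += h[i*n+j] * y[j]
		}
	}
	yHy := dot(y, hy)

	c1 := (sDotY + yHy) / (sDotY * sDotY)
	for i := 0; i < n; i++ {
		for j := 0; j < n; j++ {
			h[i*n+j] += c1*s[i]*s[j] - (hy[i]*s[j]+s[i]*hy[j])/sDotY
		}
	}
}

func dot(a, b []float64) float64 {
	var sum float64
	for i := range a {
		sum += a[i] * b[i]
	}
	return sum
}

func main() {
	// 2x2 identity as the starting inverse Hessian; s and y are made up.
	h := []float64{1, 0, 0, 1}
	s := []float64{0.5, -0.25} // x_{k+1} - x_k
	y := []float64{1.0, 0.5}   // g_{k+1} - g_k
	bfgsInvHessUpdate(h, s, y)
	fmt.Println(h)
}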
func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := b.dim
	if len(loc.X) != dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(loc.Gradient) != dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(dir) != dim {
		panic("bfgs: unexpected size mismatch")
	}

	x := mat64.NewVector(dim, loc.X)
	grad := mat64.NewVector(dim, loc.Gradient)

	// s = x_{k+1} - x_{k}
	b.s.SubVec(x, &b.x)
	// y = g_{k+1} - g_{k}
	b.y.SubVec(grad, &b.grad)

	sDotY := mat64.Dot(&b.s, &b.y)

	if b.first {
		// Rescale the initial Hessian.
		// From: Nocedal, J., Wright, S.: Numerical Optimization (2nd ed).
		// Springer (2006), page 143, eq. 6.20.
		yDotY := mat64.Dot(&b.y, &b.y)
		scale := sDotY / yDotY
		for i := 0; i < dim; i++ {
			for j := i; j < dim; j++ {
				if i == j {
					b.invHess.SetSym(i, i, scale)
				} else {
					b.invHess.SetSym(i, j, 0)
				}
			}
		}
		b.first = false
	}

	if math.Abs(sDotY) != 0 {
		// Update the inverse Hessian according to the formula
		//
		//  B_{k+1}^-1 = B_k^-1
		//             + (s_k^T y_k + y_k^T B_k^-1 y_k) / (s_k^T y_k)^2 * (s_k s_k^T)
		//             - (B_k^-1 y_k s_k^T + s_k y_k^T B_k^-1) / (s_k^T y_k).
		//
		// Note that y_k^T B_k^-1 y_k is a scalar, and that the third term is a
		// rank-two update where B_k^-1 y_k is one vector and s_k is the other.
		yBy := mat64.Inner(&b.y, b.invHess, &b.y)
		b.tmp.MulVec(b.invHess, &b.y)
		scale := (1 + yBy/sDotY) / sDotY
		b.invHess.SymRankOne(b.invHess, scale, &b.s)
		b.invHess.RankTwo(b.invHess, -1/sDotY, &b.tmp, &b.s)
	}

	// Update the stored BFGS data.
	b.x.CopyVec(x)
	b.grad.CopyVec(grad)

	// New direction is stored in dir.
	d := mat64.NewVector(dim, dir)
	d.MulVec(b.invHess, grad)
	d.ScaleVec(-1, d)

	return 1
}
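Besides switching the stored state to mat64 vectors and guarding against a zero s^T y, this version rewrites the rank-one coefficient: the earlier (sDotY + yBy) / sDotYSquared and the new (1 + yBy/sDotY) / sDotY are algebraically identical, and the order of the SymRankOne and RankTwo calls does not matter because both terms are added to the same matrix. A tiny sketch with made-up numbers showing the two coefficients agree:

package main

import "fmt"

func main() {
	// Hypothetical values for s·y and y·B⁻¹·y, chosen only to illustrate that
	//   (s·y + y·B⁻¹·y)/(s·y)^2  ==  (1 + y·B⁻¹·y/(s·y)) / (s·y).
	sDotY, yBy := 0.8, 2.5
	cOld := (sDotY + yBy) / (sDotY * sDotY)
	cNew := (1 + yBy/sDotY) / sDotY
	fmt.Println(cOld, cNew) // both print 5.15625
}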
func (g *GP) marginalLikelihoodDerivative(x, grad []float64, trainNoise bool, mem *margLikeMemory) {
	// d/dTheta_j log[p(y|X,theta)] =
	//   1/2 * y^T * K^-1 * dK/dTheta_j * K^-1 * y - 1/2 * tr(K^-1 * dK/dTheta_j)
	// = 1/2 * α^T * dK/dTheta_j * α - 1/2 * tr(K^-1 * dK/dTheta_j)
	// where α = K^-1 * y. Multiplying by -2 (the same factor applied to the
	// objective) gives
	//   -α^T * dK/dTheta_j * α + tr(K^-1 * dK/dTheta_j).
	// The first term is computed as an inner product.
	n := len(g.outputs)
	nHyper := g.kernel.NumHyper()
	k := mem.k
	chol := mem.chol
	alpha := mem.alpha
	dKdTheta := mem.dKdTheta
	kInvDK := mem.kInvDK

	y := mat64.NewVector(n, g.outputs)

	var noise float64
	if trainNoise {
		noise = math.Exp(x[len(x)-1])
	} else {
		noise = g.noise
	}

	// If x is the same, then reuse what has been computed in the function.
	if !floats.Equal(mem.lastX, x) {
		copy(mem.lastX, x)
		g.kernel.SetHyper(x[:nHyper])
		g.setKernelMat(k, noise)
		//chol.Cholesky(k, false)
		chol.Factorize(k)
		alpha.SolveCholeskyVec(chol, y)
	}
	g.setKernelMatDeriv(dKdTheta, trainNoise, noise)
	for i := range dKdTheta {
		kInvDK.SolveCholesky(chol, dKdTheta[i])
		inner := mat64.Inner(alpha, dKdTheta[i], alpha)
		grad[i] = -inner + mat64.Trace(kInvDK)
	}
	floats.Scale(1/float64(n), grad)

	bounds := g.kernel.Bounds()
	if trainNoise {
		bounds = append(bounds, Bound{minLogNoise, maxLogNoise})
	}
	barrierGrad := make([]float64, len(grad))
	for i, v := range x {
		// Quadratic barrier penalty.
		if v < bounds[i].Min {
			diff := bounds[i].Min - v
			barrierGrad[i] = -barrierPow * math.Pow(diff, barrierPow-1)
		}
		if v > bounds[i].Max {
			diff := v - bounds[i].Max
			barrierGrad[i] = barrierPow * math.Pow(diff, barrierPow-1)
		}
	}
	fmt.Println("noise, minNoise", x[len(x)-1], bounds[len(x)-1].Min)
	fmt.Println("barrier Grad", barrierGrad)
	floats.Add(grad, barrierGrad)
	//copy(grad, barrierGrad)
}
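The barrier term at the end is a standard power-law penalty on bound violations: outside [Min, Max] the penalty is diff^barrierPow, so its gradient contribution is ±barrierPow * diff^(barrierPow-1), with the sign chosen to push the hyperparameter back inside the bounds. Below is a minimal, self-contained sketch of just that penalty gradient on plain slices; the function name and bound values are made up for illustration and are not part of the GP package.

package main

import (
	"fmt"
	"math"
)

// barrierGradient returns the gradient of a power-law barrier penalty
// sum_i violation_i^pow applied outside [lower_i, upper_i], mirroring the
// penalty loop in marginalLikelihoodDerivative above.
func barrierGradient(x, lower, upper []float64, pow float64) []float64 {
	grad := make([]float64, len(x))
	for i, v := range x {
		if v < lower[i] {
			// Below the lower bound: gradient points further negative,
			// so the (minimized) penalty pushes x[i] back up.
			grad[i] = -pow * math.Pow(lower[i]-v, pow-1)
		}
		if v > upper[i] {
			// Above the upper bound: positive gradient pushes x[i] back down.
			grad[i] = pow * math.Pow(v-upper[i], pow-1)
		}
	}
	return grad
}

func main() {
	// Hypothetical hyperparameters and bounds, pow = 2 (quadratic penalty).
	x := []float64{-3, 0.5, 4}
	lower := []float64{-2, 0, -1}
	upper := []float64{2, 1, 3}
	fmt.Println(barrierGradient(x, lower, upper, 2)) // [-2 0 2]
}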