Example #1
0
func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	if len(loc.X) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(loc.Gradient) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(dir) != b.dim {
		panic("bfgs: unexpected size mismatch")
	}

	// Compute the gradient difference in the last step
	// y = g_{k+1} - g_{k}
	floats.SubTo(b.y, loc.Gradient, b.grad)

	// Compute the step difference
	// s = x_{k+1} - x_{k}
	floats.SubTo(b.s, loc.X, b.x)

	sDotY := floats.Dot(b.s, b.y)
	sDotYSquared := sDotY * sDotY

	if b.first {
		// Rescale the initial hessian.
		// From: Numerical optimization, Nocedal and Wright, Page 143, Eq. 6.20 (second edition).
		yDotY := floats.Dot(b.y, b.y)
		scale := sDotY / yDotY
		for i := 0; i < len(loc.X); i++ {
			for j := 0; j < len(loc.X); j++ {
				if i == j {
					b.invHess.SetSym(i, i, scale)
				} else {
					b.invHess.SetSym(i, j, 0)
				}
			}
		}
		b.first = false
	}

	// Compute the update rule
	//     B_{k+1}^-1
	// First term is just the existing inverse hessian
	// Second term is
	//     (sk^T yk + yk^T B_k^-1 yk)(s_k sk_^T) / (sk^T yk)^2
	// Third term is
	//     B_k ^-1 y_k sk^T + s_k y_k^T B_k-1
	//
	// y_k^T B_k^-1 y_k is a scalar, and the third term is a rank-two update
	// where B_k^-1 y_k is one vector and s_k is the other. Compute the update
	// values then actually perform the rank updates.
	yBy := mat64.Inner(b.yVec, b.invHess, b.yVec)
	firstTermConst := (sDotY + yBy) / (sDotYSquared)
	b.tmpVec.MulVec(b.invHess, b.yVec)

	b.invHess.RankTwo(b.invHess, -1/sDotY, b.tmpVec, b.sVec)
	b.invHess.SymRankOne(b.invHess, firstTermConst, b.sVec)

	// update the bfgs stored data to the new iteration
	copy(b.x, loc.X)
	copy(b.grad, loc.Gradient)

	// Compute the new search direction
	d := mat64.NewVector(b.dim, dir)
	g := mat64.NewVector(b.dim, loc.Gradient)

	d.MulVec(b.invHess, g) // new direction stored in place
	floats.Scale(-1, dir)
	return 1
}
Example #2
0
func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := b.dim
	if len(loc.X) != dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(loc.Gradient) != dim {
		panic("bfgs: unexpected size mismatch")
	}
	if len(dir) != dim {
		panic("bfgs: unexpected size mismatch")
	}

	x := mat64.NewVector(dim, loc.X)
	grad := mat64.NewVector(dim, loc.Gradient)

	// s = x_{k+1} - x_{k}
	b.s.SubVec(x, &b.x)
	// y = g_{k+1} - g_{k}
	b.y.SubVec(grad, &b.grad)

	sDotY := mat64.Dot(&b.s, &b.y)

	if b.first {
		// Rescale the initial Hessian.
		// From: Nocedal, J., Wright, S.: Numerical Optimization (2nd ed).
		//       Springer (2006), page 143, eq. 6.20.
		yDotY := mat64.Dot(&b.y, &b.y)
		scale := sDotY / yDotY
		for i := 0; i < dim; i++ {
			for j := i; j < dim; j++ {
				if i == j {
					b.invHess.SetSym(i, i, scale)
				} else {
					b.invHess.SetSym(i, j, 0)
				}
			}
		}
		b.first = false
	}

	if math.Abs(sDotY) != 0 {
		// Update the inverse Hessian according to the formula
		//
		//  B_{k+1}^-1 = B_k^-1
		//             + (s_k^T y_k + y_k^T B_k^-1 y_k) / (s_k^T y_k)^2 * (s_k s_k^T)
		//             - (B_k^-1 y_k s_k^T + s_k y_k^T B_k^-1) / (s_k^T y_k).
		//
		// Note that y_k^T B_k^-1 y_k is a scalar, and that the third term is a
		// rank-two update where B_k^-1 y_k is one vector and s_k is the other.
		yBy := mat64.Inner(&b.y, b.invHess, &b.y)
		b.tmp.MulVec(b.invHess, &b.y)
		scale := (1 + yBy/sDotY) / sDotY
		b.invHess.SymRankOne(b.invHess, scale, &b.s)
		b.invHess.RankTwo(b.invHess, -1/sDotY, &b.tmp, &b.s)
	}

	// Update the stored BFGS data.
	b.x.CopyVec(x)
	b.grad.CopyVec(grad)

	// New direction is stored in dir.
	d := mat64.NewVector(dim, dir)
	d.MulVec(b.invHess, grad)
	d.ScaleVec(-1, d)

	return 1
}
Example #3
0
func (g *GP) marginalLikelihoodDerivative(x, grad []float64, trainNoise bool, mem *margLikeMemory) {
	// d/dTheta_j log[(p|X,theta)] =
	//		1/2 * y^T * K^-1 dK/dTheta_j * K^-1 * y - 1/2 * tr(K^-1 * dK/dTheta_j)
	//		1/2 * α^T * dK/dTheta_j * α - 1/2 * tr(K^-1 dK/dTheta_j)
	// Multiply by the same -2
	//		-α^T * K^-1 * α + tr(K^-1 dK/dTheta_j)
	// This first computation is an inner product.
	n := len(g.outputs)
	nHyper := g.kernel.NumHyper()
	k := mem.k
	chol := mem.chol
	alpha := mem.alpha
	dKdTheta := mem.dKdTheta
	kInvDK := mem.kInvDK

	y := mat64.NewVector(n, g.outputs)

	var noise float64
	if trainNoise {
		noise = math.Exp(x[len(x)-1])
	} else {
		noise = g.noise
	}

	// If x is the same, then reuse what has been computed in the function.
	if !floats.Equal(mem.lastX, x) {
		copy(mem.lastX, x)
		g.kernel.SetHyper(x[:nHyper])
		g.setKernelMat(k, noise)
		//chol.Cholesky(k, false)
		chol.Factorize(k)
		alpha.SolveCholeskyVec(chol, y)
	}
	g.setKernelMatDeriv(dKdTheta, trainNoise, noise)
	for i := range dKdTheta {
		kInvDK.SolveCholesky(chol, dKdTheta[i])
		inner := mat64.Inner(alpha, dKdTheta[i], alpha)
		grad[i] = -inner + mat64.Trace(kInvDK)
	}
	floats.Scale(1/float64(n), grad)

	bounds := g.kernel.Bounds()
	if trainNoise {
		bounds = append(bounds, Bound{minLogNoise, maxLogNoise})
	}
	barrierGrad := make([]float64, len(grad))
	for i, v := range x {
		// Quadratic barrier penalty.
		if v < bounds[i].Min {
			diff := bounds[i].Min - v
			barrierGrad[i] = -(barrierPow) * math.Pow(diff, barrierPow-1)
		}
		if v > bounds[i].Max {
			diff := v - bounds[i].Max
			barrierGrad[i] = (barrierPow) * math.Pow(diff, barrierPow-1)
		}
	}
	fmt.Println("noise, minNoise", x[len(x)-1], bounds[len(x)-1].Min)
	fmt.Println("barrier Grad", barrierGrad)
	floats.Add(grad, barrierGrad)
	//copy(grad, barrierGrad)
}