func (l *LBFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
    // Uses two-loop correction as described in
    // Nocedal, J., Wright, S.: Numerical Optimization (2nd ed). Springer (2006), chapter 7, page 178.

    if len(loc.X) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }
    if len(loc.Gradient) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }
    if len(dir) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }

    y := l.y[l.oldest]
    floats.SubTo(y, loc.Gradient, l.grad)
    s := l.s[l.oldest]
    floats.SubTo(s, loc.X, l.x)
    sDotY := floats.Dot(s, y)
    l.rho[l.oldest] = 1 / sDotY

    l.oldest = (l.oldest + 1) % l.Store

    copy(l.x, loc.X)
    copy(l.grad, loc.Gradient)
    copy(dir, loc.Gradient)

    // Start with the most recent element and go backward.
    for i := 0; i < l.Store; i++ {
        idx := l.oldest - i - 1
        if idx < 0 {
            idx += l.Store
        }
        l.a[idx] = l.rho[idx] * floats.Dot(l.s[idx], dir)
        floats.AddScaled(dir, -l.a[idx], l.y[idx])
    }

    // Scale the initial Hessian.
    gamma := sDotY / floats.Dot(y, y)
    floats.Scale(gamma, dir)

    // Start with the oldest element and go forward.
    for i := 0; i < l.Store; i++ {
        idx := i + l.oldest
        if idx >= l.Store {
            idx -= l.Store
        }
        beta := l.rho[idx] * floats.Dot(l.y[idx], dir)
        floats.AddScaled(dir, l.a[idx]-beta, l.s[idx])
    }

    // dir contains H^{-1} * g, so flip the direction for minimization.
    floats.Scale(-1, dir)

    return 1
}
func (l *LBFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
    if len(loc.X) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }
    if len(loc.Gradient) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }
    if len(dir) != l.dim {
        panic("lbfgs: unexpected size mismatch")
    }

    // Update direction. Uses two-loop correction as described in
    // Nocedal, Wright (2006), Numerical Optimization (2nd ed.). Chapter 7, page 178.
    copy(dir, loc.Gradient)
    floats.SubTo(l.y, loc.Gradient, l.grad)
    floats.SubTo(l.s, loc.X, l.x)
    copy(l.sHist[l.oldest], l.s)
    copy(l.yHist[l.oldest], l.y)
    sDotY := floats.Dot(l.y, l.s)
    l.rhoHist[l.oldest] = 1 / sDotY

    l.oldest++
    l.oldest = l.oldest % l.Store

    copy(l.x, loc.X)
    copy(l.grad, loc.Gradient)

    // Two-loop update. The first loop starts with the most recent element and
    // goes backward, the second starts with the oldest element and goes
    // forward. At the end dir contains H^-1 * g, so flip the direction for
    // minimization.
    for i := 0; i < l.Store; i++ {
        idx := l.oldest - i - 1
        if idx < 0 {
            idx += l.Store
        }
        l.a[idx] = l.rhoHist[idx] * floats.Dot(l.sHist[idx], dir)
        floats.AddScaled(dir, -l.a[idx], l.yHist[idx])
    }

    // Scale the initial Hessian.
    gamma := sDotY / floats.Dot(l.y, l.y)
    floats.Scale(gamma, dir)

    for i := 0; i < l.Store; i++ {
        idx := i + l.oldest
        if idx >= l.Store {
            idx -= l.Store
        }
        beta := l.rhoHist[idx] * floats.Dot(l.yHist[idx], dir)
        floats.AddScaled(dir, l.a[idx]-beta, l.sHist[idx])
    }

    floats.Scale(-1, dir)

    return 1
}
func (n *Newton) NextDirection(loc *Location, dir []float64) (stepSize float64) {
    // This method implements Algorithm 3.3 (Cholesky with Added Multiple of
    // the Identity) from Nocedal, Wright (2006), 2nd edition.

    dim := len(loc.X)
    n.hess.CopySym(loc.Hessian)

    // Find the smallest diagonal entry of the Hessian.
    minA := n.hess.At(0, 0)
    for i := 1; i < dim; i++ {
        a := n.hess.At(i, i)
        if a < minA {
            minA = a
        }
    }
    // If the smallest diagonal entry is positive, the Hessian may be positive
    // definite, and so first attempt to apply the Cholesky factorization to
    // the un-modified Hessian. If the smallest entry is negative, use the
    // final tau from the last iteration if regularization was needed,
    // otherwise guess an appropriate value for tau.
    if minA > 0 {
        n.tau = 0
    } else if n.tau == 0 {
        n.tau = -minA + 0.001
    }

    for k := 0; k < maxNewtonModifications; k++ {
        if n.tau != 0 {
            // Add a multiple of identity to the Hessian.
            for i := 0; i < dim; i++ {
                n.hess.SetSym(i, i, loc.Hessian.At(i, i)+n.tau)
            }
        }
        // Try to apply the Cholesky factorization.
        pd := n.chol.Factorize(n.hess)
        if pd {
            d := mat64.NewVector(dim, dir)
            // Store the solution in d's backing array, dir.
            d.SolveCholeskyVec(&n.chol, mat64.NewVector(dim, loc.Gradient))
            floats.Scale(-1, dir)
            return 1
        }
        // Modified Hessian is not PD, so increase tau.
        n.tau = math.Max(n.Increase*n.tau, 0.001)
    }

    // Hessian modification failed to get a PD matrix. Return the negative
    // gradient as the descent direction.
    copy(dir, loc.Gradient)
    floats.Scale(-1, dir)
    return 1
}
func (b *BFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) {
    dim := len(loc.X)
    b.dim = dim

    b.x = resize(b.x, dim)
    copy(b.x, loc.X)
    b.grad = resize(b.grad, dim)
    copy(b.grad, loc.Gradient)

    b.y = resize(b.y, dim)
    b.s = resize(b.s, dim)
    b.tmp = resize(b.tmp, dim)
    b.yVec = mat64.NewVector(dim, b.y)
    b.sVec = mat64.NewVector(dim, b.s)
    b.tmpVec = mat64.NewVector(dim, b.tmp)

    if b.invHess == nil || cap(b.invHess.RawSymmetric().Data) < dim*dim {
        b.invHess = mat64.NewSymDense(dim, nil)
    } else {
        b.invHess = mat64.NewSymDense(dim, b.invHess.RawSymmetric().Data[:dim*dim])
    }

    // The values of the inverse Hessian are initialized in the first call to
    // NextDirection. The initial direction is just the negative of the gradient
    // because the Hessian is the identity.
    copy(dir, loc.Gradient)
    floats.Scale(-1, dir)
    b.first = true

    return 1 / floats.Norm(dir, 2)
}
// locationsAsy returns the node locations and weights of a Hermite quadrature rule
// with len(x) points.
func (h Hermite) locationsAsy(x, w []float64) {
    // A. Townsend, T. Trogdon, and S. Olver, Fast computation of Gauss quadrature
    // nodes and weights on the whole real line, IMA J. Numer. Anal.,
    // 36: 337-358, 2016. http://arxiv.org/abs/1410.5286

    // Find the positive locations and weights.
    n := len(x)
    l := n / 2
    xa := x[l:]
    wa := w[l:]
    for i := range xa {
        xa[i], wa[i] = h.locationsAsy0(i, n)
    }
    // Flip around zero -- copy the negative x locations with the corresponding
    // weights.
    if n%2 == 0 {
        l--
    }
    for i, v := range xa {
        x[l-i] = -v
    }
    for i, v := range wa {
        w[l-i] = v
    }
    sumW := floats.Sum(w)
    c := math.SqrtPi / sumW
    floats.Scale(c, w)
}
func TestCategoricalCDF(t *testing.T) {
    for _, test := range [][]float64{
        {1, 2, 3, 0, 4},
    } {
        c := make([]float64, len(test))
        copy(c, test)
        floats.Scale(1/floats.Sum(c), c)
        sum := make([]float64, len(test))
        floats.CumSum(sum, c)

        dist := NewCategorical(test, nil)
        cdf := dist.CDF(-0.5)
        if cdf != 0 {
            t.Errorf("CDF of negative number not zero")
        }
        for i := range c {
            cdf := dist.CDF(float64(i))
            if math.Abs(cdf-sum[i]) > 1e-14 {
                t.Errorf("CDF mismatch %v. Want %v, got %v.", float64(i), sum[i], cdf)
            }
            cdfp := dist.CDF(float64(i) + 0.5)
            if cdfp != cdf {
                t.Errorf("CDF mismatch for non-integer input")
            }
        }
    }
}
func TestCategoricalProb(t *testing.T) {
    for _, test := range [][]float64{
        {1, 2, 3, 0},
    } {
        dist := NewCategorical(test, nil)
        norm := make([]float64, len(test))
        copy(norm, test)
        floats.Scale(1/floats.Sum(norm), norm)
        for i, v := range norm {
            p := dist.Prob(float64(i))
            if math.Abs(p-v) > 1e-14 {
                t.Errorf("Probability mismatch element %d", i)
            }
            p = dist.Prob(float64(i) + 0.5)
            if p != 0 {
                t.Errorf("Non-zero probability for non-integer x")
            }
        }
        p := dist.Prob(-1)
        if p != 0 {
            t.Errorf("Non-zero probability for -1")
        }
        p = dist.Prob(float64(len(test)))
        if p != 0 {
            t.Errorf("Non-zero probability for len(test)")
        }
    }
}
// FuncGrad is not callable in parallel because of the batches.
func (g *GradOptimizable) FuncGrad(params []float64, deriv []float64) float64 {
    inds := g.Sampler.Iterate()
    total := len(inds)

    var totalLoss float64
    for i := range deriv {
        deriv[i] = 0
    }

    // Send the regularizer.
    g.batches[0].parameters = params
    g.regularizeChan <- g.batches[0]

    // Send initial batches out.
    var initBatches int
    var lastSent int
    for i := 0; i < g.NumWorkers; i++ {
        if lastSent == total {
            break
        }
        add := g.grainSize
        if lastSent+add >= total {
            add = total - lastSent
        }
        initBatches++
        g.batches[i+1].idxs = inds[lastSent : lastSent+add]
        g.batches[i+1].parameters = params
        g.sendWork <- g.batches[i+1]
        lastSent += add
    }

    // Collect the batches and resend out.
    for lastSent < total {
        batch := <-g.receiveWork
        totalLoss += batch.loss
        floats.Add(deriv, batch.deriv)
        add := g.grainSize
        if lastSent+add >= total {
            add = total - lastSent
        }
        batch.idxs = inds[lastSent : lastSent+add]
        g.sendWork <- batch
        lastSent += add
    }

    // All indices have been sent, so just wait for the remaining collection.
    for i := 0; i < initBatches; i++ {
        batch := <-g.receiveWork
        totalLoss += batch.loss
        floats.Add(deriv, batch.deriv)
    }

    batch := <-g.regDone
    totalLoss += batch.loss
    floats.Add(deriv, batch.deriv)

    totalLoss /= float64(len(inds))
    floats.Scale(1/float64(len(inds)), deriv)
    return totalLoss
}
// returnNext updates the location based on the iteration type and the current
// simplex, and returns the next operation.
func (n *NelderMead) returnNext(iter nmIterType, loc *Location) (Operation, error) {
    n.lastIter = iter
    switch iter {
    case nmMajor:
        // Fill loc with the current best point and value,
        // and command a convergence check.
        copy(loc.X, n.vertices[0])
        loc.F = n.values[0]
        return MajorIteration, nil
    case nmReflected, nmExpanded, nmContractedOutside, nmContractedInside:
        // x_new = x_centroid + scale * (x_centroid - x_worst)
        var scale float64
        switch iter {
        case nmReflected:
            scale = n.reflection
        case nmExpanded:
            scale = n.reflection * n.expansion
        case nmContractedOutside:
            scale = n.reflection * n.contraction
        case nmContractedInside:
            scale = -n.contraction
        }
        dim := len(loc.X)
        floats.SubTo(loc.X, n.centroid, n.vertices[dim])
        floats.Scale(scale, loc.X)
        floats.Add(loc.X, n.centroid)
        if iter == nmReflected {
            copy(n.reflectedPoint, loc.X)
        }
        return FuncEvaluation, nil
    case nmShrink:
        // x_shrink = x_best + delta * (x_i - x_best)
        floats.SubTo(loc.X, n.vertices[n.fillIdx], n.vertices[0])
        floats.Scale(n.shrink, loc.X)
        floats.Add(loc.X, n.vertices[0])
        return FuncEvaluation, nil
    default:
        panic("unreachable")
    }
}
// ObjGrad computes the objective value and stores the derivative in place.
func (g *BatchGradBased) ObjGrad(parameters []float64, derivative []float64) (loss float64) {
    c := make(chan lossDerivStruct, 10)

    // Set the channel for parallel for.
    f := func(start, end int) {
        g.lossDerivFunc(start, end, c, parameters)
    }

    go func() {
        wg := &sync.WaitGroup{}
        // Compute the losses and the derivatives all in parallel.
        wg.Add(2)
        go func() {
            common.ParallelFor(g.nTrain, g.grainSize, f)
            wg.Done()
        }()
        // Compute the regularization.
        go func() {
            deriv := make([]float64, g.nParameters)
            loss := g.regularizer.LossDeriv(parameters, deriv)
            c <- lossDerivStruct{
                loss:  loss,
                deriv: deriv,
            }
            wg.Done()
        }()
        // Wait for all of the results to be sent on the channel.
        wg.Wait()
        // Close the channel.
        close(c)
    }()

    // Zero the derivative.
    for i := range derivative {
        derivative[i] = 0
    }

    // Range over the channel, incrementing the loss and derivative
    // as they come in.
    for l := range c {
        loss += l.loss
        floats.Add(derivative, l.deriv)
    }
    // Normalize by the number of training samples.
    loss /= float64(g.nTrain)
    floats.Scale(1/float64(g.nTrain), derivative)
    return loss
}
// UpdateOne updates sufficient statistics using one observation.
func (g *Model) UpdateOne(o model.Obs, w float64) {
    glog.V(6).Infof("gaussian update, name:%s, obs:%v, weight:%e", g.ModelName, o, w)

    /* Update sufficient statistics. */
    obs, _, _ := model.ObsToF64(o)
    floatx.Apply(floatx.ScaleFunc(w), obs, g.tmpArray)
    floats.Add(g.Sumx, g.tmpArray)
    floatx.Sq(g.tmpArray, obs)
    floats.Scale(w, g.tmpArray)
    floats.Add(g.Sumxsq, g.tmpArray)
    g.NSamples += w
}
func sampleCategorical(t *testing.T, dist Categorical, nSamples int) []float64 {
    counts := make([]float64, dist.Len())
    for i := 0; i < nSamples; i++ {
        v := dist.Rand()
        if float64(int(v)) != v {
            t.Fatalf("Random number is not an integer")
        }
        counts[int(v)]++
    }
    sum := floats.Sum(counts)
    floats.Scale(1/sum, counts)
    return counts
}
func TestJensenShannon(t *testing.T) {
    for i, test := range []struct {
        p []float64
        q []float64
    }{
        {
            p: []float64{0.5, 0.1, 0.3, 0.1},
            q: []float64{0.1, 0.4, 0.25, 0.25},
        },
        {
            p: []float64{0.4, 0.6, 0.0},
            q: []float64{0.2, 0.2, 0.6},
        },
        {
            p: []float64{0.1, 0.1, 0.0, 0.8},
            q: []float64{0.6, 0.3, 0.0, 0.1},
        },
        {
            p: []float64{0.5, 0.1, 0.3, 0.1},
            q: []float64{0.5, 0, 0.25, 0.25},
        },
        {
            p: []float64{0.5, 0.1, 0, 0.4},
            q: []float64{0.1, 0.4, 0.25, 0.25},
        },
    } {
        m := make([]float64, len(test.p))
        p := test.p
        q := test.q
        floats.Add(m, p)
        floats.Add(m, q)
        floats.Scale(0.5, m)

        js1 := 0.5*KullbackLeibler(p, m) + 0.5*KullbackLeibler(q, m)
        js2 := JensenShannon(p, q)

        if math.IsNaN(js2) {
            t.Errorf("In case %v, JS distance is NaN", i)
        }

        if math.Abs(js1-js2) > 1e-14 {
            t.Errorf("JS mismatch case %v. Expected %v, found %v.", i, js1, js2)
        }
    }
    if !Panics(func() { JensenShannon(make([]float64, 3), make([]float64, 2)) }) {
        t.Errorf("JensenShannon did not panic with p, q length mismatch")
    }
}
// cosCorrMultiNaive explicitly forms vectors and computes their normalized
// dot product.
func cosCorrMultiNaive(f, g *rimg64.Multi) *rimg64.Image {
    h := rimg64.New(f.Width-g.Width+1, f.Height-g.Height+1)
    n := g.Width * g.Height * g.Channels
    a := make([]float64, n)
    b := make([]float64, n)
    for i := 0; i < h.Width; i++ {
        for j := 0; j < h.Height; j++ {
            a = a[:0]
            b = b[:0]
            for u := 0; u < g.Width; u++ {
                for v := 0; v < g.Height; v++ {
                    for p := 0; p < g.Channels; p++ {
                        a = append(a, f.At(i+u, j+v, p))
                        b = append(b, g.At(u, v, p))
                    }
                }
            }
            floats.Scale(1/floats.Norm(a, 2), a)
            floats.Scale(1/floats.Norm(b, 2), b)
            h.Set(i, j, floats.Dot(a, b))
        }
    }
    return h
}
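// The per-window computation above is just a cosine similarity between two
// flattened patches. Below is a minimal standalone sketch of that pattern with
// the floats package; the vectors are made up for the example, and the
// github.com/gonum/floats import path is an assumption based on the
// pre-module gonum layout.
package main

import (
    "fmt"

    "github.com/gonum/floats"
)

func main() {
    a := []float64{1, 2, 3}
    b := []float64{2, 0, 1}
    // Normalize both vectors to unit length; their dot product is then the
    // cosine of the angle between them.
    floats.Scale(1/floats.Norm(a, 2), a)
    floats.Scale(1/floats.Norm(b, 2), b)
    fmt.Println(floats.Dot(a, b))
}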
// returnNext finds the next location to evaluate, stores the location in xNext,
// and returns the data.
func (n *NelderMead) returnNext(iter nmIterType, xNext []float64) (EvaluationType, IterationType, error) {
    dim := len(xNext)
    n.lastIter = iter
    switch iter {
    case nmReflected, nmExpanded, nmContractedOutside, nmContractedInside:
        // x_new = x_centroid + scale * (x_centroid - x_worst)
        var scale float64
        switch iter {
        case nmReflected:
            scale = n.reflection
        case nmExpanded:
            scale = n.reflection * n.expansion
        case nmContractedOutside:
            scale = n.reflection * n.contraction
        case nmContractedInside:
            scale = -n.contraction
        }
        floats.SubTo(xNext, n.centroid, n.vertices[dim])
        floats.Scale(scale, xNext)
        floats.Add(xNext, n.centroid)
        if iter == nmReflected {
            copy(n.reflectedPoint, xNext)
            // Nelder-Mead iterations start with a reflection step.
            return FuncEvaluation, MajorIteration, nil
        }
        return FuncEvaluation, MinorIteration, nil
    case nmShrink:
        // x_shrink = x_best + delta * (x_i - x_best)
        floats.SubTo(xNext, n.vertices[n.fillIdx], n.vertices[0])
        floats.Scale(n.shrink, xNext)
        floats.Add(xNext, n.vertices[0])
        return FuncEvaluation, SubIteration, nil
    default:
        panic("unreachable")
    }
}
func (l *LBFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) {
    dim := len(loc.X)
    l.dim = dim

    if l.Store == 0 {
        l.Store = 15
    }

    l.oldest = l.Store - 1 // the first vector will be put in at 0

    l.x = resize(l.x, dim)
    l.grad = resize(l.grad, dim)
    copy(l.x, loc.X)
    copy(l.grad, loc.Gradient)

    l.y = resize(l.y, dim)
    l.s = resize(l.s, dim)
    l.a = resize(l.a, l.Store)
    l.rhoHist = resize(l.rhoHist, l.Store)

    if cap(l.yHist) < l.Store {
        n := make([][]float64, l.Store-cap(l.yHist))
        l.yHist = append(l.yHist, n...)
    }
    if cap(l.sHist) < l.Store {
        n := make([][]float64, l.Store-cap(l.sHist))
        l.sHist = append(l.sHist, n...)
    }
    l.yHist = l.yHist[:l.Store]
    l.sHist = l.sHist[:l.Store]
    for i := range l.sHist {
        l.sHist[i] = resize(l.sHist[i], dim)
        for j := range l.sHist[i] {
            l.sHist[i][j] = 0
        }
    }
    for i := range l.yHist {
        l.yHist[i] = resize(l.yHist[i], dim)
        for j := range l.yHist[i] {
            l.yHist[i][j] = 0
        }
    }

    copy(dir, loc.Gradient)
    floats.Scale(-1, dir)

    return 1 / floats.Norm(dir, 2)
}
// PrincipalComponents returns the principal component direction vectors and
// the column variances of the principal component scores, vecs * a, computed
// using the singular value decomposition of the input. The input a is an n×d
// matrix where each row is an observation and each column represents a variable.
//
// PrincipalComponents centers the variables but does not scale the variance.
//
// The slice weights is used to weight the observations. If weights is nil,
// each weight is considered to have a value of one, otherwise the length of
// weights must match the number of observations or PrincipalComponents will
// panic.
//
// On successful completion, the principal component direction vectors are
// returned in vecs as a d×min(n, d) matrix, and the variances are returned in
// vars as a min(n, d)-long slice in descending sort order.
//
// If no singular value decomposition is possible, vecs and vars are returned
// nil and ok is returned false.
func PrincipalComponents(a mat64.Matrix, weights []float64) (vecs *mat64.Dense, vars []float64, ok bool) {
    n, d := a.Dims()
    if weights != nil && len(weights) != n {
        panic("stat: len(weights) != observations")
    }

    centered := mat64.NewDense(n, d, nil)
    col := make([]float64, n)
    for j := 0; j < d; j++ {
        mat64.Col(col, j, a)
        floats.AddConst(-Mean(col, weights), col)
        centered.SetCol(j, col)
    }
    for i, w := range weights {
        floats.Scale(math.Sqrt(w), centered.RawRowView(i))
    }

    kind := matrix.SVDFull
    if n > d {
        kind = matrix.SVDThin
    }
    var svd mat64.SVD
    ok = svd.Factorize(centered, kind)
    if !ok {
        return nil, nil, false
    }

    vecs = &mat64.Dense{}
    vecs.VFromSVD(&svd)
    if n < d {
        // Don't retain columns that are not valid direction vectors.
        vecs.Clone(vecs.View(0, 0, d, n))
    }
    vars = svd.Values(nil)
    var f float64
    if weights == nil {
        f = 1 / float64(n-1)
    } else {
        f = 1 / (floats.Sum(weights) - 1)
    }
    for i, v := range vars {
        vars[i] = f * v * v
    }
    return vecs, vars, true
}
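// A minimal usage sketch of PrincipalComponents above. The data values are
// invented, and the import paths assume the pre-module gonum layout
// (github.com/gonum/stat and github.com/gonum/matrix/mat64); adjust them to
// wherever this function actually lives.
package main

import (
    "fmt"

    "github.com/gonum/matrix/mat64"
    "github.com/gonum/stat"
)

func main() {
    // Four observations of three variables, one observation per row.
    a := mat64.NewDense(4, 3, []float64{
        2.0, 1.0, 0.5,
        3.0, 1.5, 1.0,
        4.0, 2.0, 1.5,
        5.0, 2.5, 2.0,
    })
    // nil weights means every observation counts equally.
    vecs, vars, ok := stat.PrincipalComponents(a, nil)
    if !ok {
        fmt.Println("SVD failed")
        return
    }
    fmt.Printf("direction vectors:\n%v\n", mat64.Formatted(vecs))
    fmt.Println("variances:", vars)
}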
// StdDevBatch predicts the standard deviation at a set of locations of x.
func (g *GP) StdDevBatch(std []float64, x mat64.Matrix) []float64 {
    r, c := x.Dims()
    if c != g.inputDim {
        panic(badInputLength)
    }
    if std == nil {
        std = make([]float64, r)
    }
    if len(std) != r {
        panic(badStorage)
    }
    // For a single point, the stddev is
    //   sigma = k(x,x) - k_*^T * K^-1 * k_*
    // where k_* is the vector of kernels between the input points and the
    // output points. For many points, the formula is
    //   nu_* = k(x_*, x_*) - k_*^T * K^-1 * k_*
    // This creates the full covariance matrix, which is an r×r matrix. However,
    // the standard deviations are just the diagonal of this matrix. Instead, be
    // smart about it and compute the diagonal terms one at a time.
    kStar := g.formKStar(x)
    var tmp mat64.Dense
    tmp.SolveCholesky(g.cholK, kStar)

    // Set k(x_*, x_*) into std, then subtract k_*^T K^-1 k_*, computed one
    // column at a time.
    var tmp2 mat64.Vector
    row := make([]float64, c)
    for i := range std {
        for k := 0; k < c; k++ {
            row[k] = x.At(i, k)
        }
        std[i] = g.kernel.Distance(row, row)
        tmp2.MulVec(kStar.ColView(i).T(), tmp.ColView(i))
        rt, ct := tmp2.Dims()
        if rt != 1 && ct != 1 {
            panic("bad size")
        }
        std[i] -= tmp2.At(0, 0)
        std[i] = math.Sqrt(std[i])
    }
    // Need to scale the standard deviation to be in the same units as y.
    floats.Scale(g.std, std)
    return std
}
func (l *LBFGS) InitDirection(loc *Location, dir []float64) (stepSize float64) {
    dim := len(loc.X)
    l.dim = dim
    l.oldest = 0

    l.a = resize(l.a, l.Store)
    l.rho = resize(l.rho, l.Store)
    l.y = l.initHistory(l.y)
    l.s = l.initHistory(l.s)

    l.x = resize(l.x, dim)
    copy(l.x, loc.X)

    l.grad = resize(l.grad, dim)
    copy(l.grad, loc.Gradient)

    copy(dir, loc.Gradient)
    floats.Scale(-1, dir)

    return 1 / floats.Norm(dir, 2)
}
func MakeFitLinScale(targetImage *imgut.Image) func(*imgut.Image) float64 {
    // Pre-compute image to slice of floats.
    dataTarg := imgut.ToSlice(targetImage)
    // Pre-compute average.
    avgt := floats.Sum(dataTarg) / float64(len(dataTarg))
    return func(indImage *imgut.Image) float64 {
        // Images to vector.
        dataInd := imgut.ToSlice(indImage)
        // Compute average pixels.
        avgy := floats.Sum(dataInd) / float64(len(dataInd))
        // Difference y - avgy.
        y_avgy := make([]float64, len(dataInd))
        copy(y_avgy, dataInd)
        floats.AddConst(-avgy, y_avgy)
        // Difference t - avgt.
        t_avgt := make([]float64, len(dataTarg))
        copy(t_avgt, dataTarg)
        floats.AddConst(-avgt, t_avgt)
        // Multiplication (t - avgt)(y - avgy).
        floats.Mul(t_avgt, y_avgy)
        // Summation.
        numerator := floats.Sum(t_avgt)
        // Square (y - avgy)^2.
        floats.Mul(y_avgy, y_avgy)
        denomin := floats.Sum(y_avgy)
        // Compute b-value.
        b := numerator / denomin
        // Compute a-value.
        a := avgt - b*avgy
        // Compute now the scaled RMSE, using y' = a + b*y.
        floats.Scale(b, dataInd)      // b*y
        floats.AddConst(a, dataInd)   // a + b*y
        floats.Sub(dataInd, dataTarg) // (a + b*y - t)
        floats.Mul(dataInd, dataInd)  // (a + b*y - t)^2
        total := floats.Sum(dataInd)  // Sum(...)
        return math.Sqrt(total / float64(len(dataInd)))
    }
}
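// The closure above fits t ≈ a + b*y by ordinary least squares before taking
// the RMSE. Below is a compact standalone sketch of that slope/intercept
// computation with plain loops; the data is invented, and the
// github.com/gonum/floats import path is an assumption.
package main

import (
    "fmt"

    "github.com/gonum/floats"
)

func main() {
    y := []float64{1, 2, 3, 4}
    t := []float64{2.1, 3.9, 6.2, 7.8} // roughly t = 2*y

    avgy := floats.Sum(y) / float64(len(y))
    avgt := floats.Sum(t) / float64(len(t))

    // b = sum((t-avgt)*(y-avgy)) / sum((y-avgy)^2), a = avgt - b*avgy.
    var num, den float64
    for i := range y {
        num += (t[i] - avgt) * (y[i] - avgy)
        den += (y[i] - avgy) * (y[i] - avgy)
    }
    b := num / den
    a := avgt - b*avgy
    fmt.Printf("a = %.3f, b = %.3f\n", a, b)
}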
// computeMove computes how far can be moved replacing each index. The results
// are stored into move.
func computeMove(move []float64, minIdx int, A mat64.Matrix, ab *mat64.Dense, xb []float64, nonBasicIdx []int) error {
    // Find ae.
    col := mat64.Col(nil, nonBasicIdx[minIdx], A)
    aCol := mat64.NewVector(len(col), col)

    // d = - Ab^-1 Ae
    nb, _ := ab.Dims()
    d := make([]float64, nb)
    dVec := mat64.NewVector(nb, d)
    err := dVec.SolveVec(ab, aCol)
    if err != nil {
        return ErrLinSolve
    }
    floats.Scale(-1, d)

    for i, v := range d {
        if math.Abs(v) < dRoundTol {
            d[i] = 0
        }
    }

    // If no di < 0, then the problem is unbounded.
    if floats.Min(d) >= 0 {
        return ErrUnbounded
    }

    // move_i = bhat_i / -d_i, assuming d_i is negative.
    bHat := xb // ab^-1 b
    for i, v := range d {
        if v >= 0 {
            move[i] = math.Inf(1)
        } else {
            move[i] = bHat[i] / math.Abs(v)
        }
    }
    return nil
}
func (g *GP) marginalLikelihoodDerivative(x, grad []float64, trainNoise bool, mem *margLikeMemory) {
    // d/dTheta_j log[p(y|X,theta)] =
    //   1/2 * y^T * K^-1 * dK/dTheta_j * K^-1 * y - 1/2 * tr(K^-1 * dK/dTheta_j)
    // = 1/2 * α^T * dK/dTheta_j * α - 1/2 * tr(K^-1 * dK/dTheta_j)
    // Multiply by the same -2:
    //   -α^T * dK/dTheta_j * α + tr(K^-1 * dK/dTheta_j)
    // The first computation is an inner product.
    n := len(g.outputs)
    nHyper := g.kernel.NumHyper()
    k := mem.k
    chol := mem.chol
    alpha := mem.alpha
    dKdTheta := mem.dKdTheta
    kInvDK := mem.kInvDK

    y := mat64.NewVector(n, g.outputs)

    var noise float64
    if trainNoise {
        noise = math.Exp(x[len(x)-1])
    } else {
        noise = g.noise
    }

    // If x is the same, then reuse what has been computed in the function.
    if !floats.Equal(mem.lastX, x) {
        copy(mem.lastX, x)
        g.kernel.SetHyper(x[:nHyper])
        g.setKernelMat(k, noise)
        chol.Factorize(k)
        alpha.SolveCholeskyVec(chol, y)
    }
    g.setKernelMatDeriv(dKdTheta, trainNoise, noise)
    for i := range dKdTheta {
        kInvDK.SolveCholesky(chol, dKdTheta[i])
        inner := mat64.Inner(alpha, dKdTheta[i], alpha)
        grad[i] = -inner + mat64.Trace(kInvDK)
    }
    floats.Scale(1/float64(n), grad)

    bounds := g.kernel.Bounds()
    if trainNoise {
        bounds = append(bounds, Bound{minLogNoise, maxLogNoise})
    }
    barrierGrad := make([]float64, len(grad))
    for i, v := range x {
        // Quadratic barrier penalty.
        if v < bounds[i].Min {
            diff := bounds[i].Min - v
            barrierGrad[i] = -(barrierPow) * math.Pow(diff, barrierPow-1)
        }
        if v > bounds[i].Max {
            diff := v - bounds[i].Max
            barrierGrad[i] = (barrierPow) * math.Pow(diff, barrierPow-1)
        }
    }
    fmt.Println("noise, minNoise", x[len(x)-1], bounds[len(x)-1].Min)
    fmt.Println("barrier Grad", barrierGrad)
    floats.Add(grad, barrierGrad)
}
func (lbfgs *Lbfgs) Iterate(loc *multi.Location, obj *uni.Objective, grad *multi.Gradient, fun optimize.MultiObjGrad) (status.Status, error) {
    counter := lbfgs.counter
    q := lbfgs.q
    a := lbfgs.a
    b := lbfgs.b
    rhoHist := lbfgs.rhoHist
    sHist := lbfgs.sHist
    yHist := lbfgs.yHist
    gamma_k := lbfgs.gamma_k
    tmp := lbfgs.tmp
    p_k := lbfgs.p_k
    s_k := lbfgs.s_k
    y_k := lbfgs.y_k
    z := lbfgs.z

    // Calculate search direction.
    for i, val := range grad.Curr() {
        q[i] = val
    }
    for i := counter - 1; i >= 0; i-- {
        a[i] = rhoHist[i] * floats.Dot(sHist[i], q)
        copy(tmp, yHist[i])
        floats.Scale(a[i], tmp)
        floats.Sub(q, tmp)
    }
    for i := lbfgs.NumStore - 1; i >= counter; i-- {
        a[i] = rhoHist[i] * floats.Dot(sHist[i], q)
        copy(tmp, yHist[i])
        floats.Scale(a[i], tmp)
        floats.Sub(q, tmp)
    }

    // Assume H_0 is the identity times gamma_k.
    copy(z, q)
    floats.Scale(gamma_k, z)
    // Second loop for update, going oldest to newest.
    for i := counter; i < lbfgs.NumStore; i++ {
        b[i] = rhoHist[i] * floats.Dot(yHist[i], z)
        copy(tmp, sHist[i])
        floats.Scale(a[i]-b[i], tmp)
        floats.Add(z, tmp)
    }
    for i := 0; i < counter; i++ {
        b[i] = rhoHist[i] * floats.Dot(yHist[i], z)
        copy(tmp, sHist[i])
        floats.Scale(a[i]-b[i], tmp)
        floats.Add(z, tmp)
    }

    lbfgs.a = a
    lbfgs.b = b

    copy(p_k, z)
    floats.Scale(-1, p_k)
    normP_k := floats.Norm(p_k, 2)

    // Perform line search -- need to find some way to implement this,
    // especially bookkeeping function values.
    linesearchResult, err := linesearch.Linesearch(fun, lbfgs.LinesearchMethod, lbfgs.LinesearchSettings, lbfgs.Wolfe, p_k, loc.Curr(), obj.Curr(), grad.Curr())

    // In the future add a check to switch to a different linesearcher?
    if err != nil {
        return status.LinesearchFailure, err
    }
    x_kp1 := linesearchResult.Loc
    f_kp1 := linesearchResult.Obj
    g_kp1 := linesearchResult.Grad
    alpha_k := linesearchResult.Step

    // Update the Hessian estimate.
    copy(s_k, p_k)
    floats.Scale(alpha_k, s_k)
    copy(y_k, g_kp1)
    floats.Sub(y_k, grad.Curr())
    skDotYk := floats.Dot(s_k, y_k)

    // Bookkeep the results.
    stepSize := alpha_k * normP_k
    lbfgs.step.AddToHist(stepSize)
    lbfgs.step.SetCurr(stepSize)
    loc.SetCurr(x_kp1)
    obj.SetCurr(f_kp1)
    grad.SetCurr(g_kp1)

    copy(sHist[counter], s_k)
    copy(yHist[counter], y_k)
    rhoHist[counter] = 1 / skDotYk
    lbfgs.gamma_k = skDotYk / floats.Dot(y_k, y_k)

    lbfgs.counter += 1
    if lbfgs.counter == lbfgs.NumStore {
        lbfgs.counter = 0
    }
    return status.Continue, nil
}
func (b *BatchGradient) funcGrad(params, deriv []float64) float64 {
    nParameters := len(deriv)

    // Send out all of the work.
    done := make(chan result)
    sz := b.nSamples / b.Workers
    sent := 0
    for i := 0; i < b.Workers; i++ {
        outputDim := b.outputDim
        last := sent + sz
        if i == b.Workers-1 {
            last = b.nSamples
        }
        go func(sent, last int) {
            lossDeriver := b.Trainable.NewLossDeriver()
            predOutput := make([]float64, outputDim)
            dLossDPred := make([]float64, outputDim)
            dLossDParam := make([]float64, nParameters)
            outputs := make([]float64, outputDim)
            tmpderiv := make([]float64, nParameters)
            var totalLoss float64
            for i := sent; i < last; i++ {
                lossDeriver.Predict(params, b.features.RawRowView(i), predOutput)
                b.Outputs.Row(outputs, i)
                loss := b.Losser.LossDeriv(predOutput, outputs, dLossDPred)
                if b.Weights == nil {
                    totalLoss += loss
                } else {
                    totalLoss += b.Weights[i] * loss
                }
                lossDeriver.Deriv(params, b.features.RawRowView(i), predOutput, dLossDPred, dLossDParam)
                if b.Weights != nil {
                    floats.Scale(b.Weights[i], dLossDParam)
                }
                floats.Add(tmpderiv, dLossDParam)
            }
            done <- result{totalLoss, tmpderiv}
        }(sent, last)
        sent += sz
    }
    // Collect all the results.
    var totalLoss float64
    for i := range deriv {
        deriv[i] = 0
    }
    for i := 0; i < b.Workers; i++ {
        w := <-done
        totalLoss += w.loss
        floats.Add(deriv, w.deriv)
    }
    // Compute the regularizer.
    if b.Regularizer != nil {
        tmp := make([]float64, nParameters)
        totalLoss += b.Regularizer.LossDeriv(params, tmp)
        floats.Add(deriv, tmp)
    }

    sumWeights := float64(b.nSamples)
    if b.Weights != nil {
        sumWeights = floats.Sum(b.Weights)
    }
    totalLoss /= sumWeights
    floats.Scale(1/sumWeights, deriv)
    return totalLoss
}
func (b *BFGS) NextDirection(loc *Location, dir []float64) (stepSize float64) {
    if len(loc.X) != b.dim {
        panic("bfgs: unexpected size mismatch")
    }
    if len(loc.Gradient) != b.dim {
        panic("bfgs: unexpected size mismatch")
    }
    if len(dir) != b.dim {
        panic("bfgs: unexpected size mismatch")
    }

    // Compute the gradient difference in the last step
    // y = g_{k+1} - g_{k}
    floats.SubTo(b.y, loc.Gradient, b.grad)

    // Compute the step difference
    // s = x_{k+1} - x_{k}
    floats.SubTo(b.s, loc.X, b.x)

    sDotY := floats.Dot(b.s, b.y)
    sDotYSquared := sDotY * sDotY

    if b.first {
        // Rescale the initial Hessian.
        // From: Nocedal, Wright (2006), Numerical Optimization (2nd ed.), page 143, eq. 6.20.
        yDotY := floats.Dot(b.y, b.y)
        scale := sDotY / yDotY
        for i := 0; i < len(loc.X); i++ {
            for j := 0; j < len(loc.X); j++ {
                if i == j {
                    b.invHess.SetSym(i, i, scale)
                } else {
                    b.invHess.SetSym(i, j, 0)
                }
            }
        }
        b.first = false
    }

    // Compute the update rule for B_{k+1}^-1.
    // The first term is just the existing inverse Hessian.
    // The second term is
    //   (s_k^T y_k + y_k^T B_k^-1 y_k)(s_k s_k^T) / (s_k^T y_k)^2
    // The third term is
    //   -(B_k^-1 y_k s_k^T + s_k y_k^T B_k^-1) / (s_k^T y_k)
    //
    // y_k^T B_k^-1 y_k is a scalar, and the third term is a rank-two update
    // where B_k^-1 y_k is one vector and s_k is the other. Compute the update
    // values then actually perform the rank updates.
    yBy := mat64.Inner(b.yVec, b.invHess, b.yVec)
    firstTermConst := (sDotY + yBy) / (sDotYSquared)
    b.tmpVec.MulVec(b.invHess, b.yVec)

    b.invHess.RankTwo(b.invHess, -1/sDotY, b.tmpVec, b.sVec)
    b.invHess.SymRankOne(b.invHess, firstTermConst, b.sVec)

    // Update the stored BFGS data for the new iteration.
    copy(b.x, loc.X)
    copy(b.grad, loc.Gradient)

    // Compute the new search direction.
    d := mat64.NewVector(b.dim, dir)
    g := mat64.NewVector(b.dim, loc.Gradient)

    d.MulVec(b.invHess, g) // new direction stored in place
    floats.Scale(-1, dir)
    return 1
}
func (g *GradOptimizable) Init() error {
    if g.Losser == nil {
        g.Losser = loss.SquaredDistance{}
    }
    if g.Regularizer == nil {
        g.Regularizer = regularize.None{}
    }
    if g.Sampler == nil {
        g.Sampler = &Batch{}
    }

    if g.Inputs == nil {
        return errors.New("No input data")
    }

    nSamples, _ := g.Inputs.Dims()
    if nSamples == 0 {
        return errors.New("No input data")
    }

    if g.NumWorkers == 0 {
        g.NumWorkers = 1
    }

    outputSamples, outputDim := g.Outputs.Dims()
    if outputSamples != nSamples {
        return errors.New("gradoptimize: input and output row mismatch")
    }

    nParameters := g.Trainable.NumParameters()

    batches := make([]batchSend, g.NumWorkers+1) // +1 is for the regularizer.
    for i := range batches {
        batches[i].deriv = make([]float64, nParameters)
    }
    g.batches = batches

    g.grainSize = g.Trainable.GrainSize()

    g.Sampler.Init(nSamples)

    g.features = FeaturizeTrainable(g.Trainable, g.Inputs, nil)

    work := make(chan batchSend, g.NumWorkers)
    done := make(chan batchSend, g.NumWorkers)
    regularizeChan := make(chan batchSend, 1)
    regDone := make(chan batchSend, 1)
    quit := make(chan struct{})

    g.sendWork = work
    g.receiveWork = done
    g.quit = quit
    g.regularizeChan = regularizeChan
    g.regDone = regDone

    // Launch workers.
    for worker := 0; worker < g.NumWorkers; worker++ {
        go func(outputDim, nParameters int) {
            lossDeriver := g.Trainable.NewLossDeriver()
            predOutput := make([]float64, outputDim)
            dLossDPred := make([]float64, outputDim)
            dLossDParam := make([]float64, nParameters)
            outputs := make([]float64, outputDim)
            for {
                select {
                case w := <-work:
                    // Zero out the existing derivative.
                    w.loss = 0
                    for i := range w.deriv {
                        w.deriv[i] = 0
                    }
                    for _, idx := range w.idxs {
                        lossDeriver.Predict(w.parameters, g.features.RawRowView(idx), predOutput)
                        g.Outputs.Row(outputs, idx)
                        loss := g.Losser.LossDeriv(predOutput, outputs, dLossDPred)
                        if g.Weights == nil {
                            w.loss += loss
                        } else {
                            w.loss += g.Weights[idx] * loss
                        }
                        lossDeriver.Deriv(w.parameters, g.features.RawRowView(idx), predOutput, dLossDPred, dLossDParam)
                        if g.Weights != nil {
                            floats.Scale(g.Weights[idx], dLossDParam)
                        }
                        floats.Add(w.deriv, dLossDParam)
                    }
                    // Send the result back.
                    done <- w
                case <-quit:
                    return
                }
            }
        }(outputDim, nParameters)
    }

    // Launch the regularizer.
    go func() {
        for {
            select {
            case w := <-regularizeChan:
                loss := g.Regularizer.LossDeriv(w.parameters, w.deriv)
                w.loss = loss
                regDone <- w
            case <-quit:
                return
            }
        }
    }()
    return nil
}
func testDtrevc3(t *testing.T, impl Dtrevc3er, side lapack.EigVecSide, howmny lapack.HowMany, tmat blas64.General, optwork bool, rnd *rand.Rand) {
    const tol = 1e-14

    n := tmat.Rows
    extra := tmat.Stride - tmat.Cols
    right := side != lapack.LeftEigVec
    left := side != lapack.RightEigVec

    var selected, selectedWant []bool
    var mWant int // How many columns will the eigenvectors occupy.
    if howmny == lapack.SelectedEigVec {
        selected = make([]bool, n)
        selectedWant = make([]bool, n)
        // Dtrevc3 will compute only selected eigenvectors. Pick them
        // randomly disregarding whether they are real or complex.
        for i := range selected {
            if rnd.Float64() < 0.5 {
                selected[i] = true
            }
        }
        // Dtrevc3 will modify (standardize) the slice selected based on
        // whether the corresponding eigenvalues are real or complex. Do
        // the same process here to fill selectedWant.
        for i := 0; i < n; {
            if i == n-1 || tmat.Data[(i+1)*tmat.Stride+i] == 0 {
                // Real eigenvalue.
                if selected[i] {
                    selectedWant[i] = true
                    mWant++ // Real eigenvectors occupy one column.
                }
                i++
            } else {
                // Complex eigenvalue.
                if selected[i] || selected[i+1] {
                    // Dtrevc3 will modify selected so that
                    // only the first element of the pair is
                    // true.
                    selectedWant[i] = true
                    mWant += 2 // Complex eigenvectors occupy two columns.
                }
                i += 2
            }
        }
    } else {
        // All eigenvectors occupy n columns.
        mWant = n
    }

    var vr blas64.General
    if right {
        if howmny == lapack.AllEigVecMulQ {
            vr = eye(n, n+extra)
        } else {
            // VR will be overwritten.
            vr = nanGeneral(n, mWant, n+extra)
        }
    }
    var vl blas64.General
    if left {
        if howmny == lapack.AllEigVecMulQ {
            vl = eye(n, n+extra)
        } else {
            // VL will be overwritten.
            vl = nanGeneral(n, mWant, n+extra)
        }
    }

    work := make([]float64, max(1, 3*n))
    if optwork {
        impl.Dtrevc3(side, howmny, nil, n, nil, 1, nil, 1, nil, 1, mWant, work, -1)
        work = make([]float64, int(work[0]))
    }

    m := impl.Dtrevc3(side, howmny, selected, n, tmat.Data, tmat.Stride, vl.Data, vl.Stride, vr.Data, vr.Stride, mWant, work, len(work))

    prefix := fmt.Sprintf("Case side=%v, howmny=%v, n=%v, extra=%v, optwk=%v", side, howmny, n, extra, optwork)

    if !generalOutsideAllNaN(tmat) {
        t.Errorf("%v: out-of-range write to T", prefix)
    }
    if !generalOutsideAllNaN(vl) {
        t.Errorf("%v: out-of-range write to VL", prefix)
    }
    if !generalOutsideAllNaN(vr) {
        t.Errorf("%v: out-of-range write to VR", prefix)
    }

    if m != mWant {
        t.Errorf("%v: unexpected value of m. Want %v, got %v", prefix, mWant, m)
    }

    if howmny == lapack.SelectedEigVec {
        for i := range selected {
            if selected[i] != selectedWant[i] {
                t.Errorf("%v: unexpected selected[%v]", prefix, i)
            }
        }
    }

    // Check that the columns of VR and VL are actually eigenvectors and
    // that the magnitude of their largest element is 1.
    var k int
    for j := 0; j < n; {
        re := tmat.Data[j*tmat.Stride+j]
        if j == n-1 || tmat.Data[(j+1)*tmat.Stride+j] == 0 {
            if howmny == lapack.SelectedEigVec && !selected[j] {
                j++
                continue
            }
            if right {
                ev := columnOf(vr, k)
                norm := floats.Norm(ev, math.Inf(1))
                if math.Abs(norm-1) > tol {
                    t.Errorf("%v: magnitude of largest element of VR[:,%v] not 1", prefix, k)
                }
                if !isRightEigenvectorOf(tmat, ev, nil, complex(re, 0), tol) {
                    t.Errorf("%v: VR[:,%v] is not real right eigenvector", prefix, k)
                }
            }
            if left {
                ev := columnOf(vl, k)
                norm := floats.Norm(ev, math.Inf(1))
                if math.Abs(norm-1) > tol {
                    t.Errorf("%v: magnitude of largest element of VL[:,%v] not 1", prefix, k)
                }
                if !isLeftEigenvectorOf(tmat, ev, nil, complex(re, 0), tol) {
                    t.Errorf("%v: VL[:,%v] is not real left eigenvector", prefix, k)
                }
            }
            k++
            j++
            continue
        }

        if howmny == lapack.SelectedEigVec && !selected[j] {
            j += 2
            continue
        }

        im := math.Sqrt(math.Abs(tmat.Data[(j+1)*tmat.Stride+j])) * math.Sqrt(math.Abs(tmat.Data[j*tmat.Stride+j+1]))

        if right {
            evre := columnOf(vr, k)
            evim := columnOf(vr, k+1)
            var evmax float64
            for i, v := range evre {
                evmax = math.Max(evmax, math.Abs(v)+math.Abs(evim[i]))
            }
            if math.Abs(evmax-1) > tol {
                t.Errorf("%v: magnitude of largest element of VR[:,%v] not 1", prefix, k)
            }
            if !isRightEigenvectorOf(tmat, evre, evim, complex(re, im), tol) {
                t.Errorf("%v: VR[:,%v:%v] is not complex right eigenvector", prefix, k, k+1)
            }
            floats.Scale(-1, evim)
            if !isRightEigenvectorOf(tmat, evre, evim, complex(re, -im), tol) {
                t.Errorf("%v: VR[:,%v:%v] is not complex right eigenvector", prefix, k, k+1)
            }
        }
        if left {
            evre := columnOf(vl, k)
            evim := columnOf(vl, k+1)
            var evmax float64
            for i, v := range evre {
                evmax = math.Max(evmax, math.Abs(v)+math.Abs(evim[i]))
            }
            if math.Abs(evmax-1) > tol {
                t.Errorf("%v: magnitude of largest element of VL[:,%v] not 1", prefix, k)
            }
            if !isLeftEigenvectorOf(tmat, evre, evim, complex(re, im), tol) {
                t.Errorf("%v: VL[:,%v:%v] is not complex left eigenvector", prefix, k, k+1)
            }
            floats.Scale(-1, evim)
            if !isLeftEigenvectorOf(tmat, evre, evim, complex(re, -im), tol) {
                t.Errorf("%v: VL[:,%v:%v] is not complex left eigenvector", prefix, k, k+1)
            }
        }
        k += 2
        j += 2
    }
}
// Convert converts a general-form LP into a standard form LP.
// The general form of an LP is:
//   minimize c^T * x
//   s.t      G * x <= h
//            A * x = b
// And the standard form is:
//   minimize cNew^T * x
//   s.t      aNew * x = bNew
//            x >= 0
// If there are no constraints of the given type, the inputs may be nil.
func Convert(c []float64, g mat64.Matrix, h []float64, a mat64.Matrix, b []float64) (cNew []float64, aNew *mat64.Dense, bNew []float64) {
    nVar := len(c)
    nIneq := len(h)

    // Check input sizes.
    if g == nil {
        if nIneq != 0 {
            panic(badShape)
        }
    } else {
        gr, gc := g.Dims()
        if gr != nIneq {
            panic(badShape)
        }
        if gc != nVar {
            panic(badShape)
        }
    }

    nEq := len(b)
    if a == nil {
        if nEq != 0 {
            panic(badShape)
        }
    } else {
        ar, ac := a.Dims()
        if ar != nEq {
            panic(badShape)
        }
        if ac != nVar {
            panic(badShape)
        }
    }

    // Convert the general form LP.
    // Derivation:
    // 0. Start with the general form
    //    min.  c^T * x
    //    s.t.  G * x <= h
    //          A * x = b
    // 1. Introduce a slack variable for each inequality constraint
    //    min.  c^T * x
    //    s.t.  G * x + s = h
    //          A * x = b
    //          s >= 0
    // 2. Add non-negativity constraints for x by splitting x
    //    into positive and negative components
    //    x = xp - xn
    //    xp >= 0, xn >= 0
    //    This makes the LP
    //    min.  c^T * xp - c^T * xn
    //    s.t.  G * xp - G * xn + s = h
    //          A * xp - A * xn = b
    //          xp >= 0, xn >= 0, s >= 0
    // 3. Write the above in standard form with xt = [xp; xn; s]:
    //    min.  [c^T, -c^T, 0] xt
    //    s.t.  [G, -G, I] xt = h
    //          [A, -A, 0] xt = b
    //          xt >= 0

    // The new size of x is [xp, xn, s].
    nNewVar := nVar + nVar + nIneq

    // Construct cNew = [c; -c; 0].
    cNew = make([]float64, nNewVar)
    copy(cNew, c)
    copy(cNew[nVar:], c)
    floats.Scale(-1, cNew[nVar:2*nVar])

    // The new number of equality constraints is the number of total constraints.
    nNewEq := nIneq + nEq

    // Construct bNew = [h; b].
    bNew = make([]float64, nNewEq)
    copy(bNew, h)
    copy(bNew[nIneq:], b)

    // Construct aNew = [G, -G, I; A, -A, 0].
    aNew = mat64.NewDense(nNewEq, nNewVar, nil)
    if nIneq != 0 {
        aView := (aNew.View(0, 0, nIneq, nVar)).(*mat64.Dense)
        aView.Copy(g)
        aView = (aNew.View(0, nVar, nIneq, nVar)).(*mat64.Dense)
        aView.Scale(-1, g)
        aView = (aNew.View(0, 2*nVar, nIneq, nIneq)).(*mat64.Dense)
        for i := 0; i < nIneq; i++ {
            aView.Set(i, i, 1)
        }
    }
    if nEq != 0 {
        aView := (aNew.View(nIneq, 0, nEq, nVar)).(*mat64.Dense)
        aView.Copy(a)
        aView = (aNew.View(nIneq, nVar, nEq, nVar)).(*mat64.Dense)
        aView.Scale(-1, a)
    }
    return cNew, aNew, bNew
}
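// A small, hypothetical usage sketch of Convert for an LP with a single
// inequality constraint and no equality constraints. The lp import path below
// is an assumption (a gonum-style linear programming package); point it at
// wherever Convert is actually defined.
package main

import (
    "fmt"

    "github.com/gonum/matrix/mat64"

    "github.com/gonum/optimize/convex/lp" // assumed location of Convert
)

func main() {
    // minimize -x0 - 2*x1  subject to  x0 + x1 <= 4.
    c := []float64{-1, -2}
    g := mat64.NewDense(1, 2, []float64{1, 1})
    h := []float64{4}

    // No equality constraints, so A and b are nil.
    cNew, aNew, bNew := lp.Convert(c, g, h, nil, nil)
    fmt.Println("cNew:", cNew) // [c; -c; 0] = [-1 -2 1 2 0]
    fmt.Println("bNew:", bNew) // [h; b] = [4]
    fmt.Printf("aNew = [G, -G, I]:\n%v\n", mat64.Formatted(aNew))
}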
func (g *GradientDescent) NextDirection(loc *Location, dir []float64) (stepSize float64) {
    copy(dir, loc.Gradient)
    floats.Scale(-1, dir)
    return g.StepSizer.StepSize(loc, dir)
}
// normalSlice normalizes the vector of values, summing them and dividing each
// by the total.
func normalSlice(v []float64) {
    tot := floats.Sum(v)
    floats.Scale(1.0/tot, v)
}
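// A tiny self-contained example of the same normalization pattern applied
// directly with the floats package; the values are invented for illustration,
// and the github.com/gonum/floats import path is an assumption based on the
// pre-module gonum layout.
package main

import (
    "fmt"

    "github.com/gonum/floats"
)

func main() {
    v := []float64{1, 2, 3, 4}
    // Divide every element by the total so the slice sums to one.
    floats.Scale(1/floats.Sum(v), v)
    fmt.Println(v)             // each element is now a fraction of the total
    fmt.Println(floats.Sum(v)) // ≈ 1, up to floating-point rounding
}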