// FuncGrad computes the average loss and derivative over the sampled training
// indices. Not callable in parallel because the batches are reused between calls.
func (g *GradOptimizable) FuncGrad(params []float64, deriv []float64) float64 {
	inds := g.Sampler.Iterate()
	total := len(inds)

	var totalLoss float64
	for i := range deriv {
		deriv[i] = 0
	}

	// Send the regularizer.
	g.batches[0].parameters = params
	g.regularizeChan <- g.batches[0]

	// Send the initial batches out.
	var initBatches int
	var lastSent int
	for i := 0; i < g.NumWorkers; i++ {
		if lastSent == total {
			break
		}
		add := g.grainSize
		if lastSent+add >= total {
			add = total - lastSent
		}
		initBatches++
		g.batches[i+1].idxs = inds[lastSent : lastSent+add]
		g.batches[i+1].parameters = params
		g.sendWork <- g.batches[i+1]
		lastSent += add
	}

	// Collect finished batches and resend them with new indices.
	for lastSent < total {
		batch := <-g.receiveWork
		totalLoss += batch.loss
		floats.Add(deriv, batch.deriv)
		add := g.grainSize
		if lastSent+add >= total {
			add = total - lastSent
		}
		batch.idxs = inds[lastSent : lastSent+add]
		g.sendWork <- batch
		lastSent += add
	}

	// All indices have been sent, so just wait for the remaining results.
	for i := 0; i < initBatches; i++ {
		batch := <-g.receiveWork
		totalLoss += batch.loss
		floats.Add(deriv, batch.deriv)
	}
	batch := <-g.regDone
	totalLoss += batch.loss
	floats.Add(deriv, batch.deriv)

	totalLoss /= float64(len(inds))
	floats.Scale(1/float64(len(inds)), deriv)
	return totalLoss
}
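// The batching logic in FuncGrad is easier to see in isolation. The sketch
// below is illustrative only (the helper name chunkIndices is hypothetical and
// not part of this package): it splits a slice of sample indices into
// grain-sized chunks, which is the same partitioning FuncGrad performs before
// handing work to the workers and averaging the returned losses and derivatives.
func chunkIndices(inds []int, grainSize int) [][]int {
	var chunks [][]int
	for start := 0; start < len(inds); start += grainSize {
		end := start + grainSize
		if end > len(inds) {
			end = len(inds)
		}
		chunks = append(chunks, inds[start:end])
	}
	return chunks
}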
// UpdateOne updates sufficient statistics using one observation.
func (g *Model) UpdateOne(o model.Obs, w float64) {

	glog.V(6).Infof("gaussian update, name:%s, obs:%v, weight:%e", g.ModelName, o, w)

	/* Update sufficient statistics. */
	obs, _, _ := model.ObsToF64(o)
	floatx.Apply(floatx.ScaleFunc(w), obs, g.tmpArray)
	floats.Add(g.Sumx, g.tmpArray)
	floatx.Sq(g.tmpArray, obs)
	floats.Scale(w, g.tmpArray)
	floats.Add(g.Sumxsq, g.tmpArray)
	g.NSamples += w
}
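// For reference, Sumx and Sumxsq accumulated above are the standard weighted
// sufficient statistics of a diagonal Gaussian. A minimal sketch of how the
// mean and variance could be recovered from them (the helper name
// meanAndVariance is hypothetical; the estimation step used by this package
// may differ):
func meanAndVariance(sumx, sumxsq []float64, nSamples float64) (mean, variance []float64) {
	mean = make([]float64, len(sumx))
	variance = make([]float64, len(sumx))
	for i := range sumx {
		mean[i] = sumx[i] / nSamples
		variance[i] = sumxsq[i]/nSamples - mean[i]*mean[i]
	}
	return mean, variance
}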
func TestJensenShannon(t *testing.T) {
	for i, test := range []struct {
		p []float64
		q []float64
	}{
		{
			p: []float64{0.5, 0.1, 0.3, 0.1},
			q: []float64{0.1, 0.4, 0.25, 0.25},
		},
		{
			p: []float64{0.4, 0.6, 0.0},
			q: []float64{0.2, 0.2, 0.6},
		},
		{
			p: []float64{0.1, 0.1, 0.0, 0.8},
			q: []float64{0.6, 0.3, 0.0, 0.1},
		},
		{
			p: []float64{0.5, 0.1, 0.3, 0.1},
			q: []float64{0.5, 0, 0.25, 0.25},
		},
		{
			p: []float64{0.5, 0.1, 0, 0.4},
			q: []float64{0.1, 0.4, 0.25, 0.25},
		},
	} {
		m := make([]float64, len(test.p))
		p := test.p
		q := test.q
		floats.Add(m, p)
		floats.Add(m, q)
		floats.Scale(0.5, m)

		js1 := 0.5*KullbackLeibler(p, m) + 0.5*KullbackLeibler(q, m)
		js2 := JensenShannon(p, q)

		if math.IsNaN(js2) {
			t.Errorf("In case %v, JS distance is NaN", i)
		}
		if math.Abs(js1-js2) > 1e-14 {
			t.Errorf("JS mismatch case %v. Expected %v, found %v.", i, js1, js2)
		}
	}
	if !Panics(func() { JensenShannon(make([]float64, 3), make([]float64, 2)) }) {
		t.Errorf("JensenShannon did not panic with p, q length mismatch")
	}
}
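// The reference value js1 in the test above is just the definition written
// out: JS(p, q) = 0.5*KL(p||m) + 0.5*KL(q||m) with m = (p+q)/2. A hedged
// sketch of that same computation as a helper (the name jsReference is
// hypothetical; it assumes p and q are valid distributions of equal length):
func jsReference(p, q []float64) float64 {
	m := make([]float64, len(p))
	floats.Add(m, p)
	floats.Add(m, q)
	floats.Scale(0.5, m)
	return 0.5*KullbackLeibler(p, m) + 0.5*KullbackLeibler(q, m)
}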
// transformNormal performs the same operation as TransformNormal except no
// safety checks are performed and both input slices must be non-nil.
func (n *Normal) transformNormal(dst, normal []float64) []float64 {
	srcVec := mat64.NewVector(n.dim, normal)
	dstVec := mat64.NewVector(n.dim, dst)
	dstVec.MulVec(&n.lower, srcVec)
	floats.Add(dst, n.mu)
	return dst
}
// returnNext updates the location based on the iteration type and the current
// simplex, and returns the next operation.
func (n *NelderMead) returnNext(iter nmIterType, loc *Location) (Operation, error) {
	n.lastIter = iter
	switch iter {
	case nmMajor:
		// Fill loc with the current best point and value,
		// and command a convergence check.
		copy(loc.X, n.vertices[0])
		loc.F = n.values[0]
		return MajorIteration, nil
	case nmReflected, nmExpanded, nmContractedOutside, nmContractedInside:
		// x_new = x_centroid + scale * (x_centroid - x_worst)
		var scale float64
		switch iter {
		case nmReflected:
			scale = n.reflection
		case nmExpanded:
			scale = n.reflection * n.expansion
		case nmContractedOutside:
			scale = n.reflection * n.contraction
		case nmContractedInside:
			scale = -n.contraction
		}
		dim := len(loc.X)
		floats.SubTo(loc.X, n.centroid, n.vertices[dim])
		floats.Scale(scale, loc.X)
		floats.Add(loc.X, n.centroid)
		if iter == nmReflected {
			copy(n.reflectedPoint, loc.X)
		}
		return FuncEvaluation, nil
	case nmShrink:
		// x_shrink = x_best + delta * (x_i - x_best)
		floats.SubTo(loc.X, n.vertices[n.fillIdx], n.vertices[0])
		floats.Scale(n.shrink, loc.X)
		floats.Add(loc.X, n.vertices[0])
		return FuncEvaluation, nil
	default:
		panic("unreachable")
	}
}
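// To make the update above concrete, reflection (and its expansion/contraction
// variants) moves the worst vertex through the centroid:
// x_new = x_c + scale*(x_c - x_worst). A minimal sketch with plain slices
// (the helper name reflectPoint is hypothetical and not part of this package):
func reflectPoint(centroid, worst []float64, scale float64) []float64 {
	xNew := make([]float64, len(centroid))
	floats.SubTo(xNew, centroid, worst) // x_c - x_worst
	floats.Scale(scale, xNew)
	floats.Add(xNew, centroid)
	return xNew
}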
// ObjGrad computes the objective value and stores the derivative in place.
func (g *BatchGradBased) ObjGrad(parameters []float64, derivative []float64) (loss float64) {
	c := make(chan lossDerivStruct, 10)

	// Set the function for the parallel for loop.
	f := func(start, end int) { g.lossDerivFunc(start, end, c, parameters) }

	go func() {
		wg := &sync.WaitGroup{}
		// Compute the losses and the derivatives all in parallel.
		wg.Add(2)
		go func() {
			common.ParallelFor(g.nTrain, g.grainSize, f)
			wg.Done()
		}()
		// Compute the regularization.
		go func() {
			deriv := make([]float64, g.nParameters)
			loss := g.regularizer.LossDeriv(parameters, deriv)
			c <- lossDerivStruct{
				loss:  loss,
				deriv: deriv,
			}
			wg.Done()
		}()
		// Wait for all of the results to be sent on the channel, then close it.
		wg.Wait()
		close(c)
	}()

	// Zero the derivative.
	for i := range derivative {
		derivative[i] = 0
	}

	// Range over the channel, accumulating the loss and derivative
	// as the results come in.
	for l := range c {
		loss += l.loss
		floats.Add(derivative, l.deriv)
	}

	// Normalize by the number of training samples.
	loss /= float64(g.nTrain)
	floats.Scale(1/float64(g.nTrain), derivative)
	return loss
}
// returnNext finds the next location to evaluate, stores the location in xNext,
// and returns the evaluation and iteration types.
func (n *NelderMead) returnNext(iter nmIterType, xNext []float64) (EvaluationType, IterationType, error) {
	dim := len(xNext)
	n.lastIter = iter
	switch iter {
	case nmReflected, nmExpanded, nmContractedOutside, nmContractedInside:
		// x_new = x_centroid + scale * (x_centroid - x_worst)
		var scale float64
		switch iter {
		case nmReflected:
			scale = n.reflection
		case nmExpanded:
			scale = n.reflection * n.expansion
		case nmContractedOutside:
			scale = n.reflection * n.contraction
		case nmContractedInside:
			scale = -n.contraction
		}
		floats.SubTo(xNext, n.centroid, n.vertices[dim])
		floats.Scale(scale, xNext)
		floats.Add(xNext, n.centroid)
		if iter == nmReflected {
			copy(n.reflectedPoint, xNext)
			// Nelder-Mead iterations start with a reflection step.
			return FuncEvaluation, MajorIteration, nil
		}
		return FuncEvaluation, MinorIteration, nil
	case nmShrink:
		// x_shrink = x_best + delta * (x_i - x_best)
		floats.SubTo(xNext, n.vertices[n.fillIdx], n.vertices[0])
		floats.Scale(n.shrink, xNext)
		floats.Add(xNext, n.vertices[0])
		return FuncEvaluation, SubIteration, nil
	default:
		panic("unreachable")
	}
}
// Rand generates a random sample according to the distribution.
// If the input slice is nil, new memory is allocated, otherwise the result is
// stored in place.
func (n *Normal) Rand(x []float64) []float64 {
	x = reuseAs(x, n.dim)
	tmp := make([]float64, n.dim)
	if n.src == nil {
		for i := range x {
			tmp[i] = rand.NormFloat64()
		}
	} else {
		for i := range x {
			tmp[i] = n.src.NormFloat64()
		}
	}
	tmpVec := mat64.NewVector(n.dim, tmp)
	xVec := mat64.NewVector(n.dim, x)
	xVec.MulVec(n.chol, true, tmpVec)
	floats.Add(x, n.mu)
	return x
}
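// The sampling step above is the usual affine transform x = mu + L*z, where z
// is a vector of standard normals and L is a lower-triangular factor of the
// covariance (here obtained from the stored Cholesky factorization). A hedged
// plain-slice sketch of that transform (lowerMulAdd is a hypothetical helper;
// L is assumed stored row-major as a dim-by-dim matrix):
func lowerMulAdd(mu, z, L []float64, dim int) []float64 {
	x := make([]float64, dim)
	for i := 0; i < dim; i++ {
		var sum float64
		for j := 0; j <= i; j++ { // L is lower triangular
			sum += L[i*dim+j] * z[j]
		}
		x[i] = mu[i] + sum
	}
	return x
}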
// UpdateOne updates the mixture sufficient statistics using one observation.
func (gmm *Model) UpdateOne(o model.Obs, w float64) {

	obs, _, _ := model.ObsToF64(o)
	maxProb := gmm.logProbInternal(obs, gmm.tmpProbs)
	gmm.Likelihood += maxProb
	floatx.Apply(floatx.AddScalarFunc(-maxProb+math.Log(w)), gmm.tmpProbs, nil)

	// Compute posterior probabilities.
	floatx.Exp(gmm.tmpProbs, gmm.tmpProbs)

	// Update posterior sum, needed to compute mixture weights.
	floats.Add(gmm.PosteriorSum, gmm.tmpProbs)

	// Update Gaussian components.
	for i, c := range gmm.Components {
		c.UpdateOne(o, gmm.tmpProbs[i])
	}

	// Count number of observations.
	gmm.NSamples += w
}
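// The update above computes per-component responsibilities in log space and
// then exponentiates: post_i = w * exp(logp_i - logTotal). A hedged sketch of
// that step with plain slices (responsibilities is a hypothetical helper;
// logTotal stands in for the value returned by logProbInternal, which is
// assumed here to be the total log probability of the observation):
func responsibilities(logProbs []float64, logTotal, w float64) []float64 {
	post := make([]float64, len(logProbs))
	for i, lp := range logProbs {
		post[i] = math.Exp(lp - logTotal + math.Log(w))
	}
	return post
}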
// NewBatchGradBased creates a new BatchGradBased with the given inputs.
func NewBatchGradBased(trainable Trainable, cacheFeatures bool, inputs, outputs common.RowMatrix, weights []float64, losser loss.DerivLosser, regularizer regularize.Regularizer) *BatchGradBased {
	var features *mat64.Dense
	if cacheFeatures {
		features = FeaturizeTrainable(trainable, inputs, nil)
	}

	// TODO: Add in error checking
	if losser == nil {
		losser = loss.SquaredDistance{}
	}
	if regularizer == nil {
		regularizer = regularize.None{}
	}
	if weights != nil {
		// TODO: Fix weights
		panic("non-nil weights")
	}

	nTrain, outputDim := outputs.Dims()
	_, inputDim := inputs.Dims()

	g := &BatchGradBased{
		t:           trainable,
		inputs:      inputs,
		outputs:     outputs,
		features:    features,
		losser:      losser,
		regularizer: regularizer,
		nTrain:      nTrain,
		outputDim:   outputDim,
		inputDim:    inputDim,
		nParameters: trainable.NumParameters(),
		grainSize:   trainable.GrainSize(),
	}

	// TODO: Add in row viewer stuff
	// TODO: Create a different function for computing just the loss
	//inputRowViewer, ok := inputs.(mat64.RowViewer)
	//outputRowViewer, ok := outputs.(mat64.RowViewer)

	// TODO: Move this to its own function
	var f func(start, end int, c chan lossDerivStruct, parameters []float64)
	switch {
	default:
		panic("Shouldn't be here")
	case cacheFeatures:
		f = func(start, end int, c chan lossDerivStruct, parameters []float64) {
			lossDeriver := g.t.NewLossDeriver()
			prediction := make([]float64, g.outputDim)
			dLossDPred := make([]float64, g.outputDim)
			dLossDWeight := make([]float64, g.nParameters)
			totalDLossDWeight := make([]float64, g.nParameters)
			var loss float64
			output := make([]float64, g.outputDim)
			for i := start; i < end; i++ {
				// Compute the prediction.
				lossDeriver.Predict(parameters, g.features.RawRowView(i), prediction)
				// Compute the loss.
				g.outputs.Row(output, i)
				loss += g.losser.LossDeriv(prediction, output, dLossDPred)
				// Compute the derivative.
				lossDeriver.Deriv(parameters, g.features.RawRowView(i), prediction, dLossDPred, dLossDWeight)
				floats.Add(totalDLossDWeight, dLossDWeight)
			}
			// Send the accumulated value back on the channel.
			c <- lossDerivStruct{
				loss:  loss,
				deriv: totalDLossDWeight,
			}
		}
	case !cacheFeatures:
		f = func(start, end int, c chan lossDerivStruct, parameters []float64) {
			lossDeriver := g.t.NewLossDeriver()
			prediction := make([]float64, g.outputDim)
			dLossDPred := make([]float64, g.outputDim)
			dLossDWeight := make([]float64, g.nParameters)
			totalDLossDWeight := make([]float64, g.nParameters)
			var loss float64
			output := make([]float64, g.outputDim)
			input := make([]float64, g.inputDim)
			features := make([]float64, g.t.NumFeatures())
			featurizer := g.t.NewFeaturizer()
			for i := start; i < end; i++ {
				g.inputs.Row(input, i)
				featurizer.Featurize(input, features)
				// Compute the prediction.
				lossDeriver.Predict(parameters, features, prediction)
				// Compute the loss.
				g.outputs.Row(output, i)
				loss += g.losser.LossDeriv(prediction, output, dLossDPred)
				// Compute the derivative and add it to the total.
				lossDeriver.Deriv(parameters, features, prediction, dLossDPred, dLossDWeight)
				floats.Add(totalDLossDWeight, dLossDWeight)
			}
			// Send the accumulated value back on the channel.
			c <- lossDerivStruct{
				loss:  loss,
				deriv: totalDLossDWeight,
			}
		}
	}
	g.lossDerivFunc = f
	return g
}
func (lbfgs *Lbfgs) Iterate(loc *multi.Location, obj *uni.Objective, grad *multi.Gradient, fun optimize.MultiObjGrad) (status.Status, error) {
	counter := lbfgs.counter
	q := lbfgs.q
	a := lbfgs.a
	b := lbfgs.b
	rhoHist := lbfgs.rhoHist
	sHist := lbfgs.sHist
	yHist := lbfgs.yHist
	gamma_k := lbfgs.gamma_k
	tmp := lbfgs.tmp
	p_k := lbfgs.p_k
	s_k := lbfgs.s_k
	y_k := lbfgs.y_k
	z := lbfgs.z

	// Calculate the search direction.
	for i, val := range grad.Curr() {
		q[i] = val
	}
	for i := counter - 1; i >= 0; i-- {
		a[i] = rhoHist[i] * floats.Dot(sHist[i], q)
		copy(tmp, yHist[i])
		floats.Scale(a[i], tmp)
		floats.Sub(q, tmp)
	}
	for i := lbfgs.NumStore - 1; i >= counter; i-- {
		a[i] = rhoHist[i] * floats.Dot(sHist[i], q)
		copy(tmp, yHist[i])
		floats.Scale(a[i], tmp)
		floats.Sub(q, tmp)
	}

	// Assume H_0 is the identity times gamma_k.
	copy(z, q)
	floats.Scale(gamma_k, z)

	// Second loop of the update, going from oldest to newest.
	for i := counter; i < lbfgs.NumStore; i++ {
		b[i] = rhoHist[i] * floats.Dot(yHist[i], z)
		copy(tmp, sHist[i])
		floats.Scale(a[i]-b[i], tmp)
		floats.Add(z, tmp)
	}
	for i := 0; i < counter; i++ {
		b[i] = rhoHist[i] * floats.Dot(yHist[i], z)
		copy(tmp, sHist[i])
		floats.Scale(a[i]-b[i], tmp)
		floats.Add(z, tmp)
	}

	lbfgs.a = a
	lbfgs.b = b

	copy(p_k, z)
	floats.Scale(-1, p_k)
	normP_k := floats.Norm(p_k, 2)

	// Perform the line search -- need to find some way to improve this,
	// especially the bookkeeping of function values.
	linesearchResult, err := linesearch.Linesearch(fun, lbfgs.LinesearchMethod, lbfgs.LinesearchSettings, lbfgs.Wolfe, p_k, loc.Curr(), obj.Curr(), grad.Curr())

	// In the future, add a check to switch to a different linesearcher?
	if err != nil {
		return status.LinesearchFailure, err
	}
	x_kp1 := linesearchResult.Loc
	f_kp1 := linesearchResult.Obj
	g_kp1 := linesearchResult.Grad
	alpha_k := linesearchResult.Step

	// Update the Hessian estimate.
	copy(s_k, p_k)
	floats.Scale(alpha_k, s_k)

	copy(y_k, g_kp1)
	floats.Sub(y_k, grad.Curr())
	skDotYk := floats.Dot(s_k, y_k)

	// Bookkeep the results.
	stepSize := alpha_k * normP_k
	lbfgs.step.AddToHist(stepSize)
	lbfgs.step.SetCurr(stepSize)
	loc.SetCurr(x_kp1)
	obj.SetCurr(f_kp1)
	grad.SetCurr(g_kp1)

	copy(sHist[counter], s_k)
	copy(yHist[counter], y_k)
	rhoHist[counter] = 1 / skDotYk
	lbfgs.gamma_k = skDotYk / floats.Dot(y_k, y_k)

	lbfgs.counter++
	if lbfgs.counter == lbfgs.NumStore {
		lbfgs.counter = 0
	}
	return status.Continue, nil
}
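// The paired loops above are the classic L-BFGS two-loop recursion, split in
// two because the history is stored in a ring buffer indexed by counter. A
// hedged sketch of the same recursion when sHist/yHist are simply ordered from
// oldest to newest (the helper name twoLoopDirection is hypothetical):
func twoLoopDirection(grad []float64, sHist, yHist [][]float64, rho []float64, gamma float64) []float64 {
	q := make([]float64, len(grad))
	copy(q, grad)
	a := make([]float64, len(sHist))
	for i := len(sHist) - 1; i >= 0; i-- { // newest to oldest
		a[i] = rho[i] * floats.Dot(sHist[i], q)
		floats.AddScaled(q, -a[i], yHist[i])
	}
	z := make([]float64, len(q))
	copy(z, q)
	floats.Scale(gamma, z) // H_0 = gamma * I
	for i := 0; i < len(sHist); i++ { // oldest to newest
		b := rho[i] * floats.Dot(yHist[i], z)
		floats.AddScaled(z, a[i]-b, sHist[i])
	}
	floats.Scale(-1, z) // search direction p = -H*grad
	return z
}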
func (b *BatchGradient) funcGrad(params, deriv []float64) float64 {
	nParameters := len(deriv)

	// Send out all of the work.
	done := make(chan result)
	sz := b.nSamples / b.Workers
	sent := 0
	for i := 0; i < b.Workers; i++ {
		outputDim := b.outputDim
		last := sent + sz
		if i == b.Workers-1 {
			last = b.nSamples
		}
		go func(sent, last int) {
			lossDeriver := b.Trainable.NewLossDeriver()
			predOutput := make([]float64, outputDim)
			dLossDPred := make([]float64, outputDim)
			dLossDParam := make([]float64, nParameters)
			outputs := make([]float64, outputDim)
			tmpderiv := make([]float64, nParameters)
			var totalLoss float64
			for i := sent; i < last; i++ {
				lossDeriver.Predict(params, b.features.RawRowView(i), predOutput)
				b.Outputs.Row(outputs, i)
				loss := b.Losser.LossDeriv(predOutput, outputs, dLossDPred)
				if b.Weights == nil {
					totalLoss += loss
				} else {
					totalLoss += b.Weights[i] * loss
				}
				lossDeriver.Deriv(params, b.features.RawRowView(i), predOutput, dLossDPred, dLossDParam)
				if b.Weights != nil {
					floats.Scale(b.Weights[i], dLossDParam)
				}
				floats.Add(tmpderiv, dLossDParam)
			}
			done <- result{totalLoss, tmpderiv}
		}(sent, last)
		sent += sz
	}

	// Collect all the results.
	var totalLoss float64
	for i := range deriv {
		deriv[i] = 0
	}
	for i := 0; i < b.Workers; i++ {
		w := <-done
		totalLoss += w.loss
		floats.Add(deriv, w.deriv)
	}
	// Compute the regularizer.
	if b.Regularizer != nil {
		tmp := make([]float64, nParameters)
		totalLoss += b.Regularizer.LossDeriv(params, tmp)
		floats.Add(deriv, tmp)
	}

	sumWeights := float64(b.nSamples)
	if b.Weights != nil {
		sumWeights = floats.Sum(b.Weights)
	}
	totalLoss /= sumWeights
	floats.Scale(1/sumWeights, deriv)
	return totalLoss
}
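// The structure of funcGrad is a plain fan-out/fan-in: each worker returns a
// partial loss and a partial derivative vector, which are then summed and
// normalized. A stripped-down sketch of the fan-in half of that pattern
// (fanInPartials is a hypothetical helper; it assumes the result struct above
// carries loss and deriv fields):
func fanInPartials(done <-chan result, workers int, deriv []float64) float64 {
	var totalLoss float64
	for i := 0; i < workers; i++ {
		w := <-done
		totalLoss += w.loss
		floats.Add(deriv, w.deriv)
	}
	return totalLoss
}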
// Init checks the inputs, sets defaults, and launches the worker and
// regularizer goroutines.
func (g *GradOptimizable) Init() error {
	if g.Losser == nil {
		g.Losser = loss.SquaredDistance{}
	}
	if g.Regularizer == nil {
		g.Regularizer = regularize.None{}
	}
	if g.Sampler == nil {
		g.Sampler = &Batch{}
	}

	if g.Inputs == nil {
		return errors.New("No input data")
	}

	nSamples, _ := g.Inputs.Dims()
	if nSamples == 0 {
		return errors.New("No input data")
	}

	if g.NumWorkers == 0 {
		g.NumWorkers = 1
	}

	outputSamples, outputDim := g.Outputs.Dims()
	if outputSamples != nSamples {
		return errors.New("gradoptimize: input and output row mismatch")
	}

	nParameters := g.Trainable.NumParameters()

	batches := make([]batchSend, g.NumWorkers+1) // +1 is for the regularizer.
	for i := range batches {
		batches[i].deriv = make([]float64, nParameters)
	}
	g.batches = batches

	g.grainSize = g.Trainable.GrainSize()

	g.Sampler.Init(nSamples)

	g.features = FeaturizeTrainable(g.Trainable, g.Inputs, nil)

	work := make(chan batchSend, g.NumWorkers)
	done := make(chan batchSend, g.NumWorkers)
	regularizeChan := make(chan batchSend, 1)
	regDone := make(chan batchSend, 1)
	quit := make(chan struct{})

	g.sendWork = work
	g.receiveWork = done
	g.quit = quit
	g.regularizeChan = regularizeChan
	g.regDone = regDone

	// Launch the workers.
	for worker := 0; worker < g.NumWorkers; worker++ {
		go func(outputDim, nParameters int) {
			lossDeriver := g.Trainable.NewLossDeriver()
			predOutput := make([]float64, outputDim)
			dLossDPred := make([]float64, outputDim)
			dLossDParam := make([]float64, nParameters)
			outputs := make([]float64, outputDim)
			for {
				select {
				case w := <-work:
					// Zero out the existing loss and derivative.
					w.loss = 0
					for i := range w.deriv {
						w.deriv[i] = 0
					}
					for _, idx := range w.idxs {
						lossDeriver.Predict(w.parameters, g.features.RawRowView(idx), predOutput)
						g.Outputs.Row(outputs, idx)
						loss := g.Losser.LossDeriv(predOutput, outputs, dLossDPred)
						if g.Weights == nil {
							w.loss += loss
						} else {
							w.loss += g.Weights[idx] * loss
						}
						lossDeriver.Deriv(w.parameters, g.features.RawRowView(idx), predOutput, dLossDPred, dLossDParam)
						if g.Weights != nil {
							floats.Scale(g.Weights[idx], dLossDParam)
						}
						floats.Add(w.deriv, dLossDParam)
					}
					// Send the result back.
					done <- w
				case <-quit:
					return
				}
			}
		}(outputDim, nParameters)
	}

	// Launch the regularizer.
	go func() {
		for {
			select {
			case w := <-regularizeChan:
				loss := g.Regularizer.LossDeriv(w.parameters, w.deriv)
				w.loss = loss
				regDone <- w
			case <-quit:
				return
			}
		}
	}()
	return nil
}
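// Each worker launched above follows the standard "work channel plus quit
// channel" loop. A hedged minimal version of that shape, stripped of the loss
// and derivative bookkeeping (workerLoop is a hypothetical helper and jobs are
// represented as plain closures):
func workerLoop(work <-chan func(), quit <-chan struct{}) {
	for {
		select {
		case job := <-work:
			job() // process one batch of work
		case <-quit:
			return
		}
	}
}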
func (g *GP) marginalLikelihoodDerivative(x, grad []float64, trainNoise bool, mem *margLikeMemory) {
	// d/dTheta_j log p(y|X, theta) =
	//   1/2 * y^T * K^-1 * dK/dTheta_j * K^-1 * y - 1/2 * tr(K^-1 * dK/dTheta_j)
	// = 1/2 * alpha^T * dK/dTheta_j * alpha - 1/2 * tr(K^-1 * dK/dTheta_j)
	// Multiplying by the same -2 as the objective gives
	//   -alpha^T * dK/dTheta_j * alpha + tr(K^-1 * dK/dTheta_j)
	// The first term is an inner product.
	n := len(g.outputs)
	nHyper := g.kernel.NumHyper()
	k := mem.k
	chol := mem.chol
	alpha := mem.alpha
	dKdTheta := mem.dKdTheta
	kInvDK := mem.kInvDK

	y := mat64.NewVector(n, g.outputs)

	var noise float64
	if trainNoise {
		noise = math.Exp(x[len(x)-1])
	} else {
		noise = g.noise
	}

	// If x is unchanged, reuse what has already been computed.
	if !floats.Equal(mem.lastX, x) {
		copy(mem.lastX, x)
		g.kernel.SetHyper(x[:nHyper])
		g.setKernelMat(k, noise)
		chol.Factorize(k)
		alpha.SolveCholeskyVec(chol, y)
	}
	g.setKernelMatDeriv(dKdTheta, trainNoise, noise)
	for i := range dKdTheta {
		kInvDK.SolveCholesky(chol, dKdTheta[i])
		inner := mat64.Inner(alpha, dKdTheta[i], alpha)
		grad[i] = -inner + mat64.Trace(kInvDK)
	}
	floats.Scale(1/float64(n), grad)

	bounds := g.kernel.Bounds()
	if trainNoise {
		bounds = append(bounds, Bound{minLogNoise, maxLogNoise})
	}
	barrierGrad := make([]float64, len(grad))
	for i, v := range x {
		// Quadratic barrier penalty.
		if v < bounds[i].Min {
			diff := bounds[i].Min - v
			barrierGrad[i] = -(barrierPow) * math.Pow(diff, barrierPow-1)
		}
		if v > bounds[i].Max {
			diff := v - bounds[i].Max
			barrierGrad[i] = (barrierPow) * math.Pow(diff, barrierPow-1)
		}
	}
	fmt.Println("noise, minNoise", x[len(x)-1], bounds[len(x)-1].Min)
	fmt.Println("barrier Grad", barrierGrad)
	floats.Add(grad, barrierGrad)
}
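// The bound handling above is a one-sided polynomial barrier: the penalty is
// diff^barrierPow outside the box and zero inside, so its gradient is
// +/- barrierPow * diff^(barrierPow-1). A hedged scalar sketch of that
// gradient (barrierGradient1D is a hypothetical helper; pow stands in for
// barrierPow):
func barrierGradient1D(x, min, max, pow float64) float64 {
	switch {
	case x < min:
		return -pow * math.Pow(min-x, pow-1)
	case x > max:
		return pow * math.Pow(x-max, pow-1)
	default:
		return 0
	}
}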