func (s *pingerSuite) calculatePingTimeout(c *gc.C) time.Duration { // Try opening an API connection a few times and take the max // delay among the attempts. attempt := utils.AttemptStrategy{ Delay: coretesting.ShortWait, Min: 3, } var maxTimeout time.Duration for a := attempt.Start(); a.Next(); { openStart := time.Now() st, _ := s.OpenAPIAsNewMachine(c) err := st.Ping() if c.Check(err, jc.ErrorIsNil) { openDelay := time.Since(openStart) c.Logf("API open and initial ping took %v", openDelay) if maxTimeout < openDelay { maxTimeout = openDelay } } if st != nil { c.Check(st.Close(), jc.ErrorIsNil) } } if !c.Failed() && maxTimeout > 0 { return maxTimeout } c.Fatalf("cannot calculate ping timeout") return 0 }
// waitForAgentInitialisation polls the bootstrapped state server with a read-only // command which will fail until the state server is fully initialised. // TODO(wallyworld) - add a bespoke command to maybe the admin facade for this purpose. func (c *BootstrapCommand) waitForAgentInitialisation(ctx *cmd.Context) (err error) { attempts := utils.AttemptStrategy{ Min: bootstrapReadyPollCount, Delay: bootstrapReadyPollDelay, } var client block.BlockListAPI for attempt := attempts.Start(); attempt.Next(); { client, err = blockAPI(&c.EnvCommandBase) if err != nil { return err } _, err = client.List() client.Close() if err == nil { ctx.Infof("Bootstrap complete") return nil } if strings.Contains(err.Error(), apiserver.UpgradeInProgressError.Error()) { ctx.Infof("Waiting for API to become available") continue } return err } return err }
func createWebsocketDialer(cfg *websocket.Config, opts DialOpts) func(<-chan struct{}) (io.Closer, error) { openAttempt := utils.AttemptStrategy{ Total: opts.Timeout, Delay: opts.RetryDelay, } return func(stop <-chan struct{}) (io.Closer, error) { for a := openAttempt.Start(); a.Next(); { select { case <-stop: return nil, parallel.ErrStopped default: } logger.Infof("dialing %q", cfg.Location) conn, err := websocket.DialConfig(cfg) if err == nil { return conn, nil } if a.HasNext() { logger.Debugf("error dialing %q, will retry: %v", cfg.Location, err) } else { logger.Infof("error dialing %q: %v", cfg.Location, err) return nil, errors.Errorf("unable to connect to %q", cfg.Location) } } panic("unreachable") } }
// newWebsocketDialer0 returns a function that dials the websocket represented // by the given configuration with the given dial options, suitable for passing // to utils/parallel.Try.Start. func newWebsocketDialer(cfg *websocket.Config, opts DialOpts) func(<-chan struct{}) (io.Closer, error) { // TODO(katco): 2016-08-09: lp:1611427 openAttempt := utils.AttemptStrategy{ Total: opts.Timeout, Delay: opts.RetryDelay, } return func(stop <-chan struct{}) (io.Closer, error) { for a := openAttempt.Start(); a.Next(); { select { case <-stop: return nil, parallel.ErrStopped default: } logger.Infof("dialing %q", cfg.Location) conn, err := opts.DialWebsocket(cfg) if err == nil { return conn, nil } if !a.HasNext() || isX509Error(err) { // We won't reconnect when there's an X509 error // because we're not going to succeed if we retry // in that case. logger.Infof("error dialing %q: %v", cfg.Location, err) return nil, errors.Annotatef(err, "unable to connect to API") } } panic("unreachable") } }
// waitOperation waits for the provided operation to reach the "done" // status. It follows the given attempt strategy (e.g. wait time between // attempts) and may time out. func (rc *rawConn) waitOperation(projectID string, op *compute.Operation, attempts utils.AttemptStrategy) error { started := time.Now() logger.Infof("GCE operation %q, waiting...", op.Name) for a := attempts.Start(); a.Next(); { if op.Status == StatusDone { break } var err error op, err = rc.checkOperation(projectID, op) if err != nil { return errors.Trace(err) } } if op.Status != StatusDone { err := errors.Errorf("timed out after %d seconds", time.Now().Sub(started)/time.Second) return waitError{op, err} } if op.Error != nil { for _, err := range op.Error.Errors { logger.Errorf("GCE operation error: (%s) %s", err.Code, err.Message) } return waitError{op, nil} } logger.Infof("GCE operation %q finished", op.Name) return nil }
func checkFileHasContents(c *gc.C, stor storage.StorageReader, name string, contents []byte, attempt utils.AttemptStrategy) { r, err := storage.GetWithRetry(stor, name, attempt) c.Assert(err, gc.IsNil) c.Check(r, gc.NotNil) defer r.Close() data, err := ioutil.ReadAll(r) c.Check(err, gc.IsNil) c.Check(data, gc.DeepEquals, contents) url, err := stor.URL(name) c.Assert(err, gc.IsNil) var resp *http.Response for a := attempt.Start(); a.Next(); { resp, err = utils.GetValidatingHTTPClient().Get(url) c.Assert(err, gc.IsNil) if resp.StatusCode != 404 { break } c.Logf("get retrying after earlier get succeeded. *sigh*.") } c.Assert(err, gc.IsNil) data, err = ioutil.ReadAll(resp.Body) c.Assert(err, gc.IsNil) defer resp.Body.Close() c.Assert(resp.StatusCode, gc.Equals, 200, gc.Commentf("error response: %s", data)) c.Check(data, gc.DeepEquals, contents) }
func (c *restoreCommand) Run(ctx *cmd.Context) error { if c.showDescription { fmt.Fprintf(ctx.Stdout, "%s\n", c.Info().Purpose) return nil } if err := c.Log.Start(ctx); err != nil { return err } agentConf, err := extractConfig(c.backupFile) if err != nil { return errors.Annotate(err, "cannot extract configuration from backup file") } progress("extracted credentials from backup file") store, err := configstore.Default() if err != nil { return err } cfg, err := c.Config(store) if err != nil { return err } env, err := rebootstrap(cfg, ctx, c.Constraints) if err != nil { return errors.Annotate(err, "cannot re-bootstrap environment") } progress("connecting to newly bootstrapped instance") var apiState *api.State // The state server backend may not be ready to accept logins so we retry. // We'll do up to 8 retries over 2 minutes to give the server time to come up. // Typically we expect only 1 retry will be needed. attempt := utils.AttemptStrategy{Delay: 15 * time.Second, Min: 8} for a := attempt.Start(); a.Next(); { apiState, err = juju.NewAPIState(env, api.DefaultDialOpts()) if err == nil || errors.Cause(err).Error() != "EOF" { break } progress("bootstrapped instance not ready - attempting to redial") } if err != nil { return errors.Annotate(err, "cannot connect to bootstrap instance") } progress("restoring bootstrap machine") machine0Addr, err := restoreBootstrapMachine(apiState, c.backupFile, agentConf) if err != nil { return errors.Annotate(err, "cannot restore bootstrap machine") } progress("restored bootstrap machine") apiState, err = juju.NewAPIState(env, api.DefaultDialOpts()) progress("opening state") if err != nil { return errors.Annotate(err, "cannot connect to api server") } progress("updating all machines") if err := updateAllMachines(apiState, machine0Addr); err != nil { return errors.Annotate(err, "cannot update machines") } return nil }
// GetWithRetry gets the named file from stor using the specified attempt strategy. func GetWithRetry(stor StorageReader, name string, attempt utils.AttemptStrategy) (r io.ReadCloser, err error) { for a := attempt.Start(); a.Next(); { r, err = stor.Get(name) if err == nil || !stor.ShouldRetry(err) { break } } return r, err }
// ListWithRetry lists the files matching prefix from stor using the specified attempt strategy. func ListWithRetry(stor StorageReader, prefix string, attempt utils.AttemptStrategy) (list []string, err error) { for a := attempt.Start(); a.Next(); { list, err = stor.List(prefix) if err == nil || !stor.ShouldRetry(err) { break } } return list, err }
// WaitForAgentInitialisation polls the bootstrapped controller with a read-only // command which will fail until the controller is fully initialised. // TODO(wallyworld) - add a bespoke command to maybe the admin facade for this purpose. func WaitForAgentInitialisation(ctx *cmd.Context, c *modelcmd.ModelCommandBase, controllerName, hostedModelName string) error { // TODO(katco): 2016-08-09: lp:1611427 attempts := utils.AttemptStrategy{ Min: bootstrapReadyPollCount, Delay: bootstrapReadyPollDelay, } var ( apiAttempts int err error ) // Make a best effort to find the new controller address so we can print it. addressInfo := "" controller, err := c.ClientStore().ControllerByName(controllerName) if err == nil && len(controller.APIEndpoints) > 0 { addr, err := network.ParseHostPort(controller.APIEndpoints[0]) if err == nil { addressInfo = fmt.Sprintf(" at %s", addr.Address.Value) } } ctx.Infof("Contacting Juju controller%s to verify accessibility...", addressInfo) apiAttempts = 1 for attempt := attempts.Start(); attempt.Next(); apiAttempts++ { err = tryAPI(c) if err == nil { ctx.Infof("Bootstrap complete, %q controller now available.", controllerName) ctx.Infof("Controller machines are in the %q model.", bootstrap.ControllerModelName) ctx.Infof("Initial model %q added.", hostedModelName) break } // As the API server is coming up, it goes through a number of steps. // Initially the upgrade steps run, but the api server allows some // calls to be processed during the upgrade, but not the list blocks. // Logins are also blocked during space discovery. // It is also possible that the underlying database causes connections // to be dropped as it is initialising, or reconfiguring. These can // lead to EOF or "connection is shut down" error messages. We skip // these too, hoping that things come back up before the end of the // retry poll count. errorMessage := errors.Cause(err).Error() switch { case errors.Cause(err) == io.EOF, strings.HasSuffix(errorMessage, "connection is shut down"), strings.HasSuffix(errorMessage, "no api connection available"), strings.Contains(errorMessage, "spaces are still being discovered"): ctx.Verbosef("Still waiting for API to become available") continue case params.ErrCode(err) == params.CodeUpgradeInProgress: ctx.Verbosef("Still waiting for API to become available: %v", err) continue } break } return errors.Annotatef(err, "unable to contact api server after %d attempts", apiAttempts) }
func (s *pingerSuite) TestAgentConnectionDelaysShutdownWithPing(c *gc.C) { // To negate the effects of an underpowered or heavily loaded // machine running this test, tune the shortTimeout based on the // maximum duration it takes to open an API connection. shortTimeout := s.calculatePingTimeout(c) attemptDelay := shortTimeout / 4 s.PatchValue(apiserver.MaxClientPingInterval, time.Duration(shortTimeout)) st, _ := s.OpenAPIAsNewMachine(c) err := st.Ping() c.Assert(err, jc.ErrorIsNil) defer st.Close() // As long as we don't wait too long, the connection stays open attempt := utils.AttemptStrategy{ Min: 10, Delay: attemptDelay, } testStart := time.Now() c.Logf( "pinging %d times with %v delay, ping timeout %v, starting at %v", attempt.Min, attempt.Delay, shortTimeout, testStart, ) var lastLoop time.Time for a := attempt.Start(); a.Next(); { testNow := time.Now() loopDelta := testNow.Sub(lastLoop) if lastLoop.IsZero() { loopDelta = 0 } c.Logf("duration since last ping: %v", loopDelta) err = st.Ping() if !c.Check( err, jc.ErrorIsNil, gc.Commentf( "ping timeout exceeded at %v (%v since the test start)", testNow, testNow.Sub(testStart), ), ) { c.Check(err, gc.ErrorMatches, "connection is shut down") return } lastLoop = time.Now() } // However, once we stop pinging for too long, the connection dies time.Sleep(2 * shortTimeout) // Exceed the timeout. err = st.Ping() c.Assert(err, gc.ErrorMatches, "connection is shut down") }
func checkConnectionDies(c *gc.C, conn api.Connection) { attempt := utils.AttemptStrategy{ Total: coretesting.LongWait, Delay: coretesting.ShortWait, } for a := attempt.Start(); a.Next(); { err := pingConn(conn) if err != nil { c.Assert(err, gc.ErrorMatches, "connection is shut down") return } } c.Fatal("connection didn't get shut down") }
func attemptLoop(c *gc.C, strategy utils.AttemptStrategy, desc string, f func() error) { var err error start := time.Now() attemptCount := 0 for attempt := strategy.Start(); attempt.Next(); { attemptCount += 1 if err = f(); err == nil || !attempt.HasNext() { break } c.Logf("%s failed: %v", desc, err) } c.Logf("%s: %d attempts in %s", desc, attemptCount, time.Since(start)) c.Assert(err, gc.IsNil) }
// networkOperationWithRetries calls the supplied function and if it returns a // network error which is temporary, will retry a number of times before giving up. func networkOperationWithRetries(strategy utils.AttemptStrategy, networkOp func() error, description string) func() error { return func() error { for a := strategy.Start(); ; { a.Next() err := networkOp() if !a.HasNext() || err == nil { return errors.Trace(err) } if networkErr, ok := errors.Cause(err).(net.Error); !ok || !networkErr.Temporary() { return errors.Trace(err) } logger.Debugf("%q error, will retry: %v", description, err) } } }
func (v *ebsVolumeSource) waitVolumeCreated(volumeId string) (*ec2.Volume, error) { var attempt = utils.AttemptStrategy{ Total: 5 * time.Second, Delay: 200 * time.Millisecond, } for a := attempt.Start(); a.Next(); { volume, err := v.describeVolume(volumeId) if err != nil { return nil, errors.Trace(err) } if volume.Status != volumeStatusCreating { return volume, nil } } return nil, errors.Errorf("timed out waiting for volume %v to become available", volumeId) }
func assertStateBecomesClosed(c *gc.C, st *state.State) { // This is gross but I can't see any other way to check for // closedness outside the state package. checkModel := func() { attempt := utils.AttemptStrategy{ Total: coretesting.LongWait, Delay: coretesting.ShortWait, } for a := attempt.Start(); a.Next(); { // This will panic once the state is closed. _, _ = st.Model() } // If we got here then st is still open. st.Close() } c.Assert(checkModel, gc.PanicMatches, "Session already closed") }
// It appears that sometimes the lock is not cleared when we expect it to be. // Capture and log any errors from the Unlock method and retry a few times. func unlockEnvironmentLock(lock *fslock.Lock) { attempts := utils.AttemptStrategy{ Delay: 50 * time.Millisecond, Min: 10, } var err error for a := attempts.Start(); a.Next(); { err = lock.Unlock() if err == nil { return } if a.HasNext() { logger.Debugf("failed to unlock configstore lock: %s, retrying", err) } } logger.Errorf("unable to unlock configstore lock: %s", err) }
func (s *mongoPingerSuite) TestAgentConnectionsShutDownWhenStateDies(c *gc.C) { st, _ := s.OpenAPIAsNewMachine(c) err := st.Ping() c.Assert(err, jc.ErrorIsNil) gitjujutesting.MgoServer.Destroy() attempt := utils.AttemptStrategy{ Total: coretesting.LongWait, Delay: coretesting.ShortWait, } for a := attempt.Start(); a.Next(); { if err := st.Ping(); err != nil { c.Assert(err, gc.ErrorMatches, "connection is shut down") return } } c.Fatalf("timed out waiting for API server to die") }
// GetInstall runs 'apt-get install packages' for the packages listed // here. apt-get install calls are retried for 30 times with a 10 // second sleep between attempts. func GetInstall(packages ...string) error { cmdArgs := append([]string(nil), getCommand...) cmdArgs = append(cmdArgs, "install") cmdArgs = append(cmdArgs, packages...) logger.Infof("Running: %s", cmdArgs) cmd := exec.Command(cmdArgs[0], cmdArgs[1:]...) cmd.Env = append(os.Environ(), getEnvOptions...) var err error var out []byte // Retry APT operations for 30 times, sleeping 10 seconds // between attempts. This avoids failure in the case of // something else having the dpkg lock (e.g. a charm on the // machine we're deploying containers to). attempt := utils.AttemptStrategy{Delay: 10 * time.Second, Min: 30} for a := attempt.Start(); a.Next(); { out, err = CommandOutput(cmd) if err == nil { return nil } exitError, ok := err.(*exec.ExitError) if !ok { err = fmt.Errorf("unexpected error type %T", err) break } waitStatus, ok := exitError.ProcessState.Sys().(syscall.WaitStatus) if !ok { err = fmt.Errorf("unexpected process state type %T", exitError.ProcessState.Sys()) break } // From apt-get(8) "apt-get returns zero on normal // operation, decimal 100 on error." if waitStatus.ExitStatus() != 100 { break } } if err != nil { logger.Errorf("apt-get command failed: %v\nargs: %#v\n%s", err, cmdArgs, string(out)) return fmt.Errorf("apt-get failed: %v", err) } return nil }
func (v *ebsVolumeSource) waitVolume( volumeId string, attempt utils.AttemptStrategy, pred func(v *ec2.Volume) (bool, error), ) (*ec2.Volume, error) { for a := attempt.Start(); a.Next(); { volume, err := v.describeVolume(volumeId) if err != nil { return nil, errors.Trace(err) } ok, err := pred(volume) if err != nil { return nil, errors.Trace(err) } if ok { return volume, nil } } return nil, errWaitVolumeTimeout }
// waitForAgentInitialisation polls the bootstrapped controller with a read-only // command which will fail until the controller is fully initialised. // TODO(wallyworld) - add a bespoke command to maybe the admin facade for this purpose. func (c *bootstrapCommand) waitForAgentInitialisation(ctx *cmd.Context) (err error) { attempts := utils.AttemptStrategy{ Min: bootstrapReadyPollCount, Delay: bootstrapReadyPollDelay, } var client block.BlockListAPI for attempt := attempts.Start(); attempt.Next(); { client, err = blockAPI(&c.ModelCommandBase) if err != nil { // Logins are prevented whilst space discovery is ongoing. errorMessage := err.Error() if strings.Contains(errorMessage, "space discovery still in progress") { continue } return err } _, err = client.List() client.Close() if err == nil { ctx.Infof("Bootstrap complete") return nil } // As the API server is coming up, it goes through a number of steps. // Initially the upgrade steps run, but the api server allows some // calls to be processed during the upgrade, but not the list blocks. // It is also possible that the underlying database causes connections // to be dropped as it is initialising, or reconfiguring. These can // lead to EOF or "connection is shut down" error messages. We skip // these too, hoping that things come back up before the end of the // retry poll count. errorMessage := err.Error() if strings.Contains(errorMessage, apiserver.UpgradeInProgressError.Error()) || strings.HasSuffix(errorMessage, "EOF") || strings.HasSuffix(errorMessage, "connection is shut down") { ctx.Infof("Waiting for API to become available") continue } return err } return err }
// waitForAgentInitialisation polls the bootstrapped controller with a read-only // command which will fail until the controller is fully initialised. // TODO(wallyworld) - add a bespoke command to maybe the admin facade for this purpose. func (c *bootstrapCommand) waitForAgentInitialisation(ctx *cmd.Context) error { attempts := utils.AttemptStrategy{ Min: bootstrapReadyPollCount, Delay: bootstrapReadyPollDelay, } var ( apiAttempts int err error ) apiAttempts = 1 for attempt := attempts.Start(); attempt.Next(); apiAttempts++ { err = c.tryAPI() if err == nil { ctx.Infof("Bootstrap complete, %s now available.", c.controllerName) break } // As the API server is coming up, it goes through a number of steps. // Initially the upgrade steps run, but the api server allows some // calls to be processed during the upgrade, but not the list blocks. // Logins are also blocked during space discovery. // It is also possible that the underlying database causes connections // to be dropped as it is initialising, or reconfiguring. These can // lead to EOF or "connection is shut down" error messages. We skip // these too, hoping that things come back up before the end of the // retry poll count. errorMessage := errors.Cause(err).Error() switch { case errors.Cause(err) == io.EOF, strings.HasSuffix(errorMessage, "connection is shut down"), strings.Contains(errorMessage, "spaces are still being discovered"): ctx.Infof("Waiting for API to become available") continue case params.ErrCode(err) == params.CodeUpgradeInProgress: ctx.Infof("Waiting for API to become available: %v", err) continue } break } return errors.Annotatef(err, "unable to contact api server after %d attempts", apiAttempts) }
func (*utilsSuite) TestAttemptTiming(c *gc.C) { testAttempt := utils.AttemptStrategy{ Total: 0.25e9, Delay: 0.1e9, } want := []time.Duration{0, 0.1e9, 0.2e9, 0.2e9} got := make([]time.Duration, 0, len(want)) // avoid allocation when testing timing t0 := time.Now() for a := testAttempt.Start(); a.Next(); { got = append(got, time.Now().Sub(t0)) } got = append(got, time.Now().Sub(t0)) c.Assert(got, gc.HasLen, len(want)) const margin = 0.01e9 for i, got := range want { lo := want[i] - margin hi := want[i] + margin if got < lo || got > hi { c.Errorf("attempt %d want %g got %g", i, want[i].Seconds(), got.Seconds()) } } }
// newStateConnection tries to connect to the newly restored state server. func newStateConnection(environTag names.EnvironTag, info *mongo.MongoInfo) (*state.State, error) { // We need to retry here to allow mongo to come up on the restored state server. // The connection might succeed due to the mongo dial retries but there may still // be a problem issuing database commands. var ( st *state.State err error ) const ( newStateConnDelay = 15 * time.Second newStateConnMinAttempts = 8 ) attempt := utils.AttemptStrategy{Delay: newStateConnDelay, Min: newStateConnMinAttempts} for a := attempt.Start(); a.Next(); { st, err = state.Open(environTag, info, mongoDefaultDialOpts(), environsNewStatePolicy()) if err == nil { return st, nil } logger.Errorf("cannot open state, retrying: %v", err) } return st, errors.Annotate(err, "cannot open state") }
func ExampleAttempt_HasNext() { // This example shows how Attempt.HasNext can be used to help // structure an attempt loop. If the godoc example code allowed // us to make the example return an error, we would uncomment // the commented return statements. attempts := utils.AttemptStrategy{ Total: 1 * time.Second, Delay: 250 * time.Millisecond, } for attempt := attempts.Start(); attempt.Next(); { x, err := doSomething() if shouldRetry(err) && attempt.HasNext() { continue } if err != nil { // return err return } doSomethingWith(x) } // return ErrTimedOut return }
// newStateConnection tries to connect to the newly restored controller. func newStateConnection(controllerTag names.ControllerTag, modelTag names.ModelTag, info *mongo.MongoInfo) (*state.State, error) { // We need to retry here to allow mongo to come up on the restored controller. // The connection might succeed due to the mongo dial retries but there may still // be a problem issuing database commands. var ( st *state.State err error ) const ( newStateConnDelay = 15 * time.Second newStateConnMinAttempts = 8 ) // TODO(katco): 2016-08-09: lp:1611427 attempt := utils.AttemptStrategy{Delay: newStateConnDelay, Min: newStateConnMinAttempts} getEnviron := stateenvirons.GetNewEnvironFunc(environs.New) for a := attempt.Start(); a.Next(); { st, err = state.Open(modelTag, controllerTag, info, mongoDefaultDialOpts(), environsGetNewPolicyFunc(getEnviron)) if err == nil { return st, nil } logger.Errorf("cannot open state, retrying: %v", err) } return st, errors.Annotate(err, "cannot open state") }
func (s *MongoSuite) TestCurrentStatus(c *gc.C) { session := s.root.MustDial() defer session.Close() inst1 := newServer(c) defer inst1.Destroy() defer Remove(session, inst1.Addr()) inst2 := newServer(c) defer inst2.Destroy() defer Remove(session, inst2.Addr()) var err error strategy := utils.AttemptStrategy{Total: time.Minute * 2, Delay: time.Millisecond * 500} attempt := strategy.Start() for attempt.Next() { err = Add(session, Member{Address: inst1.Addr()}, Member{Address: inst2.Addr()}) if err == nil || !attempt.HasNext() { break } } c.Assert(err, gc.IsNil) expected := &Status{ Name: rsName, Members: []MemberStatus{{ Id: 1, Address: s.root.Addr(), Self: true, ErrMsg: "", Healthy: true, State: PrimaryState, }, { Id: 2, Address: inst1.Addr(), Self: false, ErrMsg: "", Healthy: true, State: SecondaryState, }, { Id: 3, Address: inst2.Addr(), Self: false, ErrMsg: "", Healthy: true, State: SecondaryState, }}, } strategy.Total = time.Second * 90 attempt = strategy.Start() var res *Status for attempt.Next() { var err error res, err = CurrentStatus(session) if err != nil { if !attempt.HasNext() { c.Errorf("Couldn't get status before timeout, got err: %v", err) return } else { // try again continue } } if res.Members[0].State == PrimaryState && res.Members[1].State == SecondaryState && res.Members[2].State == SecondaryState { break } if !attempt.HasNext() { c.Errorf("Servers did not get into final state before timeout. Status: %#v", res) return } } for x, _ := range res.Members { // non-empty uptime and ping c.Check(res.Members[x].Uptime, gc.Not(gc.Equals), 0) // ping is always going to be zero since we're on localhost // so we can't really test it right now // now overwrite Uptime so it won't throw off DeepEquals res.Members[x].Uptime = 0 } c.Check(res, jc.DeepEquals, expected) }
// It was aliased for testing purposes. var RunCommandWithRetry = func(cmd string, getFatalError func(string) error) (output string, code int, err error) { var out []byte // split the command for use with exec args := strings.Fields(cmd) if len(args) <= 1 { return "", 1, errors.New(fmt.Sprintf("too few arguments: expected at least 2, got %d", len(args))) } logger.Infof("Running: %s", cmd) // Retry operation 30 times, sleeping every 10 seconds between attempts. // This avoids failure in the case of something else having the dpkg lock // (e.g. a charm on the machine we're deploying containers to). for a := AttemptStrategy.Start(); a.Next(); { // Create the command for each attempt, because we need to // call cmd.CombinedOutput only once. See http://pad.lv/1394524. cmd := exec.Command(args[0], args[1:]...) out, err = CommandOutput(cmd) if err == nil { return string(out), 0, nil } exitError, ok := err.(*exec.ExitError) if !ok { err = errors.Annotatef(err, "unexpected error type %T", err) break }
func (c *restoreCommand) Run(ctx *cmd.Context) error { if c.showDescription { fmt.Fprintf(ctx.Stdout, "%s\n", c.Info().Purpose) return nil } if err := c.Log.Start(ctx); err != nil { return err } agentConf, err := extractConfig(c.backupFile) if err != nil { return errors.Annotate(err, "cannot extract configuration from backup file") } progress("extracted credentials from backup file") store, err := configstore.Default() if err != nil { return err } cfg, err := c.Config(store, nil) if err != nil { return err } env, err := rebootstrap(cfg, ctx, c.Constraints) if err != nil { return errors.Annotate(err, "cannot re-bootstrap environment") } progress("connecting to newly bootstrapped instance") var apiState api.Connection // The state server backend may not be ready to accept logins so we retry. // We'll do up to 8 retries over 2 minutes to give the server time to come up. // Typically we expect only 1 retry will be needed. attempt := utils.AttemptStrategy{Delay: 15 * time.Second, Min: 8} // While specifying the admin user will work for now, as soon as we allow // the users to have a different initial user name, or they have changed // the password for the admin user, this will fail. owner := names.NewUserTag("admin") for a := attempt.Start(); a.Next(); { apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) if err == nil || errors.Cause(err).Error() != "EOF" { break } progress("bootstrapped instance not ready - attempting to redial") } if err != nil { return errors.Annotate(err, "cannot connect to bootstrap instance") } progress("restoring bootstrap machine") machine0Addr, err := restoreBootstrapMachine(apiState, c.backupFile, agentConf) if err != nil { return errors.Annotate(err, "cannot restore bootstrap machine") } progress("restored bootstrap machine") apiState, err = juju.NewAPIState(owner, env, api.DefaultDialOpts()) progress("opening state") if err != nil { return errors.Annotate(err, "cannot connect to api server") } progress("updating all machines") results, err := updateAllMachines(apiState, machine0Addr) if err != nil { return errors.Annotate(err, "cannot update machines") } var message string for _, result := range results { if result.err != nil { message = fmt.Sprintf("Update of machine %q failed: %v", result.machineName, result.err) } else { message = fmt.Sprintf("Succesful update of machine %q", result.machineName) } progress(message) } return nil }
func (c *restoreCommand) Run(ctx *cmd.Context) error { if c.showDescription { fmt.Fprintf(ctx.Stdout, "%s\n", c.Info().Purpose) return nil } if err := c.Log.Start(ctx); err != nil { return err } agentConf, err := extractConfig(c.backupFile) if err != nil { return fmt.Errorf("cannot extract configuration from backup file: %v", err) } progress("extracted credentials from backup file") store, err := configstore.Default() if err != nil { return err } cfg, _, err := environs.ConfigForName(c.EnvName, store) if err != nil { return err } env, err := rebootstrap(cfg, ctx, c.Constraints) if err != nil { return fmt.Errorf("cannot re-bootstrap environment: %v", err) } progress("connecting to newly bootstrapped instance") var conn *juju.APIConn // The state server backend may not be ready to accept logins so we retry. // We'll do up to 8 retries over 2 minutes to give the server time to come up. // Typically we expect only 1 retry will be needed. attempt := utils.AttemptStrategy{Delay: 15 * time.Second, Min: 8} for a := attempt.Start(); a.Next(); { conn, err = juju.NewAPIConn(env, api.DefaultDialOpts()) if err == nil || errors.Cause(err).Error() != "EOF" { break } progress("bootstrapped instance not ready - attempting to redial") } if err != nil { return fmt.Errorf("cannot connect to bootstrap instance: %v", err) } progress("restoring bootstrap machine") newInstId, machine0Addr, err := restoreBootstrapMachine(conn, c.backupFile, agentConf) if err != nil { return fmt.Errorf("cannot restore bootstrap machine: %v", err) } progress("restored bootstrap machine") // Update the environ state to point to the new instance. if err := bootstrap.SaveState(env.Storage(), &bootstrap.BootstrapState{ StateInstances: []instance.Id{newInstId}, }); err != nil { return fmt.Errorf("cannot update environ bootstrap state storage: %v", err) } // Construct our own state info rather than using juju.NewConn so // that we can avoid storage eventual-consistency issues // (and it's faster too). caCert, ok := cfg.CACert() if !ok { return fmt.Errorf("configuration has no CA certificate") } progress("opening state") // We need to retry here to allow mongo to come up on the restored state server. // The connection might succeed due to the mongo dial retries but there may still // be a problem issuing database commands. var st *state.State for a := attempt.Start(); a.Next(); { st, err = state.Open(&state.Info{ Info: mongo.Info{ Addrs: []string{fmt.Sprintf("%s:%d", machine0Addr, cfg.StatePort())}, CACert: caCert, }, Tag: agentConf.Credentials.Tag, Password: agentConf.Credentials.Password, }, mongo.DefaultDialOpts(), environs.NewStatePolicy()) if err == nil { break } progress("state server not ready - attempting to re-connect") } if err != nil { return fmt.Errorf("cannot open state: %v", err) } progress("updating all machines") if err := updateAllMachines(st, machine0Addr); err != nil { return fmt.Errorf("cannot update machines: %v", err) } return nil }