// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics by
// the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsError := startTelemetrySession(params, statsEngine)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
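// The utils.NewSimpleBackoff call above takes (min, max, jitter, multiplier).
// Below is a minimal, self-contained sketch of such an exponential backoff to
// illustrate the technique; it is not the agent's actual utils implementation,
// and the jitter and capping details are assumptions.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

type simpleBackoff struct {
	current    time.Duration
	min        time.Duration
	max        time.Duration
	jitter     float64
	multiplier float64
}

func newSimpleBackoff(min, max time.Duration, jitter, multiplier float64) *simpleBackoff {
	return &simpleBackoff{current: min, min: min, max: max, jitter: jitter, multiplier: multiplier}
}

// Duration returns the next wait time (jittered by +/- jitter percent) and
// grows the internal delay toward max by the multiplier.
func (b *simpleBackoff) Duration() time.Duration {
	d := time.Duration(float64(b.current) * (1 - b.jitter + 2*b.jitter*rand.Float64()))
	next := time.Duration(float64(b.current) * b.multiplier)
	if next > b.max {
		next = b.max
	}
	b.current = next
	return d
}

// Reset returns the backoff to its minimum value, as done after a clean
// session or io.EOF above.
func (b *simpleBackoff) Reset() { b.current = b.min }

func main() {
	b := newSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for i := 0; i < 5; i++ {
		fmt.Println(b.Duration())
	}
}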
// RetryWithBackoff takes a Backoff and a function to call that returns an error.
// If the returned error is nil, the function is not called again.
// If the error implements Retriable, its Retry() value determines whether the
// call is retried.
func RetryWithBackoff(backoff Backoff, fn func() error) error {
	var err error
	for err = fn(); true; err = fn() {
		retriable, isRetriable := err.(Retriable)
		if err == nil || isRetriable && !retriable.Retry() {
			return err
		}
		ttime.Sleep(backoff.Duration())
	}
	return err
}
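// A minimal usage sketch of the RetryWithBackoff contract above: a plain error
// keeps the loop retrying, while an error implementing Retriable with
// Retry() == false stops it immediately. The Backoff, Retriable, constantBackoff,
// and nonRetriableErr types below are local stand-ins so the example is
// self-contained and runnable; RetryWithBackoff is restated from above.
package main

import (
	"errors"
	"fmt"
	"time"
)

// Backoff matches the interface RetryWithBackoff consumes.
type Backoff interface {
	Duration() time.Duration
	Reset()
}

// Retriable matches the interface RetryWithBackoff type-asserts against.
type Retriable interface {
	Retry() bool
}

// nonRetriableErr is an error that tells the retry loop to give up.
type nonRetriableErr struct{ msg string }

func (e nonRetriableErr) Error() string { return e.msg }
func (e nonRetriableErr) Retry() bool   { return false }

// constantBackoff always waits the same amount of time.
type constantBackoff time.Duration

func (c constantBackoff) Duration() time.Duration { return time.Duration(c) }
func (c constantBackoff) Reset()                  {}

func RetryWithBackoff(backoff Backoff, fn func() error) error {
	var err error
	for err = fn(); true; err = fn() {
		retriable, isRetriable := err.(Retriable)
		if err == nil || isRetriable && !retriable.Retry() {
			return err
		}
		time.Sleep(backoff.Duration())
	}
	return err
}

func main() {
	attempts := 0
	err := RetryWithBackoff(constantBackoff(10*time.Millisecond), func() error {
		attempts++
		if attempts < 3 {
			return errors.New("transient failure") // retried
		}
		return nonRetriableErr{msg: "permanent failure"} // stops the loop
	})
	fmt.Println(attempts, err) // 3 permanent failure
}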
// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics by
// the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsEndpoint, err := params.EcsClient.DiscoverTelemetryEndpoint(params.ContainerInstanceArn)
		if err != nil {
			log.Error("Unable to discover poll endpoint", "err", err)
			return err
		}
		log.Debug("Connecting to TCS endpoint " + tcsEndpoint)
		url := formatURL(tcsEndpoint, params.Cfg.Cluster, params.ContainerInstanceArn)
		tcsError := startSession(url, params.Cfg.AWSRegion, params.CredentialProvider,
			params.AcceptInvalidCert, statsEngine, defaultPublishMetricsInterval)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
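// formatURL above turns the discovered TCS endpoint into a websocket URL that
// carries the cluster and container instance identity. The standalone sketch
// below shows one way such a helper could be written; the "/ws" path and the
// query parameter names are assumptions for illustration, not the agent's
// confirmed wire format.
package main

import (
	"fmt"
	"net/url"
	"strings"
)

// formatTelemetryURL appends the cluster and container instance ARN as query
// parameters to the discovered endpoint.
func formatTelemetryURL(endpoint, cluster, containerInstanceArn string) string {
	base := endpoint
	if !strings.HasSuffix(base, "/") {
		base += "/"
	}
	query := url.Values{}
	query.Set("cluster", cluster)
	query.Set("containerInstance", containerInstanceArn)
	return base + "ws?" + query.Encode()
}

func main() {
	fmt.Println(formatTelemetryURL(
		"https://ecs-t-1.us-west-2.amazonaws.com",
		"default",
		"arn:aws:ecs:us-west-2:123456789012:container-instance/example",
	))
}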
// ECSRetryHandler defines how to retry ECS service calls. It behaves like the
// default retry handler, except for the SubmitStateChange operations where it
// has a massive upper limit on retry counts.
func ECSRetryHandler(r *aws.Request) {
	if r.Operation == nil || (r.Operation.Name != opSubmitContainerStateChange && r.Operation.Name != opSubmitTaskStateChange) {
		aws.AfterRetryHandler(r)
		return
	}
	// else this is a Submit*StateChange operation
	// For these operations, fake the retry count for the sake of the WillRetry check.
	// Do this by temporarily setting it to 0 before calling that check.
	// We still keep the value around for sleep calculations.
	// See https://github.com/aws/aws-sdk-go/blob/b2d953f489cf94029392157225e893d7b69cd447/aws/handler_functions.go#L107
	// for this code's inspiration.
	realRetryCount := r.RetryCount
	if r.RetryCount < maxSubmitRetryCount {
		r.RetryCount = 0
	}

	r.Retryable.Set(r.Service.ShouldRetry(r))
	if r.WillRetry() {
		r.RetryCount = realRetryCount

		if r.RetryCount > 20 {
			// Hardcoded max for calling RetryRules here because it *will* overflow
			// if you let it and result in sleeping negative time.
			r.RetryDelay = maxSubmitRetryDelay
		} else {
			r.RetryDelay = durationMin(maxSubmitRetryDelay, r.Service.RetryRules(r))
		}

		// AddJitter is purely additive, so subtracting half the amount of jitter
		// makes it average out to RetryDelay.
		ttime.Sleep(utils.AddJitter(r.RetryDelay-submitRetryDelayJitter/2, submitRetryDelayJitter))

		if r.Error != nil {
			if err, ok := r.Error.(awserr.Error); ok {
				if isCodeExpiredCreds(err.Code()) {
					r.Config.Credentials.Expire()
				}
			}
		}

		r.RetryCount++
		r.Error = nil
	}
}
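// Two helpers the handler above leans on are durationMin and utils.AddJitter.
// The sketch below shows their intended semantics as assumptions inferred from
// the comment "AddJitter is purely additive": AddJitter(d, j) returns a value
// in [d, d+j), so sleeping on AddJitter(delay - jitter/2, jitter) averages out
// to delay. The constant values in main are assumptions for illustration only.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// durationMin returns the smaller of two durations, used above to cap the
// SDK's computed retry delay at maxSubmitRetryDelay.
func durationMin(a, b time.Duration) time.Duration {
	if a < b {
		return a
	}
	return b
}

// addJitter returns duration plus a uniformly random extra in [0, jitter).
func addJitter(duration, jitter time.Duration) time.Duration {
	return duration + time.Duration(rand.Int63n(int64(jitter)))
}

func main() {
	const (
		maxSubmitRetryDelay    = 30 * time.Second // assumed value for illustration
		submitRetryDelayJitter = 5 * time.Second  // assumed value for illustration
	)
	delay := durationMin(maxSubmitRetryDelay, 45*time.Second)
	fmt.Println(addJitter(delay-submitRetryDelayJitter/2, submitRetryDelayJitter))
}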
// StartSession creates a session with ACS and handles requests using the passed
// in arguments.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	ecsclient := args.ECSClient
	cfg := args.Config
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)

	payloadBuffer := make(chan *ecsacs.PayloadMessage, payloadMessageBufferSize)
	ackBuffer := make(chan string, payloadMessageBufferSize)

	go func() {
		// Handle any payloads async. For correctness, they must be handled in
		// order, hence the buffered channel which is added to synchronously.
		for {
			select {
			case payload := <-payloadBuffer:
				handlePayloadMessage(ackBuffer, cfg.Cluster, args.ContainerInstanceArn, payload, args.TaskEngine, ecsclient, args.StateManager)
			case <-ctx.Done():
				return
			}
		}
	}()

	for {
		acsError := func() error {
			acsEndpoint, err := ecsclient.DiscoverPollEndpoint(args.ContainerInstanceArn)
			if err != nil {
				log.Error("Unable to discover poll endpoint", "err", err)
				return err
			}
			log.Debug("Connecting to ACS endpoint " + acsEndpoint)

			url := AcsWsUrl(acsEndpoint, cfg.Cluster, args.ContainerInstanceArn, args.TaskEngine)

			clearStrChannel(ackBuffer)
			client := acsclient.New(url, cfg.AWSRegion, args.CredentialProvider, args.AcceptInvalidCert)
			defer client.Close()
			// Clear the ackBuffer whenever we get a new client because acks of
			// message ids don't have any value across sessions
			defer clearStrChannel(ackBuffer)

			timer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				log.Warn("ACS Connection hasn't had any activity for too long; closing connection")
				closeErr := client.Close()
				if closeErr != nil {
					log.Warn("Error disconnecting: " + closeErr.Error())
				}
			})
			defer timer.Stop()
			// Any message from the server resets the disconnect timeout
			client.SetAnyRequestHandler(anyMessageHandler(timer))
			client.AddRequestHandler(payloadMessageHandler(payloadBuffer))
			// Ignore heartbeat messages; anyMessageHandler gets 'em
			client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

			updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

			err = client.Connect()
			if err != nil {
				log.Error("Error connecting to ACS: " + err.Error())
				return err
			}
			ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				// If we do not have an error connecting and remain connected for at
				// least 5 or so minutes, reset the backoff. This prevents disconnect
				// errors that only happen infrequently from damaging reconnect
				// behavior too significantly.
				backoff.Reset()
			})

			serveErr := make(chan error, 1)
			go func() {
				serveErr <- client.Serve()
			}()

			for {
				select {
				case mid := <-ackBuffer:
					ackMessageId(client, cfg.Cluster, args.ContainerInstanceArn, mid)
				case <-ctx.Done():
					return ctx.Err()
				case err := <-serveErr:
					return err
				}
			}
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if acsError == nil || acsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from acs; backing off", "err", acsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
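// clearStrChannel above drains any buffered message ids so that acks never
// carry over from one websocket session to the next. A minimal sketch of such
// a drain, assuming it only needs to empty the buffer without blocking, is
// shown below with a hypothetical name.
package main

import "fmt"

// drainStringChannel removes everything currently buffered in the channel and
// returns as soon as a receive would block.
func drainStringChannel(c chan string) {
	for {
		select {
		case <-c:
			// discard the buffered message id
		default:
			return
		}
	}
}

func main() {
	ackBuffer := make(chan string, 3)
	ackBuffer <- "message-id-1"
	ackBuffer <- "message-id-2"
	drainStringChannel(ackBuffer)
	fmt.Println(len(ackBuffer)) // 0
}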