func startSession(url string, region string, credentialProvider *credentials.Credentials, acceptInvalidCert bool, statsEngine stats.Engine, publishMetricsInterval time.Duration, deregisterInstanceEventStream *eventstream.EventStream) error {
	client := tcsclient.New(url, region, credentialProvider, acceptInvalidCert, statsEngine, publishMetricsInterval)
	defer client.Close()

	err := deregisterInstanceEventStream.Subscribe(deregisterContainerInstanceHandler, client.Disconnect)
	if err != nil {
		return err
	}
	defer deregisterInstanceEventStream.Unsubscribe(deregisterContainerInstanceHandler)

	// Start a timer that listens for TCS heartbeats/acks. The timer is reset when
	// we receive a heartbeat from the server or when a publish metrics message
	// is acked.
	timer := time.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
		// Close the connection if there haven't been any messages received from
		// the backend for a long time.
		log.Debug("TCS Connection hasn't had a heartbeat or an ack message in too long of a timeout; disconnecting")
		client.Close()
	})
	defer timer.Stop()

	client.AddRequestHandler(heartbeatHandler(timer))
	client.AddRequestHandler(ackPublishMetricHandler(timer))

	err = client.Connect()
	if err != nil {
		log.Error("Error connecting to TCS: " + err.Error())
		return err
	}
	return client.Serve()
}
// newDisconnectionTimer creates a new timer with a callback to
// disconnect from ACS on inactivity.
func newDisconnectionTimer(client wsclient.ClientServer, _time ttime.Time, timeout time.Duration, jitter time.Duration) ttime.Timer {
	timer := _time.AfterFunc(utils.AddJitter(timeout, jitter), func() {
		seelog.Warn("ACS Connection hasn't had any activity for too long; closing connection")
		closeErr := client.Close()
		if closeErr != nil {
			seelog.Warnf("Error disconnecting: %v", closeErr)
		}
	})
	return timer
}
// ECSRetryHandler defines how to retry ECS service calls. It behaves like the
// default retry handler, except for the SubmitStateChange operations, where it
// has a massive upper limit on retry counts.
func ECSRetryHandler(r *aws.Request) {
	if r.Operation == nil || (r.Operation.Name != opSubmitContainerStateChange && r.Operation.Name != opSubmitTaskStateChange) {
		aws.AfterRetryHandler(r)
		return
	}
	// else this is a Submit*StateChange operation.
	// For these operations, fake the retry count for the sake of the WillRetry check.
	// Do this by temporarily setting it to 0 before calling that check.
	// We still keep the value around for sleep calculations.
	// See https://github.com/aws/aws-sdk-go/blob/b2d953f489cf94029392157225e893d7b69cd447/aws/handler_functions.go#L107
	// for this code's inspiration.
	realRetryCount := r.RetryCount
	if r.RetryCount < maxSubmitRetryCount {
		r.RetryCount = 0
	}
	r.Retryable.Set(r.Service.ShouldRetry(r))
	if r.WillRetry() {
		r.RetryCount = realRetryCount
		if r.RetryCount > 20 {
			// Hardcoded max for calling RetryRules here because it *will* overflow
			// if you let it, resulting in sleeping a negative duration.
			r.RetryDelay = maxSubmitRetryDelay
		} else {
			r.RetryDelay = durationMin(maxSubmitRetryDelay, r.Service.RetryRules(r))
		}
		// AddJitter is purely additive, so subtracting half the amount of jitter
		// makes it average out to RetryDelay.
		ttime.Sleep(utils.AddJitter(r.RetryDelay-submitRetryDelayJitter/2, submitRetryDelayJitter))
		if r.Error != nil {
			if err, ok := r.Error.(awserr.Error); ok {
				if isCodeExpiredCreds(err.Code()) {
					r.Config.Credentials.Expire()
				}
			}
		}
		r.RetryCount++
		r.Error = nil
	}
}
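// durationMin is called by ECSRetryHandler above but is not shown in this
// excerpt. A minimal sketch of what it presumably does (return the smaller of
// two durations); the body below is an assumption, not the actual implementation:
func durationMin(a, b time.Duration) time.Duration {
	if a < b {
		return a
	}
	return b
}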
// anyMessageHandler handles any server message. Any server message means the
// connection is active and thus the heartbeat disconnect should not occur.
func anyMessageHandler(timer ttime.Timer) func(interface{}) {
	return func(interface{}) {
		seelog.Debug("ACS activity occurred")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
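// A minimal sketch (not from this source) of how newDisconnectionTimer and
// anyMessageHandler are intended to be wired together, mirroring what
// startACSSession does below; client and _time stand for the same values
// passed to those functions:
//
//	timer := newDisconnectionTimer(client, _time, heartbeatTimeout, heartbeatJitter)
//	defer timer.Stop()
//	// Any inbound message resets the timer; prolonged silence closes the connection.
//	client.SetAnyRequestHandler(anyMessageHandler(timer))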
// startACSSession starts a session with ACS. It adds request handlers for various
// kinds of messages expected from ACS. It returns on server disconnection or when
// the context is cancelled.
func startACSSession(ctx context.Context, client wsclient.ClientServer, timer ttime.Timer, args StartSessionArguments, backoff *utils.SimpleBackoff, acsSessionState sessionState) error {
	// Any message from the server resets the disconnect timeout.
	client.SetAnyRequestHandler(anyMessageHandler(timer))
	cfg := args.Config

	refreshCredsHandler := newRefreshCredentialsHandler(ctx, cfg.Cluster, args.ContainerInstanceArn, client, args.CredentialsManager, args.TaskEngine)
	defer refreshCredsHandler.clearAcks()
	refreshCredsHandler.start()
	client.AddRequestHandler(refreshCredsHandler.handlerFunc())

	// Add a request handler for handling payload messages from ACS.
	payloadHandler := newPayloadRequestHandler(ctx, args.TaskEngine, args.ECSClient, cfg.Cluster, args.ContainerInstanceArn, client, args.StateManager, refreshCredsHandler, args.CredentialsManager)
	// Clear the acks channel on return because acks of message ids don't have any value across sessions.
	defer payloadHandler.clearAcks()
	payloadHandler.start()
	client.AddRequestHandler(payloadHandler.handlerFunc())

	// Ignore heartbeat messages; anyMessageHandler gets 'em.
	client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

	updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

	err := client.Connect()
	if err != nil {
		seelog.Errorf("Error connecting to ACS: %v", err)
		return err
	}
	acsSessionState.connectedToACS()

	backoffResetTimer := args.time().AfterFunc(utils.AddJitter(args.heartbeatTimeout(), args.heartbeatJitter()), func() {
		// If we do not have an error connecting and remain connected for at
		// least 5 or so minutes, reset the backoff. This prevents disconnect
		// errors that happen only infrequently from significantly hurting
		// reconnectability.
		backoff.Reset()
	})
	defer backoffResetTimer.Stop()

	serveErr := make(chan error, 1)
	go func() {
		serveErr <- client.Serve()
	}()

	for {
		select {
		case <-ctx.Done():
			// Stop receiving and sending messages from and to ACS when
			// the context received from the main function is cancelled.
			payloadHandler.stop()
			refreshCredsHandler.stop()
			return ctx.Err()
		case err := <-serveErr:
			// Stop receiving and sending messages from and to ACS when
			// client.Serve returns an error. This can happen when the
			// connection is closed by ACS or by the agent.
			payloadHandler.stop()
			refreshCredsHandler.stop()
			return err
		}
	}
}
// ackPublishMetricHandler consumes the ack message from the backend. The backend sends
// an ack each time it processes a metric message.
func ackPublishMetricHandler(timer *time.Timer) func(*ecstcs.AckPublishMetric) {
	return func(*ecstcs.AckPublishMetric) {
		log.Debug("Received AckPublishMetric from tcs")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
// heartbeatHandler resets the heartbeat timer when a HeartbeatMessage is received from TCS.
func heartbeatHandler(timer *time.Timer) func(*ecstcs.HeartbeatMessage) {
	return func(*ecstcs.HeartbeatMessage) {
		log.Debug("Received HeartbeatMessage from tcs")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
func (client *ecrClient) expirationJitter() time.Duration {
	return utils.AddJitter(MinimumJitterDuration, MaximumJitterDuration)
}
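// utils.AddJitter is used throughout this file but not defined in this excerpt.
// The comment in ECSRetryHandler ("AddJitter is purely additive") and the
// min/max pairing in expirationJitter suggest semantics like the sketch below:
// return the base duration plus a random amount in [0, jitter). The name
// addJitterSketch and this body are assumptions for illustration, not the
// actual implementation. Requires the math/rand and time packages.
func addJitterSketch(duration time.Duration, jitter time.Duration) time.Duration {
	if jitter <= 0 {
		return duration
	}
	return duration + time.Duration(rand.Int63n(int64(jitter)))
}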
// StartSession creates a session with ACS and handles requests using the passed
// in arguments.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	ecsclient := args.ECSClient
	cfg := args.Config
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)

	payloadBuffer := make(chan *ecsacs.PayloadMessage, payloadMessageBufferSize)
	ackBuffer := make(chan string, payloadMessageBufferSize)

	go func() {
		// Handle payloads asynchronously. For correctness, they must be handled
		// in order, hence the buffered channel, which is written to synchronously.
		for {
			select {
			case payload := <-payloadBuffer:
				handlePayloadMessage(ackBuffer, cfg.Cluster, args.ContainerInstanceArn, payload, args.TaskEngine, ecsclient, args.StateManager)
			case <-ctx.Done():
				return
			}
		}
	}()

	for {
		acsError := func() error {
			acsEndpoint, err := ecsclient.DiscoverPollEndpoint(args.ContainerInstanceArn)
			if err != nil {
				log.Error("Unable to discover poll endpoint", "err", err)
				return err
			}
			log.Debug("Connecting to ACS endpoint " + acsEndpoint)

			url := AcsWsUrl(acsEndpoint, cfg.Cluster, args.ContainerInstanceArn, args.TaskEngine)

			clearStrChannel(ackBuffer)
			client := acsclient.New(url, cfg.AWSRegion, args.CredentialProvider, args.AcceptInvalidCert)
			defer client.Close()
			// Clear the ack buffer whenever we get a new client because acks of
			// message ids don't have any value across sessions.
			defer clearStrChannel(ackBuffer)

			timer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				log.Warn("ACS Connection hasn't had any activity for too long; closing connection")
				closeErr := client.Close()
				if closeErr != nil {
					log.Warn("Error disconnecting: " + closeErr.Error())
				}
			})
			defer timer.Stop()

			// Any message from the server resets the disconnect timeout.
			client.SetAnyRequestHandler(anyMessageHandler(timer))
			client.AddRequestHandler(payloadMessageHandler(payloadBuffer))
			// Ignore heartbeat messages; anyMessageHandler gets 'em.
			client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})
			updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

			err = client.Connect()
			if err != nil {
				log.Error("Error connecting to ACS: " + err.Error())
				return err
			}

			ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				// If we do not have an error connecting and remain connected for at
				// least 5 or so minutes, reset the backoff. This prevents disconnect
				// errors that happen only infrequently from significantly hurting
				// reconnectability.
				backoff.Reset()
			})

			serveErr := make(chan error, 1)
			go func() {
				serveErr <- client.Serve()
			}()

			for {
				select {
				case mid := <-ackBuffer:
					ackMessageId(client, cfg.Cluster, args.ContainerInstanceArn, mid)
				case <-ctx.Done():
					return ctx.Err()
				case err := <-serveErr:
					return err
				}
			}
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if acsError == nil || acsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from acs; backing off", "err", acsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
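// clearStrChannel is called in StartSession above but not defined in this
// excerpt. A minimal sketch of the behavior it presumably has (drain any
// buffered message ids without blocking); the body is an assumption, not the
// actual implementation:
func clearStrChannel(c chan string) {
	for {
		select {
		case <-c:
			// Discard a stale message id from the previous session.
		default:
			return
		}
	}
}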