Example #1
func startSession(url string, region string, credentialProvider *credentials.Credentials, acceptInvalidCert bool, statsEngine stats.Engine, publishMetricsInterval time.Duration, deregisterInstanceEventStream *eventstream.EventStream) error {
	client := tcsclient.New(url, region, credentialProvider, acceptInvalidCert, statsEngine, publishMetricsInterval)
	defer client.Close()

	err := deregisterInstanceEventStream.Subscribe(deregisterContainerInstanceHandler, client.Disconnect)
	if err != nil {
		return err
	}
	defer deregisterInstanceEventStream.Unsubscribe(deregisterContainerInstanceHandler)

	// Start a timer that listens for TCS heartbeats/acks. The timer is reset
	// when we receive a heartbeat from the server or when a publish metrics
	// message is acked.
	timer := time.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
		// Close the connection if no messages have been received from the
		// backend for too long.
		log.Debug("TCS connection hasn't had a heartbeat or ack message for too long; disconnecting")
		client.Close()
	})
	defer timer.Stop()
	client.AddRequestHandler(heartbeatHandler(timer))
	client.AddRequestHandler(ackPublishMetricHandler(timer))
	err = client.Connect()
	if err != nil {
		log.Error("Error connecting to TCS: " + err.Error())
		return err
	}
	return client.Serve()
}
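Every example on this page routes its timeout through utils.AddJitter. The helper itself is not listed here; the following is a minimal sketch of what it plausibly does, assuming it returns the base duration plus a uniformly random extra of at most the jitter amount (the lowercase name addJitter and this exact behavior are assumptions, not verified agent source):

import (
	"math/rand"
	"time"
)

// addJitter is a sketch of the utils.AddJitter helper used throughout these
// examples: it returns duration plus a uniformly random extra in [0, jitter).
// Randomizing timeouts this way keeps many agents from reconnecting or
// refreshing in lockstep.
func addJitter(duration time.Duration, jitter time.Duration) time.Duration {
	if jitter <= 0 {
		return duration
	}
	return duration + time.Duration(rand.Int63n(int64(jitter)))
}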
Example #2
// newDisconnectionTimer creates a new timer with a callback that disconnects
// from ACS on inactivity.
func newDisconnectionTimer(client wsclient.ClientServer, _time ttime.Time, timeout time.Duration, jitter time.Duration) ttime.Timer {
	timer := _time.AfterFunc(utils.AddJitter(timeout, jitter), func() {
		seelog.Warn("ACS Connection hasn't had any activity for too long; closing connection")
		closeErr := client.Close()
		if closeErr != nil {
			seelog.Warnf("Error disconnecting: %v", closeErr)
		}
	})

	return timer
}
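A hedged usage sketch for the function above: a caller would typically arm the timer when the session starts and stop it when the session ends. The identifiers client, clock, acsTimeout, and acsJitter are assumed to exist in the caller's scope:

// Hypothetical caller: arm the inactivity timer for the lifetime of the
// session and make sure it cannot fire after the session returns.
timer := newDisconnectionTimer(client, clock, acsTimeout, acsJitter)
defer timer.Stop()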
Example #3
// ECSRetryHandler defines how to retry ECS service calls. It behaves like the
// default retry handler, except for the SubmitStateChange operations, where it
// allows a very large upper limit on the retry count.
func ECSRetryHandler(r *aws.Request) {
	if r.Operation == nil || (r.Operation.Name != opSubmitContainerStateChange && r.Operation.Name != opSubmitTaskStateChange) {
		aws.AfterRetryHandler(r)
		return
	}
	// else this is a Submit*StateChange operation
	// For these operations, fake the retry count for the sake of the WillRetry check.
	// Do this by temporarily setting it to 0 before calling that check.
	// We still keep the value around for sleep calculations
	// See https://github.com/aws/aws-sdk-go/blob/b2d953f489cf94029392157225e893d7b69cd447/aws/handler_functions.go#L107
	// for this code's inspiration
	realRetryCount := r.RetryCount
	if r.RetryCount < maxSubmitRetryCount {
		r.RetryCount = 0
	}

	r.Retryable.Set(r.Service.ShouldRetry(r))
	if r.WillRetry() {
		r.RetryCount = realRetryCount
		if r.RetryCount > 20 {
			// Hardcoded max for calling RetryRules here because it *will*
			// overflow if allowed, resulting in a negative sleep time
			r.RetryDelay = maxSubmitRetryDelay
		} else {
			r.RetryDelay = durationMin(maxSubmitRetryDelay, r.Service.RetryRules(r))
		}
		// AddJitter is purely additive, so subtracting half the amount of jitter
		// makes it average out to RetryDelay
		ttime.Sleep(utils.AddJitter(r.RetryDelay-submitRetryDelayJitter/2, submitRetryDelayJitter))

		if r.Error != nil {
			if err, ok := r.Error.(awserr.Error); ok {
				if isCodeExpiredCreds(err.Code()) {
					r.Config.Credentials.Expire()
				}
			}
		}

		r.RetryCount++
		r.Error = nil
	}
}
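Example #3 calls a durationMin helper that is not listed on this page. A minimal sketch, assuming it simply returns the smaller of its two arguments:

import "time"

// durationMin returns the smaller of two durations (a sketch of the helper
// referenced above; the real implementation may differ).
func durationMin(a time.Duration, b time.Duration) time.Duration {
	if a < b {
		return a
	}
	return b
}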
Example #4
// anyMessageHandler handles any server message. Any server message means the
// connection is active, so the heartbeat disconnect should not occur.
func anyMessageHandler(timer ttime.Timer) func(interface{}) {
	return func(interface{}) {
		seelog.Debug("ACS activity occured")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
Example #5
// startACSSession starts a session with ACS. It adds request handlers for various
// kinds of messages expected from ACS. It returns on server disconnection or when
// the context is cancelled
func startACSSession(ctx context.Context, client wsclient.ClientServer, timer ttime.Timer, args StartSessionArguments, backoff *utils.SimpleBackoff, acsSessionState sessionState) error {
	// Any message from the server resets the disconnect timeout
	client.SetAnyRequestHandler(anyMessageHandler(timer))
	cfg := args.Config

	refreshCredsHandler := newRefreshCredentialsHandler(ctx, cfg.Cluster, args.ContainerInstanceArn, client, args.CredentialsManager, args.TaskEngine)
	defer refreshCredsHandler.clearAcks()
	refreshCredsHandler.start()
	client.AddRequestHandler(refreshCredsHandler.handlerFunc())

	// Add request handler for handling payload messages from ACS
	payloadHandler := newPayloadRequestHandler(ctx, args.TaskEngine, args.ECSClient, cfg.Cluster, args.ContainerInstanceArn, client, args.StateManager, refreshCredsHandler, args.CredentialsManager)
	// Clear the acks channel on return because acks of message IDs don't have
	// any value across sessions
	defer payloadHandler.clearAcks()
	payloadHandler.start()
	client.AddRequestHandler(payloadHandler.handlerFunc())

	// Ignore heartbeat messages; anyMessageHandler handles them
	client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

	updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

	err := client.Connect()
	if err != nil {
		seelog.Errorf("Error connecting to ACS: %v", err)
		return err
	}
	acsSessionState.connectedToACS()

	backoffResetTimer := args.time().AfterFunc(utils.AddJitter(args.heartbeatTimeout(), args.heartbeatJitter()), func() {
		// If we connect without an error and remain connected for at least
		// 5 or so minutes, reset the backoff. This prevents infrequent
		// disconnect errors from damaging reconnectability too severely.
		backoff.Reset()
	})
	defer backoffResetTimer.Stop()

	serveErr := make(chan error, 1)
	go func() {
		serveErr <- client.Serve()
	}()

	for {
		select {
		case <-ctx.Done():
			// Stop receiving and sending messages from and to ACS when
			// the context received from the main function is canceled
			payloadHandler.stop()
			refreshCredsHandler.stop()
			return ctx.Err()
		case err := <-serveErr:
			// Stop receiving and sending messages from and to ACS when
			// client.Serve returns an error. This can happen when the
			// connection is closed by ACS or by the agent
			payloadHandler.stop()
			refreshCredsHandler.stop()
			return err
		}
	}
}
Example #6
// ackPublishMetricHandler consumes ack messages from the backend. The backend
// sends an ack each time it processes a metric message.
func ackPublishMetricHandler(timer *time.Timer) func(*ecstcs.AckPublishMetric) {
	return func(*ecstcs.AckPublishMetric) {
		log.Debug("Received AckPublishMetric from tcs")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
Example #7
// heartbeatHandler resets the heartbeat timer when a HeartbeatMessage is received from TCS.
func heartbeatHandler(timer *time.Timer) func(*ecstcs.HeartbeatMessage) {
	return func(*ecstcs.HeartbeatMessage) {
		log.Debug("Received HeartbeatMessage from tcs")
		timer.Reset(utils.AddJitter(heartbeatTimeout, heartbeatJitter))
	}
}
Example #8
func (client *ecrClient) expirationJitter() time.Duration {
	return utils.AddJitter(MinimumJitterDuration, MaximumJitterDuration)
}
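A hedged illustration of how expirationJitter might be consumed: treating a cached ECR authorization token as expired a randomized margin early, so that a fleet of instances does not refresh in lockstep. The method isTokenStale and its expiresAt parameter are hypothetical, not the agent's actual API, and a "time" import is assumed:

// isTokenStale is a hypothetical helper: it reports the cached token as
// expired once the current time is within a jittered margin of the real
// expiry, spreading refreshes across instances.
func (client *ecrClient) isTokenStale(expiresAt time.Time) bool {
	return time.Now().After(expiresAt.Add(-client.expirationJitter()))
}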
Example #9
// StartSession creates a session with ACS and handles requests using the passed
// in arguments.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	ecsclient := args.ECSClient
	cfg := args.Config
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)

	payloadBuffer := make(chan *ecsacs.PayloadMessage, payloadMessageBufferSize)
	ackBuffer := make(chan string, payloadMessageBufferSize)

	go func() {
		// Handle any payloads asynchronously. For correctness, they must be
		// handled in order, hence the buffered channel, which is added to
		// synchronously.
		for {
			select {
			case payload := <-payloadBuffer:
				handlePayloadMessage(ackBuffer, cfg.Cluster, args.ContainerInstanceArn, payload, args.TaskEngine, ecsclient, args.StateManager)
			case <-ctx.Done():
				return
			}
		}
	}()

	for {
		acsError := func() error {
			acsEndpoint, err := ecsclient.DiscoverPollEndpoint(args.ContainerInstanceArn)
			if err != nil {
				log.Error("Unable to discover poll endpoint", "err", err)
				return err
			}
			log.Debug("Connecting to ACS endpoint " + acsEndpoint)

			url := AcsWsUrl(acsEndpoint, cfg.Cluster, args.ContainerInstanceArn, args.TaskEngine)

			clearStrChannel(ackBuffer)
			client := acsclient.New(url, cfg.AWSRegion, args.CredentialProvider, args.AcceptInvalidCert)
			defer client.Close()
			// Clear the ackBuffer whenever we get a new client because acks
			// of message IDs don't have any value across sessions
			defer clearStrChannel(ackBuffer)

			timer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				log.Warn("ACS Connection hasn't had any activity for too long; closing connection")
				closeErr := client.Close()
				if closeErr != nil {
					log.Warn("Error disconnecting: " + closeErr.Error())
				}
			})
			defer timer.Stop()
			// Any message from the server resets the disconnect timeout
			client.SetAnyRequestHandler(anyMessageHandler(timer))
			client.AddRequestHandler(payloadMessageHandler(payloadBuffer))
			// Ignore heartbeat messages; anyMessageHandler handles them
			client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

			updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

			err = client.Connect()
			if err != nil {
				log.Error("Error connecting to ACS: " + err.Error())
				return err
			}
			backoffResetTimer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				// If we connect without an error and remain connected for at
				// least 5 or so minutes, reset the backoff. This prevents
				// infrequent disconnect errors from damaging reconnectability
				// too severely.
				backoff.Reset()
			})
			defer backoffResetTimer.Stop()

			serveErr := make(chan error, 1)
			go func() {
				serveErr <- client.Serve()
			}()

			for {
				select {
				case mid := <-ackBuffer:
					ackMessageId(client, cfg.Cluster, args.ContainerInstanceArn, mid)
				case <-ctx.Done():
					return ctx.Err()
				case err := <-serveErr:
					return err
				}
			}
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if acsError == nil || acsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from acs; backing off", "err", acsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
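Example #9 also depends on a clearStrChannel helper that is not shown. A plausible non-blocking drain, written as an assumption about its behavior rather than the actual implementation:

// clearStrChannel drains any buffered message IDs without blocking
// (a sketch; the real helper may differ).
func clearStrChannel(c chan string) {
	for {
		select {
		case <-c:
		default:
			return
		}
	}
}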