Example #1
// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics by
// the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsError := startTelemetrySession(params, statsEngine)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
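
The NewSimpleBackoff(min, max, jitter, multiplier) call above comes from the agent's utils package; its real implementation is not shown here. As a rough mental model, a capped exponential backoff with jitter could look like the following sketch (field names, growth rule, and jitter handling are assumptions for illustration, not the agent's code):

package backoffsketch

import (
	"math/rand"
	"time"
)

// simpleBackoff is an illustrative stand-in for the SimpleBackoff constructed
// above; the fields and behavior are assumptions, not the agent's implementation.
type simpleBackoff struct {
	current, min, max time.Duration
	jitter            float64 // e.g. 0.2 varies each delay by up to +/-20%
	multiplier        float64 // e.g. 2 doubles the base delay after each call
}

func newSimpleBackoff(min, max time.Duration, jitter, multiplier float64) *simpleBackoff {
	return &simpleBackoff{current: min, min: min, max: max, jitter: jitter, multiplier: multiplier}
}

// Duration returns the next delay with jitter applied and grows the base
// delay for the following call, capped at max.
func (b *simpleBackoff) Duration() time.Duration {
	base := b.current
	next := time.Duration(float64(b.current) * b.multiplier)
	if next > b.max {
		next = b.max
	}
	b.current = next
	delta := (rand.Float64()*2 - 1) * b.jitter * float64(base)
	return base + time.Duration(delta)
}

// Reset restores the initial delay; the callers above invoke it after a
// session ends cleanly so the next reconnect is not penalized.
func (b *simpleBackoff) Reset() { b.current = b.min }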
Example #2
// RetryWithBackoff takes a Backoff and a function to call that returns an error.
// If the error is nil, the function will no longer be called.
// If the error implements Retriable, that is used to determine whether the call
// should be retried.
func RetryWithBackoff(backoff Backoff, fn func() error) error {
	var err error
	for err = fn(); true; err = fn() {
		retriable, isRetriable := err.(Retriable)

		if err == nil || isRetriable && !retriable.Retry() {
			return err
		}

		ttime.Sleep(backoff.Duration())
	}
	return err
}
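
A usage sketch for the function above, assuming it lives alongside NewSimpleBackoff in the same utils package as in Example #1; registerWithRetries and registerOnce are hypothetical names, not agent APIs:

// registerWithRetries retries an arbitrary flaky call with capped exponential
// backoff until it returns nil or an error that implements Retriable with
// Retry() == false. registerOnce is whatever operation the caller wants retried.
func registerWithRetries(registerOnce func() error) error {
	backoff := utils.NewSimpleBackoff(250*time.Millisecond, 10*time.Second, 0.2, 2)
	return utils.RetryWithBackoff(backoff, registerOnce)
}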
Example #3
// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics by
// the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsEndpoint, err := params.EcsClient.DiscoverTelemetryEndpoint(params.ContainerInstanceArn)
		if err != nil {
			log.Error("Unable to discover poll endpoint", "err", err)
			return err
		}
		log.Debug("Connecting to TCS endpoint " + tcsEndpoint)
		url := formatURL(tcsEndpoint, params.Cfg.Cluster, params.ContainerInstanceArn)
		tcsError := startSession(url, params.Cfg.AWSRegion, params.CredentialProvider, params.AcceptInvalidCert, statsEngine, defaultPublishMetricsInterval)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
Example #4
// ECSRetryHandler defines how to retry ECS service calls. It behaves like the
// default retry handler, except for the SubmitStateChange operations, where it
// has a massive upper limit on retry counts.
func ECSRetryHandler(r *aws.Request) {
	if r.Operation == nil || (r.Operation.Name != opSubmitContainerStateChange && r.Operation.Name != opSubmitTaskStateChange) {
		aws.AfterRetryHandler(r)
		return
	}
	// else this is a Submit*StateChange operation
	// For these operations, fake the retry count for the sake of the WillRetry check.
	// Do this by temporarily setting it to 0 before calling that check.
	// We still keep the value around for sleep calculations
	// See https://github.com/aws/aws-sdk-go/blob/b2d953f489cf94029392157225e893d7b69cd447/aws/handler_functions.go#L107
	// for this code's inspiration
	realRetryCount := r.RetryCount
	if r.RetryCount < maxSubmitRetryCount {
		r.RetryCount = 0
	}

	r.Retryable.Set(r.Service.ShouldRetry(r))
	if r.WillRetry() {
		r.RetryCount = realRetryCount
		if r.RetryCount > 20 {
			// Hardcoded cap for calling RetryRules here because the computed delay
			// *will* overflow if you let it, resulting in sleeping a negative time.
			r.RetryDelay = maxSubmitRetryDelay
		} else {
			r.RetryDelay = durationMin(maxSubmitRetryDelay, r.Service.RetryRules(r))
		}
		// AddJitter is purely additive, so subtracting half the amount of jitter
		// makes it average out to RetryDelay
		ttime.Sleep(utils.AddJitter(r.RetryDelay-submitRetryDelayJitter/2, submitRetryDelayJitter))

		if r.Error != nil {
			if err, ok := r.Error.(awserr.Error); ok {
				if isCodeExpiredCreds(err.Code()) {
					r.Config.Credentials.Expire()
				}
			}
		}

		r.RetryCount++
		r.Error = nil
	}
}
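
The sleep call above subtracts half the jitter before adding it back because AddJitter is purely additive; starting from RetryDelay minus jitter/2 makes the expected sleep equal RetryDelay. A small self-contained sketch of that arithmetic (addJitter here is an illustrative re-implementation, not the agent's utils.AddJitter):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// addJitter mimics a purely additive jitter helper: it returns duration plus a
// random amount in [0, jitter).
func addJitter(duration, jitter time.Duration) time.Duration {
	return duration + time.Duration(rand.Int63n(int64(jitter)))
}

func main() {
	retryDelay := 5 * time.Second
	jitter := 1 * time.Second
	// Start half a jitter below the target so the average of
	// duration + U[0, jitter) comes out to retryDelay.
	sleep := addJitter(retryDelay-jitter/2, jitter)
	fmt.Println("sleeping for", sleep) // somewhere in [4.5s, 5.5s)
}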
Example #5
// StartSession creates a session with ACS and handles requests using the passed
// in arguments.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	ecsclient := args.ECSClient
	cfg := args.Config
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)

	payloadBuffer := make(chan *ecsacs.PayloadMessage, payloadMessageBufferSize)
	ackBuffer := make(chan string, payloadMessageBufferSize)

	go func() {
		// Handle any payloads asynchronously. For correctness they must be handled
		// in order, hence the buffered channel, which is written to synchronously.
		for {
			select {
			case payload := <-payloadBuffer:
				handlePayloadMessage(ackBuffer, cfg.Cluster, args.ContainerInstanceArn, payload, args.TaskEngine, ecsclient, args.StateManager)
			case <-ctx.Done():
				return
			}
		}
	}()

	for {
		acsError := func() error {
			acsEndpoint, err := ecsclient.DiscoverPollEndpoint(args.ContainerInstanceArn)
			if err != nil {
				log.Error("Unable to discover poll endpoint", "err", err)
				return err
			}
			log.Debug("Connecting to ACS endpoint " + acsEndpoint)

			url := AcsWsUrl(acsEndpoint, cfg.Cluster, args.ContainerInstanceArn, args.TaskEngine)

			clearStrChannel(ackBuffer)
			client := acsclient.New(url, cfg.AWSRegion, args.CredentialProvider, args.AcceptInvalidCert)
			defer client.Close()
			// Clear the ack buffer whenever we get a new client because acks of
			// message IDs have no value across sessions.
			defer clearStrChannel(ackBuffer)

			timer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				log.Warn("ACS Connection hasn't had any activity for too long; closing connection")
				closeErr := client.Close()
				if closeErr != nil {
					log.Warn("Error disconnecting: " + closeErr.Error())
				}
			})
			defer timer.Stop()
			// Any message from the server resets the disconnect timeout
			client.SetAnyRequestHandler(anyMessageHandler(timer))
			client.AddRequestHandler(payloadMessageHandler(payloadBuffer))
			// Ignore heartbeat messages; anyMessageHandler gets 'em
			client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

			updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

			err = client.Connect()
			if err != nil {
				log.Error("Error connecting to ACS: " + err.Error())
				return err
			}
			ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				// If we connect without error and stay connected for at least the
				// heartbeat timeout (roughly five minutes), reset the backoff. This
				// prevents infrequent disconnect errors from hurting reconnect
				// behavior too much.
				backoff.Reset()
			})

			serveErr := make(chan error, 1)
			go func() {
				serveErr <- client.Serve()
			}()

			for {
				select {
				case mid := <-ackBuffer:
					ackMessageId(client, cfg.Cluster, args.ContainerInstanceArn, mid)
				case <-ctx.Done():
					return ctx.Err()
				case err := <-serveErr:
					return err
				}
			}
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if acsError == nil || acsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from acs; backing off", "err", acsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
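
clearStrChannel is not shown in this example. One plausible implementation, included only as an assumption to make the ack-buffer comments concrete, is a non-blocking drain of the buffered channel:

// clearStrChannel drains any buffered message IDs without blocking, so stale
// acks from a previous session are never replayed into a new one. This is a
// sketch, not the agent's actual helper.
func clearStrChannel(c chan string) {
	for {
		select {
		case <-c:
		default:
			return
		}
	}
}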