func (c *ec2MetadataClientImpl) ReadResource(path string) ([]byte, error) {
	endpoint := c.ResourceServiceUrl(path)

	var err error
	var resp *http.Response
	utils.RetryNWithBackoff(utils.NewSimpleBackoff(metadataRetryStartDelay, metadataRetryMaxDelay, metadataRetryDelayMultiple, 0.2), metadataRetries, func() error {
		resp, err = c.client.Get(endpoint)
		if err == nil && resp.StatusCode == 200 {
			return nil
		}
		if resp != nil && resp.Body != nil {
			resp.Body.Close()
		}
		if err == nil {
			seelog.Warnf("Error accessing the EC2 Metadata Service; non-200 response: %v", resp.StatusCode)
			return fmt.Errorf("Error contacting EC2 Metadata service; non-200 response: %v", resp.StatusCode)
		} else {
			seelog.Warnf("Error accessing the EC2 Metadata Service; retrying: %v", err)
			return err
		}
	})
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	if err != nil {
		return nil, err
	}
	return ioutil.ReadAll(resp.Body)
}
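
// ReadResource above caps its retries with utils.RetryNWithBackoff. Below is a
// minimal, stdlib-only sketch of the same "retry at most N times with growing,
// jittered delays" pattern. The helper name retryNWithBackoff and its
// parameters are illustrative only and are not the agent's utils
// implementation; they just mirror the shape of the call above
// (start delay, max delay, multiplier, jitter fraction, attempt count).
// Assumed imports: "math/rand", "time".
func retryNWithBackoff(start, max time.Duration, multiple, jitter float64, attempts int, fn func() error) error {
	delay := start
	var err error
	for i := 0; i < attempts; i++ {
		err = fn()
		if err == nil {
			return nil
		}
		if i == attempts-1 {
			break
		}
		// Sleep for the current delay plus up to `jitter` fraction of it.
		time.Sleep(delay + time.Duration(rand.Float64()*jitter*float64(delay)))
		// Grow the delay for the next attempt, capped at max.
		delay = time.Duration(float64(delay) * multiple)
		if delay > max {
			delay = max
		}
	}
	return err
}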
// ServeHttp serves IAM Role Credentials for Tasks being managed by the agent.
func ServeHttp(credentialsManager credentials.Manager, containerInstanceArn string, cfg *config.Config) {
	// Create and initialize the audit log
	// TODO Use seelog's programmatic configuration instead of xml.
	logger, err := log.LoggerFromConfigAsString(audit.AuditLoggerConfig(cfg))
	if err != nil {
		log.Errorf("Error initializing the audit log: %v", err)
		// If the logger cannot be initialized, use the provided dummy
		// seelog.LoggerInterface, seelog.Disabled.
		logger = log.Disabled
	}

	auditLogger := audit.NewAuditLog(containerInstanceArn, cfg, logger)

	server := setupServer(credentialsManager, auditLogger)
	for {
		utils.RetryWithBackoff(utils.NewSimpleBackoff(time.Second, time.Minute, 0.2, 2), func() error {
			// TODO, make this cancellable and use the passed in context; for
			// now, not critical if this gets interrupted
			err := server.ListenAndServe()
			if err != nil {
				log.Errorf("Error running http api: %v", err)
			}
			return err
		})
	}
}
// TestHandlerReconnectsCorrectlySetsSendCredentialsURLParameter tests if
// the 'sendCredentials' URL parameter is set correctly for successive
// invocations of startACSSession
func TestHandlerReconnectsCorrectlySetsSendCredentialsURLParameter(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	ecsClient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	ctx, cancel := context.WithCancel(context.Background())
	mockWsClient := mock_wsclient.NewMockClientServer(ctrl)

	args := StartSessionArguments{
		ContainerInstanceArn: "myArn",
		CredentialProvider:   credentials.AnonymousCredentials,
		Config:               &config.Config{Cluster: "someCluster"},
		TaskEngine:           taskEngine,
		ECSClient:            ecsClient,
		StateManager:         statemanager,
		AcceptInvalidCert:    true,
		_heartbeatTimeout:    20 * time.Millisecond,
		_heartbeatJitter:     10 * time.Millisecond,
	}
	session := newSessionResources(args)

	mockWsClient.EXPECT().SetAnyRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().AddRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().Close().Return(nil).AnyTimes()
	mockWsClient.EXPECT().Serve().Return(io.EOF).AnyTimes()
	gomock.InOrder(
		// When the websocket client connects to ACS for the first
		// time, 'sendCredentials' should be set to true
		mockWsClient.EXPECT().Connect().Do(func() {
			validateSendCredentialsInSession(t, session, "true")
		}).Return(nil),
		// For all subsequent connections to ACS, 'sendCredentials'
		// should be set to false
		mockWsClient.EXPECT().Connect().Do(func() {
			validateSendCredentialsInSession(t, session, "false")
		}).Return(nil).AnyTimes(),
	)

	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)
	timer := newDisconnectionTimer(mockWsClient, args.time(), args.heartbeatTimeout(), args.heartbeatJitter())
	defer timer.Stop()
	go func() {
		for i := 0; i < 10; i++ {
			startACSSession(ctx, mockWsClient, timer, args, backoff, session)
		}
		cancel()
	}()

	// Wait for context to be cancelled
	select {
	case <-ctx.Done():
	}
}
// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics
// by the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsError := startTelemetrySession(params, statsEngine)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
// TestHandlerReconnectsOnServeErrors tests if the handler retries to
// establish the session with ACS when ClientServer.Serve() returns errors
func TestHandlerReconnectsOnServeErrors(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()
	ecsClient := mock_api.NewMockECSClient(ctrl)
	ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()
	statemanager := statemanager.NewNoopStateManager()

	ctx, cancel := context.WithCancel(context.Background())
	mockWsClient := mock_wsclient.NewMockClientServer(ctrl)
	mockWsClient.EXPECT().SetAnyRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().AddRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().Connect().Return(nil).AnyTimes()
	mockWsClient.EXPECT().Close().Return(nil).AnyTimes()
	gomock.InOrder(
		// Serve fails 10 times
		mockWsClient.EXPECT().Serve().Return(io.EOF).Times(10),
		// Cancel trying to Serve ACS requests on the 11th attempt
		// Failure to retry on Serve() errors should cause the
		// test to time out as the context is never cancelled
		mockWsClient.EXPECT().Serve().Do(func() {
			cancel()
		}).Return(io.EOF),
	)

	session := &mockSession{mockWsClient}
	args := StartSessionArguments{
		ContainerInstanceArn: "myArn",
		CredentialProvider:   credentials.AnonymousCredentials,
		Config:               &config.Config{Cluster: "someCluster"},
		TaskEngine:           taskEngine,
		ECSClient:            ecsClient,
		StateManager:         statemanager,
		AcceptInvalidCert:    true,
		_heartbeatTimeout:    20 * time.Millisecond,
		_heartbeatJitter:     10 * time.Millisecond,
	}
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)
	go func() {
		startSession(ctx, args, backoff, session)
	}()

	// Wait for context to be cancelled
	select {
	case <-ctx.Done():
	}
}
func ServeHttp(containerInstanceArn *string, taskEngine engine.TaskEngine, cfg *config.Config) {
	serverFunctions := map[string]func(w http.ResponseWriter, r *http.Request){
		"/v1/metadata": MetadataV1RequestHandlerMaker(containerInstanceArn, cfg),
		"/v1/tasks":    TasksV1RequestHandlerMaker(taskEngine),
		"/license":     LicenseHandler,
	}

	paths := make([]string, 0, len(serverFunctions))
	for path := range serverFunctions {
		paths = append(paths, path)
	}
	availableCommands := &RootResponse{paths}
	// Autogenerated list of the above serverFunctions paths
	availableCommandResponse, _ := json.Marshal(&availableCommands)

	defaultHandler := func(w http.ResponseWriter, r *http.Request) {
		w.Write(availableCommandResponse)
	}

	serverMux := http.NewServeMux()
	serverMux.HandleFunc("/", defaultHandler)
	for key, fn := range serverFunctions {
		serverMux.HandleFunc(key, fn)
	}

	// Log all requests and then pass through to serverMux
	loggingServeMux := http.NewServeMux()
	loggingServeMux.Handle("/", LoggingHandler{serverMux})

	server := http.Server{
		Addr:         ":" + strconv.Itoa(config.AGENT_INTROSPECTION_PORT),
		Handler:      loggingServeMux,
		ReadTimeout:  5 * time.Second,
		WriteTimeout: 5 * time.Second,
	}

	for {
		once := sync.Once{}
		utils.RetryWithBackoff(utils.NewSimpleBackoff(time.Second, time.Minute, 0.2, 2), func() error {
			// TODO, make this cancellable and use the passed in context; for
			// now, not critical if this gets interrupted
			err := server.ListenAndServe()
			once.Do(func() {
				log.Error("Error running http api", "err", err)
			})
			return err
		})
	}
}
// TestConnectionIsClosedOnIdle tests if the connection to ACS is closed
// when the channel is idle
func TestConnectionIsClosedOnIdle(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()
	ecsClient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	mockWsClient := mock_wsclient.NewMockClientServer(ctrl)
	mockWsClient.EXPECT().SetAnyRequestHandler(gomock.Any()).Do(func(v interface{}) {}).AnyTimes()
	mockWsClient.EXPECT().AddRequestHandler(gomock.Any()).Do(func(v interface{}) {}).AnyTimes()
	mockWsClient.EXPECT().Connect().Return(nil)
	mockWsClient.EXPECT().Serve().Do(func() {
		// Pretend as if the maximum heartbeatTimeout duration has
		// been breached while Serving requests
		time.Sleep(30 * time.Millisecond)
	}).Return(io.EOF)

	connectionClosed := make(chan bool)
	mockWsClient.EXPECT().Close().Do(func() {
		// Record connection closed
		connectionClosed <- true
	}).Return(nil)

	ctx := context.Background()
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)
	args := StartSessionArguments{
		ContainerInstanceArn: "myArn",
		CredentialProvider:   credentials.AnonymousCredentials,
		Config:               &config.Config{Cluster: "someCluster"},
		TaskEngine:           taskEngine,
		ECSClient:            ecsClient,
		StateManager:         statemanager,
		AcceptInvalidCert:    true,
		_heartbeatTimeout:    20 * time.Millisecond,
		_heartbeatJitter:     10 * time.Millisecond,
	}
	go func() {
		timer := newDisconnectionTimer(mockWsClient, args.time(), args.heartbeatTimeout(), args.heartbeatJitter())
		defer timer.Stop()
		startACSSession(ctx, mockWsClient, timer, args, backoff, &mockSession{})
	}()

	// Wait for connection to be closed. If the connection is not closed
	// due to inactivity, the test will time out
	<-connectionClosed
}
// MustInit blocks and retries until an engine can be initialized.
func (engine *DockerTaskEngine) MustInit() {
	if engine.client != nil {
		return
	}

	errorOnce := sync.Once{}
	taskEngineConnectBackoff := utils.NewSimpleBackoff(200*time.Millisecond, 2*time.Second, 0.20, 1.5)
	utils.RetryWithBackoff(taskEngineConnectBackoff, func() error {
		err := engine.Init()
		if err != nil {
			errorOnce.Do(func() {
				log.Error("Could not connect to docker daemon", "err", err)
			})
		}
		return err
	})
}
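
// MustInit above uses sync.Once so "could not connect" is logged only on the
// first failed attempt, even though the retry loop keeps running until Init
// succeeds. A stdlib-only sketch of that "log once, retry forever" pattern
// follows; waitForDependency, connect and the delay values are illustrative
// stand-ins, not agent APIs. Assumed imports: "log", "sync", "time".
func waitForDependency(connect func() error) {
	logOnce := sync.Once{}
	delay := 200 * time.Millisecond
	for {
		err := connect()
		if err == nil {
			return
		}
		logOnce.Do(func() {
			// Subsequent failures are expected while the dependency comes up,
			// so only the first one is logged.
			log.Printf("could not connect, will keep retrying: %v", err)
		})
		time.Sleep(delay)
		delay *= 2
		if delay > 2*time.Second {
			delay = 2 * time.Second
		}
	}
}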
// ServeHttp serves information about this agent / containerInstance and tasks
// running on it.
func ServeHttp(containerInstanceArn *string, taskEngine engine.TaskEngine, cfg *config.Config) {
	// Is this the right level to type assert, assuming we'd abstract multiple taskengines here?
	// Revisit if we ever add another type..
	dockerTaskEngine := taskEngine.(*engine.DockerTaskEngine)

	server := setupServer(containerInstanceArn, dockerTaskEngine, cfg)
	for {
		once := sync.Once{}
		utils.RetryWithBackoff(utils.NewSimpleBackoff(time.Second, time.Minute, 0.2, 2), func() error {
			// TODO, make this cancellable and use the passed in context; for
			// now, not critical if this gets interrupted
			err := server.ListenAndServe()
			once.Do(func() {
				log.Error("Error running http api", "err", err)
			})
			return err
		})
	}
}
// StartSession creates a session with the backend and handles requests
// using the passed in arguments.
// The engine is expected to be initialized and gathering container metrics
// by the time the websocket client starts using it.
func StartSession(params TelemetrySessionParams, statsEngine stats.Engine) error {
	backoff := utils.NewSimpleBackoff(time.Second, 1*time.Minute, 0.2, 2)
	for {
		tcsEndpoint, err := params.EcsClient.DiscoverTelemetryEndpoint(params.ContainerInstanceArn)
		if err != nil {
			log.Error("Unable to discover poll endpoint", "err", err)
			return err
		}
		log.Debug("Connecting to TCS endpoint " + tcsEndpoint)
		url := formatURL(tcsEndpoint, params.Cfg.Cluster, params.ContainerInstanceArn)
		tcsError := startSession(url, params.Cfg.AWSRegion, params.CredentialProvider, params.AcceptInvalidCert, statsEngine, defaultPublishMetricsInterval)
		if tcsError == nil || tcsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from tcs; backing off", "err", tcsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
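
// Both telemetry StartSession variants above follow the same reconnect shape:
// a clean exit (nil or io.EOF) resets the backoff, anything else sleeps for
// the next backoff interval before reconnecting. A stripped-down sketch of
// that loop is below; runWithReconnect, session, minDelay and maxDelay are
// illustrative names, and the simple doubling stands in for the agent's
// SimpleBackoff. Assumed imports: "io", "log", "time".
func runWithReconnect(session func() error, minDelay, maxDelay time.Duration) {
	delay := minDelay
	for {
		err := session()
		if err == nil || err == io.EOF {
			// Clean disconnect: reconnect immediately and restart the
			// backoff schedule.
			delay = minDelay
			continue
		}
		log.Printf("session error, backing off %v: %v", delay, err)
		time.Sleep(delay)
		delay *= 2
		if delay > maxDelay {
			delay = maxDelay
		}
	}
}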
// StartSession creates a session with ACS and handles requests from ACS.
// It creates the resources required by the package scoped 'startSession()'
// method, and invokes it to repeatedly reconnect to ACS when disconnected.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)
	session := newSessionResources(args)
	return startSession(ctx, args, backoff, session)
}
// Continuously retries sending an event until it succeeds, sleeping between each
// attempt
func SubmitTaskEvents(events *eventList, client api.ECSClient) {
	backoff := utils.NewSimpleBackoff(1*time.Second, 30*time.Second, 0.20, 1.3)

	// Mirror events.sending, but without the need to lock since this is local
	// to our goroutine
	done := false

	for !done {
		// If we looped back up here, we successfully submitted an event, but
		// we haven't emptied the list so we should keep submitting
		backoff.Reset()
		utils.RetryWithBackoff(backoff, func() error {
			// Lock and unlock within this function, allowing the list to be added
			// to while we're not actively sending an event
			log.Debug("Waiting on semaphore to send...")
			handler.submitSemaphore.Wait()
			defer handler.submitSemaphore.Post()

			log.Debug("Acquiring lock for sending event...")
			events.Lock()
			defer events.Unlock()
			log.Debug("Acquired lock!")

			var err utils.RetriableError

			if events.Len() == 0 {
				log.Debug("No events left; not retrying more")
				events.sending = false
				done = true
				return nil
			}

			eventToSubmit := events.Front()
			event := eventToSubmit.Value.(*sendableEvent)
			llog := log.New("event", event)

			if event.containerShouldBeSent() {
				llog.Info("Sending container change", "change", event.containerChange)
				err = client.SubmitContainerStateChange(event.containerChange)
				if err == nil || !err.Retry() {
					// submitted or can't be retried; ensure we don't retry it
					event.containerSent = true
					if event.containerChange.SentStatus != nil {
						*event.containerChange.SentStatus = event.containerChange.Status
					}
					statesaver.Save()
					if err != nil {
						llog.Error("Unretriable error submitting container state change", "err", err)
					} else {
						llog.Debug("Submitted container")
					}
					events.Remove(eventToSubmit)
				}
				// else, leave the event on the list and retry it next loop through
			} else if event.taskShouldBeSent() {
				llog.Info("Sending task change", "change", event.taskChange)
				err = client.SubmitTaskStateChange(event.taskChange)
				if err == nil || !err.Retry() {
					// submitted or can't be retried; ensure we don't retry it
					event.taskSent = true
					if event.taskChange.SentStatus != nil {
						*event.taskChange.SentStatus = event.taskChange.Status
					}
					statesaver.Save()
					if err != nil {
						llog.Error("Unretriable error submitting task state change", "err", err)
					} else {
						llog.Debug("Submitted task")
						backoff.Reset()
					}
					events.Remove(eventToSubmit)
				}
			} else {
				// Shouldn't be sent as either a task or container change event; must have been already sent
				llog.Info("Not submitting redundant event; just removing")
				events.Remove(eventToSubmit)
			}

			if events.Len() == 0 {
				llog.Debug("Removed the last element, no longer sending")
				events.sending = false
				done = true
				return nil
			}

			return err
		})
	}
}
// StartSession creates a session with ACS and handles requests using the passed
// in arguments.
func StartSession(ctx context.Context, args StartSessionArguments) error {
	ecsclient := args.ECSClient
	cfg := args.Config
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)

	payloadBuffer := make(chan *ecsacs.PayloadMessage, payloadMessageBufferSize)
	ackBuffer := make(chan string, payloadMessageBufferSize)

	go func() {
		// Handle any payloads async. For correctness, they must be handled in
		// order, hence the buffered channel which is added to synchronously.
		for {
			select {
			case payload := <-payloadBuffer:
				handlePayloadMessage(ackBuffer, cfg.Cluster, args.ContainerInstanceArn, payload, args.TaskEngine, ecsclient, args.StateManager)
			case <-ctx.Done():
				return
			}
		}
	}()

	for {
		acsError := func() error {
			acsEndpoint, err := ecsclient.DiscoverPollEndpoint(args.ContainerInstanceArn)
			if err != nil {
				log.Error("Unable to discover poll endpoint", "err", err)
				return err
			}
			log.Debug("Connecting to ACS endpoint " + acsEndpoint)

			url := AcsWsUrl(acsEndpoint, cfg.Cluster, args.ContainerInstanceArn, args.TaskEngine)

			clearStrChannel(ackBuffer)
			client := acsclient.New(url, cfg.AWSRegion, args.CredentialProvider, args.AcceptInvalidCert)
			defer client.Close()
			// Clear the ackbuffer whenever we get a new client because acks of
			// messageids don't have any value across sessions
			defer clearStrChannel(ackBuffer)

			timer := ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				log.Warn("ACS Connection hasn't had any activity for too long; closing connection")
				closeErr := client.Close()
				if closeErr != nil {
					log.Warn("Error disconnecting: " + closeErr.Error())
				}
			})
			defer timer.Stop()

			// Any message from the server resets the disconnect timeout
			client.SetAnyRequestHandler(anyMessageHandler(timer))
			client.AddRequestHandler(payloadMessageHandler(payloadBuffer))
			// Ignore heartbeat messages; anyMessageHandler gets 'em
			client.AddRequestHandler(func(*ecsacs.HeartbeatMessage) {})

			updater.AddAgentUpdateHandlers(client, cfg, args.StateManager, args.TaskEngine)

			err = client.Connect()
			if err != nil {
				log.Error("Error connecting to ACS: " + err.Error())
				return err
			}
			ttime.AfterFunc(utils.AddJitter(heartbeatTimeout, heartbeatJitter), func() {
				// If we do not have an error connecting and remain connected for at
				// least 5 or so minutes, reset the backoff. This prevents disconnect
				// errors that only happen infrequently from damaging the
				// reconnectability as significantly.
				backoff.Reset()
			})

			serveErr := make(chan error, 1)
			go func() {
				serveErr <- client.Serve()
			}()

			for {
				select {
				case mid := <-ackBuffer:
					ackMessageId(client, cfg.Cluster, args.ContainerInstanceArn, mid)
				case <-ctx.Done():
					return ctx.Err()
				case err := <-serveErr:
					return err
				}
			}
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		if acsError == nil || acsError == io.EOF {
			backoff.Reset()
		} else {
			log.Info("Error from acs; backing off", "err", acsError)
			ttime.Sleep(backoff.Duration())
		}
	}
}
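
// The ACS handler above arms a jittered inactivity timer and lets
// anyMessageHandler reset it on every incoming message, so the connection is
// closed only when nothing has arrived for roughly heartbeatTimeout. A
// stdlib-only sketch of that reset-on-activity pattern is below; addJitter,
// newInactivityTimer and onMessage are illustrative names, not the agent's
// API. Assumed imports: "math/rand", "time".

// addJitter adds up to `jitter` of random slack so many agents don't all
// time out at the same instant.
func addJitter(d, jitter time.Duration) time.Duration {
	if jitter <= 0 {
		return d
	}
	return d + time.Duration(rand.Int63n(int64(jitter)))
}

// newInactivityTimer closes the connection if no message resets it in time.
func newInactivityTimer(timeout, jitter time.Duration, closeConn func()) *time.Timer {
	return time.AfterFunc(addJitter(timeout, jitter), closeConn)
}

// onMessage is the per-message hook: each message pushes the deadline out.
func onMessage(timer *time.Timer, timeout, jitter time.Duration) {
	timer.Reset(addJitter(timeout, jitter))
}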
// TestHandlerReconnectsOnDiscoverPollEndpointError tests if handler retries
// to establish the session with ACS on DiscoverPollEndpoint errors
func TestHandlerReconnectsOnDiscoverPollEndpointError(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()
	ecsClient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	ctx, cancel := context.WithCancel(context.Background())
	mockWsClient := mock_wsclient.NewMockClientServer(ctrl)
	mockWsClient.EXPECT().SetAnyRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().AddRequestHandler(gomock.Any()).AnyTimes()
	mockWsClient.EXPECT().Connect().Return(nil).AnyTimes()
	mockWsClient.EXPECT().Close().Return(nil).AnyTimes()
	mockWsClient.EXPECT().Serve().Do(func() {
		// Serve() cancels the context
		cancel()
	}).Return(io.EOF)
	session := &mockSession{mockWsClient}

	gomock.InOrder(
		// DiscoverPollEndpoint returns an error on its first invocation
		ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return("", fmt.Errorf("oops")).Times(1),
		// Second invocation returns a success
		ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).Times(1),
	)

	args := StartSessionArguments{
		ContainerInstanceArn: "myArn",
		CredentialProvider:   credentials.AnonymousCredentials,
		Config:               &config.Config{Cluster: "someCluster"},
		TaskEngine:           taskEngine,
		ECSClient:            ecsClient,
		StateManager:         statemanager,
		AcceptInvalidCert:    true,
		_heartbeatTimeout:    20 * time.Millisecond,
		_heartbeatJitter:     10 * time.Millisecond,
	}
	backoff := utils.NewSimpleBackoff(connectionBackoffMin, connectionBackoffMax, connectionBackoffJitter, connectionBackoffMultiplier)
	go func() {
		startSession(ctx, args, backoff, session)
	}()
	start := time.Now()

	// Wait for context to be cancelled
	select {
	case <-ctx.Done():
	}

	// Measure the duration between retries
	timeSinceStart := time.Since(start)
	if timeSinceStart < connectionBackoffMin {
		t.Errorf("Duration since start is less than minimum threshold for backoff: %s", timeSinceStart.String())
	}
	// The upper limit here should really be connectionBackoffMin + (connectionBackoffMin * jitter)
	// But, it can be off by a few milliseconds to account for execution of other instructions
	// In any case, it should never be higher than 2*connectionBackoffMin
	if timeSinceStart > 2*connectionBackoffMin {
		t.Errorf("Duration since start is greater than maximum anticipated wait time: %v", timeSinceStart.String())
	}
}