func TestHandlerReconnects(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	ecsclient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	closeWS := make(chan bool)
	server, serverIn, requests, errs, err := startMockAcsServer(t, closeWS)
	if err != nil {
		t.Fatal(err)
	}
	go func() {
		for {
			select {
			case <-requests:
			case <-errs:
			}
		}
	}()

	ecsclient.EXPECT().DiscoverPollEndpoint("myArn").Return(server.URL, nil).Times(10)
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()

	ctx, cancel := context.WithCancel(context.Background())
	ended := make(chan bool, 1)
	go func() {
		handler.StartSession(ctx, handler.StartSessionArguments{
			ContainerInstanceArn: "myArn",
			CredentialProvider:   credentials.AnonymousCredentials,
			Config:               &config.Config{Cluster: "someCluster"},
			TaskEngine:           taskEngine,
			ECSClient:            ecsclient,
			StateManager:         statemanager,
			AcceptInvalidCert:    true,
		})
		// This should never return
		ended <- true
	}()
	start := time.Now()
	for i := 0; i < 10; i++ {
		serverIn <- `{"type":"HeartbeatMessage","message":{"healthy":true}}`
		closeWS <- true
	}
	if time.Since(start) > 2*time.Second {
		t.Error("Test took longer than expected; backoff should not have occured for EOF")
	}

	select {
	case <-ended:
		t.Fatal("Should not have stopped session")
	default:
	}
	cancel()
	<-ended
}
func TestHandlerReconnects(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := mock_engine.NewMockTaskEngine(ctrl)
	ecsclient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	closeWS := make(chan bool)
	server, serverIn, _, _, err := startMockAcsServer(t, closeWS)
	if err != nil {
		t.Fatal(err)
	}

	ecsclient.EXPECT().DiscoverPollEndpoint("myArn").Return(server.URL, nil).AnyTimes()
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()

	ended := make(chan bool, 1)
	go func() {
		handler.StartSession("myArn", credentials.NewCredentialProvider("", ""), &config.Config{Cluster: "someCluster"}, taskEngine, ecsclient, statemanager, true)
		// This should never return
		ended <- true
	}()
	start := time.Now()
	for i := 0; i < 10; i++ {
		serverIn <- `{"type":"HeartbeatMessage","message":{"healthy":true}}`
		closeWS <- true
	}
	if time.Since(start) > 2*time.Second {
		t.Error("Test took longer than expected; backoff should not have occured for EOF")
	}

	select {
	case <-ended:
		t.Fatal("Should never stop session")
	default:
	}
}
Esempio n. 3
0
func _main() int {
	defer log.Flush()
	versionFlag := flag.Bool("version", false, "Print the agent version information and exit")
	acceptInsecureCert := flag.Bool("k", false, "Do not verify ssl certs")
	logLevel := flag.String("loglevel", "", "Loglevel: [<crit>|<error>|<warn>|<info>|<debug>]")
	flag.Parse()

	logger.SetLevel(*logLevel)

	log.Infof("Starting Agent: %v", version.String())

	log.Info("Loading configuration")
	cfg, err := config.NewConfig()
	// Load cfg before doing 'versionFlag' so that it has the DOCKER_HOST
	// variable loaded if needed
	if *versionFlag {
		versionableEngine := engine.NewTaskEngine(cfg)
		version.PrintVersion(versionableEngine)
		return exitcodes.ExitSuccess
	}

	if err != nil {
		log.Criticalf("Error loading config: %v", err)
		// All required config values can be inferred from EC2 Metadata, so this error could be transient.
		return exitcodes.ExitError
	}
	log.Debug("Loaded config: " + cfg.String())

	var currentEc2InstanceID, containerInstanceArn string
	var taskEngine engine.TaskEngine

	if cfg.Checkpoint {
		log.Info("Checkpointing is enabled. Attempting to load state")
		var previousCluster, previousEc2InstanceID, previousContainerInstanceArn string
		previousTaskEngine := engine.NewTaskEngine(cfg)
		// previousState is used to verify that our current runtime configuration is
		// compatible with our past configuration as reflected by our state-file
		previousState, err := initializeStateManager(cfg, previousTaskEngine, &previousCluster, &previousContainerInstanceArn, &previousEc2InstanceID, acshandler.SequenceNumber)
		if err != nil {
			log.Criticalf("Error creating state manager: %v", err)
			return exitcodes.ExitTerminal
		}

		err = previousState.Load()
		if err != nil {
			log.Criticalf("Error loading previously saved state: %v", err)
			return exitcodes.ExitTerminal
		}

		if previousCluster != "" {
			// TODO Handle default cluster in a sane and unified way across the codebase
			configuredCluster := cfg.Cluster
			if configuredCluster == "" {
				log.Debug("Setting cluster to default; none configured")
				configuredCluster = config.DEFAULT_CLUSTER_NAME
			}
			if previousCluster != configuredCluster {
				log.Criticalf("Data mismatch; saved cluster '%v' does not match configured cluster '%v'. Perhaps you want to delete the configured checkpoint file?", previousCluster, configuredCluster)
				return exitcodes.ExitTerminal
			}
			cfg.Cluster = previousCluster
			log.Infof("Restored cluster '%v'", cfg.Cluster)
		}

		if instanceIdentityDoc, err := ec2.GetInstanceIdentityDocument(); err == nil {
			currentEc2InstanceID = instanceIdentityDoc.InstanceId
		} else {
			log.Criticalf("Unable to access EC2 Metadata service to determine EC2 ID: %v", err)
		}

		if previousEc2InstanceID != "" && previousEc2InstanceID != currentEc2InstanceID {
			log.Warnf("Data mismatch; saved InstanceID '%v' does not match current InstanceID '%v'. Overwriting old datafile", previousEc2InstanceID, currentEc2InstanceID)

			// Reset taskEngine; all the other values are still default
			taskEngine = engine.NewTaskEngine(cfg)
		} else {
			// Use the values we loaded if there's no issue
			containerInstanceArn = previousContainerInstanceArn
			taskEngine = previousTaskEngine
		}
	} else {
		log.Info("Checkpointing not enabled; a new container instance will be created each time the agent is run")
		taskEngine = engine.NewTaskEngine(cfg)
	}

	stateManager, err := initializeStateManager(cfg, taskEngine, &cfg.Cluster, &containerInstanceArn, &currentEc2InstanceID, acshandler.SequenceNumber)
	if err != nil {
		log.Criticalf("Error creating state manager: %v", err)
		return exitcodes.ExitTerminal
	}

	credentialProvider := auth.NewBasicAWSCredentialProvider()
	awsCreds := auth.ToSDK(credentialProvider)
	// Preflight request to make sure they're good
	if preflightCreds, err := awsCreds.Credentials(); err != nil || preflightCreds.AccessKeyID == "" {
		if preflightCreds != nil {
			log.Warnf("Error getting valid credentials (AKID %v): %v", preflightCreds.AccessKeyID, err)
		} else {
			log.Warnf("Error getting preflight credentials: %v", err)
		}
	}
	client := api.NewECSClient(awsCreds, cfg, *acceptInsecureCert)

	if containerInstanceArn == "" {
		log.Info("Registering Instance with ECS")
		containerInstanceArn, err = client.RegisterContainerInstance()
		if err != nil {
			log.Errorf("Error registering: %v", err)
			if retriable, ok := err.(utils.Retriable); ok && !retriable.Retry() {
				return exitcodes.ExitTerminal
			}
			return exitcodes.ExitError
		}
		log.Infof("Registration completed successfully. I am running as '%v' in cluster '%v'", containerInstanceArn, cfg.Cluster)
		// Save our shiny new containerInstanceArn
		stateManager.Save()
	} else {
		log.Infof("Restored from checkpoint file. I am running as '%v' in cluster '%v'", containerInstanceArn, cfg.Cluster)
	}

	// Begin listening to the docker daemon and saving changes
	taskEngine.SetSaver(stateManager)
	taskEngine.MustInit()

	go sighandlers.StartTerminationHandler(stateManager, taskEngine)

	// Agent introspection api
	go handlers.ServeHttp(&containerInstanceArn, taskEngine, cfg)

	// Start sending events to the backend
	go eventhandler.HandleEngineEvents(taskEngine, client, stateManager)

	log.Info("Beginning Polling for updates")
	err = acshandler.StartSession(containerInstanceArn, credentialProvider, cfg, taskEngine, client, stateManager, *acceptInsecureCert)
	if err != nil {
		log.Criticalf("Unretriable error starting communicating with ACS: %v", err)
		return exitcodes.ExitTerminal
	}
	log.Critical("ACS Session handler should never exit")
	return exitcodes.ExitError
}
Esempio n. 4
0
func _main() int {
	defer log.Flush()
	flagset := flag.NewFlagSet("Amazon ECS Agent", flag.ContinueOnError)
	versionFlag := flagset.Bool("version", false, "Print the agent version information and exit")
	logLevel := flagset.String("loglevel", "", "Loglevel: [<crit>|<error>|<warn>|<info>|<debug>]")
	acceptInsecureCert := flagset.Bool("k", false, "Disable SSL certificate verification. We do not recommend setting this option.")
	licenseFlag := flagset.Bool("license", false, "Print the LICENSE and NOTICE files and exit")
	blackholeEc2Metadata := flagset.Bool("blackhole-ec2-metadata", false, "Blackhole the EC2 Metadata requests. Setting this option can cause the ECS Agent to fail to work properly.  We do not recommend setting this option")
	err := flagset.Parse(os.Args[1:])
	if err != nil {
		return exitcodes.ExitTerminal
	}

	if *licenseFlag {
		license := utils.NewLicenseProvider()
		text, err := license.GetText()
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			return exitcodes.ExitError
		}
		fmt.Println(text)
		return exitcodes.ExitSuccess
	}

	logger.SetLevel(*logLevel)
	ec2MetadataClient := ec2.DefaultClient
	if *blackholeEc2Metadata {
		ec2MetadataClient = ec2.NewBlackholeEC2MetadataClient()
	}

	log.Infof("Starting Agent: %s", version.String())
	if *acceptInsecureCert {
		log.Warn("SSL certificate verification disabled. This is not recommended.")
	}
	log.Info("Loading configuration")
	cfg, cfgErr := config.NewConfig(ec2MetadataClient)
	// Load cfg and create Docker client before doing 'versionFlag' so that it has the DOCKER_HOST variable loaded if needed
	clientFactory := dockerclient.NewFactory(cfg.DockerEndpoint)
	dockerClient, err := engine.NewDockerGoClient(clientFactory, *acceptInsecureCert, cfg)
	if err != nil {
		log.Criticalf("Error creating Docker client: %v", err)
		return exitcodes.ExitError
	}

	ctx := context.Background()
	// Create the DockerContainerChange event stream for tcs
	containerChangeEventStream := eventstream.NewEventStream(ContainerChangeEventStream, ctx)
	containerChangeEventStream.StartListening()

	// Create credentials manager. This will be used by the task engine and
	// the credentials handler
	credentialsManager := credentials.NewManager()
	// Create image manager. This will be used by the task engine for saving image states
	state := dockerstate.NewDockerTaskEngineState()
	imageManager := engine.NewImageManager(cfg, dockerClient, state)
	if *versionFlag {
		versionableEngine := engine.NewTaskEngine(cfg, dockerClient, credentialsManager, containerChangeEventStream, imageManager, state)
		version.PrintVersion(versionableEngine)
		return exitcodes.ExitSuccess
	}

	sighandlers.StartDebugHandler()

	if cfgErr != nil {
		log.Criticalf("Error loading config: %v", err)
		// All required config values can be inferred from EC2 Metadata, so this error could be transient.
		return exitcodes.ExitError
	}
	log.Debug("Loaded config: " + cfg.String())

	var currentEc2InstanceID, containerInstanceArn string
	var taskEngine engine.TaskEngine

	if cfg.Checkpoint {
		log.Info("Checkpointing is enabled. Attempting to load state")
		var previousCluster, previousEc2InstanceID, previousContainerInstanceArn string
		previousTaskEngine := engine.NewTaskEngine(cfg, dockerClient, credentialsManager, containerChangeEventStream, imageManager, state)
		// previousState is used to verify that our current runtime configuration is
		// compatible with our past configuration as reflected by our state-file
		previousState, err := initializeStateManager(cfg, previousTaskEngine, &previousCluster, &previousContainerInstanceArn, &previousEc2InstanceID)
		if err != nil {
			log.Criticalf("Error creating state manager: %v", err)
			return exitcodes.ExitTerminal
		}

		err = previousState.Load()
		if err != nil {
			log.Criticalf("Error loading previously saved state: %v", err)
			return exitcodes.ExitTerminal
		}

		if previousCluster != "" {
			// TODO Handle default cluster in a sane and unified way across the codebase
			configuredCluster := cfg.Cluster
			if configuredCluster == "" {
				log.Debug("Setting cluster to default; none configured")
				configuredCluster = config.DefaultClusterName
			}
			if previousCluster != configuredCluster {
				log.Criticalf("Data mismatch; saved cluster '%v' does not match configured cluster '%v'. Perhaps you want to delete the configured checkpoint file?", previousCluster, configuredCluster)
				return exitcodes.ExitTerminal
			}
			cfg.Cluster = previousCluster
			log.Infof("Restored cluster '%v'", cfg.Cluster)
		}

		if instanceIdentityDoc, err := ec2MetadataClient.InstanceIdentityDocument(); err == nil {
			currentEc2InstanceID = instanceIdentityDoc.InstanceId
		} else {
			log.Criticalf("Unable to access EC2 Metadata service to determine EC2 ID: %v", err)
		}

		if previousEc2InstanceID != "" && previousEc2InstanceID != currentEc2InstanceID {
			log.Warnf("Data mismatch; saved InstanceID '%s' does not match current InstanceID '%s'. Overwriting old datafile", previousEc2InstanceID, currentEc2InstanceID)

			// Reset taskEngine; all the other values are still default
			taskEngine = engine.NewTaskEngine(cfg, dockerClient, credentialsManager, containerChangeEventStream, imageManager, state)
		} else {
			// Use the values we loaded if there's no issue
			containerInstanceArn = previousContainerInstanceArn
			taskEngine = previousTaskEngine
		}
	} else {
		log.Info("Checkpointing not enabled; a new container instance will be created each time the agent is run")
		taskEngine = engine.NewTaskEngine(cfg, dockerClient, credentialsManager, containerChangeEventStream, imageManager, state)
	}

	stateManager, err := initializeStateManager(cfg, taskEngine, &cfg.Cluster, &containerInstanceArn, &currentEc2InstanceID)
	if err != nil {
		log.Criticalf("Error creating state manager: %v", err)
		return exitcodes.ExitTerminal
	}

	capabilities := taskEngine.Capabilities()

	// We instantiate our own credentialProvider for use in acs/tcs. This tries
	// to mimic roughly the way it's instantiated by the SDK for a default
	// session.
	credentialProvider := defaults.CredChain(defaults.Config(), defaults.Handlers())
	// Preflight request to make sure they're good
	if preflightCreds, err := credentialProvider.Get(); err != nil || preflightCreds.AccessKeyID == "" {
		log.Warnf("Error getting valid credentials (AKID %s): %v", preflightCreds.AccessKeyID, err)
	}
	client := api.NewECSClient(credentialProvider, cfg, httpclient.New(api.RoundtripTimeout, *acceptInsecureCert), ec2MetadataClient)

	if containerInstanceArn == "" {
		log.Info("Registering Instance with ECS")
		containerInstanceArn, err = client.RegisterContainerInstance("", capabilities)
		if err != nil {
			log.Errorf("Error registering: %v", err)
			if retriable, ok := err.(utils.Retriable); ok && !retriable.Retry() {
				return exitcodes.ExitTerminal
			}
			return exitcodes.ExitError
		}
		log.Infof("Registration completed successfully. I am running as '%s' in cluster '%s'", containerInstanceArn, cfg.Cluster)
		// Save our shiny new containerInstanceArn
		stateManager.Save()
	} else {
		log.Infof("Restored from checkpoint file. I am running as '%s' in cluster '%s'", containerInstanceArn, cfg.Cluster)
		_, err = client.RegisterContainerInstance(containerInstanceArn, capabilities)
		if err != nil {
			log.Errorf("Error re-registering: %v", err)
			if awserr, ok := err.(awserr.Error); ok && api.IsInstanceTypeChangedError(awserr) {
				log.Criticalf("The current instance type does not match the registered instance type. Please revert the instance type change, or alternatively launch a new instance. Error: %v", err)
				return exitcodes.ExitTerminal
			}
			return exitcodes.ExitError
		}
	}

	// Begin listening to the docker daemon and saving changes
	taskEngine.SetSaver(stateManager)
	imageManager.SetSaver(stateManager)
	taskEngine.MustInit()

	// start of the periodic image cleanup process
	if !cfg.ImageCleanupDisabled {
		go imageManager.StartImageCleanupProcess(ctx)
	}

	go sighandlers.StartTerminationHandler(stateManager, taskEngine)

	// Agent introspection api
	go handlers.ServeHttp(&containerInstanceArn, taskEngine, cfg)

	// Start serving the endpoint to fetch IAM Role credentials
	go credentialshandler.ServeHttp(credentialsManager, containerInstanceArn, cfg)

	// Start sending events to the backend
	go eventhandler.HandleEngineEvents(taskEngine, client, stateManager)

	deregisterInstanceEventStream := eventstream.NewEventStream(DeregisterContainerInstanceEventStream, ctx)
	deregisterInstanceEventStream.StartListening()

	telemetrySessionParams := tcshandler.TelemetrySessionParams{
		ContainerInstanceArn: containerInstanceArn,
		CredentialProvider:   credentialProvider,
		Cfg:                  cfg,
		DeregisterInstanceEventStream: deregisterInstanceEventStream,
		ContainerChangeEventStream:    containerChangeEventStream,
		DockerClient:                  dockerClient,
		AcceptInvalidCert:             *acceptInsecureCert,
		EcsClient:                     client,
		TaskEngine:                    taskEngine,
	}

	// Start metrics session in a go routine
	go tcshandler.StartMetricsSession(telemetrySessionParams)

	log.Info("Beginning Polling for updates")
	err = acshandler.StartSession(ctx, acshandler.StartSessionArguments{
		AcceptInvalidCert: *acceptInsecureCert,
		Config:            cfg,
		DeregisterInstanceEventStream: deregisterInstanceEventStream,
		ContainerInstanceArn:          containerInstanceArn,
		CredentialProvider:            credentialProvider,
		ECSClient:                     client,
		StateManager:                  stateManager,
		TaskEngine:                    taskEngine,
		CredentialsManager:            credentialsManager,
	})
	if err != nil {
		log.Criticalf("Unretriable error starting communicating with ACS: %v", err)
		return exitcodes.ExitTerminal
	}
	log.Critical("ACS Session handler should never exit")
	return exitcodes.ExitError
}
func TestHandlerDoesntLeakGouroutines(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	ecsclient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()
	testTime := ttime.NewTestTime()
	ttime.SetTime(testTime)

	closeWS := make(chan bool)
	server, serverIn, requests, errs, err := startMockAcsServer(t, closeWS)
	if err != nil {
		t.Fatal(err)
	}
	go func() {
		for {
			select {
			case <-requests:
			case <-errs:
			}
		}
	}()

	timesConnected := 0
	ecsclient.EXPECT().DiscoverPollEndpoint("myArn").Return(server.URL, nil).AnyTimes().Do(func(_ interface{}) {
		timesConnected++
	})
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()
	taskEngine.EXPECT().AddTask(gomock.Any()).AnyTimes()

	ctx, cancel := context.WithCancel(context.Background())
	ended := make(chan bool, 1)
	go func() {
		handler.StartSession(ctx, handler.StartSessionArguments{"myArn", credentials.AnonymousCredentials, &config.Config{Cluster: "someCluster"}, taskEngine, ecsclient, statemanager, true})
		ended <- true
	}()
	// Warm it up
	serverIn <- `{"type":"HeartbeatMessage","message":{"healthy":true}}`
	serverIn <- samplePayloadMessage

	beforeGoroutines := runtime.NumGoroutine()
	for i := 0; i < 100; i++ {
		serverIn <- `{"type":"HeartbeatMessage","message":{"healthy":true}}`
		serverIn <- samplePayloadMessage
		closeWS <- true
	}

	cancel()
	testTime.Cancel()
	<-ended

	afterGoroutines := runtime.NumGoroutine()

	t.Logf("Gorutines after 1 and after 100 acs messages: %v and %v", beforeGoroutines, afterGoroutines)

	if timesConnected < 50 {
		t.Fatal("Expected times connected to be a large number, was ", timesConnected)
	}
	if afterGoroutines > beforeGoroutines+5 {
		t.Error("Goroutine leak, oh no!")
		pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
	}

}
func TestHeartbeatOnlyWhenIdle(t *testing.T) {
	testTime := ttime.NewTestTime()
	ttime.SetTime(testTime)
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	taskEngine := engine.NewMockTaskEngine(ctrl)
	ecsclient := mock_api.NewMockECSClient(ctrl)
	statemanager := statemanager.NewNoopStateManager()

	closeWS := make(chan bool)
	server, serverIn, requestsChan, errChan, err := startMockAcsServer(t, closeWS)
	defer close(serverIn)

	go func() {
		for {
			<-requestsChan
		}
	}()
	if err != nil {
		t.Fatal(err)
	}

	// We're testing that it does not reconnect here; must be the case
	ecsclient.EXPECT().DiscoverPollEndpoint("myArn").Return(server.URL, nil).Times(1)
	taskEngine.EXPECT().Version().Return("Docker: 1.5.0", nil).AnyTimes()

	ctx, cancel := context.WithCancel(context.Background())
	ended := make(chan bool, 1)
	go func() {
		handler.StartSession(ctx, handler.StartSessionArguments{
			ContainerInstanceArn: "myArn",
			CredentialProvider:   credentials.AnonymousCredentials,
			Config:               &config.Config{Cluster: "someCluster"},
			TaskEngine:           taskEngine,
			ECSClient:            ecsclient,
			StateManager:         statemanager,
			AcceptInvalidCert:    true,
		})
		ended <- true
	}()

	taskAdded := make(chan bool)
	taskEngine.EXPECT().AddTask(gomock.Any()).Do(func(interface{}) {
		taskAdded <- true
	}).Times(10)
	for i := 0; i < 10; i++ {
		serverIn <- samplePayloadMessage
		testTime.Warp(1 * time.Minute)
		<-taskAdded
	}

	select {
	case <-ended:
		t.Fatal("Should not have stop session")
	case err := <-errChan:
		t.Fatal("Error should not have been returned from server", err)
	default:
	}
	go server.Close()
	cancel()
	<-ended
}