func TestRetryWithBackoff(t *testing.T) {
	test_time := ttime.NewTestTime()
	test_time.LudicrousSpeed(true)
	ttime.SetTime(test_time)
	start := ttime.Now()

	counter := 3
	RetryWithBackoff(NewSimpleBackoff(100*time.Millisecond, 100*time.Millisecond, 0, 1), func() error {
		if counter == 0 {
			return nil
		}
		counter--
		return errors.New("err")
	})
	if counter != 0 {
		t.Error("Counter didn't go to 0; didn't get retried enough")
	}
	testTime := ttime.Since(start)
	if testTime.Seconds() < .29 || testTime.Seconds() > .31 {
		t.Error("Retry didn't backoff for as long as expected")
	}

	start = ttime.Now()
	RetryWithBackoff(NewSimpleBackoff(10*time.Second, 20*time.Second, 0, 2), func() error {
		return NewRetriableError(NewRetriable(false), errors.New("can't retry"))
	})
	if ttime.Since(start).Seconds() > .1 {
		t.Error("Retry for the trivial function took too long")
	}
}
func (task *managedTask) cleanupTask() {
	cleanupTime := ttime.After(task.KnownStatusTime.Add(taskStoppedDuration).Sub(ttime.Now()))
	cleanupTimeBool := make(chan bool)
	go func() {
		<-cleanupTime
		cleanupTimeBool <- true
		close(cleanupTimeBool)
	}()
	for !task.waitEvent(cleanupTimeBool) {
	}
	log.Debug("Cleaning up task's containers and data", "task", task.Task)

	// First make an attempt to cleanup resources
	task.engine.sweepTask(task.Task)
	task.engine.state.RemoveTask(task.Task)

	// Now remove ourselves from the global state and cleanup channels
	task.engine.processTasks.Lock()
	delete(task.engine.managedTasks, task.Arn)
	task.engine.processTasks.Unlock()
	task.engine.saver.Save()

	// Cleanup any leftover messages before closing their channels. No new
	// messages possible because we deleted ourselves from managedTasks, so this
	// removes all stale ones
	task.discardPendingMessages()
	close(task.dockerMessages)
	close(task.acsMessages)
}
func TestServerExceptionRetries(t *testing.T) {
	ctrl, client, mockRoundTripper := setup(t)
	defer ctrl.Finish()

	timesCalled := 0
	// This resp.Body song and dance is because it *must* be reset between
	// retries for the sdk to behave sanely; it rewinds request bodies, not
	// response bodies. The actual server would, indeed, put a new body in each
	// response, so this is not a bad thing to do
	resp := operationErrorResp(500, `{"__type":"BadStuffHappenedException","message":"something went wrong"}`)
	mockRoundTripper.EXPECT().RoundTrip(mock_http.NewHTTPOperationMatcher(versionedOperation("DiscoverPollEndpoint"))).AnyTimes().Do(func(_ interface{}) {
		timesCalled++
		resp.Body = operationErrorResp(500, `{"__type":"BadStuffHappenedException","message":"something went wrong"}`).Body
	}).Return(resp, nil)

	start := ttime.Now()
	_, err := client.DiscoverPollEndpoint("foo")
	if err == nil {
		t.Error("Expected it to error after retrying")
	}
	duration := ttime.Since(start)
	if duration < 100*time.Millisecond {
		t.Error("Retries should have taken some time; took " + duration.String())
	}
	if timesCalled < 2 || timesCalled > 10 {
		// Actually 4 at the time of writing, but a reasonable range is fine
		t.Error("Retries should happen a reasonable number of times")
	}
}
func TestRetryNWithBackoff(t *testing.T) {
	test_time := ttime.NewTestTime()
	test_time.LudicrousSpeed(true)
	ttime.SetTime(test_time)

	start := ttime.Now()
	counter := 3
	err := RetryNWithBackoff(NewSimpleBackoff(100*time.Millisecond, 100*time.Millisecond, 0, 1), 2, func() error {
		counter--
		return errors.New("err")
	})
	if counter != 1 {
		t.Error("Should have stopped after two tries")
	}
	if err == nil {
		t.Error("Should have returned appropriate error")
	}
	testTime := ttime.Since(start)
	// Expect that it tried twice, sleeping once between them
	if testTime.Seconds() < 0.09 || testTime.Seconds() > 0.11 {
		t.Errorf("Retry didn't backoff for as long as expected: %v", testTime.Seconds())
	}

	start = ttime.Now()
	counter = 3
	err = RetryNWithBackoff(NewSimpleBackoff(100*time.Millisecond, 100*time.Millisecond, 0, 1), 5, func() error {
		counter--
		if counter == 0 {
			return nil
		}
		return errors.New("err")
	})
	testTime = ttime.Since(start)
	if counter != 0 {
		t.Errorf("Counter expected to be 0, was %v", counter)
	}
	if err != nil {
		t.Errorf("Expected no error, got %v", err)
	}
	// 3 tries; 2 backoffs
	if testTime.Seconds() < 0.190 || testTime.Seconds() > 0.210 {
		t.Errorf("Retry didn't backoff for as long as expected: %v", testTime.Seconds())
	}
}
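// Usage sketch (illustrative only, not part of the original tests): the retry
// helpers exercised above are typically driven as below. This assumes only what
// the tests already show -- NewSimpleBackoff(min, max, jitter, multiple),
// RetryNWithBackoff(backoff, n, fn), NewRetriableError/NewRetriable for marking
// an error non-retriable, and that RetryWithBackoff returns the final error the
// way RetryNWithBackoff does. pingEndpoint is a hypothetical caller-supplied
// operation.
func exampleRetryUsage(pingEndpoint func() error) error {
	// Retry pingEndpoint with backoff between 250ms and 2s (20% jitter, 2x growth)
	// until it succeeds. Returning NewRetriableError(NewRetriable(false), err)
	// from the function would make RetryWithBackoff give up immediately, as the
	// second half of TestRetryWithBackoff demonstrates.
	if err := RetryWithBackoff(NewSimpleBackoff(250*time.Millisecond, 2*time.Second, 0.2, 2), pingEndpoint); err != nil {
		return err
	}
	// Alternatively, bound the number of attempts: at most 5 tries, sleeping
	// between failed attempts.
	return RetryNWithBackoff(NewSimpleBackoff(100*time.Millisecond, time.Second, 0.1, 2), 5, pingEndpoint)
}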
func (task *managedTask) cleanupTask(taskStoppedDuration time.Duration) {
	cleanupTimeDuration := task.GetKnownStatusTime().Add(taskStoppedDuration).Sub(ttime.Now())
	// There is a potential deadlock here if cleanupTime is negative. Ignore the computed
	// value in this case in favor of the default config value.
	if cleanupTimeDuration < 0 {
		log.Debug("Task Cleanup Duration is too short. Resetting to " + config.DefaultTaskCleanupWaitDuration.String())
		cleanupTimeDuration = config.DefaultTaskCleanupWaitDuration
	}
	cleanupTime := task.time().After(cleanupTimeDuration)
	cleanupTimeBool := make(chan bool)
	go func() {
		<-cleanupTime
		cleanupTimeBool <- true
		close(cleanupTimeBool)
	}()
	for !task.waitEvent(cleanupTimeBool) {
	}
	log.Debug("Cleaning up task's containers and data", "task", task.Task)

	// For the duration of this, simply discard any task events; this ensures the
	// speedy processing of other events for other tasks
	handleCleanupDone := make(chan struct{})
	go func() {
		task.engine.sweepTask(task.Task)
		task.engine.state.RemoveTask(task.Task)
		handleCleanupDone <- struct{}{}
	}()
	// discard events while the task is being removed from engine state
	task.discardEventsUntil(handleCleanupDone)
	log.Debug("Finished removing task data; removing from state and no longer managing", "task", task.Task)

	// Now remove ourselves from the global state and cleanup channels
	go task.discardEventsUntil(handleCleanupDone) // keep discarding events until the task is fully gone
	task.engine.processTasks.Lock()
	delete(task.engine.managedTasks, task.Arn)
	handleCleanupDone <- struct{}{}
	task.engine.processTasks.Unlock()
	task.engine.saver.Save()

	// Cleanup any leftover messages before closing their channels. No new
	// messages possible because we deleted ourselves from managedTasks, so this
	// removes all stale ones
	task.discardPendingMessages()
	close(task.dockerMessages)
	close(task.acsMessages)
}
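// discardEventsUntil is called by cleanupTask above but its body is not shown in
// this excerpt. A minimal sketch of what such a helper could look like, assuming
// it simply drains the dockerMessages and acsMessages channels named in
// cleanupTask until the done channel is signalled; the real implementation may
// differ.
func (task *managedTask) discardEventsUntil(done chan struct{}) {
	for {
		select {
		case <-task.dockerMessages:
			// drop docker events while cleanup is in progress
		case <-task.acsMessages:
			// drop ACS messages while cleanup is in progress
		case <-done:
			return
		}
	}
}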
func TestSubmitRetries(t *testing.T) {
	ctrl, client, mockRoundTripper := setup(t)
	defer ctrl.Finish()

	timesCalled := 0
	resp := operationErrorResp(500, `{"__type":"SubmitContainerStateChangeException","message":"something broke horribly"}`)
	mockRoundTripper.EXPECT().RoundTrip(mock_http.NewHTTPOperationMatcher(versionedOperation("SubmitContainerStateChange"))).AnyTimes().Do(func(_ interface{}) {
		timesCalled++
		resp.Body = operationErrorResp(500, `{"__type":"SubmitContainerStateChangeException","message":"something broke horribly"}`).Body
	}).Return(resp, nil)

	start := ttime.Now()
	err := client.SubmitContainerStateChange(ContainerStateChange{ContainerName: "foo", TaskArn: "bar", Status: ContainerRunning})
	if err == nil {
		t.Fatal("Expected it to error after retrying")
	}
	duration := ttime.Since(start)
	if duration < 23*time.Hour || duration > 25*time.Hour {
		t.Fatal("Retries should have taken roughly 24 hours; took " + duration.String())
	}
	if timesCalled < 10 {
		t.Fatal("Expected to be called many times")
	}
}
func (task *Task) updateKnownStatusTime() {
	task.knownStatusTimeLock.Lock()
	defer task.knownStatusTimeLock.Unlock()
	task.KnownStatusTime = ttime.Now()
}
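// cleanupTask above reads the timestamp through GetKnownStatusTime, which is not
// shown in this excerpt. A minimal sketch of such an accessor, assuming it simply
// mirrors the locking used by updateKnownStatusTime; the real getter may differ.
func (task *Task) GetKnownStatusTime() time.Time {
	task.knownStatusTimeLock.Lock()
	defer task.knownStatusTimeLock.Unlock()
	return task.KnownStatusTime
}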
func (u *updater) stageUpdateHandler() func(req *ecsacs.StageUpdateMessage) {
	return func(req *ecsacs.StageUpdateMessage) {
		u.Lock()
		defer u.Unlock()

		if req == nil || req.MessageId == nil {
			log.Error("Nil request to stage update or missing MessageID")
			return
		}

		nack := func(reason string) {
			seelog.Errorf("Nacking StageUpdate; reason: %s", reason)
			u.acs.MakeRequest(&ecsacs.NackRequest{
				Cluster:           req.ClusterArn,
				ContainerInstance: req.ContainerInstanceArn,
				MessageId:         req.MessageId,
				Reason:            aws.String(reason),
			})
			u.reset()
		}

		if !u.config.UpdatesEnabled {
			nack("Updates are disabled")
			return
		}

		if err := validateUpdateInfo(req.UpdateInfo); err != nil {
			nack("Invalid update: " + err.Error())
			return
		}

		log.Debug("Staging update", "update", req)

		if u.stage != updateNone {
			if u.updateID != "" && u.updateID == *req.UpdateInfo.Signature {
				log.Debug("Update already in progress, acking duplicate message", "id", u.updateID)
				// Acking here is safe as any currently-downloading update will already be holding
				// the update lock. A failed download will nack and clear state (while holding the
				// update lock) before this code is reached, meaning that the above conditional will
				// not evaluate true (no matching, in-progress update).
				u.acs.MakeRequest(&ecsacs.AckRequest{
					Cluster:           req.ClusterArn,
					ContainerInstance: req.ContainerInstanceArn,
					MessageId:         req.MessageId,
				})
				return
			} else {
				// Nack previous update
				reason := "New update arrived: " + *req.MessageId
				u.acs.MakeRequest(&ecsacs.NackRequest{
					Cluster:           req.ClusterArn,
					ContainerInstance: req.ContainerInstanceArn,
					MessageId:         &u.downloadMessageID,
					Reason:            &reason,
				})
			}
		}

		u.updateID = *req.UpdateInfo.Signature
		u.stage = updateDownloading
		u.stageTime = ttime.Now()
		u.downloadMessageID = *req.MessageId
		err := u.download(req.UpdateInfo)
		if err != nil {
			nack("Unable to download: " + err.Error())
			return
		}

		u.stage = updateDownloaded

		u.acs.MakeRequest(&ecsacs.AckRequest{
			Cluster:           req.ClusterArn,
			ContainerInstance: req.ContainerInstanceArn,
			MessageId:         req.MessageId,
		})
	}
}
func (t *Task) SetKnownStatus(status TaskStatus) {
	t.KnownStatus = status
	t.KnownStatusTime = ttime.Now()
}
func TestTaskFromACS(t *testing.T) {
	test_time := ttime.Now().Truncate(1 * time.Second).Format(time.RFC3339)

	intptr := func(i int64) *int64 {
		return &i
	}
	boolptr := func(b bool) *bool {
		return &b
	}
	// Testing type conversions, bleh. At least the type conversion itself
	// doesn't look this messy.
	taskFromAcs := ecsacs.Task{
		Arn:           strptr("myArn"),
		DesiredStatus: strptr("RUNNING"),
		Family:        strptr("myFamily"),
		Version:       strptr("1"),
		Containers: []*ecsacs.Container{
			&ecsacs.Container{
				Name:        strptr("myName"),
				Cpu:         intptr(10),
				Command:     []*string{strptr("command"), strptr("command2")},
				EntryPoint:  []*string{strptr("sh"), strptr("-c")},
				Environment: map[string]*string{"key": strptr("value")},
				Essential:   boolptr(true),
				Image:       strptr("image:tag"),
				Links:       []*string{strptr("link1"), strptr("link2")},
				Memory:      intptr(100),
				MountPoints: []*ecsacs.MountPoint{
					&ecsacs.MountPoint{
						ContainerPath: strptr("/container/path"),
						ReadOnly:      boolptr(true),
						SourceVolume:  strptr("sourceVolume"),
					},
				},
				Overrides: strptr(`{"command":["a","b","c"]}`),
				PortMappings: []*ecsacs.PortMapping{
					&ecsacs.PortMapping{
						HostPort:      intptr(800),
						ContainerPort: intptr(900),
						Protocol:      strptr("udp"),
					},
				},
				VolumesFrom: []*ecsacs.VolumeFrom{
					&ecsacs.VolumeFrom{
						ReadOnly:        boolptr(true),
						SourceContainer: strptr("volumeLink"),
					},
				},
				DockerConfig: &ecsacs.DockerConfig{
					Config:     strptr("config json"),
					HostConfig: strptr("hostconfig json"),
					Version:    strptr("version string"),
				},
			},
		},
		Volumes: []*ecsacs.Volume{
			&ecsacs.Volume{
				Name: strptr("volName"),
				Host: &ecsacs.HostVolumeProperties{
					SourcePath: strptr("/host/path"),
				},
			},
		},
		RoleCredentials: &ecsacs.IAMRoleCredentials{
			CredentialsId:   strptr("credsId"),
			AccessKeyId:     strptr("keyId"),
			Expiration:      strptr(test_time),
			RoleArn:         strptr("roleArn"),
			SecretAccessKey: strptr("OhhSecret"),
			SessionToken:    strptr("sessionToken"),
		},
	}
	expectedTask := &Task{
		Arn:           "myArn",
		DesiredStatus: TaskRunning,
		Family:        "myFamily",
		Version:       "1",
		Containers: []*Container{
			&Container{
				Name:        "myName",
				Image:       "image:tag",
				Command:     []string{"command", "command2"},
				Links:       []string{"link1", "link2"},
				EntryPoint:  &[]string{"sh", "-c"},
				Essential:   true,
				Environment: map[string]string{"key": "value"},
				Cpu:         10,
				Memory:      100,
				MountPoints: []MountPoint{
					MountPoint{
						ContainerPath: "/container/path",
						ReadOnly:      true,
						SourceVolume:  "sourceVolume",
					},
				},
				Overrides: ContainerOverrides{
					Command: &[]string{"a", "b", "c"},
				},
				Ports: []PortBinding{
					PortBinding{
						HostPort:      800,
						ContainerPort: 900,
						Protocol:      TransportProtocolUDP,
					},
				},
				VolumesFrom: []VolumeFrom{
					VolumeFrom{
						ReadOnly:        true,
						SourceContainer: "volumeLink",
					},
				},
				DockerConfig: DockerConfig{
					Config:     strptr("config json"),
					HostConfig: strptr("hostconfig json"),
					Version:    strptr("version string"),
				},
			},
		},
		Volumes: []TaskVolume{
			TaskVolume{
				Name: "volName",
				Volume: &FSHostVolume{
					FSSourcePath: "/host/path",
				},
			},
		},
		StartSequenceNumber: 42,
	}

	seqNum := int64(42)
	task, err := TaskFromACS(&taskFromAcs, &ecsacs.PayloadMessage{SeqNum: &seqNum})
	if err != nil {
		t.Fatalf("Should be able to handle acs task: %v", err)
	}
	if !reflect.DeepEqual(task.Containers, expectedTask.Containers) {
		t.Fatal("Containers should be equal")
	}
	if !reflect.DeepEqual(task.Volumes, expectedTask.Volumes) {
		t.Fatal("Volumes should be equal")
	}
	if !reflect.DeepEqual(task.StartSequenceNumber, expectedTask.StartSequenceNumber) {
		t.Fatal("StartSequenceNumber should be equal")
	}
	if !reflect.DeepEqual(task.StopSequenceNumber, expectedTask.StopSequenceNumber) {
		t.Fatal("StopSequenceNumber should be equal")
	}
}
func TestDockerStopTimeout(t *testing.T) {
	os.Setenv("ECS_CONTAINER_STOP_TIMEOUT", testDockerStopTimeout.String())
	defer os.Unsetenv("ECS_CONTAINER_STOP_TIMEOUT")
	cfg := defaultTestConfig()

	taskEngine, done, _ := setup(cfg, t)
	dockerTaskEngine := taskEngine.(*DockerTaskEngine)
	if dockerTaskEngine.cfg.DockerStopTimeout != testDockerStopTimeout {
		t.Errorf("Expected the docker stop timeout to be read from the environment variable ECS_CONTAINER_STOP_TIMEOUT, got %v", dockerTaskEngine.cfg.DockerStopTimeout)
	}
	testTask := createTestTask("TestDockerStopTimeout")
	testTask.Containers = append(testTask.Containers, createTestContainer())
	testTask.Containers[0].Command = []string{"sh", "-c", "while true; do echo `date +%T`; sleep 1s; done;"}
	testTask.Containers[0].Image = testBusyboxImage
	testTask.Containers[0].Name = "test-docker-timeout"

	taskEvents, contEvents := dockerTaskEngine.TaskEvents()

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		for {
			select {
			case <-taskEvents:
			case <-ctx.Done():
				return
			}
		}
	}()

	defer func() {
		done()
		cancel()
	}()

	go dockerTaskEngine.AddTask(testTask)

	// Wait for the container to report running
	for contEvent := range contEvents {
		if contEvent.TaskArn != testTask.Arn {
			continue
		}
		if contEvent.Status == api.ContainerRunning {
			break
		}
		if contEvent.Status > api.ContainerRunning {
			t.Error("Expected container to run, not stop")
		}
	}

	startTime := ttime.Now()
	dockerTaskEngine.stopContainer(testTask, testTask.Containers[0])
	// Wait for the container to report stopped
	for contEvent := range contEvents {
		if contEvent.TaskArn != testTask.Arn {
			continue
		}
		if contEvent.Status == api.ContainerStopped {
			break
		}
		if contEvent.Status > api.ContainerStopped {
			t.Error("Expected container to stop")
		}
	}

	if ttime.Since(startTime) < testDockerStopTimeout {
		t.Errorf("Container stopped before the timeout: %v", ttime.Since(startTime))
	}
	if ttime.Since(startTime) > testDockerStopTimeout+1*time.Second {
		t.Errorf("Container should have stopped earlier, but stopped after %v", ttime.Since(startTime))
	}
}