func (t *Tester) testCreateGeneratesNameReturnsServerTimeout(valid runtime.Object) {
	objectMeta := t.getObjectMetaOrFail(valid)
	objectMeta.Name = ""
	objectMeta.GenerateName = "test-"
	t.withStorageError(errors.NewAlreadyExists("kind", "thing"), func() {
		_, err := t.storage.(rest.Creater).Create(t.TestContext(), valid)
		if err == nil || !errors.IsServerTimeout(err) {
			t.Fatalf("Unexpected error: %v", err)
		}
	})
}
func (t *Tester) TestCreateGeneratesNameReturnsServerTimeout(valid runtime.Object) {
	objectMeta, err := api.ObjectMetaFor(valid)
	if err != nil {
		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, valid)
	}
	objectMeta.Name = ""
	objectMeta.GenerateName = "test-"
	t.withStorageError(errors.NewAlreadyExists("kind", "thing"), func() {
		_, err := t.storage.(rest.Creater).Create(t.TestContext(), valid)
		if err == nil || !errors.IsServerTimeout(err) {
			t.Fatalf("Unexpected error: %v", err)
		}
	})
}
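A registry's own unit test would drive this generic helper with one of its valid objects. The sketch below is hypothetical: the newTester constructor and the bare api.Service value are assumptions for illustration, not part of the original code.

// TestServiceCreateGeneratesName is a hypothetical caller of the Tester helper above.
func TestServiceCreateGeneratesName(t *testing.T) {
	tester := newTester(t) // assumed constructor that wires the Tester to this registry's storage
	tester.TestCreateGeneratesNameReturnsServerTimeout(&api.Service{
		ObjectMeta: api.ObjectMeta{Name: "foo"}, // Name is cleared and GenerateName set by the helper
	})
}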
// updateService fetches a service, calls the update function on it,
// and then attempts to send the updated service. It retries up to 2
// times in the face of timeouts and conflicts.
func updateService(c *client.Client, namespace, serviceName string, update func(*api.Service)) (*api.Service, error) {
	var service *api.Service
	var err error
	for i := 0; i < 3; i++ {
		service, err = c.Services(namespace).Get(serviceName)
		if err != nil {
			return service, err
		}

		update(service)

		service, err = c.Services(namespace).Update(service)
		if !errors.IsConflict(err) && !errors.IsServerTimeout(err) {
			return service, err
		}
	}
	return service, err
}
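As a usage sketch (the annotateService helper and the annotation key are hypothetical, not part of the source), a caller supplies only the mutation and lets updateService own the Get/Update retry loop on conflicts and server timeouts:

// annotateService is a hypothetical caller of updateService: it mutates only the
// fields it cares about inside the closure; retries on IsConflict/IsServerTimeout
// are handled by updateService itself.
func annotateService(c *client.Client, namespace, name, key, value string) (*api.Service, error) {
	return updateService(c, namespace, name, func(s *api.Service) {
		if s.Annotations == nil {
			s.Annotations = map[string]string{}
		}
		s.Annotations[key] = value
	})
}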
func TestCheckGeneratedNameError(t *testing.T) {
	expect := errors.NewNotFound("foo", "bar")
	if err := CheckGeneratedNameError(Services, expect, &api.Pod{}); err != expect {
		t.Errorf("NotFoundError should be ignored: %v", err)
	}

	expect = errors.NewAlreadyExists("foo", "bar")
	if err := CheckGeneratedNameError(Services, expect, &api.Pod{}); err != expect {
		t.Errorf("AlreadyExists should be returned when no GenerateName field: %v", err)
	}

	expect = errors.NewAlreadyExists("foo", "bar")
	if err := CheckGeneratedNameError(Services, expect, &api.Pod{ObjectMeta: api.ObjectMeta{GenerateName: "foo"}}); err == nil || !errors.IsServerTimeout(err) {
		t.Errorf("expected try again later error: %v", err)
	}
}
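The behavior this test pins down can be summarized in a minimal sketch; this is not the real CheckGeneratedNameError implementation, and the NewServerTimeout (kind, operation) constructor is assumed from the same API era as the NewAlreadyExists calls above. Only an AlreadyExists error on an object created via GenerateName is rewritten into a retryable "try again later" error, so the client regenerates a name and retries.

// checkGeneratedNameErrorSketch mirrors the contract asserted by the test above.
func checkGeneratedNameErrorSketch(err error, objectMeta *api.ObjectMeta) error {
	// Non-AlreadyExists errors (including NotFound) pass through untouched.
	if err == nil || !errors.IsAlreadyExists(err) {
		return err
	}
	// Without GenerateName the caller chose the name; the conflict is theirs to handle.
	if len(objectMeta.GenerateName) == 0 {
		return err
	}
	// The generated name collided; report it as a retryable server timeout.
	return errors.NewServerTimeout("kind", "POST")
}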
// reconcile an unknown (from the perspective of our registry) non-terminal task
func (k *KubernetesScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	// attempt to recover task from pod info:
	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
	// - the Name can be parsed by container.ParseFullName() to yield a pod Name and Namespace
	// - pull the pod metadata down from the api server
	// - perform task recovery based on pod metadata
	taskId := taskStatus.TaskId.GetValue()
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there will be no data in the task status that we can use to determine the associated pod
		switch taskStatus.GetState() {
		case mesos.TaskState_TASK_STAGING:
			// there is still hope for this task, don't kill it just yet
			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
			return
		default:
			// for TASK_{STARTING,RUNNING} we should have already attempted to recoverTasks() for.
			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
			// be processing this reconciliation update before we process the one from the executor.
			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
			// so it gets killed.
			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
		}
	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
			podStatus.Name, taskId, err)
	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
		if t, ok, err := podtask.RecoverFrom(*pod); ok {
			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
			_, err := k.taskRegistry.Register(t, nil)
			if err != nil {
				// someone beat us to it?!
				log.Warningf("failed to register recovered task: %v", err)
				return
			} else {
				k.taskRegistry.UpdateStatus(taskStatus)
			}
			return
		} else if err != nil {
			//should kill the pod and the task
			log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
			if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
				log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
			}
		} else {
			//this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
			//metadata is not appropriate for task reconstruction -- which should almost certainly never
			//be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
			//we were failed over.

			//kill this task, allow the newly launched scheduler to schedule the new pod
			log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
		}
	} else if errors.IsNotFound(err) {
		// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
		log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
	} else if errors.IsServerTimeout(err) {
		log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
		return
	} else {
		log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
		return
	}
	if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
		log.Errorf("failed to kill task %v: %v", taskId, err)
	}
}
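The error handling at the tail of reconcileNonTerminalTask encodes a small decision table for apiserver Get failures: NotFound means the pod is gone and the task should be killed, a server timeout means "try again on a later reconciliation pass", and anything else aborts this pass. The helper below is a hypothetical restatement of that logic, not code from the scheduler itself.

// reconcileAction is a hypothetical classification of pod-lookup errors, mirroring
// the else-if chain above.
type reconcileAction int

const (
	killTask reconcileAction = iota // pod no longer exists; the task is orphaned
	retryLater                      // transient apiserver timeout; reconcile again later
	abortReconcile                  // unexpected apiserver error; give up on this pass
)

func classifyPodGetError(err error) reconcileAction {
	switch {
	case errors.IsNotFound(err):
		return killTask
	case errors.IsServerTimeout(err):
		return retryLater
	default:
		return abortReconcile
	}
}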