func TestParsePodFullName(t *testing.T) {
	type nameTuple struct {
		Name      string
		Namespace string
	}
	successfulCases := map[string]nameTuple{
		"bar_foo":         {Name: "bar", Namespace: "foo"},
		"bar.org_foo.com": {Name: "bar.org", Namespace: "foo.com"},
		"bar-bar_foo":     {Name: "bar-bar", Namespace: "foo"},
	}
	failedCases := []string{"barfoo", "bar_foo_foo", ""}

	for podFullName, expected := range successfulCases {
		name, namespace, err := qingcontainer.ParsePodFullName(podFullName)
		if err != nil {
			t.Errorf("unexpected error when parsing the full name: %v", err)
			continue
		}
		if name != expected.Name || namespace != expected.Namespace {
			t.Errorf("expected name %q, namespace %q; got name %q, namespace %q",
				expected.Name, expected.Namespace, name, namespace)
		}
	}
	for _, podFullName := range failedCases {
		_, _, err := qingcontainer.ParsePodFullName(podFullName)
		if err == nil {
			t.Errorf("expected error when parsing the full name, got none")
		}
	}
}
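For reference, a minimal sketch of the parser this test exercises, assuming the "<name>_<namespace>" full-name convention implied by the test cases; the actual qingcontainer implementation may differ in its validation details.

package qingcontainer

import (
	"fmt"
	"strings"
)

// ParsePodFullName splits a pod full name of the form "<name>_<namespace>"
// into its name and namespace parts, rejecting input with a missing part
// or extra separators.
func ParsePodFullName(podFullName string) (string, string, error) {
	parts := strings.Split(podFullName, "_")
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return "", "", fmt.Errorf("failed to parse the pod full name %q", podFullName)
	}
	return parts[0], parts[1], nil
}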
// DeleteMirrorPod deletes a mirror pod via the API server. Parse failures are
// returned to the caller; deletion failures are logged but not returned.
func (mc *basicMirrorClient) DeleteMirrorPod(podFullName string) error {
	if mc.apiserverClient == nil {
		return nil
	}
	name, namespace, err := qingcontainer.ParsePodFullName(podFullName)
	if err != nil {
		glog.Errorf("Failed to parse a pod full name %q", podFullName)
		return err
	}
	glog.V(4).Infof("Deleting a mirror pod %q", podFullName)
	if err := mc.apiserverClient.Pods(namespace).Delete(name, nil); err != nil {
		glog.Errorf("Failed deleting a mirror pod %q: %v", podFullName, err)
	}
	return nil
}
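As a usage sketch, a hypothetical caller that builds the "<name>_<namespace>" full name (the inverse of ParsePodFullName) before handing it to DeleteMirrorPod; the mirrorClient interface and cleanupMirrorPod helper are assumptions for illustration only, not part of the code above.

// mirrorClient is a hypothetical narrow interface over basicMirrorClient,
// declared here only to keep the example self-contained.
type mirrorClient interface {
	DeleteMirrorPod(podFullName string) error
}

// cleanupMirrorPod is an assumed helper: it builds the "<name>_<namespace>"
// full name and delegates deletion of the mirror pod to the client.
func cleanupMirrorPod(mc mirrorClient, name, namespace string) error {
	podFullName := name + "_" + namespace
	return mc.DeleteMirrorPod(podFullName)
}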
// reconcileNonTerminalTask reconciles a non-terminal task that is unknown from the perspective of our registry.
func (k *QingYuanScheduler) reconcileNonTerminalTask(driver bindings.SchedulerDriver, taskStatus *mesos.TaskStatus) {
	// attempt to recover the task from pod info:
	// - task data may contain an api.PodStatusResult; if status.reason == REASON_RECONCILIATION then status.data == nil
	// - the Name can be parsed by container.ParsePodFullName() to yield a pod Name and Namespace
	// - pull the pod metadata down from the api server
	// - perform task recovery based on pod metadata
	taskId := taskStatus.TaskId.GetValue()
	if taskStatus.GetReason() == mesos.TaskStatus_REASON_RECONCILIATION && taskStatus.GetSource() == mesos.TaskStatus_SOURCE_MASTER {
		// there will be no data in the task status that we can use to determine the associated pod
		switch taskStatus.GetState() {
		case mesos.TaskState_TASK_STAGING:
			// there is still hope for this task, don't kill it just yet
			//TODO(jdef) there should probably be a limit for how long we tolerate tasks stuck in this state
			return
		default:
			// for TASK_{STARTING,RUNNING} we should have already attempted recoverTasks().
			// if the scheduler failed over before the executor fired TASK_STARTING, then we should *not*
			// be processing this reconciliation update before we process the one from the executor.
			// point: we don't know what this task is (perhaps there was unrecoverable metadata in the pod),
			// so it gets killed.
			log.Errorf("killing non-terminal, unrecoverable task %v", taskId)
		}
	} else if podStatus, err := podtask.ParsePodStatusResult(taskStatus); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal task status data for task %v, expected an api.PodStatusResult: %v", taskId, err)
	} else if name, namespace, err := container.ParsePodFullName(podStatus.Name); err != nil {
		// possible rogue pod exists at this point because we can't identify it; should kill the task
		log.Errorf("possible rogue pod; illegal api.PodStatusResult, unable to parse full pod name from: '%v' for task %v: %v",
			podStatus.Name, taskId, err)
	} else if pod, err := k.client.Pods(namespace).Get(name); err == nil {
		// found the pod in the apiserver; attempt to reconstruct the task from its metadata
		if t, ok, err := podtask.RecoverFrom(*pod); ok {
			log.Infof("recovered task %v from metadata in pod %v/%v", taskId, namespace, name)
			_, err := k.taskRegistry.Register(t, nil)
			if err != nil {
				// someone beat us to it?!
				log.Warningf("failed to register recovered task: %v", err)
				return
			} else {
				k.taskRegistry.UpdateStatus(taskStatus)
			}
			return
		} else if err != nil {
			// should kill the pod and the task
			log.Errorf("killing pod, failed to recover task from pod %v/%v: %v", namespace, name, err)
			if err := k.client.Pods(namespace).Delete(name, nil); err != nil {
				log.Errorf("failed to delete pod %v/%v: %v", namespace, name, err)
			}
		} else {
			// this is pretty unexpected: we received a TASK_{STARTING,RUNNING} message, but the apiserver's pod
			// metadata is not appropriate for task reconstruction -- which should almost certainly never
			// be the case unless someone swapped out the pod on us (and kept the same namespace/name) while
			// we were failed over.
			// kill this task, allow the newly launched scheduler to schedule the new pod
			log.Warningf("unexpected pod metadata for task %v in apiserver, assuming new unscheduled pod spec: %+v", taskId, pod)
		}
	} else if errors.IsNotFound(err) {
		// pod lookup failed, should delete the task since the pod is no longer valid; may be redundant, that's ok
		log.Infof("killing task %v since pod %v/%v no longer exists", taskId, namespace, name)
	} else if errors.IsServerTimeout(err) {
		log.V(2).Infof("failed to reconcile task due to API server timeout: %v", err)
		return
	} else {
		log.Errorf("unexpected API server error, aborting reconcile for task %v: %v", taskId, err)
		return
	}
	if _, err := driver.KillTask(taskStatus.TaskId); err != nil {
		log.Errorf("failed to kill task %v: %v", taskId, err)
	}
}
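The recovery path above leans on podtask.ParsePodStatusResult to turn TaskStatus.Data back into an api.PodStatusResult. Below is a minimal sketch of that decoding step, assuming the executor serializes the status result as JSON into TaskStatus.Data; it uses encoding/json and fmt in addition to the mesos and api packages this file already imports, and the real podtask implementation may perform additional validation.

// parsePodStatusResult is a sketch of the assumed decoding step: it fails
// fast when an update carries no data (status.data == nil, as noted in the
// comments above for REASON_RECONCILIATION) and otherwise unmarshals the
// JSON payload into an api.PodStatusResult.
func parsePodStatusResult(taskStatus *mesos.TaskStatus) (api.PodStatusResult, error) {
	var result api.PodStatusResult
	if taskStatus.Data == nil {
		return result, fmt.Errorf("missing TaskStatus.Data for task %v", taskStatus.TaskId.GetValue())
	}
	if err := json.Unmarshal(taskStatus.Data, &result); err != nil {
		return result, fmt.Errorf("invalid TaskStatus.Data for task %v: %v", taskStatus.TaskId.GetValue(), err)
	}
	return result, nil
}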