func (k *deleter) deleteOne(pod *Pod) error {
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return err
	}

	log.V(2).Infof("pod deleted: %v", podKey)

	// order is important here: we want to make sure we have the lock before
	// removing the pod from the scheduling queue. this makes the concurrent
	// execution of scheduler-error-handling and delete-handling easier to
	// reason about.
	k.api.Lock()
	defer k.api.Unlock()

	// prevent the scheduler from attempting to pop this; it's also possible that
	// it's concurrently being scheduled (somewhere between pod scheduling and
	// binding) - if so, then we'll end up removing it from taskRegistry which
	// will abort Bind()ing
	k.qr.dequeue(pod.GetUID())

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		log.V(2).Infof("Could not resolve pod '%s' to task id", podKey)
		return noSuchPodErr

	// determine if the task has already been launched to mesos, if not then
	// cleanup is easier (unregister) since there's no state to sync
	case podtask.StatePending:
		if !task.Has(podtask.Launched) {
			// we've been invoked in between Schedule() and Bind()
			if task.HasAcceptedOffer() {
				task.Offer.Release()
				task.Reset()
				task.Set(podtask.Deleted)
				//TODO(jdef) probably want better handling here
				if err := k.api.tasks().Update(task); err != nil {
					return err
				}
			}
			k.api.tasks().Unregister(task)
			return nil
		}
		fallthrough

	case podtask.StateRunning:
		// signal to watchers that the related pod is going down
		task.Set(podtask.Deleted)
		if err := k.api.tasks().Update(task); err != nil {
			log.Errorf("failed to update task w/ Deleted status: %v", err)
		}
		return k.api.killTask(task.ID)

	default:
		log.Infof("cannot kill pod '%s': non-terminal task not found %v", podKey, task.ID)
		return noSuchTaskErr
	}
}

// this pod may be out of sync with respect to the API server registry:
//      this pod   |  apiserver registry
//    -------------|----------------------
//      host=.*    |  404           ; pod was deleted
//      host=.*    |  5xx           ; failed to sync, try again later?
//      host=""    |  host=""       ; perhaps no updates to process?
//      host=""    |  host="..."    ; pod has been scheduled and assigned, is there a task assigned? (check TaskIdKey in binding?)
//      host="..." |  host=""       ; pod is no longer scheduled, does it need to be re-queued?
//      host="..." |  host="..."    ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
	log.V(1).Infof("reconcile pod %v", oldPod.Name)
	ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
	pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
	if err != nil {
		if errors.IsNotFound(err) {
			// attempt to delete
			if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
				log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
			}
		} else {
			//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
			//For now, drop the pod on the floor
			log.Warningf("aborting reconciliation for pod %v: %v", oldPod.Name, err)
		}
		return
	}

	if oldPod.Spec.NodeName != pod.Spec.NodeName {
		if pod.Spec.NodeName == "" {
			// pod is unscheduled.
			// it's possible that we dropped the pod in the scheduler error handler
			// because of task misalignment with the pod (task.Has(podtask.Launched) == true)

			podKey, err := podtask.MakePodKey(ctx, pod.Name)
			if err != nil {
				log.Error(err)
				return
			}

			s.api.Lock()
			defer s.api.Unlock()

			if _, state := s.api.tasks().ForPod(podKey); state != podtask.StateUnknown {
				//TODO(jdef) reconcile the task
				log.Errorf("task already registered for pod %v", pod.Name)
				return
			}

			now := time.Now()
			log.V(3).Infof("reoffering pod %v", podKey)
			s.qr.reoffer(&Pod{
				Pod:      pod,
				deadline: &now,
			})
		} else {
			// pod is scheduled.
			// not sure how this happened behind our backs. attempt to reconstruct
			// at least a partial podtask.T record.
			//TODO(jdef) reconcile the task
			log.Errorf("pod already scheduled: %v", pod.Name)
		}
	} else {
		//TODO(jdef) for now, ignore the fact that the rest of the spec may be different
		//and assume that our knowledge of the pod aligns with that of the apiserver
		log.Error("pod reconciliation does not support updates; not yet implemented")
	}
}

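// Illustrative sketch only (not part of the original scheduler): the
// reconciliation table above, restated as a pure decision function. The names
// below (reconcileAction, classifyReconcile) are hypothetical and exist purely
// to make the branching in reconcilePod easier to follow.
type reconcileAction int

const (
	reconcileNoop    reconcileAction = iota // hosts agree; nothing to do
	reconcileDelete                         // apiserver returned 404; tear down local state
	reconcileRetry                          // apiserver returned 5xx; try again later
	reconcileReoffer                        // pod became unscheduled; put it back in the queue
	reconcileRebuild                        // pod was scheduled behind our backs; rebuild task state
)

func classifyReconcile(notFound, otherErr bool, oldHost, newHost string) reconcileAction {
	switch {
	case notFound:
		return reconcileDelete
	case otherErr:
		return reconcileRetry
	case oldHost == newHost:
		return reconcileNoop
	case newHost == "":
		return reconcileReoffer
	default:
		return reconcileRebuild
	}
}
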
// implementation of scheduling plugin's Error func; see plugin/pkg/scheduler
func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {

	if schedulingErr == noSuchPodErr {
		log.V(2).Infof("Not rescheduling non-existent pod %v", pod.Name)
		return
	}

	log.Infof("Error scheduling %v: %v; retrying", pod.Name, schedulingErr)
	defer util.HandleCrash()

	// default upstream scheduler passes pod.Name as binding.PodID
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		log.Errorf("Failed to construct pod key, aborting scheduling for pod %v: %v", pod.Name, err)
		return
	}

	k.backoff.GC()
	k.api.Lock()
	defer k.api.Unlock()

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// if we don't have a mapping here any more then someone deleted the pod
		log.V(2).Infof("Could not resolve pod to task, aborting pod reschedule: %s", podKey)
		return

	case podtask.StatePending:
		if task.Has(podtask.Launched) {
			log.V(2).Infof("Skipping re-scheduling for already-launched pod %v", podKey)
			return
		}
		breakoutEarly := queue.BreakChan(nil)
		if schedulingErr == noSuitableOffersErr {
			log.V(3).Infof("adding backoff breakout handler for pod %v", podKey)
			breakoutEarly = queue.BreakChan(k.api.offers().Listen(podKey, func(offer *mesos.Offer) bool {
				k.api.Lock()
				defer k.api.Unlock()
				switch task, state := k.api.tasks().Get(task.ID); state {
				case podtask.StatePending:
					return !task.Has(podtask.Launched) && task.AcceptOffer(offer)
				default:
					// no point in continuing to check for matching offers
					return true
				}
			}))
		}
		delay := k.backoff.Get(podKey)
		log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
		k.qr.requeue(&Pod{Pod: pod, delay: &delay, notify: breakoutEarly})

	default:
		log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
	}
}

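// Illustrative sketch only (hypothetical helper, not in the original file):
// how the breakout channel interacts with the backoff delay. A requeued pod
// normally waits out its backoff, but the offer listener registered above can
// signal the notify channel as soon as a potentially matching offer arrives,
// letting the delay queue yield the pod early instead of waiting for the full
// delay to elapse.
func waitBackoffOrBreakout(delay time.Duration, breakout <-chan struct{}) {
	select {
	case <-time.After(delay):
		// backoff expired: the pod is yielded to the scheduler again
	case <-breakout:
		// a matching offer showed up: skip the rest of the backoff
	}
}
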
// Schedule implements the Scheduler interface of Kubernetes.
// It returns the selectedMachine's name and error (if there's any).
func (k *kubeScheduler) Schedule(pod *api.Pod, unused algorithm.MinionLister) (string, error) {
	log.Infof("Try to schedule pod %v\n", pod.Name)
	ctx := api.WithNamespace(api.NewDefaultContext(), pod.Namespace)

	// default upstream scheduler passes pod.Name as binding.PodID
	podKey, err := podtask.MakePodKey(ctx, pod.Name)
	if err != nil {
		return "", err
	}

	k.api.Lock()
	defer k.api.Unlock()

	switch task, state := k.api.tasks().ForPod(podKey); state {
	case podtask.StateUnknown:
		// There's a bit of a potential race here: a pod could have been yielded() and
		// then deleted before we get *here*.
		// We use meta to index the pod in the store since that's what the k8s reflector does.
		podName, err := cache.MetaNamespaceKeyFunc(pod)
		if err != nil {
			log.Warningf("aborting Schedule, unable to understand pod object %+v", pod)
			return "", noSuchPodErr
		}
		if deleted := k.podUpdates.Poll(podName, queue.DELETE_EVENT); deleted {
			// avoid scheduling a pod that's been deleted between yieldPod() and Schedule()
			log.Infof("aborting Schedule, pod has been deleted %+v", pod)
			return "", noSuchPodErr
		}
		return k.doSchedule(k.api.tasks().Register(k.api.createPodTask(ctx, pod)))

	//TODO(jdef) it's possible that the pod state has diverged from what
	//we knew previously, we should probably update the task.Pod state here
	//before proceeding with scheduling
	case podtask.StatePending:
		if pod.UID != task.Pod.UID {
			// we're dealing with a brand new pod spec here, so the old one must have been
			// deleted -- and so our task store is out of sync w/ respect to reality
			//TODO(jdef) reconcile task
			return "", fmt.Errorf("task %v spec is out of sync with pod %v spec, aborting schedule", task.ID, pod.Name)
		} else if task.Has(podtask.Launched) {
			// task has been marked as "launched" but the pod binding creation may have failed in k8s,
			// but we're going to let someone else handle it, probably the mesos task error handler
			return "", fmt.Errorf("task %s has already been launched, aborting schedule", task.ID)
		} else {
			return k.doSchedule(task, nil)
		}

	default:
		return "", fmt.Errorf("task %s is not pending, nothing to schedule", task.ID)
	}
}

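// Illustrative sketch only (hypothetical helper, not in the original file):
// why Schedule compares pod.UID against task.Pod.UID. A pod that is deleted
// and recreated keeps its namespace/name (and therefore its podKey) but gets
// a fresh UID, so a UID mismatch means the registered task describes a stale
// pod object rather than the one we were just asked to schedule.
func samePodInstance(registered, incoming *api.Pod) bool {
	return registered.UID == incoming.UID
}
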
// implements binding.Registry, launches the pod-associated-task in mesos
func (b *binder) Bind(binding *api.Binding) error {

	ctx := api.WithNamespace(api.NewContext(), binding.Namespace)

	// default upstream scheduler passes pod.Name as binding.Name
	podKey, err := podtask.MakePodKey(ctx, binding.Name)
	if err != nil {
		return err
	}

	b.api.Lock()
	defer b.api.Unlock()

	switch task, state := b.api.tasks().ForPod(podKey); state {
	case podtask.StatePending:
		return b.bind(ctx, binding, task)
	default:
		// in this case it's likely that the pod has been deleted between Schedule
		// and Bind calls
		log.Infof("No pending task for pod %s", podKey)
		return noSuchPodErr //TODO(jdef) this error is somewhat misleading since the task could be running?!
	}
}