// A first-come-first-serve scheduler: acquires the first offer that can support the task func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) { podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name) var acceptedOffer offers.Perishable err := r.Walk(func(p offers.Perishable) (bool, error) { offer := p.Details() if offer == nil { return false, fmt.Errorf("nil offer while scheduling task %v", task.ID) } if task.AcceptOffer(offer) { if p.Acquire() { acceptedOffer = p log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue()) return true, nil // stop, we found an offer } } return false, nil // continue }) if acceptedOffer != nil { if err != nil { log.Warningf("problems walking the offer registry: %v, attempting to continue", err) } return acceptedOffer, nil } if err != nil { log.V(2).Infof("failed to find a fit for pod: %s, err = %v", podName, err) return nil, err } log.V(2).Infof("failed to find a fit for pod: %s", podName) return nil, noSuitableOffersErr }
func (b *binder) rollback(task *podtask.T, err error) error { task.Offer.Release() task.Reset() if err2 := b.api.tasks().Update(task); err2 != nil { log.Errorf("failed to update pod task: %v", err2) } return err }
func (k *k8smScheduler) launchTask(task *podtask.T) error { // assume caller is holding scheduler lock taskList := []*mesos.TaskInfo{task.BuildTaskInfo()} offerIds := []*mesos.OfferID{task.Offer.Details().Id} filters := &mesos.Filters{} _, err := k.internal.driver.LaunchTasks(offerIds, taskList, filters) return err }
// filter func used for explicit task reconciliation, selects only non-terminal tasks which // have been communicated to mesos (read: launched). func explicitTaskFilter(t *podtask.T) bool { switch t.State { case podtask.StateRunning: return true case podtask.StatePending: return t.Has(podtask.Launched) default: return false } }
// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) { var offer offers.Perishable if task.HasAcceptedOffer() { // verify that the offer is still on the table offerId := task.GetOfferId() if offer, ok := k.api.offers().Get(offerId); ok && !offer.HasExpired() { // skip tasks that have already have assigned offers offer = task.Offer } else { task.Offer.Release() task.Reset() if err = k.api.tasks().Update(task); err != nil { return "", err } } } if err == nil && offer == nil { offer, err = k.api.algorithm()(k.api.offers(), k.api, task) } if err != nil { return "", err } details := offer.Details() if details == nil { return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID) } slaveId := details.GetSlaveId().GetValue() if slave, ok := k.api.slaveFor(slaveId); !ok { // not much sense in Release()ing the offer here since its owner died offer.Release() k.api.offers().Invalidate(details.Id.GetValue()) return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID) } else { if task.Offer != nil && task.Offer != offer { return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer) } // write resource limits into the pod spec which is transfered to the executor. From here // on we can expect that the pod spec of a task has proper limits for CPU and memory. // TODO(sttts): For a later separation of the kubelet and the executor also patch the pod on the apiserver if unlimitedCPU := mresource.LimitPodCPU(&task.Pod, k.defaultContainerCPULimit); unlimitedCPU { log.Warningf("Pod %s/%s without cpu limits is admitted %.2f cpu shares", task.Pod.Namespace, task.Pod.Name, mresource.PodCPULimit(&task.Pod)) } if unlimitedMem := mresource.LimitPodMem(&task.Pod, k.defaultContainerMemLimit); unlimitedMem { log.Warningf("Pod %s/%s without memory limits is admitted %.2f MB", task.Pod.Namespace, task.Pod.Name, mresource.PodMemLimit(&task.Pod)) } task.Offer = offer task.FillFromDetails(details) if err := k.api.tasks().Update(task); err != nil { offer.Release() return "", err } return slave.HostName, nil } }
//TODO(jdef) unit test this, ensure that task's copy of api.Pod is not modified func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *podtask.T, offerId string) error { pod := task.Pod // we make an effort here to avoid making changes to the task's copy of the pod, since // we want that to reflect the initial user spec, and not the modified spec that we // build for the executor to consume. oemCt := pod.Spec.Containers pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod if pod.Annotations == nil { pod.Annotations = make(map[string]string) } else { oemAnn := pod.Annotations pod.Annotations = make(map[string]string) for k, v := range oemAnn { pod.Annotations[k] = v } } pod.Annotations[annotation.BindingHostKey] = machine task.SaveRecoveryInfo(pod.Annotations) for _, entry := range task.Spec.PortMap { oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports ports := append([]api.ContainerPort{}, oemPorts...) p := &ports[entry.PortIdx] p.HostPort = int(entry.OfferPort) op := strconv.FormatUint(entry.OfferPort, 10) pod.Annotations[fmt.Sprintf(annotation.PortMappingKeyFormat, p.Protocol, p.ContainerPort)] = op if p.Name != "" { pod.Annotations[fmt.Sprintf(annotation.PortNameMappingKeyFormat, p.Protocol, p.Name)] = op } pod.Spec.Containers[entry.ContainerIdx].Ports = ports } // the kubelet-executor uses this to instantiate the pod log.V(3).Infof("prepared pod spec: %+v", pod) data, err := api.Codec.Encode(&pod) if err != nil { log.V(2).Infof("Failed to marshal the pod spec: %v", err) return err } task.Spec.Data = data return nil }
// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) { var offer offers.Perishable if task.HasAcceptedOffer() { // verify that the offer is still on the table offerId := task.GetOfferId() if offer, ok := k.api.offers().Get(offerId); ok && !offer.HasExpired() { // skip tasks that have already have assigned offers offer = task.Offer } else { task.Offer.Release() task.Reset() if err = k.api.tasks().Update(task); err != nil { return "", err } } } if err == nil && offer == nil { offer, err = k.api.algorithm()(k.api.offers(), k.api, task) } if err != nil { return "", err } details := offer.Details() if details == nil { return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID) } slaveId := details.GetSlaveId().GetValue() if slave, ok := k.api.slaveFor(slaveId); !ok { // not much sense in Release()ing the offer here since its owner died offer.Release() k.api.offers().Invalidate(details.Id.GetValue()) return "", fmt.Errorf("Slave disappeared (%v) while scheduling task %v", slaveId, task.ID) } else { if task.Offer != nil && task.Offer != offer { return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer) } task.Offer = offer //TODO(jdef) FillFromDetails currently allocates fixed (hardwired) cpu and memory resources for all //tasks. This will be fixed once we properly integrate parent-cgroup support into the kublet-executor. //For now we are completely ignoring the resources specified in the pod. //see: https://github.com/mesosphere/kubernetes-mesos/issues/68 task.FillFromDetails(details) if err := k.api.tasks().Update(task); err != nil { offer.Release() return "", err } return slave.HostName, nil } }
// assumes that: caller has acquired scheduler lock and that the task is still pending func (b *binder) bind(ctx api.Context, binding *api.Binding, task *podtask.T) (err error) { // sanity check: ensure that the task hasAcceptedOffer(), it's possible that between // Schedule() and now that the offer for this task was rescinded or invalidated. // ((we should never see this here)) if !task.HasAcceptedOffer() { return fmt.Errorf("task has not accepted a valid offer %v", task.ID) } // By this time, there is a chance that the slave is disconnected. offerId := task.GetOfferId() if offer, ok := b.api.offers().Get(offerId); !ok || offer.HasExpired() { // already rescinded or timed out or otherwise invalidated return b.rollback(task, fmt.Errorf("failed prior to launchTask due to expired offer for task %v", task.ID)) } if err = b.prepareTaskForLaunch(ctx, binding.Target.Name, task, offerId); err == nil { log.V(2).Infof("launching task: %q on target %q slave %q for pod \"%v/%v\"", task.ID, binding.Target.Name, task.Spec.SlaveID, task.Pod.Namespace, task.Pod.Name) if err = b.api.launchTask(task); err == nil { b.api.offers().Invalidate(offerId) task.Set(podtask.Launched) if err = b.api.tasks().Update(task); err != nil { // this should only happen if the task has been removed or has changed status, // which SHOULD NOT HAPPEN as long as we're synchronizing correctly log.Errorf("failed to update task w/ Launched status: %v", err) } return } } return b.rollback(task, fmt.Errorf("Failed to launch task %v: %v", task.ID, err)) }