// Call ScheduleFunc and subtract some resources, returning the name of the machine the task is scheduled on
func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
	var offer offers.Perishable
	if task.HasAcceptedOffer() {
		// verify that the offer is still on the table
		offerId := task.GetOfferId()
		if off, ok := k.api.offers().Get(offerId); ok && !off.HasExpired() {
			// skip tasks that already have assigned offers
			offer = task.Offer
		} else {
			task.Offer.Release()
			task.Reset()
			if err = k.api.tasks().Update(task); err != nil {
				return "", err
			}
		}
	}
	if err == nil && offer == nil {
		offer, err = k.api.algorithm()(k.api.offers(), k.api, task)
	}
	if err != nil {
		return "", err
	}
	details := offer.Details()
	if details == nil {
		return "", fmt.Errorf("offer already invalid/expired for task %v", task.ID)
	}
	slaveId := details.GetSlaveId().GetValue()
	if slave, ok := k.api.slaveFor(slaveId); !ok {
		// not much sense in Release()ing the offer here since its owner died
		offer.Release()
		k.api.offers().Invalidate(details.Id.GetValue())
		return "", fmt.Errorf("slave disappeared (%v) while scheduling task %v", slaveId, task.ID)
	} else {
		if task.Offer != nil && task.Offer != offer {
			return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
		}

		// write resource limits into the pod spec which is transferred to the executor. From here
		// on we can expect that the pod spec of a task has proper limits for CPU and memory.
		// TODO(sttts): For a later separation of the kubelet and the executor also patch the pod on the apiserver
		if unlimitedCPU := mresource.LimitPodCPU(&task.Pod, k.defaultContainerCPULimit); unlimitedCPU {
			log.Warningf("Pod %s/%s without cpu limits is admitted %.2f cpu shares", task.Pod.Namespace, task.Pod.Name, mresource.PodCPULimit(&task.Pod))
		}
		if unlimitedMem := mresource.LimitPodMem(&task.Pod, k.defaultContainerMemLimit); unlimitedMem {
			log.Warningf("Pod %s/%s without memory limits is admitted %.2f MB", task.Pod.Namespace, task.Pod.Name, mresource.PodMemLimit(&task.Pod))
		}

		task.Offer = offer
		task.FillFromDetails(details)

		if err := k.api.tasks().Update(task); err != nil {
			offer.Release()
			return "", err
		}
		return slave.HostName, nil
	}
}
func TestAcceptOfferPorts(t *testing.T) {
	t.Parallel()
	task, _ := fakePodTask("foo")
	pod := &task.Pod

	offer := &mesos.Offer{
		Resources: []*mesos.Resource{
			mutil.NewScalarResource("cpus", t_min_cpu),
			mutil.NewScalarResource("mem", t_min_mem),
			rangeResource("ports", []uint64{1, 1}),
		},
	}
	if ok := task.AcceptOffer(offer); !ok {
		t.Fatalf("did not accept offer %v", offer)
	}

	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Ports: []api.ContainerPort{{
				HostPort: 123,
			}},
		}},
	}
	mresource.LimitPodCPU(&task.Pod, mresource.DefaultDefaultContainerCPULimit)
	mresource.LimitPodMem(&task.Pod, mresource.DefaultDefaultContainerMemLimit)

	if ok := task.AcceptOffer(offer); ok {
		t.Fatalf("accepted offer %v", offer)
	}

	pod.Spec.Containers[0].Ports[0].HostPort = 1
	if ok := task.AcceptOffer(offer); !ok {
		t.Fatalf("did not accept offer %v", offer)
	}

	pod.Spec.Containers[0].Ports[0].HostPort = 0
	if ok := task.AcceptOffer(offer); !ok {
		t.Fatalf("did not accept offer %v", offer)
	}

	offer.Resources = []*mesos.Resource{
		mutil.NewScalarResource("cpus", t_min_cpu),
		mutil.NewScalarResource("mem", t_min_mem),
	}
	if ok := task.AcceptOffer(offer); ok {
		t.Fatalf("accepted offer %v", offer)
	}

	pod.Spec.Containers[0].Ports[0].HostPort = 1
	if ok := task.AcceptOffer(offer); ok {
		t.Fatalf("accepted offer %v", offer)
	}
}
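// rangeResource above is a test helper from the surrounding package whose implementation is not
// shown here. The sketch below is an illustrative, hedged equivalent, assuming mesos-go's
// mesosutil helpers NewValueRange and NewRangesResource, and assuming the []uint64 argument is
// interpreted as consecutive [begin, end] pairs (so []uint64{1, 1} offers exactly port 1).
func rangeResourceSketch(name string, ports []uint64) *mesos.Resource {
	ranges := make([]*mesos.Value_Range, 0, len(ports)/2)
	for i := 0; i+1 < len(ports); i += 2 {
		ranges = append(ranges, mutil.NewValueRange(ports[i], ports[i+1]))
	}
	return mutil.NewRangesResource(name, ranges)
}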
func TestLimitedResources(t *testing.T) {
	assert := assert.New(t)

	task, _ := fakePodTask("limited")
	pod := &task.Pod
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Name: "a",
			Resources: api.ResourceRequirements{
				Limits: api.ResourceList{
					api.ResourceCPU:    *resource.NewQuantity(1, resource.DecimalSI),
					api.ResourceMemory: *resource.NewQuantity(256*1024*1024, resource.BinarySI),
				},
			},
		}, {
			Name: "b",
			Resources: api.ResourceRequirements{
				Limits: api.ResourceList{
					api.ResourceCPU:    *resource.NewQuantity(2, resource.DecimalSI),
					api.ResourceMemory: *resource.NewQuantity(512*1024*1024, resource.BinarySI),
				},
			},
		}},
	}

	beforeLimitingCPU := mresource.CPUForPod(pod, mresource.DefaultDefaultContainerCPULimit)
	beforeLimitingMem := mresource.MemForPod(pod, mresource.DefaultDefaultContainerMemLimit)

	unboundedCPU := mresource.LimitPodCPU(pod, mresource.DefaultDefaultContainerCPULimit)
	unboundedMem := mresource.LimitPodMem(pod, mresource.DefaultDefaultContainerMemLimit)

	cpu := mresource.PodCPULimit(pod)
	mem := mresource.PodMemLimit(pod)

	assert.False(unboundedCPU, "CPU resources are defined as limited")
	assert.False(unboundedMem, "mem resources are defined as limited")

	assert.Equal(3.0, float64(cpu))
	assert.Equal(768.0, float64(mem))

	assert.Equal(cpu, beforeLimitingCPU)
	assert.Equal(mem, beforeLimitingMem)
}
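// TestLimitedResources covers containers with explicit limits. As a hedged, illustrative
// counterpart (not part of the original suite), the sketch below assumes the behavior visible in
// doSchedule above: LimitPodCPU/LimitPodMem report true for a pod whose containers carry no
// limits and patch the per-container defaults in, so that afterwards the pod-level limits of a
// single-container pod equal the defaults. Treat the assertions as assumptions, not a spec.
func TestUnlimitedResourcesSketch(t *testing.T) {
	assert := assert.New(t)

	task, _ := fakePodTask("unlimited")
	pod := &task.Pod
	pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Name: "a", // no resource limits declared
		}},
	}

	// applying the defaults should report that the pod was unbounded...
	unboundedCPU := mresource.LimitPodCPU(pod, mresource.DefaultDefaultContainerCPULimit)
	unboundedMem := mresource.LimitPodMem(pod, mresource.DefaultDefaultContainerMemLimit)
	assert.True(unboundedCPU, "expected the CPU limit to be unspecified before defaulting")
	assert.True(unboundedMem, "expected the memory limit to be unspecified before defaulting")

	// ...and the resulting pod-level limits should equal the defaults for one container
	assert.Equal(float64(mresource.DefaultDefaultContainerCPULimit), float64(mresource.PodCPULimit(pod)))
	assert.Equal(float64(mresource.DefaultDefaultContainerMemLimit), float64(mresource.PodMemLimit(pod)))
}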
func TestEmptyOffer(t *testing.T) {
	t.Parallel()
	task, err := fakePodTask("foo")
	if err != nil {
		t.Fatal(err)
	}

	task.Pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Name: "a",
		}},
	}

	mresource.LimitPodCPU(&task.Pod, mresource.DefaultDefaultContainerCPULimit)
	mresource.LimitPodMem(&task.Pod, mresource.DefaultDefaultContainerMemLimit)

	if ok := task.AcceptOffer(nil); ok {
		t.Fatalf("accepted nil offer")
	}
	if ok := task.AcceptOffer(&mesos.Offer{}); ok {
		t.Fatalf("accepted empty offer")
	}
}
func TestNoPortsInPodOrOffer(t *testing.T) {
	t.Parallel()
	task, err := fakePodTask("foo")
	if err != nil || task == nil {
		t.Fatal(err)
	}

	task.Pod.Spec = api.PodSpec{
		Containers: []api.Container{{
			Name: "a",
		}},
	}

	mresource.LimitPodCPU(&task.Pod, mresource.DefaultDefaultContainerCPULimit)
	mresource.LimitPodMem(&task.Pod, mresource.DefaultDefaultContainerMemLimit)

	offer := &mesos.Offer{
		Resources: []*mesos.Resource{
			mutil.NewScalarResource("cpus", 0.001),
			mutil.NewScalarResource("mem", 0.001),
		},
	}
	if ok := task.AcceptOffer(offer); ok {
		t.Fatalf("accepted offer %v", offer)
	}

	offer = &mesos.Offer{
		Resources: []*mesos.Resource{
			mutil.NewScalarResource("cpus", t_min_cpu),
			mutil.NewScalarResource("mem", t_min_mem),
		},
	}
	if ok := task.AcceptOffer(offer); !ok {
		t.Fatalf("did not accept offer %v", offer)
	}
}
func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.ExecutorInfo, *uid.UID, error) {
	ci := &mesos.CommandInfo{
		Shell: proto.Bool(false),
	}

	if s.ExecutorPath != "" {
		uri, executorCmd := s.serveFrameworkArtifact(s.ExecutorPath)
		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
		ci.Value = proto.String(fmt.Sprintf("./%s", executorCmd))
	} else if !hks.FindServer(hyperkube.CommandMinion) {
		return nil, nil, fmt.Errorf("either run this scheduler via km or else --executor-path is required")
	} else {
		if strings.Index(s.KMPath, "://") > 0 {
			// URI could point directly to executable, e.g. hdfs:///km
			// or else indirectly, e.g. http://acmestorage/tarball.tgz
			// so we assume that for this case the command will always be "km"
			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(s.KMPath), Executable: proto.Bool(true)})
			ci.Value = proto.String("./km") // TODO(jdef) extract constant
		} else if s.KMPath != "" {
			uri, kmCmd := s.serveFrameworkArtifact(s.KMPath)
			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
			ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
		} else {
			uri, kmCmd := s.serveFrameworkArtifact(s.executable)
			ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri), Executable: proto.Bool(true)})
			ci.Value = proto.String(fmt.Sprintf("./%s", kmCmd))
		}

		ci.Arguments = append(ci.Arguments, hyperkube.CommandMinion)

		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--run-proxy=%v", s.RunProxy))
		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-bindall=%v", s.ProxyBindall))
		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--proxy-logv=%d", s.ProxyLogV))

		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-size=%v", s.MinionLogMaxSize.String()))
		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-backups=%d", s.MinionLogMaxBackups))
		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--max-log-age=%d", s.MinionLogMaxAgeInDays))
	}

	//TODO(jdef): provide some way (env var?) for users to customize executor config
	//TODO(jdef): set -address to 127.0.0.1 if `address` is 127.0.0.1
	//TODO(jdef): propagate dockercfg from RootDirectory?

	apiServerArgs := strings.Join(s.APIServerList, ",")
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--api-servers=%s", apiServerArgs))
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--v=%d", s.ExecutorLogV)) // this also applies to the minion
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--allow-privileged=%t", s.AllowPrivileged))
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--suicide-timeout=%v", s.ExecutorSuicideTimeout))

	if s.ExecutorBindall {
		//TODO(jdef) determine whether hostname-override is really needed for bindall because
		//it conflicts with kubelet node status checks/updates
		//ci.Arguments = append(ci.Arguments, "--hostname-override=0.0.0.0")
		ci.Arguments = append(ci.Arguments, "--address=0.0.0.0")
	}

	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cgroup-prefix=%v", s.ExecutorCgroupPrefix))
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
	ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))

	if s.AuthPath != "" {
		//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
		uri, basename := s.serveFrameworkArtifact(s.AuthPath)
		ci.Uris = append(ci.Uris, &mesos.CommandInfo_URI{Value: proto.String(uri)})
		ci.Arguments = append(ci.Arguments, fmt.Sprintf("--auth-path=%s", basename))
	}

	appendOptional := func(name string, value string) {
		if value != "" {
			ci.Arguments = append(ci.Arguments, fmt.Sprintf("--%s=%s", name, value))
		}
	}
	if s.ClusterDNS != nil {
		appendOptional("cluster-dns", s.ClusterDNS.String())
	}
	appendOptional("cluster-domain", s.ClusterDomain)
	appendOptional("root-dir", s.KubeletRootDirectory)
	appendOptional("docker-endpoint", s.KubeletDockerEndpoint)
	appendOptional("pod-infra-container-image", s.KubeletPodInfraContainerImage)
	appendOptional("host-network-sources", s.KubeletHostNetworkSources)
	appendOptional("network-plugin", s.KubeletNetworkPluginName)

	log.V(1).Infof("prepared executor command %q with args '%+v'", ci.GetValue(), ci.Arguments)

	// create the executor info
	execInfo := &mesos.ExecutorInfo{
		Command: ci,
		Name:    proto.String(execcfg.DefaultInfoName),
		Source:  proto.String(execcfg.DefaultInfoSource),
	}

	// check for static pods
	var staticPodCPUs, staticPodMem float64
	if s.StaticPodsConfigPath != "" {
		bs, paths, err := archive.ZipDir(s.StaticPodsConfigPath)
		if err != nil {
			return nil, nil, err
		}

		// try to read pod files and sum resources
		// TODO(sttts): don't terminate when static pods are broken, but skip them
		// TODO(sttts): add a directory watch and tell running executors about updates
		for _, podPath := range paths {
			podJson, err := ioutil.ReadFile(podPath)
			if err != nil {
				return nil, nil, fmt.Errorf("error reading static pod spec: %v", err)
			}

			pod := api.Pod{}
			err = json.Unmarshal(podJson, &pod)
			if err != nil {
				return nil, nil, fmt.Errorf("error parsing static pod spec at %v: %v", podPath, err)
			}

			// TODO(sttts): allow unlimited static pods as well and patch in the default resource limits
			unlimitedCPU := mresource.LimitPodCPU(&pod, s.DefaultContainerCPULimit)
			unlimitedMem := mresource.LimitPodMem(&pod, s.DefaultContainerMemLimit)
			if unlimitedCPU {
				return nil, nil, fmt.Errorf("found static pod without limit on cpu resources: %v", podPath)
			}
			if unlimitedMem {
				return nil, nil, fmt.Errorf("found static pod without limit on memory resources: %v", podPath)
			}

			cpu := mresource.PodCPULimit(&pod)
			mem := mresource.PodMemLimit(&pod)
			log.V(2).Infof("reserving %.2f cpu shares and %.2f MB of memory for static pod %s", cpu, mem, pod.Name)

			staticPodCPUs += float64(cpu)
			staticPodMem += float64(mem)
		}

		// pass zipped pod spec to executor
		execInfo.Data = bs
	}

	execInfo.Resources = []*mesos.Resource{
		mutil.NewScalarResource("cpus", float64(executorCPUs)+staticPodCPUs),
		mutil.NewScalarResource("mem", float64(executorMem)+staticPodMem),
	}

	// calculate ExecutorInfo hash to be used for validating compatibility
	// of ExecutorInfo's generated by other HA schedulers.
	ehash := hashExecutorInfo(execInfo)
	eid := uid.New(ehash, execcfg.DefaultInfoID)
	execInfo.ExecutorId = &mesos.ExecutorID{Value: proto.String(eid.String())}

	return execInfo, eid, nil
}
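// prepareExecutorInfo above rejects static pod files that omit CPU or memory limits. The helper
// below is an illustrative, hedged sketch (not part of the original source) of writing a static
// pod spec that would pass that check: a single-container api.Pod serialized as JSON into the
// directory given by StaticPodsConfigPath. The file name, image, and limit values are arbitrary
// example choices.
func writeExampleStaticPod(dir string) error {
	pod := api.Pod{
		ObjectMeta: api.ObjectMeta{Name: "static-example"},
		Spec: api.PodSpec{
			Containers: []api.Container{{
				Name:  "main",
				Image: "nginx",
				Resources: api.ResourceRequirements{
					Limits: api.ResourceList{
						// explicit limits so LimitPodCPU/LimitPodMem report the pod as bounded
						api.ResourceCPU:    *resource.NewMilliQuantity(250, resource.DecimalSI),
						api.ResourceMemory: *resource.NewQuantity(64*1024*1024, resource.BinarySI),
					},
				},
			}},
		},
	}
	podJson, err := json.Marshal(&pod)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(dir+"/static-example.json", podJson, 0644)
}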