func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error { if runningInHost, err := isProcessRunningInHost(pid); err != nil { // Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context. return err } else if !runningInHost { // Process is running inside a container. Don't touch that. return nil } var errs []error cont, err := getContainer(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err)) } if cont != manager.Cgroups.Name { err = manager.Apply(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name)) } } // Also apply oom-score-adj to processes oomAdjuster := oom.NewOOMAdjuster() if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil { errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid)) } return utilerrors.NewAggregate(errs) }
// Creates resource-only containerName if it does not already exist and moves // the current process to it. // // containerName must be an absolute container name. func RunInResourceContainer(containerName string) error { manager := fs.Manager{ Cgroups: &configs.Cgroup{ Name: containerName, AllowAllDevices: true, }, } return manager.Apply(os.Getpid()) }
// Ensures that the Docker daemon is in the desired container. func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error { // What container is Docker in? out, err := exec.Command("pidof", "docker").Output() if err != nil { return fmt.Errorf("failed to find pid of Docker container: %v", err) } // The output of pidof is a list of pids. // Docker may be forking and thus there would be more than one result. pids := []int{} for _, pidStr := range strings.Split(strings.TrimSpace(string(out)), " ") { pid, err := strconv.Atoi(pidStr) if err != nil { continue } pids = append(pids, pid) } // Move if the pid is not already in the desired container. errs := []error{} for _, pid := range pids { if runningInHost, err := isProcessRunningInHost(pid); err != nil { errs = append(errs, err) // Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context. continue } else if !runningInHost { // Docker daemon is running inside a container. Don't touch that. continue } cont, err := getContainer(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err)) } if cont != manager.Cgroups.Name { err = manager.Apply(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name)) } } // Also apply oom-score-adj to processes oomAdjuster := oom.NewOOMAdjuster() if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil { errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid)) } } return utilerrors.NewAggregate(errs) }
// Creates resource-only containerName if it does not already exist and moves // the current process to it. // // containerName must be an absolute container name. func RunInResourceContainer(containerName string) error { allowAllDevices := true manager := fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: containerName, Resources: &configs.Resources{ AllowAllDevices: &allowAllDevices, }, }, } return manager.Apply(os.Getpid()) }
// Ensures the system container is created and all non-kernel threads and process 1 // without a container are moved to it. // // The reason of leaving kernel threads at root cgroup is that we don't want to tie the // execution of these threads with to-be defined /system quota and create priority inversions. // func ensureSystemCgroups(rootContainer *fs.Manager, manager *fs.Manager) error { // Move non-kernel PIDs to the system container. attemptsRemaining := 10 var errs []error for attemptsRemaining >= 0 { // Only keep errors on latest attempt. errs = []error{} attemptsRemaining-- allPids, err := rootContainer.GetPids() if err != nil { errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err)) continue } // Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers) pids := make([]int, 0, len(allPids)) for _, pid := range allPids { if pid == 1 || isKernelPid(pid) { continue } pids = append(pids, pid) } glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids)) // Check if we have moved all the non-kernel PIDs. if len(pids) == 0 { break } glog.Infof("Moving non-kernel processes: %v", pids) for _, pid := range pids { err := manager.Apply(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err)) } } } if attemptsRemaining < 0 { errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", manager.Cgroups.Name)) } return utilerrors.NewAggregate(errs) }
// Ensures that the Docker daemon is in the desired container. func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error { pids, err := getPidsForProcess("docker") if err != nil { return err } // Move if the pid is not already in the desired container. errs := []error{} for _, pid := range pids { if runningInHost, err := isProcessRunningInHost(pid); err != nil { errs = append(errs, err) // Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context. continue } else if !runningInHost { // Docker daemon is running inside a container. Don't touch that. continue } cont, err := getContainer(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err)) } if cont != manager.Cgroups.Name { err = manager.Apply(pid) if err != nil { errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name)) } } // Also apply oom-score-adj to processes oomAdjuster := oom.NewOOMAdjuster() if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil { errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid)) } } return utilerrors.NewAggregate(errs) }
func (cm *containerManagerImpl) setupNode() error { f, err := validateSystemRequirements(cm.mountUtil) if err != nil { return err } if !f.cpuHardcapping { cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported") } // TODO: plumb kernel tunable options into container manager, right now, we modify by default if err := setupKernelTunables(KernelTunableModify); err != nil { return err } systemContainers := []*systemContainer{} if cm.ContainerRuntime == "docker" { if cm.RuntimeCgroupsName != "" { cont := newSystemCgroups(cm.RuntimeCgroupsName) info, err := cm.cadvisorInterface.MachineInfo() var capacity = api.ResourceList{} if err != nil { } else { capacity = cadvisor.CapacityFromMachineInfo(info) } memoryLimit := (int64(capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100)) if memoryLimit < MinDockerMemoryLimit { glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.RuntimeCgroupsName, MinDockerMemoryLimit) memoryLimit = MinDockerMemoryLimit } glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit) dockerContainer := &fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: cm.RuntimeCgroupsName, Resources: &configs.Resources{ Memory: memoryLimit, MemorySwap: -1, AllowAllDevices: true, }, }, } dockerVersion := getDockerVersion(cm.cadvisorInterface) cont.ensureStateFunc = func(manager *fs.Manager) error { return ensureDockerInContainer(dockerVersion, -900, dockerContainer) } systemContainers = append(systemContainers, cont) } else { cm.periodicTasks = append(cm.periodicTasks, func() { cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile) if err != nil { glog.Error(err) return } glog.V(2).Infof("Discovered runtime cgroups name: %s", cont) cm.Lock() defer cm.Unlock() cm.RuntimeCgroupsName = cont }) } } if cm.SystemCgroupsName != "" { if cm.SystemCgroupsName == "/" { return fmt.Errorf("system container cannot be root (\"/\")") 
} cont := newSystemCgroups(cm.SystemCgroupsName) rootContainer := &fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: "/", }, } cont.ensureStateFunc = func(manager *fs.Manager) error { return ensureSystemCgroups(rootContainer, manager) } systemContainers = append(systemContainers, cont) } if cm.KubeletCgroupsName != "" { cont := newSystemCgroups(cm.KubeletCgroupsName) manager := fs.Manager{ Cgroups: &configs.Cgroup{ Parent: "/", Name: cm.KubeletCgroupsName, Resources: &configs.Resources{ AllowAllDevices: true, }, }, } cont.ensureStateFunc = func(_ *fs.Manager) error { return manager.Apply(os.Getpid()) } systemContainers = append(systemContainers, cont) } else { cm.periodicTasks = append(cm.periodicTasks, func() { cont, err := getContainer(os.Getpid()) if err != nil { glog.Errorf("failed to find cgroups of kubelet - %v", err) return } cm.Lock() defer cm.Unlock() cm.KubeletCgroupsName = cont }) } cm.systemContainers = systemContainers return nil }
// spawnDaemon executes a double fork to start the user command with proper // isolation. Stores the child process for use in Wait. func (e *LinuxExecutor) spawnDaemon() error { bin, err := discover.NomadExecutable() if err != nil { return fmt.Errorf("Failed to determine the nomad executable: %v", err) } // Serialize the cmd and the cgroup configuration so it can be passed to the // sub-process. var buffer bytes.Buffer enc := json.NewEncoder(&buffer) c := command.DaemonConfig{ Cmd: e.cmd.Cmd, Chroot: e.taskDir, StdoutFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stdout", e.taskName)), StderrFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stderr", e.taskName)), StdinFile: "/dev/null", } if err := enc.Encode(c); err != nil { return fmt.Errorf("Failed to serialize daemon configuration: %v", err) } // Create a pipe to capture Stdout. pr, pw, err := os.Pipe() if err != nil { return err } e.spawnOutputWriter = pw e.spawnOutputReader = pr // Call ourselves using a hidden flag. The new instance of nomad will join // the passed cgroup, forkExec the cmd, and output status codes through // Stdout. escaped := strconv.Quote(buffer.String()) spawn := exec.Command(bin, "spawn-daemon", escaped) spawn.Stdout = e.spawnOutputWriter // Capture its Stdin. spawnStdIn, err := spawn.StdinPipe() if err != nil { return err } if err := spawn.Start(); err != nil { fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err) } // Join the spawn-daemon to the cgroup. 
if e.groups != nil { manager := cgroupFs.Manager{} manager.Cgroups = e.groups // Apply will place the current pid into the tasks file for each of the // created cgroups: // /sys/fs/cgroup/memory/user/1000.user/4.session/<uuid>/tasks // // Apply requires superuser permissions, and may fail if Nomad is not run with // the required permissions if err := manager.Apply(spawn.Process.Pid); err != nil { errs := new(multierror.Error) errs = multierror.Append(errs, fmt.Errorf("Failed to join spawn-daemon to the cgroup (config => %+v): %v", manager.Cgroups, err)) if err := sendAbortCommand(spawnStdIn); err != nil { errs = multierror.Append(errs, err) } return errs } } // Tell it to start. if err := sendStartCommand(spawnStdIn); err != nil { return err } // Parse the response. dec := json.NewDecoder(e.spawnOutputReader) var resp command.SpawnStartStatus if err := dec.Decode(&resp); err != nil { return fmt.Errorf("Failed to parse spawn-daemon start response: %v", err) } if resp.ErrorMsg != "" { return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg) } e.spawnChild = *spawn return nil }
// Ensures the system container is created and all non-kernel threads and process 1
// without a container are moved to it.
//
// The reason of leaving kernel threads at root cgroup is that we don't want to tie the
// execution of these threads with to-be defined /system quota and create priority inversions.
//
// The reason of leaving process 1 at root cgroup is that libcontainer hardcoded on
// the base cgroup path based on process 1. Please see:
// https://github.com/kubernetes/kubernetes/issues/12789#issuecomment-132384126
// for detail explanation.
func ensureSystemContainer(rootContainer *fs.Manager, manager *fs.Manager) error {
	// Move non-kernel PIDs to the system container.
	// A bounded number of attempts is made because new processes can appear
	// between listing and moving them.
	attemptsRemaining := 10
	var errs []error
	for attemptsRemaining >= 0 {
		// Only keep errors on latest attempt.
		errs = []error{}
		attemptsRemaining--

		allPids, err := rootContainer.GetPids()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
			continue
		}

		// Get PIDs already in target group so we can remove them from the list of
		// PIDs to move.
		systemCgroupPIDs, err := manager.GetPids()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for %s: %v", manager.Cgroups.Name, err))
			continue
		}

		// Set of PIDs already in the target cgroup, for O(1) membership checks.
		systemCgroupPIDMap := make(map[int]struct{}, len(systemCgroupPIDs))
		for _, pid := range systemCgroupPIDs {
			systemCgroupPIDMap[pid] = struct{}{}
		}

		// Remove kernel pids and process 1
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if isKernelPid(pid) {
				continue
			}

			if _, ok := systemCgroupPIDMap[pid]; ok {
				// Already in the target cgroup; nothing to do.
				continue
			}

			pids = append(pids, pid)
		}
		glog.Infof("Found %d PIDs in root, %d of them are kernel related", len(allPids), len(allPids)-len(pids))

		// Check if we moved all the non-kernel PIDs.
		if len(pids) == 0 {
			break
		}

		glog.Infof("Moving non-kernel threads: %v", pids)
		for _, pid := range pids {
			err := manager.Apply(pid)
			if err != nil {
				// Record and keep going; remaining PIDs may still be movable.
				errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
			}
		}
	}
	if attemptsRemaining < 0 {
		errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", manager.Cgroups.Name))
	}

	return utilerrors.NewAggregate(errs)
}