func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error {
	if runningInHost, err := isProcessRunningInHost(pid); err != nil {
		// Err on the side of caution. Avoid moving the process unless we can identify its context.
		return err
	} else if !runningInHost {
		// Process is running inside a container. Don't touch that.
		return nil
	}

	var errs []error
	cont, err := getContainer(pid)
	if err != nil {
		errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
	}

	if cont != manager.Cgroups.Name {
		err = manager.Apply(pid)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
		}
	}

	// Also apply oom-score-adj to the process.
	oomAdjuster := oom.NewOOMAdjuster()
	if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
		errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
	}
	return utilerrors.NewAggregate(errs)
}
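// A minimal sketch of the isProcessRunningInHost helper assumed above: the
// mount namespace of the PID is compared against that of init (PID 1), which
// is unique per container. Illustrative only; the real helper may differ.
func isProcessRunningInHost(pid int) (bool, error) {
	// Get init's mount namespace; a process sharing it is running on the host.
	initMntNs, err := os.Readlink("/proc/1/ns/mnt")
	if err != nil {
		return false, fmt.Errorf("failed to find mount namespace of init process: %v", err)
	}
	processMntNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/mnt", pid))
	if err != nil {
		return false, fmt.Errorf("failed to find mount namespace of process %d: %v", pid, err)
	}
	return initMntNs == processMntNs, nil
}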
Example #2
// Destroy destroys the specified cgroup
func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
	// cgroup name
	name := cgroupConfig.Name

	// Get map of all cgroup paths on the system for the particular cgroup
	cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
	for key, val := range m.subsystems.MountPoints {
		cgroupPaths[key] = path.Join(val, name)
	}

	// Initialize libcontainer's cgroup config
	libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
		Name:   path.Base(name),
		Parent: path.Dir(name),
	}
	fsCgroupManager := cgroupfs.Manager{
		Cgroups: libcontainerCgroupConfig,
		Paths:   cgroupPaths,
	}

	// Delete cgroups using libcontainer's fs.Manager Destroy() method.
	if err := fsCgroupManager.Destroy(); err != nil {
		return fmt.Errorf("unable to destroy cgroup paths for cgroup %v: %v", name, err)
	}
	return nil
}
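// Hedged usage sketch for Destroy: tearing down the cgroup created for a pod.
// CgroupConfig is assumed to carry at least the Name field that Destroy reads;
// the cleanupPodCgroup method name is illustrative.
func (m *cgroupManagerImpl) cleanupPodCgroup(podCgroupName string) error {
	return m.Destroy(&CgroupConfig{Name: podCgroupName})
}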
Example #3
func getPids(cgroupName string) ([]int, error) {
	fsManager := fs.Manager{
		Cgroups: &configs.Cgroup{
			Name: cgroupName,
		},
	}
	return fsManager.GetPids()
}
// Creates resource-only containerName if it does not already exist and moves
// the current process to it.
//
// containerName must be an absolute container name.
func RunInResourceContainer(containerName string) error {
	manager := fs.Manager{
		Cgroups: &configs.Cgroup{
			Name:            containerName,
			AllowAllDevices: true,
		},
	}

	return manager.Apply(os.Getpid())
}
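// Hedged usage sketch: a daemon can move itself into a dedicated
// resource-only container at startup. The container name "/my-daemon" is
// illustrative.
func moveSelfToResourceContainer() {
	if err := RunInResourceContainer("/my-daemon"); err != nil {
		glog.Warningf("failed to move process into resource-only container: %v", err)
	}
}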
// Ensures that the Docker daemon is in the desired container.
func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error {
	// What container is Docker in?
	out, err := exec.Command("pidof", "docker").Output()
	if err != nil {
		return fmt.Errorf("failed to find pid of docker: %v", err)
	}

	// The output of pidof is a list of pids.
	// Docker may have forked, so there may be more than one result.
	pids := []int{}
	for _, pidStr := range strings.Split(strings.TrimSpace(string(out)), " ") {
		pid, err := strconv.Atoi(pidStr)
		if err != nil {
			continue
		}
		pids = append(pids, pid)
	}

	// Move if the pid is not already in the desired container.
	errs := []error{}
	for _, pid := range pids {
		if runningInHost, err := isProcessRunningInHost(pid); err != nil {
			errs = append(errs, err)
			// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
			continue
		} else if !runningInHost {
			// Docker daemon is running inside a container. Don't touch that.
			continue
		}

		cont, err := getContainer(pid)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
		}

		if cont != manager.Cgroups.Name {
			err = manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
			}
		}

		// Also apply oom-score-adj to processes
		oomAdjuster := oom.NewOOMAdjuster()
		if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
			errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
		}
	}

	return utilerrors.NewAggregate(errs)
}
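// A minimal sketch of the getContainer helper assumed above: parse
// /proc/<pid>/cgroup and return the cgroup that the "cpu" subsystem places
// the process in. The two-argument ParseCgroupFile signature matches older
// libcontainer releases; newer runc versions take only a path.
func getContainer(pid int) (string, error) {
	f, err := os.Open(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return "", err
	}
	defer f.Close()
	return cgroups.ParseCgroupFile("cpu", f)
}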
// Creates resource-only containerName if it does not already exist and moves
// the current process to it.
//
// containerName must be an absolute container name.
func RunInResourceContainer(containerName string) error {
	allowAllDevices := true
	manager := fs.Manager{
		Cgroups: &configs.Cgroup{
			Parent: "/",
			Name:   containerName,
			Resources: &configs.Resources{
				AllowAllDevices: &allowAllDevices,
			},
		},
	}

	return manager.Apply(os.Getpid())
}
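Note that the two RunInResourceContainer variants reflect a libcontainer API change: AllowAllDevices originally lived directly on configs.Cgroup as a bool, and was later moved into configs.Resources as a *bool, which is why the second variant needs the addressable allowAllDevices local.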
Example #7
// Stats collects all the resource usage information from a container.
func Stats(containerDir string, containerMemoryLimit int64, machineMemory int64) (*ResourceStats, error) {
	f, err := os.Open(filepath.Join(containerDir, "state.json"))
	if err != nil {
		return nil, err
	}
	defer f.Close()

	type network struct {
		Type              string
		HostInterfaceName string
	}

	state := struct {
		CgroupPaths map[string]string `json:"cgroup_paths"`
		Networks    []network
	}{}

	if err := json.NewDecoder(f).Decode(&state); err != nil {
		return nil, err
	}
	now := time.Now()

	mgr := fs.Manager{Paths: state.CgroupPaths}
	cstats, err := mgr.GetStats()
	if err != nil {
		return nil, err
	}
	stats := &libcontainer.Stats{CgroupStats: cstats}
	// If the container does not have a memory limit specified, default the
	// limit to the machine's memory.
	memoryLimit := containerMemoryLimit
	if memoryLimit == 0 {
		memoryLimit = machineMemory
	}
	for _, iface := range state.Networks {
		switch iface.Type {
		case "veth":
			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
			if err != nil {
				return nil, err
			}
			stats.Interfaces = append(stats.Interfaces, istats)
		}
	}
	return &ResourceStats{
		Stats:       stats,
		Read:        now,
		MemoryLimit: memoryLimit,
	}, nil
}
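// A hedged sketch of the getNetworkInterfaceStats helper assumed above: it
// reads the counters the kernel exposes under /sys/class/net/<name>/statistics.
// Only the byte counters are shown; a full implementation would also collect
// packets, errors, and drops.
func getNetworkInterfaceStats(name string) (*libcontainer.NetworkInterface, error) {
	readUint := func(stat string) (uint64, error) {
		b, err := ioutil.ReadFile(filepath.Join("/sys/class/net", name, "statistics", stat))
		if err != nil {
			return 0, err
		}
		return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
	}

	out := &libcontainer.NetworkInterface{Name: name}
	var err error
	if out.RxBytes, err = readUint("rx_bytes"); err != nil {
		return nil, err
	}
	if out.TxBytes, err = readUint("tx_bytes"); err != nil {
		return nil, err
	}
	return out, nil
}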
// Ensures the system container is created and that all non-kernel processes
// without a container are moved to it. Process 1 and kernel threads are left
// in the root cgroup.
//
// The reason for leaving kernel threads in the root cgroup is that we don't
// want to tie their execution to the to-be-defined /system quota, which could
// create priority inversions.
//
func ensureSystemCgroups(rootContainer *fs.Manager, manager *fs.Manager) error {
	// Move non-kernel PIDs to the system container.
	attemptsRemaining := 10
	var errs []error
	for attemptsRemaining >= 0 {
		// Only keep errors on latest attempt.
		errs = []error{}
		attemptsRemaining--

		allPids, err := rootContainer.GetPids()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
			continue
		}

		// Remove kernel pids and other protected PIDs (here, just pid 1).
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if pid == 1 || isKernelPid(pid) {
				continue
			}

			pids = append(pids, pid)
		}
		glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))

		// Check if we have moved all the non-kernel PIDs.
		if len(pids) == 0 {
			break
		}

		glog.Infof("Moving non-kernel processes: %v", pids)
		for _, pid := range pids {
			err := manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
			}
		}

	}
	if attemptsRemaining < 0 {
		errs = append(errs, fmt.Errorf("ran out of attempts to move non-kernel PIDs into the system container %q", manager.Cgroups.Name))
	}

	return utilerrors.NewAggregate(errs)
}
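// A minimal sketch of the isKernelPid helper assumed above: kernel threads
// have no associated executable, so reading the /proc/<pid>/exe symlink
// fails for them.
func isKernelPid(pid int) bool {
	// Kernel threads have no executable to link to.
	_, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
	return err != nil
}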
// Ensures that the Docker daemon is in the desired container.
func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error {
	pids, err := getPidsForProcess("docker")
	if err != nil {
		return err
	}
	// Move if the pid is not already in the desired container.
	errs := []error{}
	for _, pid := range pids {
		if runningInHost, err := isProcessRunningInHost(pid); err != nil {
			errs = append(errs, err)
			// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
			continue
		} else if !runningInHost {
			// Docker daemon is running inside a container. Don't touch that.
			continue
		}

		cont, err := getContainer(pid)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
		}

		if cont != manager.Cgroups.Name {
			err = manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
			}
		}

		// Also apply oom-score-adj to processes
		oomAdjuster := oom.NewOOMAdjuster()
		if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
			errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
		}
	}

	return utilerrors.NewAggregate(errs)
}
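// A minimal sketch of getPidsForProcess as assumed above, reusing the pidof
// approach from the earlier variant; a production version might consult a
// pid file first.
func getPidsForProcess(name string) ([]int, error) {
	out, err := exec.Command("pidof", name).Output()
	if err != nil {
		return nil, fmt.Errorf("failed to find pid of %q: %v", name, err)
	}

	// The output of pidof is a space-separated list of pids.
	pids := []int{}
	for _, pidStr := range strings.Fields(string(out)) {
		pid, err := strconv.Atoi(pidStr)
		if err != nil {
			continue
		}
		pids = append(pids, pid)
	}
	return pids, nil
}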
Example #10
func (e *LinuxExecutor) destroyCgroup() error {
	if e.groups == nil {
		return errors.New("Can't destroy: cgroup configuration empty")
	}

	manager := cgroupFs.Manager{}
	manager.Cgroups = e.groups
	pids, err := manager.GetPids()
	if err != nil {
		return fmt.Errorf("Failed to get pids in the cgroup %v: %v", e.groups.Name, err)
	}

	errs := new(multierror.Error)
	for _, pid := range pids {
		process, err := os.FindProcess(pid)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("Failed to find Pid %v: %v", pid, err))
			continue
		}

		if err := process.Kill(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("Failed to kill Pid %v: %v", pid, err))
			continue
		}

		if _, err := process.Wait(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("Failed to wait Pid %v: %v", pid, err))
			continue
		}
	}

	// Remove the cgroup.
	if err := manager.Destroy(); err != nil {
		errs = multierror.Append(errs, fmt.Errorf("Failed to delete the cgroup directories: %v", err))
	}

	if len(errs.Errors) != 0 {
		return fmt.Errorf("Failed to destroy cgroup: %v", errs)
	}

	return nil
}
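One caveat worth noting: os.Process.Wait can only reap direct children of the calling process. For the unrelated PIDs found in the cgroup it returns an error, which the loop above records and then skips past; the kill itself still takes effect.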
func (cm *containerManagerImpl) setupNode() error {
	f, err := validateSystemRequirements(cm.mountUtil)
	if err != nil {
		return err
	}
	if !f.cpuHardcapping {
		cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
	}
	// TODO: plumb kernel tunable options into container manager, right now, we modify by default
	if err := setupKernelTunables(KernelTunableModify); err != nil {
		return err
	}

	systemContainers := []*systemContainer{}
	if cm.ContainerRuntime == "docker" {
		if cm.RuntimeCgroupsName != "" {
			cont := newSystemCgroups(cm.RuntimeCgroupsName)
			info, err := cm.cadvisorInterface.MachineInfo()
			capacity := api.ResourceList{}
			if err == nil {
				capacity = cadvisor.CapacityFromMachineInfo(info)
			}
			memoryLimit := capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100
			if memoryLimit < MinDockerMemoryLimit {
				glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.RuntimeCgroupsName, MinDockerMemoryLimit)
				memoryLimit = MinDockerMemoryLimit
			}

			glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeCgroupsName, memoryLimit)

			dockerContainer := &fs.Manager{
				Cgroups: &configs.Cgroup{
					Parent: "/",
					Name:   cm.RuntimeCgroupsName,
					Resources: &configs.Resources{
						Memory:          memoryLimit,
						MemorySwap:      -1,
						AllowAllDevices: true,
					},
				},
			}
			cont.ensureStateFunc = func(manager *fs.Manager) error {
				return ensureDockerInContainer(cm.cadvisorInterface, -900, dockerContainer)
			}
			systemContainers = append(systemContainers, cont)
		} else {
			cm.periodicTasks = append(cm.periodicTasks, func() {
				cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
				if err != nil {
					glog.Error(err)
					return
				}
				glog.V(2).Infof("Discovered runtime cgroups name: %s", cont)
				cm.Lock()
				defer cm.Unlock()
				cm.RuntimeCgroupsName = cont
			})
		}
	}

	if cm.SystemCgroupsName != "" {
		if cm.SystemCgroupsName == "/" {
			return fmt.Errorf("system container cannot be root (\"/\")")
		}
		cont := newSystemCgroups(cm.SystemCgroupsName)
		rootContainer := &fs.Manager{
			Cgroups: &configs.Cgroup{
				Parent: "/",
				Name:   "/",
			},
		}
		cont.ensureStateFunc = func(manager *fs.Manager) error {
			return ensureSystemCgroups(rootContainer, manager)
		}
		systemContainers = append(systemContainers, cont)
	}

	if cm.KubeletCgroupsName != "" {
		cont := newSystemCgroups(cm.KubeletCgroupsName)
		manager := fs.Manager{
			Cgroups: &configs.Cgroup{
				Parent: "/",
				Name:   cm.KubeletCgroupsName,
				Resources: &configs.Resources{
					AllowAllDevices: true,
				},
			},
		}
		cont.ensureStateFunc = func(_ *fs.Manager) error {
			return manager.Apply(os.Getpid())
		}
		systemContainers = append(systemContainers, cont)
	} else {
		cm.periodicTasks = append(cm.periodicTasks, func() {
			cont, err := getContainer(os.Getpid())
			if err != nil {
				glog.Errorf("failed to find cgroups of kubelet - %v", err)
				return
			}
			cm.Lock()
			defer cm.Unlock()

			cm.KubeletCgroupsName = cont
		})
	}

	cm.systemContainers = systemContainers
	return nil
}
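// A hedged sketch of the newSystemCgroups helper assumed above; the
// systemContainer field names here are illustrative, not the exact type.
func newSystemCgroups(containerName string) *systemContainer {
	return &systemContainer{
		name: containerName,
		manager: &fs.Manager{
			Cgroups: &configs.Cgroup{
				Parent: "/",
				Name:   containerName,
			},
		},
	}
}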
Example #12
// spawnDaemon executes a double fork to start the user command with proper
// isolation. Stores the child process for use in Wait.
func (e *LinuxExecutor) spawnDaemon() error {
	bin, err := discover.NomadExecutable()
	if err != nil {
		return fmt.Errorf("Failed to determine the nomad executable: %v", err)
	}

	// Serialize the daemon configuration so it can be passed to the
	// sub-process.
	var buffer bytes.Buffer
	enc := json.NewEncoder(&buffer)

	c := command.DaemonConfig{
		Cmd:        e.cmd.Cmd,
		Chroot:     e.taskDir,
		StdoutFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stdout", e.taskName)),
		StderrFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stderr", e.taskName)),
		StdinFile:  "/dev/null",
	}
	if err := enc.Encode(c); err != nil {
		return fmt.Errorf("Failed to serialize daemon configuration: %v", err)
	}

	// Create a pipe to capture Stdout.
	pr, pw, err := os.Pipe()
	if err != nil {
		return err
	}
	e.spawnOutputWriter = pw
	e.spawnOutputReader = pr

	// Call ourselves using a hidden flag. The new instance of nomad will join
	// the passed cgroup, forkExec the cmd, and output status codes through
	// Stdout.
	escaped := strconv.Quote(buffer.String())
	spawn := exec.Command(bin, "spawn-daemon", escaped)
	spawn.Stdout = e.spawnOutputWriter

	// Capture its Stdin.
	spawnStdIn, err := spawn.StdinPipe()
	if err != nil {
		return err
	}

	if err := spawn.Start(); err != nil {
		return fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err)
	}

	// Join the spawn-daemon to the cgroup.
	if e.groups != nil {
		manager := cgroupFs.Manager{}
		manager.Cgroups = e.groups

		// Apply will place the current pid into the tasks file for each of the
		// created cgroups:
		//  /sys/fs/cgroup/memory/user/1000.user/4.session/<uuid>/tasks
		//
		// Apply requires superuser permissions, and may fail if Nomad is not run with
		// the required permissions
		if err := manager.Apply(spawn.Process.Pid); err != nil {
			errs := new(multierror.Error)
			errs = multierror.Append(errs, fmt.Errorf("Failed to join spawn-daemon to the cgroup (config => %+v): %v", manager.Cgroups, err))

			if err := sendAbortCommand(spawnStdIn); err != nil {
				errs = multierror.Append(errs, err)
			}

			return errs
		}
	}

	// Tell it to start.
	if err := sendStartCommand(spawnStdIn); err != nil {
		return err
	}

	// Parse the response.
	dec := json.NewDecoder(e.spawnOutputReader)
	var resp command.SpawnStartStatus
	if err := dec.Decode(&resp); err != nil {
		return fmt.Errorf("Failed to parse spawn-daemon start response: %v", err)
	}

	if resp.ErrorMsg != "" {
		return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg)
	}

	e.spawnChild = *spawn
	return nil
}
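// A hypothetical sketch of the sendStartCommand and sendAbortCommand helpers
// used above: each writes a small JSON command to the spawn-daemon's stdin.
// The spawnCmd type and its field are illustrative, not Nomad's actual wire
// format.
type spawnCmd struct {
	Command string `json:"command"`
}

func sendStartCommand(w io.Writer) error {
	return json.NewEncoder(w).Encode(spawnCmd{Command: "start"})
}

func sendAbortCommand(w io.Writer) error {
	return json.NewEncoder(w).Encode(spawnCmd{Command: "abort"})
}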
// Ensures the system container is created and that all non-kernel processes
// without a container are moved to it.
//
// The reason for leaving kernel threads in the root cgroup is that we don't
// want to tie their execution to the to-be-defined /system quota, which could
// create priority inversions.
//
// The reason for leaving process 1 in the root cgroup is that libcontainer
// hard-codes the base cgroup path based on process 1. See
// https://github.com/kubernetes/kubernetes/issues/12789#issuecomment-132384126
// for a detailed explanation.
func ensureSystemContainer(rootContainer *fs.Manager, manager *fs.Manager) error {
	// Move non-kernel PIDs to the system container.
	attemptsRemaining := 10
	var errs []error
	for attemptsRemaining >= 0 {
		// Only keep errors on latest attempt.
		errs = []error{}
		attemptsRemaining--

		allPids, err := rootContainer.GetPids()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
			continue
		}

		// Get PIDs already in target group so we can remove them from the list of
		// PIDs to move.
		systemCgroupPIDs, err := manager.GetPids()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for %s: %v", manager.Cgroups.Name, err))
			continue
		}

		systemCgroupPIDMap := make(map[int]struct{}, len(systemCgroupPIDs))
		for _, pid := range systemCgroupPIDs {
			systemCgroupPIDMap[pid] = struct{}{}
		}

		// Remove kernel pids, process 1, and PIDs already in the target cgroup.
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if pid == 1 || isKernelPid(pid) {
				continue
			}

			if _, ok := systemCgroupPIDMap[pid]; ok {
				continue
			}

			pids = append(pids, pid)
		}
		glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))

		// Check if we moved all the non-kernel PIDs.
		if len(pids) == 0 {
			break
		}

		glog.Infof("Moving non-kernel threads: %v", pids)
		for _, pid := range pids {
			err := manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
			}
		}

	}
	if attemptsRemaining < 0 {
		errs = append(errs, fmt.Errorf("ran out of attempts to move non-kernel PIDs into the system container %q", manager.Cgroups.Name))
	}

	return utilerrors.NewAggregate(errs)
}