func switchCgroup(src, target int) error { cgroupFile := fmt.Sprintf("/proc/%d/cgroup", target) f, err := os.Open(cgroupFile) if err != nil { return err } defer f.Close() targetCgroups := map[string]string{} s := bufio.NewScanner(f) for s.Scan() { text := s.Text() parts := strings.Split(text, ":") subparts := strings.Split(parts[1], "=") subsystem := subparts[0] if len(subparts) > 1 { subsystem = subparts[1] } targetPath := fmt.Sprintf("/host/sys/fs/cgroup/%s%s", subsystem, parts[2]) log.Infof("Moving Docker to cgroup %s", targetPath) targetCgroups[subsystem] = targetPath } if err := s.Err(); err != nil { return err } return cgroups.EnterPid(targetCgroups, src) }
func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() if err = p.execSetns(); err != nil { return newSystemError(err) } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemError(err) } } if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } // wait for the child process to fully complete and receive an error message // if one was encoutered var ierr *genericError if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } if ierr != nil { return newSystemError(ierr) } return nil }
func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil } var c = m.Cgroups d, err := getCgroupData(m.Cgroups, pid) if err != nil { return err } if c.Paths != nil { paths := make(map[string]string) for name, path := range c.Paths { _, err := d.path(name) if err != nil { if cgroups.IsNotFound(err) { continue } return err } paths[name] = path } m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } paths := make(map[string]string) defer func() { if err != nil { cgroups.RemovePaths(paths) } }() for _, sys := range subsystems { if err := sys.Apply(d); err != nil { return err } // TODO: Apply should, ideally, be reentrant or be broken up into a separate // create and join phase so that the cgroup hierarchy for a container can be // created then join consists of writing the process pids to cgroup.procs p, err := d.path(sys.Name()) if err != nil { if cgroups.IsNotFound(err) { continue } return err } paths[sys.Name()] = p } m.Paths = paths return nil }
func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil } var c = m.Cgroups d, err := getCgroupData(m.Cgroups, pid) if err != nil { return err } if c.Paths != nil { paths := make(map[string]string) for name, path := range c.Paths { _, err := d.path(name) if err != nil { if cgroups.IsNotFound(err) { continue } return err } paths[name] = path } m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } m.mu.Lock() defer m.mu.Unlock() paths := make(map[string]string) for _, sys := range subsystems { if err := sys.Apply(d); err != nil { return err } // TODO: Apply should, ideally, be reentrant or be broken up into a separate // create and join phase so that the cgroup hierarchy for a container can be // created then join consists of writing the process pids to cgroup.procs p, err := d.path(sys.Name()) if err != nil { // The non-presence of the devices subsystem is // considered fatal for security reasons. if cgroups.IsNotFound(err) && sys.Name() != "devices" { continue } return err } paths[sys.Name()] = p } m.Paths = paths return nil }
func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() p.rootDir.Close() if err != nil { return newSystemErrorWithCause(err, "starting setns process") } if p.bootstrapData != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } } if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { return newSystemErrorWithCause(err, "setting oom score") } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { return newSystemErrorWithCause(err, "setting rlimits for process") } if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { return newSystemErrorWithCause(err, "writing config to pipe") } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemErrorWithCause(err, "calling shutdown on init pipe") } // wait for the child process to fully complete and receive an error message // if one was encoutered var ierr *genericError if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemErrorWithCause(err, "decoding init error from pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return ierr } return nil }
func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() if err != nil { return newSystemError(err) } if p.bootstrapData != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemError(err) } } if err = p.execSetns(); err != nil { return newSystemError(err) } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemError(err) } } if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { return newSystemError(err) } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { return newSystemError(err) } if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } // wait for the child process to fully complete and receive an error message // if one was encoutered var ierr *genericError if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) } return nil }
func (m *Manager) Apply(pid int) error { var ( c = m.Cgroups unitName = getUnitName(c) slice = "system.slice" properties []systemdDbus.Property ) if c.Paths != nil { paths := make(map[string]string) for name, path := range c.Paths { _, err := getSubsystemPath(m.Cgroups, name) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[name] = path } m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } if c.Parent != "" { slice = c.Parent } properties = append(properties, systemdDbus.PropSlice(slice), systemdDbus.PropDescription("docker container "+c.Name), newProp("PIDs", []uint32{uint32(pid)}), // This is only supported on systemd versions 218 and above. newProp("Delegate", true), ) // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, newProp("MemoryAccounting", true), newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) if hasTransientDefaultDependencies { properties = append(properties, newProp("DefaultDependencies", false)) } if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) } if c.Resources.CpuShares != 0 { properties = append(properties, newProp("CPUShares", uint64(c.Resources.CpuShares))) } if c.Resources.BlkioWeight != 0 { properties = append(properties, newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } // We need to set kernel memory before processes join cgroup because // kmem.limit_in_bytes can only be set when the cgroup is empty. // And swap memory limit needs to be set after memory limit, only // memory limit is handled by systemd, so it's kind of ugly here. if c.Resources.KernelMemory > 0 { if err := setKernelMemory(c); err != nil { return err } } if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { return err } if err := joinCgroups(c, pid); err != nil { return err } paths := make(map[string]string) for _, s := range subsystems { subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[s.Name()] = subsystemPath } m.Paths = paths return nil }
func (m *Manager) Apply(pid int) error { var ( c = m.Cgroups unitName = getUnitName(c) slice = "system.slice" properties []systemdDbus.Property ) if c.Paths != nil { paths := make(map[string]string) for name, path := range c.Paths { _, err := getSubsystemPath(m.Cgroups, name) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[name] = path } m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } if c.Parent != "" { slice = c.Parent } properties = append(properties, systemdDbus.PropSlice(slice), systemdDbus.PropDescription("docker container "+c.Name), newProp("PIDs", []uint32{uint32(pid)}), ) // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, newProp("MemoryAccounting", true), newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) if hasTransientDefaultDependencies { properties = append(properties, newProp("DefaultDependencies", false)) } if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) } if c.Resources.CpuShares != 0 { properties = append(properties, newProp("CPUShares", uint64(c.Resources.CpuShares))) } if c.Resources.BlkioWeight != 0 { properties = append(properties, newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } // We need to set kernel memory before processes join cgroup because // kmem.limit_in_bytes can only be set when the cgroup is empty. // And swap memory limit needs to be set after memory limit, only // memory limit is handled by systemd, so it's kind of ugly here. if c.Resources.KernelMemory > 0 { if err := setKernelMemory(c); err != nil { return err } } if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { return err } if err := joinDevices(c, pid); err != nil { return err } // TODO: CpuQuota and CpuPeriod not available in systemd // we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us if err := joinCpu(c, pid); err != nil { return err } // TODO: MemoryReservation and MemorySwap not available in systemd if err := joinMemory(c, pid); err != nil { return err } // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd // because it does not currently support it via the dbus api. if err := joinFreezer(c, pid); err != nil { return err } if err := joinNetPrio(c, pid); err != nil { return err } if err := joinNetCls(c, pid); err != nil { return err } if err := joinPids(c, pid); err != nil { return err } if err := joinCpuset(c, pid); err != nil { return err } if err := joinHugetlb(c, pid); err != nil { return err } if err := joinPerfEvent(c, pid); err != nil { return err } // FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem // using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354), // so use fs work around for now. if err := joinBlkio(c, pid); err != nil { return err } paths := make(map[string]string) for _, s := range subsystems { subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[s.Name()] = subsystemPath } m.Paths = paths return nil }
func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() p.rootDir.Close() if err != nil { return newSystemErrorWithCause(err, "starting setns process") } if p.bootstrapData != nil { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } } if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { return newSystemErrorWithCause(err, "setting oom score") } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { return newSystemErrorWithCause(err, "setting rlimits for process") } if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { return newSystemErrorWithCause(err, "writing config to pipe") } ierr := parseSync(p.parentPipe, func(sync *syncT) error { switch sync.Type { case procConsole: if err := writeSync(p.parentPipe, procConsoleReq); err != nil { return newSystemErrorWithCause(err, "writing syncT 'request fd'") } masterFile, err := utils.RecvFd(p.parentPipe) if err != nil { return newSystemErrorWithCause(err, "getting master pty from child pipe") } if p.process.consoleChan == nil { // TODO: Don't panic here, do something more sane. panic("consoleChan is nil") } p.process.consoleChan <- masterFile if err := writeSync(p.parentPipe, procConsoleAck); err != nil { return newSystemErrorWithCause(err, "writing syncT 'ack fd'") } case procReady: // This shouldn't happen. panic("unexpected procReady in setns") case procHooks: // This shouldn't happen. panic("unexpected procHooks in setns") default: return newSystemError(fmt.Errorf("invalid JSON payload from child")) } return nil }) if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemErrorWithCause(err, "calling shutdown on init pipe") } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return ierr } return nil }
func (m *Manager) Apply(pid int) error { var ( c = m.Cgroups unitName = getUnitName(c) slice = "system.slice" properties []systemdDbus.Property ) if c.Paths != nil { paths := make(map[string]string) for name, path := range c.Paths { _, err := getSubsystemPath(m.Cgroups, name) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[name] = path } m.Paths = paths return cgroups.EnterPid(m.Paths, pid) } if c.Parent != "" { slice = c.Parent } properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) // if we create a slice, the parent is defined via a Wants= if strings.HasSuffix(unitName, ".slice") { // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 if !hasStartTransientSliceUnit { return fmt.Errorf("systemd version does not support ability to start a slice as transient unit") } properties = append(properties, systemdDbus.PropWants(slice)) } else { // otherwise, we use Slice= properties = append(properties, systemdDbus.PropSlice(slice)) } // only add pid if its valid, -1 is used w/ general slice creation. if pid != -1 { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } if hasDelegate { // This is only supported on systemd versions 218 and above. properties = append(properties, newProp("Delegate", true)) } // Always enable accounting, this gets us the same behaviour as the fs implementation, // plus the kernel has some problems with joining the memory cgroup at a later time. properties = append(properties, newProp("MemoryAccounting", true), newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) if hasTransientDefaultDependencies { properties = append(properties, newProp("DefaultDependencies", false)) } if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) } if c.Resources.CpuShares != 0 { properties = append(properties, newProp("CPUShares", uint64(c.Resources.CpuShares))) } if c.Resources.BlkioWeight != 0 { properties = append(properties, newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } // We have to set kernel memory here, as we can't change it once // processes have been attached to the cgroup. if c.Resources.KernelMemory != 0 { if err := setKernelMemory(c); err != nil { return err } } if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil { return err } if err := joinCgroups(c, pid); err != nil { return err } paths := make(map[string]string) for _, s := range subsystems { subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem if cgroups.IsNotFound(err) { continue } return err } paths[s.Name()] = subsystemPath } m.Paths = paths return nil }