func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { root, err := getCgroupRoot() if err != nil { return nil, err } if (c.Name != "" || c.Parent != "") && c.Path != "" { return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used") } // XXX: Do not remove this code. Path safety is important! -- cyphar cgPath := libcontainerUtils.CleanPath(c.Path) cgParent := libcontainerUtils.CleanPath(c.Parent) cgName := libcontainerUtils.CleanPath(c.Name) innerPath := cgPath if innerPath == "" { innerPath = filepath.Join(cgParent, cgName) } return &cgroupData{ root: root, innerPath: innerPath, config: c, pid: pid, }, nil }
// Do the mount operation followed by additional mounts required to take care // of propagation flags. func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { var ( dest = m.Destination data = label.FormatMountLabel(m.Data, mountLabel) flags = m.Flags ) if libcontainerUtils.CleanPath(dest) == "/dev" { flags &= ^syscall.MS_RDONLY } copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP if !(copyUp || strings.HasPrefix(dest, rootfs)) { dest = filepath.Join(rootfs, dest) } if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { return err } for _, pflag := range m.PropagationFlags { if err := syscall.Mount("", dest, "", uintptr(pflag), ""); err != nil { return err } } return nil }
// needsSetupDev returns true if /dev needs to be set up. func needsSetupDev(config *configs.Config) bool { for _, m := range config.Mounts { if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { return false } } return true }
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. func checkMountDestination(rootfs, dest string) error { if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) { return fmt.Errorf("mounting into / is prohibited") } invalidDestinations := []string{ "/proc", } // White list, it should be sub directories of invalid destinations validDestinations := []string{ // These entries can be bind mounted by files emulated by fuse, // so commands like top, free displays stats in container. "/proc/cpuinfo", "/proc/diskstats", "/proc/meminfo", "/proc/stat", "/proc/swaps", "/proc/uptime", "/proc/net/dev", } for _, valid := range validDestinations { path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) if err != nil { return err } if path == "." { return nil } } for _, invalid := range invalidDestinations { path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) if err != nil { return err } if path == "." || !strings.HasPrefix(path, "..") { return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) } } return nil }
// ensureParent makes sure that the parent directory of current is created // and populated with the proper cpus and mems files copied from // it's parent. func (s *CpusetGroup) ensureParent(current, root string) error { parent := filepath.Dir(current) if libcontainerUtils.CleanPath(parent) == root { return nil } // Avoid infinite recursion. if parent == current { return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") } if err := s.ensureParent(parent, root); err != nil { return err } if err := os.MkdirAll(current, 0755); err != nil { return err } return s.copyIfNeeded(current, parent) }
// getCgroupPath gets the file path to the "devices" subsystem of the desired cgroup. // cgroupPath is the path in the cgroup hierarchy. func getCgroupPath(cgroupPath string) (string, error) { cgroupPath = libcontainerutils.CleanPath(cgroupPath) mnt, root, err := libcontainercgroups.FindCgroupMountpointAndRoot("devices") // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err } // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. if filepath.IsAbs(cgroupPath) { // Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'. return filepath.Join(root, mnt, cgroupPath), nil } parentPath, err := getCgroupParentPath(mnt, root) if err != nil { return "", err } return filepath.Join(parentPath, cgroupPath), nil }
// finalizeRootfs actually switches the root of the process and sets anything // to ro if necessary. You must call prepareRootfs first. func finalizeRootfs(config *configs.Config) (err error) { // remount dev as ro if specified for _, m := range config.Mounts { if libcontainerUtils.CleanPath(m.Destination) == "/dev" { if m.Flags&syscall.MS_RDONLY != 0 { if err := remountReadonly(m.Destination); err != nil { return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) } } break } } // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { return newSystemErrorWithCause(err, "setting rootfs as readonly") } } syscall.Umask(0022) return nil }
func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) { var myCgroupPath string c := &configs.Cgroup{ Resources: &configs.Resources{}, } if spec.Linux != nil && spec.Linux.CgroupsPath != nil { myCgroupPath = libcontainerUtils.CleanPath(*spec.Linux.CgroupsPath) if useSystemdCgroup { myCgroupPath = *spec.Linux.CgroupsPath } } if useSystemdCgroup { if myCgroupPath == "" { c.Parent = "system.slice" c.ScopePrefix = "runc" c.Name = name } else { // Parse the path from expected "slice:prefix:name" // for e.g. "system.slice:docker:1234" parts := strings.Split(myCgroupPath, ":") if len(parts) != 3 { return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") } c.Parent = parts[0] c.ScopePrefix = parts[1] c.Name = parts[2] } } else { if myCgroupPath == "" { c.Name = name } c.Path = myCgroupPath } c.Resources.AllowedDevices = allowedDevices if spec.Linux == nil { return c, nil } r := spec.Linux.Resources if r == nil { return c, nil } for i, d := range spec.Linux.Resources.Devices { var ( t = "a" major = int64(-1) minor = int64(-1) ) if d.Type != nil { t = *d.Type } if d.Major != nil { major = *d.Major } if d.Minor != nil { minor = *d.Minor } if d.Access == nil || *d.Access == "" { return nil, fmt.Errorf("device access at %d field cannot be empty", i) } dt, err := stringToDeviceRune(t) if err != nil { return nil, err } dd := &configs.Device{ Type: dt, Major: major, Minor: minor, Permissions: *d.Access, Allow: d.Allow, } c.Resources.Devices = append(c.Resources.Devices, dd) } // append the default allowed devices to the end of the list c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) if r.Memory != nil { if r.Memory.Limit != nil { c.Resources.Memory = int64(*r.Memory.Limit) } if r.Memory.Reservation != nil { c.Resources.MemoryReservation = int64(*r.Memory.Reservation) } if r.Memory.Swap != nil { c.Resources.MemorySwap = int64(*r.Memory.Swap) } if r.Memory.Kernel != nil { c.Resources.KernelMemory = int64(*r.Memory.Kernel) } if r.Memory.KernelTCP != nil { c.Resources.KernelMemoryTCP = int64(*r.Memory.KernelTCP) } if r.Memory.Swappiness != nil { swappiness := int64(*r.Memory.Swappiness) c.Resources.MemorySwappiness = &swappiness } } if r.CPU != nil { if r.CPU.Shares != nil { c.Resources.CpuShares = int64(*r.CPU.Shares) } if r.CPU.Quota != nil { c.Resources.CpuQuota = int64(*r.CPU.Quota) } if r.CPU.Period != nil { c.Resources.CpuPeriod = int64(*r.CPU.Period) } if r.CPU.RealtimeRuntime != nil { c.Resources.CpuRtRuntime = int64(*r.CPU.RealtimeRuntime) } if r.CPU.RealtimePeriod != nil { c.Resources.CpuRtPeriod = int64(*r.CPU.RealtimePeriod) } if r.CPU.Cpus != nil { c.Resources.CpusetCpus = *r.CPU.Cpus } if r.CPU.Mems != nil { c.Resources.CpusetMems = *r.CPU.Mems } } if r.Pids != nil { c.Resources.PidsLimit = *r.Pids.Limit } if r.BlockIO != nil { if r.BlockIO.Weight != nil { c.Resources.BlkioWeight = *r.BlockIO.Weight } if r.BlockIO.LeafWeight != nil { c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight } if r.BlockIO.WeightDevice != nil { for _, wd := range r.BlockIO.WeightDevice { var weight, leafWeight uint16 if wd.Weight != nil { weight = *wd.Weight } if wd.LeafWeight != nil { leafWeight = *wd.LeafWeight } weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } } if r.BlockIO.ThrottleReadBpsDevice != nil { for _, td := range r.BlockIO.ThrottleReadBpsDevice { var rate uint64 if td.Rate != nil { rate = *td.Rate } throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) } } if r.BlockIO.ThrottleWriteBpsDevice != nil { for _, td := range r.BlockIO.ThrottleWriteBpsDevice { var rate uint64 if td.Rate != nil { rate = *td.Rate } throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) } } if r.BlockIO.ThrottleReadIOPSDevice != nil { for _, td := range r.BlockIO.ThrottleReadIOPSDevice { var rate uint64 if td.Rate != nil { rate = *td.Rate } throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) } } if r.BlockIO.ThrottleWriteIOPSDevice != nil { for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { var rate uint64 if td.Rate != nil { rate = *td.Rate } throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) } } } for _, l := range r.HugepageLimits { c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ Pagesize: *l.Pagesize, Limit: *l.Limit, }) } if r.DisableOOMKiller != nil { c.Resources.OomKillDisable = *r.DisableOOMKiller } if r.Network != nil { if r.Network.ClassID != nil { c.Resources.NetClsClassid = *r.Network.ClassID } for _, m := range r.Network.Priorities { c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ Interface: m.Name, Priority: int64(m.Priority), }) } } return c, nil }
// setupRootfs sets up the devices, mount points, and filesystems for use inside a // new mount namespace. func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) { if err := prepareRoot(config); err != nil { return newSystemErrorWithCause(err, "preparing rootfs") } setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { if err := mountCmd(precmd); err != nil { return newSystemErrorWithCause(err, "running premount command") } } if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) } for _, postcmd := range m.PostmountCmds { if err := mountCmd(postcmd); err != nil { return newSystemErrorWithCause(err, "running postmount command") } } } if setupDev { if err := createDevices(config); err != nil { return newSystemErrorWithCause(err, "creating device nodes") } if err := setupPtmx(config, console); err != nil { return newSystemErrorWithCause(err, "setting up ptmx") } if err := setupDevSymlinks(config.Rootfs); err != nil { return newSystemErrorWithCause(err, "setting up /dev symlinks") } } // Signal the parent to run the pre-start hooks. // The hooks are run after the mounts are setup, but before we switch to the new // root, so that the old root is still available in the hooks for any mount // manipulations. if err := syncParentHooks(pipe); err != nil { return err } if err := syscall.Chdir(config.Rootfs); err != nil { return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) } if config.NoPivotRoot { err = msMoveRoot(config.Rootfs) } else { err = pivotRoot(config.Rootfs, config.PivotDir) } if err != nil { return newSystemErrorWithCause(err, "jailing process inside rootfs") } if setupDev { if err := reOpenDevNull(); err != nil { return newSystemErrorWithCause(err, "reopening /dev/null inside container") } } // remount dev as ro if specifed for _, m := range config.Mounts { if libcontainerUtils.CleanPath(m.Destination) == "/dev" { if m.Flags&syscall.MS_RDONLY != 0 { if err := remountReadonly(m.Destination); err != nil { return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) } } break } } // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { return newSystemErrorWithCause(err, "setting rootfs as readonly") } } syscall.Umask(0022) return nil }
func createCgroupConfig(name string, spec *specs.LinuxSpec) (*configs.Cgroup, error) { var ( err error myCgroupPath string ) if spec.Linux.CgroupsPath != nil { myCgroupPath = libcontainerUtils.CleanPath(*spec.Linux.CgroupsPath) } else { myCgroupPath, err = cgroups.GetThisCgroupDir("devices") if err != nil { return nil, err } } c := &configs.Cgroup{ Path: filepath.Join(myCgroupPath, name), Resources: &configs.Resources{}, } c.Resources.AllowedDevices = allowedDevices r := spec.Linux.Resources if r == nil { return c, nil } for i, d := range spec.Linux.Resources.Devices { var ( t = 'a' major = int64(-1) minor = int64(-1) ) if d.Type != nil { t = *d.Type } if d.Major != nil { major = *d.Major } if d.Minor != nil { minor = *d.Minor } if d.Access == nil || *d.Access == "" { return nil, fmt.Errorf("device access at %d field canot be empty", i) } dd := &configs.Device{ Type: t, Major: major, Minor: minor, Permissions: *d.Access, Allow: d.Allow, } c.Resources.Devices = append(c.Resources.Devices, dd) } // append the default allowed devices to the end of the list c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) if r.Memory != nil { if r.Memory.Limit != nil { c.Resources.Memory = int64(*r.Memory.Limit) } if r.Memory.Reservation != nil { c.Resources.MemoryReservation = int64(*r.Memory.Reservation) } if r.Memory.Swap != nil { c.Resources.MemorySwap = int64(*r.Memory.Swap) } if r.Memory.Kernel != nil { c.Resources.KernelMemory = int64(*r.Memory.Kernel) } if r.Memory.Swappiness != nil { c.Resources.MemorySwappiness = int64(*r.Memory.Swappiness) } } if r.CPU != nil { if r.CPU.Shares != nil { c.Resources.CpuShares = int64(*r.CPU.Shares) } if r.CPU.Quota != nil { c.Resources.CpuQuota = int64(*r.CPU.Quota) } if r.CPU.Period != nil { c.Resources.CpuPeriod = int64(*r.CPU.Period) } if r.CPU.RealtimeRuntime != nil { c.Resources.CpuRtRuntime = int64(*r.CPU.RealtimeRuntime) } if r.CPU.RealtimePeriod != nil { c.Resources.CpuRtPeriod = int64(*r.CPU.RealtimePeriod) } if r.CPU.Cpus != nil { c.Resources.CpusetCpus = *r.CPU.Cpus } if r.CPU.Mems != nil { c.Resources.CpusetMems = *r.CPU.Mems } } if r.Pids != nil { c.Resources.PidsLimit = *r.Pids.Limit } if r.BlockIO != nil { if r.BlockIO.Weight != nil { c.Resources.BlkioWeight = *r.BlockIO.Weight } if r.BlockIO.LeafWeight != nil { c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight } if r.BlockIO.WeightDevice != nil { for _, wd := range r.BlockIO.WeightDevice { weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, *wd.Weight, *wd.LeafWeight) c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } } if r.BlockIO.ThrottleReadBpsDevice != nil { for _, td := range r.BlockIO.ThrottleReadBpsDevice { throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) } } if r.BlockIO.ThrottleWriteBpsDevice != nil { for _, td := range r.BlockIO.ThrottleWriteBpsDevice { throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) } } if r.BlockIO.ThrottleReadIOPSDevice != nil { for _, td := range r.BlockIO.ThrottleReadIOPSDevice { throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) } } if r.BlockIO.ThrottleWriteIOPSDevice != nil { for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate) c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) } } } for _, l := range r.HugepageLimits { c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ Pagesize: *l.Pagesize, Limit: *l.Limit, }) } if r.DisableOOMKiller != nil { c.Resources.OomKillDisable = *r.DisableOOMKiller } if r.Network != nil { if r.Network.ClassID != nil { c.Resources.NetClsClassid = string(*r.Network.ClassID) } for _, m := range r.Network.Priorities { c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ Interface: m.Name, Priority: int64(m.Priority), }) } } return c, nil }