// applyLayer is the entry-point for docker-applylayer on re-exec. This is not // used on Windows as it does not support chroot, hence no point sandboxing // through chroot and rexec. func applyLayer() { var ( tmpDir string err error options *archive.TarOptions ) runtime.LockOSThread() flag.Parse() inUserns := rsystem.RunningInUserNS() if err := chroot(flag.Arg(0)); err != nil { fatal(err) } // We need to be able to set any perms oldmask, err := system.Umask(0) defer system.Umask(oldmask) if err != nil { fatal(err) } if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil { fatal(err) } if inUserns { options.InUserNS = true } if tmpDir, err = ioutil.TempDir("/", "temp-docker-extract"); err != nil { fatal(err) } os.Setenv("TMPDIR", tmpDir) size, err := archive.UnpackLayer("/", os.Stdin, options) os.RemoveAll(tmpDir) if err != nil { fatal(err) } encoder := json.NewEncoder(os.Stdout) if err := encoder.Encode(applyLayerResponse{size}); err != nil { fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err)) } if _, err := flush(os.Stdin); err != nil { fatal(err) } os.Exit(0) }
// Create the device nodes in the container. func createDevices(config *configs.Config) error { useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) oldMask := syscall.Umask(0000) for _, node := range config.Devices { // containers running in a user namespace are not allowed to mknod // devices so we can just bind mount it from the host. if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { syscall.Umask(oldMask) return err } } syscall.Umask(oldMask) return nil }
// applyLayerHandler parses a diff in the standard layer format from `layer`, and // applies it to the directory `dest`. Returns the size in bytes of the // contents of the layer. func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions, decompress bool) (size int64, err error) { dest = filepath.Clean(dest) if decompress { decompressed, err := archive.DecompressStream(layer) if err != nil { return 0, err } defer decompressed.Close() layer = decompressed } if options == nil { options = &archive.TarOptions{} if rsystem.RunningInUserNS() { options.InUserNS = true } } if options.ExcludePatterns == nil { options.ExcludePatterns = []string{} } data, err := json.Marshal(options) if err != nil { return 0, fmt.Errorf("ApplyLayer json encode: %v", err) } cmd := reexec.Command("docker-applyLayer", dest) cmd.Stdin = layer cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data)) outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer) cmd.Stdout, cmd.Stderr = outBuf, errBuf if err = cmd.Run(); err != nil { return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf) } // Stdout should be a valid JSON struct representing an applyLayerResponse. response := applyLayerResponse{} decoder := json.NewDecoder(outBuf) if err = decoder.Decode(&response); err != nil { return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err) } return response.LayerSize, nil }
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { if system.RunningInUserNS() { return nil } devices := cgroup.Resources.Devices if len(devices) > 0 { for _, dev := range devices { file := "devices.deny" if dev.Allow { file = "devices.allow" } if err := writeFile(path, file, dev.CgroupString()); err != nil { return err } } return nil } if cgroup.Resources.AllowAllDevices != nil { if *cgroup.Resources.AllowAllDevices == false { if err := writeFile(path, "devices.deny", "a"); err != nil { return err } for _, dev := range cgroup.Resources.AllowedDevices { if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { return err } } return nil } if err := writeFile(path, "devices.allow", "a"); err != nil { return err } } for _, dev := range cgroup.Resources.DeniedDevices { if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil { return err } } return nil }
func setupOOMScoreAdj(score int) error { f, err := os.OpenFile("/proc/self/oom_score_adj", os.O_WRONLY, 0) if err != nil { return err } defer f.Close() stringScore := strconv.Itoa(score) _, err = f.WriteString(stringScore) if os.IsPermission(err) { // Setting oom_score_adj does not work in an // unprivileged container. Ignore the error, but log // it if we appear not to be in that situation. if !rsystem.RunningInUserNS() { logrus.Debugf("Permission denied writing %q to /proc/self/oom_score_adj", stringScore) } return nil } return err }
func setOOMScore(pid, score int) error { oomScoreAdjPath := fmt.Sprintf("/proc/%d/oom_score_adj", pid) f, err := os.OpenFile(oomScoreAdjPath, os.O_WRONLY, 0) if err != nil { return err } stringScore := strconv.Itoa(score) _, err = f.WriteString(stringScore) f.Close() if os.IsPermission(err) { // Setting oom_score_adj does not work in an // unprivileged container. Ignore the error, but log // it if we appear not to be in that situation. if !system.RunningInUserNS() { logrus.Debugf("Permission denied writing %q to %s", stringScore, oomScoreAdjPath) } return nil } return err }
// handleTarTypeBlockCharFifo is an OS-specific helper function used by // createTarFile to handle the following types of header: Block; Char; Fifo func handleTarTypeBlockCharFifo(hdr *tar.Header, path string) error { if rsystem.RunningInUserNS() { // cannot create a device if running in user namespace return nil } mode := uint32(hdr.Mode & 07777) switch hdr.Typeflag { case tar.TypeBlock: mode |= syscall.S_IFBLK case tar.TypeChar: mode |= syscall.S_IFCHR case tar.TypeFifo: mode |= syscall.S_IFIFO } if err := system.Mknod(path, mode, int(system.Mkdev(hdr.Devmajor, hdr.Devminor))); err != nil { return err } return nil }
// Return a nil error if the kernel supports aufs // We cannot modprobe because inside dind modprobe fails // to run func supportsAufs() error { // We can try to modprobe aufs first before looking at // proc/filesystems for when aufs is supported exec.Command("modprobe", "aufs").Run() if rsystem.RunningInUserNS() { return ErrAufsNested } f, err := os.Open("/proc/filesystems") if err != nil { return err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if strings.Contains(s.Text(), "aufs") { return nil } } return ErrAufsNotSupported }
func copyDir(srcDir, dstDir string, flags copyFlags) error { err := filepath.Walk(srcDir, func(srcPath string, f os.FileInfo, err error) error { if err != nil { return err } // Rebase path relPath, err := filepath.Rel(srcDir, srcPath) if err != nil { return err } dstPath := filepath.Join(dstDir, relPath) if err != nil { return err } stat, ok := f.Sys().(*syscall.Stat_t) if !ok { return fmt.Errorf("Unable to get raw syscall.Stat_t data for %s", srcPath) } isHardlink := false switch f.Mode() & os.ModeType { case 0: // Regular file if flags©Hardlink != 0 { isHardlink = true if err := os.Link(srcPath, dstPath); err != nil { return err } } else { if err := copyRegular(srcPath, dstPath, f.Mode()); err != nil { return err } } case os.ModeDir: if err := os.Mkdir(dstPath, f.Mode()); err != nil && !os.IsExist(err) { return err } case os.ModeSymlink: link, err := os.Readlink(srcPath) if err != nil { return err } if err := os.Symlink(link, dstPath); err != nil { return err } case os.ModeNamedPipe: fallthrough case os.ModeSocket: if rsystem.RunningInUserNS() { // cannot create a device if running in user namespace return nil } if err := syscall.Mkfifo(dstPath, stat.Mode); err != nil { return err } case os.ModeDevice: if err := syscall.Mknod(dstPath, stat.Mode, int(stat.Rdev)); err != nil { return err } default: return fmt.Errorf("Unknown file type for %s\n", srcPath) } // Everything below is copying metadata from src to dst. All this metadata // already shares an inode for hardlinks. if isHardlink { return nil } if err := os.Lchown(dstPath, int(stat.Uid), int(stat.Gid)); err != nil { return err } if err := copyXattr(srcPath, dstPath, "security.capability"); err != nil { return err } // We need to copy this attribute if it appears in an overlay upper layer, as // this function is used to copy those. It is set by overlay if a directory // is removed and then re-created and should not inherit anything from the // same dir in the lower dir. if err := copyXattr(srcPath, dstPath, "trusted.overlay.opaque"); err != nil { return err } isSymlink := f.Mode()&os.ModeSymlink != 0 // There is no LChmod, so ignore mode for symlink. Also, this // must happen after chown, as that can modify the file mode if !isSymlink { if err := os.Chmod(dstPath, f.Mode()); err != nil { return err } } // system.Chtimes doesn't support a NOFOLLOW flag atm if !isSymlink { aTime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) mTime := time.Unix(int64(stat.Mtim.Sec), int64(stat.Mtim.Nsec)) if err := system.Chtimes(dstPath, aTime, mTime); err != nil { return err } } else { ts := []syscall.Timespec{stat.Atim, stat.Mtim} if err := system.LUtimesNano(dstPath, ts); err != nil { return err } } return nil }) return err }
// chroot on linux uses pivot_root instead of chroot // pivot_root takes a new root and an old root. // Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root. // New root is where the new rootfs is set to. // Old root is removed after the call to pivot_root so it is no longer available under the new root. // This is similar to how libcontainer sets up a container's rootfs func chroot(path string) (err error) { // if the engine is running in a user namespace we need to use actual chroot if rsystem.RunningInUserNS() { return realChroot(path) } if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { return fmt.Errorf("Error creating mount namespace before pivot: %v", err) } // make everything in new ns private if err := mount.MakeRPrivate("/"); err != nil { return err } // ensure path is a mountpoint if err := mount.MakePrivate(path); err != nil { return err } // setup oldRoot for pivot_root pivotDir, err := ioutil.TempDir(path, ".pivot_root") if err != nil { return fmt.Errorf("Error setting up pivot dir: %v", err) } var mounted bool defer func() { if mounted { // make sure pivotDir is not mounted before we try to remove it if errCleanup := syscall.Unmount(pivotDir, syscall.MNT_DETACH); errCleanup != nil { if err == nil { err = errCleanup } return } } errCleanup := os.Remove(pivotDir) // pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful // because we already cleaned it up on failed pivot_root if errCleanup != nil && !os.IsNotExist(errCleanup) { errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup) if err == nil { err = errCleanup } } }() if err := syscall.PivotRoot(path, pivotDir); err != nil { // If pivot fails, fall back to the normal chroot after cleaning up temp dir if err := os.Remove(pivotDir); err != nil { return fmt.Errorf("Error cleaning up after failed pivot: %v", err) } return realChroot(path) } mounted = true // This is the new path for where the old root (prior to the pivot) has been moved to // This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction pivotDir = filepath.Join("/", filepath.Base(pivotDir)) if err := syscall.Chdir("/"); err != nil { return fmt.Errorf("Error changing to new root: %v", err) } // Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("Error making old root private after pivot: %v", err) } // Now unmount the old root so it's no longer visible from the new root if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { return fmt.Errorf("Error while unmounting old root after pivot: %v", err) } mounted = false return nil }