func withClearedCloExec(lfd int, f func() error) error { err := sys.CloseOnExec(lfd, false) if err != nil { return err } defer sys.CloseOnExec(lfd, true) return f() }
func (b *Builder) Build() error { logs.WithF(b.fields).Info("Building aci") lfd, err := rktcommon.GetRktLockFD() if err != nil { return errs.WithEF(err, b.fields, "can't get rkt lock fd") } if err := sys.CloseOnExec(lfd, true); err != nil { return errs.WithEF(err, b.fields, "can't set FD_CLOEXEC on rkt lock") } if err := b.runBuild(); err != nil { return err } if err := b.writeManifest(); err != nil { return err } if err := b.tarAci(); err != nil { return err } return nil }
func run() int { lfd, err := common.GetRktLockFD() if err != nil { fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err) return 1 } if err := sys.CloseOnExec(lfd, true); err != nil { fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err) return 1 } if err := stage1common.WritePpid(os.Getpid()); err != nil { fmt.Fprintf(os.Stderr, "write ppid: %v", err) return 1 } fmt.Println("success, stub stage1 would at this point switch to stage2") return 0 }
func sdListenFDs(unsetEnvironment bool) (int, error) { defer func() { if unsetEnvironment { os.Unsetenv("LISTEN_PID") os.Unsetenv("LISTEN_FDS") } }() e := os.Getenv("LISTEN_PID") if e == "" { return 0, nil } pid, err := strconv.Atoi(e) if err != nil { return -1, err } if os.Getpid() != pid { return 0, nil } e = os.Getenv("LISTEN_FDS") if e == "" { return 0, nil } n, err := strconv.Atoi(e) if err != nil { return -1, err } for fd := SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START+n; fd++ { if err := sys.CloseOnExec(fd, true); err != nil { return -1, err } } return n, nil }
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.Fatalf("error: %v", err) } log.Printf("Setting up stage1") if err := setupStage1Image(cfg, cfg.Stage1Image, dir, useOverlay); err != nil { log.Fatalf("error setting up stage1: %v", err) } log.Printf("Wrote filesystem to %s\n", dir) for _, app := range cfg.Apps { if err := setupAppImage(cfg, app.Name, app.Image.ID, dir, useOverlay); err != nil { log.Fatalf("error setting up app image: %v", err) } } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.Fatalf("setting lock fd environment: %v", err) } log.Printf("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.Fatalf("failed changing to dir: %v", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.Fatalf("error determining init entrypoint: %v", err) } args := []string{filepath.Join(common.Stage1RootfsPath(dir), ep)} log.Printf("Execing %s", ep) if cfg.Debug { args = append(args, "--debug") } if cfg.PrivateNet.Any() { args = append(args, "--private-net="+cfg.PrivateNet.String()) } if cfg.Interactive { args = append(args, "--interactive") } if cfg.MDSRegister { mdsToken, err := registerPod(".", cfg.UUID, cfg.Apps) if err != nil { log.Fatalf("failed to register the pod: %v", err) } args = append(args, "--mds-token="+mdsToken) } if cfg.LocalConfig != "" { args = append(args, "--local-config="+cfg.LocalConfig) } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } if err := label.SetProcessLabel(cfg.ProcessLabel); err != nil { log.Fatalf("error setting process SELinux label: %v", err) } if err := syscall.Exec(args[0], args, os.Environ()); err != nil { 
log.Fatalf("error execing init: %v", err) } }
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string, dataDir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.Fatalf("error: %v", err) } privateUsers, err := preparedWithPrivateUsers(dir) if err != nil { log.Fatalf("error: %v", err) } debug("Setting up stage1") if err := setupStage1Image(cfg, dir, useOverlay); err != nil { log.Fatalf("error setting up stage1: %v", err) } debug("Wrote filesystem to %s\n", dir) for _, app := range cfg.Apps { if err := setupAppImage(cfg, app.Name, app.Image.ID, dir, useOverlay); err != nil { log.Fatalf("error setting up app image: %v", err) } } destRootfs := common.Stage1RootfsPath(dir) flavor, err := os.Readlink(filepath.Join(destRootfs, "flavor")) if err != nil { log.Printf("error reading flavor: %v\n", err) } if flavor == "kvm" { err := kvmCheckSSHSetup(destRootfs, dataDir) if err != nil { log.Fatalf("error setting up ssh keys: %v", err) } } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.Fatalf("setting lock fd environment: %v", err) } if err := os.Setenv(common.EnvSELinuxContext, fmt.Sprintf("%v", cfg.ProcessLabel)); err != nil { log.Fatalf("setting SELinux context environment: %v", err) } debug("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.Fatalf("failed changing to dir: %v", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.Fatalf("error determining 'run' entrypoint: %v", err) } args := []string{filepath.Join(destRootfs, ep)} debug("Execing %s", ep) if cfg.Debug { args = append(args, "--debug") } args = append(args, "--net="+cfg.Net.String()) if cfg.Interactive { args = append(args, "--interactive") } if len(privateUsers) > 0 { args = append(args, "--private-users="+privateUsers) } if cfg.MDSRegister { mdsToken, err := registerPod(".", cfg.UUID, cfg.Apps) if err != nil { log.Fatalf("failed 
to register the pod: %v", err) } args = append(args, "--mds-token="+mdsToken) } if cfg.LocalConfig != "" { args = append(args, "--local-config="+cfg.LocalConfig) } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } tpmEvent := fmt.Sprintf("rkt: Rootfs: %s Manifest: %s Stage 1 args: %s", cfg.CommonConfig.RootHash, cfg.CommonConfig.ManifestData, strings.Join(args, " ")) // If there's no TPM available or there's a failure for some other // reason, ignore it and continue anyway. Long term we'll want policy // that enforces TPM behaviour, but we don't have any infrastructure // around that yet. _ = tpm.Extend(tpmEvent) if err := syscall.Exec(args[0], args, os.Environ()); err != nil { log.Fatalf("error execing init: %v", err) } }
// stage1 is the systemd-flavor stage1 entry point: it loads the pod,
// configures networking and cgroups, and finally execs the stage1 init
// (systemd) with the lock fd kept open across exec. Returns a process
// exit code on the non-exec paths (254 on network-save failure, 0 is
// unreachable in practice since the final step execs or fatals).
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.FatalE("UUID is missing or malformed", err)
	}
	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.FatalE("failed to load pod", err)
	}
	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.FatalE("failed to get rkt lock fd", err)
	}
	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.FatalE("failed to set FD_CLOEXEC on rkt lock", err)
	}
	mirrorLocalZoneInfo(p.Root)
	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.FatalE("failed to get stage1 flavor", err)
	}
	// Set up contained networking if requested; otherwise fall back to the
	// host network (not allowed for the kvm flavor).
	var n *networking.Networking
	if netList.Contained() {
		fps, err := commonnet.ForwardedPorts(p.Manifest)
		if err != nil {
			log.FatalE("error initializing forwarding ports", err)
		}
		noDNS := dnsConfMode.Pairs["resolv"] != "default" // force ignore CNI DNS results
		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, noDNS, debug)
		if err != nil {
			log.FatalE("failed to setup network", err)
		}
		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 254
		}
		if len(mdsToken) > 0 {
			hostIP, err := n.GetForwardableNetHostIP()
			if err != nil {
				log.FatalE("failed to get default Host IP", err)
			}
			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Fatal("flavor kvm requires private network configuration (try --net)")
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}
	insecureOptions := stage1initcommon.Stage1InsecureOptions{
		DisablePaths:        disablePaths,
		DisableCapabilities: disableCapabilities,
		DisableSeccomp:      disableSeccomp,
	}
	// mnt logs every mount/unmount through diag for debuggability.
	mnt := fs.NewLoggingMounter(
		fs.MounterFunc(syscall.Mount),
		fs.UnmounterFunc(syscall.Unmount),
		diag.Printf,
	)
	if dnsConfMode.Pairs["resolv"] == "host" {
		stage1initcommon.UseHostResolv(mnt, root)
	}
	if dnsConfMode.Pairs["hosts"] == "host" {
		stage1initcommon.UseHostHosts(mnt, root)
	}
	if mutable {
		if err = stage1initcommon.MutableEnv(p); err != nil {
			log.FatalE("cannot initialize mutable environment", err)
		}
	} else {
		if err = stage1initcommon.ImmutableEnv(p, interactive, privateUsers, insecureOptions); err != nil {
			log.FatalE("cannot initialize immutable environment", err)
		}
	}
	// Journal ACLs are best-effort: failure is logged but not fatal.
	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}
	if flavor == "kvm" {
		kvm.InitDebug(debug)
		if err := KvmNetworkingToSystemd(p, n); err != nil {
			log.FatalE("failed to configure systemd for kvm", err)
		}
	}
	canMachinedRegister := false
	if flavor != "kvm" {
		// kvm doesn't register with systemd right now, see #2664.
		canMachinedRegister = machinedRegister()
	}
	diag.Printf("canMachinedRegister %t", canMachinedRegister)
	args, env, err := getArgsEnv(p, flavor, canMachinedRegister, debug, n, insecureOptions)
	if err != nil {
		log.FatalE("cannot get environment", err)
	}
	diag.Printf("args %q", args)
	diag.Printf("env %q", env)
	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}
	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}
	unifiedCgroup, err := cgroup.IsCgroupUnified("/")
	if err != nil {
		log.FatalE("error determining cgroup version", err)
	}
	diag.Printf("unifiedCgroup %t", unifiedCgroup)
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)
	subcgroup, err := getContainerSubCgroup(machineID, canMachinedRegister, unifiedCgroup)
	if err != nil {
		log.FatalE("error getting container subcgroup", err)
	}
	diag.Printf("subcgroup %q", subcgroup)
	// Record the pod's subcgroup path so other rkt commands can find it.
	if err := ioutil.WriteFile(filepath.Join(p.Root, "subcgroup"), []byte(fmt.Sprintf("%s", subcgroup)), 0644); err != nil {
		log.FatalE("cannot write subcgroup file", err)
	}
	// Legacy cgroup v1 hierarchy needs the controllers mounted by hand.
	if !unifiedCgroup {
		enabledCgroups, err := v1.GetEnabledCgroups()
		if err != nil {
			log.FatalE("error getting v1 cgroups", err)
		}
		diag.Printf("enabledCgroups %q", enabledCgroups)
		if err := mountHostV1Cgroups(mnt, enabledCgroups); err != nil {
			log.FatalE("couldn't mount the host v1 cgroups", err)
		}
		if !canMachinedRegister {
			if err := v1.JoinSubcgroup("systemd", subcgroup); err != nil {
				log.FatalE(fmt.Sprintf("error joining subcgroup %q", subcgroup), err)
			}
		}
		var serviceNames []string
		for _, app := range p.Manifest.Apps {
			serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
		}
		diag.Printf("serviceNames %q", serviceNames)
		if err := mountContainerV1Cgroups(mnt, s1Root, enabledCgroups, subcgroup, serviceNames, insecureOptions); err != nil {
			log.FatalE("couldn't mount the container v1 cgroups", err)
		}
	}
	// KVM flavor has a bit different logic in handling pid vs ppid, for details look into #2389
	// it doesn't require the existence of a "ppid", instead it registers the current pid (which
	// will be reused by lkvm binary) as a pod process pid used during entering
	pid_filename := "ppid"
	if flavor == "kvm" {
		pid_filename = "pid"
	}
	if err = stage1common.WritePid(os.Getpid(), pid_filename); err != nil {
		log.FatalE("error writing pid", err)
	}
	if flavor == "kvm" {
		if err := KvmPrepareMounts(s1Root, p); err != nil {
			log.FatalE("error preparing mounts", err)
		}
	}
	// Clear FD_CLOEXEC on the lock fd only for the duration of the exec so
	// the pod keeps the rkt lock held.
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.FatalE(fmt.Sprintf("failed to execute %q", args[0]), err)
	}
	return 0
}
// stage1 is the fly-flavor entry point: it runs the pod's single app
// directly on the host via bind mounts + chroot (no container manager).
// Returns a process exit code; on success it execs the app and never
// returns.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.Print("UUID is missing or malformed\n")
		return 1
	}
	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("can't load pod", err)
		return 1
	}
	// Sanity checks
	if len(p.Manifest.Apps) != 1 {
		log.Printf("flavor %q only supports 1 application per Pod for now", flavor)
		return 1
	}
	ra := p.Manifest.Apps[0]
	imgName := p.AppNameToImageName(ra.Name)
	args := ra.App.Exec
	if len(args) == 0 {
		log.Printf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName)
		return 1
	}
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("can't get rkt lock fd", err)
		return 1
	}
	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed after execution is finished
	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("can't set FD_CLOEXEC on rkt lock", err)
		return 1
	}
	workDir := "/"
	if ra.App.WorkingDirectory != "" {
		workDir = ra.App.WorkingDirectory
	}
	// Minimal environment: a sane PATH plus whatever the image declares.
	env := []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}
	for _, e := range ra.App.Environment {
		env = append(env, e.Name+"="+e.Value)
	}
	rfs := filepath.Join(common.AppPath(p.Root, ra.Name), "rootfs")
	if err := copyResolv(p); err != nil {
		log.PrintE("can't copy /etc/resolv.conf", err)
		return 1
	}
	argFlyMounts, err := evaluateMounts(rfs, string(ra.Name), p)
	if err != nil {
		log.PrintE("can't evaluate mounts", err)
		return 1
	}
	// Baseline /dev, /proc, /sys and a fresh /tmp, then any user-requested
	// volumes appended after them.
	effectiveMounts := append(
		[]flyMount{
			{"", "", "/dev", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/dev", rfs, "/dev", "none", syscall.MS_BIND | syscall.MS_REC},
			{"", "", "/proc", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/proc", rfs, "/proc", "none", syscall.MS_BIND | syscall.MS_REC},
			{"", "", "/sys", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/sys", rfs, "/sys", "none", syscall.MS_BIND | syscall.MS_REC},
			{"tmpfs", rfs, "/tmp", "tmpfs", 0},
		},
		argFlyMounts...,
	)
	for _, mount := range effectiveMounts {
		var (
			err            error
			hostPathInfo   os.FileInfo
			targetPathInfo os.FileInfo
		)
		// Only absolute host paths are stat'd; pseudo-sources like "tmpfs"
		// leave hostPathInfo nil.
		if strings.HasPrefix(mount.HostPath, "/") {
			if hostPathInfo, err = os.Stat(mount.HostPath); err != nil {
				log.PrintE(fmt.Sprintf("stat of host path %s", mount.HostPath), err)
				return 1
			}
		} else {
			hostPathInfo = nil
		}
		absTargetPath := filepath.Join(mount.TargetPrefixPath, mount.RelTargetPath)
		if targetPathInfo, err = os.Stat(absTargetPath); err != nil && !os.IsNotExist(err) {
			log.PrintE(fmt.Sprintf("stat of target path %s", absTargetPath), err)
			return 1
		}
		switch {
		case targetPathInfo == nil:
			// Target doesn't exist yet: create a directory or an empty
			// file matching the host side's type.
			absTargetPathParent, _ := filepath.Split(absTargetPath)
			if err := os.MkdirAll(absTargetPathParent, 0755); err != nil {
				log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err)
				return 1
			}
			switch {
			case hostPathInfo == nil || hostPathInfo.IsDir():
				if err := os.Mkdir(absTargetPath, 0755); err != nil {
					log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err)
					return 1
				}
			case !hostPathInfo.IsDir():
				file, err := os.OpenFile(absTargetPath, os.O_CREATE, 0700)
				if err != nil {
					log.PrintE(fmt.Sprintf("can't create file %q", absTargetPath), err)
					return 1
				}
				file.Close()
			}
		case hostPathInfo != nil:
			// Both exist: refuse to bind a dir onto a file or vice versa.
			switch {
			case hostPathInfo.IsDir() && !targetPathInfo.IsDir():
				log.Printf("can't mount because %q is a directory while %q is not", mount.HostPath, absTargetPath)
				return 1
			case !hostPathInfo.IsDir() && targetPathInfo.IsDir():
				log.Printf("can't mount because %q is not a directory while %q is", mount.HostPath, absTargetPath)
				return 1
			}
		}
		if err := syscall.Mount(mount.HostPath, absTargetPath, mount.Fs, mount.Flags, ""); err != nil {
			log.PrintE(fmt.Sprintf("can't mount %q on %q with flags %v", mount.HostPath, absTargetPath, mount.Flags), err)
			return 1
		}
	}
	if err = stage1common.WritePid(os.Getpid(), "pid"); err != nil {
		log.Error(err)
		return 1
	}
	// Resolve user/group: try numeric ids first, then look them up from
	// the rootfs' passwd/group data.
	var uidResolver, gidResolver user.Resolver
	var uid, gid int
	uidResolver, err = user.NumericIDs(ra.App.User)
	if err != nil {
		uidResolver, err = user.IDsFromStat(rfs, ra.App.User, nil)
	}
	if err != nil { // give up
		log.PrintE(fmt.Sprintf("invalid user %q", ra.App.User), err)
		return 1
	}
	if uid, _, err = uidResolver.IDs(); err != nil {
		log.PrintE(fmt.Sprintf("failed to configure user %q", ra.App.User), err)
		return 1
	}
	gidResolver, err = user.NumericIDs(ra.App.Group)
	if err != nil {
		gidResolver, err = user.IDsFromStat(rfs, ra.App.Group, nil)
	}
	if err != nil { // give up
		log.PrintE(fmt.Sprintf("invalid group %q", ra.App.Group), err)
		return 1
	}
	if _, gid, err = gidResolver.IDs(); err != nil {
		log.PrintE(fmt.Sprintf("failed to configure group %q", ra.App.Group), err)
		return 1
	}
	diag.Printf("chroot to %q", rfs)
	if err := syscall.Chroot(rfs); err != nil {
		log.PrintE("can't chroot", err)
		return 1
	}
	if err := os.Chdir(workDir); err != nil {
		log.PrintE(fmt.Sprintf("can't change to working directory %q", workDir), err)
		return 1
	}
	// lock the current goroutine to its current OS thread.
	// This will force the subsequent syscalls to be executed in the same OS thread as Setresuid, and Setresgid,
	// see https://github.com/golang/go/issues/1435#issuecomment-66054163.
	runtime.LockOSThread()
	diag.Printf("setting uid %d gid %d", uid, gid)
	// Drop group first; once uid is dropped we may lack permission to
	// change gid.
	if err := syscall.Setresgid(gid, gid, gid); err != nil {
		log.PrintE(fmt.Sprintf("can't set gid %d", gid), err)
		return 1
	}
	if err := syscall.Setresuid(uid, uid, uid); err != nil {
		log.PrintE(fmt.Sprintf("can't set uid %d", uid), err)
		return 1
	}
	diag.Printf("execing %q in %q", args, rfs)
	// Keep the rkt lock fd open across the exec of the app binary.
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("can't execute %q", args[0]), err)
		return 1
	}
	return 0
}
// stage1 is an (older) stage1 init entry point: it loads the pod, sets
// up optional private networking, generates systemd units, mounts a
// private mount namespace and cgroups, then execs the stage1 init.
// Returns distinct non-zero exit codes per failure class (1 load, 2
// systemd config, 3 flavor/args, 4 ppid, 5 cgroups, 6 networking, 7
// exec); 0 is returned only if exec itself is never reached.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		fmt.Fprintln(os.Stderr, "UUID is missing or malformed")
		return 1
	}
	root := "."
	p, err := LoadPod(root, uuid)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err)
		return 1
	}
	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err)
		return 1
	}
	if err := sys.CloseOnExec(lfd, true); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err)
		return 1
	}
	mirrorLocalZoneInfo(p.Root)
	flavor, _, err := p.getFlavor()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err)
		return 3
	}
	// Private networking if requested; kvm cannot run without it.
	var n *networking.Networking
	if privNet.Any() {
		fps, err := forwardedPorts(p)
		if err != nil {
			fmt.Fprintln(os.Stderr, err.Error())
			return 6
		}
		n, err = networking.Setup(root, p.UUID, fps, privNet, localConfig, flavor)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err)
			return 6
		}
		if err = n.Save(); err != nil {
			fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err)
			n.Teardown(flavor)
			return 6
		}
		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err)
				return 6
			}
			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			fmt.Fprintf(os.Stderr, "Flavor kvm requires private network configuration (try --private-net).\n")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}
	if err = p.WritePrepareAppTemplate(); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err)
		return 2
	}
	if err = p.PodToSystemd(interactive, flavor); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err)
		return 2
	}
	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 3
	}
	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.Fatalf("error unsharing: %v", err)
	}
	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.Fatalf("error making / a slave mount: %v", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.Fatalf("error making / a shared and slave mount: %v", err)
	}
	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := p.GetMachineID()
	// Per-app cgroup isolators are best-effort: failure to find the
	// subcgroup only disables them rather than aborting the pod.
	subcgroup, err := getContainerSubCgroup(machineID)
	if err == nil {
		if err := cgroup.CreateCgroups(s1Root, subcgroup, serviceNames); err != nil {
			fmt.Fprintf(os.Stderr, "Error creating cgroups: %v\n", err)
			return 5
		}
	} else {
		fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err)
	}
	if err = writePpid(os.Getpid()); err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		return 4
	}
	// Keep the rkt lock fd open across the exec of stage1 init.
	err = withClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to execute %q: %v\n", args[0], err)
		return 7
	}
	return 0
}
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.Fatalf("error: %v", err) } // create a separate mount namespace so the cgroup filesystems and/or // overlay mounts are unmounted when exiting the pod if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { log.Fatalf("error unsharing: %v", err) } // we recursively make / a "shared and slave" so mount events from the // new namespace don't propagate to the host namespace but mount events // from the host propagate to the new namespace and are forwarded to // its peer group // See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil { log.Fatalf("error making / a slave mount: %v", err) } if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil { log.Fatalf("error making / a shared and slave mount: %v", err) } log.Printf("Setting up stage1") if err := setupStage1Image(cfg, cfg.Stage1Image, dir, useOverlay); err != nil { log.Fatalf("error setting up stage1: %v", err) } log.Printf("Wrote filesystem to %s\n", dir) for _, img := range cfg.Images { if err := setupAppImage(cfg, img, dir, useOverlay); err != nil { log.Fatalf("error setting up app image: %v", err) } } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.Fatalf("setting lock fd environment: %v", err) } log.Printf("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.Fatalf("failed changing to dir: %v", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.Fatalf("error determining init entrypoint: %v", err) } log.Printf("Execing %s", ep) args := []string{filepath.Join(common.Stage1RootfsPath(dir), ep)} if cfg.Debug { args = append(args, "--debug") } if 
cfg.PrivateNet.Any() { args = append(args, "--private-net="+cfg.PrivateNet.String()) } if cfg.Interactive { args = append(args, "--interactive") } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } if err := syscall.Exec(args[0], args, os.Environ()); err != nil { log.Fatalf("error execing init: %v", err) } }
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string, dataDir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.FatalE("error preparing overlay", err) } privateUsers, err := preparedWithPrivateUsers(dir) if err != nil { log.FatalE("error preparing private users", err) } debug("Setting up stage1") if err := setupStage1Image(cfg, dir, useOverlay); err != nil { log.FatalE("error setting up stage1", err) } debug("Wrote filesystem to %s\n", dir) for _, app := range cfg.Apps { if err := setupAppImage(cfg, app.Name, app.Image.ID, dir, useOverlay); err != nil { log.FatalE("error setting up app image", err) } } destRootfs := common.Stage1RootfsPath(dir) if len(cfg.DNS) > 0 || len(cfg.DNSSearch) > 0 || len(cfg.DNSOpt) > 0 { addResolvConf(cfg, destRootfs) } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.FatalE("setting lock fd environment", err) } if err := os.Setenv(common.EnvSELinuxContext, fmt.Sprintf("%v", cfg.ProcessLabel)); err != nil { log.FatalE("setting SELinux context environment", err) } if err := os.Setenv(common.EnvSELinuxMountContext, fmt.Sprintf("%v", cfg.MountLabel)); err != nil { log.FatalE("setting SELinux mount context enviroment", err) } debug("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.FatalE("failed changing to dir", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.FatalE("error determining 'run' entrypoint", err) } args := []string{filepath.Join(destRootfs, ep)} debug("Execing %s", ep) if cfg.Debug { args = append(args, "--debug") } args = append(args, "--net="+cfg.Net.String()) if cfg.Interactive { args = append(args, "--interactive") } if len(privateUsers) > 0 { args = append(args, "--private-users="+privateUsers) } if cfg.MDSRegister { mdsToken, err := registerPod(".", cfg.UUID, cfg.Apps) if err != nil { 
log.FatalE("failed to register the pod", err) } args = append(args, "--mds-token="+mdsToken) } if cfg.LocalConfig != "" { args = append(args, "--local-config="+cfg.LocalConfig) } s1v, err := getStage1InterfaceVersion(dir) if err != nil { log.FatalE("error determining stage1 interface version", err) } if cfg.Hostname != "" { if interfaceVersionSupportsHostname(s1v) { args = append(args, "--hostname="+cfg.Hostname) } else { log.Printf("warning: --hostname option is not supported by stage1") } } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } tpmEvent := fmt.Sprintf("rkt: Rootfs: %s Manifest: %s Stage1 args: %s", cfg.CommonConfig.RootHash, cfg.CommonConfig.ManifestData, strings.Join(args, " ")) // If there's no TPM available or there's a failure for some other // reason, ignore it and continue anyway. Long term we'll want policy // that enforces TPM behaviour, but we don't have any infrastructure // around that yet. _ = tpm.Extend(tpmEvent) if err := syscall.Exec(args[0], args, os.Environ()); err != nil { log.FatalE("error execing init", err) } }
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string, dataDir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.Fatalf("error: %v", err) } privateUsers, err := preparedWithPrivateUsers(dir) if err != nil { log.Fatalf("error: %v", err) } debug("Setting up stage1") if err := setupStage1Image(cfg, dir, useOverlay); err != nil { log.Fatalf("error setting up stage1: %v", err) } debug("Wrote filesystem to %s\n", dir) for _, app := range cfg.Apps { if err := setupAppImage(cfg, app.Name, app.Image.ID, dir, useOverlay); err != nil { log.Fatalf("error setting up app image: %v", err) } } destRootfs := common.Stage1RootfsPath(dir) flavor, err := os.Readlink(filepath.Join(destRootfs, "flavor")) if err != nil { log.Printf("error reading flavor: %v\n", err) } if flavor == "kvm" { err := kvmCheckSSHSetup(destRootfs, dataDir) if err != nil { log.Fatalf("error setting up ssh keys: %v", err) } } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.Fatalf("setting lock fd environment: %v", err) } if err := os.Setenv(common.EnvSELinuxContext, fmt.Sprintf("%v", cfg.ProcessLabel)); err != nil { log.Fatalf("setting SELinux context environment: %v", err) } debug("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.Fatalf("failed changing to dir: %v", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.Fatalf("error determining 'run' entrypoint: %v", err) } args := []string{filepath.Join(destRootfs, ep)} debug("Execing %s", ep) if cfg.Debug { args = append(args, "--debug") } args = append(args, "--net="+cfg.Net.String()) if cfg.Interactive { args = append(args, "--interactive") } if len(privateUsers) > 0 { args = append(args, "--private-users="+privateUsers) } if cfg.MDSRegister { mdsToken, err := registerPod(".", cfg.UUID, cfg.Apps) if err != nil { log.Fatalf("failed 
to register the pod: %v", err) } args = append(args, "--mds-token="+mdsToken) } if cfg.LocalConfig != "" { args = append(args, "--local-config="+cfg.LocalConfig) } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } if err := syscall.Exec(args[0], args, os.Environ()); err != nil { log.Fatalf("error execing init: %v", err) } }
func stage1(rp *stage1commontypes.RuntimePod) int { uuid, err := types.NewUUID(flag.Arg(0)) if err != nil { log.Print("UUID is missing or malformed\n") return 254 } root := "." p, err := stage1commontypes.LoadPod(root, uuid, rp) if err != nil { log.PrintE("can't load pod", err) return 254 } if err := p.SaveRuntime(); err != nil { log.FatalE("failed to save runtime parameters", err) } // Sanity checks if len(p.Manifest.Apps) != 1 { log.Printf("flavor %q only supports 1 application per Pod for now", flavor) return 254 } ra := p.Manifest.Apps[0] imgName := p.AppNameToImageName(ra.Name) args := ra.App.Exec if len(args) == 0 { log.Printf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName) return 254 } lfd, err := common.GetRktLockFD() if err != nil { log.PrintE("can't get rkt lock fd", err) return 254 } // set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed after execution is finished if err := sys.CloseOnExec(lfd, true); err != nil { log.PrintE("can't set FD_CLOEXEC on rkt lock", err) return 254 } workDir := "/" if ra.App.WorkingDirectory != "" { workDir = ra.App.WorkingDirectory } env := []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"} for _, e := range ra.App.Environment { env = append(env, e.Name+"="+e.Value) } rfs := filepath.Join(common.AppPath(p.Root, ra.Name), "rootfs") argFlyMounts, err := evaluateMounts(rfs, string(ra.Name), p) if err != nil { log.PrintE("can't evaluate mounts", err) return 254 } effectiveMounts := append( []flyMount{ {"", "", "/dev", "none", syscall.MS_REC | syscall.MS_SHARED}, {"/dev", rfs, "/dev", "none", syscall.MS_BIND | syscall.MS_REC}, {"", "", "/proc", "none", syscall.MS_REC | syscall.MS_SHARED}, {"/proc", rfs, "/proc", "none", syscall.MS_BIND | syscall.MS_REC}, {"", "", "/sys", "none", syscall.MS_REC | syscall.MS_SHARED}, {"/sys", rfs, "/sys", "none", syscall.MS_BIND | syscall.MS_REC}, {"tmpfs", rfs, "/tmp", "tmpfs", 0}, }, argFlyMounts..., ) /* Process DNS config files * * 
/etc/resolv.conf: four modes * 'host' - bind-mount host's file * 'stage0' - bind-mount the file created by stage0 * 'default' - do nothing (we would respect CNI if fly had networking) * 'none' - do nothing */ switch p.ResolvConfMode { case "host": effectiveMounts = append(effectiveMounts, flyMount{"/etc/resolv.conf", rfs, "/etc/resolv.conf", "none", syscall.MS_BIND | syscall.MS_RDONLY}) case "stage0": if err := copyResolv(p); err != nil { log.PrintE("can't copy /etc/resolv.conf", err) return 254 } } /* * /etc/hosts: three modes: * 'host' - bind-mount hosts's file * 'stage0' - bind mount the file created by stage1 * 'default' - create a stub /etc/hosts if needed */ switch p.EtcHostsMode { case "host": effectiveMounts = append(effectiveMounts, flyMount{"/etc/hosts", rfs, "/etc/hosts", "none", syscall.MS_BIND | syscall.MS_RDONLY}) case "stage0": effectiveMounts = append(effectiveMounts, flyMount{ filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "rkt-hosts"), rfs, "/etc/hosts", "none", syscall.MS_BIND | syscall.MS_RDONLY}) case "default": stage2HostsPath := filepath.Join(common.AppRootfsPath(p.Root, ra.Name), "etc", "hosts") if _, err := os.Stat(stage2HostsPath); err != nil && os.IsNotExist(err) { fallbackHosts := []byte("127.0.0.1 localhost localdomain\n") ioutil.WriteFile(stage2HostsPath, fallbackHosts, 0644) } } for _, mount := range effectiveMounts { diag.Printf("Processing %+v", mount) var ( err error hostPathInfo os.FileInfo targetPathInfo os.FileInfo ) if strings.HasPrefix(mount.HostPath, "/") { if hostPathInfo, err = os.Stat(mount.HostPath); err != nil { log.PrintE(fmt.Sprintf("stat of host path %s", mount.HostPath), err) return 254 } } else { hostPathInfo = nil } absTargetPath := filepath.Join(mount.TargetPrefixPath, mount.RelTargetPath) if targetPathInfo, err = os.Stat(absTargetPath); err != nil && !os.IsNotExist(err) { log.PrintE(fmt.Sprintf("stat of target path %s", absTargetPath), err) return 254 } switch { case (mount.Flags & syscall.MS_REMOUNT) != 
0: { diag.Printf("don't attempt to create files for remount of %q", absTargetPath) } case targetPathInfo == nil: absTargetPathParent, _ := filepath.Split(absTargetPath) if err := os.MkdirAll(absTargetPathParent, 0755); err != nil { log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err) return 254 } switch { case hostPathInfo == nil || hostPathInfo.IsDir(): if err := os.Mkdir(absTargetPath, 0755); err != nil { log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err) return 254 } case !hostPathInfo.IsDir(): file, err := os.OpenFile(absTargetPath, os.O_CREATE, 0700) if err != nil { log.PrintE(fmt.Sprintf("can't create file %q", absTargetPath), err) return 254 } file.Close() } case hostPathInfo != nil: switch { case hostPathInfo.IsDir() && !targetPathInfo.IsDir(): log.Printf("can't mount because %q is a directory while %q is not", mount.HostPath, absTargetPath) return 254 case !hostPathInfo.IsDir() && targetPathInfo.IsDir(): log.Printf("can't mount because %q is not a directory while %q is", mount.HostPath, absTargetPath) return 254 } } if err := syscall.Mount(mount.HostPath, absTargetPath, mount.Fs, mount.Flags, ""); err != nil { log.PrintE(fmt.Sprintf("can't mount %q on %q with flags %v", mount.HostPath, absTargetPath, mount.Flags), err) return 254 } } if err = stage1common.WritePid(os.Getpid(), "pid"); err != nil { log.Error(err) return 254 } var uidResolver, gidResolver user.Resolver var uid, gid int uidResolver, err = user.NumericIDs(ra.App.User) if err != nil { uidResolver, err = user.IDsFromStat(rfs, ra.App.User, nil) } if err != nil { // give up log.PrintE(fmt.Sprintf("invalid user %q", ra.App.User), err) return 254 } if uid, _, err = uidResolver.IDs(); err != nil { log.PrintE(fmt.Sprintf("failed to configure user %q", ra.App.User), err) return 254 } gidResolver, err = user.NumericIDs(ra.App.Group) if err != nil { gidResolver, err = user.IDsFromStat(rfs, ra.App.Group, nil) } if err != nil { // give up 
log.PrintE(fmt.Sprintf("invalid group %q", ra.App.Group), err) return 254 } if _, gid, err = gidResolver.IDs(); err != nil { log.PrintE(fmt.Sprintf("failed to configure group %q", ra.App.Group), err) return 254 } diag.Printf("chroot to %q", rfs) if err := syscall.Chroot(rfs); err != nil { log.PrintE("can't chroot", err) return 254 } if err := os.Chdir(workDir); err != nil { log.PrintE(fmt.Sprintf("can't change to working directory %q", workDir), err) return 254 } // lock the current goroutine to its current OS thread. // This will force the subsequent syscalls to be executed in the same OS thread as Setresuid, and Setresgid, // see https://github.com/golang/go/issues/1435#issuecomment-66054163. runtime.LockOSThread() diag.Printf("setting uid %d gid %d", uid, gid) if err := syscall.Setresgid(gid, gid, gid); err != nil { log.PrintE(fmt.Sprintf("can't set gid %d", gid), err) return 254 } if err := syscall.Setresuid(uid, uid, uid); err != nil { log.PrintE(fmt.Sprintf("can't set uid %d", uid), err) return 254 } diag.Printf("execing %q in %q", args, rfs) err = stage1common.WithClearedCloExec(lfd, func() error { return syscall.Exec(args[0], args, env) }) if err != nil { log.PrintE(fmt.Sprintf("can't execute %q", args[0]), err) return 254 } return 0 }
func stage1() int { uuid, err := types.NewUUID(flag.Arg(0)) if err != nil { fmt.Fprintln(os.Stderr, "UUID is missing or malformed") return 1 } root := "." p, err := LoadPod(root, uuid) if err != nil { fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err) return 1 } // set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking // network plugins lfd, err := common.GetRktLockFD() if err != nil { fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err) return 1 } if err := sys.CloseOnExec(lfd, true); err != nil { fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err) return 1 } mirrorLocalZoneInfo(p.Root) if privNet { fps, err := forwardedPorts(p) if err != nil { fmt.Fprintln(os.Stderr, err.Error()) return 6 } n, err := networking.Setup(root, p.UUID, fps) if err != nil { fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err) return 6 } defer n.Teardown() if err = n.Save(); err != nil { fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err) return 6 } p.MetadataServiceURL = common.MetadataServicePublicURL(n.GetDefaultHostIP()) if err = registerPod(p, n.GetDefaultIP()); err != nil { fmt.Fprintf(os.Stderr, "Failed to register pod: %v\n", err) return 6 } defer unregisterPod(p) } if err = p.PodToSystemd(interactive); err != nil { fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err) return 2 } args, env, err := getArgsEnv(p, debug) if err != nil { fmt.Fprintf(os.Stderr, "Failed to get execution parameters: %v\n", err) return 3 } var execFn func() error if privNet { cmd := exec.Cmd{ Path: args[0], Args: args, Stdin: os.Stdin, Stdout: os.Stdout, Stderr: os.Stderr, Env: env, } execFn = cmd.Run } else { execFn = func() error { return syscall.Exec(args[0], args, env) } } err = withClearedCloExec(lfd, execFn) if err != nil { fmt.Fprintf(os.Stderr, "Failed to execute nspawn: %v\n", err) return 5 } return 0 }
// stage1 performs the stage1 "init" work for the coreos/kvm flavors:
// load the pod, configure networking when requested, generate systemd
// units, create a private mount namespace with host/container cgroups
// mounted, and finally exec the flavor's init binary.
// Returns a process exit status; some failure paths call log.FatalE,
// which presumably terminates the process — TODO confirm.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.PrintE("UUID is missing or malformed", err)
		return 1
	}

	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("failed to load pod", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("failed to get rkt lock fd", err)
		return 1
	}
	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("failed to set FD_CLOEXEC on rkt lock", err)
		return 1
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.PrintE("failed to get stage1 flavor", err)
		return 3
	}

	var n *networking.Networking
	if netList.Contained() {
		fps, err := forwardedPorts(p)
		if err != nil {
			log.Error(err)
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, debug)
		if err != nil {
			log.PrintE("failed to setup network", err)
			return 6
		}

		if err = n.Save(); err != nil {
			// undo what Setup created before bailing out
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				log.PrintE("failed to get default Host IP", err)
				return 6
			}
			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Print("flavor kvm requires private network configuration (try --net)")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = stage1initcommon.WriteDefaultTarget(p); err != nil {
		log.PrintE("failed to write default.target", err)
		return 2
	}

	if err = stage1initcommon.WritePrepareAppTemplate(p); err != nil {
		log.PrintE("failed to write prepare-app service template", err)
		return 2
	}

	// non-fatal: without the ACLs the pod journal just needs root to read
	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		if err := KvmPodToSystemd(p, n); err != nil {
			log.PrintE("failed to configure systemd for kvm", err)
			return 2
		}
	}

	if err = stage1initcommon.PodToSystemd(p, interactive, flavor, privateUsers); err != nil {
		log.PrintE("failed to configure systemd", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		log.Error(err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	enabledCgroups, err := cgroup.GetEnabledCgroups()
	if err != nil {
		log.FatalE("error getting cgroups", err)
		return 5
	}

	// mount host cgroups in the rkt mount namespace
	if err := mountHostCgroups(enabledCgroups); err != nil {
		log.FatalE("couldn't mount the host cgroups", err)
		return 5
	}

	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)
	subcgroup, err := getContainerSubCgroup(machineID)
	if err == nil {
		if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
			log.PrintE("couldn't mount the container cgroups", err)
			return 5
		}
	} else {
		// best-effort: keep running without per-app isolators rather than fail
		log.PrintE("continuing with per-app isolators disabled", err)
	}

	if err = stage1common.WritePpid(os.Getpid()); err != nil {
		log.Error(err)
		return 4
	}

	// exec the stage1 init with the rkt lock fd inherited (FD_CLOEXEC is
	// temporarily cleared by WithClearedCloExec)
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("failed to execute %q", args[0]), err)
		return 7
	}

	return 0
}
// stage1 runs the single app of a "fly" pod: it bind-mounts host /dev,
// /proc and /sys (plus user-supplied mounts) into the app rootfs,
// chroots into it and execs the app binary.
// Returns a process exit status.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.Print("UUID is missing or malformed\n")
		return 1
	}

	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("can't load pod", err)
		return 1
	}

	// the fly flavor handles exactly one app per pod
	if len(p.Manifest.Apps) != 1 {
		log.Printf("flavor %q only supports 1 application per Pod for now", flavor)
		return 1
	}

	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("can't get rkt lock fd", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed after execution is finished
	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("can't set FD_CLOEXEC on rkt lock", err)
		return 1
	}

	// minimal environment: a fixed PATH plus the app's declared variables
	env := []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}
	for _, e := range p.Manifest.Apps[0].App.Environment {
		env = append(env, e.Name+"="+e.Value)
	}

	args := p.Manifest.Apps[0].App.Exec
	rfs := filepath.Join(common.AppPath(p.Root, p.Manifest.Apps[0].Name), "rootfs")

	argFlyMounts, err := evaluateMounts(rfs, string(p.Manifest.Apps[0].Name), p)
	if err != nil {
		log.PrintE("can't evaluate mounts", err)
		return 1
	}

	// host /dev, /proc and /sys are made recursively shared, then bind-mounted
	// into the app rootfs; /tmp gets a fresh tmpfs; user mounts come last
	effectiveMounts := append(
		[]flyMount{
			{"", "", "/dev", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/dev", rfs, "/dev", "none", syscall.MS_BIND | syscall.MS_REC},
			{"", "", "/proc", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/proc", rfs, "/proc", "none", syscall.MS_BIND | syscall.MS_REC},
			{"", "", "/sys", "none", syscall.MS_REC | syscall.MS_SHARED},
			{"/sys", rfs, "/sys", "none", syscall.MS_BIND | syscall.MS_REC},
			{"tmpfs", rfs, "/tmp", "tmpfs", 0},
		},
		argFlyMounts...,
	)

	for _, mount := range effectiveMounts {
		var (
			err            error
			hostPathInfo   os.FileInfo
			targetPathInfo os.FileInfo
		)

		// absolute host paths must exist; relative sources (e.g. "tmpfs")
		// are filesystem types, not paths
		if strings.HasPrefix(mount.HostPath, "/") {
			if hostPathInfo, err = os.Stat(mount.HostPath); err != nil {
				log.PrintE(fmt.Sprintf("stat of host directory %s", mount.HostPath), err)
				return 1
			}
		} else {
			hostPathInfo = nil
		}

		absTargetPath := filepath.Join(mount.TargetPrefixPath, mount.RelTargetPath)
		if targetPathInfo, err = os.Stat(absTargetPath); err != nil && !os.IsNotExist(err) {
			log.PrintE(fmt.Sprintf("stat of target directory %s", absTargetPath), err)
			return 1
		}

		switch {
		case targetPathInfo == nil:
			// target missing: create a directory or an empty file matching
			// the host side's type
			absTargetPathParent, _ := filepath.Split(absTargetPath)
			if err := os.MkdirAll(absTargetPathParent, 0700); err != nil {
				log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err)
				return 1
			}
			switch {
			case hostPathInfo == nil || hostPathInfo.IsDir():
				if err := os.Mkdir(absTargetPath, 0700); err != nil {
					log.PrintE(fmt.Sprintf("can't create directory %q", absTargetPath), err)
					return 1
				}
			case !hostPathInfo.IsDir():
				file, err := os.OpenFile(absTargetPath, os.O_CREATE, 0700)
				if err != nil {
					log.PrintE(fmt.Sprintf("can't create file %q", absTargetPath), err)
					return 1
				}
				file.Close()
			}
		case hostPathInfo != nil:
			// both exist: directory/file type must agree
			switch {
			case hostPathInfo.IsDir() && !targetPathInfo.IsDir():
				log.Printf("can't mount because %q is a directory while %q is not", mount.HostPath, absTargetPath)
				return 1
			case !hostPathInfo.IsDir() && targetPathInfo.IsDir():
				log.Printf("can't mount because %q is not a directory while %q is", mount.HostPath, absTargetPath)
				return 1
			}
		}

		if err := syscall.Mount(mount.HostPath, absTargetPath, mount.Fs, mount.Flags, ""); err != nil {
			log.PrintE(fmt.Sprintf("can't mount %q on %q with flags %v", mount.HostPath, absTargetPath, mount.Flags), err)
			return 1
		}
	}

	if err = stage1common.WritePpid(os.Getpid()); err != nil {
		log.Error(err)
		return 4
	}

	diag.Printf("chroot to %q", rfs)
	if err := syscall.Chroot(rfs); err != nil {
		log.PrintE("can't chroot", err)
		return 1
	}

	if err := os.Chdir("/"); err != nil {
		log.PrintE("can't change to root new directory", err)
		return 1
	}

	diag.Printf("execing %q in %q", args, rfs)
	// clear FD_CLOEXEC on the rkt lock only for the duration of the exec
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("can't execute %q", args[0]), err)
		return 7
	}

	return 0
}
func stage1() int { uuid, err := types.NewUUID(flag.Arg(0)) if err != nil { fmt.Fprintln(os.Stderr, "UUID is missing or malformed") return 1 } root := "." p, err := LoadPod(root, uuid) if err != nil { fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err) return 1 } // set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking // network plugins lfd, err := common.GetRktLockFD() if err != nil { fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err) return 1 } if err := sys.CloseOnExec(lfd, true); err != nil { fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err) return 1 } mirrorLocalZoneInfo(p.Root) if privNet.Any() { fps, err := forwardedPorts(p) if err != nil { fmt.Fprintln(os.Stderr, err.Error()) return 6 } n, err := networking.Setup(root, p.UUID, fps, privNet) if err != nil { fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err) return 6 } defer n.Teardown() if err = n.Save(); err != nil { fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err) return 6 } hostIP, err := n.GetDefaultHostIP() if err != nil { fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err) return 6 } mdsToken, err := generateMDSToken() if err != nil { fmt.Fprintf(os.Stderr, "Failed to generate MDS token: %v", err) return 8 } p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken) if err = registerPod(p, mdsToken); err != nil { fmt.Fprintf(os.Stderr, "Failed to register pod: %v\n", err) return 8 } defer unregisterPod(p) } flavor, systemdStage1Version, err := p.getFlavor() if err != nil { fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err) return 3 } if err = p.WritePrepareAppTemplate(systemdStage1Version); err != nil { fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err) return 2 } if err = p.PodToSystemd(interactive); err != nil { fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err) return 2 } args, env, err := getArgsEnv(p, flavor, 
systemdStage1Version, debug) if err != nil { fmt.Fprintf(os.Stderr, "%v\n", err) return 3 } appHashes := p.GetAppHashes() s1Root := common.Stage1RootfsPath(p.Root) machineID := p.GetMachineID() subcgroup, err := getContainerSubCgroup(machineID) if err == nil { if err := cgroup.CreateCgroups(s1Root, subcgroup, appHashes); err != nil { fmt.Fprintf(os.Stderr, "Error creating cgroups: %v\n", err) return 5 } } else { fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err) } var execFn func() error if privNet.Any() { cmd := exec.Cmd{ Path: args[0], Args: args, Stdin: os.Stdin, Stdout: os.Stdout, Stderr: os.Stderr, Env: env, } execFn = func() error { err = cmd.Start() if err != nil { return fmt.Errorf("Failed to start nspawn: %v\n", err) } if err = writePpid(cmd.Process.Pid); err != nil { return err } return cmd.Wait() } } else { if err = writePpid(os.Getpid()); err != nil { fmt.Fprintln(os.Stderr, err.Error()) return 4 } execFn = func() error { return syscall.Exec(args[0], args, env) } } err = withClearedCloExec(lfd, execFn) if err != nil { fmt.Fprintf(os.Stderr, "Failed to execute nspawn: %v\n", err) return 7 } return 0 }