func unshareAndBind(workingRootDir string) bool { if *unshare { // Re-exec myself using the unshare syscall while on a locked thread. // This hack is required because syscall.Unshare() operates on only one // thread in the process, and Go switches execution between threads // randomly. Thus, the namespace can be suddenly switched for running // code. This is an aspect of Go that was not well thought out. runtime.LockOSThread() err := syscall.Unshare(syscall.CLONE_NEWNS) if err != nil { fmt.Printf("Unable to unshare mount namesace\t%s\n", err) return false } args := append(os.Args, "-unshare=false") err = syscall.Exec(args[0], args, os.Environ()) if err != nil { fmt.Printf("Unable to Exec:%s\t%s\n", args[0], err) return false } } err := syscall.Mount("none", "/", "", syscall.MS_REC|syscall.MS_PRIVATE, "") if err != nil { fmt.Printf("Unable to set mount sharing to private\t%s\n", err) return false } syscall.Unmount(workingRootDir, 0) err = syscall.Mount(*rootDir, workingRootDir, "", syscall.MS_BIND, "") if err != nil { fmt.Printf("Unable to bind mount %s to %s\t%s\n", *rootDir, workingRootDir, err) return false } return true }
func newNetNS() (hostNS, childNS *os.File, err error) { defer func() { if err != nil { if hostNS != nil { hostNS.Close() } if childNS != nil { childNS.Close() } } }() hostNS, err = os.Open(selfNetNS) if err != nil { return } if err = syscall.Unshare(syscall.CLONE_NEWNET); err != nil { return } childNS, err = os.Open(selfNetNS) if err != nil { ns.SetNS(hostNS, syscall.CLONE_NEWNET) return } return }
// enterPrivateMountNamespace does just that: the current mount ns is unshared (isolated) // and then made a slave to the root mount / of the parent mount ns (mount events from / // or its children that happen in the parent NS propagate to us). // // this is not yet compatible with volume plugins as implemented by the kubelet, which // depends on using host-volume args to 'docker run' to attach plugin volumes to CT's // at runtime. as such, docker needs to be able to see the volumes mounted by k8s plugins, // which is impossible if k8s volume plugins are running in an isolated mount ns. // // an alternative approach would be to always run the kubelet in the host's mount-ns and // rely upon mesos to forcibly umount bindings in the task sandbox before rmdir'ing it: // https://issues.apache.org/jira/browse/MESOS-349. // // use at your own risk. func enterPrivateMountNamespace() { log.Warningln("EXPERIMENTAL FEATURE: entering private mount ns") // enter a new mount NS, useful for isolating changes to the mount table // that are made by the kubelet for storage volumes. err := syscall.Unshare(syscall.CLONE_NEWNS) if err != nil { log.Fatalf("failed to enter private mount NS: %v", err) } // make the rootfs / rslave to the parent mount NS so that we // pick up on any changes made there err = syscall.Mount("", "/", "dontcare", syscall.MS_REC|syscall.MS_SLAVE, "") if err != nil { log.Fatalf("failed to mark / rslave: %v", err) } }
// SetupTestNetNS joins a new network namespace, and returns its associated
// teardown function.
//
// The calling goroutine is locked to its OS thread before unsharing; the
// returned function closes the namespace file descriptor and unlocks the
// thread. Any setup failure aborts the test via t.Fatalf.
//
// Example usage:
//
//	defer SetupTestNetNS(t)()
func SetupTestNetNS(t *testing.T) func() {
	runtime.LockOSThread()
	if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
		t.Fatalf("Failed to enter netns: %v", err)
	}

	fd, err := syscall.Open("/proc/self/ns/net", syscall.O_RDONLY, 0)
	if err != nil {
		// Report the underlying error so the failure can be diagnosed.
		t.Fatalf("Failed to open netns file: %v", err)
	}

	return func() {
		if err := syscall.Close(fd); err != nil {
			t.Logf("Warning: netns closing failed (%v)", err)
		}
		runtime.UnlockOSThread()
	}
}
// SetupTestOSContext joins a new network namespace, and returns its associated // teardown function. // // Example usage: // // defer SetupTestOSContext(t)() // func SetupTestOSContext(t *testing.T) func() { runtime.LockOSThread() if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil { t.Fatalf("Failed to enter netns: %v", err) } fd, err := syscall.Open("/proc/self/ns/net", syscall.O_RDONLY, 0) if err != nil { t.Fatal("Failed to open netns file") } // Since we are switching to a new test namespace make // sure to re-initialize initNs context ns.Init() return func() { if err := syscall.Close(fd); err != nil { t.Logf("Warning: netns closing failed (%v)", err) } runtime.UnlockOSThread() } }
func main() { flag.Usage = usage flag.Parse() if flag.NArg() < 1 { usage() } var flags int if *mflag { flags |= syscall.CLONE_NEWNS } if *uflag { flags |= syscall.CLONE_NEWUTS } if *iflag { flags |= syscall.CLONE_NEWIPC } if *nflag { flags |= syscall.CLONE_NEWNET } if *pflag { flags |= syscall.CLONE_NEWPID } if *Uflag { flags |= syscall.CLONE_NEWUSER } ck(syscall.Unshare(flags)) args := flag.Args() cmd := exec.Command(args[0], args[1:]...) cmd.Stdin = os.Stdin cmd.Stderr = os.Stderr cmd.Stdout = os.Stdout ck(cmd.Run()) }
func run(ctx *kingpin.ParseContext) error { mounts := getMounts() if len(mounts) == 0 { return errors.New("No suitable mounts found") } if !dbtm.StringInSlice(mounts, mount) { return errors.New(fmt.Sprintf("Mountpoint %s not found", mount)) } if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { return err } if err := syscall.Mount("none", mount, "none", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { return err } if err := syscall.Unmount(mount, syscall.MNT_DETACH); err != nil { return err } if f, err := os.Create(path.Join(mount, "junk")); err == nil { defer f.Close() w := bufio.NewWriter(f) defer w.Flush() for { if _, err := w.Write([]byte("junk")); err != nil { break } } return nil } else { return err } }
// Unshare forwards flags to syscall.Unshare and returns its error unchanged.
func Unshare(flags int) error {
	err := syscall.Unshare(flags)
	return err
}
// chroot on linux uses pivot_root instead of chroot
// pivot_root takes a new root and an old root.
// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
// New root is where the new rootfs is set to.
// Old root is removed after the call to pivot_root so it is no longer available under the new root.
// This is similar to how libcontainer sets up a container's rootfs
//
// path is the directory that becomes the new root. The returned error is the
// first failure encountered; a cleanup failure in the deferred function is
// reported only when no earlier error was recorded. If pivot_root itself
// fails, the function falls back to realChroot(path).
func chroot(path string) (err error) {
	// Create new mount namespace so mounts don't leak
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
	}

	// path must be a different fs for pivot_root, so bind-mount to itself to ensure this
	if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("Error mounting pivot dir before pivot: %v", err)
	}

	// setup oldRoot for pivot_root
	pivotDir, err := ioutil.TempDir(path, ".pivot_root")
	if err != nil {
		return fmt.Errorf("Error setting up pivot dir: %v", err)
	}

	// mounted is true only between a successful pivot_root and the successful
	// unmount of the old root; the deferred cleanup uses it to decide whether
	// pivotDir must be unmounted before it can be removed.
	var mounted bool
	defer func() {
		if mounted {
			// make sure pivotDir is not mounted before we try to remove it
			if errCleanup := syscall.Unmount(pivotDir, syscall.MNT_DETACH); errCleanup != nil {
				// Keep the original error if there is one; otherwise surface
				// the unmount failure through the named result.
				if err == nil {
					err = errCleanup
				}
				return
			}
		}

		errCleanup := os.Remove(pivotDir)
		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
		// but we already cleaned it up on failed pivot_root
		if errCleanup != nil && !os.IsNotExist(errCleanup) {
			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
			if err == nil {
				err = errCleanup
			}
		}
	}()

	if err := syscall.PivotRoot(path, pivotDir); err != nil {
		// If pivot fails, fall back to the normal chroot after cleaning up temp dir for pivot_root
		if err := os.Remove(pivotDir); err != nil {
			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
		}
		return realChroot(path)
	}
	mounted = true

	// This is the new path for where the old root (prior to the pivot) has been moved to
	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
	pivotDir = filepath.Join("/", filepath.Base(pivotDir))

	if err := syscall.Chdir("/"); err != nil {
		return fmt.Errorf("Error changing to new root: %v", err)
	}

	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
	if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("Error making old root private after pivot: %v", err)
	}

	// Now unmount the old root so it's no longer visible from the new root
	if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
	}
	// The old root is gone; the deferred cleanup only has to remove the directory.
	mounted = false

	return nil
}
// New creates a new network namespace and returns a handle to it. func New() (ns NsHandle, err error) { if err := syscall.Unshare(CLONE_NEWNET); err != nil { return -1, err } return Get() }
// stage1 loads the pod whose UUID is the first positional argument, sets up
// networking when netList.Contained() reports true, initializes the mutable or
// immutable environment, enters a new mount namespace, prepares cgroups, writes
// the pid file and finally execs args[0] as computed by getArgsEnv.
//
// It returns 254 when the networking state cannot be saved and 0 at the end of
// the function. All other failures are reported through log.FatalE/log.Fatal
// (presumably terminating the process — confirm against the log package).
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.FatalE("UUID is missing or malformed", err)
	}

	// The pod is loaded relative to the current working directory.
	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.FatalE("failed to load pod", err)
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.FatalE("failed to get rkt lock fd", err)
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.FatalE("failed to set FD_CLOEXEC on rkt lock", err)
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.FatalE("failed to get stage1 flavor", err)
	}

	// n stays nil when no network setup is requested.
	var n *networking.Networking
	if netList.Contained() {
		fps, err := commonnet.ForwardedPorts(p.Manifest)
		if err != nil {
			log.FatalE("error initializing forwarding ports", err)
		}

		noDNS := dnsConfMode.Pairs["resolv"] != "default" // force ignore CNI DNS results
		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, noDNS, debug)
		if err != nil {
			log.FatalE("failed to setup network", err)
		}

		// Saving the state is the one failure that tears the network down and
		// returns an exit code instead of going through log.FatalE.
		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 254
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetForwardableNetHostIP()
			if err != nil {
				log.FatalE("failed to get default Host IP", err)
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Fatal("flavor kvm requires private network configuration (try --net)")
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	insecureOptions := stage1initcommon.Stage1InsecureOptions{
		DisablePaths:        disablePaths,
		DisableCapabilities: disableCapabilities,
		DisableSeccomp:      disableSeccomp,
	}

	// mnt wraps syscall.Mount/Unmount and is given diag.Printf as its logger.
	mnt := fs.NewLoggingMounter(
		fs.MounterFunc(syscall.Mount),
		fs.UnmounterFunc(syscall.Unmount),
		diag.Printf,
	)

	if dnsConfMode.Pairs["resolv"] == "host" {
		stage1initcommon.UseHostResolv(mnt, root)
	}

	if dnsConfMode.Pairs["hosts"] == "host" {
		stage1initcommon.UseHostHosts(mnt, root)
	}

	if mutable {
		if err = stage1initcommon.MutableEnv(p); err != nil {
			log.FatalE("cannot initialize mutable environment", err)
		}
	} else {
		if err = stage1initcommon.ImmutableEnv(p, interactive, privateUsers, insecureOptions); err != nil {
			log.FatalE("cannot initialize immutable environment", err)
		}
	}

	// A failure here is only a warning; startup continues.
	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		kvm.InitDebug(debug)
		if err := KvmNetworkingToSystemd(p, n); err != nil {
			log.FatalE("failed to configure systemd for kvm", err)
		}
	}

	canMachinedRegister := false
	if flavor != "kvm" {
		// kvm doesn't register with systemd right now, see #2664.
		canMachinedRegister = machinedRegister()
	}
	diag.Printf("canMachinedRegister %t", canMachinedRegister)

	args, env, err := getArgsEnv(p, flavor, canMachinedRegister, debug, n, insecureOptions)
	if err != nil {
		log.FatalE("cannot get environment", err)
	}
	diag.Printf("args %q", args)
	diag.Printf("env %q", env)

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	unifiedCgroup, err := cgroup.IsCgroupUnified("/")
	if err != nil {
		log.FatalE("error determining cgroup version", err)
	}
	diag.Printf("unifiedCgroup %t", unifiedCgroup)

	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)

	subcgroup, err := getContainerSubCgroup(machineID, canMachinedRegister, unifiedCgroup)
	if err != nil {
		log.FatalE("error getting container subcgroup", err)
	}
	diag.Printf("subcgroup %q", subcgroup)

	// Record the subcgroup in a file named "subcgroup" under the pod root.
	if err := ioutil.WriteFile(filepath.Join(p.Root, "subcgroup"),
		[]byte(fmt.Sprintf("%s", subcgroup)), 0644); err != nil {
		log.FatalE("cannot write subcgroup file", err)
	}

	// The v1 cgroup mounts below are skipped entirely on a unified hierarchy.
	if !unifiedCgroup {
		enabledCgroups, err := v1.GetEnabledCgroups()
		if err != nil {
			log.FatalE("error getting v1 cgroups", err)
		}
		diag.Printf("enabledCgroups %q", enabledCgroups)

		if err := mountHostV1Cgroups(mnt, enabledCgroups); err != nil {
			log.FatalE("couldn't mount the host v1 cgroups", err)
		}

		// Join the "systemd" subcgroup directly only when machined
		// registration is not available.
		if !canMachinedRegister {
			if err := v1.JoinSubcgroup("systemd", subcgroup); err != nil {
				log.FatalE(fmt.Sprintf("error joining subcgroup %q", subcgroup), err)
			}
		}

		var serviceNames []string
		for _, app := range p.Manifest.Apps {
			serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
		}
		diag.Printf("serviceNames %q", serviceNames)

		if err := mountContainerV1Cgroups(mnt, s1Root, enabledCgroups, subcgroup, serviceNames, insecureOptions); err != nil {
			log.FatalE("couldn't mount the container v1 cgroups", err)
		}
	}

	// KVM flavor has a bit different logic in handling pid vs ppid, for details look into #2389
	// it doesn't require the existence of a "ppid", instead it registers the current pid (which
	// will be reused by lkvm binary) as a pod process pid used during entering
	pid_filename := "ppid"
	if flavor == "kvm" {
		pid_filename = "pid"
	}

	if err = stage1common.WritePid(os.Getpid(), pid_filename); err != nil {
		log.FatalE("error writing pid", err)
	}

	if flavor == "kvm" {
		if err := KvmPrepareMounts(s1Root, p); err != nil {
			log.FatalE("error preparing mounts", err)
		}
	}

	// The exec runs inside WithClearedCloExec so the lock fd is passed on to
	// the new program image (helper semantics not visible here — confirm).
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})

	if err != nil {
		log.FatalE(fmt.Sprintf("failed to execute %q", args[0]), err)
	}

	return 0
}
// stage1 loads the pod whose UUID is the first positional argument, sets up
// networking when privNet.Any() reports true, writes the systemd units, enters
// a new mount namespace, creates the container cgroups, records the pid and
// finally execs args[0] as computed by getArgsEnv.
//
// The return value is an exit code:
//
//	1 - bad UUID, pod load failure, or lock fd problem
//	2 - writing the prepare-app template or systemd configuration failed
//	3 - flavor lookup or getArgsEnv failed
//	4 - writing the ppid failed
//	5 - creating cgroups failed
//	6 - networking setup failed (or kvm flavor without private networking)
//	7 - exec failed
//	0 - end of function
//
// Failures while unsharing or remounting / go through log.Fatalf instead.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		fmt.Fprintln(os.Stderr, "UUID is missing or malformed")
		return 1
	}

	// The pod is loaded relative to the current working directory.
	root := "."
	p, err := LoadPod(root, uuid)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err)
		return 1
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err)
		return 1
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := p.getFlavor()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err)
		return 3
	}

	// n stays nil when no private networking is requested.
	var n *networking.Networking
	if privNet.Any() {
		fps, err := forwardedPorts(p)
		if err != nil {
			fmt.Fprintln(os.Stderr, err.Error())
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, privNet, localConfig, flavor)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err)
			return 6
		}

		// If the state cannot be saved, tear the network down again before
		// giving up.
		if err = n.Save(); err != nil {
			fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err)
			n.Teardown(flavor)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err)
				return 6
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			fmt.Fprintf(os.Stderr, "Flavor kvm requires private network configuration (try --private-net).\n")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = p.WritePrepareAppTemplate(); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err)
		return 2
	}

	if err = p.PodToSystemd(interactive, flavor); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.Fatalf("error unsharing: %v", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.Fatalf("error making / a slave mount: %v", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.Fatalf("error making / a shared and slave mount: %v", err)
	}

	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := p.GetMachineID()
	subcgroup, err := getContainerSubCgroup(machineID)
	// A missing subcgroup is not fatal: the pod continues without per-app
	// isolators. Only a failure to create the cgroups aborts.
	if err == nil {
		if err := cgroup.CreateCgroups(s1Root, subcgroup, serviceNames); err != nil {
			fmt.Fprintf(os.Stderr, "Error creating cgroups: %v\n", err)
			return 5
		}
	} else {
		fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err)
	}

	if err = writePpid(os.Getpid()); err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		return 4
	}

	// The exec runs inside withClearedCloExec so the lock fd is passed on to
	// the new program image (helper semantics not visible here — confirm).
	err = withClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to execute %q: %v\n", args[0], err)
		return 7
	}

	return 0
}
func main() { opts := imports.Options{ Fragment: true, AllErrors: true, Comments: true, TabIndent: true, TabWidth: 8, } flag.Parse() a := flag.Args() if len(a) < 2 || len(a)%2 != 0 { log.Fatalf("Usage: builtin <command> <code> [<command> <code>]*") } filemap := make(map[string][]byte) for ; len(a) > 0; a = a[2:] { goCode := startPart // Simple programs are just bits of code for main ... if a[1][0] == '{' { goCode = goCode + fmt.Sprintf(initPart, a[0], a[0], a[0]) goCode = goCode + a[1][1:len(a[1])-1] } else { for _, v := range a[1:] { if v == "{" { goCode = goCode + fmt.Sprintf(initPart, a[0]) continue } // FIXME: should only look for last arg. if v == "}" { break } goCode = goCode + v + "\n" } } goCode = goCode + endPart log.Printf("\n---------------------\n%v\n------------------------\n", goCode) fullCode, err := imports.Process("commandline", []byte(goCode), &opts) if err != nil { log.Fatalf("bad parse: '%v': %v", goCode, err) } log.Printf("\n----FULLCODE---------\n%v\n------FULLCODE----------\n", string(fullCode)) bName := path.Join("/src/cmds/sh", a[0]+".go") //fmt.Printf("filemap %v\n", filemap) filemap[bName] = fullCode //log.Printf("%v: %v", bName, fullCode) } // processed code, read in shell files. globs, err := filepath.Glob("/src/cmds/sh/*.go") if err != nil { log.Fatal(err) } for _, i := range globs { if b, err := ioutil.ReadFile(i); err != nil { log.Fatal(err) } else { if _, ok := filemap[i]; ok { log.Fatal("%v exists", i) } filemap[i] = b } } if b, err := ioutil.ReadFile("/proc/mounts"); err == nil && false { log.Printf("m %v\n", string(b)) } // we'd like to do this here, but it seems it doesn't end // up applying to all procs in this group, leading to confusion. // sometimes they get the private mount, sometimes not. So we had // to hack it in the shell. 
// FIXME if false { if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { log.Fatal(err) } } if b, err := ioutil.ReadFile("/proc/mounts"); err == nil { fmt.Printf("m %v\n", b) } // We are rewriting the shell. We need to create a new binary, i.e. // rewrite the one in /bin. Sadly, there is no way to say "mount THIS bin // before THAT bin". There will be ca. 3.18 and we might as well wait for // that to become common. For now, we essentially erase /bin but mounting // a tmpfs over it. // This would be infinitely easier with a true union file system. Oh well. for _, m := range namespace { if err := syscall.Mount(m.source, m.target, m.fstype, m.flags, m.opts); err != nil { log.Printf("Mount :%s: on :%s: type :%s: flags %x: %v\n", m.source, m.target, m.fstype, m.flags, m.opts, err) } } //log.Printf("filemap: %v", filemap) // write the new /src/cmds/sh for i, v := range filemap { if err = ioutil.WriteFile(i, v, 0600); err != nil { log.Fatal(err) } } // the big fun: just run it. The Right Things Happen. cmd := exec.Command("/buildbin/sh") cmd.Stdin = os.Stdin cmd.Stderr = os.Stderr cmd.Stdout = os.Stdout // TODO: figure out why we get EPERM when we use this. //cmd.SysProcAttr = &syscall.SysProcAttr{Setctty: true, Setsid: true,} log.Printf("Run %v", cmd) err = cmd.Run() if err != nil { log.Printf("%v\n", err) } // Unshare doesn't work in a sane way due to a Go issue? for _, m := range namespace { if err := syscall.Unmount(m.target, syscall.MNT_FORCE); err != nil { log.Printf("Umount :%s: %v\n", m.target, err) } } log.Printf("init: /bin/sh returned!\n") }
// bind mount the repo source tree into the chroot and run a command func enterChrootHelper(args []string) (err error) { if len(args) < 3 { return fmt.Errorf("got %d args, need at least 3", len(args)) } e := enter{ RepoRoot: args[0], Chroot: args[1], Cmd: args[2:], } username := os.Getenv("SUDO_USER") if username == "" { return fmt.Errorf("SUDO_USER environment variable is not set.") } if e.User, err = user.Lookup(username); err != nil { return err } e.UserRunDir = filepath.Join(e.Chroot, "run", "user", e.User.Uid) newRepoRoot := filepath.Join(e.Chroot, chrootRepoRoot) if err := os.MkdirAll(newRepoRoot, 0755); err != nil { return err } // Only copy if resolv.conf exists, if missing resolver uses localhost resolv := "/etc/resolv.conf" if _, err := os.Stat(resolv); err == nil { chrootResolv := filepath.Join(e.Chroot, resolv) if err := system.InstallRegularFile(resolv, chrootResolv); err != nil { return err } } // namespaces are per-thread attributes runtime.LockOSThread() defer runtime.UnlockOSThread() if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { return fmt.Errorf("Unsharing mount namespace failed: %v", err) } if err := system.RecursiveSlave("/"); err != nil { return err } if err := system.Bind(e.RepoRoot, newRepoRoot); err != nil { return err } if err := e.MountAPI(); err != nil { return err } if err = os.MkdirAll(e.UserRunDir, 0755); err != nil { return err } if err = os.Chown(e.UserRunDir, e.User.UidNo, e.User.GidNo); err != nil { return err } if err := e.MountAgent("SSH_AUTH_SOCK"); err != nil { return err } if err := e.MountGnupg(); err != nil { return err } if err := e.CopyGoogleCreds(); err != nil { return err } if err := syscall.Chroot(e.Chroot); err != nil { return fmt.Errorf("Chrooting to %q failed: %v", e.Chroot, err) } if err := os.Chdir(chrootRepoRoot); err != nil { return err } sudo := "/usr/bin/sudo" sudoArgs := append([]string{sudo, "-u", username, "--"}, e.Cmd...) return syscall.Exec(sudo, sudoArgs, os.Environ()) }
// Run mounts the right overlay filesystems and actually runs the prepared // pod by exec()ing the stage1 init inside the pod filesystem. func Run(cfg RunConfig, dir string) { useOverlay, err := preparedWithOverlay(dir) if err != nil { log.Fatalf("error: %v", err) } // create a separate mount namespace so the cgroup filesystems and/or // overlay mounts are unmounted when exiting the pod if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil { log.Fatalf("error unsharing: %v", err) } // we recursively make / a "shared and slave" so mount events from the // new namespace don't propagate to the host namespace but mount events // from the host propagate to the new namespace and are forwarded to // its peer group // See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil { log.Fatalf("error making / a slave mount: %v", err) } if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil { log.Fatalf("error making / a shared and slave mount: %v", err) } log.Printf("Setting up stage1") if err := setupStage1Image(cfg, cfg.Stage1Image, dir, useOverlay); err != nil { log.Fatalf("error setting up stage1: %v", err) } log.Printf("Wrote filesystem to %s\n", dir) for _, img := range cfg.Images { if err := setupAppImage(cfg, img, dir, useOverlay); err != nil { log.Fatalf("error setting up app image: %v", err) } } if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil { log.Fatalf("setting lock fd environment: %v", err) } log.Printf("Pivoting to filesystem %s", dir) if err := os.Chdir(dir); err != nil { log.Fatalf("failed changing to dir: %v", err) } ep, err := getStage1Entrypoint(dir, runEntrypoint) if err != nil { log.Fatalf("error determining init entrypoint: %v", err) } log.Printf("Execing %s", ep) args := []string{filepath.Join(common.Stage1RootfsPath(dir), ep)} if cfg.Debug { args = append(args, "--debug") } if 
cfg.PrivateNet.Any() { args = append(args, "--private-net="+cfg.PrivateNet.String()) } if cfg.Interactive { args = append(args, "--interactive") } args = append(args, cfg.UUID.String()) // make sure the lock fd stays open across exec if err := sys.CloseOnExec(cfg.LockFd, false); err != nil { log.Fatalf("error clearing FD_CLOEXEC on lock fd") } if err := syscall.Exec(args[0], args, os.Environ()); err != nil { log.Fatalf("error execing init: %v", err) } }
// stage1 loads the pod whose UUID is the first positional argument, sets up
// networking when netList.Contained() reports true, writes the systemd units,
// enters a new mount namespace, mounts host and container cgroups, records the
// pid and finally execs args[0] as computed by getArgsEnv.
//
// The return value is an exit code:
//
//	1 - bad UUID, pod load failure, or lock fd problem
//	2 - writing default.target, the prepare-app template or systemd config failed
//	3 - flavor lookup or getArgsEnv failed
//	4 - writing the ppid failed
//	5 - cgroup setup failed
//	6 - networking setup failed (or kvm flavor without private networking)
//	7 - exec failed
//	0 - end of function
//
// Failures while unsharing or remounting / go through log.FatalE instead.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.PrintE("UUID is missing or malformed", err)
		return 1
	}

	// The pod is loaded relative to the current working directory.
	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("failed to load pod", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("failed to get rkt lock fd", err)
		return 1
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("failed to set FD_CLOEXEC on rkt lock", err)
		return 1
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.PrintE("failed to get stage1 flavor", err)
		return 3
	}

	// n stays nil when no network setup is requested.
	var n *networking.Networking
	if netList.Contained() {
		fps, err := forwardedPorts(p)
		if err != nil {
			log.Error(err)
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, debug)
		if err != nil {
			log.PrintE("failed to setup network", err)
			return 6
		}

		// If the state cannot be saved, tear the network down again before
		// giving up.
		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				log.PrintE("failed to get default Host IP", err)
				return 6
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Print("flavor kvm requires private network configuration (try --net)")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = stage1initcommon.WriteDefaultTarget(p); err != nil {
		log.PrintE("failed to write default.target", err)
		return 2
	}

	if err = stage1initcommon.WritePrepareAppTemplate(p); err != nil {
		log.PrintE("failed to write prepare-app service template", err)
		return 2
	}

	// A failure here is only a warning; startup continues.
	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		if err := KvmPodToSystemd(p, n); err != nil {
			log.PrintE("failed to configure systemd for kvm", err)
			return 2
		}
	}

	if err = stage1initcommon.PodToSystemd(p, interactive, flavor, privateUsers); err != nil {
		log.PrintE("failed to configure systemd", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		log.Error(err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	// NOTE(review): if log.FatalE terminates the process, the "return 5"
	// statements directly after it below are unreachable — confirm.
	enabledCgroups, err := cgroup.GetEnabledCgroups()
	if err != nil {
		log.FatalE("error getting cgroups", err)
		return 5
	}

	// mount host cgroups in the rkt mount namespace
	if err := mountHostCgroups(enabledCgroups); err != nil {
		log.FatalE("couldn't mount the host cgroups", err)
		return 5
	}

	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)
	subcgroup, err := getContainerSubCgroup(machineID)
	// A missing subcgroup is not fatal: the pod continues without per-app
	// isolators. Only a failure to mount the container cgroups aborts.
	if err == nil {
		if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
			log.PrintE("couldn't mount the container cgroups", err)
			return 5
		}
	} else {
		log.PrintE("continuing with per-app isolators disabled", err)
	}

	if err = stage1common.WritePpid(os.Getpid()); err != nil {
		log.Error(err)
		return 4
	}

	// The exec runs inside WithClearedCloExec so the lock fd is passed on to
	// the new program image (helper semantics not visible here — confirm).
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("failed to execute %q", args[0]), err)
		return 7
	}

	return 0
}
func setupNsListener() { runtime.LockOSThread() defer runtime.UnlockOSThread() /* create own netns */ if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil { glog.Error(err) return } childPipe := os.NewFile(uintptr(3), "child") enc := gob.NewEncoder(childPipe) dec := gob.NewDecoder(childPipe) /* notify containerd to execute prestart hooks */ if err := enc.Encode("init"); err != nil { glog.Error(err) return } /* after execute prestart hooks */ var ready string if err := dec.Decode(&ready); err != nil { glog.Error(err) return } if ready != "init" { glog.Errorf("get incorrect init message: %s", ready) return } // Get network namespace info for the first time and send to the containerd /* get route info before link down */ routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) if err != nil { glog.Error(err) return } /* send interface info to containerd */ infos := collectionInterfaceInfo() if err := enc.Encode(infos); err != nil { glog.Error(err) return } if err := enc.Encode(routes); err != nil { glog.Error(err) return } // This is a call back function. // Use to send netlink update informations to containerd. netNs2Containerd := func(netlinkUpdate supervisor.NetlinkUpdate) { if err := enc.Encode(netlinkUpdate); err != nil { glog.Info("err Encode(netlinkUpdate) is :", err) } } // Keep collecting network namespace info and sending to the containerd setupNetworkNsTrap(netNs2Containerd) }