Example #1
1
func unshareAndBind(workingRootDir string) bool {
	if *unshare {
		// Re-exec myself using the unshare syscall while on a locked thread.
		// This hack is required because syscall.Unshare() operates on only one
		// thread in the process, and Go switches execution between threads
		// randomly. Thus, the namespace can be suddenly switched for running
		// code. This is an aspect of Go that was not well thought out.
		runtime.LockOSThread()
		err := syscall.Unshare(syscall.CLONE_NEWNS)
		if err != nil {
			fmt.Printf("Unable to unshare mount namesace\t%s\n", err)
			return false
		}
		args := append(os.Args, "-unshare=false")
		err = syscall.Exec(args[0], args, os.Environ())
		if err != nil {
			fmt.Printf("Unable to Exec:%s\t%s\n", args[0], err)
			return false
		}
	}
	err := syscall.Mount("none", "/", "", syscall.MS_REC|syscall.MS_PRIVATE, "")
	if err != nil {
		fmt.Printf("Unable to set mount sharing to private\t%s\n", err)
		return false
	}
	syscall.Unmount(workingRootDir, 0)
	err = syscall.Mount(*rootDir, workingRootDir, "", syscall.MS_BIND, "")
	if err != nil {
		fmt.Printf("Unable to bind mount %s to %s\t%s\n",
			*rootDir, workingRootDir, err)
		return false
	}
	return true
}
Example #2
0
func newNetNS() (hostNS, childNS *os.File, err error) {
	defer func() {
		if err != nil {
			if hostNS != nil {
				hostNS.Close()
			}
			if childNS != nil {
				childNS.Close()
			}
		}
	}()

	hostNS, err = os.Open(selfNetNS)
	if err != nil {
		return
	}

	if err = syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
		return
	}

	childNS, err = os.Open(selfNetNS)
	if err != nil {
		ns.SetNS(hostNS, syscall.CLONE_NEWNET)
		return
	}

	return
}
Example #3
0
// enterPrivateMountNamespace does just that: the current mount ns is unshared (isolated)
// and then made a slave to the root mount / of the parent mount ns (mount events from /
// or its children that happen in the parent NS propagate to us).
//
// this is not yet compatible with volume plugins as implemented by the kubelet, which
// depends on using host-volume args to 'docker run' to attach plugin volumes to CT's
// at runtime. as such, docker needs to be able to see the volumes mounted by k8s plugins,
// which is impossible if k8s volume plugins are running in an isolated mount ns.
//
// an alternative approach would be to always run the kubelet in the host's mount-ns and
// rely upon mesos to forcibly umount bindings in the task sandbox before rmdir'ing it:
// https://issues.apache.org/jira/browse/MESOS-349.
//
// use at your own risk.
func enterPrivateMountNamespace() {
	log.Warningln("EXPERIMENTAL FEATURE: entering private mount ns")

	// enter a new mount NS, useful for isolating changes to the mount table
	// that are made by the kubelet for storage volumes.
	err := syscall.Unshare(syscall.CLONE_NEWNS)
	if err != nil {
		log.Fatalf("failed to enter private mount NS: %v", err)
	}

	// make the rootfs / rslave to the parent mount NS so that we
	// pick up on any changes made there
	err = syscall.Mount("", "/", "dontcare", syscall.MS_REC|syscall.MS_SLAVE, "")
	if err != nil {
		log.Fatalf("failed to mark / rslave: %v", err)
	}
}
Example #4
0
// SetupTestNetNS joins a new network namespace, and returns its associated
// teardown function.
//
// Example usage:
//
//     defer SetupTestNetNS(t)()
//
func SetupTestNetNS(t *testing.T) func() {
	runtime.LockOSThread()
	if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
		t.Fatalf("Failed to enter netns: %v", err)
	}

	fd, err := syscall.Open("/proc/self/ns/net", syscall.O_RDONLY, 0)
	if err != nil {
		t.Fatal("Failed to open netns file")
	}

	return func() {
		if err := syscall.Close(fd); err != nil {
			t.Logf("Warning: netns closing failed (%v)", err)
		}
		runtime.UnlockOSThread()
	}
}
Example #5
0
// SetupTestOSContext joins a new network namespace, and returns its associated
// teardown function.
//
// Example usage:
//
//     defer SetupTestOSContext(t)()
//
func SetupTestOSContext(t *testing.T) func() {
	runtime.LockOSThread()
	if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
		t.Fatalf("Failed to enter netns: %v", err)
	}

	fd, err := syscall.Open("/proc/self/ns/net", syscall.O_RDONLY, 0)
	if err != nil {
		t.Fatal("Failed to open netns file")
	}

	// Since we are switching to a new test namespace make
	// sure to re-initialize initNs context
	ns.Init()

	return func() {
		if err := syscall.Close(fd); err != nil {
			t.Logf("Warning: netns closing failed (%v)", err)
		}
		runtime.UnlockOSThread()
	}
}
Example #6
0
func main() {
	flag.Usage = usage
	flag.Parse()
	if flag.NArg() < 1 {
		usage()
	}

	var flags int
	if *mflag {
		flags |= syscall.CLONE_NEWNS
	}
	if *uflag {
		flags |= syscall.CLONE_NEWUTS
	}
	if *iflag {
		flags |= syscall.CLONE_NEWIPC
	}
	if *nflag {
		flags |= syscall.CLONE_NEWNET
	}
	if *pflag {
		flags |= syscall.CLONE_NEWPID
	}
	if *Uflag {
		flags |= syscall.CLONE_NEWUSER
	}

	ck(syscall.Unshare(flags))

	args := flag.Args()
	cmd := exec.Command(args[0], args[1:]...)
	cmd.Stdin = os.Stdin
	cmd.Stderr = os.Stderr
	cmd.Stdout = os.Stdout
	ck(cmd.Run())
}
func run(ctx *kingpin.ParseContext) error {
	mounts := getMounts()
	if len(mounts) == 0 {
		return errors.New("No suitable mounts found")
	}

	if !dbtm.StringInSlice(mounts, mount) {
		return errors.New(fmt.Sprintf("Mountpoint %s not found", mount))
	}

	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		return err
	}

	if err := syscall.Mount("none", mount, "none", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
		return err
	}

	if err := syscall.Unmount(mount, syscall.MNT_DETACH); err != nil {
		return err
	}

	if f, err := os.Create(path.Join(mount, "junk")); err == nil {
		defer f.Close()
		w := bufio.NewWriter(f)
		defer w.Flush()
		for {
			if _, err := w.Write([]byte("junk")); err != nil {
				break
			}
		}
		return nil
	} else {
		return err
	}
}
Example #8
0
func Unshare(flags int) error {
	return syscall.Unshare(flags)
}
Example #9
0
// chroot on linux uses pivot_root instead of chroot
// pivot_root takes a new root and an old root.
// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
// New root is where the new rootfs is set to.
// Old root is removed after the call to pivot_root so it is no longer available under the new root.
// This is similar to how libcontainer sets up a container's rootfs
func chroot(path string) (err error) {
	// Create new mount namespace so mounts don't leak
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
	}
	// path must be a different fs for pivot_root, so bind-mount to itself to ensure this
	if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("Error mounting pivot dir before pivot: %v", err)
	}

	// setup oldRoot for pivot_root
	pivotDir, err := ioutil.TempDir(path, ".pivot_root")
	if err != nil {
		return fmt.Errorf("Error setting up pivot dir: %v", err)
	}

	var mounted bool
	defer func() {
		if mounted {
			// make sure pivotDir is not mounted before we try to remove it
			if errCleanup := syscall.Unmount(pivotDir, syscall.MNT_DETACH); errCleanup != nil {
				if err == nil {
					err = errCleanup
				}
				return
			}
		}

		errCleanup := os.Remove(pivotDir)
		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
		// but we already cleaned it up on failed pivot_root
		if errCleanup != nil && !os.IsNotExist(errCleanup) {
			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
			if err == nil {
				err = errCleanup
			}
		}
	}()

	if err := syscall.PivotRoot(path, pivotDir); err != nil {
		// If pivot fails, fall back to the normal chroot after cleaning up temp dir for pivot_root
		if err := os.Remove(pivotDir); err != nil {
			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
		}
		return realChroot(path)
	}
	mounted = true

	// This is the new path for where the old root (prior to the pivot) has been moved to
	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
	pivotDir = filepath.Join("/", filepath.Base(pivotDir))

	if err := syscall.Chdir("/"); err != nil {
		return fmt.Errorf("Error changing to new root: %v", err)
	}

	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
	if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("Error making old root private after pivot: %v", err)
	}

	// Now unmount the old root so it's no longer visible from the new root
	if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
	}
	mounted = false

	return nil
}
Example #10
0
// New creates a new network namespace and returns a handle to it.
func New() (ns NsHandle, err error) {
	if err := syscall.Unshare(CLONE_NEWNET); err != nil {
		return -1, err
	}
	return Get()
}
Example #11
0
File: init.go Project: joshix/rkt
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.FatalE("UUID is missing or malformed", err)
	}

	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.FatalE("failed to load pod", err)
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.FatalE("failed to get rkt lock fd", err)
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.FatalE("failed to set FD_CLOEXEC on rkt lock", err)
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.FatalE("failed to get stage1 flavor", err)
	}

	var n *networking.Networking
	if netList.Contained() {
		fps, err := commonnet.ForwardedPorts(p.Manifest)
		if err != nil {
			log.FatalE("error initializing forwarding ports", err)
		}

		noDNS := dnsConfMode.Pairs["resolv"] != "default" // force ignore CNI DNS results
		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, noDNS, debug)
		if err != nil {
			log.FatalE("failed to setup network", err)
		}

		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 254
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetForwardableNetHostIP()
			if err != nil {
				log.FatalE("failed to get default Host IP", err)
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Fatal("flavor kvm requires private network configuration (try --net)")
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	insecureOptions := stage1initcommon.Stage1InsecureOptions{
		DisablePaths:        disablePaths,
		DisableCapabilities: disableCapabilities,
		DisableSeccomp:      disableSeccomp,
	}

	mnt := fs.NewLoggingMounter(
		fs.MounterFunc(syscall.Mount),
		fs.UnmounterFunc(syscall.Unmount),
		diag.Printf,
	)

	if dnsConfMode.Pairs["resolv"] == "host" {
		stage1initcommon.UseHostResolv(mnt, root)
	}

	if dnsConfMode.Pairs["hosts"] == "host" {
		stage1initcommon.UseHostHosts(mnt, root)
	}

	if mutable {
		if err = stage1initcommon.MutableEnv(p); err != nil {
			log.FatalE("cannot initialize mutable environment", err)
		}
	} else {
		if err = stage1initcommon.ImmutableEnv(p, interactive, privateUsers, insecureOptions); err != nil {
			log.FatalE("cannot initialize immutable environment", err)
		}
	}

	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		kvm.InitDebug(debug)
		if err := KvmNetworkingToSystemd(p, n); err != nil {
			log.FatalE("failed to configure systemd for kvm", err)
		}
	}

	canMachinedRegister := false
	if flavor != "kvm" {
		// kvm doesn't register with systemd right now, see #2664.
		canMachinedRegister = machinedRegister()
	}
	diag.Printf("canMachinedRegister %t", canMachinedRegister)

	args, env, err := getArgsEnv(p, flavor, canMachinedRegister, debug, n, insecureOptions)
	if err != nil {
		log.FatalE("cannot get environment", err)
	}
	diag.Printf("args %q", args)
	diag.Printf("env %q", env)

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	unifiedCgroup, err := cgroup.IsCgroupUnified("/")
	if err != nil {
		log.FatalE("error determining cgroup version", err)
	}
	diag.Printf("unifiedCgroup %t", unifiedCgroup)

	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)

	subcgroup, err := getContainerSubCgroup(machineID, canMachinedRegister, unifiedCgroup)
	if err != nil {
		log.FatalE("error getting container subcgroup", err)
	}
	diag.Printf("subcgroup %q", subcgroup)

	if err := ioutil.WriteFile(filepath.Join(p.Root, "subcgroup"),
		[]byte(fmt.Sprintf("%s", subcgroup)), 0644); err != nil {
		log.FatalE("cannot write subcgroup file", err)
	}

	if !unifiedCgroup {
		enabledCgroups, err := v1.GetEnabledCgroups()
		if err != nil {
			log.FatalE("error getting v1 cgroups", err)
		}
		diag.Printf("enabledCgroups %q", enabledCgroups)

		if err := mountHostV1Cgroups(mnt, enabledCgroups); err != nil {
			log.FatalE("couldn't mount the host v1 cgroups", err)
		}

		if !canMachinedRegister {
			if err := v1.JoinSubcgroup("systemd", subcgroup); err != nil {
				log.FatalE(fmt.Sprintf("error joining subcgroup %q", subcgroup), err)
			}
		}

		var serviceNames []string
		for _, app := range p.Manifest.Apps {
			serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
		}
		diag.Printf("serviceNames %q", serviceNames)

		if err := mountContainerV1Cgroups(mnt, s1Root, enabledCgroups, subcgroup, serviceNames, insecureOptions); err != nil {
			log.FatalE("couldn't mount the container v1 cgroups", err)
		}

	}

	// KVM flavor has a bit different logic in handling pid vs ppid, for details look into #2389
	// it doesn't require the existence of a "ppid", instead it registers the current pid (which
	// will be reused by lkvm binary) as a pod process pid used during entering
	pid_filename := "ppid"
	if flavor == "kvm" {
		pid_filename = "pid"
	}

	if err = stage1common.WritePid(os.Getpid(), pid_filename); err != nil {
		log.FatalE("error writing pid", err)
	}

	if flavor == "kvm" {
		if err := KvmPrepareMounts(s1Root, p); err != nil {
			log.FatalE("error preparing mounts", err)
		}
	}

	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})

	if err != nil {
		log.FatalE(fmt.Sprintf("failed to execute %q", args[0]), err)
	}

	return 0
}
Example #12
0
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		fmt.Fprintln(os.Stderr, "UUID is missing or malformed")
		return 1
	}

	root := "."
	p, err := LoadPod(root, uuid)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err)
		return 1
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err)
		return 1
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := p.getFlavor()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err)
		return 3
	}

	var n *networking.Networking
	if privNet.Any() {
		fps, err := forwardedPorts(p)
		if err != nil {
			fmt.Fprintln(os.Stderr, err.Error())
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, privNet, localConfig, flavor)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err)
			return 6
		}

		if err = n.Save(); err != nil {
			fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err)
			n.Teardown(flavor)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err)
				return 6
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			fmt.Fprintf(os.Stderr, "Flavor kvm requires private network configuration (try --private-net).\n")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = p.WritePrepareAppTemplate(); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err)
		return 2
	}

	if err = p.PodToSystemd(interactive, flavor); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.Fatalf("error unsharing: %v", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.Fatalf("error making / a slave mount: %v", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.Fatalf("error making / a shared and slave mount: %v", err)
	}

	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := p.GetMachineID()
	subcgroup, err := getContainerSubCgroup(machineID)
	if err == nil {
		if err := cgroup.CreateCgroups(s1Root, subcgroup, serviceNames); err != nil {
			fmt.Fprintf(os.Stderr, "Error creating cgroups: %v\n", err)
			return 5
		}
	} else {
		fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err)
	}

	if err = writePpid(os.Getpid()); err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		return 4
	}

	err = withClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to execute %q: %v\n", args[0], err)
		return 7
	}

	return 0
}
Example #13
0
func main() {
	opts := imports.Options{
		Fragment:  true,
		AllErrors: true,
		Comments:  true,
		TabIndent: true,
		TabWidth:  8,
	}
	flag.Parse()
	a := flag.Args()
	if len(a) < 2 || len(a)%2 != 0 {
		log.Fatalf("Usage: builtin <command> <code> [<command> <code>]*")
	}
	filemap := make(map[string][]byte)
	for ; len(a) > 0; a = a[2:] {
		goCode := startPart
		// Simple programs are just bits of code for main ...
		if a[1][0] == '{' {
			goCode = goCode + fmt.Sprintf(initPart, a[0], a[0], a[0])
			goCode = goCode + a[1][1:len(a[1])-1]
		} else {
			for _, v := range a[1:] {
				if v == "{" {
					goCode = goCode + fmt.Sprintf(initPart, a[0])
					continue
				}
				// FIXME: should only look for last arg.
				if v == "}" {
					break
				}
				goCode = goCode + v + "\n"
			}
		}
		goCode = goCode + endPart
		log.Printf("\n---------------------\n%v\n------------------------\n", goCode)
		fullCode, err := imports.Process("commandline", []byte(goCode), &opts)
		if err != nil {
			log.Fatalf("bad parse: '%v': %v", goCode, err)
		}
		log.Printf("\n----FULLCODE---------\n%v\n------FULLCODE----------\n", string(fullCode))
		bName := path.Join("/src/cmds/sh", a[0]+".go")
		//fmt.Printf("filemap %v\n", filemap)
		filemap[bName] = fullCode
		//log.Printf("%v: %v", bName, fullCode)
	}

	// processed code, read in shell files.
	globs, err := filepath.Glob("/src/cmds/sh/*.go")
	if err != nil {
		log.Fatal(err)
	}
	for _, i := range globs {
		if b, err := ioutil.ReadFile(i); err != nil {
			log.Fatal(err)
		} else {
			if _, ok := filemap[i]; ok {
				log.Fatal("%v exists", i)
			}
			filemap[i] = b
		}
	}

	if b, err := ioutil.ReadFile("/proc/mounts"); err == nil && false {
		log.Printf("m %v\n", string(b))
	}
	// we'd like to do this here, but it seems it doesn't end
	// up applying to all procs in this group, leading to confusion.
	// sometimes they get the private mount, sometimes not. So we had
	// to hack it in the shell.
	// FIXME
	if false {
		if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
			log.Fatal(err)
		}
	}
	if b, err := ioutil.ReadFile("/proc/mounts"); err == nil {
		fmt.Printf("m %v\n", b)
	}

	// We are rewriting the shell. We need to create a new binary, i.e.
	// rewrite the one in /bin. Sadly, there is no way to say "mount THIS bin
	// before THAT bin". There will be ca. 3.18 and we might as well wait for
	// that to become common. For now, we essentially erase /bin but mounting
	// a tmpfs over it.
	// This would be infinitely easier with a true union file system. Oh well.
	for _, m := range namespace {
		if err := syscall.Mount(m.source, m.target, m.fstype, m.flags, m.opts); err != nil {
			log.Printf("Mount :%s: on :%s: type :%s: flags %x: %v\n", m.source, m.target, m.fstype, m.flags, m.opts, err)
		}
	}
	//log.Printf("filemap: %v", filemap)
	// write the new /src/cmds/sh
	for i, v := range filemap {
		if err = ioutil.WriteFile(i, v, 0600); err != nil {
			log.Fatal(err)
		}
	}

	// the big fun: just run it. The Right Things Happen.
	cmd := exec.Command("/buildbin/sh")
	cmd.Stdin = os.Stdin
	cmd.Stderr = os.Stderr
	cmd.Stdout = os.Stdout
	// TODO: figure out why we get EPERM when we use this.
	//cmd.SysProcAttr = &syscall.SysProcAttr{Setctty: true, Setsid: true,}
	log.Printf("Run %v", cmd)
	err = cmd.Run()
	if err != nil {
		log.Printf("%v\n", err)
	}
	// Unshare doesn't work in a sane way due to a Go issue?
	for _, m := range namespace {
		if err := syscall.Unmount(m.target, syscall.MNT_FORCE); err != nil {
			log.Printf("Umount :%s: %v\n", m.target, err)
		}
	}
	log.Printf("init: /bin/sh returned!\n")
}
Example #14
0
// bind mount the repo source tree into the chroot and run a command
func enterChrootHelper(args []string) (err error) {
	if len(args) < 3 {
		return fmt.Errorf("got %d args, need at least 3", len(args))
	}

	e := enter{
		RepoRoot: args[0],
		Chroot:   args[1],
		Cmd:      args[2:],
	}

	username := os.Getenv("SUDO_USER")
	if username == "" {
		return fmt.Errorf("SUDO_USER environment variable is not set.")
	}
	if e.User, err = user.Lookup(username); err != nil {
		return err
	}
	e.UserRunDir = filepath.Join(e.Chroot, "run", "user", e.User.Uid)

	newRepoRoot := filepath.Join(e.Chroot, chrootRepoRoot)
	if err := os.MkdirAll(newRepoRoot, 0755); err != nil {
		return err
	}

	// Only copy if resolv.conf exists, if missing resolver uses localhost
	resolv := "/etc/resolv.conf"
	if _, err := os.Stat(resolv); err == nil {
		chrootResolv := filepath.Join(e.Chroot, resolv)
		if err := system.InstallRegularFile(resolv, chrootResolv); err != nil {
			return err
		}
	}

	// namespaces are per-thread attributes
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		return fmt.Errorf("Unsharing mount namespace failed: %v", err)
	}

	if err := system.RecursiveSlave("/"); err != nil {
		return err
	}

	if err := system.Bind(e.RepoRoot, newRepoRoot); err != nil {
		return err
	}

	if err := e.MountAPI(); err != nil {
		return err
	}

	if err = os.MkdirAll(e.UserRunDir, 0755); err != nil {
		return err
	}

	if err = os.Chown(e.UserRunDir, e.User.UidNo, e.User.GidNo); err != nil {
		return err
	}

	if err := e.MountAgent("SSH_AUTH_SOCK"); err != nil {
		return err
	}

	if err := e.MountGnupg(); err != nil {
		return err
	}

	if err := e.CopyGoogleCreds(); err != nil {
		return err
	}

	if err := syscall.Chroot(e.Chroot); err != nil {
		return fmt.Errorf("Chrooting to %q failed: %v", e.Chroot, err)
	}

	if err := os.Chdir(chrootRepoRoot); err != nil {
		return err
	}

	sudo := "/usr/bin/sudo"
	sudoArgs := append([]string{sudo, "-u", username, "--"}, e.Cmd...)
	return syscall.Exec(sudo, sudoArgs, os.Environ())
}
Example #15
0
// Run mounts the right overlay filesystems and actually runs the prepared
// pod by exec()ing the stage1 init inside the pod filesystem.
func Run(cfg RunConfig, dir string) {
	useOverlay, err := preparedWithOverlay(dir)
	if err != nil {
		log.Fatalf("error: %v", err)
	}

	// create a separate mount namespace so the cgroup filesystems and/or
	// overlay mounts are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.Fatalf("error unsharing: %v", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.Fatalf("error making / a slave mount: %v", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.Fatalf("error making / a shared and slave mount: %v", err)
	}

	log.Printf("Setting up stage1")
	if err := setupStage1Image(cfg, cfg.Stage1Image, dir, useOverlay); err != nil {
		log.Fatalf("error setting up stage1: %v", err)
	}
	log.Printf("Wrote filesystem to %s\n", dir)

	for _, img := range cfg.Images {
		if err := setupAppImage(cfg, img, dir, useOverlay); err != nil {
			log.Fatalf("error setting up app image: %v", err)
		}
	}

	if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil {
		log.Fatalf("setting lock fd environment: %v", err)
	}

	log.Printf("Pivoting to filesystem %s", dir)
	if err := os.Chdir(dir); err != nil {
		log.Fatalf("failed changing to dir: %v", err)
	}

	ep, err := getStage1Entrypoint(dir, runEntrypoint)
	if err != nil {
		log.Fatalf("error determining init entrypoint: %v", err)
	}
	log.Printf("Execing %s", ep)

	args := []string{filepath.Join(common.Stage1RootfsPath(dir), ep)}
	if cfg.Debug {
		args = append(args, "--debug")
	}
	if cfg.PrivateNet.Any() {
		args = append(args, "--private-net="+cfg.PrivateNet.String())
	}
	if cfg.Interactive {
		args = append(args, "--interactive")
	}
	args = append(args, cfg.UUID.String())

	// make sure the lock fd stays open across exec
	if err := sys.CloseOnExec(cfg.LockFd, false); err != nil {
		log.Fatalf("error clearing FD_CLOEXEC on lock fd")
	}

	if err := syscall.Exec(args[0], args, os.Environ()); err != nil {
		log.Fatalf("error execing init: %v", err)
	}
}
Example #16
0
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.PrintE("UUID is missing or malformed", err)
		return 1
	}

	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("failed to load pod", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("failed to get rkt lock fd", err)
		return 1
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("failed to set FD_CLOEXEC on rkt lock", err)
		return 1
	}

	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.PrintE("failed to get stage1 flavor", err)
		return 3
	}

	var n *networking.Networking
	if netList.Contained() {
		fps, err := forwardedPorts(p)
		if err != nil {
			log.Error(err)
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, debug)
		if err != nil {
			log.PrintE("failed to setup network", err)
			return 6
		}

		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				log.PrintE("failed to get default Host IP", err)
				return 6
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Print("flavor kvm requires private network configuration (try --net)")
			return 6
		}
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = stage1initcommon.WriteDefaultTarget(p); err != nil {
		log.PrintE("failed to write default.target", err)
		return 2
	}

	if err = stage1initcommon.WritePrepareAppTemplate(p); err != nil {
		log.PrintE("failed to write prepare-app service template", err)
		return 2
	}

	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		if err := KvmPodToSystemd(p, n); err != nil {
			log.PrintE("failed to configure systemd for kvm", err)
			return 2
		}
	}

	if err = stage1initcommon.PodToSystemd(p, interactive, flavor, privateUsers); err != nil {
		log.PrintE("failed to configure systemd", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		log.Error(err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	enabledCgroups, err := cgroup.GetEnabledCgroups()
	if err != nil {
		log.FatalE("error getting cgroups", err)
		return 5
	}

	// mount host cgroups in the rkt mount namespace
	if err := mountHostCgroups(enabledCgroups); err != nil {
		log.FatalE("couldn't mount the host cgroups", err)
		return 5
	}

	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)
	subcgroup, err := getContainerSubCgroup(machineID)
	if err == nil {
		if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
			log.PrintE("couldn't mount the container cgroups", err)
			return 5
		}
	} else {
		log.PrintE("continuing with per-app isolators disabled", err)
	}

	if err = stage1common.WritePpid(os.Getpid()); err != nil {
		log.Error(err)
		return 4
	}

	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("failed to execute %q", args[0]), err)
		return 7
	}

	return 0
}
Example #17
0
func setupNsListener() {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	/* create own netns */
	if err := syscall.Unshare(syscall.CLONE_NEWNET); err != nil {
		glog.Error(err)
		return
	}

	childPipe := os.NewFile(uintptr(3), "child")
	enc := gob.NewEncoder(childPipe)
	dec := gob.NewDecoder(childPipe)

	/* notify containerd to execute prestart hooks */
	if err := enc.Encode("init"); err != nil {
		glog.Error(err)
		return
	}

	/* after execute prestart hooks */
	var ready string
	if err := dec.Decode(&ready); err != nil {
		glog.Error(err)
		return
	}

	if ready != "init" {
		glog.Errorf("get incorrect init message: %s", ready)
		return
	}

	// Get network namespace info for the first time and send to the containerd
	/* get route info before link down */
	routes, err := netlink.RouteList(nil, netlink.FAMILY_V4)
	if err != nil {
		glog.Error(err)
		return
	}

	/* send interface info to containerd */
	infos := collectionInterfaceInfo()
	if err := enc.Encode(infos); err != nil {
		glog.Error(err)
		return
	}

	if err := enc.Encode(routes); err != nil {
		glog.Error(err)
		return
	}

	// This is a call back function.
	// Use to send netlink update informations to containerd.
	netNs2Containerd := func(netlinkUpdate supervisor.NetlinkUpdate) {
		if err := enc.Encode(netlinkUpdate); err != nil {
			glog.Info("err Encode(netlinkUpdate) is :", err)
		}
	}
	// Keep collecting network namespace info and sending to the containerd
	setupNetworkNsTrap(netNs2Containerd)
}