// InstantiatedPrepareAppUnitName returns the systemd service unit name for prepare-app // instantiated for the given root func InstantiatedPrepareAppUnitName(imageID types.Hash) string { // Naming respecting escaping rules, see systemd.unit(5) and systemd-escape(1) escaped_root := common.RelAppRootfsPath(imageID) escaped_root = strings.Replace(escaped_root, "-", "\\x2d", -1) escaped_root = strings.Replace(escaped_root, "/", "-", -1) return "prepare-app@" + escaped_root + ".service" }
// AppReaperUnit writes an app reaper service unit for the given app in the given path using the given unit options. func (uw *UnitWriter) AppReaperUnit(appName types.ACName, binPath string, opts ...*unit.UnitOption) { if uw.err != nil { return } opts = append(opts, []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s Reaper", appName)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"), unit.NewUnitOption("Unit", "Before", "halt.target"), unit.NewUnitOption("Unit", "Conflicts", "exit.target"), unit.NewUnitOption("Unit", "Conflicts", "halt.target"), unit.NewUnitOption("Unit", "Conflicts", "poweroff.target"), unit.NewUnitOption("Service", "RemainAfterExit", "yes"), unit.NewUnitOption("Service", "ExecStop", fmt.Sprintf( "/reaper.sh \"%s\" \"%s\" \"%s\"", appName, common.RelAppRootfsPath(appName), binPath, )), }...) uw.WriteUnit( ServiceUnitPath(uw.p.Root, types.ACName(fmt.Sprintf("reaper-%s", appName))), fmt.Sprintf("failed to write app %q reaper service", appName), opts..., ) }
// AppToSystemdMountUnits prepare bind mount unit for empty or host kind mounting // between stage1 rootfs and chrooted filesystem for application func AppToSystemdMountUnits(root string, appName types.ACName, mountPoints []types.MountPoint, unitsDir string) error { for _, mountPoint := range mountPoints { name := mountPoint.Name.String() // source relative to stage1 rootfs to relative pod root whatPath := filepath.Join(stage1MntDir, name) whatFullPath := filepath.Join(root, whatPath) // destination relative to stage1 rootfs and relative to pod root wherePath := filepath.Join(common.RelAppRootfsPath(appName), mountPoint.Path) whereFullPath := filepath.Join(root, wherePath) // readOnly mountOptions := "bind" if mountPoint.ReadOnly { mountOptions += ",ro" } // assertion to make sure that "what" exists (created earlier by podToSystemdHostMountUnits) log.Printf("checking required source path: %q", whatFullPath) if _, err := os.Stat(whatFullPath); os.IsNotExist(err) { return fmt.Errorf("app requires a volume that is not defined in Pod (try adding --volume=%s,kind=empty)!", name) } // optionally prepare app directory log.Printf("optionally preparing destination path: %q", whereFullPath) err := os.MkdirAll(whereFullPath, 0700) if err != nil { return fmt.Errorf("failed to prepare dir for mountPoint %v: %v", mountPoint.Name, err) } // install new mount unit for bind mount /mnt/volumeName -> /opt/stage2/{app-id}/rootfs/{{mountPoint.Path}} err = installNewMountUnit( root, // where put a mount unit whatPath, // what - stage1 rootfs /mnt/VolumeName wherePath, // where - inside chroot app filesystem "bind", // fstype mountOptions, serviceUnitName(appName), unitsDir, ) if err != nil { return fmt.Errorf("cannot install new mount unit for app %q: %v", appName.String(), err) } } return nil }
// generateDeviceAllows generates a DeviceAllow= line for an app. // To make it work, the path needs to start with "/dev" but the device won't // exist inside the container. So for a given mount, if the volume is a device // node, we create a symlink to its target in "/rkt/volumes". Later, // prepare-app will copy those to "/dev/.rkt/" so that's what we use in the // DeviceAllow= line. func generateDeviceAllows(root string, appName types.ACName, mountPoints []types.MountPoint, mounts []mountWrapper, vols map[types.ACName]types.Volume, uidRange *user.UidRange) ([]string, error) { var devAllow []string rktVolumeLinksPath := filepath.Join(root, "rkt", "volumes") if err := os.MkdirAll(rktVolumeLinksPath, 0600); err != nil { return nil, err } if err := user.ShiftFiles([]string{rktVolumeLinksPath}, uidRange); err != nil { return nil, err } for _, m := range mounts { v := vols[m.Volume] if v.Kind != "host" { continue } if fileutil.IsDeviceNode(v.Source) { mode := "r" if !IsMountReadOnly(v, mountPoints) { mode += "w" } tgt := filepath.Join(common.RelAppRootfsPath(appName), m.Path) // the DeviceAllow= line needs the link path in /dev/.rkt/ linkRel := filepath.Join("/dev/.rkt", v.Name.String()) // the real link should be in /rkt/volumes for now link := filepath.Join(rktVolumeLinksPath, v.Name.String()) err := os.Symlink(tgt, link) // if the link already exists, we don't need to do anything if err != nil && !os.IsExist(err) { return nil, err } devAllow = append(devAllow, linkRel+" "+mode) } } return devAllow, nil }
// appToSystemd transforms the provided RuntimeApp+ImageManifest into systemd units func (p *Pod) appToSystemd(ra *schema.RuntimeApp, interactive bool) error { name := ra.Name.String() id := ra.Image.ID app := ra.App workDir := "/" if app.WorkingDirectory != "" { workDir = app.WorkingDirectory } env := app.Environment env.Set("AC_APP_NAME", name) env.Set("AC_METADATA_URL", p.MetadataServiceURL) if err := p.writeEnvFile(env, id); err != nil { return fmt.Errorf("unable to write environment file: %v", err) } // This is a partial implementation for app.User and app.Group: // For now, only numeric ids (and the string "root") are supported. var uid, gid int var err error if app.User == "root" { uid = 0 } else { uid, err = strconv.Atoi(app.User) if err != nil { return fmt.Errorf("non-numerical user id not supported yet") } } if app.Group == "root" { gid = 0 } else { gid, err = strconv.Atoi(app.Group) if err != nil { return fmt.Errorf("non-numerical group id not supported yet") } } execWrap := []string{"/diagexec", common.RelAppRootfsPath(id), workDir, RelEnvFilePath(id), strconv.Itoa(uid), strconv.Itoa(gid)} execStart := quoteExec(append(execWrap, app.Exec...)) opts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", name), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "OnFailure", "reaper.service"), unit.NewUnitOption("Unit", "Wants", "exit-watcher.service"), unit.NewUnitOption("Service", "Restart", "no"), unit.NewUnitOption("Service", "ExecStart", execStart), unit.NewUnitOption("Service", "User", "0"), unit.NewUnitOption("Service", "Group", "0"), } _, systemdStage1Version, err := p.getFlavor() if err != nil { return fmt.Errorf("Failed to get stage1 flavor: %v\n", err) } if interactive { opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) } else if systemdSupportsJournalLinking(systemdStage1Version) { opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "SyslogIdentifier", filepath.Base(app.Exec[0]))) } for _, eh := range app.EventHandlers { var typ string switch eh.Name { case "pre-start": typ = "ExecStartPre" case "post-stop": typ = "ExecStopPost" default: return fmt.Errorf("unrecognized eventHandler: %v", eh.Name) } exec := quoteExec(append(execWrap, eh.Exec...)) opts = append(opts, unit.NewUnitOption("Service", typ, exec)) } saPorts := []types.Port{} for _, p := range app.Ports { if p.SocketActivated { saPorts = append(saPorts, p) } } for _, i := range app.Isolators { switch v := i.Value().(type) { case *types.ResourceMemory: limit := v.Limit().String() opts, err = cgroup.MaybeAddIsolator(opts, "memory", limit) if err != nil { return err } case *types.ResourceCPU: limit := v.Limit().String() opts, err = cgroup.MaybeAddIsolator(opts, "cpu", limit) if err != nil { return err } } } if len(saPorts) > 0 { sockopts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", name+" socket-activated ports"), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Socket", "BindIPv6Only", "both"), unit.NewUnitOption("Socket", "Service", ServiceUnitName(id)), } for _, sap := range saPorts { var proto string switch sap.Protocol { case "tcp": proto = "ListenStream" case "udp": proto = "ListenDatagram" default: return fmt.Errorf("unrecognized protocol: %v", sap.Protocol) } sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", sap.Port))) } file, err := os.OpenFile(SocketUnitPath(p.Root, id), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return fmt.Errorf("failed to create socket file: %v", err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { return fmt.Errorf("failed to write socket unit file: %v", err) } if err = os.Symlink(path.Join("..", SocketUnitName(id)), SocketWantPath(p.Root, id)); err != nil { return fmt.Errorf("failed to link socket want: %v", err) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(id))) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(id))) opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(id))) file, err := os.OpenFile(ServiceUnitPath(p.Root, id), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return fmt.Errorf("failed to create service unit file: %v", err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { return fmt.Errorf("failed to write service unit file: %v", err) } if err = os.Symlink(path.Join("..", ServiceUnitName(id)), ServiceWantPath(p.Root, id)); err != nil { return fmt.Errorf("failed to link service want: %v", err) } return nil }
// protectKernelTunables restricts access to some security-sensitive paths under // /proc and /sys. Entries are either hidden or just made read-only to app. // This protection is enabled by default. func protectKernelTunables(opts []*unit.UnitOption, appName types.ACName, systemdVersion int) []*unit.UnitOption { roPaths := []string{ "/proc/bus/", "/proc/sys/kernel/core_pattern", "/proc/sys/kernel/modprobe", "/proc/sys/vm/panic_on_oom", "/proc/sysrq-trigger", "/sys/block/", "/sys/bus/", "/sys/class/", "/sys/dev/", "/sys/devices/", "/sys/kernel/", } hiddenDirs := []string{ "/sys/firmware/", "/sys/fs/", "/sys/hypervisor/", "/sys/module/", "/sys/power/", } hiddenPaths := []string{ "/proc/config.gz", "/proc/kallsyms", "/proc/sched_debug", "/proc/kcore", "/proc/kmem", "/proc/mem", } // Paths prefixed with "-" are ignored if they do not exist: // https://www.freedesktop.org/software/systemd/man/systemd.exec.html#ReadWriteDirectories= for _, p := range roPaths { opts = append(opts, unit.NewUnitOption("Service", "ReadOnlyDirectories", fmt.Sprintf("-%s", filepath.Join(common.RelAppRootfsPath(appName), p)))) } for _, p := range hiddenDirs { opts = append(opts, unit.NewUnitOption("Service", "InaccessibleDirectories", fmt.Sprintf("-%s", filepath.Join(common.RelAppRootfsPath(appName), p)))) } if systemdVersion >= 231 { for _, p := range hiddenPaths { opts = append(opts, unit.NewUnitOption("Service", "InaccessiblePaths", fmt.Sprintf("-%s", filepath.Join(common.RelAppRootfsPath(appName), p)))) } } if systemdVersion >= 233 { opts = append(opts, unit.NewUnitOption("Service", "ProtectKernelTunables", "true")) } return opts }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func appToNspawnArgs(p *stage1commontypes.Pod, ra *schema.RuntimeApp) ([]string, error) { var args []string appName := ra.Name app := ra.App sharedVolPath, err := common.CreateSharedVolumesPath(p.Root) if err != nil { return nil, err } vols := make(map[types.ACName]types.Volume) for _, v := range p.Manifest.Volumes { vols[v.Name] = v } imageManifest := p.Images[appName.String()] mounts, err := GenerateMounts(ra, p.Manifest.Volumes, ConvertedFromDocker(imageManifest)) if err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not generate app %q mounts", appName), err) } for _, m := range mounts { shPath := filepath.Join(sharedVolPath, m.Volume.Name.String()) absRoot, err := filepath.Abs(p.Root) // Absolute path to the pod's rootfs. if err != nil { return nil, errwrap.Wrap(errors.New("could not get pod's root absolute path"), err) } appRootfs := common.AppRootfsPath(absRoot, appName) // Evaluate symlinks within the app's rootfs. This is needed because symlinks // within the container can be absolute, which will, of course, be wrong in our ns. // Systemd also gets this wrong, see https://github.com/systemd/systemd/issues/2860 // When the above issue is fixed, we can pass the un-evaluated path to --bind instead. mntPath, err := EvaluateSymlinksInsideApp(appRootfs, m.Mount.Path) if err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not evaluate path %v", m.Mount.Path), err) } mntAbsPath := filepath.Join(appRootfs, mntPath) if err := PrepareMountpoints(shPath, mntAbsPath, &m.Volume, m.DockerImplicit); err != nil { return nil, err } opt := make([]string, 6) if m.ReadOnly { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } opt[1] = m.Source(absRoot) opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), mntPath) opt[4] = ":" // If Recursive is not set, default to recursive. recursive := true if m.Volume.Recursive != nil { recursive = *m.Volume.Recursive } // rbind/norbind options exist since systemd-nspawn v226 if recursive { opt[5] = "rbind" } else { opt[5] = "norbind" } args = append(args, strings.Join(opt, "")) } if !p.InsecureOptions.DisableCapabilities { capabilitiesStr, err := getAppCapabilities(app.Isolators) if err != nil { return nil, err } capList := strings.Join(capabilitiesStr, ",") args = append(args, "--capability="+capList) } return args, nil }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func (p *Pod) appToNspawnArgs(ra *schema.RuntimeApp) ([]string, error) { var args []string appName := ra.Name id := ra.Image.ID app := ra.App vols := make(map[types.ACName]types.Volume) // TODO(philips): this is implicitly creating a mapping from MountPoint // to volumes. This is a nice convenience for users but we will need to // introduce a --mount flag so they can control which mountPoint maps to // which volume. sharedVolPath := common.SharedVolumesPath(p.Root) if err := os.MkdirAll(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volumes directory: %v", err) } if err := os.Chmod(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not change permissions of %q: %v", sharedVolPath, err) } for _, v := range p.Manifest.Volumes { vols[v.Name] = v if v.Kind == "empty" { if err := os.MkdirAll(filepath.Join(sharedVolPath, v.Name.String()), sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volume %q: %v", v.Name, err) } } } for _, mp := range app.MountPoints { key := mp.Name vol, ok := vols[key] if !ok { catCmd := fmt.Sprintf("sudo rkt image cat-manifest --pretty-print %v", id) volumeCmd := "" for _, mp := range app.MountPoints { volumeCmd += fmt.Sprintf("--volume %s,kind=host,source=/some/path ", mp.Name) } return nil, fmt.Errorf("no volume for mountpoint %q in app %q.\n"+ "You can inspect the volumes with:\n\t%v\n"+ "App %q requires the following volumes:\n\t%v", key, appName, catCmd, appName, volumeCmd) } opt := make([]string, 4) // If the readonly flag in the pod manifest is not nil, // then use it to override the readonly flag in the image manifest. readOnly := mp.ReadOnly if vol.ReadOnly != nil { readOnly = *vol.ReadOnly } if readOnly { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } switch vol.Kind { case "host": opt[1] = vol.Source case "empty": absRoot, err := filepath.Abs(p.Root) if err != nil { return nil, fmt.Errorf("cannot get pod's root absolute path: %v\n", err) } opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), vol.Name.String()) default: return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty".`, vol.Kind) } opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), mp.Path) args = append(args, strings.Join(opt, "")) } for _, i := range app.Isolators { switch v := i.Value().(type) { case types.LinuxCapabilitiesSet: var caps []string // TODO: cleanup the API on LinuxCapabilitiesSet to give strings easily. for _, c := range v.Set() { caps = append(caps, string(c)) } if i.Name == types.LinuxCapabilitiesRetainSetName { capList := strings.Join(caps, ",") args = append(args, "--capability="+capList) } } } return args, nil }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func appToNspawnArgs(p *stage1commontypes.Pod, ra *schema.RuntimeApp) ([]string, error) { var args []string appName := ra.Name app := ra.App sharedVolPath := common.SharedVolumesPath(p.Root) if err := os.MkdirAll(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volumes directory: %v", err) } if err := os.Chmod(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not change permissions of %q: %v", sharedVolPath, err) } vols := make(map[types.ACName]types.Volume) for _, v := range p.Manifest.Volumes { vols[v.Name] = v } mounts := GenerateMounts(ra, vols) for _, m := range mounts { vol := vols[m.Volume] if vol.Kind == "empty" { p := filepath.Join(sharedVolPath, vol.Name.String()) if err := os.MkdirAll(p, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volume %q: %v", vol.Name, err) } if err := os.Chown(p, *vol.UID, *vol.GID); err != nil { return nil, fmt.Errorf("could not change owner of %q: %v", p, err) } mod, err := strconv.ParseUint(*vol.Mode, 8, 32) if err != nil { return nil, fmt.Errorf("invalid mode %q for volume %q: %v", *vol.Mode, vol.Name, err) } if err := os.Chmod(p, os.FileMode(mod)); err != nil { return nil, fmt.Errorf("could not change permissions of %q: %v", p, err) } } opt := make([]string, 4) if IsMountReadOnly(vol, app.MountPoints) { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } switch vol.Kind { case "host": opt[1] = vol.Source case "empty": absRoot, err := filepath.Abs(p.Root) if err != nil { return nil, fmt.Errorf("cannot get pod's root absolute path: %v\n", err) } opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), vol.Name.String()) default: return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty"`, vol.Kind) } opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), m.Path) args = append(args, strings.Join(opt, "")) } for _, i := range app.Isolators { switch v := i.Value().(type) { case types.LinuxCapabilitiesSet: var caps []string // TODO: cleanup the API on LinuxCapabilitiesSet to give strings easily. for _, c := range v.Set() { caps = append(caps, string(c)) } if i.Name == types.LinuxCapabilitiesRetainSetName { capList := strings.Join(caps, ",") args = append(args, "--capability="+capList) } } } return args, nil }
// appToSystemd transforms the provided RuntimeApp+ImageManifest into systemd units func appToSystemd(p *stage1commontypes.Pod, ra *schema.RuntimeApp, interactive bool, flavor string, privateUsers string) error { app := ra.App appName := ra.Name imgName := p.AppNameToImageName(appName) if len(app.Exec) == 0 { return fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName) } workDir := "/" if app.WorkingDirectory != "" { workDir = app.WorkingDirectory } env := app.Environment env.Set("AC_APP_NAME", appName.String()) if p.MetadataServiceURL != "" { env.Set("AC_METADATA_URL", p.MetadataServiceURL) } if err := writeEnvFile(p, env, appName, privateUsers); err != nil { return errwrap.Wrap(errors.New("unable to write environment file"), err) } var _uid, gid int var err error uidRange := uid.NewBlankUidRange() if err := uidRange.Deserialize([]byte(privateUsers)); err != nil { return errwrap.Wrap(errors.New("unable to deserialize uid range"), err) } if strings.HasPrefix(app.User, "/") { var stat syscall.Stat_t if err = syscall.Lstat(filepath.Join(common.AppRootfsPath(p.Root, appName), app.User), &stat); err != nil { return errwrap.Wrap(fmt.Errorf("unable to get uid from file %q", app.User), err) } uidReal, _, err := uidRange.UnshiftRange(stat.Uid, 0) if err != nil { return errwrap.Wrap(errors.New("unable to determine real uid"), err) } _uid = int(uidReal) } else { _uid, err = strconv.Atoi(app.User) if err != nil { _uid, err = passwd.LookupUidFromFile(app.User, filepath.Join(common.AppRootfsPath(p.Root, appName), "etc/passwd")) if err != nil { return errwrap.Wrap(fmt.Errorf("cannot lookup user %q", app.User), err) } } } if strings.HasPrefix(app.Group, "/") { var stat syscall.Stat_t if err = syscall.Lstat(filepath.Join(common.AppRootfsPath(p.Root, appName), app.Group), &stat); err != nil { return errwrap.Wrap(fmt.Errorf("unable to get gid from file %q", app.Group), err) } _, gidReal, err := uidRange.UnshiftRange(0, stat.Gid) if err != nil { return errwrap.Wrap(errors.New("unable to determine real gid"), err) } gid = int(gidReal) } else { gid, err = strconv.Atoi(app.Group) if err != nil { gid, err = group.LookupGidFromFile(app.Group, filepath.Join(common.AppRootfsPath(p.Root, appName), "etc/group")) if err != nil { return errwrap.Wrap(fmt.Errorf("cannot lookup group %q", app.Group), err) } } } execWrap := []string{"/appexec", common.RelAppRootfsPath(appName), workDir, RelEnvFilePath(appName), strconv.Itoa(_uid), generateGidArg(gid, app.SupplementaryGIDs), "--"} execStart := quoteExec(append(execWrap, app.Exec...)) opts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), unit.NewUnitOption("Service", "Restart", "no"), unit.NewUnitOption("Service", "ExecStart", execStart), unit.NewUnitOption("Service", "User", "0"), unit.NewUnitOption("Service", "Group", "0"), } if interactive { opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) } else { opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "SyslogIdentifier", filepath.Base(app.Exec[0]))) } // When an app fails, we shut down the pod opts = append(opts, unit.NewUnitOption("Unit", "OnFailure", "halt.target")) for _, eh := range app.EventHandlers { var typ string switch eh.Name { case "pre-start": typ = "ExecStartPre" case "post-stop": typ = "ExecStopPost" default: return fmt.Errorf("unrecognized eventHandler: %v", eh.Name) } exec := quoteExec(append(execWrap, eh.Exec...)) opts = append(opts, unit.NewUnitOption("Service", typ, exec)) } // Some pre-start jobs take a long time, set the timeout to 0 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) var saPorts []types.Port for _, p := range app.Ports { if p.SocketActivated { saPorts = append(saPorts, p) } } for _, i := range app.Isolators { switch v := i.Value().(type) { case *types.ResourceMemory: opts, err = cgroup.MaybeAddIsolator(opts, "memory", v.Limit()) if err != nil { return err } case *types.ResourceCPU: opts, err = cgroup.MaybeAddIsolator(opts, "cpu", v.Limit()) if err != nil { return err } } } if len(saPorts) > 0 { sockopts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Socket", "BindIPv6Only", "both"), unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), } for _, sap := range saPorts { var proto string switch sap.Protocol { case "tcp": proto = "ListenStream" case "udp": proto = "ListenDatagram" default: return fmt.Errorf("unrecognized protocol: %v", sap.Protocol) } // We find the host port for the pod's port and use that in the // socket unit file. // This is so because systemd inside the pod will match based on // the socket port number, and since the socket was created on the // host, it will have the host port number. port := findHostPort(*p.Manifest, sap.Name) if port == 0 { log.Printf("warning: no --port option for socket-activated port %q, assuming port %d as specified in the manifest", sap.Name, sap.Port) port = sap.Port } sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", port))) } file, err := os.OpenFile(SocketUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return errwrap.Wrap(errors.New("failed to create socket file"), err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { return errwrap.Wrap(errors.New("failed to write socket unit file"), err) } if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(p.Root, appName)); err != nil { return errwrap.Wrap(errors.New("failed to link socket want"), err) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(appName))) file, err := os.OpenFile(ServiceUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return errwrap.Wrap(errors.New("failed to create service unit file"), err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { return errwrap.Wrap(errors.New("failed to write service unit file"), err) } if err = os.Symlink(path.Join("..", ServiceUnitName(appName)), ServiceWantPath(p.Root, appName)); err != nil { return errwrap.Wrap(errors.New("failed to link service want"), err) } if flavor == "kvm" { // bind mount all shared volumes from /mnt/volumeName (we don't use mechanism for bind-mounting given by nspawn) err := AppToSystemdMountUnits(common.Stage1RootfsPath(p.Root), appName, p.Manifest.Volumes, ra, UnitsDir) if err != nil { return errwrap.Wrap(errors.New("failed to prepare mount units"), err) } } if err = writeAppReaper(p, appName.String()); err != nil { return errwrap.Wrap(fmt.Errorf("failed to write app %q reaper service", appName), err) } return nil }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func appToNspawnArgs(p *stage1commontypes.Pod, ra *schema.RuntimeApp) ([]string, error) { var args []string appName := ra.Name app := ra.App sharedVolPath := common.SharedVolumesPath(p.Root) if err := os.MkdirAll(sharedVolPath, SharedVolPerm); err != nil { return nil, errwrap.Wrap(errors.New("could not create shared volumes directory"), err) } if err := os.Chmod(sharedVolPath, SharedVolPerm); err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not change permissions of %q", sharedVolPath), err) } vols := make(map[types.ACName]types.Volume) for _, v := range p.Manifest.Volumes { vols[v.Name] = v } imageManifest := p.Images[appName.String()] mounts := GenerateMounts(ra, vols, imageManifest) for _, m := range mounts { vol := vols[m.Volume] shPath := filepath.Join(sharedVolPath, vol.Name.String()) absRoot, err := filepath.Abs(p.Root) // Absolute path to the pod's rootfs. if err != nil { return nil, errwrap.Wrap(errors.New("could not get pod's root absolute path"), err) } appRootfs := common.AppRootfsPath(absRoot, appName) // TODO(yifan): This is a temporary fix for systemd-nspawn not handling symlink mounts well. // Could be removed when https://github.com/systemd/systemd/issues/2860 is resolved, and systemd // version is bumped. mntPath, err := EvaluateSymlinksInsideApp(appRootfs, m.Path) if err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not evaluate path %v", m.Path), err) } mntAbsPath := filepath.Join(appRootfs, mntPath) if err := PrepareMountpoints(shPath, mntAbsPath, &vol, m.DockerImplicit); err != nil { return nil, err } opt := make([]string, 4) if IsMountReadOnly(vol, app.MountPoints) { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } switch vol.Kind { case "host": opt[1] = vol.Source case "empty": opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), vol.Name.String()) default: return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty"`, vol.Kind) } opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), mntPath) args = append(args, strings.Join(opt, "")) } capabilitiesStr, err := getAppCapabilities(app.Isolators) if err != nil { return nil, err } capList := strings.Join(capabilitiesStr, ",") args = append(args, "--capability="+capList) return args, nil }
// appToSystemd transforms the provided RuntimeApp+ImageManifest into systemd units func appToSystemd(p *stage1commontypes.Pod, ra *schema.RuntimeApp, interactive bool, flavor string, privateUsers string) error { app := ra.App appName := ra.Name imgName := p.AppNameToImageName(appName) if len(app.Exec) == 0 { return fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName) } workDir := "/" if app.WorkingDirectory != "" { workDir = app.WorkingDirectory } env := app.Environment env.Set("AC_APP_NAME", appName.String()) if p.MetadataServiceURL != "" { env.Set("AC_METADATA_URL", p.MetadataServiceURL) } envFilePath := EnvFilePath(p.Root, appName) uidRange := user.NewBlankUidRange() if err := uidRange.Deserialize([]byte(privateUsers)); err != nil { return err } if err := writeEnvFile(p, env, appName, uidRange, '\n', envFilePath); err != nil { return errwrap.Wrap(errors.New("unable to write environment file for systemd"), err) } u, g, err := parseUserGroup(p, ra, uidRange) if err != nil { return err } if err := generateSysusers(p, ra, u, g, uidRange); err != nil { return errwrap.Wrap(errors.New("unable to generate sysusers"), err) } binPath, err := findBinPath(p, appName, *app, workDir, app.Exec[0]) if err != nil { return err } var supplementaryGroups []string for _, g := range app.SupplementaryGIDs { supplementaryGroups = append(supplementaryGroups, strconv.Itoa(g)) } capabilitiesStr, err := getAppCapabilities(app.Isolators) if err != nil { return err } noNewPrivileges := getAppNoNewPrivileges(app.Isolators) execStart := append([]string{binPath}, app.Exec[1:]...) execStartString := quoteExec(execStart) opts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), unit.NewUnitOption("Service", "Restart", "no"), unit.NewUnitOption("Service", "ExecStart", execStartString), unit.NewUnitOption("Service", "RootDirectory", common.RelAppRootfsPath(appName)), // MountFlags=shared creates a new mount namespace and (as unintuitive // as it might seem) makes sure the mount is slave+shared. unit.NewUnitOption("Service", "MountFlags", "shared"), unit.NewUnitOption("Service", "WorkingDirectory", workDir), unit.NewUnitOption("Service", "EnvironmentFile", RelEnvFilePath(appName)), unit.NewUnitOption("Service", "User", strconv.Itoa(u)), unit.NewUnitOption("Service", "Group", strconv.Itoa(g)), unit.NewUnitOption("Service", "SupplementaryGroups", strings.Join(supplementaryGroups, " ")), unit.NewUnitOption("Service", "CapabilityBoundingSet", strings.Join(capabilitiesStr, " ")), unit.NewUnitOption("Service", "NoNewPrivileges", strconv.FormatBool(noNewPrivileges)), // This helps working around a race // (https://github.com/systemd/systemd/issues/2913) that causes the // systemd unit name not getting written to the journal if the unit is // short-lived and runs as non-root. unit.NewUnitOption("Service", "SyslogIdentifier", appName.String()), } // Restrict access to sensitive paths (eg. procfs) opts = protectSystemFiles(opts, appName) if ra.ReadOnlyRootFS { opts = append(opts, unit.NewUnitOption("Service", "ReadOnlyDirectories", common.RelAppRootfsPath(appName))) } // TODO(tmrts): Extract this logic into a utility function. vols := make(map[types.ACName]types.Volume) for _, v := range p.Manifest.Volumes { vols[v.Name] = v } absRoot, err := filepath.Abs(p.Root) // Absolute path to the pod's rootfs. if err != nil { return err } appRootfs := common.AppRootfsPath(absRoot, appName) rwDirs := []string{} imageManifest := p.Images[appName.String()] for _, m := range GenerateMounts(ra, vols, imageManifest) { mntPath, err := EvaluateSymlinksInsideApp(appRootfs, m.Path) if err != nil { return err } if !IsMountReadOnly(vols[m.Volume], app.MountPoints) { rwDirs = append(rwDirs, filepath.Join(common.RelAppRootfsPath(appName), mntPath)) } } opts = append(opts, unit.NewUnitOption("Service", "ReadWriteDirectories", strings.Join(rwDirs, " "))) if interactive { opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) } else { opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) } // When an app fails, we shut down the pod opts = append(opts, unit.NewUnitOption("Unit", "OnFailure", "halt.target")) for _, eh := range app.EventHandlers { var typ string switch eh.Name { case "pre-start": typ = "ExecStartPre" case "post-stop": typ = "ExecStopPost" default: return fmt.Errorf("unrecognized eventHandler: %v", eh.Name) } exec := quoteExec(eh.Exec) opts = append(opts, unit.NewUnitOption("Service", typ, exec)) } // Some pre-start jobs take a long time, set the timeout to 0 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) var saPorts []types.Port for _, p := range app.Ports { if p.SocketActivated { saPorts = append(saPorts, p) } } for _, i := range app.Isolators { switch v := i.Value().(type) { case *types.ResourceMemory: opts, err = cgroup.MaybeAddIsolator(opts, "memory", v.Limit()) if err != nil { return err } case *types.ResourceCPU: opts, err = cgroup.MaybeAddIsolator(opts, "cpu", v.Limit()) if err != nil { return err } } } if len(saPorts) > 0 { sockopts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Socket", "BindIPv6Only", "both"), unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), } for _, sap := range saPorts { var proto string switch sap.Protocol { case "tcp": proto = "ListenStream" case "udp": proto = "ListenDatagram" default: return fmt.Errorf("unrecognized protocol: %v", sap.Protocol) } // We find the host port for the pod's port and use that in the // socket unit file. // This is so because systemd inside the pod will match based on // the socket port number, and since the socket was created on the // host, it will have the host port number. port := findHostPort(*p.Manifest, sap.Name) if port == 0 { log.Printf("warning: no --port option for socket-activated port %q, assuming port %d as specified in the manifest", sap.Name, sap.Port) port = sap.Port } sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", port))) } file, err := os.OpenFile(SocketUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return errwrap.Wrap(errors.New("failed to create socket file"), err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { return errwrap.Wrap(errors.New("failed to write socket unit file"), err) } if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(p.Root, appName)); err != nil { return errwrap.Wrap(errors.New("failed to link socket want"), err) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "Requires", "sysusers.service")) opts = append(opts, unit.NewUnitOption("Unit", "After", "sysusers.service")) file, err := os.OpenFile(ServiceUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return errwrap.Wrap(errors.New("failed to create service unit file"), err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { return errwrap.Wrap(errors.New("failed to write service unit file"), err) } if err = os.Symlink(path.Join("..", ServiceUnitName(appName)), ServiceWantPath(p.Root, appName)); err != nil { return errwrap.Wrap(errors.New("failed to link service want"), err) } if err = writeAppReaper(p, appName.String(), common.RelAppRootfsPath(appName), binPath); err != nil { return errwrap.Wrap(fmt.Errorf("failed to write app %q reaper service", appName), err) } return nil }
func (uw *UnitWriter) AppUnit( ra *schema.RuntimeApp, binPath, privateUsers string, insecureOptions Stage1InsecureOptions, opts ...*unit.UnitOption, ) { if uw.err != nil { return } flavor, systemdVersion, err := GetFlavor(uw.p) if err != nil { uw.err = errwrap.Wrap(errors.New("unable to determine stage1 flavor"), err) return } app := ra.App appName := ra.Name imgName := uw.p.AppNameToImageName(appName) if len(app.Exec) == 0 { uw.err = fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName) return } env := app.Environment env.Set("AC_APP_NAME", appName.String()) if uw.p.MetadataServiceURL != "" { env.Set("AC_METADATA_URL", uw.p.MetadataServiceURL) } envFilePath := EnvFilePath(uw.p.Root, appName) uidRange := user.NewBlankUidRange() if err := uidRange.Deserialize([]byte(privateUsers)); err != nil { uw.err = err return } if err := common.WriteEnvFile(env, uidRange, envFilePath); err != nil { uw.err = errwrap.Wrap(errors.New("unable to write environment file for systemd"), err) return } u, g, err := parseUserGroup(uw.p, ra, uidRange) if err != nil { uw.err = err return } if err := generateSysusers(uw.p, ra, u, g, uidRange); err != nil { uw.err = errwrap.Wrap(errors.New("unable to generate sysusers"), err) return } var supplementaryGroups []string for _, g := range app.SupplementaryGIDs { supplementaryGroups = append(supplementaryGroups, strconv.Itoa(g)) } capabilitiesStr, err := getAppCapabilities(app.Isolators) if err != nil { uw.err = err return } execStart := append([]string{binPath}, app.Exec[1:]...) execStartString := quoteExec(execStart) opts = append(opts, []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), unit.NewUnitOption("Service", "Restart", "no"), unit.NewUnitOption("Service", "ExecStart", execStartString), unit.NewUnitOption("Service", "RootDirectory", common.RelAppRootfsPath(appName)), // MountFlags=shared creates a new mount namespace and (as unintuitive // as it might seem) makes sure the mount is slave+shared. unit.NewUnitOption("Service", "MountFlags", "shared"), unit.NewUnitOption("Service", "WorkingDirectory", app.WorkingDirectory), unit.NewUnitOption("Service", "EnvironmentFile", RelEnvFilePath(appName)), unit.NewUnitOption("Service", "User", strconv.Itoa(u)), unit.NewUnitOption("Service", "Group", strconv.Itoa(g)), // This helps working around a race // (https://github.com/systemd/systemd/issues/2913) that causes the // systemd unit name not getting written to the journal if the unit is // short-lived and runs as non-root. unit.NewUnitOption("Service", "SyslogIdentifier", appName.String()), }...) if len(supplementaryGroups) > 0 { opts = appendOptionsList(opts, "Service", "SupplementaryGroups", "", supplementaryGroups) } if supportsNotify(uw.p, appName.String()) { opts = append(opts, unit.NewUnitOption("Service", "Type", "notify")) } if !insecureOptions.DisableCapabilities { opts = append(opts, unit.NewUnitOption("Service", "CapabilityBoundingSet", strings.Join(capabilitiesStr, " "))) } noNewPrivileges := getAppNoNewPrivileges(app.Isolators) // Apply seccomp isolator, if any and not opt-ing out; // see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter= if !insecureOptions.DisableSeccomp { var forceNoNewPrivileges bool unprivileged := (u != 0) opts, forceNoNewPrivileges, err = getSeccompFilter(opts, uw.p, unprivileged, app.Isolators) if err != nil { uw.err = err return } // Seccomp filters require NoNewPrivileges for unprivileged apps, that may override // manifest annotation. if forceNoNewPrivileges { noNewPrivileges = true } } opts = append(opts, unit.NewUnitOption("Service", "NoNewPrivileges", strconv.FormatBool(noNewPrivileges))) if ra.ReadOnlyRootFS { opts = append(opts, unit.NewUnitOption("Service", "ReadOnlyDirectories", common.RelAppRootfsPath(appName))) } // TODO(tmrts): Extract this logic into a utility function. vols := make(map[types.ACName]types.Volume) for _, v := range uw.p.Manifest.Volumes { vols[v.Name] = v } absRoot, err := filepath.Abs(uw.p.Root) // Absolute path to the pod's rootfs. if err != nil { uw.err = err return } appRootfs := common.AppRootfsPath(absRoot, appName) rwDirs := []string{} imageManifest := uw.p.Images[appName.String()] mounts := GenerateMounts(ra, vols, imageManifest) for _, m := range mounts { mntPath, err := EvaluateSymlinksInsideApp(appRootfs, m.Path) if err != nil { uw.err = err return } if !IsMountReadOnly(vols[m.Volume], app.MountPoints) { rwDirs = append(rwDirs, filepath.Join(common.RelAppRootfsPath(appName), mntPath)) } } if len(rwDirs) > 0 { opts = appendOptionsList(opts, "Service", "ReadWriteDirectories", "", rwDirs) } // Restrict access to sensitive paths (eg. procfs and sysfs entries). if !insecureOptions.DisablePaths { opts = protectKernelTunables(opts, appName, systemdVersion) } // Generate default device policy for the app, as well as the list of allowed devices. // For kvm flavor, devices are VM-specific and restricting them is not strictly needed. if !insecureOptions.DisablePaths && flavor != "kvm" { opts = append(opts, unit.NewUnitOption("Service", "DevicePolicy", "closed")) deviceAllows, err := generateDeviceAllows(common.Stage1RootfsPath(absRoot), appName, app.MountPoints, mounts, vols, uidRange) if err != nil { uw.err = err return } for _, dev := range deviceAllows { opts = append(opts, unit.NewUnitOption("Service", "DeviceAllow", dev)) } } // When an app fails, we shut down the pod opts = append(opts, unit.NewUnitOption("Unit", "OnFailure", "halt.target")) for _, eh := range app.EventHandlers { var typ string switch eh.Name { case "pre-start": typ = "ExecStartPre" case "post-stop": typ = "ExecStopPost" default: uw.err = fmt.Errorf("unrecognized eventHandler: %v", eh.Name) return } exec := quoteExec(eh.Exec) opts = append(opts, unit.NewUnitOption("Service", typ, exec)) } // Some pre-start jobs take a long time, set the timeout to 0 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) var saPorts []types.Port for _, p := range app.Ports { if p.SocketActivated { saPorts = append(saPorts, p) } } doWithIsolator := func(isolator string, f func() error) bool { ok, err := cgroup.IsIsolatorSupported(isolator) if err != nil { uw.err = err return true } if !ok { fmt.Fprintf(os.Stderr, "warning: resource/%s isolator set but support disabled in the kernel, skipping\n", isolator) } if err := f(); err != nil { uw.err = err return true } return false } exit := false for _, i := range app.Isolators { if exit { return } switch v := i.Value().(type) { case *types.ResourceMemory: exit = doWithIsolator("memory", func() error { if v.Limit() == nil { return nil } opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.Itoa(int(v.Limit().Value())))) return nil }) case *types.ResourceCPU: exit = doWithIsolator("cpu", func() error { if v.Limit() == nil { return nil } if v.Limit().Value() > resource.MaxMilliValue { return fmt.Errorf("cpu limit exceeds the maximum millivalue: %v", v.Limit().String()) } quota := strconv.Itoa(int(v.Limit().MilliValue()/10)) + "%" opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota)) return nil }) } } if len(saPorts) > 0 { sockopts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Socket", "BindIPv6Only", "both"), unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), } for _, sap := range saPorts { var proto string switch sap.Protocol { case "tcp": proto = "ListenStream" case "udp": proto = "ListenDatagram" default: uw.err = fmt.Errorf("unrecognized protocol: %v", sap.Protocol) return } // We find the host port for the pod's port and use that in the // socket unit file. // This is so because systemd inside the pod will match based on // the socket port number, and since the socket was created on the // host, it will have the host port number. port := findHostPort(*uw.p.Manifest, sap.Name) if port == 0 { log.Printf("warning: no --port option for socket-activated port %q, assuming port %d as specified in the manifest", sap.Name, sap.Port) port = sap.Port } sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", port))) } file, err := os.OpenFile(SocketUnitPath(uw.p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { uw.err = errwrap.Wrap(errors.New("failed to create socket file"), err) return } defer file.Close() if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { uw.err = errwrap.Wrap(errors.New("failed to write socket unit file"), err) return } if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(uw.p.Root, appName)); err != nil { uw.err = errwrap.Wrap(errors.New("failed to link socket want"), err) return } opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "Requires", "sysusers.service")) opts = append(opts, unit.NewUnitOption("Unit", "After", "sysusers.service")) uw.WriteUnit(ServiceUnitPath(uw.p.Root, appName), "failed to create service unit file", opts...) uw.Activate(ServiceUnitName(appName), ServiceWantPath(uw.p.Root, appName)) }
// appToNspawnArgs transforms the given app manifest, with the given associated // app image id, into a subset of applicable systemd-nspawn argument func (p *Pod) appToNspawnArgs(ra *schema.RuntimeApp) ([]string, error) { args := []string{} name := ra.Name.String() id := ra.Image.ID app := ra.App vols := make(map[types.ACName]types.Volume) // TODO(philips): this is implicitly creating a mapping from MountPoint // to volumes. This is a nice convenience for users but we will need to // introduce a --mount flag so they can control which mountPoint maps to // which volume. for _, v := range p.Manifest.Volumes { vols[v.Name] = v } for _, mp := range app.MountPoints { key := mp.Name vol, ok := vols[key] if !ok { catCmd := fmt.Sprintf("sudo rkt image cat-manifest --pretty-print %v", id) volumeCmd := "" for _, mp := range app.MountPoints { volumeCmd += fmt.Sprintf("--volume %s,kind=host,source=/some/path ", mp.Name) } return nil, fmt.Errorf("no volume for mountpoint %q in app %q.\n"+ "You can inspect the volumes with:\n\t%v\n"+ "App %q requires the following volumes:\n\t%v", key, name, catCmd, name, volumeCmd) } opt := make([]string, 4) // If the readonly flag in the pod manifest is not nil, // then use it to override the readonly flag in the image manifest. readOnly := mp.ReadOnly if vol.ReadOnly != nil { readOnly = *vol.ReadOnly } if readOnly { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } opt[1] = vol.Source opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(id), mp.Path) args = append(args, strings.Join(opt, "")) } for _, i := range app.Isolators { switch v := i.Value().(type) { case types.LinuxCapabilitiesSet: var caps []string // TODO: cleanup the API on LinuxCapabilitiesSet to give strings easily. for _, c := range v.Set() { caps = append(caps, string(c)) } if i.Name == types.LinuxCapabilitiesRetainSetName { capList := strings.Join(caps, ",") args = append(args, "--capability="+capList) } } } return args, nil }
// InstantiatedPrepareAppUnitName returns the systemd service unit name for prepare-app // instantiated for the given root. func InstantiatedPrepareAppUnitName(appName types.ACName) string { // Naming respecting escaping rules, see systemd.unit(5) and systemd-escape(1) escapedRoot := unit.UnitNamePathEscape(common.RelAppRootfsPath(appName)) return "prepare-app@" + escapedRoot + ".service" }
// appToSystemd transforms the provided RuntimeApp+ImageManifest into systemd units func appToSystemd(p *stage1commontypes.Pod, ra *schema.RuntimeApp, interactive bool, flavor string, privateUsers string) error { app := ra.App appName := ra.Name image, ok := p.Images[appName.String()] if !ok { // This is impossible as we have updated the map in LoadPod(). panic(fmt.Sprintf("No images for app %q", ra.Name.String())) } imgName := image.Name if len(app.Exec) == 0 { return fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`, imgName) } workDir := "/" if app.WorkingDirectory != "" { workDir = app.WorkingDirectory } env := app.Environment env.Set("AC_APP_NAME", appName.String()) if p.MetadataServiceURL != "" { env.Set("AC_METADATA_URL", p.MetadataServiceURL) } if err := writeEnvFile(p, env, appName, privateUsers); err != nil { return fmt.Errorf("unable to write environment file: %v", err) } // This is a partial implementation for app.User and app.Group: // For now, only numeric ids (and the string "root") are supported. var uid, gid int var err error if app.User == "root" { uid = 0 } else { uid, err = strconv.Atoi(app.User) if err != nil { return fmt.Errorf("non-numerical user id not supported yet") } } if app.Group == "root" { gid = 0 } else { gid, err = strconv.Atoi(app.Group) if err != nil { return fmt.Errorf("non-numerical group id not supported yet") } } execWrap := []string{"/appexec", common.RelAppRootfsPath(appName), workDir, RelEnvFilePath(appName), strconv.Itoa(uid), generateGidArg(gid, app.SupplementaryGIDs)} execStart := quoteExec(append(execWrap, app.Exec...)) opts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), unit.NewUnitOption("Service", "Restart", "no"), unit.NewUnitOption("Service", "ExecStart", execStart), unit.NewUnitOption("Service", "User", "0"), unit.NewUnitOption("Service", "Group", "0"), } if interactive { opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) } else { opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) opts = append(opts, unit.NewUnitOption("Service", "SyslogIdentifier", filepath.Base(app.Exec[0]))) } // When an app fails, we shut down the pod opts = append(opts, unit.NewUnitOption("Unit", "OnFailure", "halt.target")) for _, eh := range app.EventHandlers { var typ string switch eh.Name { case "pre-start": typ = "ExecStartPre" case "post-stop": typ = "ExecStopPost" default: return fmt.Errorf("unrecognized eventHandler: %v", eh.Name) } exec := quoteExec(append(execWrap, eh.Exec...)) opts = append(opts, unit.NewUnitOption("Service", typ, exec)) } // Some pre-start jobs take a long time, set the timeout to 0 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) var saPorts []types.Port for _, p := range app.Ports { if p.SocketActivated { saPorts = append(saPorts, p) } } for _, i := range app.Isolators { switch v := i.Value().(type) { case *types.ResourceMemory: opts, err = cgroup.MaybeAddIsolator(opts, "memory", v.Limit()) if err != nil { return err } case *types.ResourceCPU: opts, err = cgroup.MaybeAddIsolator(opts, "cpu", v.Limit()) if err != nil { return err } } } if len(saPorts) > 0 { sockopts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Socket", "BindIPv6Only", "both"), unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), } for _, sap := range saPorts { var proto string switch sap.Protocol { case "tcp": proto = "ListenStream" case "udp": proto = "ListenDatagram" default: return fmt.Errorf("unrecognized protocol: %v", sap.Protocol) } sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", sap.Port))) } file, err := os.OpenFile(SocketUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return fmt.Errorf("failed to create socket file: %v", err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { return fmt.Errorf("failed to write socket unit file: %v", err) } if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(p.Root, appName)); err != nil { return fmt.Errorf("failed to link socket want: %v", err) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) } opts = append(opts, unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(appName))) opts = append(opts, unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(appName))) file, err := os.OpenFile(ServiceUnitPath(p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return fmt.Errorf("failed to create service unit file: %v", err) } defer file.Close() if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { return fmt.Errorf("failed to write service unit file: %v", err) } if err = os.Symlink(path.Join("..", ServiceUnitName(appName)), ServiceWantPath(p.Root, appName)); err != nil { return fmt.Errorf("failed to link service want: %v", err) } if flavor == "kvm" { // bind mount all shared volumes from /mnt/volumeName (we don't use mechanism for bind-mounting given by nspawn) err := AppToSystemdMountUnits(common.Stage1RootfsPath(p.Root), appName, p.Manifest.Volumes, ra, UnitsDir) if err != nil { return fmt.Errorf("failed to prepare mount units: %v", err) } } if err = writeAppReaper(p, appName.String()); err != nil { return fmt.Errorf("Failed to write app %q reaper service: %v\n", appName, err) } return nil }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func appToNspawnArgs(p *stage1commontypes.Pod, ra *schema.RuntimeApp, insecureOptions Stage1InsecureOptions) ([]string, error) { var args []string appName := ra.Name app := ra.App sharedVolPath := common.SharedVolumesPath(p.Root) if err := os.MkdirAll(sharedVolPath, SharedVolPerm); err != nil { return nil, errwrap.Wrap(errors.New("could not create shared volumes directory"), err) } if err := os.Chmod(sharedVolPath, SharedVolPerm); err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not change permissions of %q", sharedVolPath), err) } vols := make(map[types.ACName]types.Volume) for _, v := range p.Manifest.Volumes { vols[v.Name] = v } imageManifest := p.Images[appName.String()] mounts, err := GenerateMounts(ra, p.Manifest.Volumes, ConvertedFromDocker(imageManifest)) if err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not generate app %q mounts", appName), err) } for _, m := range mounts { shPath := filepath.Join(sharedVolPath, m.Volume.Name.String()) absRoot, err := filepath.Abs(p.Root) // Absolute path to the pod's rootfs. if err != nil { return nil, errwrap.Wrap(errors.New("could not get pod's root absolute path"), err) } appRootfs := common.AppRootfsPath(absRoot, appName) // TODO(yifan): This is a temporary fix for systemd-nspawn not handling symlink mounts well. // Could be removed when https://github.com/systemd/systemd/issues/2860 is resolved, and systemd // version is bumped. mntPath, err := EvaluateSymlinksInsideApp(appRootfs, m.Mount.Path) if err != nil { return nil, errwrap.Wrap(fmt.Errorf("could not evaluate path %v", m.Mount.Path), err) } mntAbsPath := filepath.Join(appRootfs, mntPath) if err := PrepareMountpoints(shPath, mntAbsPath, &m.Volume, m.DockerImplicit); err != nil { return nil, err } opt := make([]string, 6) if m.ReadOnly { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } switch m.Volume.Kind { case "host": opt[1] = m.Volume.Source case "empty": opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), m.Volume.Name.String()) default: return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty"`, m.Volume.Kind) } opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), mntPath) opt[4] = ":" // If Recursive is not set, default to recursive. recursive := true if m.Volume.Recursive != nil { recursive = *m.Volume.Recursive } // rbind/norbind options exist since systemd-nspawn v226 if recursive { opt[5] = "rbind" } else { opt[5] = "norbind" } args = append(args, strings.Join(opt, "")) } if !insecureOptions.DisableCapabilities { capabilitiesStr, err := getAppCapabilities(app.Isolators) if err != nil { return nil, err } capList := strings.Join(capabilitiesStr, ",") args = append(args, "--capability="+capList) } return args, nil }
// AppToSystemdMountUnits prepare bind mount unit for empty or host kind mounting // between stage1 rootfs and chrooted filesystem for application func AppToSystemdMountUnits(root string, appName types.ACName, volumes []types.Volume, ra *schema.RuntimeApp, unitsDir string) error { app := ra.App vols := make(map[types.ACName]types.Volume) for _, v := range volumes { vols[v.Name] = v } mounts := GenerateMounts(ra, vols) for _, m := range mounts { vol := vols[m.Volume] // source relative to stage1 rootfs to relative pod root whatPath := filepath.Join(stage1MntDir, vol.Name.String()) whatFullPath := filepath.Join(root, whatPath) if vol.Kind == "empty" { log.Printf("creating an empty volume folder for sharing: %q", whatFullPath) err := os.MkdirAll(whatFullPath, 0700) if err != nil { return err } } // destination relative to stage1 rootfs and relative to pod root wherePath := filepath.Join(common.RelAppRootfsPath(appName), m.Path) whereFullPath := filepath.Join(root, wherePath) // assertion to make sure that "what" exists (created earlier by PodToSystemdHostMountUnits) log.Printf("checking required source path: %q", whatFullPath) if _, err := os.Stat(whatFullPath); os.IsNotExist(err) { return fmt.Errorf("bug: missing source for volume %v", vol.Name) } // optionally prepare app directory log.Printf("optionally preparing destination path: %q", whereFullPath) err := os.MkdirAll(whereFullPath, 0700) if err != nil { return errwrap.Wrap(fmt.Errorf("failed to prepare dir for mount %v", m.Volume), err) } // install new mount unit for bind mount /mnt/volumeName -> /opt/stage2/{app-id}/rootfs/{{mountPoint.Path}} mu, err := installNewMountUnit( root, // where put a mount unit whatPath, // what - stage1 rootfs /mnt/VolumeName wherePath, // where - inside chroot app filesystem "bind", // fstype "bind", // options serviceUnitName(appName), unitsDir, ) if err != nil { return errwrap.Wrap(fmt.Errorf("cannot install new mount unit for app %q", appName.String()), err) } // TODO(iaguis) when we update util-linux to 2.27, this code can go // away and we can bind-mount RO with one unit file. // http://ftp.kernel.org/pub/linux/utils/util-linux/v2.27/v2.27-ReleaseNotes if IsMountReadOnly(vol, app.MountPoints) { opts := []*unit.UnitOption{ unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Remount read-only unit for %s", wherePath)), unit.NewUnitOption("Unit", "DefaultDependencies", "false"), unit.NewUnitOption("Unit", "After", mu), unit.NewUnitOption("Unit", "Wants", mu), unit.NewUnitOption("Service", "ExecStart", fmt.Sprintf("/usr/bin/mount -o remount,ro %s", wherePath)), unit.NewUnitOption("Install", "RequiredBy", mu), } remountUnitPath := filepath.Join(root, unitsDir, unit.UnitNamePathEscape(wherePath+"-remount.service")) if err := writeUnit(opts, remountUnitPath); err != nil { return err } } } return nil }
// appToNspawnArgs transforms the given app manifest, with the given associated // app name, into a subset of applicable systemd-nspawn argument func (p *Pod) appToNspawnArgs(ra *schema.RuntimeApp) ([]string, error) { var args []string appName := ra.Name id := ra.Image.ID app := ra.App vols := make(map[types.ACName]types.Volume) mounts := make(map[string]schema.Mount) for _, m := range ra.Mounts { mounts[m.Path] = m } sharedVolPath := common.SharedVolumesPath(p.Root) if err := os.MkdirAll(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volumes directory: %v", err) } if err := os.Chmod(sharedVolPath, sharedVolPerm); err != nil { return nil, fmt.Errorf("could not change permissions of %q: %v", sharedVolPath, err) } // Here we bind the volumes to the mountpoints via runtime mounts (--mount) for _, v := range p.Manifest.Volumes { vols[v.Name] = v if v.Kind == "empty" { if err := os.MkdirAll(filepath.Join(sharedVolPath, v.Name.String()), sharedVolPerm); err != nil { return nil, fmt.Errorf("could not create shared volume %q: %v", v.Name, err) } } } for _, mp := range app.MountPoints { // there's already an injected mount for this target path, skip if _, ok := mounts[mp.Path]; ok { continue } vol, ok := vols[mp.Name] if !ok { catCmd := fmt.Sprintf("sudo rkt image cat-manifest --pretty-print %v", id) volumeCmd := "" for _, mp := range app.MountPoints { volumeCmd += fmt.Sprintf("--volume %s,kind=host,source=/some/path ", mp.Name) } return nil, fmt.Errorf("no volume for mountpoint %q:%q in app %q.\n"+ "You can inspect the volumes with:\n\t%v\n"+ "App %q requires the following volumes:\n\t%v", mp.Name, mp.Path, appName, catCmd, appName, volumeCmd) } ra.Mounts = append(ra.Mounts, schema.Mount{Volume: vol.Name, Path: mp.Path}) } for _, m := range ra.Mounts { vol := vols[m.Volume] opt := make([]string, 4) // If the readonly flag in the pod manifest is not nil, // then use it to override the readonly flag in the image manifest. readOnly := isMPReadOnly(app.MountPoints, vol.Name) if vol.ReadOnly != nil { readOnly = *vol.ReadOnly } if readOnly { opt[0] = "--bind-ro=" } else { opt[0] = "--bind=" } switch vol.Kind { case "host": opt[1] = vol.Source case "empty": absRoot, err := filepath.Abs(p.Root) if err != nil { return nil, fmt.Errorf("cannot get pod's root absolute path: %v\n", err) } opt[1] = filepath.Join(common.SharedVolumesPath(absRoot), vol.Name.String()) default: return nil, fmt.Errorf(`invalid volume kind %q. Must be one of "host" or "empty".`, vol.Kind) } opt[2] = ":" opt[3] = filepath.Join(common.RelAppRootfsPath(appName), m.Path) args = append(args, strings.Join(opt, "")) } for _, i := range app.Isolators { switch v := i.Value().(type) { case types.LinuxCapabilitiesSet: var caps []string // TODO: cleanup the API on LinuxCapabilitiesSet to give strings easily. for _, c := range v.Set() { caps = append(caps, string(c)) } if i.Name == types.LinuxCapabilitiesRetainSetName { capList := strings.Join(caps, ",") args = append(args, "--capability="+capList) } } } return args, nil }