func runDaemon(args *docopt.Args) {
	hostname, _ := os.Hostname()
	externalIP := args.String["--external-ip"]
	stateFile := args.String["--state"]
	hostID := args.String["--id"]
	force := args.Bool["--force"]
	volPath := args.String["--volpath"]
	backendName := args.String["--backend"]
	flynnInit := args.String["--flynn-init"]
	nsumount := args.String["--nsumount"]
	logDir := args.String["--log-dir"]
	discoveryToken := args.String["--discovery"]

	var peerIPs []string
	if args.String["--peer-ips"] != "" {
		peerIPs = strings.Split(args.String["--peer-ips"], ",")
	}

	grohl.AddContext("app", "host")
	grohl.Log(grohl.Data{"at": "start"})
	g := grohl.NewContext(grohl.Data{"fn": "main"})

	if hostID == "" {
		hostID = strings.Replace(hostname, "-", "", -1)
	}
	if strings.Contains(hostID, "-") {
		shutdown.Fatal("host id must not contain dashes")
	}

	if externalIP == "" {
		var err error
		externalIP, err = config.DefaultExternalIP()
		if err != nil {
			shutdown.Fatal(err)
		}
	}

	publishAddr := net.JoinHostPort(externalIP, "1113")
	if discoveryToken != "" {
		// TODO: retry
		discoveryID, err := discovery.RegisterInstance(discovery.Info{
			ClusterURL:  discoveryToken,
			InstanceURL: "http://" + publishAddr,
			Name:        hostID,
		})
		if err != nil {
			g.Log(grohl.Data{"at": "register_discovery", "status": "error", "err": err.Error()})
			shutdown.Fatal(err)
		}
		g.Log(grohl.Data{"at": "register_discovery", "id": discoveryID})
	}

	state := NewState(hostID, stateFile)
	var backend Backend
	var err error

	// create volume manager
	vman, err := volumemanager.New(
		filepath.Join(volPath, "volumes.bolt"),
		func() (volume.Provider, error) {
			// use a zpool backing file size of either 70% of the device on which
			// volumes will reside, or 100GB if that can't be determined.
			var size int64
			var dev syscall.Statfs_t
			if err := syscall.Statfs(volPath, &dev); err == nil {
				size = (dev.Bsize * int64(dev.Blocks) * 7) / 10
			} else {
				size = 100000000000
			}
			g.Log(grohl.Data{"at": "zpool_size", "size": size})

			return zfsVolume.NewProvider(&zfsVolume.ProviderConfig{
				DatasetName: "flynn-default",
				Make: &zfsVolume.MakeDev{
					BackingFilename: filepath.Join(volPath, "zfs/vdev/flynn-default-zpool.vdev"),
					Size:            size,
				},
				WorkingDir: filepath.Join(volPath, "zfs"),
			})
		},
	)
	if err != nil {
		shutdown.Fatal(err)
	}

	mux := logmux.New(1000)
	shutdown.BeforeExit(func() { mux.Close() })

	switch backendName {
	case "libvirt-lxc":
		backend, err = NewLibvirtLXCBackend(state, vman, logDir, flynnInit, nsumount, mux)
	default:
		log.Fatalf("unknown backend %q", backendName)
	}
	if err != nil {
		shutdown.Fatal(err)
	}
	backend.SetDefaultEnv("EXTERNAL_IP", externalIP)

	discoverdManager := NewDiscoverdManager(backend, mux, hostID, publishAddr)
	publishURL := "http://" + publishAddr
	host := &Host{
		id:      hostID,
		url:     publishURL,
		state:   state,
		backend: backend,
		status:  &host.HostStatus{ID: hostID, URL: publishURL},
	}

	// stopJobs stops all jobs, leaving discoverd until the end so other
	// jobs can unregister themselves on shutdown.
	stopJobs := func() (err error) {
		var except []string
		host.statusMtx.RLock()
		if host.status.Discoverd != nil && host.status.Discoverd.JobID != "" {
			except = []string{host.status.Discoverd.JobID}
		}
		host.statusMtx.RUnlock()
		if err := backend.Cleanup(except); err != nil {
			return err
		}
		for _, id := range except {
			if e := backend.Stop(id); e != nil {
				err = e
			}
		}
		return
	}

	resurrect, err := state.Restore(backend)
	if err != nil {
		shutdown.Fatal(err)
	}
	shutdown.BeforeExit(func() {
		// close discoverd before stopping jobs so we can unregister first
		discoverdManager.Close()
		stopJobs()
	})
	shutdown.BeforeExit(func() {
		if err := state.MarkForResurrection(); err != nil {
			log.Print("error marking for resurrection", err)
		}
	})

	if err := serveHTTP(
		host,
		&attachHandler{state: state, backend: backend},
		cluster.NewClient(),
		vman,
		discoverdManager.ConnectLocal,
	); err != nil {
		shutdown.Fatal(err)
	}

	if force {
		if err := stopJobs(); err != nil {
			shutdown.Fatal(err)
		}
	}

	if discoveryToken != "" {
		instances, err := discovery.GetCluster(discoveryToken)
		if err != nil {
			// TODO(titanous): retry?
			shutdown.Fatal(err)
		}
		peerIPs = make([]string, 0, len(instances))
		for _, inst := range instances {
			u, err := url.Parse(inst.URL)
			if err != nil {
				continue
			}
			ip, _, err := net.SplitHostPort(u.Host)
			if err != nil || ip == externalIP {
				continue
			}
			peerIPs = append(peerIPs, ip)
		}
	}

	if err := discoverdManager.ConnectPeer(peerIPs); err != nil {
		// No peers have working discoverd, so resurrect any available jobs
		resurrect()
	}

	<-make(chan struct{})
}
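
// An illustrative invocation of the daemon above (a sketch only: the binary
// name, subcommand, and example paths are assumptions and are not taken from
// this file; the flags are the ones read from docopt in runDaemon):
//
//   flynn-host daemon \
//     --id host0 \
//     --external-ip 10.0.0.1 \
//     --state /var/lib/flynn/host-state.bolt \
//     --volpath /var/lib/flynn/volumes \
//     --backend libvirt-lxc \
//     --flynn-init /usr/local/bin/flynn-init \
//     --log-dir /var/log/flynn
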
func runDaemon(args *docopt.Args) {
	hostname, _ := os.Hostname()
	httpPort := args.String["--http-port"]
	externalIP := args.String["--external-ip"]
	listenIP := args.String["--listen-ip"]
	stateFile := args.String["--state"]
	hostID := args.String["--id"]
	tags := parseTagArgs(args.String["--tags"])
	force := args.Bool["--force"]
	volPath := args.String["--volpath"]
	volProvider := args.String["--vol-provider"]
	backendName := args.String["--backend"]
	flynnInit := args.String["--flynn-init"]
	logDir := args.String["--log-dir"]
	discoveryToken := args.String["--discovery"]
	bridgeName := args.String["--bridge-name"]

	logger, err := setupLogger(logDir)
	if err != nil {
		shutdown.Fatalf("error setting up logger: %s", err)
	}

	var peerIPs []string
	if args.String["--peer-ips"] != "" {
		peerIPs = strings.Split(args.String["--peer-ips"], ",")
	}

	if hostID == "" {
		hostID = strings.Replace(hostname, "-", "", -1)
	}

	var maxJobConcurrency uint64 = 4
	if m, err := strconv.ParseUint(args.String["--max-job-concurrency"], 10, 64); err == nil {
		maxJobConcurrency = m
	}

	var partitionCGroups = make(map[string]int64) // name -> cpu shares
	for _, p := range strings.Split(args.String["--partitions"], " ") {
		nameShares := strings.Split(p, "=cpu_shares:")
		if len(nameShares) != 2 {
			shutdown.Fatalf("invalid partition specifier: %q", p)
		}
		shares, err := strconv.ParseInt(nameShares[1], 10, 64)
		if err != nil || shares < 2 {
			shutdown.Fatalf("invalid cpu shares specifier: %q", nameShares[1])
		}
		partitionCGroups[nameShares[0]] = shares
	}
	for _, s := range []string{"user", "system", "background"} {
		if _, ok := partitionCGroups[s]; !ok {
			shutdown.Fatalf("missing mandatory resource partition: %s", s)
		}
	}

	log := logger.New("fn", "runDaemon", "host.id", hostID)
	log.Info("starting daemon")

	log.Info("validating host ID")
	if strings.Contains(hostID, "-") {
		shutdown.Fatal("host id must not contain dashes")
	}

	if externalIP == "" {
		log.Info("detecting external IP")
		var err error
		externalIP, err = config.DefaultExternalIP()
		if err != nil {
			log.Error("error detecting external IP", "err", err)
			shutdown.Fatal(err)
		}
		log.Info("using external IP " + externalIP)
	}

	publishAddr := net.JoinHostPort(externalIP, httpPort)
	if discoveryToken != "" {
		// TODO: retry
		log.Info("registering with cluster discovery service", "token", discoveryToken, "addr", publishAddr, "name", hostID)
		discoveryID, err := discovery.RegisterInstance(discovery.Info{
			ClusterURL:  discoveryToken,
			InstanceURL: "http://" + publishAddr,
			Name:        hostID,
		})
		if err != nil {
			log.Error("error registering with cluster discovery service", "err", err)
			shutdown.Fatal(err)
		}
		log.Info("registered with cluster discovery service", "id", discoveryID)
	}

	state := NewState(hostID, stateFile)
	shutdown.BeforeExit(func() { state.CloseDB() })

	log.Info("initializing volume manager", "provider", volProvider)
	var newVolProvider func() (volume.Provider, error)
	switch volProvider {
	case "zfs":
		newVolProvider = func() (volume.Provider, error) {
			// use a zpool backing file size of either 70% of the device on which
			// volumes will reside, or 100GB if that can't be determined.
log.Info("determining ZFS zpool size") var size int64 var dev syscall.Statfs_t if err := syscall.Statfs(volPath, &dev); err == nil { size = (dev.Bsize * int64(dev.Blocks) * 7) / 10 } else { size = 100000000000 } log.Info(fmt.Sprintf("using ZFS zpool size %d", size)) return zfsVolume.NewProvider(&zfsVolume.ProviderConfig{ DatasetName: "flynn-default", Make: &zfsVolume.MakeDev{ BackingFilename: filepath.Join(volPath, "zfs/vdev/flynn-default-zpool.vdev"), Size: size, }, WorkingDir: filepath.Join(volPath, "zfs"), }) } case "mock": newVolProvider = func() (volume.Provider, error) { return nil, nil } default: shutdown.Fatalf("unknown volume provider: %q", volProvider) } vman := volumemanager.New( filepath.Join(volPath, "volumes.bolt"), newVolProvider, ) shutdown.BeforeExit(func() { vman.CloseDB() }) mux := logmux.New(hostID, logDir, logger.New("host.id", hostID, "component", "logmux")) log.Info("initializing job backend", "type", backendName) var backend Backend switch backendName { case "libcontainer": backend, err = NewLibcontainerBackend(state, vman, bridgeName, flynnInit, mux, partitionCGroups, logger.New("host.id", hostID, "component", "backend", "backend", "libcontainer")) case "mock": backend = MockBackend{} default: shutdown.Fatalf("unknown backend %q", backendName) } if err != nil { shutdown.Fatal(err) } backend.SetDefaultEnv("EXTERNAL_IP", externalIP) backend.SetDefaultEnv("LISTEN_IP", listenIP) var buffers host.LogBuffers discoverdManager := NewDiscoverdManager(backend, mux, hostID, publishAddr, tags) publishURL := "http://" + publishAddr host := &Host{ id: hostID, url: publishURL, status: &host.HostStatus{ ID: hostID, PID: os.Getpid(), URL: publishURL, Tags: tags, Version: version.String(), }, state: state, backend: backend, vman: vman, discMan: discoverdManager, log: logger.New("host.id", hostID), maxJobConcurrency: maxJobConcurrency, } backend.SetHost(host) // restore the host status if set in the environment if statusEnv := os.Getenv("FLYNN_HOST_STATUS"); statusEnv != "" { log.Info("restoring host status from parent") if err := json.Unmarshal([]byte(statusEnv), &host.status); err != nil { log.Error("error restoring host status from parent", "err", err) shutdown.Fatal(err) } pid := os.Getpid() log.Info("setting status PID", "pid", pid) host.status.PID = pid // keep the same tags as the parent discoverdManager.UpdateTags(host.status.Tags) } log.Info("creating HTTP listener") l, err := newHTTPListener(net.JoinHostPort(listenIP, httpPort)) if err != nil { log.Error("error creating HTTP listener", "err", err) shutdown.Fatal(err) } host.listener = l shutdown.BeforeExit(func() { host.Close() }) // if we have a control socket FD, wait for a "resume" message before // opening state DBs and serving requests. 
	var controlFD int
	if fdEnv := os.Getenv("FLYNN_CONTROL_FD"); fdEnv != "" {
		log.Info("parsing control socket file descriptor")
		controlFD, err = strconv.Atoi(fdEnv)
		if err != nil {
			log.Error("error parsing control socket file descriptor", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("waiting for resume message from parent")
		msg := make([]byte, len(ControlMsgResume))
		if _, err := syscall.Read(controlFD, msg); err != nil {
			log.Error("error waiting for resume message from parent", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("validating resume message")
		if !bytes.Equal(msg, ControlMsgResume) {
			log.Error(fmt.Sprintf("unexpected resume message from parent: %v", msg))
			shutdown.ExitWithCode(1)
		}

		log.Info("receiving log buffers from parent")
		if err := json.NewDecoder(&controlSock{controlFD}).Decode(&buffers); err != nil {
			log.Error("error receiving log buffers from parent", "err", err)
			shutdown.Fatal(err)
		}
	}

	log.Info("opening state databases")
	if err := host.OpenDBs(); err != nil {
		log.Error("error opening state databases", "err", err)
		shutdown.Fatal(err)
	}

	// stopJobs stops all jobs, leaving discoverd until the end so other
	// jobs can unregister themselves on shutdown.
	stopJobs := func() (err error) {
		var except []string
		host.statusMtx.RLock()
		if host.status.Discoverd != nil && host.status.Discoverd.JobID != "" {
			except = []string{host.status.Discoverd.JobID}
		}
		host.statusMtx.RUnlock()
		log.Info("stopping all jobs except discoverd")
		if err := backend.Cleanup(except); err != nil {
			log.Error("error stopping all jobs except discoverd", "err", err)
			return err
		}
		for _, id := range except {
			log.Info("stopping discoverd")
			if e := backend.Stop(id); e != nil {
				log.Error("error stopping discoverd", "err", e)
				err = e
			}
		}
		return
	}

	log.Info("restoring state")
	resurrect, err := state.Restore(backend, buffers)
	if err != nil {
		log.Error("error restoring state", "err", err)
		shutdown.Fatal(err)
	}

	shutdown.BeforeExit(func() {
		// close discoverd before stopping jobs so we can unregister first
		log.Info("unregistering with service discovery")
		if err := discoverdManager.Close(); err != nil {
			log.Error("error unregistering with service discovery", "err", err)
		}
		stopJobs()
	})

	log.Info("serving HTTP requests")
	host.ServeHTTP()

	if controlFD > 0 {
		// now that we are serving requests, send an "ok" message to the parent
		log.Info("sending ok message to parent")
		if _, err := syscall.Write(controlFD, ControlMsgOK); err != nil {
			log.Error("error sending ok message to parent", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("closing control socket")
		if err := syscall.Close(controlFD); err != nil {
			log.Error("error closing control socket", "err", err)
		}
	}

	if force {
		log.Info("forcibly stopping existing jobs")
		if err := stopJobs(); err != nil {
			log.Error("error forcibly stopping existing jobs", "err", err)
			shutdown.Fatal(err)
		}
	}

	if discoveryToken != "" {
		log.Info("getting cluster peer IPs")
		instances, err := discovery.GetCluster(discoveryToken)
		if err != nil {
			// TODO(titanous): retry?
log.Error("error getting discovery cluster", "err", err) shutdown.Fatal(err) } peerIPs = make([]string, 0, len(instances)) for _, inst := range instances { u, err := url.Parse(inst.URL) if err != nil { continue } ip, _, err := net.SplitHostPort(u.Host) if err != nil || ip == externalIP { continue } peerIPs = append(peerIPs, ip) } log.Info("got cluster peer IPs", "peers", peerIPs) } log.Info("connecting to cluster peers") if err := discoverdManager.ConnectPeer(peerIPs); err != nil { log.Info("no cluster peers available") } if !args.Bool["--no-resurrect"] { log.Info("resurrecting jobs") resurrect() } monitor := NewMonitor(host.discMan, externalIP, logger) shutdown.BeforeExit(func() { monitor.Shutdown() }) go monitor.Run() log.Info("blocking main goroutine") <-make(chan struct{}) }