Exemplo n.º 1
0
func runDaemon(args *docopt.Args) {
	hostname, _ := os.Hostname()
	externalIP := args.String["--external-ip"]
	stateFile := args.String["--state"]
	hostID := args.String["--id"]
	force := args.Bool["--force"]
	volPath := args.String["--volpath"]
	backendName := args.String["--backend"]
	flynnInit := args.String["--flynn-init"]
	nsumount := args.String["--nsumount"]
	logDir := args.String["--log-dir"]
	discoveryToken := args.String["--discovery"]

	var peerIPs []string
	if args.String["--peer-ips"] != "" {
		peerIPs = strings.Split(args.String["--peer-ips"], ",")
	}

	grohl.AddContext("app", "host")
	grohl.Log(grohl.Data{"at": "start"})
	g := grohl.NewContext(grohl.Data{"fn": "main"})

	if hostID == "" {
		hostID = strings.Replace(hostname, "-", "", -1)
	}
	if strings.Contains(hostID, "-") {
		shutdown.Fatal("host id must not contain dashes")
	}
	if externalIP == "" {
		var err error
		externalIP, err = config.DefaultExternalIP()
		if err != nil {
			shutdown.Fatal(err)
		}
	}

	publishAddr := net.JoinHostPort(externalIP, "1113")
	if discoveryToken != "" {
		// TODO: retry
		discoveryID, err := discovery.RegisterInstance(discovery.Info{
			ClusterURL:  discoveryToken,
			InstanceURL: "http://" + publishAddr,
			Name:        hostID,
		})
		if err != nil {
			g.Log(grohl.Data{"at": "register_discovery", "status": "error", "err": err.Error()})
			shutdown.Fatal(err)
		}
		g.Log(grohl.Data{"at": "register_discovery", "id": discoveryID})
	}

	state := NewState(hostID, stateFile)
	var backend Backend
	var err error

	// create volume manager
	vman, err := volumemanager.New(
		filepath.Join(volPath, "volumes.bolt"),
		func() (volume.Provider, error) {
			// use a zpool backing file size of either 70% of the device on which
			// volumes will reside, or 100GB if that can't be determined.
			var size int64
			var dev syscall.Statfs_t
			if err := syscall.Statfs(volPath, &dev); err == nil {
				size = (dev.Bsize * int64(dev.Blocks) * 7) / 10
			} else {
				size = 100000000000
			}
			g.Log(grohl.Data{"at": "zpool_size", "size": size})

			return zfsVolume.NewProvider(&zfsVolume.ProviderConfig{
				DatasetName: "flynn-default",
				Make: &zfsVolume.MakeDev{
					BackingFilename: filepath.Join(volPath, "zfs/vdev/flynn-default-zpool.vdev"),
					Size:            size,
				},
				WorkingDir: filepath.Join(volPath, "zfs"),
			})
		},
	)
	if err != nil {
		shutdown.Fatal(err)
	}

	mux := logmux.New(1000)
	shutdown.BeforeExit(func() { mux.Close() })

	switch backendName {
	case "libvirt-lxc":
		backend, err = NewLibvirtLXCBackend(state, vman, logDir, flynnInit, nsumount, mux)
	default:
		log.Fatalf("unknown backend %q", backendName)
	}
	if err != nil {
		shutdown.Fatal(err)
	}
	backend.SetDefaultEnv("EXTERNAL_IP", externalIP)

	discoverdManager := NewDiscoverdManager(backend, mux, hostID, publishAddr)
	publishURL := "http://" + publishAddr
	host := &Host{
		id:      hostID,
		url:     publishURL,
		state:   state,
		backend: backend,
		status:  &host.HostStatus{ID: hostID, URL: publishURL},
	}

	// stopJobs stops all jobs, leaving discoverd until the end so other
	// jobs can unregister themselves on shutdown.
	stopJobs := func() (err error) {
		var except []string
		host.statusMtx.RLock()
		if host.status.Discoverd != nil && host.status.Discoverd.JobID != "" {
			except = []string{host.status.Discoverd.JobID}
		}
		host.statusMtx.RUnlock()
		if err := backend.Cleanup(except); err != nil {
			return err
		}
		for _, id := range except {
			if e := backend.Stop(id); e != nil {
				err = e
			}
		}
		return
	}

	resurrect, err := state.Restore(backend)
	if err != nil {
		shutdown.Fatal(err)
	}
	shutdown.BeforeExit(func() {
		// close discoverd before stopping jobs so we can unregister first
		discoverdManager.Close()
		stopJobs()
	})
	shutdown.BeforeExit(func() {
		if err := state.MarkForResurrection(); err != nil {
			log.Print("error marking for resurrection", err)
		}
	})

	if err := serveHTTP(
		host,
		&attachHandler{state: state, backend: backend},
		cluster.NewClient(),
		vman,
		discoverdManager.ConnectLocal,
	); err != nil {
		shutdown.Fatal(err)
	}

	if force {
		if err := stopJobs(); err != nil {
			shutdown.Fatal(err)
		}
	}

	if discoveryToken != "" {
		instances, err := discovery.GetCluster(discoveryToken)
		if err != nil {
			// TODO(titanous): retry?
			shutdown.Fatal(err)
		}
		peerIPs = make([]string, 0, len(instances))
		for _, inst := range instances {
			u, err := url.Parse(inst.URL)
			if err != nil {
				continue
			}
			ip, _, err := net.SplitHostPort(u.Host)
			if err != nil || ip == externalIP {
				continue
			}
			peerIPs = append(peerIPs, ip)
		}
	}
	if err := discoverdManager.ConnectPeer(peerIPs); err != nil {
		// No peers have working discoverd, so resurrect any available jobs
		resurrect()
	}

	<-make(chan struct{})
}
Exemplo n.º 2
0
func runDaemon(args *docopt.Args) {
	hostname, _ := os.Hostname()
	httpPort := args.String["--http-port"]
	externalIP := args.String["--external-ip"]
	listenIP := args.String["--listen-ip"]
	stateFile := args.String["--state"]
	hostID := args.String["--id"]
	tags := parseTagArgs(args.String["--tags"])
	force := args.Bool["--force"]
	volPath := args.String["--volpath"]
	volProvider := args.String["--vol-provider"]
	backendName := args.String["--backend"]
	flynnInit := args.String["--flynn-init"]
	logDir := args.String["--log-dir"]
	discoveryToken := args.String["--discovery"]
	bridgeName := args.String["--bridge-name"]

	logger, err := setupLogger(logDir)
	if err != nil {
		shutdown.Fatalf("error setting up logger: %s", err)
	}

	var peerIPs []string
	if args.String["--peer-ips"] != "" {
		peerIPs = strings.Split(args.String["--peer-ips"], ",")
	}

	if hostID == "" {
		hostID = strings.Replace(hostname, "-", "", -1)
	}

	var maxJobConcurrency uint64 = 4
	if m, err := strconv.ParseUint(args.String["--max-job-concurrency"], 10, 64); err == nil {
		maxJobConcurrency = m
	}

	var partitionCGroups = make(map[string]int64) // name -> cpu shares
	for _, p := range strings.Split(args.String["--partitions"], " ") {
		nameShares := strings.Split(p, "=cpu_shares:")
		if len(nameShares) != 2 {
			shutdown.Fatalf("invalid partition specifier: %q", p)
		}
		shares, err := strconv.ParseInt(nameShares[1], 10, 64)
		if err != nil || shares < 2 {
			shutdown.Fatalf("invalid cpu shares specifier: %q", shares)
		}
		partitionCGroups[nameShares[0]] = shares
	}
	for _, s := range []string{"user", "system", "background"} {
		if _, ok := partitionCGroups[s]; !ok {
			shutdown.Fatalf("missing mandatory resource partition: %s", s)
		}
	}

	log := logger.New("fn", "runDaemon", "host.id", hostID)
	log.Info("starting daemon")

	log.Info("validating host ID")
	if strings.Contains(hostID, "-") {
		shutdown.Fatal("host id must not contain dashes")
	}
	if externalIP == "" {
		log.Info("detecting external IP")
		var err error
		externalIP, err = config.DefaultExternalIP()
		if err != nil {
			log.Error("error detecting external IP", "err", err)
			shutdown.Fatal(err)
		}
		log.Info("using external IP " + externalIP)
	}

	publishAddr := net.JoinHostPort(externalIP, httpPort)
	if discoveryToken != "" {
		// TODO: retry
		log.Info("registering with cluster discovery service", "token", discoveryToken, "addr", publishAddr, "name", hostID)
		discoveryID, err := discovery.RegisterInstance(discovery.Info{
			ClusterURL:  discoveryToken,
			InstanceURL: "http://" + publishAddr,
			Name:        hostID,
		})
		if err != nil {
			log.Error("error registering with cluster discovery service", "err", err)
			shutdown.Fatal(err)
		}
		log.Info("registered with cluster discovery service", "id", discoveryID)
	}

	state := NewState(hostID, stateFile)
	shutdown.BeforeExit(func() { state.CloseDB() })

	log.Info("initializing volume manager", "provider", volProvider)
	var newVolProvider func() (volume.Provider, error)
	switch volProvider {
	case "zfs":
		newVolProvider = func() (volume.Provider, error) {
			// use a zpool backing file size of either 70% of the device on which
			// volumes will reside, or 100GB if that can't be determined.
			log.Info("determining ZFS zpool size")
			var size int64
			var dev syscall.Statfs_t
			if err := syscall.Statfs(volPath, &dev); err == nil {
				size = (dev.Bsize * int64(dev.Blocks) * 7) / 10
			} else {
				size = 100000000000
			}
			log.Info(fmt.Sprintf("using ZFS zpool size %d", size))

			return zfsVolume.NewProvider(&zfsVolume.ProviderConfig{
				DatasetName: "flynn-default",
				Make: &zfsVolume.MakeDev{
					BackingFilename: filepath.Join(volPath, "zfs/vdev/flynn-default-zpool.vdev"),
					Size:            size,
				},
				WorkingDir: filepath.Join(volPath, "zfs"),
			})
		}
	case "mock":
		newVolProvider = func() (volume.Provider, error) { return nil, nil }
	default:
		shutdown.Fatalf("unknown volume provider: %q", volProvider)
	}
	vman := volumemanager.New(
		filepath.Join(volPath, "volumes.bolt"),
		newVolProvider,
	)
	shutdown.BeforeExit(func() { vman.CloseDB() })

	mux := logmux.New(hostID, logDir, logger.New("host.id", hostID, "component", "logmux"))

	log.Info("initializing job backend", "type", backendName)
	var backend Backend
	switch backendName {
	case "libcontainer":
		backend, err = NewLibcontainerBackend(state, vman, bridgeName, flynnInit, mux, partitionCGroups, logger.New("host.id", hostID, "component", "backend", "backend", "libcontainer"))
	case "mock":
		backend = MockBackend{}
	default:
		shutdown.Fatalf("unknown backend %q", backendName)
	}
	if err != nil {
		shutdown.Fatal(err)
	}
	backend.SetDefaultEnv("EXTERNAL_IP", externalIP)
	backend.SetDefaultEnv("LISTEN_IP", listenIP)

	var buffers host.LogBuffers
	discoverdManager := NewDiscoverdManager(backend, mux, hostID, publishAddr, tags)
	publishURL := "http://" + publishAddr
	host := &Host{
		id:  hostID,
		url: publishURL,
		status: &host.HostStatus{
			ID:      hostID,
			PID:     os.Getpid(),
			URL:     publishURL,
			Tags:    tags,
			Version: version.String(),
		},
		state:   state,
		backend: backend,
		vman:    vman,
		discMan: discoverdManager,
		log:     logger.New("host.id", hostID),

		maxJobConcurrency: maxJobConcurrency,
	}
	backend.SetHost(host)

	// restore the host status if set in the environment
	if statusEnv := os.Getenv("FLYNN_HOST_STATUS"); statusEnv != "" {
		log.Info("restoring host status from parent")
		if err := json.Unmarshal([]byte(statusEnv), &host.status); err != nil {
			log.Error("error restoring host status from parent", "err", err)
			shutdown.Fatal(err)
		}
		pid := os.Getpid()
		log.Info("setting status PID", "pid", pid)
		host.status.PID = pid
		// keep the same tags as the parent
		discoverdManager.UpdateTags(host.status.Tags)
	}

	log.Info("creating HTTP listener")
	l, err := newHTTPListener(net.JoinHostPort(listenIP, httpPort))
	if err != nil {
		log.Error("error creating HTTP listener", "err", err)
		shutdown.Fatal(err)
	}
	host.listener = l
	shutdown.BeforeExit(func() { host.Close() })

	// if we have a control socket FD, wait for a "resume" message before
	// opening state DBs and serving requests.
	var controlFD int
	if fdEnv := os.Getenv("FLYNN_CONTROL_FD"); fdEnv != "" {
		log.Info("parsing control socket file descriptor")
		controlFD, err = strconv.Atoi(fdEnv)
		if err != nil {
			log.Error("error parsing control socket file descriptor", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("waiting for resume message from parent")
		msg := make([]byte, len(ControlMsgResume))
		if _, err := syscall.Read(controlFD, msg); err != nil {
			log.Error("error waiting for resume message from parent", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("validating resume message")
		if !bytes.Equal(msg, ControlMsgResume) {
			log.Error(fmt.Sprintf("unexpected resume message from parent: %v", msg))
			shutdown.ExitWithCode(1)
		}

		log.Info("receiving log buffers from parent")
		if err := json.NewDecoder(&controlSock{controlFD}).Decode(&buffers); err != nil {
			log.Error("error receiving log buffers from parent", "err", err)
			shutdown.Fatal(err)
		}
	}

	log.Info("opening state databases")
	if err := host.OpenDBs(); err != nil {
		log.Error("error opening state databases", "err", err)
		shutdown.Fatal(err)
	}

	// stopJobs stops all jobs, leaving discoverd until the end so other
	// jobs can unregister themselves on shutdown.
	stopJobs := func() (err error) {
		var except []string
		host.statusMtx.RLock()
		if host.status.Discoverd != nil && host.status.Discoverd.JobID != "" {
			except = []string{host.status.Discoverd.JobID}
		}
		host.statusMtx.RUnlock()
		log.Info("stopping all jobs except discoverd")
		if err := backend.Cleanup(except); err != nil {
			log.Error("error stopping all jobs except discoverd", "err", err)
			return err
		}
		for _, id := range except {
			log.Info("stopping discoverd")
			if e := backend.Stop(id); e != nil {
				log.Error("error stopping discoverd", "err", err)
				err = e
			}
		}
		return
	}

	log.Info("restoring state")
	resurrect, err := state.Restore(backend, buffers)
	if err != nil {
		log.Error("error restoring state", "err", err)
		shutdown.Fatal(err)
	}
	shutdown.BeforeExit(func() {
		// close discoverd before stopping jobs so we can unregister first
		log.Info("unregistering with service discovery")
		if err := discoverdManager.Close(); err != nil {
			log.Error("error unregistering with service discovery", "err", err)
		}
		stopJobs()
	})

	log.Info("serving HTTP requests")
	host.ServeHTTP()

	if controlFD > 0 {
		// now that we are serving requests, send an "ok" message to the parent
		log.Info("sending ok message to parent")
		if _, err := syscall.Write(controlFD, ControlMsgOK); err != nil {
			log.Error("error sending ok message to parent", "err", err)
			shutdown.Fatal(err)
		}

		log.Info("closing control socket")
		if err := syscall.Close(controlFD); err != nil {
			log.Error("error closing control socket", "err", err)
		}
	}

	if force {
		log.Info("forcibly stopping existing jobs")
		if err := stopJobs(); err != nil {
			log.Error("error forcibly stopping existing jobs", "err", err)
			shutdown.Fatal(err)
		}
	}

	if discoveryToken != "" {
		log.Info("getting cluster peer IPs")
		instances, err := discovery.GetCluster(discoveryToken)
		if err != nil {
			// TODO(titanous): retry?
			log.Error("error getting discovery cluster", "err", err)
			shutdown.Fatal(err)
		}
		peerIPs = make([]string, 0, len(instances))
		for _, inst := range instances {
			u, err := url.Parse(inst.URL)
			if err != nil {
				continue
			}
			ip, _, err := net.SplitHostPort(u.Host)
			if err != nil || ip == externalIP {
				continue
			}
			peerIPs = append(peerIPs, ip)
		}
		log.Info("got cluster peer IPs", "peers", peerIPs)
	}
	log.Info("connecting to cluster peers")
	if err := discoverdManager.ConnectPeer(peerIPs); err != nil {
		log.Info("no cluster peers available")
	}

	if !args.Bool["--no-resurrect"] {
		log.Info("resurrecting jobs")
		resurrect()
	}

	monitor := NewMonitor(host.discMan, externalIP, logger)
	shutdown.BeforeExit(func() { monitor.Shutdown() })
	go monitor.Run()

	log.Info("blocking main goroutine")
	<-make(chan struct{})
}