Example #1
0
func checkOnlineHosts(expected int, state *State, urls []string, timeout time.Duration) error {
	if len(urls) == 0 {
		urls = []string{"http://127.0.0.1:1113"}
	}
	t := time.After(timeout)
	for {
		if state.ClusterURL != "" {
			instances, err := discovery.GetCluster(state.ClusterURL)
			if err != nil {
				return fmt.Errorf("error discovering cluster: %s", err)
			}
			urls = make([]string, len(instances))
			for i, inst := range instances {
				urls[i] = inst.URL
			}
		}

		known := len(urls)
		remaining := make(map[string]struct{}, known)
		online := 0
		if known >= expected {
			for _, url := range urls {
				remaining[url] = struct{}{}
			}
			state.Hosts = make([]*cluster.Host, 0, known)
			for _, url := range urls {
				h := cluster.NewHost("", url, nil, nil)
				status, err := h.GetStatus()
				if err != nil {
					continue
				}
				delete(remaining, url)
				online++
				state.Hosts = append(state.Hosts, cluster.NewHost(status.ID, status.URL, nil, nil))
			}
			if online >= expected {
				break
			}
		}

		select {
		case <-t:
			msg := fmt.Sprintf("timed out waiting for %d hosts to come online (currently %d online)\n\n", expected, online)
			msg += "The following hosts were discovered but remained unreachable:\n"
			for url := range remaining {
				msg += "\n" + url + "\n"
			}
			msg += "\n"
			return fmt.Errorf(msg)
		default:
			time.Sleep(time.Second)
		}
	}
	return nil
}
Example #2
0
func (s *HostSuite) TestUpdateTags(t *c.C) {
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()

	nextEvent := func() *discoverd.Event {
		select {
		case e, ok := <-events:
			if !ok {
				t.Fatal("unexpected close of discoverd stream")
			}
			return e
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for discoverd event")
		}
		return nil
	}

	var client *cluster.Host
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUp && client == nil {
			client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil)
		}
		if e.Kind == discoverd.EventKindCurrent {
			break
		}
	}
	if client == nil {
		t.Fatal("did not initialize flynn-host client")
	}

	t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil)

	var meta map[string]string
	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	t.Assert(meta["tag:foo"], c.Equals, "bar")

	// setting to empty string should delete the tag
	t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil)

	for {
		e := nextEvent()
		if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() {
			meta = e.Instance.Meta
			break
		}
	}
	if _, ok := meta["tag:foo"]; ok {
		t.Fatal("expected tag to be deleted but is still present")
	}
}
Example #3
0
func checkOnlineHosts(expected int, state *State, urls []string, timeoutSecs int) error {
	if len(urls) == 0 {
		urls = []string{"http://127.0.0.1:1113"}
	}
	timeout := time.After(time.Duration(timeoutSecs) * time.Second)
	for {
		if state.ClusterURL != "" {
			instances, err := discovery.GetCluster(state.ClusterURL)
			if err != nil {
				return fmt.Errorf("error discovering cluster: %s", err)
			}
			urls = make([]string, len(instances))
			for i, inst := range instances {
				urls[i] = inst.URL
			}
		}

		known := len(urls)
		online := 0
		if known >= expected {
			state.Hosts = make([]*cluster.Host, 0, known)
			for _, url := range urls {
				h := cluster.NewHost("", url, nil)
				status, err := h.GetStatus()
				if err != nil {
					continue
				}
				online++
				state.Hosts = append(state.Hosts, cluster.NewHost(status.ID, status.URL, nil))
			}
			if online >= expected {
				break
			}
		}

		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for %d hosts to come online (currently %d online)", expected, online)
		default:
			time.Sleep(time.Second)
		}
	}
	return nil
}
Example #4
0
func main() {
	const waitMax = time.Minute
	const waitInterval = 500 * time.Millisecond
	h := cluster.NewHost("", "http://127.0.0.1:1113", nil)

	timeout := time.After(waitMax)
	var status *host.HostStatus
	for {
		var err error
		status, err = h.GetStatus()
		if err == nil && status.Network != nil && status.Network.Subnet != "" {
			break
		}
		select {
		case <-timeout:
			if err == nil {
				err = errors.New("network didn't come up")
			}
			log.Fatal("timed out getting host status: ", err)
		default:
			time.Sleep(waitInterval)
		}
	}

	discoverd, err := exec.LookPath("discoverd")
	if err != nil {
		log.Fatal(err)
	}
	ip, _, err := net.ParseCIDR(status.Network.Subnet)
	if err != nil {
		log.Fatal(err)
	}

	if err := syscall.Exec(discoverd,
		[]string{
			discoverd,
			"-http-addr=:" + os.Getenv("PORT_0"),
			fmt.Sprintf("-dns-addr=%s:53", ip),
			"-recursors=" + strings.Join(status.Network.Resolvers, ","),
			"-notify=http://127.0.0.1:1113/host/discoverd",
		},
		os.Environ(),
	); err != nil {
		log.Fatal(err)
	}
}
Example #5
0
func (s *HostUpdateSuite) TestUpdateLogs(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot boot new hosts")
	}

	instance := s.addHost(t)
	defer s.removeHost(t, instance)
	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(instance.ID, fmt.Sprintf("http://%s:1113", instance.IP), httpClient)

	// start partial logger job
	cmd := exec.JobUsingHost(
		client,
		exec.DockerImage(imageURIs["test-apps"]),
		&host.Job{
			Config: host.ContainerConfig{Cmd: []string{"/bin/partial-logger"}},
			Metadata: map[string]string{
				"flynn-controller.app": "partial-logger",
			},
		},
	)
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Kill()

	// wait for partial line
	_, err := s.discoverdClient(t).Instances("partial-logger", 10*time.Second)
	t.Assert(err, c.IsNil)

	// update flynn-host
	pid, err := client.Update("/usr/local/bin/flynn-host", "daemon", "--id", cmd.HostID)
	t.Assert(err, c.IsNil)
	// update the pid file so removeHost works
	t.Assert(instance.Run(fmt.Sprintf("echo -n %d | sudo tee /var/run/flynn-host.pid", pid), nil), c.IsNil)

	// finish logging
	t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGUSR1)), c.IsNil)

	// check we get a single log line
	logc, err := logaggc.New("")
	t.Assert(err, c.IsNil)
	log, err := logc.GetLog("partial-logger", &logaggc.LogOpts{Follow: true})
	t.Assert(err, c.IsNil)
	defer log.Close()
	msgs := make(chan *logaggc.Message)
	go func() {
		defer close(msgs)
		dec := json.NewDecoder(log)
		for {
			var msg logaggc.Message
			if err := dec.Decode(&msg); err != nil {
				debugf(t, "error decoding message: %s", err)
				return
			}
			msgs <- &msg
		}
	}()
	for {
		select {
		case msg, ok := <-msgs:
			if !ok {
				t.Fatal("error getting log")
			}
			if msg.Stream == "stdout" {
				t.Assert(msg.Msg, c.Equals, "hello world")
				return
			}
		case <-time.After(10 * time.Second):
			t.Fatal("timed out waiting for log")
		}
	}
}
Example #6
0
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error {
	f.c = c
	f.l = log15.New()
	var err error

	minHosts, err := strconv.Atoi(args.String["--min-hosts"])
	if err != nil || minHosts < 1 {
		return fmt.Errorf("invalid or missing --min-hosts value")
	}

	f.hosts, err = c.Hosts()
	if err != nil {
		f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err)
		var ips []string
		if ipList := args.String["--peer-ips"]; ipList != "" {
			ips = strings.Split(ipList, ",")
			if minHosts == 0 {
				minHosts = len(ips)
			}
		}
		if len(ips) == 0 {
			return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err)
		}
		if len(ips) < minHosts {
			return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts)
		}

		f.hosts = make([]*cluster.Host, len(ips))
		for i, ip := range ips {
			url := fmt.Sprintf("http://%s:1113", ip)
			status, err := cluster.NewHost("", url, nil, nil).GetStatus()
			if err != nil {
				return fmt.Errorf("error connecting to %s: %s", ip, err)
			}
			f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil)
		}
	}
	// check expected number of hosts
	if len(f.hosts) < minHosts {
		// TODO(titanous): be smarter about this
		return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts))
	}
	f.l.Info("found expected hosts", "n", len(f.hosts))

	if err := f.FixDiscoverd(); err != nil {
		return err
	}
	if err := f.FixFlannel(); err != nil {
		return err
	}

	f.l.Info("waiting for discoverd to be available")
	timeout := time.After(time.Minute)
	for {
		var err error
		if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil {
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for discoverd, last error: %s", err)
		}
	}

	f.l.Info("checking for running controller API")
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) > 0 {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
		if err := f.FixController(controllerInstances, false); err != nil {
			f.l.Error("error fixing controller", "err", err)
			// if unable to write correct formations, we need to kill the scheduler so that the rest of this works
			if err := f.KillSchedulers(); err != nil {
				return err
			}
		}
	}

	f.l.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb", "mongodb"} {
		f.l.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				f.l.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			f.l.Error("error checking database state", "db", db)
			return err
		}
		if err := f.FixSirenia(db); err != nil {
			return err
		}
	}

	f.l.Info("checking for running controller API")
	controllerInstances, _ = controllerService.Instances()
	if len(controllerInstances) == 0 {
		// kill schedulers to prevent interference
		if err := f.KillSchedulers(); err != nil {
			return err
		}
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	} else {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
	}

	if err := f.FixController(controllerInstances, true); err != nil {
		f.l.Error("error fixing controller", "err", err)
		return err
	}

	f.l.Info("cluster fix complete")

	return nil
}
Example #7
0
func (s *HostSuite) TestUpdate(t *c.C) {
	dir := t.MkDir()
	flynnHost := filepath.Join(dir, "flynn-host")
	run(t, osexec.Command("cp", args.FlynnHost, flynnHost))

	// start flynn-host
	id := random.String(8)
	var out bytes.Buffer
	cmd := osexec.Command(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
		"--log-dir", filepath.Join(dir, "logs"),
	)
	cmd.Stdout = &out
	cmd.Stderr = &out
	defer func() {
		debug(t, "*** flynn-host output ***")
		debug(t, out.String())
		debug(t, "*************************")
	}()
	t.Assert(cmd.Start(), c.IsNil)
	defer cmd.Process.Kill()

	httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}}
	client := cluster.NewHost(id, "http://127.0.0.1:11113", httpClient, nil)

	// exec a program which exits straight away
	_, err := client.Update("/bin/true")
	t.Assert(err, c.NotNil)
	status, err := client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec a program which reads the control socket but then exits
	_, err = client.Update("/bin/bash", "-c", "<&4; exit")
	t.Assert(err, c.NotNil)
	status, err = client.GetStatus()
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, cmd.Process.Pid)

	// exec flynn-host and check we get the status from the new daemon
	pid, err := client.Update(
		flynnHost,
		"daemon",
		"--http-port", "11113",
		"--state", filepath.Join(dir, "host-state.bolt"),
		"--id", id,
		"--backend", "mock",
		"--vol-provider", "mock",
		"--volpath", filepath.Join(dir, "volumes"),
		"--log-dir", filepath.Join(dir, "logs"),
	)
	t.Assert(err, c.IsNil)
	defer syscall.Kill(pid, syscall.SIGKILL)

	done := make(chan struct{})
	go func() {
		cmd.Process.Signal(syscall.SIGTERM)
		syscall.Wait4(cmd.Process.Pid, nil, 0, nil)
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(15 * time.Second):
		t.Fatal("timed out waiting for flynn-host daemon to exit")
	}

	// client.GetStatus intermittently returns io.EOF right after the update. We
	// don't currently understand why (likely due to the way the listener is
	// passed around), so for now just retry the request.
	//
	// TODO(lmars): figure out why and remove this loop.
	delay := 100 * time.Millisecond
	for start := time.Now(); time.Since(start) < 10*time.Second; time.Sleep(delay) {
		status, err = client.GetStatus()
		if e, ok := err.(*url.Error); ok && strings.Contains(e.Err.Error(), "EOF") {
			debugf(t, "got io.EOF from flynn-host, trying again in %s", delay)
			continue
		}
		break
	}
	t.Assert(err, c.IsNil)
	t.Assert(status.ID, c.Equals, id)
	t.Assert(status.PID, c.Equals, pid)
}