func checkOnlineHosts(expected int, state *State, urls []string, timeout time.Duration) error { if len(urls) == 0 { urls = []string{"http://127.0.0.1:1113"} } t := time.After(timeout) for { if state.ClusterURL != "" { instances, err := discovery.GetCluster(state.ClusterURL) if err != nil { return fmt.Errorf("error discovering cluster: %s", err) } urls = make([]string, len(instances)) for i, inst := range instances { urls[i] = inst.URL } } known := len(urls) remaining := make(map[string]struct{}, known) online := 0 if known >= expected { for _, url := range urls { remaining[url] = struct{}{} } state.Hosts = make([]*cluster.Host, 0, known) for _, url := range urls { h := cluster.NewHost("", url, nil, nil) status, err := h.GetStatus() if err != nil { continue } delete(remaining, url) online++ state.Hosts = append(state.Hosts, cluster.NewHost(status.ID, status.URL, nil, nil)) } if online >= expected { break } } select { case <-t: msg := fmt.Sprintf("timed out waiting for %d hosts to come online (currently %d online)\n\n", expected, online) msg += "The following hosts were discovered but remained unreachable:\n" for url := range remaining { msg += "\n" + url + "\n" } msg += "\n" return fmt.Errorf(msg) default: time.Sleep(time.Second) } } return nil }
func (s *HostSuite) TestUpdateTags(t *c.C) { events := make(chan *discoverd.Event) stream, err := s.discoverdClient(t).Service("flynn-host").Watch(events) t.Assert(err, c.IsNil) defer stream.Close() nextEvent := func() *discoverd.Event { select { case e, ok := <-events: if !ok { t.Fatal("unexpected close of discoverd stream") } return e case <-time.After(10 * time.Second): t.Fatal("timed out waiting for discoverd event") } return nil } var client *cluster.Host for { e := nextEvent() if e.Kind == discoverd.EventKindUp && client == nil { client = cluster.NewHost(e.Instance.Meta["id"], e.Instance.Addr, nil, nil) } if e.Kind == discoverd.EventKindCurrent { break } } if client == nil { t.Fatal("did not initialize flynn-host client") } t.Assert(client.UpdateTags(map[string]string{"foo": "bar"}), c.IsNil) var meta map[string]string for { e := nextEvent() if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() { meta = e.Instance.Meta break } } t.Assert(meta["tag:foo"], c.Equals, "bar") // setting to empty string should delete the tag t.Assert(client.UpdateTags(map[string]string{"foo": ""}), c.IsNil) for { e := nextEvent() if e.Kind == discoverd.EventKindUpdate && e.Instance.Meta["id"] == client.ID() { meta = e.Instance.Meta break } } if _, ok := meta["tag:foo"]; ok { t.Fatal("expected tag to be deleted but is still present") } }
func checkOnlineHosts(expected int, state *State, urls []string, timeoutSecs int) error { if len(urls) == 0 { urls = []string{"http://127.0.0.1:1113"} } timeout := time.After(time.Duration(timeoutSecs) * time.Second) for { if state.ClusterURL != "" { instances, err := discovery.GetCluster(state.ClusterURL) if err != nil { return fmt.Errorf("error discovering cluster: %s", err) } urls = make([]string, len(instances)) for i, inst := range instances { urls[i] = inst.URL } } known := len(urls) online := 0 if known >= expected { state.Hosts = make([]*cluster.Host, 0, known) for _, url := range urls { h := cluster.NewHost("", url, nil) status, err := h.GetStatus() if err != nil { continue } online++ state.Hosts = append(state.Hosts, cluster.NewHost(status.ID, status.URL, nil)) } if online >= expected { break } } select { case <-timeout: return fmt.Errorf("timed out waiting for %d hosts to come online (currently %d online)", expected, online) default: time.Sleep(time.Second) } } return nil }
func main() { const waitMax = time.Minute const waitInterval = 500 * time.Millisecond h := cluster.NewHost("", "http://127.0.0.1:1113", nil) timeout := time.After(waitMax) var status *host.HostStatus for { var err error status, err = h.GetStatus() if err == nil && status.Network != nil && status.Network.Subnet != "" { break } select { case <-timeout: if err == nil { err = errors.New("network didn't come up") } log.Fatal("timed out getting host status: ", err) default: time.Sleep(waitInterval) } } discoverd, err := exec.LookPath("discoverd") if err != nil { log.Fatal(err) } ip, _, err := net.ParseCIDR(status.Network.Subnet) if err != nil { log.Fatal(err) } if err := syscall.Exec(discoverd, []string{ discoverd, "-http-addr=:" + os.Getenv("PORT_0"), fmt.Sprintf("-dns-addr=%s:53", ip), "-recursors=" + strings.Join(status.Network.Resolvers, ","), "-notify=http://127.0.0.1:1113/host/discoverd", }, os.Environ(), ); err != nil { log.Fatal(err) } }
func (s *HostUpdateSuite) TestUpdateLogs(t *c.C) { if testCluster == nil { t.Skip("cannot boot new hosts") } instance := s.addHost(t) defer s.removeHost(t, instance) httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}} client := cluster.NewHost(instance.ID, fmt.Sprintf("http://%s:1113", instance.IP), httpClient) // start partial logger job cmd := exec.JobUsingHost( client, exec.DockerImage(imageURIs["test-apps"]), &host.Job{ Config: host.ContainerConfig{Cmd: []string{"/bin/partial-logger"}}, Metadata: map[string]string{ "flynn-controller.app": "partial-logger", }, }, ) t.Assert(cmd.Start(), c.IsNil) defer cmd.Kill() // wait for partial line _, err := s.discoverdClient(t).Instances("partial-logger", 10*time.Second) t.Assert(err, c.IsNil) // update flynn-host pid, err := client.Update("/usr/local/bin/flynn-host", "daemon", "--id", cmd.HostID) t.Assert(err, c.IsNil) // update the pid file so removeHost works t.Assert(instance.Run(fmt.Sprintf("echo -n %d | sudo tee /var/run/flynn-host.pid", pid), nil), c.IsNil) // finish logging t.Assert(client.SignalJob(cmd.Job.ID, int(syscall.SIGUSR1)), c.IsNil) // check we get a single log line logc, err := logaggc.New("") t.Assert(err, c.IsNil) log, err := logc.GetLog("partial-logger", &logaggc.LogOpts{Follow: true}) t.Assert(err, c.IsNil) defer log.Close() msgs := make(chan *logaggc.Message) go func() { defer close(msgs) dec := json.NewDecoder(log) for { var msg logaggc.Message if err := dec.Decode(&msg); err != nil { debugf(t, "error decoding message: %s", err) return } msgs <- &msg } }() for { select { case msg, ok := <-msgs: if !ok { t.Fatal("error getting log") } if msg.Stream == "stdout" { t.Assert(msg.Msg, c.Equals, "hello world") return } case <-time.After(10 * time.Second): t.Fatal("timed out waiting for log") } } }
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error { f.c = c f.l = log15.New() var err error minHosts, err := strconv.Atoi(args.String["--min-hosts"]) if err != nil || minHosts < 1 { return fmt.Errorf("invalid or missing --min-hosts value") } f.hosts, err = c.Hosts() if err != nil { f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err) var ips []string if ipList := args.String["--peer-ips"]; ipList != "" { ips = strings.Split(ipList, ",") if minHosts == 0 { minHosts = len(ips) } } if len(ips) == 0 { return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err) } if len(ips) < minHosts { return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts) } f.hosts = make([]*cluster.Host, len(ips)) for i, ip := range ips { url := fmt.Sprintf("http://%s:1113", ip) status, err := cluster.NewHost("", url, nil, nil).GetStatus() if err != nil { return fmt.Errorf("error connecting to %s: %s", ip, err) } f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil) } } // check expected number of hosts if len(f.hosts) < minHosts { // TODO(titanous): be smarter about this return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts)) } f.l.Info("found expected hosts", "n", len(f.hosts)) if err := f.FixDiscoverd(); err != nil { return err } if err := f.FixFlannel(); err != nil { return err } f.l.Info("waiting for discoverd to be available") timeout := time.After(time.Minute) for { var err error if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil { time.Sleep(100 * time.Millisecond) } else { break } select { case <-timeout: return fmt.Errorf("timed out waiting for discoverd, last error: %s", err) } } f.l.Info("checking for running controller API") controllerService := discoverd.NewService("controller") controllerInstances, _ := controllerService.Instances() if len(controllerInstances) > 0 { f.l.Info("found running controller API instances", "n", len(controllerInstances)) if err := f.FixController(controllerInstances, false); err != nil { f.l.Error("error fixing controller", "err", err) // if unable to write correct formations, we need to kill the scheduler so that the rest of this works if err := f.KillSchedulers(); err != nil { return err } } } f.l.Info("checking status of sirenia databases") for _, db := range []string{"postgres", "mariadb", "mongodb"} { f.l.Info("checking for database state", "db", db) if _, err := discoverd.NewService(db).GetMeta(); err != nil { if discoverd.IsNotFound(err) { f.l.Info("skipping recovery of db, no state in discoverd", "db", db) continue } f.l.Error("error checking database state", "db", db) return err } if err := f.FixSirenia(db); err != nil { return err } } f.l.Info("checking for running controller API") controllerInstances, _ = controllerService.Instances() if len(controllerInstances) == 0 { // kill schedulers to prevent interference if err := f.KillSchedulers(); err != nil { return err } controllerInstances, err = f.StartAppJob("controller", "web", "controller") if err != nil { return err } } else { f.l.Info("found running controller API instances", "n", len(controllerInstances)) } if err := f.FixController(controllerInstances, true); err != nil { f.l.Error("error fixing controller", "err", err) return err } f.l.Info("cluster fix complete") return nil }
func (s *HostSuite) TestUpdate(t *c.C) { dir := t.MkDir() flynnHost := filepath.Join(dir, "flynn-host") run(t, osexec.Command("cp", args.FlynnHost, flynnHost)) // start flynn-host id := random.String(8) var out bytes.Buffer cmd := osexec.Command( flynnHost, "daemon", "--http-port", "11113", "--state", filepath.Join(dir, "host-state.bolt"), "--id", id, "--backend", "mock", "--vol-provider", "mock", "--volpath", filepath.Join(dir, "volumes"), "--log-dir", filepath.Join(dir, "logs"), ) cmd.Stdout = &out cmd.Stderr = &out defer func() { debug(t, "*** flynn-host output ***") debug(t, out.String()) debug(t, "*************************") }() t.Assert(cmd.Start(), c.IsNil) defer cmd.Process.Kill() httpClient := &http.Client{Transport: &http.Transport{Dial: dialer.Retry.Dial}} client := cluster.NewHost(id, "http://127.0.0.1:11113", httpClient, nil) // exec a program which exits straight away _, err := client.Update("/bin/true") t.Assert(err, c.NotNil) status, err := client.GetStatus() t.Assert(err, c.IsNil) t.Assert(status.ID, c.Equals, id) t.Assert(status.PID, c.Equals, cmd.Process.Pid) // exec a program which reads the control socket but then exits _, err = client.Update("/bin/bash", "-c", "<&4; exit") t.Assert(err, c.NotNil) status, err = client.GetStatus() t.Assert(err, c.IsNil) t.Assert(status.ID, c.Equals, id) t.Assert(status.PID, c.Equals, cmd.Process.Pid) // exec flynn-host and check we get the status from the new daemon pid, err := client.Update( flynnHost, "daemon", "--http-port", "11113", "--state", filepath.Join(dir, "host-state.bolt"), "--id", id, "--backend", "mock", "--vol-provider", "mock", "--volpath", filepath.Join(dir, "volumes"), "--log-dir", filepath.Join(dir, "logs"), ) t.Assert(err, c.IsNil) defer syscall.Kill(pid, syscall.SIGKILL) done := make(chan struct{}) go func() { cmd.Process.Signal(syscall.SIGTERM) syscall.Wait4(cmd.Process.Pid, nil, 0, nil) close(done) }() select { case <-done: case <-time.After(15 * time.Second): t.Fatal("timed out waiting for flynn-host daemon to exit") } // client.GetStatus intermittently returns io.EOF right after the update. We // don't currently understand why (likely due to the way the listener is // passed around), so for now just retry the request. // // TODO(lmars): figure out why and remove this loop. delay := 100 * time.Millisecond for start := time.Now(); time.Since(start) < 10*time.Second; time.Sleep(delay) { status, err = client.GetStatus() if e, ok := err.(*url.Error); ok && strings.Contains(e.Err.Error(), "EOF") { debugf(t, "got io.EOF from flynn-host, trying again in %s", delay) continue } break } t.Assert(err, c.IsNil) t.Assert(status.ID, c.Equals, id) t.Assert(status.PID, c.Equals, pid) }