func (d *discoverdWrapper) Register() (bool, error) {
	log := logger.New("fn", "discoverd.Register")

	log.Info("registering with service discovery")
	hb, err := discoverd.AddServiceAndRegister(serviceName, d.addr)
	if err != nil {
		log.Error("error registering with service discovery", "err", err)
		return false, err
	}
	shutdown.BeforeExit(func() { hb.Close() })

	selfAddr := hb.Addr()
	log = log.New("self.addr", selfAddr)

	service := discoverd.NewService(serviceName)
	var leaders chan *discoverd.Instance
	var stream stream.Stream
	connect := func() (err error) {
		log.Info("connecting service leader stream")
		leaders = make(chan *discoverd.Instance)
		stream, err = service.Leaders(leaders)
		if err != nil {
			log.Error("error connecting service leader stream", "err", err)
		}
		return
	}
	if err := connect(); err != nil {
		return false, err
	}

	go func() {
	outer:
		for {
			for leader := range leaders {
				if leader == nil {
					// a nil leader indicates there are no instances for
					// the service, ignore and wait for an actual leader
					log.Warn("received nil leader event")
					continue
				}
				log.Info("received leader event", "leader.addr", leader.Addr)
				d.leader <- leader.Addr == selfAddr
			}
			log.Warn("service leader stream disconnected", "err", stream.Err())
			for {
				if err := connect(); err == nil {
					continue outer
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	select {
	case isLeader := <-d.leader:
		return isLeader, nil
	case <-time.After(30 * time.Second):
		return false, errors.New("timed out waiting for current service leader")
	}
}
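// All of these functions lean on the same small stream interface when they
// log stream.Err() after the event channel closes and defer stream.Close().
// A minimal sketch, assuming the flynn pkg/stream package:
type Stream interface {
	// Close unsubscribes from the stream; no further events are delivered.
	Close() error
	// Err returns the error (if any) that caused the stream to end.
	Err() error
}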
func (s *Scheduler) streamHostEvents() error {
	log := logger.New("fn", "streamHostEvents")

	var events chan *discoverd.Event
	var stream stream.Stream
	connect := func() (err error) {
		log.Info("connecting host event stream")
		events = make(chan *discoverd.Event, eventBufferSize)
		stream, err = s.StreamHostEvents(events)
		if err != nil {
			log.Error("error connecting host event stream", "err", err)
		}
		return
	}
	if err := connect(); err != nil {
		return err
	}

	current := make(chan struct{})
	go func() {
		var isCurrent bool
	outer:
		for {
			for event := range events {
				switch event.Kind {
				case discoverd.EventKindCurrent:
					if !isCurrent {
						isCurrent = true
						close(current)
					}
				case discoverd.EventKindUp, discoverd.EventKindDown:
					// if we are not current, explicitly handle the event
					// so that the scheduler is streaming job events from
					// all current hosts before starting the main loop.
					if !isCurrent {
						s.HandleHostEvent(event)
						continue
					}
					s.hostEvents <- event
				}
			}
			log.Warn("host event stream disconnected", "err", stream.Err())
			for {
				if err := connect(); err == nil {
					continue outer
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	select {
	case <-current:
		return nil
	case <-time.After(30 * time.Second):
		return errors.New("timed out waiting for current host list")
	}
}
func waitForJobEvents(t *c.C, stream stream.Stream, events chan *ct.JobEvent, expected jobEvents) (lastID int64, jobID string) {
	debugf(t, "waiting for job events: %v", expected)
	actual := make(jobEvents)
	for {
	inner:
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed: %s", stream.Err())
			}
			debugf(t, "got job event: %s %s %s", event.Type, event.JobID, event.State)
			lastID = event.ID
			jobID = event.JobID
			if _, ok := actual[event.Type]; !ok {
				actual[event.Type] = make(map[string]int)
			}
			switch event.State {
			case "starting", "up", "down":
				actual[event.Type][event.State] += 1
			case "crashed":
				actual[event.Type]["down"] += 1
			default:
				break inner
			}
			if jobEventsEqual(expected, actual) {
				return
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for job events: ", expected)
		}
	}
}
func waitForJobRestart(t *c.C, stream stream.Stream, events chan *ct.JobEvent, typ string, timeout time.Duration) string {
	debug(t, "waiting for job restart")
	for {
		select {
		case event, ok := <-events:
			if !ok {
				t.Fatalf("job event stream closed: %s", stream.Err())
			}
			debug(t, "got job event: ", event.Type, event.JobID, event.State)
			if event.Type == typ && event.State == "up" {
				return event.JobID
			}
		case <-time.After(timeout):
			t.Fatal("timed out waiting for job restart")
		}
	}
}
func (d *discoverdWrapper) Register() bool {
	log := d.logger.New("fn", "discoverd.Register")

	var hb discoverd.Heartbeater
	for {
		var err error
		log.Info("registering with service discovery")
		hb, err = discoverd.AddServiceAndRegister(serviceName, ":"+os.Getenv("PORT"))
		if err == nil {
			break
		}
		log.Error("error registering with service discovery", "err", err)
		time.Sleep(time.Second)
	}
	shutdown.BeforeExit(func() { hb.Close() })

	selfAddr := hb.Addr()
	log = log.New("self.addr", selfAddr)

	service := discoverd.NewService(serviceName)
	var leaders chan *discoverd.Instance
	var stream stream.Stream
	connect := func() (err error) {
		log.Info("connecting service leader stream")
		leaders = make(chan *discoverd.Instance)
		stream, err = service.Leaders(leaders)
		if err != nil {
			log.Error("error connecting service leader stream", "err", err)
		}
		return
	}

	go func() {
		for {
			for {
				if err := connect(); err == nil {
					break
				}
				time.Sleep(100 * time.Millisecond)
			}

			for leader := range leaders {
				if leader == nil {
					// a nil leader indicates there are no instances for
					// the service, ignore and wait for an actual leader
					log.Warn("received nil leader event")
					continue
				}
				log.Info("received leader event", "leader.addr", leader.Addr)
				d.leader <- leader.Addr == selfAddr
			}
			log.Warn("service leader stream disconnected", "err", stream.Err())
		}
	}()

	start := time.Now()
	tick := time.Tick(30 * time.Second)
	for {
		select {
		case isLeader := <-d.leader:
			return isLeader
		case <-tick:
			log.Warn("still waiting for current service leader", "duration", time.Since(start))
		}
	}
}
func (s *Scheduler) streamFormationEvents() error {
	log := logger.New("fn", "streamFormationEvents")

	var events chan *ct.ExpandedFormation
	var stream stream.Stream
	var since *time.Time
	connect := func() (err error) {
		log.Info("connecting formation event stream")
		events = make(chan *ct.ExpandedFormation, eventBufferSize)
		stream, err = s.StreamFormations(since, events)
		if err != nil {
			log.Error("error connecting formation event stream", "err", err)
		}
		return
	}
	strategy := attempt.Strategy{Delay: 100 * time.Millisecond, Total: time.Minute}
	if err := strategy.Run(connect); err != nil {
		return err
	}

	current := make(chan struct{})
	go func() {
		var isCurrent bool
	outer:
		for {
			for formation := range events {
				// an empty formation indicates we now have the
				// current list of formations.
				if formation.App == nil {
					if !isCurrent {
						isCurrent = true
						close(current)
					}
					continue
				}

				since = &formation.UpdatedAt

				// if we are not current, explicitly handle the event
				// so that the scheduler has the current list of
				// formations before starting the main loop.
				if !isCurrent {
					s.HandleFormationChange(formation)
					continue
				}

				s.formationEvents <- formation
			}
			log.Warn("formation event stream disconnected", "err", stream.Err())
			for {
				if err := connect(); err == nil {
					continue outer
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	select {
	case <-current:
		return nil
	case <-time.After(30 * time.Second):
		return errors.New("timed out waiting for current formation list")
	}
}
func (r *Router) watchBackends() {
	log := r.logger.New("fn", "router.watchBackends", "router.id", r.ID)

	var events chan *router.StreamEvent
	var stream stream.Stream
	connect := func() (err error) {
		log.Info("connecting router event stream")
		events = make(chan *router.StreamEvent)
		opts := &router.StreamEventsOptions{
			EventTypes: []router.EventType{
				router.EventTypeBackendUp,
				router.EventTypeBackendDrained,
			},
		}
		stream, err = r.client.StreamEvents(opts, events)
		if err != nil {
			log.Error("error connecting router event stream", "err", err)
		}
		return
	}

	// make initial connection
	for {
		if err := connect(); err == nil {
			defer stream.Close()
			break
		}
		select {
		case <-r.stop:
			return
		case <-time.After(100 * time.Millisecond):
		}
	}

	for {
	eventLoop:
		for {
			select {
			case event, ok := <-events:
				if !ok {
					break eventLoop
				}
				r.events <- &RouterEvent{
					RouterID: r.ID,
					Type:     event.Event,
					Backend:  event.Backend,
				}
			case <-r.stop:
				return
			}
		}

		log.Warn("router event stream disconnected", "err", stream.Err())

		// keep trying to reconnect, unless we are told to stop
	retryLoop:
		for {
			select {
			case <-r.stop:
				return
			default:
			}
			if err := connect(); err == nil {
				break retryLoop
			}
			time.Sleep(100 * time.Millisecond)
		}
	}
}
// StreamEventsTo streams all job events from the host to the given channel in
// a goroutine, returning the current list of active jobs.
func (h *Host) StreamEventsTo(ch chan *host.Event) (map[string]host.ActiveJob, error) {
	log := h.logger.New("fn", "StreamEventsTo", "host.id", h.ID)
	var events chan *host.Event
	var stream stream.Stream
	connect := func() (err error) {
		log.Info("connecting job event stream")
		events = make(chan *host.Event)
		stream, err = h.client.StreamEvents("all", events)
		if err != nil {
			log.Error("error connecting job event stream", "err", err)
		}
		return
	}
	if err := connect(); err != nil {
		return nil, err
	}

	log.Info("getting active jobs")
	jobs, err := h.client.ListJobs()
	if err != nil {
		log.Error("error getting active jobs", "err", err)
		return nil, err
	}
	log.Info(fmt.Sprintf("got %d active job(s) for host %s", len(jobs), h.ID))

	go func() {
		defer stream.Close()
		defer close(h.done)
		for {
		eventLoop:
			for {
				select {
				case event, ok := <-events:
					if !ok {
						break eventLoop
					}
					ch <- event
				case <-h.stop:
					return
				}
			}

			log.Warn("job event stream disconnected", "err", stream.Err())

			// keep trying to reconnect, unless we are told to stop
		retryLoop:
			for {
				select {
				case <-h.stop:
					return
				default:
				}
				if err := connect(); err == nil {
					break retryLoop
				}
				time.Sleep(100 * time.Millisecond)
			}
		}
	}()

	return jobs, nil
}
func (c *serviceConn) watch(srv discoverd.Service, eventc <-chan *discoverd.Event, stream stream.Stream) {
	g := grohl.NewContext(grohl.Data{"at": "logmux_service_watch"})

	var (
		resetc                  = make(chan time.Time)
		reconc <-chan time.Time = resetc
	)
	defer close(resetc)

	for {
		select {
		case event, ok := <-eventc:
			if !ok {
				c.hangup()
				return
			}
			g.Log(grohl.Data{"status": "event", "event": event.Kind.String()})

			switch event.Kind {
			case discoverd.EventKindLeader:
				reconc = resetc
				if err := c.reset(); err != nil {
					g.Log(grohl.Data{"status": "error", "err": err.Error()})
				}
				if err := c.connect(srv); err != nil {
					g.Log(grohl.Data{"status": "error", "err": err.Error()})
					reconc = time.After(100 * time.Millisecond)
				}
			default:
			}
		case err := <-c.errc:
			g.Log(grohl.Data{"status": "write-error", "err": err.Error()})

			reconc = resetc
			if err := c.reset(); err != nil {
				g.Log(grohl.Data{"status": "error", "err": err.Error()})
			}
			if err := c.connect(srv); err != nil {
				g.Log(grohl.Data{"status": "error", "err": err.Error()})
				reconc = time.After(100 * time.Millisecond)
			}
		case <-reconc:
			if err := c.connect(srv); err != nil {
				g.Log(grohl.Data{"status": "reconnect-error", "err": err.Error()})
				reconc = time.After(100 * time.Millisecond)
			}
		case <-c.donec:
			if err := stream.Close(); err != nil {
				g.Log(grohl.Data{"status": "error", "err": err.Error()})
			}
			if err := c.reset(); err != nil {
				g.Log(grohl.Data{"status": "error", "err": err.Error()})
			}
			return
		case <-c.closec:
			if err := stream.Close(); err != nil {
				g.Log(grohl.Data{"status": "error", "err": err.Error()})
			}
			c.hangup()
			return
		}
	}
}
func (s *SchedulerSuite) TestDeployController(t *c.C) {
	if testCluster == nil {
		t.Skip("cannot determine test cluster size")
	}

	// get the current controller release
	client := s.controllerClient(t)
	app, err := client.GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := client.GetAppRelease(app.ID)
	t.Assert(err, c.IsNil)

	// create a controller deployment
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	deployment, err := client.CreateDeployment(app.ID, release.ID)
	t.Assert(err, c.IsNil)

	// use a function to create the event stream as a new stream will be needed
	// after deploying the controller
	var events chan *ct.DeploymentEvent
	var eventStream stream.Stream
	connectStream := func() {
		events = make(chan *ct.DeploymentEvent)
		err := attempt.Strategy{
			Total: 10 * time.Second,
			Delay: 500 * time.Millisecond,
		}.Run(func() (err error) {
			eventStream, err = client.StreamDeployment(deployment.ID, events)
			return
		})
		t.Assert(err, c.IsNil)
	}
	connectStream()
	defer eventStream.Close()

	// wait for the deploy to complete (this doesn't wait for specific events
	// because when the deployer deploys itself, some events will not get sent)
loop:
	for {
		select {
		case e, ok := <-events:
			if !ok {
				// reconnect the stream as it may have been closed
				// due to the controller being deployed
				debug(t, "reconnecting deployment event stream")
				connectStream()
				continue
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatal("the deployment failed")
			}
		case <-time.After(60 * time.Second):
			t.Fatal("timed out waiting for the deploy to complete")
		}
	}

	// check the correct controller jobs are running
	hosts, err := s.clusterClient(t).ListHosts()
	t.Assert(err, c.IsNil)
	actual := make(map[string]map[string]int)
	for _, host := range hosts {
		for _, job := range host.Jobs {
			appID := job.Metadata["flynn-controller.app"]
			if appID != app.ID {
				continue
			}
			releaseID := job.Metadata["flynn-controller.release"]
			if _, ok := actual[releaseID]; !ok {
				actual[releaseID] = make(map[string]int)
			}
			typ := job.Metadata["flynn-controller.type"]
			actual[releaseID][typ]++
		}
	}
	expected := map[string]map[string]int{release.ID: {
		"web":       2,
		"deployer":  2,
		"scheduler": testCluster.Size(),
	}}
	t.Assert(actual, c.DeepEquals, expected)
}