func startJob(s *State, hc *cluster.Host, job *host.Job) error { jobStatus := make(chan error) events := make(chan *host.Event) stream, err := hc.StreamEvents(job.ID, events) if err != nil { return err } go func() { defer stream.Close() loop: for { select { case e, ok := <-events: if !ok { break loop } switch e.Event { case "start", "stop": jobStatus <- nil return case "error": job, err := hc.GetJob(job.ID) if err != nil { jobStatus <- err return } if job.Error == nil { jobStatus <- fmt.Errorf("bootstrap: unknown error from host") return } jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error) return default: } case <-time.After(30 * time.Second): jobStatus <- errors.New("bootstrap: timed out waiting for job event") return } } jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err()) }() if err := hc.AddJob(job); err != nil { return err } return <-jobStatus }
func startJob(s *State, hc *cluster.Host, job *host.Job) (*Job, error) { data := &Job{HostID: hc.ID(), JobID: job.ID} jobStatus := make(chan error) events := make(chan *host.Event) stream, err := hc.StreamEvents(data.JobID, events) if err != nil { return nil, err } go func() { defer stream.Close() for e := range events { switch e.Event { case "start", "stop": jobStatus <- nil return case "error": job, err := hc.GetJob(data.JobID) if err != nil { jobStatus <- err return } if job.Error == nil { jobStatus <- fmt.Errorf("bootstrap: unknown error from host") return } jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error) return default: } } jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err()) }() if err := hc.AddJob(job); err != nil { return nil, err } return data, <-jobStatus }
func (c *context) watchHost(h *cluster.Host, ready chan struct{}) { if !c.hosts.Add(h.ID()) { if ready != nil { ready <- struct{}{} } return } defer c.hosts.Remove(h.ID()) g := grohl.NewContext(grohl.Data{"fn": "watchHost", "host.id": h.ID()}) c.hosts.Set(h.ID(), h) g.Log(grohl.Data{"at": "start"}) ch := make(chan *host.Event) h.StreamEvents("all", ch) if ready != nil { ready <- struct{}{} } // Call PutJob in a goroutine so we don't block receiving job events whilst potentially // making multiple requests to the controller (e.g. if the controller is down). // // Use a channel (rather than spawning a goroutine per event) so that events are delivered in order. jobs := make(chan *ct.Job, 10) go func() { for job := range jobs { putJobAttempts.Run(func() error { if err := c.PutJob(job); err != nil { g.Log(grohl.Data{"at": "put_job_error", "job.id": job.ID, "state": job.State, "err": err}) // ignore validation / not found errors if httphelper.IsValidationError(err) || err == controller.ErrNotFound { return nil } return err } g.Log(grohl.Data{"at": "put_job", "job.id": job.ID, "state": job.State}) return nil }) } }() for event := range ch { meta := event.Job.Job.Metadata appID := meta["flynn-controller.app"] releaseID := meta["flynn-controller.release"] jobType := meta["flynn-controller.type"] if appID == "" || releaseID == "" { continue } job := &ct.Job{ ID: event.JobID, AppID: appID, ReleaseID: releaseID, Type: jobType, State: jobState(event), Meta: jobMetaFromMetadata(meta), } g.Log(grohl.Data{"at": "event", "job.id": event.JobID, "event": event.Event}) jobs <- job // get a read lock on the mutex to ensure we are not currently // syncing with the cluster c.mtx.RLock() j := c.jobs.Get(h.ID(), event.JobID) c.mtx.RUnlock() if j == nil { continue } j.startedAt = event.Job.StartedAt if event.Event != "error" && event.Event != "stop" { continue } g.Log(grohl.Data{"at": "remove", "job.id": event.JobID, "event": event.Event}) c.jobs.Remove(h.ID(), event.JobID) go func(event *host.Event) { c.mtx.RLock() j.Formation.RestartJob(jobType, h.ID(), event.JobID) c.mtx.RUnlock() }(event) } // TODO: check error/reconnect }