func startJob(s *State, hc *cluster.Host, job *host.Job) error { jobStatus := make(chan error) events := make(chan *host.Event) stream, err := hc.StreamEvents(job.ID, events) if err != nil { return err } go func() { defer stream.Close() loop: for { select { case e, ok := <-events: if !ok { break loop } switch e.Event { case "start", "stop": jobStatus <- nil return case "error": job, err := hc.GetJob(job.ID) if err != nil { jobStatus <- err return } if job.Error == nil { jobStatus <- fmt.Errorf("bootstrap: unknown error from host") return } jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error) return default: } case <-time.After(30 * time.Second): jobStatus <- errors.New("bootstrap: timed out waiting for job event") return } } jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err()) }() if err := hc.AddJob(job); err != nil { return err } return <-jobStatus }
func startJob(s *State, hc *cluster.Host, job *host.Job) (*Job, error) { data := &Job{HostID: hc.ID(), JobID: job.ID} jobStatus := make(chan error) events := make(chan *host.Event) stream, err := hc.StreamEvents(data.JobID, events) if err != nil { return nil, err } go func() { defer stream.Close() for e := range events { switch e.Event { case "start", "stop": jobStatus <- nil return case "error": job, err := hc.GetJob(data.JobID) if err != nil { jobStatus <- err return } if job.Error == nil { jobStatus <- fmt.Errorf("bootstrap: unknown error from host") return } jobStatus <- fmt.Errorf("bootstrap: host error while launching job: %q", *job.Error) return default: } } jobStatus <- fmt.Errorf("bootstrap: host job stream disconnected unexpectedly: %q", stream.Err()) }() if err := hc.AddJob(job); err != nil { return nil, err } return data, <-jobStatus }
func (f *ClusterFixer) FixPostgres() error { f.l.Info("checking postgres") service := discoverd.NewService("postgres") leader, _ := service.Leader() if leader == nil || leader.Addr == "" { f.l.Info("no running postgres leader") leader = nil } else { f.l.Info("found running postgres leader") } instances, _ := service.Instances() f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances))) f.l.Info("getting postgres status") var status *pgmanager.Status if leader != nil && leader.Addr != "" { client := pgmanager.NewClient(leader.Addr) var err error status, err = client.Status() if err != nil { f.l.Error("error getting status from postgres leader", "error", err) } } if status != nil && status.Postgres.ReadWrite { f.l.Info("postgres claims to be read-write") return nil } f.l.Info("getting postgres service metadata") meta, err := discoverd.NewService("postgres").GetMeta() if err != nil { return fmt.Errorf("error getting postgres state from discoverd: %s", err) } var state pgstate.State if err := json.Unmarshal(meta.Data, &state); err != nil { return fmt.Errorf("error decoding postgres state: %s", err) } if state.Primary == nil { return fmt.Errorf("no primary in postgres state") } f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"]) job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"]) if err != nil { if state.Sync != nil { f.l.Error("unable to get primary job info", "error", err) f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"]) job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"]) if err != nil { return fmt.Errorf("unable to get postgres primary or sync job details: %s", err) } } else { return fmt.Errorf("unable to get postgres primary job details: %s", err) } } if leader != nil && state.Singleton { return fmt.Errorf("postgres leader is running in singleton mode, unable to fix") } waitForInstance := func(jobID string) (func() (string, error), error) { watchCh := make(chan *discoverd.Event) upCh := make(chan string) stream, err := service.Watch(watchCh) if err != nil { return nil, fmt.Errorf("error watching discoverd service: %s", err) } go func() { var current bool for event := range watchCh { if event.Kind == discoverd.EventKindCurrent { current = true continue } if !current || event.Kind != discoverd.EventKindUp { continue } if event.Instance.Meta["FLYNN_JOB_ID"] == jobID { upCh <- event.Instance.Addr } } }() return func() (string, error) { f.l.Info("waiting for postgres instance to start", "job.id", jobID) defer stream.Close() select { case addr := <-upCh: return addr, nil case <-time.After(time.Minute): return "", fmt.Errorf("timed out waiting for postgres instance to come up") } }, nil } var wait func() (string, error) have := len(instances) want := 2 if state.Singleton { want = 1 } if have >= want { return fmt.Errorf("already have enough postgres instances, unable to fix") } f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have) if leader == nil { // if no postgres, attempt to start job.ID = cluster.GenerateJobID(host.ID(), "") f.FixJobEnv(job) f.l.Info("starting postgres primary job", "job.id", job.ID) wait, err = waitForInstance(job.ID) if err != nil { return err } if err := host.AddJob(job); err != nil { return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err) } have++ } if want > have { // if not enough postgres instances, start another var secondHost *cluster.Host for _, h := range f.hosts { if h.ID() != host.ID() { secondHost = h break } } if secondHost == nil { // if there are no other hosts, use the same one we put the primary on secondHost = host } job.ID = cluster.GenerateJobID(secondHost.ID(), "") f.FixJobEnv(job) f.l.Info("starting second postgres job", "job.id", job.ID) if wait == nil { wait, err = waitForInstance(job.ID) if err != nil { return err } } if err := utils.ProvisionVolume(secondHost, job); err != nil { return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err) } if err := secondHost.AddJob(job); err != nil { return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err) } } if wait != nil { addr, err := wait() if err != nil { return err } if leader != nil { addr = leader.Addr } f.l.Info("waiting for postgres to come up read-write") return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute) } return nil }
func (f *ClusterFixer) FixSirenia(svc string) error { log := f.l.New("fn", "FixSirenia", "service", svc) service := discoverd.NewService(svc) instances, _ := service.Instances() leader, _ := service.Leader() log.Info("getting service metadata") meta, err := service.GetMeta() if err != nil { return fmt.Errorf("error getting sirenia state from discoverd: %s", err) } var state state.State if err := json.Unmarshal(meta.Data, &state); err != nil { return fmt.Errorf("error decoding state: %s", err) } if state.Primary == nil { return fmt.Errorf("no primary in sirenia state") } log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"]) primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get primary job info") } var syncJob *host.Job var syncHost *cluster.Host if state.Sync != nil { log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"]) syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get sync job info") } } waitForInstance := func(jobID string) (func() (string, error), error) { watchCh := make(chan *discoverd.Event) upCh := make(chan string) stream, err := service.Watch(watchCh) if err != nil { return nil, fmt.Errorf("error watching discoverd service: %s", err) } go func() { var current bool for event := range watchCh { if event.Kind == discoverd.EventKindCurrent { current = true continue } if !current || event.Kind != discoverd.EventKindUp { continue } if event.Instance.Meta["FLYNN_JOB_ID"] == jobID { upCh <- event.Instance.Addr } } }() return func() (string, error) { log.Info("waiting for instance to start", "job.id", jobID) defer stream.Close() select { case addr := <-upCh: return addr, nil case <-time.After(time.Minute): return "", fmt.Errorf("timed out waiting for sirenia instance to come up") } }, nil } log.Info("terminating unassigned sirenia instances") outer: for _, i := range instances { if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) { continue } for _, a := range state.Async { if i.Addr == a.Addr { continue outer } } // job not assigned in state, attempt to terminate it if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok { hostID, err := cluster.ExtractHostID(jobID) if err != nil { log.Error("error extracting host id from jobID", "jobID", jobID, "err", err) } h := f.Host(hostID) if h != nil { if err := h.StopJob(jobID); err != nil { log.Error("error stopping unassigned sirenia job", "jobID", jobID) } } else { log.Error("host not found", "hostID", hostID) } } } isRunning := func(addr string) bool { for _, i := range instances { if i.Addr == addr { return true } } return false } // if the leader isn't currently running then start it using primaryJob/primaryHost var wait func() (string, error) if !isRunning(state.Primary.Addr) { // if we don't have info about the primary job attempt to promote the sync if primaryJob == nil { if syncJob != nil { // set primary job to sync primaryJob = syncJob primaryHost = syncHost // nil out sync job now so we can re-allocate it. syncJob = nil syncHost = nil } else { return fmt.Errorf("neither primary or sync job info available") } } primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "") f.FixJobEnv(primaryJob) log.Info("starting primary job", "job.id", primaryJob.ID) wait, err = waitForInstance(primaryJob.ID) if err != nil { return err } if err := primaryHost.AddJob(primaryJob); err != nil { return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err) } } if !state.Singleton && !isRunning(state.Sync.Addr) { if syncHost == nil { for _, h := range f.hosts { if h.ID() != primaryHost.ID() { syncHost = h break } } if syncHost == nil { // if there are no other hosts, use the same one we put the primary on syncHost = primaryHost } } // if we don't have a sync job then copy the primary job // and provision a new volume if syncJob == nil { syncJob = primaryJob vol := &ct.VolumeReq{Path: "/data"} if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil { return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err) } } syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "") f.FixJobEnv(syncJob) log.Info("starting sync job", "job.id", syncJob.ID) if wait == nil { wait, err = waitForInstance(syncJob.ID) if err != nil { return err } } if err := syncHost.AddJob(syncJob); err != nil { return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err) } } if wait != nil { addr, err := wait() if err != nil { return err } if leader != nil && leader.Addr != "" { addr = leader.Addr } log.Info("waiting for cluster to come up read-write", "addr", addr) return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute) } return nil }