func (p *Postgres) waitForUpstream(upstream *discoverd.Instance) error { log := p.log.New("fn", "waitForUpstream", "upstream", upstream.Addr) log.Info("waiting for upstream to come online") client := pgmanager.NewClient(upstream.Addr) start := time.Now() for { status, err := client.Status() if err != nil { log.Error("error getting upstream status", "err", err) } else if status.Postgres.Running && status.Postgres.XLog != "" && status.Postgres.UserExists { log.Info("upstream is online") return nil } time.Sleep(checkInterval) if time.Now().Sub(start) > upstreamTimeout { log.Error("upstream did not come online in time") return errors.New("upstream is offline") } } }
// FixPostgres attempts to bring the "postgres" discoverd service back to a
// read-write state by restarting missing primary/secondary jobs based on the
// persisted pgstate metadata. It returns nil if postgres already reports
// read-write, and an error if recovery is not possible from the current state.
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	// best-effort lookup: a lookup error is treated the same as "no leader"
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		// normalize "leader with empty addr" to nil so later checks are simple
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	// best-effort: instance count is only used to decide how many jobs to start
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	if leader != nil && leader.Addr != "" {
		client := pgmanager.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			// non-fatal: a nil status just means we proceed with recovery below
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		// cluster is already healthy; nothing to fix
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	// Recover the desired cluster shape from the state stored in discoverd
	// service metadata.
	f.l.Info("getting postgres service metadata")
	meta, err := discoverd.NewService("postgres").GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}
	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	// Find the job/host of the recorded primary; fall back to the sync
	// instance's job if the primary's job details are unavailable.
	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}
	if leader != nil && state.Singleton {
		// a singleton cluster with a live (but not read-write) leader is not
		// something this fixer can repair by starting more jobs
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	// waitForInstance subscribes to service events BEFORE the job is started
	// (so the "up" event cannot be missed) and returns a wait func that blocks
	// until an instance with the given job ID comes up, or one minute passes.
	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				// skip the replay of pre-existing instances; only react to
				// "up" events that arrive after the current snapshot
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2 // primary + sync in the normal (non-singleton) topology
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		// start watching before AddJob so the up event is not missed
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		// only wait on this job if we didn't already start (and wait on) a primary
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}
	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		// if a leader was already running, poll it for read-write rather than
		// the instance we just started (presumably the existing leader is the
		// one expected to become read-write — confirm against pgmanager docs)
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}