func (f *ClusterFixer) CheckSirenia(svc string) error { log := f.l.New("fn", "CheckSirenia", "service", svc) log.Info("checking sirenia cluster status") service := discoverd.NewService(svc) leader, _ := service.Leader() if leader == nil || leader.Addr == "" { log.Info("no running leader") leader = nil } else { log.Info("found running leader") } instances, _ := service.Instances() log.Info("found running instances", "count", len(instances)) log.Info("getting sirenia status") var status *sirenia.Status if leader != nil && leader.Addr != "" { client := sirenia.NewClient(leader.Addr) var err error status, err = client.Status() if err != nil { log.Error("error getting status from leader", "error", err) } } if status != nil && status.Database != nil && status.Database.ReadWrite { log.Info("cluster claims to be read-write") return nil } return fmt.Errorf("cluster isn't read-write") }
func (p *Postgres) waitForUpstream(upstream *discoverd.Instance) error { log := p.log.New("fn", "waitForUpstream", "upstream", upstream.Addr) log.Info("waiting for upstream to come online") client := client.NewClient(upstream.Addr) timeout := time.After(upstreamTimeout) for { status, err := client.Status() if err != nil { log.Error("error getting upstream status", "err", err) } else if status.Database != nil && status.Database.Running && status.Database.XLog != "" && status.Database.UserExists { log.Info("upstream is online") return nil } select { case <-timeout: log.Error("upstream did not come online in time") return errors.New("upstream is offline") case <-time.After(checkInterval): } } }
func (f *ClusterFixer) FixSirenia(svc string) error { log := f.l.New("fn", "FixSirenia", "service", svc) log.Info("checking sirenia cluster status") service := discoverd.NewService(svc) leader, _ := service.Leader() if leader == nil || leader.Addr == "" { log.Info("no running leader") leader = nil } else { log.Info("found running leader") } instances, _ := service.Instances() log.Info("found running instances", "count", len(instances)) log.Info("getting sirenia status") var status *sirenia.Status if leader != nil && leader.Addr != "" { client := sirenia.NewClient(leader.Addr) var err error status, err = client.Status() if err != nil { log.Error("error getting status from leader", "error", err) } } if status != nil && status.Database != nil && status.Database.ReadWrite { log.Info("cluster claims to be read-write") return nil } log.Info("getting service metadata") meta, err := discoverd.NewService(svc).GetMeta() if err != nil { return fmt.Errorf("error getting sirenia state from discoverd: %s", err) } var state state.State if err := json.Unmarshal(meta.Data, &state); err != nil { return fmt.Errorf("error decoding state: %s", err) } if state.Primary == nil { return fmt.Errorf("no primary in sirenia state") } log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"]) primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get primary job info") } var syncJob *host.Job var syncHost *cluster.Host if state.Sync != nil { log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"]) syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get sync job info") } } waitForInstance := func(jobID string) (func() (string, error), error) { watchCh := make(chan *discoverd.Event) upCh := make(chan string) stream, err := service.Watch(watchCh) if err != nil { return nil, fmt.Errorf("error watching discoverd service: %s", err) } go func() { var current bool for event := range watchCh { if event.Kind == discoverd.EventKindCurrent { current = true continue } if !current || event.Kind != discoverd.EventKindUp { continue } if event.Instance.Meta["FLYNN_JOB_ID"] == jobID { upCh <- event.Instance.Addr } } }() return func() (string, error) { log.Info("waiting for instance to start", "job.id", jobID) defer stream.Close() select { case addr := <-upCh: return addr, nil case <-time.After(time.Minute): return "", fmt.Errorf("timed out waiting for sirenia instance to come up") } }, nil } log.Info("terminating unassigned sirenia instances") outer: for _, i := range instances { if i.Addr == state.Primary.Addr || i.Addr == state.Sync.Addr { continue } for _, a := range state.Async { if i.Addr == a.Addr { continue outer } } // job not assigned in state, attempt to terminate it if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok { hostID, err := cluster.ExtractHostID(jobID) if err != nil { log.Error("error extracting host id from jobID", "jobID", jobID, "err", err) } h := f.Host(hostID) if h != nil { if err := h.StopJob(jobID); err != nil { log.Error("error stopping unassigned sirenia job", "jobID", jobID) } } else { log.Error("host not found", "hostID", hostID) } } } isRunning := func(addr string) bool { for _, i := range instances { if i.Addr == addr { return true } } return false } // if the leader isn't currently running then start it using primaryJob/primaryHost var wait func() (string, error) if !isRunning(state.Primary.Addr) { // if we don't have info about the primary job attempt to promote the sync if primaryJob == nil { if syncJob != nil { // set primary job to sync primaryJob = syncJob primaryHost = syncHost // nil out sync job now so we can re-allocate it. syncJob = nil syncHost = nil } else { return fmt.Errorf("neither primary or sync job info available") } } primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "") f.FixJobEnv(primaryJob) log.Info("starting primary job", "job.id", primaryJob.ID) wait, err = waitForInstance(primaryJob.ID) if err != nil { return err } if err := primaryHost.AddJob(primaryJob); err != nil { return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err) } } if !state.Singleton && !isRunning(state.Sync.Addr) { if syncHost == nil { for _, h := range f.hosts { if h.ID() != primaryHost.ID() { syncHost = h break } } if syncHost == nil { // if there are no other hosts, use the same one we put the primary on syncHost = primaryHost } } // if we don't have a sync job then copy the primary job // and provision a new volume if syncJob == nil { syncJob = primaryJob if err := utils.ProvisionVolume(syncHost, syncJob); err != nil { return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err) } } syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "") f.FixJobEnv(syncJob) log.Info("starting sync job", "job.id", syncJob.ID) if wait == nil { wait, err = waitForInstance(syncJob.ID) if err != nil { return err } } if err := syncHost.AddJob(syncJob); err != nil { return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err) } } if wait != nil { addr, err := wait() if err != nil { return err } if leader != nil { addr = leader.Addr } log.Info("waiting for cluster to come up read-write") return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute) } return nil }