Beispiel #1
0
// waitForUpstream blocks until the pgmanager API at upstream.Addr reports
// that postgres is running, has a non-empty XLog position and that the
// expected user exists.
//
// The status endpoint is polled in a loop, sleeping checkInterval between
// attempts. Errors from individual status requests are logged and treated as
// "not ready yet". Once more than upstreamTimeout has elapsed since the first
// attempt, an error is returned.
func (p *Postgres) waitForUpstream(upstream *discoverd.Instance) error {
	logger := p.log.New("fn", "waitForUpstream", "upstream", upstream.Addr)
	logger.Info("waiting for upstream to come online")

	api := pgmanager.NewClient(upstream.Addr)
	begin := time.Now()

	for {
		status, err := api.Status()
		switch {
		case err != nil:
			logger.Error("error getting upstream status", "err", err)
		case status.Postgres.Running && status.Postgres.XLog != "" && status.Postgres.UserExists:
			logger.Info("upstream is online")
			return nil
		}

		// The timeout is only evaluated after sleeping, so at least one
		// status check is always made before giving up.
		time.Sleep(checkInterval)
		if time.Since(begin) > upstreamTimeout {
			logger.Error("upstream did not come online in time")
			return errors.New("upstream is offline")
		}
	}
}
Beispiel #2
0
// FixPostgres tries to get the postgres service back into a read-write state
// by starting postgres jobs that are missing from discoverd.
//
// The procedure is:
//
//  1. Look up the current postgres leader and instances in discoverd. If the
//     leader reports itself as read-write there is nothing to do.
//  2. Decode the pgstate.State stored in the service metadata and fetch the
//     job definition of the recorded primary (falling back to the sync).
//  3. Refuse to act if a leader is running in singleton mode, or if discoverd
//     already lists as many instances as the state calls for (1 for
//     singleton, otherwise 2).
//  4. If there is no leader, start a new primary job on the host returned by
//     GetJob. If still short of instances, provision a volume and start an
//     additional job, preferring a different host.
//  5. Wait up to one minute for the first job started here to register in
//     discoverd, then up to five minutes for postgres to become read-write.
//
// It returns nil when postgres is (or becomes) read-write, and an error
// describing the first step that failed otherwise.
func (f *ClusterFixer) FixPostgres() error {
	f.l.Info("checking postgres")
	service := discoverd.NewService("postgres")
	// The lookup error is deliberately ignored: a failed lookup is handled
	// exactly like "no leader registered" below.
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		f.l.Info("no running postgres leader")
		leader = nil
	} else {
		f.l.Info("found running postgres leader")
	}
	// NOTE(review): the error is ignored here as well, so a failed lookup is
	// counted as zero running instances — confirm this best-effort behaviour
	// is intended.
	instances, _ := service.Instances()
	f.l.Info(fmt.Sprintf("found %d running postgres instances", len(instances)))

	f.l.Info("getting postgres status")
	var status *pgmanager.Status
	// leader is nil whenever its address was empty (normalised above).
	if leader != nil {
		var err error
		status, err = pgmanager.NewClient(leader.Addr).Status()
		if err != nil {
			f.l.Error("error getting status from postgres leader", "error", err)
		}
	}
	if status != nil && status.Postgres.ReadWrite {
		f.l.Info("postgres claims to be read-write")
		return nil
	}

	f.l.Info("getting postgres service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting postgres state from discoverd: %s", err)
	}

	var state pgstate.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding postgres state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in postgres state")
	}

	// The job definition of the old primary (or, failing that, the sync) is
	// used as the template for every job started below.
	f.l.Info("getting postgres primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting postgres sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get postgres primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get postgres primary job details: %s", err)
		}
	}

	if leader != nil && state.Singleton {
		return fmt.Errorf("postgres leader is running in singleton mode, unable to fix")
	}

	// closeWatch closes the discoverd watch opened by waitForInstance, if
	// any. The deferred call guarantees the watch is released on every return
	// path, including failures between opening the watch and waiting on it.
	var closeWatch func()
	defer func() {
		if closeWatch != nil {
			closeWatch()
		}
	}()

	// waitForInstance starts watching the postgres service for an "up" event
	// from the job with the given ID. The watch is opened immediately so the
	// event cannot be missed while the job is being started. The returned
	// function blocks until the instance address is seen or one minute has
	// passed, and closes the watch before returning.
	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		// Buffered and written with a non-blocking send so the watcher
		// goroutine can never get stuck on a send nobody will receive (after
		// a timeout, or on a duplicate event).
		upCh := make(chan string, 1)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		// Guard against closing the stream twice; closeWatch is only ever
		// called from the goroutine running FixPostgres.
		closed := false
		closeWatch = func() {
			if closed {
				return
			}
			closed = true
			stream.Close()
		}
		// The goroutine exits when watchCh is closed — presumably done by the
		// stream once it is closed; confirm against the discoverd client.
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				// Skip everything before the "current" marker (pre-existing
				// instances) and anything that is not an "up" event.
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					select {
					case upCh <- event.Instance.Addr:
					default:
					}
				}
			}
		}()
		return func() (string, error) {
			f.l.Info("waiting for postgres instance to start", "job.id", jobID)
			defer closeWatch()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for postgres instance to come up")
			}
		}, nil
	}

	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough postgres instances, unable to fix")
	}
	f.l.Info("attempting to start missing postgres jobs", "want", want, "have", have)
	if leader == nil {
		// if no postgres, attempt to start
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting postgres primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting postgres primary job on %s: %s", host.ID(), err)
		}
		have++
	}
	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		f.l.Info("starting second postgres job", "job.id", job.ID)
		// Only watch for this job if we are not already waiting on a primary
		// started above; at most one watch is ever opened.
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating postgres volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional postgres job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		// With an existing leader, read-write status is checked on the leader
		// rather than on the instance that was just started.
		if leader != nil {
			addr = leader.Addr
		}
		f.l.Info("waiting for postgres to come up read-write")
		return pgmanager.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}