Example #1
func (p *Process) waitForUpstream(upstream *discoverd.Instance) error {
	logger := p.Logger.New("fn", "waitForUpstream", "upstream", upstream.Addr, "upstream_http_addr", httpAddr(upstream.Addr))
	logger.Info("waiting for upstream to come online")
	upstreamClient := client.NewClient(upstream.Addr)

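	// The timer bounds the total wait; the ticker paces the status polls.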
	timer := time.NewTimer(upstreamTimeout)
	defer timer.Stop()

	ticker := time.NewTicker(checkInterval)
	defer ticker.Stop()

	for {
		status, err := upstreamClient.Status()
		if err != nil {
			logger.Error("error getting upstream status", "err", err)
		} else if status.Database != nil {
			logger.Info("status", "running", status.Database.Running, "xlog", status.Database.XLog, "user_exists", status.Database.UserExists)
			if status.Database.Running && status.Database.XLog != "" && status.Database.UserExists {
				logger.Info("upstream is online")
				return nil
			}
		} else {
			logger.Info("status", "running", "false")
		}

		select {
		case <-timer.C:
			logger.Error("upstream did not come online in time")
			return errors.New("upstream is offline")
		case <-ticker.C:
		}
	}
}
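The loop above (and the variant in Example #2) references two package-level timing constants that the excerpt does not include. A minimal sketch of what they could look like; the names come from the code, but the concrete durations are assumptions for illustration:

const (
	// Assumed values; the real package may use different durations.
	upstreamTimeout = 1 * time.Minute        // total time to wait for the upstream peer
	checkInterval   = 200 * time.Millisecond // delay between status polls
)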
Example #2
func (p *Postgres) waitForUpstream(upstream *discoverd.Instance) error {
	log := p.log.New("fn", "waitForUpstream", "upstream", upstream.Addr)
	log.Info("waiting for upstream to come online")
	client := client.NewClient(upstream.Addr)

	timeout := time.After(upstreamTimeout)
	for {
		status, err := client.Status()
		if err != nil {
			log.Error("error getting upstream status", "err", err)
		} else if status.Database != nil && status.Database.Running && status.Database.XLog != "" && status.Database.UserExists {
			log.Info("upstream is online")
			return nil
		}
		select {
		case <-timeout:
			log.Error("upstream did not come online in time")
			return errors.New("upstream is offline")
		case <-time.After(checkInterval):
		}
	}
}
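Compared with Example #1, this variant creates a fresh timer with time.After on every iteration instead of reusing a single time.Ticker, and it never stops the timeout timer. The overall deadline is still fixed, because the timeout channel is created once before the loop; Example #1 is the same logic refactored to stop its Timer and Ticker explicitly when the function returns.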
Example #3
func (a *SireniaWaitAction) Run(s *State) error {
	// use discoverd client to lookup leader
	d, err := s.DiscoverdClient()
	if err != nil {
		return err
	}

	var leader *discoverd.Instance
	err = attempt.Strategy{
		Min:   5,
		Total: 5 * time.Minute,
		Delay: 500 * time.Millisecond,
	}.Run(func() error {
		leader, err = d.Service(a.Service).Leader()
		return err
	})
	if err != nil {
		return err
	}

	// connect using sirenia client and wait until database reports read/write
	return client.NewClient(leader.Addr).WaitForReadWrite(5 * time.Minute)
}
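The wait happens in two stages: the leader lookup is retried with attempt.Strategy (at least 5 attempts, 500ms apart, for up to 5 minutes), and only once a leader is found does the sirenia client wait up to a further 5 minutes for the database to report read-write.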
Example #4
func (d *DeployJob) deploySirenia() (err error) {
	log := d.logger.New("fn", "deploySirenia")
	log.Info("starting sirenia deployment")

	defer func() {
		if err != nil {
			err = ErrSkipRollback{err.Error()}
		}
	}()

	loggedErr := func(e string) error {
		log.Error(e)
		return errors.New(e)
	}

	processType := d.oldRelease.Env["SIRENIA_PROCESS"]
	// if the process type isn't set, try getting it from the new release
	if processType == "" {
		processType = d.newRelease.Env["SIRENIA_PROCESS"]
	}
	// if it's still not set, we have a problem.
	if processType == "" {
		return errors.New("unable to determine sirenia process type")
	}

	// if sirenia process type is scaled to 0, skip and deploy non-sirenia processes
	if d.Processes[processType] == 0 {
		log.Info("sirenia process type scale = 0, skipping")
		return d.deployOneByOne()
	}

	if d.serviceMeta == nil {
		return loggedErr("missing sirenia cluster state")
	}

	var state state.State
	log.Info("decoding sirenia cluster state")
	if err := json.Unmarshal(d.serviceMeta.Data, &state); err != nil {
		log.Error("error decoding sirenia cluster state", "err", err)
		return err
	}

	// abort if in singleton mode or not deploying from a clean state
	if state.Singleton {
		return loggedErr("sirenia cluster in singleton mode")
	}
	if len(state.Async) == 0 {
		return loggedErr("sirenia cluster in unhealthy state (has no asyncs)")
	}
	if 2+len(state.Async) != d.Processes[processType] {
		return loggedErr("sirenia cluster in unhealthy state (too few asyncs)")
	}
	if processesEqual(d.newReleaseState, d.Processes) {
		log.Info("deployment already completed, nothing to do")
		return nil
	}
	if d.newReleaseState[processType] > 0 {
		return loggedErr("sirenia cluster in unexpected state")
	}

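	// stopInstance stops an old-release peer via the sirenia client and then
	// waits for its discoverd instance to go down (or the deploy timeout to expire).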
	stopInstance := func(inst *discoverd.Instance) error {
		log := log.New("job_id", inst.Meta["FLYNN_JOB_ID"])

		d.deployEvents <- ct.DeploymentEvent{
			ReleaseID: d.OldReleaseID,
			JobState:  ct.JobStateStopping,
			JobType:   processType,
		}
		peer := client.NewClient(inst.Addr)
		log.Info("stopping peer")
		if err := peer.Stop(); err != nil {
			log.Error("error stopping peer", "err", err)
			return err
		}
		log.Info("waiting for peer to stop")
		jobEvents := d.ReleaseJobEvents(d.OldReleaseID)
		for {
			select {
			case e := <-jobEvents:
				if e.Type == JobEventTypeError {
					return e.Error
				}
				if e.Type != JobEventTypeDiscoverd {
					continue
				}
				event := e.DiscoverdEvent
				if event.Kind == discoverd.EventKindDown && event.Instance.ID == inst.ID {
					d.deployEvents <- ct.DeploymentEvent{
						ReleaseID: d.OldReleaseID,
						JobState:  ct.JobStateDown,
						JobType:   processType,
					}
					return nil
				}
			case <-time.After(time.Duration(d.DeployTimeout) * time.Second):
				return loggedErr("timed out waiting for peer to stop")
			}
		}
	}

	// newPrimary is the first new instance started, newSync the second
	var newPrimary, newSync *discoverd.Instance
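	// startInstance scales the new-release formation up by one and waits for the
	// new job to register with discoverd before returning its instance.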
	startInstance := func() (*discoverd.Instance, error) {
		log.Info("starting new instance")
		d.deployEvents <- ct.DeploymentEvent{
			ReleaseID: d.NewReleaseID,
			JobState:  ct.JobStateStarting,
			JobType:   processType,
		}
		d.newReleaseState[processType]++
		if err := d.client.PutFormation(&ct.Formation{
			AppID:     d.AppID,
			ReleaseID: d.NewReleaseID,
			Processes: d.newReleaseState,
		}); err != nil {
			log.Error("error scaling formation up by one", "err", err)
			return nil, err
		}
		log.Info("waiting for new instance to come up")
		var inst *discoverd.Instance
		jobEvents := d.ReleaseJobEvents(d.NewReleaseID)
	loop:
		for {
			select {
			case e := <-jobEvents:
				if e.Type == JobEventTypeError {
					return nil, e.Error
				}
				if e.Type != JobEventTypeDiscoverd {
					continue
				}
				event := e.DiscoverdEvent
				if event.Kind == discoverd.EventKindUp &&
					event.Instance.Meta != nil &&
					event.Instance.Meta["FLYNN_RELEASE_ID"] == d.NewReleaseID &&
					event.Instance.Meta["FLYNN_PROCESS_TYPE"] == processType {
					inst = event.Instance
					break loop
				}
			case <-time.After(time.Duration(d.DeployTimeout) * time.Second):
				return nil, loggedErr("timed out waiting for new instance to come up")
			}
		}
		if newPrimary == nil {
			newPrimary = inst
		} else if newSync == nil {
			newSync = inst
		}
		d.deployEvents <- ct.DeploymentEvent{
			ReleaseID: d.NewReleaseID,
			JobState:  ct.JobStateUp,
			JobType:   processType,
		}
		return inst, nil
	}
	waitForSync := func(upstream, downstream *discoverd.Instance) error {
		log.Info("waiting for replication sync", "upstream", upstream.Addr, "downstream", downstream.Addr)
		client := client.NewClient(upstream.Addr)
		if err := client.WaitForReplSync(downstream, 3*time.Minute); err != nil {
			log.Error("error waiting for replication sync", "err", err)
			return err
		}
		return nil
	}
	waitForReadWrite := func(inst *discoverd.Instance) error {
		log.Info("waiting for read-write", "inst", inst.Addr)
		client := client.NewClient(inst.Addr)
		if err := client.WaitForReadWrite(3 * time.Minute); err != nil {
			log.Error("error waiting for read-write", "err", err)
			return err
		}
		return nil
	}

	// asyncUpstream is the instance we will query for replication status
	// of the new async, which will be the sync if there is only one
	// async, or the tail async otherwise.
	asyncUpstream := state.Sync
	if len(state.Async) > 1 {
		asyncUpstream = state.Async[len(state.Async)-1]
	}
	for i := 0; i < len(state.Async); i++ {
		log.Info("replacing an Async node")
		newInst, err := startInstance()
		if err != nil {
			return err
		}
		if err := stopInstance(state.Async[i]); err != nil {
			return err
		}
		if err := waitForSync(asyncUpstream, newInst); err != nil {
			return err
		}
		// the new instance is now the tail async
		asyncUpstream = newInst
	}

	log.Info("replacing the Sync node")
	_, err = startInstance()
	if err != nil {
		return err
	}
	if err := stopInstance(state.Sync); err != nil {
		return err
	}
	if err := waitForSync(state.Primary, newPrimary); err != nil {
		return err
	}

	// wait for the new Sync to catch up with the new Primary *before* stopping the
	// old Primary, to avoid backups failing
	if err := waitForSync(newPrimary, newSync); err != nil {
		return err
	}

	log.Info("replacing the Primary node")
	_, err = startInstance()
	if err != nil {
		return err
	}
	if err := stopInstance(state.Primary); err != nil {
		return err
	}
	if err := waitForReadWrite(newPrimary); err != nil {
		return err
	}

	log.Info("stopping old jobs")
	d.oldReleaseState[processType] = 0
	if err := d.client.PutFormation(&ct.Formation{
		AppID:     d.AppID,
		ReleaseID: d.OldReleaseID,
		Processes: d.oldReleaseState,
	}); err != nil {
		log.Error("error scaling old formation", "err", err)
		return err
	}

	log.Info(fmt.Sprintf("waiting for %d job down events", d.Processes[processType]))
	actual := 0
	jobEvents := d.ReleaseJobEvents(d.OldReleaseID)
loop:
	for {
		select {
		case e := <-jobEvents:
			if e.Type == JobEventTypeError {
				return loggedErr(e.Error.Error())
			}
			if e.Type != JobEventTypeController {
				continue
			}
			event := e.JobEvent
			log.Info("got job event", "job_id", event.ID, "type", event.Type, "state", event.State)
			if event.State == ct.JobStateDown && event.Type == processType {
				actual++
				if actual == d.Processes[processType] {
					break loop
				}
			}
		case <-time.After(time.Duration(d.DeployTimeout) * time.Second):
			return loggedErr("timed out waiting for job events")
		}
	}

	// do a one-by-one deploy for the other process types
	return d.deployOneByOne()
}
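The rollout mirrors the replication chain: the asyncs are replaced one by one (each new instance's replication is verified against the current tail of the chain), then the sync, then the primary. The new sync must catch up with the new primary before the old primary is stopped so that backups keep working; once the old formation has been scaled to zero and the expected down events have arrived, the remaining process types are deployed one by one.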