// isDifferentTimelineBranch reports whether the followed instance lives on a
// diverged timeline branch, i.e. one we cannot simply keep following.
func (p *PostgresKeeper) isDifferentTimelineBranch(followedDB *cluster.DB, pgState *cluster.PostgresState) bool {
	if followedDB.Status.TimelineID < pgState.TimelineID {
		log.Info("followed instance timeline is older than our timeline", zap.Uint64("followedTimeline", followedDB.Status.TimelineID), zap.Uint64("timeline", pgState.TimelineID))
		return true
	}

	// If the timelines are the same, also check that the switch points match.
	if followedDB.Status.TimelineID == pgState.TimelineID {
		if pgState.TimelineID <= 1 {
			// If timeline <= 1 then no timeline history file exists.
			return false
		}
		ftlh := followedDB.Status.TimelinesHistory.GetTimelineHistory(pgState.TimelineID - 1)
		tlh := pgState.TimelinesHistory.GetTimelineHistory(pgState.TimelineID - 1)
		if ftlh == nil || tlh == nil {
			// No timeline history to check
			return false
		}
		if ftlh.SwitchPoint == tlh.SwitchPoint {
			return false
		}
		log.Info("followed instance timeline forked at a different xlog pos than our timeline", zap.Uint64("followedTimeline", followedDB.Status.TimelineID), zap.Uint64("followedXlogpos", ftlh.SwitchPoint), zap.Uint64("timeline", pgState.TimelineID), zap.Uint64("xlogpos", tlh.SwitchPoint))
		return true
	}

	// followedDB.Status.TimelineID > pgState.TimelineID
	ftlh := followedDB.Status.TimelinesHistory.GetTimelineHistory(pgState.TimelineID)
	if ftlh != nil {
		if ftlh.SwitchPoint < pgState.XLogPos {
			log.Info("followed instance timeline forked before our current state", zap.Uint64("followedTimeline", followedDB.Status.TimelineID), zap.Uint64("followedXlogpos", ftlh.SwitchPoint), zap.Uint64("timeline", pgState.TimelineID), zap.Uint64("xlogpos", pgState.XLogPos))
			return true
		}
	}
	return false
}
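// A minimal, self-contained sketch of the decision table above, using toy
// stand-in types (hypothetical, not the stolon cluster types) so the three
// cases can be read in isolation: a followed timeline older than ours means
// divergence; an equal timeline is compared on the previous timeline's
// switch point; a newer followed timeline diverged only if it forked before
// our current xlog position.
type tlHistory struct {
	TimelineID  uint64
	SwitchPoint uint64
}

type tlState struct {
	TimelineID uint64
	XLogPos    uint64
	History    map[uint64]tlHistory // timeline history entries keyed by timeline ID
}

func isDivergedSketch(followed, ours tlState) bool {
	switch {
	case followed.TimelineID < ours.TimelineID:
		// the followed instance is behind on an older timeline
		return true
	case followed.TimelineID == ours.TimelineID:
		if ours.TimelineID <= 1 {
			return false // no timeline history file exists yet
		}
		f, fok := followed.History[ours.TimelineID-1]
		o, ook := ours.History[ours.TimelineID-1]
		if !fok || !ook {
			return false // no timeline history to compare
		}
		return f.SwitchPoint != o.SwitchPoint
	default: // followed.TimelineID > ours.TimelineID
		f, ok := followed.History[ours.TimelineID]
		return ok && f.SwitchPoint < ours.XLogPos
	}
}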
// Check fetches the current cluster data and reconfigures the pollon proxy
// accordingly. Any error or missing data is handled fail-safe: the proxy is
// pointed at a nil address so clients are disconnected instead of being
// forwarded to a possibly stale master.
func (c *ClusterChecker) Check() error {
	cd, _, err := c.e.GetClusterData()
	if err != nil {
		log.Error("cannot get cluster data", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if c.stopListening {
			c.stopPollonProxy()
		}
		return nil
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))
	if cd == nil {
		log.Info("no clusterdata available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if err = cd.Cluster.Spec.Validate(); err != nil {
		log.Error("clusterdata validation failed", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}

	// Start pollon if not active
	if err = c.startPollonProxy(); err != nil {
		log.Error("failed to start proxy", zap.Error(err))
		return nil
	}

	proxy := cd.Proxy
	if proxy == nil {
		log.Info("no proxy object available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		// proxy is nil here, so there is no proxy UID/generation to publish
		return nil
	}

	db, ok := cd.DBs[proxy.Spec.MasterDBUID]
	if !ok {
		log.Info("no db object available, closing connections to previous master", zap.String("db", proxy.Spec.MasterDBUID))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
			log.Error("failed to update proxyInfo", zap.Error(err))
		}
		return nil
	}

	addr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%s", db.Status.ListenAddress, db.Status.Port))
	if err != nil {
		log.Error("cannot resolve master address", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	log.Info("master address", zap.Stringer("address", addr))
	if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
		log.Error("failed to update proxyInfo", zap.Error(err))
	}

	c.sendPollonConfData(pollon.ConfData{DestAddr: addr})
	return nil
}
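// The recurring pattern in Check above is "fail safe by default": on any
// error or missing data, publish a nil destination so the proxy closes
// client connections rather than forwarding them to a possibly stale master.
// A minimal sketch of that pattern (hypothetical confCh channel of
// *net.TCPAddr; assumes import "net"):
func publishMaster(confCh chan<- *net.TCPAddr, host, port string) {
	addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
	if err != nil {
		// fail safe: disconnect clients instead of misrouting them
		confCh <- nil
		return
	}
	confCh <- addr
}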
// postgresKeeperSM is the keeper state machine: it reconciles the local
// postgres instance (init mode, role, replication and parameters) with the
// desired state published in the cluster data.
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}

	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("failed to save keeper local state", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create the postgres parameters
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here it means that the db initialization or resync has
		// failed, so we have to clean up stale data.
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset the current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("failed to save db local state", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log the error getting the instance state but go ahead
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}
	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("failed to save db local state", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't converged yet.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from the instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't converged yet.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("failed to write recovery.conf", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from the instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't converged yet.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from the instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// save the init parameters retrieved above
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't converged yet.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("failed to save db local state", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving postgres system ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}

		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("failed to promote instance", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		if err != nil {
			log.Error("failed to retrieve replication slots", zap.Error(err))
			return
		}
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("failed to drop replication slot", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("failed to create replication slot", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// This database could be on the same branch as the currently
				// followed instance, so put it in recovery and then check
				// whether it's really on the same branch or needs a resync.
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("failed to write recovery.conf", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("failed to start postgres", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("failed to restart postgres", zap.Error(err))
						return
					}
				}

				// TODO(sgotti) pg_rewind considers databases on the same
				// timeline as in sync and doesn't check if they diverged at
				// a different position in previous timelines. So check that
				// the db has been synced, or resync again with pg_rewind
				// disabled. Will need to report this upstream.

				// Check timeline history.
				// We need to update our pgState to avoid dealing with an old
				// pgState not reflecting the real state.
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}

				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("failed to start postgres", zap.Error(err))
						return
					}
					started = true

					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("failed to start postgres", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("failed to write recovery.conf", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with the followed instance.
			// We need to update our pgState to avoid dealing with an old
			// pgState not reflecting the real state.
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}

			needsResync := false
			tryPgrewind := false
			// If the db has a different system ID then a full resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}

			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same
				// timeline as in sync and doesn't check if they diverged at
				// a different position in previous timelines. So check that
				// the db has been synced, or resync again with pg_rewind
				// disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true

				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("failed to start postgres", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams
			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("failed to get current primary conninfo", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed, reconfiguring", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("failed to write recovery.conf", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("failed to restart postgres", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here then all went well and we can update the db generation
	// and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("failed to save db local state", zap.Error(err))
		return
	}
}
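// The tail of postgresKeeperSM follows a "compute desired parameters, reload
// only on change" pattern. A minimal standalone sketch of it (hypothetical
// reload callback; a plain map comparison stands in for Parameters.Equals):
func reloadIfChanged(prev, next map[string]string, reload func() error) (bool, error) {
	changed := len(prev) != len(next)
	if !changed {
		for k, v := range next {
			if pv, ok := prev[k]; !ok || pv != v {
				changed = true
				break
			}
		}
	}
	if !changed {
		return false, nil
	}
	// parameters differ: apply them with a reload instead of a restart
	return true, reload()
}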
// Start runs the keeper main loop, driving the state machine, the pg state
// checker and the keeper info updater on their own timers.
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endUpdateKeeperInfo := make(chan struct{})

	var err error
	var cd *cluster.ClusterData
	cd, _, err = p.e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
	} else if cd != nil {
		if cd.FormatVersion != cluster.CurrentCDFormatVersion {
			log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		} else if cd.Cluster != nil {
			p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
			p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration
		}
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	// TODO(sgotti) reconfigure the various configuration options
	// (RequestTimeout) after a changed cluster config
	pgParameters := make(common.Parameters)
	pgm := postgresql.NewManager(p.pgBinPath, p.dataDir, pgParameters, p.getLocalConnParams(), p.getOurReplConnParams(), p.pgSUUsername, p.pgSUPassword, p.pgReplUsername, p.pgReplPassword, p.requestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	updateKeeperInfoTimerCh := time.NewTimer(0).C
	for {
		select {
		case <-p.stop:
			log.Debug("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return

		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()

		case <-endSMCh:
			smTimerCh = time.NewTimer(p.sleepInterval).C

		case <-updatePGStateTimerCh:
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()

		case <-endPgStatecheckerCh:
			// update the pg state at twice the rate of the sleep interval
			updatePGStateTimerCh = time.NewTimer(p.sleepInterval / 2).C

		case <-updateKeeperInfoTimerCh:
			go func() {
				if err := p.updateKeeperInfo(); err != nil {
					log.Error("failed to update keeper info", zap.Error(err))
				}
				endUpdateKeeperInfo <- struct{}{}
			}()

		case <-endUpdateKeeperInfo:
			updateKeeperInfoTimerCh = time.NewTimer(p.sleepInterval).C
		}
	}
}
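// Start drives each periodic task with a "timer fires -> run in a goroutine
// -> completion re-arms the timer" loop, so a slow iteration delays the next
// run instead of letting runs pile up. A minimal sketch of the same pattern
// (hypothetical task and stop channel; assumes import "time"):
func runPeriodic(stop <-chan struct{}, interval time.Duration, task func()) {
	timerCh := time.NewTimer(0).C // fire immediately on startup
	endCh := make(chan struct{})
	for {
		select {
		case <-stop:
			return
		case <-timerCh:
			go func() {
				task()
				endCh <- struct{}{}
			}()
		case <-endCh:
			// re-arm only after the previous run finished
			timerCh = time.NewTimer(interval).C
		}
	}
}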
// clusterSentinelCheck is the sentinel reconciliation loop body: it reads
// the cluster data, initializes it on first run, gathers keepers info and,
// if leader, computes and atomically stores the new cluster data.
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
	s.updateMutex.Lock()
	defer s.updateMutex.Unlock()
	e := s.e

	cd, prevCDPair, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}
	if cd != nil {
		if cd.FormatVersion != cluster.CurrentCDFormatVersion {
			log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
			return
		}
		if cd.Cluster != nil {
			s.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
			s.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration
		}
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		// Cluster first initialization
		if s.initialClusterSpec == nil {
			log.Info("no cluster data available, waiting for it to appear")
			return
		}
		c := cluster.NewCluster(s.UIDFn(), s.initialClusterSpec)
		log.Info("writing initial cluster data")
		newcd := cluster.NewClusterData(c)
		log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
		if _, err = e.AtomicPutClusterData(newcd, nil); err != nil {
			log.Error("error saving cluster data", zap.Error(err))
		}
		return
	}

	if err = s.setSentinelInfo(2 * s.sleepInterval); err != nil {
		log.Error("cannot update sentinel info", zap.Error(err))
		return
	}

	ctx, cancel := context.WithTimeout(pctx, s.requestTimeout)
	keepersInfo, err := s.getKeepersInfo(ctx)
	cancel()
	if err != nil {
		log.Error("cannot get keepers info", zap.Error(err))
		return
	}
	log.Debug("keepersInfo dump", zap.String("keepersInfo", spew.Sdump(keepersInfo)))

	isLeader, leadershipCount := s.leaderInfo()
	if !isLeader {
		return
	}

	// detect if this is the first check after (re)gaining leadership
	firstRun := false
	if s.lastLeadershipCount != leadershipCount {
		firstRun = true
		s.lastLeadershipCount = leadershipCount
	}

	// if this is the first check after (re)gaining leadership reset all
	// the internal timers
	if firstRun {
		s.keeperErrorTimers = make(map[string]int64)
		s.dbErrorTimers = make(map[string]int64)
		s.keeperInfoHistories = make(KeeperInfoHistories)
	}

	newcd, newKeeperInfoHistories := s.updateKeepersStatus(cd, keepersInfo, firstRun)

	newcd, err = s.updateCluster(newcd)
	if err != nil {
		log.Error("failed to update cluster data", zap.Error(err))
		return
	}
	log.Debug("newcd dump after updateCluster", zap.String("newcd", spew.Sdump(newcd)))

	if newcd != nil {
		if _, err := e.AtomicPutClusterData(newcd, prevCDPair); err != nil {
			log.Error("error saving clusterdata", zap.Error(err))
		}
	}

	// Save the new keeperInfoHistories only on successful cluster data
	// update, or in the next run we'll think that the saved keeperInfo was
	// already applied.
	s.keeperInfoHistories = newKeeperInfoHistories
}
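// The sentinel persists cluster data with an optimistic compare-and-swap:
// GetClusterData returns the store revision it read (prevCDPair) and
// AtomicPutClusterData succeeds only if the store still holds that revision,
// so a concurrent writer cannot silently overwrite newer data. A minimal
// in-memory sketch of the idea (hypothetical types, single goroutine; a real
// store serializes this server-side):
type versionedCD struct {
	rev  int64
	data string
}

func atomicPut(cur *versionedCD, expectedRev int64, data string) bool {
	if cur.rev != expectedRev {
		// someone updated the data since we read it: re-read and retry
		return false
	}
	cur.rev++
	cur.data = data
	return true
}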
// findBestStandby returns the healthy, converged standby on the master's
// timeline with the most advanced xlog position, or an error if no standby
// qualifies.
func (s *Sentinel) findBestStandby(cd *cluster.ClusterData, masterDB *cluster.DB) (*cluster.DB, error) {
	var bestDB *cluster.DB
	for _, db := range cd.DBs {
		if db.UID == masterDB.UID {
			log.Debug("ignoring db since it's the current master", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
			continue
		}
		if db.Status.SystemID != masterDB.Status.SystemID {
			log.Debug("ignoring db since its postgres systemID is different than the master one", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID), zap.String("dbSystemID", db.Status.SystemID), zap.String("masterSystemID", masterDB.Status.SystemID))
			continue
		}
		if !db.Status.Healthy {
			log.Debug("ignoring db since it's not healthy", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
			continue
		}
		if db.Status.CurrentGeneration != db.Generation {
			log.Debug("ignoring db since its generation is different than the current one", zap.String("db", db.UID), zap.Int64("currentGeneration", db.Status.CurrentGeneration), zap.Int64("generation", db.Generation))
			continue
		}
		if db.Status.TimelineID != masterDB.Status.TimelineID {
			log.Debug("ignoring db since its pg timeline is different than the master timeline", zap.String("db", db.UID), zap.Uint64("dbTimeline", db.Status.TimelineID), zap.Uint64("masterTimeline", masterDB.Status.TimelineID))
			continue
		}
		if bestDB == nil {
			bestDB = db
			continue
		}
		if db.Status.XLogPos > bestDB.Status.XLogPos {
			bestDB = db
		}
	}
	if bestDB == nil {
		return nil, fmt.Errorf("no standbys available")
	}
	return bestDB, nil
}
// findBestNewMasters returns the standbys (plus any valid previous masters)
// eligible to become the new master, sorted by xlog position.
func (s *Sentinel) findBestNewMasters(cd *cluster.ClusterData, masterDB *cluster.DB) []*cluster.DB {
	bestNewMasters := s.findBestStandbys(cd, masterDB)
	// Add the previous masters to the best standbys (if valid and in a good state)
	goodMasters, _, _ := s.validMastersByStatus(cd)
	log.Debug("goodMasters", zap.String("goodMasters", spew.Sdump(goodMasters)))
	for _, db := range goodMasters {
		if db.UID == masterDB.UID {
			log.Debug("ignoring db since it's the current master", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
			continue
		}
		if db.Status.TimelineID != masterDB.Status.TimelineID {
			log.Debug("ignoring db since its pg timeline is different than the master timeline", zap.String("db", db.UID), zap.Uint64("dbTimeline", db.Status.TimelineID), zap.Uint64("masterTimeline", masterDB.Status.TimelineID))
			continue
		}
		bestNewMasters = append(bestNewMasters, db)
	}
	// Sort by XLogPos
	sort.Sort(dbSlice(bestNewMasters))
	log.Debug("bestNewMasters", zap.String("bestNewMasters", spew.Sdump(bestNewMasters)))
	return bestNewMasters
}
// findBestStandbys returns the good standbys on the master's timeline,
// sorted by xlog position.
func (s *Sentinel) findBestStandbys(cd *cluster.ClusterData, masterDB *cluster.DB) []*cluster.DB {
	goodStandbys, _, _ := s.validStandbysByStatus(cd)
	bestDBs := []*cluster.DB{}
	for _, db := range goodStandbys {
		if db.Status.TimelineID != masterDB.Status.TimelineID {
			log.Debug("ignoring db since its pg timeline is different than the master timeline", zap.String("db", db.UID), zap.Uint64("dbTimeline", db.Status.TimelineID), zap.Uint64("masterTimeline", masterDB.Status.TimelineID))
			continue
		}
		bestDBs = append(bestDBs, db)
	}
	// Sort by XLogPos
	sort.Sort(dbSlice(bestDBs))
	return bestDBs
}
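// findBestStandbys and findBestNewMasters order candidates via dbSlice,
// defined elsewhere in the sentinel. A minimal sketch of such a
// sort.Interface implementation over toy values (hypothetical xlogDB type;
// assumes import "sort"):
type xlogDB struct {
	UID     string
	XLogPos uint64
}

type byXLogPos []xlogDB

func (s byXLogPos) Len() int           { return len(s) }
func (s byXLogPos) Less(i, j int) bool { return s[i].XLogPos < s[j].XLogPos }
func (s byXLogPos) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

// After sort.Sort(byXLogPos(dbs)) the slice is ordered by ascending xlog
// position, so the most advanced candidate is the last element.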