func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) { e := p.e pgm := p.pgm cv, _, err := e.GetClusterView() if err != nil { log.Errorf("error retrieving cluster view: %v", err) return } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) if cv == nil { log.Infof("no clusterview available, waiting for it to appear") return } followersIDs := cv.GetFollowersIDs(p.id) // Update cluster config clusterConfig := cv.Config.ToConfig() log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock p.clusterConfig = clusterConfig prevPGParameters := pgm.GetParameters() // create postgres parameteres pgParameters := p.createPGParameters(followersIDs) // update pgm postgres parameters pgm.SetParameters(pgParameters) keepersState, _, err := e.GetKeepersState() if err != nil { log.Errorf("err: %v", err) return } if keepersState == nil { keepersState = cluster.KeepersState{} } log.Debugf(spew.Sprintf("keepersState: %#v", keepersState)) keeper := keepersState[p.id] log.Debugf(spew.Sprintf("keeperState: %#v", keeper)) initialized, err := pgm.IsInitialized() if err != nil { log.Errorf("failed to detect if instance is initialized: %v", err) return } if len(cv.KeepersRole) == 0 { if !initialized { log.Infof("Initializing database") err = pgm.Init() if err != nil { log.Errorf("failed to initialized postgres instance: %v", err) return } initialized = true } } started := false if initialized { started, err = pgm.IsStarted() if err != nil { log.Errorf("failed to retrieve instance status: %v", err) } else if !started { err = pgm.Start() if err != nil { log.Errorf("failed to start postgres: %v", err) } else { started = true } } } if cv != nil { if !started && p.id == cv.Master { // If the clusterView says we are master but we cannot get // instance status or start then stop here, if we are standby then we can // recover return } } role, err := pgm.GetRole() if err != nil { log.Infof("error retrieving current pg role: %v", err) return } isMaster := false if role == common.MasterRole { log.Infof("current pg state: master") isMaster = true } else { log.Infof("current pg state: standby") } // publish ourself for discovery if err := p.publish(); err != nil { log.Errorf("failed to publish ourself to the cluster: %v", err) return } if cv == nil { return } // cv != nil masterID := cv.Master log.Debugf("masterID: %q", masterID) master := keepersState[masterID] log.Debugf(spew.Sprintf("masterState: %#v", master)) keeperRole, ok := cv.KeepersRole[p.id] if !ok { log.Infof("our keeper requested role is not available") return } if keeperRole.Follow == "" { log.Infof("our cluster requested state is master") if role != common.MasterRole { log.Infof("promoting to master") err := pgm.Promote() if err != nil { log.Errorf("err: %v", err) return } } else { log.Infof("already master") replSlots := []string{} replSlots, err = pgm.GetReplicatinSlots() if err != nil { log.Errorf("err: %v", err) return } // Create replication slots for _, slotName := range replSlots { if !util.StringInSlice(followersIDs, slotName) { log.Infof("dropping replication slot for keeper %q not marked as follower", slotName) err := pgm.DropReplicationSlot(slotName) if err != nil { log.Errorf("err: %v", err) } } } for _, followerID := range followersIDs { if followerID == p.id { continue } if !util.StringInSlice(replSlots, followerID) { err := pgm.CreateReplicationSlot(followerID) if err != nil { log.Errorf("err: %v", err) } } } } } else { log.Infof("our cluster requested state is standby following %q", keeperRole.Follow) if isMaster { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } else { log.Infof("already standby") curConnParams, err := pgm.GetPrimaryConninfo() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams)) replConnString := p.getReplConnString(master) newConnParams, err := pg.URLToConnParams(replConnString) if err != nil { log.Errorf("cannot get conn params: %v", err) return } log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams)) // Check that we can sync with master // Check timeline history // We need to update our pgState to avoid dealing with // an old pgState not reflecting the real state p.updatePGState(pctx) pgState := p.getLastPGState() if pgState == nil { log.Errorf("our pgstate is unknown: %v", err) return } mPGState := master.PGState if p.isDifferentTimelineBranch(mPGState, pgState) { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } // TODO(sgotti) Check that the master has all the needed WAL segments // Update our primary_conninfo if replConnString changed if !curConnParams.Equals(newConnParams) { log.Infof("master connection parameters changed. Reconfiguring...") log.Infof("following %s with connection url %s", keeperRole.Follow, replConnString) err = pgm.BecomeStandby(replConnString) if err != nil { log.Errorf("err: %v", err) return } err = pgm.Restart(true) if err != nil { log.Errorf("err: %v", err) return } } } } // Log synchronous replication changes prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"] syncStandbyNames := pgParameters["synchronous_standby_names"] if p.clusterConfig.SynchronousReplication { if prevSyncStandbyNames != syncStandbyNames { log.Infof("needed synchronous_standby_names changed from %q to %q", prevSyncStandbyNames, syncStandbyNames) } } else { if prevSyncStandbyNames != "" { log.Infof("sync replication disabled, removing current synchronous_standby_names %q", prevSyncStandbyNames) } } if !pgParameters.Equals(prevPGParameters) { log.Infof("postgres parameters changed, reloading postgres instance") pgm.SetParameters(pgParameters) if err := pgm.Reload(); err != nil { log.Errorf("failed to reload postgres instance: %v", err) } } else { // for tests log.Debugf("postgres parameters not changed") } if err := p.saveCVVersion(cv.Version); err != nil { log.Errorf("err: %v", err) return } }
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) { e := p.e pgm := p.pgm // Update cluster config clusterConfig, _, err := e.GetClusterConfig() if err != nil { log.Errorf("cannot get cluster config: %v", err) return } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock p.clusterConfig = clusterConfig cv, _, err := e.GetClusterView() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) membersState, _, err := e.GetMembersState() if err != nil { log.Errorf("err: %v", err) return } if membersState == nil { membersState = cluster.MembersState{} } log.Debugf(spew.Sprintf("membersState: %#v", membersState)) member := membersState[p.id] log.Debugf(spew.Sprintf("memberState: %#v", member)) initialized, err := pgm.IsInitialized() if err != nil { log.Errorf("failed to detect if instance is initialized: %v", err) return } if cv == nil { if !initialized { log.Infof("Initializing database") err = pgm.Init() if err != nil { log.Errorf("failed to initialized postgres instance: %v", err) return } initialized = true } } started := false if initialized { started, err = pgm.IsStarted() if err != nil { log.Errorf("failed to retrieve instance status: %v", err) } else if !started { err = pgm.Start() if err != nil { log.Errorf("err: %v", err) } else { started = true } } } if cv != nil { if !started && p.id == cv.Master { // If the clusterView says we are master but we cannot get // instance status or start then stop here, if we are standby then we can // recover return } } role, err := pgm.GetRole() if err != nil { log.Infof("error retrieving current pg role: %v", err) return } isMaster := false if role == common.MasterRole { log.Infof("current pg state: master") isMaster = true } else { log.Infof("current pg state: standby") } // publish ourself for discovery if err := p.publish(); err != nil { log.Errorf("failed to publish ourself to the cluster: %v", err) return } if cv == nil { return } // cv != nil masterID := cv.Master log.Debugf("masterID: %q", masterID) master := membersState[masterID] log.Debugf(spew.Sprintf("masterState: %#v", master)) followersIDs := cv.GetFollowersIDs(p.id) memberRole, ok := cv.MembersRole[p.id] if !ok { log.Infof("our member state is not available") return } if memberRole.Follow == "" { log.Infof("our cluster requested state is master") if role != common.MasterRole { log.Infof("promoting to master") err := pgm.Promote() if err != nil { log.Errorf("err: %v", err) return } } else { log.Infof("already master") replSlots := []string{} replSlots, err = pgm.GetReplicatinSlots() if err != nil { log.Errorf("err: %v", err) return } // Create replication slots for _, slotName := range replSlots { if !util.StringInSlice(followersIDs, slotName) { log.Infof("dropping replication slot for member %q not marked as follower", slotName) err := pgm.DropReplicationSlot(slotName) if err != nil { log.Errorf("err: %v", err) } } } for _, followerID := range followersIDs { if followerID == p.id { continue } if !util.StringInSlice(replSlots, followerID) { err := pgm.CreateReplicationSlot(followerID) if err != nil { log.Errorf("err: %v", err) } } } // Setup synchronous replication syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names") if p.clusterConfig.SynchronousReplication { newSyncStandbyNames := strings.Join(followersIDs, ",") if syncStandbyNames != newSyncStandbyNames { log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames) pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames) pgm.Reload() } } else { if syncStandbyNames != "" { log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames) pgm.SetServerParameter("synchronous_standby_names", "") pgm.Reload() } } } } else { log.Infof("our cluster requested state is standby following %q", memberRole.Follow) if isMaster { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } else { log.Infof("already standby") curConnParams, err := pgm.GetPrimaryConninfo() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams)) replConnString := p.getReplConnString(master) newConnParams, err := pg.URLToConnParams(replConnString) if err != nil { log.Errorf("cannot get conn params: %v", err) return } log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams)) // Check that we can sync with master // Check timeline history ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout) pgState, err := pg.GetPGState(ctx, p.getOurReplConnString()) cancel() if err != nil { log.Errorf("cannot get our pgstate: %v", err) return } mPGState := master.PGState if p.isDifferentTimelineBranch(mPGState, pgState) { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } // TODO(sgotti) Check that the master has all the needed WAL segments // Update our primary_conninfo if replConnString changed if !curConnParams.Equals(newConnParams) { log.Infof("master connection parameters changed. Reconfiguring...") log.Infof("following %s with connection url %s", memberRole.Follow, replConnString) err = pgm.BecomeStandby(replConnString) if err != nil { log.Errorf("err: %v", err) return } err = pgm.Restart(true) if err != nil { log.Errorf("err: %v", err) return } } } } if err := p.saveCVVersion(cv.Version); err != nil { log.Errorf("err: %v", err) return } }