// updatePGState refreshes p.lastPGState with the current state of the local
// PostgreSQL instance. On any error the cached state is set to nil so callers
// can distinguish "state unknown" from a valid (possibly uninitialized) state.
// p.pgStateMutex is held for the whole function, i.e. across the
// timeout-bounded queries below.
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	pgState := &cluster.PostgresState{}
	// The deferred closure captures the pgState variable itself, so every
	// later reassignment (including pgState = nil on the error paths) is
	// exactly what ends up cached in p.lastPGState.
	defer func() {
		p.lastPGState = pgState
		p.pgStateMutex.Unlock()
	}()

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		// Unknown state: publish nil rather than a misleading zero value.
		pgState = nil
		return
	}

	if !initialized {
		pgState.Initialized = false
	} else {
		// Inner err deliberately shadows the outer one: pgState must be
		// assigned with "=" (not ":=") so the deferred store sees it.
		var err error
		ctx, cancel := context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			pgState = nil
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			// Fresh timeout for the second query against the repl connection.
			ctx, cancel = context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				pgState = nil
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
}
func (p *PostgresKeeper) pgStateHandler(w http.ResponseWriter, req *http.Request) { pgState := &cluster.PostgresState{} p.cvMutex.Lock() defer p.cvMutex.Unlock() initialized, err := p.pgm.IsInitialized() if err != nil { w.WriteHeader(http.StatusInternalServerError) return } if !initialized { pgState.Initialized = false } else { var err error ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout) pgState, err = pg.GetPGState(ctx, p.getOurReplConnString()) cancel() if err != nil { log.Errorf("error getting pg state: %v", err) w.WriteHeader(http.StatusInternalServerError) return } pgState.Initialized = true // if timeline <= 1 then no timeline history file exists. pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{} if pgState.TimelineID > 1 { ctx, cancel = context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout) tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString()) cancel() if err != nil { log.Errorf("error getting timeline history: %v", err) w.WriteHeader(http.StatusInternalServerError) return } pgState.TimelinesHistory = tlsh } } if err := json.NewEncoder(w).Encode(&pgState); err != nil { w.WriteHeader(http.StatusInternalServerError) } }
// postgresKeeperSM runs one pass of the keeper state machine: it refreshes
// the cluster configuration and cluster view from the store, makes sure the
// local PostgreSQL instance is initialized and started when appropriate,
// publishes this keeper for discovery, and then converges the instance
// toward the role (master or standby) requested by the current cluster view.
// Every failure logs and returns; the next pass retries from scratch.
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		// Normalize to an empty map so lookups below are safe.
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))

	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	// With no cluster view yet, bootstrap a fresh data directory.
	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// Status check failed; fall through — "started" stays false.
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are master but we cannot get
			// instance status or start then stop here, if we are standby then we can
			// recover
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}

	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	// No cluster view yet: nothing to converge toward on this pass.
	if cv == nil {
		return
	}

	// cv != nil

	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	followersIDs := cv.GetFollowersIDs(p.id)

	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	// An empty Follow means the cluster view wants this keeper as master.
	if memberRole.Follow == "" {
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			// NOTE(review): GetReplicatinSlots has a typo but is declared
			// elsewhere in the project; renaming needs a project-wide change.
			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Create replication slots
			// First drop slots for members no longer listed as followers...
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						// Best effort: a failed drop is logged, not fatal.
						log.Errorf("err: %v", err)
					}
				}
			}

			// ...then create any missing slot for each follower.
			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						// Best effort: a failed create is logged, not fatal.
						log.Errorf("err: %v", err)
					}
				}
			}

			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			// Currently master but asked to be standby: requires a full
			// resync from the new master.
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			// A diverged timeline cannot be followed: full resync instead.
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				// Restart so the new primary_conninfo takes effect.
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	// Record the cluster view version we converged to.
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}