Ejemplo n.º 1
0
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	pgState := &cluster.PostgresState{}

	defer func() {
		p.lastPGState = pgState
		p.pgStateMutex.Unlock()
	}()

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		pgState = nil
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			pgState = nil
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				pgState = nil
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
}
Ejemplo n.º 2
0
func (p *PostgresKeeper) pgStateHandler(w http.ResponseWriter, req *http.Request) {
	pgState := &cluster.PostgresState{}
	p.cvMutex.Lock()
	defer p.cvMutex.Unlock()

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
	if err := json.NewEncoder(w).Encode(&pgState); err != nil {
		w.WriteHeader(http.StatusInternalServerError)
	}
}
Ejemplo n.º 3
0
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))

	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	started := false

	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are master but we cannot get
			// instance status or start then stop here, if we are standby then we can
			// recover
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}
	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	if cv == nil {
		return
	}

	// cv != nil

	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	followersIDs := cv.GetFollowersIDs(p.id)

	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	if memberRole.Follow == "" {
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Create replication slots
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}