func NewTestKeeperWithID(t *testing.T, dir, id, clusterName, pgSUUsername, pgSUPassword, pgReplUsername, pgReplPassword string, storeBackend store.Backend, storeEndpoints string, a ...string) (*TestKeeper, error) {
	args := []string{}

	dataDir := filepath.Join(dir, fmt.Sprintf("st%s", id))

	// Hack to find a free tcp port
	ln, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		return nil, err
	}
	defer ln.Close()

	pgListenAddress := ln.Addr().(*net.TCPAddr).IP.String()
	pgPort := strconv.Itoa(ln.Addr().(*net.TCPAddr).Port)

	args = append(args, fmt.Sprintf("--id=%s", id))
	args = append(args, fmt.Sprintf("--cluster-name=%s", clusterName))
	args = append(args, fmt.Sprintf("--pg-port=%s", pgPort))
	args = append(args, fmt.Sprintf("--data-dir=%s", dataDir))
	args = append(args, fmt.Sprintf("--store-backend=%s", storeBackend))
	args = append(args, fmt.Sprintf("--store-endpoints=%s", storeEndpoints))
	args = append(args, fmt.Sprintf("--pg-su-username=%s", pgSUUsername))
	if pgSUPassword != "" {
		args = append(args, fmt.Sprintf("--pg-su-password=%s", pgSUPassword))
	}
	args = append(args, fmt.Sprintf("--pg-repl-username=%s", pgReplUsername))
	args = append(args, fmt.Sprintf("--pg-repl-password=%s", pgReplPassword))
	if os.Getenv("DEBUG") != "" {
		args = append(args, "--debug")
	}
	args = append(args, a...)

	connParams := pg.ConnParams{
		"user":     pgSUUsername,
		"password": pgSUPassword,
		"host":     pgListenAddress,
		"port":     pgPort,
		"dbname":   "postgres",
		"sslmode":  "disable",
	}

	connString := connParams.ConnString()
	db, err := sql.Open("postgres", connString)
	if err != nil {
		return nil, err
	}

	bin := os.Getenv("STKEEPER_BIN")
	if bin == "" {
		return nil, fmt.Errorf("missing STKEEPER_BIN env")
	}
	tk := &TestKeeper{
		t: t,
		Process: Process{
			t:    t,
			id:   id,
			name: "keeper",
			bin:  bin,
			args: args,
		},
		dataDir:         dataDir,
		pgListenAddress: pgListenAddress,
		pgPort:          pgPort,
		pgSUUsername:    pgSUUsername,
		pgSUPassword:    pgSUPassword,
		pgReplUsername:  pgReplUsername,
		pgReplPassword:  pgReplPassword,
		db:              db,
	}
	return tk, nil
}
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}

	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}
	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create postgres parameters
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here this means that the db initialization or
		// resync has failed, so we have to clean up stale data
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log error getting instance state but go ahead.
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}

	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to get postgres parameters from instance config file", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving systemd ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}
		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		if err != nil {
			log.Error("err", zap.Error(err))
			return
		}
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// There can be the possibility that this
				// database is on the same branch as the
				// current followed instance.
				// So we try to put it in recovery and then
				// check if it's on the same branch or force a
				// resync
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
				}
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at a different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				// Check timeline history
				// We need to update our pgState to avoid dealing with
				// an old pgState not reflecting the real state
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true

					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("err", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with followed instance

			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}
			needsResync := false
			tryPgrewind := false
			// If the db has a different systemID then a resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}

			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at a different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true

				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams
			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed. Reconfiguring.", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here, then all went well and we can update the db generation and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("err", zap.Error(err))
		return
	}
}
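// Illustrative sketch, not part of the original source: postgresKeeperSM is a
// single reconciliation pass, so the keeper has to invoke it repeatedly. The
// loop below shows one plausible way to drive it using the sleepInterval read
// from the cluster spec above; the method name syncLoopSketch and the use of
// time.After are assumptions for the example (assumes "time" is imported).
func (p *PostgresKeeper) syncLoopSketch(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			// Stop reconciling when the keeper is asked to shut down.
			return
		case <-time.After(p.sleepInterval):
			// Re-read cluster data and converge the local instance.
			p.postgresKeeperSM(ctx)
		}
	}
}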
func NewTestKeeperWithID(t *testing.T, dir, uid, clusterName, pgSUUsername, pgSUPassword, pgReplUsername, pgReplPassword string, storeBackend store.Backend, storeEndpoints string, a ...string) (*TestKeeper, error) {
	args := []string{}

	dataDir := filepath.Join(dir, fmt.Sprintf("st%s", uid))

	pgListenAddress, pgPort, err := getFreePort(true, false)
	if err != nil {
		return nil, err
	}

	args = append(args, fmt.Sprintf("--uid=%s", uid))
	args = append(args, fmt.Sprintf("--cluster-name=%s", clusterName))
	args = append(args, fmt.Sprintf("--pg-listen-address=%s", pgListenAddress))
	args = append(args, fmt.Sprintf("--pg-port=%s", pgPort))
	args = append(args, fmt.Sprintf("--data-dir=%s", dataDir))
	args = append(args, fmt.Sprintf("--store-backend=%s", storeBackend))
	args = append(args, fmt.Sprintf("--store-endpoints=%s", storeEndpoints))
	args = append(args, fmt.Sprintf("--pg-su-username=%s", pgSUUsername))
	if pgSUPassword != "" {
		args = append(args, fmt.Sprintf("--pg-su-password=%s", pgSUPassword))
	}
	args = append(args, fmt.Sprintf("--pg-repl-username=%s", pgReplUsername))
	args = append(args, fmt.Sprintf("--pg-repl-password=%s", pgReplPassword))
	if os.Getenv("DEBUG") != "" {
		args = append(args, "--debug")
	}
	args = append(args, a...)

	connParams := pg.ConnParams{
		"user":     pgSUUsername,
		"password": pgSUPassword,
		"host":     pgListenAddress,
		"port":     pgPort,
		"dbname":   "postgres",
		"sslmode":  "disable",
	}

	connString := connParams.ConnString()
	db, err := sql.Open("postgres", connString)
	if err != nil {
		return nil, err
	}

	bin := os.Getenv("STKEEPER_BIN")
	if bin == "" {
		return nil, fmt.Errorf("missing STKEEPER_BIN env")
	}
	tk := &TestKeeper{
		t: t,
		Process: Process{
			t:    t,
			uid:  uid,
			name: "keeper",
			bin:  bin,
			args: args,
		},
		dataDir:         dataDir,
		pgListenAddress: pgListenAddress,
		pgPort:          pgPort,
		pgSUUsername:    pgSUUsername,
		pgSUPassword:    pgSUPassword,
		pgReplUsername:  pgReplUsername,
		pgReplPassword:  pgReplPassword,
		db:              db,
	}
	return tk, nil
}
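// Illustrative usage sketch, not part of the original source: how a test
// could build a keeper with the helper above. The uid, cluster name,
// credentials, store backend value and endpoint are placeholder assumptions
// for the example; real tests take them from their own setup. Assumes
// "io/ioutil", "os" and "testing" are imported and that store.Backend has a
// string underlying type.
func TestNewTestKeeperWithIDUsageSketch(t *testing.T) {
	dir, err := ioutil.TempDir("", "stolon")
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	defer os.RemoveAll(dir)

	tk, err := NewTestKeeperWithID(t, dir, "01", "stolon-test-cluster",
		"stolon_su", "supassword", "stolon_repl", "replpassword",
		store.Backend("etcd"), "http://127.0.0.1:2379")
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	// The returned TestKeeper carries the generated command line args and an
	// *sql.DB pointed at the superuser "postgres" database; starting and
	// stopping the keeper process goes through the embedded Process.
	_ = tk
}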