func (s *Server) serveH2r(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("REVERSE failed", zap.Object("err", err))
		} else {
			w.WriteHeader(http.StatusOK)
		}
	}()
	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dial failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()

	go io.Copy(remote, r.Body)
	// go io.Copy(remote, io.TeeReader(r.Body, os.Stdout))

	resr := io.TeeReader(remote, w)
	// resr = io.TeeReader(resr, os.Stdout)
	res, err := http.ReadResponse(bufio.NewReader(resr), nil)
	if err != nil {
		return
	}
	if res.Body != nil {
		defer res.Body.Close()
		io.Copy(ioutil.Discard, res.Body)
	}
}
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := followedConnParams.Get("host")
	port := followedConnParams.Get("port")
	user := followedConnParams.Get("user")
	pgpass.WriteString(fmt.Sprintf("%s:%s:*:%s:%s\n", host, port, user, password))

	// Disable synchronous commits. pg_rewind needs to create a
	// temporary table on the master, but if synchronous replication is
	// enabled and there are no active standbys it will hang.
	followedConnParams.Set("options", "-c synchronous_commit=off")
	followedConnString := followedConnParams.ConnString()

	log.Info("running pg_rewind")
	name := filepath.Join(p.pgBinPath, "pg_rewind")
	cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	log.Debug("cmd out", zap.String("out", string(out)))
	return nil
}
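For reference, the line written to the temporary pgpass file above follows the standard PostgreSQL .pgpass layout, hostname:port:database:username:password, with "*" acting as a wildcard for the database field; pg_rewind and pg_basebackup pick the file up through the PGPASSFILE environment variable set on the command. A minimal sketch of the same formatting as a standalone helper (writePgpassLine is a hypothetical name, not part of this codebase):

// writePgpassLine writes a single .pgpass entry for the followed instance.
// The "*" in the third field matches any database name.
func writePgpassLine(w io.Writer, host, port, user, password string) error {
	_, err := fmt.Fprintf(w, "%s:%s:*:%s:%s\n", host, port, user, password)
	return err
}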
func (p *Manager) SyncFromFollowed(followedConnParams ConnParams) error {
	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := followedConnParams.Get("host")
	port := followedConnParams.Get("port")
	user := followedConnParams.Get("user")
	password := followedConnParams.Get("password")
	pgpass.WriteString(fmt.Sprintf("%s:%s:*:%s:%s\n", host, port, user, password))

	log.Info("running pg_basebackup")
	name := filepath.Join(p.pgBinPath, "pg_basebackup")
	cmd := exec.Command(name, "-R", "-D", p.dataDir, "--host="+host, "--port="+port, "-U", user)
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	return nil
}
func newBot() (b MMJira) {
	b = MMJira{l: zap.NewJSON(zap.DebugLevel), reg: metrics.NewRegistry()}
	data, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		b.l.Panic("not able to read the file", zap.Error(err))
	}
	var config InstanceConfig
	if err = yaml.Unmarshal(data, &config); err != nil {
		b.l.Panic("not able to unmarshal the file", zap.Error(err))
	}
	b.c = &config
	if !b.c.Debug {
		b.l.SetLevel(zap.ErrorLevel)
	}
	mmpost, err := mmcontroller.NewController(b.c.MMicon, b.c.MMuser, b.c.Hooks, b.c.Debug, metrics.NewPrefixedChildRegistry(b.reg, "mmc."))
	if err != nil {
		panic(err)
	}
	b.m = mmpost
	b.l.Debug("outputting config", zap.Object("config", b.c))
	b.r = mux.NewRouter()
	b.r.HandleFunc("/", b.homeHandler)
	b.r.HandleFunc("/hooks/", b.getHandler).Methods("GET")
	b.r.HandleFunc("/hooks/{hookid}", b.postHandler).Methods("POST")
	b.r.Handle("/metrics", exp.ExpHandler(b.reg))
	b.r.HandleFunc("/config/", b.configGetHandler).Methods("GET")
	return b
}
func (conn *Connection) reader(responses chan<- Response, logger zap.Logger) {
	buffer := make([]byte, 6)
	for {
		n, err := conn.conn.Read(buffer)
		if err != nil && n < 6 {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Int("n", n),
				zap.Error(err),
			)
			conn.shouldReconnect <- true
			return
		} else if err != nil {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Error(err),
			)
		}

		command := uint8(buffer[0])
		if command != 8 {
			logger.Info("APNS: Something went wrong in a connection - Command should have been 8 but it had another value instead",
				zap.Int("connectionId", conn.id),
				zap.Object("commandValue", command),
			)
		}
		resp := newResponse()
		resp.Identifier = binary.BigEndian.Uint32(buffer[2:6])
		resp.Status = uint8(buffer[1])
		responses <- resp
		conn.shouldReconnect <- true
		return
	}
}
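The 6-byte buffer read above is the APNs binary-protocol error-response frame: a 1-byte command (always 8 for an error response), a 1-byte status code, and a 4-byte big-endian notification identifier. A hedged sketch of the same decoding as a standalone helper, with an illustrative errorResponse type that is not part of this codebase:

// errorResponse mirrors the APNs error-response frame layout.
type errorResponse struct {
	Command    uint8  // always 8 for an error response
	Status     uint8  // APNs status code (0 = no error)
	Identifier uint32 // identifier of the notification that failed
}

// decodeErrorResponse decodes a 6-byte APNs error-response frame.
func decodeErrorResponse(buf []byte) (errorResponse, error) {
	if len(buf) < 6 {
		return errorResponse{}, fmt.Errorf("short frame: %d bytes", len(buf))
	}
	return errorResponse{
		Command:    buf[0],
		Status:     buf[1],
		Identifier: binary.BigEndian.Uint32(buf[2:6]),
	}, nil
}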
func (s *Server) serveH2c(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("CONNECT failed", zap.Object("err", err))
		}
	}()
	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dial failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()

	fw := &flushWriter{w}
	fw.FlushHeader(http.StatusOK)

	go io.Copy(remote, r.Body)

	srcRemote := &TryReader{
		c:        remote,
		ignore:   3,
		maxRetry: 2,
		tryDur:   time.Millisecond * 600,
		timeout:  time.Second * 15,
	}
	io.Copy(fw, srcRemote)
}
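serveH2c and serveH2r rely on a flushWriter helper that is not shown here. A minimal sketch of what such a wrapper might look like, assuming it only needs to send the status code and flush each written chunk so tunnelled bytes are not buffered; the actual type in this codebase may differ:

// flushWriter is a sketch of a writer that forwards to the ResponseWriter
// and flushes immediately after every write.
type flushWriter struct {
	w http.ResponseWriter
}

// FlushHeader sends the status code to the client right away.
func (fw *flushWriter) FlushHeader(code int) {
	fw.w.WriteHeader(code)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush()
	}
}

// Write forwards the data and flushes so each chunk reaches the peer immediately.
func (fw *flushWriter) Write(p []byte) (n int, err error) {
	n, err = fw.w.Write(p)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush()
	}
	return n, err
}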
// handleInbox handles incoming chat messages
func (b *satpamBot) handleInbox() {
	for {
		select {
		case rawMsg := <-b.in:
			if rawMsg == nil {
				log.Fatal("handleInbox input channel is closed")
			}
			switch msg := rawMsg.(type) {
			case *bot.Message:
				if msg.Date.Before(startedAt) {
					// ignore messages received before the process started
					log.Debug("message before started at",
						zap.Object("msg", msg),
						zap.String("startedAt", startedAt.String()),
						zap.String("date", msg.Date.String()))
					continue
				}
				log.Debug("handleInbox got message", zap.Object("msg", msg))
				if msg.From.ID != adminID {
					continue
				}
				msgType := msg.Chat.Type
				if msgType == bot.Private {
					log.Debug("Got private message", zap.Object("msg", msg))
					if msg.From.ID == adminID {
						// TODO
					}
					continue
				}
				// ## Handle Commands ##
				switch msg.Text {
				case "/leave", "/leave@" + botName:
					if b.cmdLeave(msg) {
						continue
					}
				}
			}
		}
	}
}
func (p *Manager) start(args ...string) error {
	log.Info("starting database")
	name := filepath.Join(p.pgBinPath, "pg_ctl")
	args = append([]string{"start", "-w", "-D", p.dataDir, "-o", "-c unix_socket_directories=/tmp"}, args...)
	cmd := exec.Command(name, args...)
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	// TODO(sgotti) attaching a pipe to stdout/stderr makes the postgres
	// process executed by pg_ctl inherit its file descriptors. So
	// cmd.Wait() would block waiting on them to be closed (this happens
	// only when postgres is stopped), and this function would never return.
	// To avoid this, no output is captured. If needed, find a way to get
	// the output without blocking.
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("error: %v", err)
	}
	return nil
}
func (s *Server) serveWs(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("serveWs error", zap.Object("err", err))
		}
	}()
	Log.Debug("websocket start")
	ws, err := s.upgrader.Upgrade(w, r, nil)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		Log.Error("websocket failed", zap.Error(err))
		return
	}
	Log.Debug("websocket ok")
	s.globalWsChan <- NewWs(ws, s.WsBufSize)
}
// Analyse the response from mm
func (c *Controller) Analyse(in <-chan Response) {
	count := metrics.GetOrRegisterCounter("analyse.response.total", c.reg)
	count.Inc(1)
	response := <-in
	if response.StatusCode != 200 {
		n := "analyse.response." + response.Project + ".error"
		count := metrics.GetOrRegisterCounter(n, c.reg)
		count.Inc(1)
	} else {
		n := "analyse.response." + response.Project + ".ok"
		count := metrics.GetOrRegisterCounter(n, c.reg)
		count.Inc(1)
	}
	c.l.Info("response received", zap.Object("response", response))
}
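A possible way to drive Analyse in isolation, assuming only that Response carries the StatusCode and Project fields used above; the channel wiring mirrors how postHandler passes the result of Inform to Analyse. This is a usage sketch, not part of the codebase:

// reportResponse is a hypothetical helper that feeds a single response to Analyse.
func reportResponse(c *Controller, resp Response) {
	ch := make(chan Response, 1)
	ch <- resp
	close(ch)
	// Analyse reads exactly one response and updates the per-project counters.
	go c.Analyse(ch)
}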
func (p *Manager) SyncFromFollowed(followedConnParams ConnParams) error {
	fcp := followedConnParams.Copy()

	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := fcp.Get("host")
	port := fcp.Get("port")
	user := fcp.Get("user")
	password := fcp.Get("password")
	pgpass.WriteString(fmt.Sprintf("%s:%s:*:%s:%s\n", host, port, user, password))

	// Remove the password from the params passed to pg_basebackup
	fcp.Del("password")

	// Disable synchronous commits. pg_basebackup calls
	// pg_start_backup()/pg_stop_backup() on the master, but if synchronous
	// replication is enabled and there are no active standbys they will hang.
	fcp.Set("options", "-c synchronous_commit=off")
	followedConnString := fcp.ConnString()

	log.Info("running pg_basebackup")
	name := filepath.Join(p.pgBinPath, "pg_basebackup")
	cmd := exec.Command(name, "-R", "-D", p.dataDir, "-d", followedConnString)
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	return nil
}
// postHandler looks up the hook assigned to a project, creates the issue from
// the incoming payload and forwards it to Mattermost; it returns an HTTP error
// in any other case.
func (b MMJira) postHandler(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	hookid := strings.ToLower(vars["hookid"])
	b.l.Info("project", zap.String("hook", hookid))
	if b.c.Hooks[hookid] == "" {
		c := metrics.GetOrRegisterCounter("hooks.post.unknown.project", b.reg)
		c.Inc(1)
		http.Error(w, "unknown project", http.StatusBadRequest)
		return
	}
	b.l.Debug("received a request")
	c := metrics.GetOrRegisterCounter("hooks.received."+hookid, b.reg)
	c.Inc(1)
	if b.c.Debug {
		if err := utils.DumpRequest(r, b.c.DumpDir); err != nil {
			b.l.Info("unable to dump the request in the directory", zap.String("Directory", b.c.DumpDir))
		}
	}
	issue, err := b.m.Create(r.Body)
	if err != nil {
		http.Error(w, fmt.Sprint(err), http.StatusBadRequest)
		return
	}
	b.l.Debug("sending", zap.Object("issue", issue))
	ch := b.m.Inform(issue)
	go b.m.Analyse(ch)
}
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}

	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}
	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create postgres parameters
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here it means that the db initialization or
		// resync has failed, so we have to clean up stale data
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log error getting instance state but go ahead.
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}

	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving systemd ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}
		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		if err != nil {
			log.Error("err", zap.Error(err))
			return
		}
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// There can be the possibility that this
				// database is on the same branch of the
				// current followed instance.
				// So we try to put it in recovery and then
				// check if it's on the same branch or force a
				// resync
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
				}

				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.

				// Check timeline history
				// We need to update our pgState to avoid dealing with
				// an old pgState not reflecting the real state
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("err", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with the followed instance

			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}
			needsResync := false
			tryPgrewind := false
			// If the db has a different systemdID then a resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}

			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams
			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed. Reconfiguring.", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here, then all went well and we can update the db generation and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("err", zap.Error(err))
		return
	}
}
func (p *PostgresKeeper) GetPGState(pctx context.Context) (*cluster.PostgresState, error) {
	p.getPGStateMutex.Lock()
	defer p.getPGStateMutex.Unlock()
	// Just get one pgstate at a time to avoid exhausting available connections
	pgState := &cluster.PostgresState{}

	p.localStateMutex.Lock()
	pgState.UID = p.dbLocalState.UID
	pgState.Generation = p.dbLocalState.Generation
	p.localStateMutex.Unlock()

	pgState.ListenAddress = p.pgListenAddress
	pgState.Port = p.pgPort

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		return nil, err
	}
	if initialized {
		pgParameters, err := p.pgm.GetConfigFilePGParameters()
		if err != nil {
			log.Error("cannot get configured pg parameters", zap.Error(err))
			return pgState, nil
		}
		log.Debug("got configured pg parameters", zap.Object("pgParameters", pgParameters))
		filteredPGParameters := common.Parameters{}
		for k, v := range pgParameters {
			if !util.StringInSlice(managedPGParameters, k) {
				filteredPGParameters[k] = v
			}
		}
		log.Debug("filtered out managed pg parameters", zap.Object("filteredPGParameters", filteredPGParameters))
		pgState.PGParameters = filteredPGParameters

		sd, err := p.pgm.GetSystemData()
		if err != nil {
			log.Error("error getting pg state", zap.Error(err))
			return pgState, nil
		}
		pgState.SystemID = sd.SystemID
		pgState.TimelineID = sd.TimelineID
		pgState.XLogPos = sd.XLogPos

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimelinesHistory{}
		if pgState.TimelineID > 1 {
			tlsh, err := p.pgm.GetTimelinesHistory(pgState.TimelineID)
			if err != nil {
				log.Error("error getting timeline history", zap.Error(err))
				return pgState, nil
			}
			ctlsh := cluster.PostgresTimelinesHistory{}
			for _, tlh := range tlsh {
				ctlh := &cluster.PostgresTimelineHistory{
					TimelineID:  tlh.TimelineID,
					SwitchPoint: tlh.SwitchPoint,
					Reason:      tlh.Reason,
				}
				ctlsh = append(ctlsh, ctlh)
			}
			pgState.TimelinesHistory = ctlsh
		}

		pgState.Healthy = true
	}

	return pgState, nil
}
func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo cluster.KeepersInfo, firstRun bool) (*cluster.ClusterData, KeeperInfoHistories) {
	// Create a copy of cd
	cd = cd.DeepCopy()

	kihs := s.keeperInfoHistories.DeepCopy()

	// Remove keepers with wrong cluster UID
	tmpKeepersInfo := keepersInfo.DeepCopy()
	for _, ki := range keepersInfo {
		if ki.ClusterUID != cd.Cluster.UID {
			delete(tmpKeepersInfo, ki.UID)
		}
	}
	keepersInfo = tmpKeepersInfo

	// On first run just insert keepers info in the history with Seen set
	// to false and don't do any change to the keepers' state
	if firstRun {
		for keeperUID, ki := range keepersInfo {
			kihs[keeperUID] = &KeeperInfoHistory{KeeperInfo: ki, Seen: false}
		}
		return cd, kihs
	}

	tmpKeepersInfo = keepersInfo.DeepCopy()
	// keep only updated keepers info
	for keeperUID, ki := range keepersInfo {
		if kih, ok := kihs[keeperUID]; ok {
			log.Debug("kih", zap.Object("kih", kih))
			if kih.KeeperInfo.InfoUID == ki.InfoUID {
				if !kih.Seen {
					// Remove since it was already there and wasn't updated
					delete(tmpKeepersInfo, ki.UID)
				} else if kih.Seen && timer.Since(kih.Timer) > s.sleepInterval {
					// Remove since it wasn't updated
					delete(tmpKeepersInfo, ki.UID)
				}
			}
			if kih.KeeperInfo.InfoUID != ki.InfoUID {
				kihs[keeperUID] = &KeeperInfoHistory{KeeperInfo: ki, Seen: true, Timer: timer.Now()}
			}
		} else {
			kihs[keeperUID] = &KeeperInfoHistory{KeeperInfo: ki, Seen: true, Timer: timer.Now()}
		}
	}
	keepersInfo = tmpKeepersInfo

	// Create new keepers from keepersInfo
	for keeperUID, ki := range keepersInfo {
		if _, ok := cd.Keepers[keeperUID]; !ok {
			k := cluster.NewKeeperFromKeeperInfo(ki)
			cd.Keepers[k.UID] = k
		}
	}

	// Mark keepers without a keeperInfo (cleaned up above from not updated
	// ones) as in error
	for keeperUID := range cd.Keepers {
		if _, ok := keepersInfo[keeperUID]; !ok {
			s.SetKeeperError(keeperUID)
		} else {
			s.CleanKeeperError(keeperUID)
		}
	}

	// Update keepers' healthy states
	for _, k := range cd.Keepers {
		k.Status.Healthy = s.isKeeperHealthy(cd, k)
	}

	// Update dbs' states
	for _, db := range cd.DBs {
		// Mark DBs without a reported db state as in error
		k, ok := keepersInfo[db.Spec.KeeperUID]
		if !ok {
			log.Error("no keeper info available", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
			s.SetDBError(db.UID)
			continue
		}
		dbs := k.PostgresState
		if dbs == nil {
			log.Error("no db state available", zap.String("db", db.UID))
			s.SetDBError(db.UID)
			continue
		}
		if dbs.UID != db.UID {
			log.Warn("received db state for unexpected db uid", zap.String("receivedDB", dbs.UID), zap.String("db", db.UID))
			s.SetDBError(db.UID)
			continue
		}
		log.Debug("received db state", zap.String("db", db.UID))
		db.Status.ListenAddress = dbs.ListenAddress
		db.Status.Port = dbs.Port
		db.Status.CurrentGeneration = dbs.Generation
		if dbs.Healthy {
			s.CleanDBError(db.UID)
			db.Status.SystemID = dbs.SystemID
			db.Status.TimelineID = dbs.TimelineID
			db.Status.XLogPos = dbs.XLogPos
			db.Status.TimelinesHistory = dbs.TimelinesHistory
			db.Status.PGParameters = cluster.PGParameters(dbs.PGParameters)
		} else {
			s.SetDBError(db.UID)
		}
	}

	// Update dbs' healthy state
	for _, db := range cd.DBs {
		db.Status.Healthy = s.isDBHealthy(cd, db)
	}

	return cd, kihs
}
func (s *Sentinel) updateCluster(cd *cluster.ClusterData) (*cluster.ClusterData, error) {
	newcd := cd.DeepCopy()
	switch cd.Cluster.Status.Phase {
	case cluster.ClusterPhaseInitializing:
		switch *cd.Cluster.DefSpec().InitMode {
		case cluster.ClusterInitModeNew:
			// Is there already a keeper chosen to be the new master?
			if cd.Cluster.Status.Master == "" {
				log.Info("trying to find initial master")
				k, err := s.findInitialKeeper(cd)
				if err != nil {
					return nil, fmt.Errorf("cannot choose initial master: %v", err)
				}
				log.Info("initializing cluster", zap.String("keeper", k.UID))
				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModeNew,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				db, ok := cd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exist. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen to be the master has correctly initialized
				switch s.dbConvergenceState(db, cd.Cluster.DefSpec().InitTimeout.Duration) {
				case Converged:
					if db.Status.Healthy {
						log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
						// Set db initMode to none, not needed but just a security measure
						db.Spec.InitMode = cluster.DBInitModeNone
						// Don't include previous config anymore
						db.Spec.IncludeConfig = false
						// Replace reported pg parameters in cluster spec
						if *cd.Cluster.DefSpec().MergePgParameters {
							newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
						}
						// Cluster initialized, switch to Normal state
						newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
					}
				case Converging:
					log.Info("waiting for db", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
				case ConvergenceFailed:
					log.Info("db failed to initialize", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Empty DBs
					newcd.DBs = cluster.DBs{}
					// Unset master so another keeper can be chosen
					newcd.Cluster.Status.Master = ""
				}
			}
		case cluster.ClusterInitModeExisting:
			if cd.Cluster.Status.Master == "" {
				wantedKeeper := cd.Cluster.DefSpec().ExistingConfig.KeeperUID
				log.Info("trying to use keeper as initial master", zap.String("keeper", wantedKeeper))
				k, ok := cd.Keepers[wantedKeeper]
				if !ok {
					return nil, fmt.Errorf("keeper %q state not available", wantedKeeper)
				}
				log.Info("initializing cluster using selected keeper as master db owner", zap.String("keeper", k.UID))
				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModeExisting,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				db, ok := newcd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exist. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen to be the master has correctly initialized
				if db.Status.Healthy && s.dbConvergenceState(db, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == Converged {
					log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Don't include previous config anymore
					db.Spec.IncludeConfig = false
					// Replace reported pg parameters in cluster spec
					if *cd.Cluster.DefSpec().MergePgParameters {
						newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
					}
					// Cluster initialized, switch to Normal state
					newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
				}
			}
		case cluster.ClusterInitModePITR:
			// Is there already a keeper chosen to be the new master?
			if cd.Cluster.Status.Master == "" {
				log.Info("trying to find initial master")
				k, err := s.findInitialKeeper(cd)
				if err != nil {
					return nil, fmt.Errorf("cannot choose initial master: %v", err)
				}
				log.Info("initializing cluster using selected keeper as master db owner", zap.String("keeper", k.UID))
				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModePITR,
						PITRConfig:    cd.Cluster.DefSpec().PITRConfig,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				db, ok := cd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exist. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen to be the master has correctly initialized
				// TODO(sgotti) set a timeout (the max time for a restore operation)
				switch s.dbConvergenceState(db, 0) {
				case Converged:
					if db.Status.Healthy {
						log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
						// Set db initMode to none, not needed but just a security measure
						db.Spec.InitMode = cluster.DBInitModeNone
						// Don't include previous config anymore
						db.Spec.IncludeConfig = false
						// Replace reported pg parameters in cluster spec
						if *cd.Cluster.DefSpec().MergePgParameters {
							newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
						}
						// Cluster initialized, switch to Normal state
						newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
					}
				case Converging:
					log.Info("waiting for db to converge", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
				case ConvergenceFailed:
					log.Info("db failed to initialize", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Empty DBs
					newcd.DBs = cluster.DBs{}
					// Unset master so another keeper can be chosen
					newcd.Cluster.Status.Master = ""
				}
			}
		default:
			return nil, fmt.Errorf("unknown init mode %q", cd.Cluster.DefSpec().InitMode)
		}
	case cluster.ClusterPhaseNormal:
		// TODO(sgotti) When keeper removal is implemented, remove DBs for unexistent keepers

		// Calculate current master status
		curMasterDBUID := cd.Cluster.Status.Master
		wantedMasterDBUID := curMasterDBUID

		masterOK := true
		curMasterDB := cd.DBs[curMasterDBUID]
		if curMasterDB == nil {
			return nil, fmt.Errorf("db for keeper %q not available. This shouldn't happen!", curMasterDBUID)
		}
		log.Debug("db dump", zap.String("db", spew.Sdump(curMasterDB)))

		if !curMasterDB.Status.Healthy {
			log.Info("master db is failed", zap.String("db", curMasterDB.UID), zap.String("keeper", curMasterDB.Spec.KeeperUID))
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if s.dbConvergenceState(curMasterDB, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == ConvergenceFailed {
			log.Info("db not converged", zap.String("db", curMasterDB.UID), zap.String("keeper", curMasterDB.Spec.KeeperUID))
			masterOK = false
		}

		if !masterOK {
			log.Info("trying to find a new master to replace failed master")
			bestNewMasters := s.findBestNewMasters(cd, curMasterDB)
			if len(bestNewMasters) == 0 {
				log.Error("no eligible masters")
			} else {
				// if synchronous replication is enabled, only choose the new master among the synchronous replication standbys.
				var bestNewMasterDB *cluster.DB
				if *cd.Cluster.DefSpec().SynchronousReplication {
					onlyFake := true
					// if only fake synchronous standbys are defined we cannot choose any standby
					for _, dbUID := range curMasterDB.Spec.SynchronousStandbys {
						if dbUID != fakeStandbyName {
							onlyFake = false
						}
					}
					if !onlyFake {
						if !util.CompareStringSlice(curMasterDB.Status.SynchronousStandbys, curMasterDB.Spec.SynchronousStandbys) {
							log.Warn("cannot choose synchronous standby since the latest master reported synchronous standbys are different from the db spec ones", zap.Object("reported", spew.Sdump(curMasterDB.Status.SynchronousStandbys)), zap.Object("spec", spew.Sdump(curMasterDB.Spec.SynchronousStandbys)))
						} else {
							for _, nm := range bestNewMasters {
								if util.StringInSlice(curMasterDB.Spec.SynchronousStandbys, nm.UID) {
									bestNewMasterDB = nm
									break
								}
							}
						}
					}
				} else {
					bestNewMasterDB = bestNewMasters[0]
				}
				if bestNewMasterDB != nil {
					log.Info("electing db as the new master", zap.String("db", bestNewMasterDB.UID), zap.String("keeper", bestNewMasterDB.Spec.KeeperUID))
					wantedMasterDBUID = bestNewMasterDB.UID
				} else {
					log.Error("no eligible masters")
				}
			}
		}

		// New master elected
		if curMasterDBUID != wantedMasterDBUID {
			// maintain the current role, remove followers
			oldMasterdb := newcd.DBs[curMasterDBUID]
			oldMasterdb.Spec.Followers = []string{}

			newcd.Cluster.Status.Master = wantedMasterDBUID
			newMasterDB := newcd.DBs[wantedMasterDBUID]
			newMasterDB.Spec.Role = common.RoleMaster
			newMasterDB.Spec.FollowConfig = nil

			// Tell proxy that there's currently no active master
			newcd.Proxy.Spec.MasterDBUID = ""
			newcd.Proxy.ChangeTime = time.Now()

			// Setup synchronous standbys to the ones of the previous master (replacing ourself with the previous master)
			if *cd.Cluster.DefSpec().SynchronousReplication {
				for _, dbUID := range oldMasterdb.Spec.SynchronousStandbys {
					newMasterDB.Spec.SynchronousStandbys = []string{}
					if dbUID != newMasterDB.UID {
						newMasterDB.Spec.SynchronousStandbys = append(newMasterDB.Spec.SynchronousStandbys, dbUID)
					} else {
						newMasterDB.Spec.SynchronousStandbys = append(newMasterDB.Spec.SynchronousStandbys, oldMasterdb.UID)
					}
				}
				if len(newMasterDB.Spec.SynchronousStandbys) == 0 {
					newMasterDB.Spec.SynchronousStandbys = []string{fakeStandbyName}
				}
			}
		}

		// TODO(sgotti) Wait for the proxies being converged (closed connections to old master)?

		// Setup standbys, do this only when there's no master change
		if curMasterDBUID == wantedMasterDBUID {
			masterDB := newcd.DBs[curMasterDBUID]
			// Set standbys to follow master only if it's healthy and converged
			if masterDB.Status.Healthy && s.dbConvergenceState(masterDB, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == Converged {
				// Tell proxy that there's a new active master
				newcd.Proxy.Spec.MasterDBUID = wantedMasterDBUID
				newcd.Proxy.ChangeTime = time.Now()

				// Remove old masters
				toRemove := []*cluster.DB{}
				for _, db := range newcd.DBs {
					if db.UID == wantedMasterDBUID {
						continue
					}
					if s.dbType(newcd, db.UID) != dbTypeMaster {
						continue
					}
					log.Info("removing old master db", zap.String("db", db.UID))
					toRemove = append(toRemove, db)
				}
				for _, db := range toRemove {
					delete(newcd.DBs, db.UID)
				}

				// Remove invalid dbs
				toRemove = []*cluster.DB{}
				for _, db := range newcd.DBs {
					if db.UID == wantedMasterDBUID {
						continue
					}
					if s.dbValidity(newcd, db.UID) != dbValidityInvalid {
						continue
					}
					log.Info("removing invalid db", zap.String("db", db.UID))
					toRemove = append(toRemove, db)
				}
				for _, db := range toRemove {
					delete(newcd.DBs, db.UID)
				}

				goodStandbys, failedStandbys, convergingStandbys := s.validStandbysByStatus(newcd)
				goodStandbysCount := len(goodStandbys)
				failedStandbysCount := len(failedStandbys)
				convergingStandbysCount := len(convergingStandbys)
				log.Debug("standbys states", zap.Int("good", goodStandbysCount), zap.Int("failed", failedStandbysCount), zap.Int("converging", convergingStandbysCount))

				// Setup synchronous standbys
				if *cd.Cluster.DefSpec().SynchronousReplication {
					// make a map of synchronous standbys starting from the current ones
					synchronousStandbys := map[string]struct{}{}
					for _, dbUID := range masterDB.Spec.SynchronousStandbys {
						// filter out fake standby
						if dbUID == fakeStandbyName {
							continue
						}
						synchronousStandbys[dbUID] = struct{}{}
					}

					// Check if the current synchronous standbys are healthy or remove them
					toRemove := map[string]struct{}{}
					for dbUID := range synchronousStandbys {
						if _, ok := goodStandbys[dbUID]; !ok {
							log.Info("removing failed synchronous standby", zap.String("masterDB", masterDB.UID), zap.String("db", dbUID))
							toRemove[dbUID] = struct{}{}
						}
					}
					for dbUID := range toRemove {
						delete(synchronousStandbys, dbUID)
					}

					// Remove synchronous standbys in excess
					if uint16(len(synchronousStandbys)) > *cd.Cluster.DefSpec().MaxSynchronousStandbys {
						rc := len(synchronousStandbys) - int(*cd.Cluster.DefSpec().MaxSynchronousStandbys)
						removedCount := 0
						toRemove = map[string]struct{}{}
						for dbUID := range synchronousStandbys {
							if removedCount >= rc {
								break
							}
							log.Info("removing synchronous standby in excess", zap.String("masterDB", masterDB.UID), zap.String("db", dbUID))
							toRemove[dbUID] = struct{}{}
							removedCount++
						}
						for dbUID := range toRemove {
							delete(synchronousStandbys, dbUID)
						}
					}

					// try to add missing standbys up to *cd.Cluster.DefSpec().MaxSynchronousStandbys
					bestStandbys := s.findBestStandbys(newcd, curMasterDB)
					ac := int(*cd.Cluster.DefSpec().MaxSynchronousStandbys) - len(synchronousStandbys)
					addedCount := 0
					for _, bestStandby := range bestStandbys {
						if addedCount >= ac {
							break
						}
						if _, ok := synchronousStandbys[bestStandby.UID]; ok {
							continue
						}
						log.Info("adding synchronous standby", zap.String("masterDB", masterDB.UID), zap.String("synchronousStandbyDB", bestStandby.UID))
						synchronousStandbys[bestStandby.UID] = struct{}{}
						addedCount++
					}

					// If there aren't enough real synchronous standbys, add a fake synchronous
					// standby because we have to be strict and make the master block
					// transactions until MaxSynchronousStandbys real standbys are available
					if len(synchronousStandbys) < int(*cd.Cluster.DefSpec().MinSynchronousStandbys) {
						log.Info("using a fake synchronous standby since there are not enough real standbys available", zap.String("masterDB", masterDB.UID), zap.Int("required", int(*cd.Cluster.DefSpec().MinSynchronousStandbys)))
						synchronousStandbys[fakeStandbyName] = struct{}{}
					}

					masterDB.Spec.SynchronousStandbys = []string{}
					for dbUID := range synchronousStandbys {
						masterDB.Spec.SynchronousStandbys = append(masterDB.Spec.SynchronousStandbys, dbUID)
					}

					// Sort synchronousStandbys so we can compare the slice regardless of its order
					sort.Sort(sort.StringSlice(masterDB.Spec.SynchronousStandbys))
				}

				// NotFailed != Good since there can be some dbs that are converging
				// it's the total number of standbys - the failed standbys
				// or the sum of good + converging standbys
				notFailedStandbysCount := goodStandbysCount + convergingStandbysCount

				// Remove dbs in excess if we have a good number >= MaxStandbysPerSender
				if uint16(goodStandbysCount) >= *cd.Cluster.DefSpec().MaxStandbysPerSender {
					toRemove := []*cluster.DB{}
					// Remove all non good standbys
					for _, db := range newcd.DBs {
						if s.dbType(newcd, db.UID) != dbTypeStandby {
							continue
						}
						if _, ok := goodStandbys[db.UID]; !ok {
							log.Info("removing non good standby", zap.String("db", db.UID))
							toRemove = append(toRemove, db)
						}
					}
					// Remove good standbys in excess
					nr := int(uint16(goodStandbysCount) - *cd.Cluster.DefSpec().MaxStandbysPerSender)
					i := 0
					for _, db := range goodStandbys {
						if i >= nr {
							break
						}
						// Don't remove standbys marked as synchronous standbys
						if util.StringInSlice(masterDB.Spec.SynchronousStandbys, db.UID) {
							continue
						}
						log.Info("removing good standby in excess", zap.String("db", db.UID))
						toRemove = append(toRemove, db)
						i++
					}
					for _, db := range toRemove {
						delete(newcd.DBs, db.UID)
					}
				} else {
					// Add new dbs to substitute failed dbs. We
					// don't remove failed dbs until the number of
					// good dbs is >= MaxStandbysPerSender since they can come back

					// define, if there are available keepers, new dbs
					// nc can be negative if MaxStandbysPerSender has been lowered
					nc := int(*cd.Cluster.DefSpec().MaxStandbysPerSender - uint16(notFailedStandbysCount))

					// Add missing DBs until MaxStandbysPerSender
					freeKeepers := s.freeKeepers(newcd)
					nf := len(freeKeepers)
					for i := 0; i < nc && i < nf; i++ {
						freeKeeper := freeKeepers[i]
						db := &cluster.DB{
							UID:        s.UIDFn(),
							Generation: cluster.InitialGeneration,
							ChangeTime: time.Now(),
							Spec: &cluster.DBSpec{
								KeeperUID:    freeKeeper.UID,
								InitMode:     cluster.DBInitModeResync,
								Role:         common.RoleStandby,
								Followers:    []string{},
								FollowConfig: &cluster.FollowConfig{Type: cluster.FollowTypeInternal, DBUID: wantedMasterDBUID},
							},
						}
						newcd.DBs[db.UID] = db
						log.Info("added new standby db", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					}
				}

				// Reconfigure all standbys as followers of the current master
				for _, db := range newcd.DBs {
					if s.dbType(newcd, db.UID) != dbTypeStandby {
						continue
					}
					db.Spec.Role = common.RoleStandby
					// Remove followers
					db.Spec.Followers = []string{}
					db.Spec.FollowConfig = &cluster.FollowConfig{Type: cluster.FollowTypeInternal, DBUID: wantedMasterDBUID}
				}

				// Set followers for master DB
				masterDB.Spec.Followers = []string{}
				for _, db := range newcd.DBs {
					if masterDB.UID == db.UID {
						continue
					}
					fc := db.Spec.FollowConfig
					if fc != nil {
						if fc.Type == cluster.FollowTypeInternal && fc.DBUID == wantedMasterDBUID {
							masterDB.Spec.Followers = append(masterDB.Spec.Followers, db.UID)
						}
					}
				}
				// Sort followers so the slice won't be considered changed due to different order of the same entries.
				sort.Strings(masterDB.Spec.Followers)
			}
		}
	default:
		return nil, fmt.Errorf("unknown cluster phase %s", cd.Cluster.Status.Phase)
	}

	// Copy the clusterSpec parameters to the dbSpec
	s.setDBSpecFromClusterSpec(newcd)

	// Update generation on DBs if they have changed
	for dbUID, db := range newcd.DBs {
		prevDB, ok := cd.DBs[dbUID]
		if !ok {
			continue
		}
		if !reflect.DeepEqual(db.Spec, prevDB.Spec) {
			log.Debug("db spec changed, updating generation", zap.String("prevDB", spew.Sdump(prevDB.Spec)), zap.String("db", spew.Sdump(db.Spec)))
			db.Generation++
			db.ChangeTime = time.Now()
		}
	}

	return newcd, nil
}