func (s *Sentinel) updateConfigHandler(w http.ResponseWriter, req *http.Request) { defer req.Body.Close() vars := mux.Vars(req) configName := vars["name"] // only configID == current is currently supported if configName != "current" { log.Errorf("wrong config name %q", configName) http.Error(w, fmt.Sprintf("wrong config name %q", configName), http.StatusBadRequest) } decoder := json.NewDecoder(req.Body) var config *cluster.NilConfig err := decoder.Decode(&config) if err != nil { w.WriteHeader(http.StatusBadRequest) } s.updateMutex.Lock() defer s.updateMutex.Unlock() if !s.isLeader() { log.Errorf("we aren't the sentinels leader. cannot process config update request.") http.Error(w, "we aren't the sentinels leader. cannot process config update request.", http.StatusBadRequest) } log.Infof(spew.Sprintf("updating config to %#v", config)) e := s.e cd, pair, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) http.Error(w, fmt.Sprintf("error retrieving cluster data: %v", err), http.StatusInternalServerError) } if cd == nil { log.Errorf("empty cluster data") http.Error(w, "empty cluster data", http.StatusInternalServerError) } if cd.ClusterView == nil { log.Errorf("empty cluster view") http.Error(w, "empty cluster view", http.StatusInternalServerError) } log.Debugf(spew.Sprintf("keepersState: %#v", cd.KeepersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cd.ClusterView)) newcv := cd.ClusterView.Copy() newcv.Config = config newcv.Version += 1 if _, err := e.SetClusterData(cd.KeepersState, newcv, pair); err != nil { log.Errorf("error saving clusterdata: %v", err) http.Error(w, fmt.Sprintf("error saving clusterdata: %v", err), http.StatusInternalServerError) } }
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } clusterConfig, _, err := e.GetClusterConfig() if err != nil { return nil, fmt.Errorf("cannot get cluster config: %v", err) } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) p := &PostgresKeeper{id: id, dataDir: cfg.dataDir, e: e, listenAddress: cfg.listenAddress, port: cfg.port, pgListenAddress: cfg.pgListenAddress, pgPort: cfg.pgPort, clusterConfig: clusterConfig, stop: stop, end: end, } serverParameters := p.createServerParameters() pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, serverParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create postgres manager: %v", err) } p.pgm = pgm return p, nil }
func (c *ClusterChecker) Check() { cv, _, err := c.e.GetClusterView() if err != nil { log.Errorf("cannot get clusterview: %v", err) c.C <- pollon.ConfData{DestAddr: nil} return } log.Debugf(spew.Sprintf("clusterview: %#v", cv)) if cv == nil { log.Infof("no clusterview available, closing connections to previous master") c.C <- pollon.ConfData{DestAddr: nil} return } pc := cv.ProxyConf if pc == nil { log.Infof("no proxyconf available, closing connections to previous master") c.C <- pollon.ConfData{DestAddr: nil} if err := c.SetProxyInfo(c.e, cv.Version, 2*cluster.DefaultProxyCheckInterval); err != nil { log.Errorf("failed to update proxyInfo: %v", err) } return } addr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%s", pc.Host, pc.Port)) if err != nil { log.Errorf("err: %v", err) c.C <- pollon.ConfData{DestAddr: nil} return } log.Infof("master address: %v", addr) if err = c.SetProxyInfo(c.e, cv.Version, 2*cluster.DefaultProxyCheckInterval); err != nil { log.Errorf("failed to update proxyInfo: %v", err) } c.C <- pollon.ConfData{DestAddr: addr} }
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } cd, _, err := e.GetClusterData() if err != nil { return nil, fmt.Errorf("error retrieving cluster data: %v", err) } var cv *cluster.ClusterView if cd == nil { cv = cluster.NewClusterView() } else { cv = cd.ClusterView } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) clusterConfig := cv.Config.ToConfig() log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) p := &PostgresKeeper{id: id, dataDir: cfg.dataDir, e: e, listenAddress: cfg.listenAddress, port: cfg.port, pgListenAddress: cfg.pgListenAddress, pgPort: cfg.pgPort, clusterConfig: clusterConfig, stop: stop, end: end, } followersIDs := cv.GetFollowersIDs(p.id) pgParameters := p.createPGParameters(followersIDs) pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create postgres manager: %v", err) } p.pgm = pgm return p, nil }
func (s *Sentinel) setLeaderSentinelInfo(ttl time.Duration) error { sentinelInfo := &cluster.SentinelInfo{ ID: s.id, ListenAddress: s.listenAddress, Port: s.port, } log.Debugf(spew.Sprintf("sentinelInfo: %#v", sentinelInfo)) if _, err := s.e.SetLeaderSentinelInfo(sentinelInfo, ttl); err != nil { return err } return nil }
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } cd, _, err := e.GetClusterData() if err != nil { return nil, fmt.Errorf("error retrieving cluster data: %v", err) } var cv *cluster.ClusterView if cd == nil { cv = cluster.NewClusterView() } else { cv = cd.ClusterView } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) clusterConfig := cv.Config.ToConfig() if err != nil { return nil, fmt.Errorf("cannot get cluster config: %v", err) } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) lManager := e.NewLeaseManager() return &Sentinel{ id: id, e: e, listenAddress: cfg.listenAddress, port: cfg.port, lManager: lManager, clusterConfig: clusterConfig, stop: stop, end: end}, nil }
func (c *ClusterChecker) SetProxyInfo(e *store.StoreManager, cvVersion int, ttl time.Duration) error { proxyInfo := &cluster.ProxyInfo{ ID: c.id, ListenAddress: c.listenAddress, Port: c.port, ClusterViewVersion: cvVersion, } log.Debugf(spew.Sprintf("proxyInfo: %#v", proxyInfo)) if err := c.e.SetProxyInfo(proxyInfo, ttl); err != nil { return err } return nil }
func (p *PostgresKeeper) publish() error { if kubernetes.OnKubernetes() { log.Infof("running under kubernetes. Not using store discovery") return nil } discoveryInfo := &cluster.KeeperDiscoveryInfo{ ListenAddress: p.listenAddress, Port: p.port, } log.Debugf(spew.Sprintf("discoveryInfo: %#v", discoveryInfo)) if err := p.e.SetKeeperDiscoveryInfo(p.id, discoveryInfo); err != nil { return err } return nil }
func (p *PostgresKeeper) publish() error { if kubernetes.OnKubernetes() { log.Infof("running under kubernetes. Not publishing ourself to etcd") return nil } discoveryInfo := &cluster.MemberDiscoveryInfo{ Host: p.listenAddress, Port: p.port, } log.Debugf(spew.Sprintf("discoveryInfo: %#v", discoveryInfo)) if _, err := p.e.SetMemberDiscoveryInfo(p.id, discoveryInfo); err != nil { return err } return nil }
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } clusterConfig, _, err := e.GetClusterConfig() if err != nil { return nil, fmt.Errorf("cannot get cluster config: %v", err) } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) lManager := e.NewLeaseManager() return &Sentinel{id: id, e: e, lManager: lManager, clusterConfig: clusterConfig, stop: stop, end: end}, nil }
func TestParseTimeLineHistory(t *testing.T) { tests := []struct { contents string tlsh cluster.PostgresTimeLinesHistory err error }{ { contents: "", tlsh: cluster.PostgresTimeLinesHistory{}, err: nil, }, { contents: `1 0/5000090 no recovery target specified`, tlsh: cluster.PostgresTimeLinesHistory{ { TimelineID: 1, SwitchPoint: 83886224, Reason: "no recovery target specified", }, }, err: nil, }, } for i, tt := range tests { tlsh, err := parseTimeLinesHistory(tt.contents) t.Logf("test #%d", i) if tt.err != nil { if err == nil { t.Errorf("got no error, wanted error: %v", tt.err) } else if tt.err.Error() != err.Error() { t.Errorf("got error: %v, wanted error: %v", err, tt.err) } } else { if err != nil { t.Errorf("unexpected error: %v", err) } if !reflect.DeepEqual(tlsh, tt.tlsh) { t.Errorf(spew.Sprintf("#%d: wrong timeline history: got: %#+v, want: %#+v", i, tlsh, tt.tlsh)) } } } }
func (s *Sentinel) GetBestStandby(cv *cluster.ClusterView, keepersState cluster.KeepersState, master string) (string, error) { var bestID string masterState := keepersState[master] for id, k := range keepersState { log.Debugf(spew.Sprintf("id: %s, k: %#v", id, k)) if id == master { log.Debugf("ignoring node %q since it's the current master", id) continue } if !k.Healthy { log.Debugf("ignoring node %q since it's not healthy", id) continue } if k.ClusterViewVersion != cv.Version { log.Debugf("ignoring node since its clusterView version (%d) is different that the actual one (%d)", k.ClusterViewVersion, cv.Version) continue } if k.PGState == nil { log.Debugf("ignoring node since its pg state is unknown") continue } if masterState.PGState.TimelineID != k.PGState.TimelineID { log.Debugf("ignoring node since its pg timeline (%s) is different than master timeline (%d)", keepersState[id].PGState.TimelineID, masterState.PGState.TimelineID) continue } if bestID == "" { bestID = id continue } if k.PGState.XLogPos > keepersState[bestID].PGState.XLogPos { bestID = id } } if bestID == "" { return "", fmt.Errorf("no standbys available") } return bestID, nil }
// Start runs the keeper's main loop: it waits for cluster data to become
// available, builds the postgres manager, starts the HTTP info/pgstate API
// and then alternates two periodic tasks (the keeper state machine and the
// pg state updater) until stopped.
func (p *PostgresKeeper) Start() {
	// Completion signals for the two periodic goroutines and the HTTP server.
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endApiCh := make(chan error)
	var err error
	var cd *cluster.ClusterData
	// TODO(sgotti) make the postgres manager stateless and instantiate a
	// new one at every check loop, this will avoid the need to loop here
	// to get the clusterconfig
	for {
		cd, _, err = p.e.GetClusterData()
		if err == nil {
			break
		}
		log.Errorf("error retrieving cluster data: %v", err)
		time.Sleep(cluster.DefaultSleepInterval)
	}
	var cv *cluster.ClusterView
	if cd == nil {
		// No cluster data yet: start from an empty clusterview.
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))
	p.clusterConfig = cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", p.clusterConfig))
	if err := p.loadCVVersion(); err != nil {
		p.end <- fmt.Errorf("failed to load cluster version file: %v", err)
		return
	}
	// TODO(sgotti) reconfigure the various configurations options (PGRepl*
	// and RequestTimeout) after a changed cluster config
	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	// NOTE(review): cfg here is not a parameter of this method — presumably
	// a package-level config variable not visible in this chunk; confirm.
	// Also note NewManager is used without an error return in this version.
	pgm := postgresql.NewManager(p.id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), p.clusterConfig.PGReplUser, p.clusterConfig.PGReplPassword, p.clusterConfig.RequestTimeout)
	p.pgm = pgm
	// Stop any instance possibly left running from a previous keeper run.
	p.pgm.Stop(true)
	http.HandleFunc("/info", p.infoHandler)
	http.HandleFunc("/pgstate", p.pgStateHandler)
	go func() {
		endApiCh <- http.ListenAndServe(fmt.Sprintf("%s:%s", p.listenAddress, p.port), nil)
	}()
	ctx, cancel := context.WithCancel(context.Background())
	// Fire both tasks immediately on startup; subsequent runs are
	// rescheduled (after SleepInterval) only when the previous run ends.
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	for true {
		select {
		case <-p.stop:
			log.Debugf("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return
		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()
		case <-endSMCh:
			smTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case <-updatePGStateTimerCh:
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()
		case <-endPgStatecheckerCh:
			updatePGStateTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case err := <-endApiCh:
			// The HTTP API returned: fatal on error, otherwise trigger
			// a clean shutdown via the stop channel.
			if err != nil {
				log.Fatal("ListenAndServe: ", err)
			}
			close(p.stop)
		}
	}
}
// updateClusterView computes the next clusterview from the current one and
// the observed keepers state: it picks the initial master on first
// initialization, elects a replacement when the current master is failed or
// has not converged, registers newly discovered keepers, and (only once the
// wanted master is healthy and converged) points the standbys at it. The
// version is bumped only when the resulting view actually differs.
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, keepersState cluster.KeepersState) (*cluster.ClusterView, error) {
	var wantedMasterID string
	if cv.Master == "" {
		log.Debugf("trying to find initial master")
		// Check for an initial master
		if len(keepersState) < 1 {
			return nil, fmt.Errorf("cannot init cluster, no keepers registered")
		}
		if len(keepersState) > 1 {
			return nil, fmt.Errorf("cannot init cluster, more than 1 keeper registered")
		}
		// Exactly one keeper: use it as the initial master if its pg
		// instance is known and initialized.
		for id, k := range keepersState {
			if k.PGState == nil {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since its pg state is unknown", id)
			}
			if !k.PGState.Initialized {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since pg instance is not initializied", id)
			}
			log.Infof("initializing cluster with master: %q", id)
			wantedMasterID = id
			break
		}
	} else {
		masterID := cv.Master
		wantedMasterID = masterID
		masterOK := true
		master, ok := keepersState[masterID]
		if !ok {
			return nil, fmt.Errorf("keeper state for master %q not available. This shouldn't happen!", masterID)
		}
		log.Debugf(spew.Sprintf("masterState: %#v", master))
		if !master.Healthy {
			log.Infof("master is failed")
			masterOK = false
		}
		// Check that the wanted master is in master state (i.e. check
		// that promotion from standby to master happened)
		if !s.isKeeperConverged(master, cv) {
			log.Infof("keeper %s not yet master", masterID)
			masterOK = false
		}
		if !masterOK {
			log.Infof("trying to find a standby to replace failed master")
			bestStandby, err := s.GetBestStandby(cv, keepersState, masterID)
			if err != nil {
				// No candidate: keep the current (failed) master.
				log.Errorf("error trying to find the best standby: %v", err)
			} else {
				if bestStandby != masterID {
					log.Infof("electing new master: %q", bestStandby)
					wantedMasterID = bestStandby
				} else {
					log.Infof("cannot find a good standby to replace failed master")
				}
			}
		}
	}
	newCV := cv.Copy()
	newKeepersRole := newCV.KeepersRole
	// Add new keepersRole from keepersState
	for id, _ := range keepersState {
		if _, ok := newKeepersRole[id]; !ok {
			if err := newKeepersRole.Add(id, ""); err != nil {
				// This shouldn't happen
				panic(err)
			}
		}
	}
	// Setup master role
	if cv.Master != wantedMasterID {
		newCV.Master = wantedMasterID
		newKeepersRole[wantedMasterID].Follow = ""
	}
	// Setup standbys
	if cv.Master == wantedMasterID {
		// wanted master is the previous one
		masterState := keepersState[wantedMasterID]
		// Standbys are (re)pointed at the master only when it is healthy
		// and has converged to the current clusterview.
		if masterState.Healthy && s.isKeeperConverged(masterState, cv) {
			for id, _ := range newKeepersRole {
				if id == wantedMasterID {
					continue
				}
				newKeepersRole[id].Follow = wantedMasterID
			}
		}
	}
	s.updateProxyConf(cv, newCV, keepersState)
	// Bump the version only if something actually changed.
	if !newCV.Equals(cv) {
		newCV.Version = cv.Version + 1
		newCV.ChangeTime = time.Now()
	}
	return newCV, nil
}
// TestParseConfig checks JSON unmarshalling and validation of NilConfig:
// duration parsing, each validation error, and a fully-specified config.
func TestParseConfig(t *testing.T) {
	tests := []struct {
		in  string
		cfg *Config
		err error
	}{
		{
			in:  "{}",
			cfg: mergeDefaults(&NilConfig{}).ToConfig(),
			err: nil,
		},
		// Test duration parsing
		{
			in:  `{ "request_timeout": "3s" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: &Duration{3 * time.Second}}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "request_timeout": "3000ms" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: &Duration{3 * time.Second}}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "request_timeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: request_timeout must be positive"),
		},
		{
			in:  `{ "request_timeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: request_timeout must be positive"),
		},
		{
			in:  `{ "sleep_interval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: sleep_interval must be positive"),
		},
		{
			in:  `{ "keeper_fail_interval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: keeper_fail_interval must be positive"),
		},
		{
			in:  `{ "pg_repl_user": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: pg_repl_user cannot be empty"),
		},
		{
			in:  `{ "pg_repl_password": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: pg_repl_password cannot be empty"),
		},
		{
			in:  `{ "max_standbys_per_sender": 0 }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: max_standbys_per_sender must be at least 1"),
		},
		// All options defined
		// NOTE(review): the "******" user/password in the input look like
		// redaction artifacts — the expected config uses "username" and
		// "password", so DeepEqual would fail as written; verify against
		// the original test source.
		{
			in: `{ "request_timeout": "10s", "sleep_interval": "10s", "keeper_fail_interval": "100s", "pg_repl_user": "******", "pg_repl_password": "******", "max_standbys_per_sender": 5, "synchronous_replication": true, "init_with_multiple_keepers": true, "pg_parameters": { "param01": "value01" } }`,
			cfg: mergeDefaults(&NilConfig{
				RequestTimeout:          &Duration{10 * time.Second},
				SleepInterval:           &Duration{10 * time.Second},
				KeeperFailInterval:      &Duration{100 * time.Second},
				PGReplUser:              StringP("username"),
				PGReplPassword:          StringP("password"),
				MaxStandbysPerSender:    UintP(5),
				SynchronousReplication:  BoolP(true),
				InitWithMultipleKeepers: BoolP(true),
				PGParameters: &map[string]string{
					"param01": "value01",
				},
			}).ToConfig(),
			err: nil,
		},
	}
	for i, tt := range tests {
		var nilCfg *NilConfig
		err := json.Unmarshal([]byte(tt.in), &nilCfg)
		if err != nil {
			if tt.err == nil {
				t.Errorf("#%d: unexpected error: %v", i, err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("#%d: got error: %v, wanted error: %v", i, err, tt.err)
			}
		} else {
			nilCfg.MergeDefaults()
			cfg := nilCfg.ToConfig()
			if tt.err != nil {
				t.Errorf("#%d: got no error, wanted error: %v", i, tt.err)
			}
			if !reflect.DeepEqual(cfg, tt.cfg) {
				t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg))
			}
		}
	}
}
// TestUpdateClusterView exercises Sentinel.updateClusterView through the
// main scenarios: cluster initialization (zero, one, too many keepers),
// steady state, master failover, old-master re-follow after convergence,
// and cases where no election must happen (stale standby clusterview) or
// must happen (master not converged).
func TestUpdateClusterView(t *testing.T) {
	tests := []struct {
		cv           *cluster.ClusterView
		keepersState cluster.KeepersState
		outCV        *cluster.ClusterView
		err          error
	}{
		// No keepers registered: initialization must fail.
		{
			cv:           cluster.NewClusterView(),
			keepersState: nil,
			outCV:        cluster.NewClusterView(),
			err:          fmt.Errorf("cannot init cluster, no keepers registered"),
		},
		// cluster initialization, one keeper
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{PGState: &cluster.PostgresState{Initialized: true}},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
				},
			},
		},
		// cluster initialization, too many keepers
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{},
				"02": &cluster.KeeperState{},
			},
			outCV: cluster.NewClusterView(),
			err:   fmt.Errorf("cannot init cluster, more than 1 keeper registered"),
		},
		// One master and one standby, both healthy: no change from previous cv
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not healthy: standby elected as new master
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
		// From the previous test, new master (02) converged. Old master setup to follow new master (02).
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: "02"},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
		// One master and one standby, master not healthy, standby with old
		// clusterview: no standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not converged to current
		// cv: standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
	}
	s := &Sentinel{id: "id", clusterConfig: cluster.NewDefaultConfig()}
	for i, tt := range tests {
		outCV, err := s.updateClusterView(tt.cv, tt.keepersState)
		t.Logf("test #%d", i)
		t.Logf(spew.Sprintf("outCV: %#v", outCV))
		if tt.err != nil {
			if err == nil {
				t.Errorf("got no error, wanted error: %v", tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("got error: %v, wanted error: %v", err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if !outCV.Equals(tt.outCV) {
				t.Errorf(spew.Sprintf("#%d: wrong outCV: got: %#v, want: %#v", i, outCV, tt.outCV))
			}
		}
	}
}
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) { s.updateMutex.Lock() defer s.updateMutex.Unlock() e := s.e cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var keepersState cluster.KeepersState if cd == nil { cv = cluster.NewClusterView() keepersState = nil } else { cv = cd.ClusterView keepersState = cd.KeepersState } log.Debugf(spew.Sprintf("keepersState: %#v", keepersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) // Update cluster config // This shouldn't need a lock s.clusterConfig = cv.Config.ToConfig() if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersPGState := getKeepersPGState(ctx, keepersInfo) cancel() log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != 
s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } if err := s.setLeaderSentinelInfo(leaseTTL); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } if cv.Version == 0 { // Cluster first initialization newcv := cluster.NewClusterView() newcv.Version = 1 _, err = e.SetClusterData(nil, newcv, 0) if err != nil { log.Errorf("error saving clusterdata: %v", err) } return } newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState) log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState)) newcv, err := s.updateClusterView(cv, newKeepersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") } _, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }
// postgresKeeperSM is one iteration of the keeper state machine: it reads
// the cluster config/view and members state, initializes and starts the
// local postgres instance when needed, publishes itself for discovery, and
// then converges the instance to its requested role (master: manage
// replication slots and synchronous replication; standby: follow the
// master, full-resyncing when the timelines have diverged).
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm
	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig
	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))
	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))
	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))
	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}
	// No clusterview yet: initdb the local instance if needed.
	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}
	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// Start failures are logged but not fatal here; the
			// master-only bailout below handles the unrecoverable case.
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}
	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are master but we cannot get
			// instance status or start then stop here, if we are standby then we can
			// recover
			return
		}
	}
	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}
	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}
	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}
	if cv == nil {
		return
	}
	// cv != nil
	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)
	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))
	followersIDs := cv.GetFollowersIDs(p.id)
	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	if memberRole.Follow == "" {
		// Requested role: master (we follow nobody).
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")
			// NOTE(review): GetReplicatinSlots is a (misspelled) project
			// method name — cannot be renamed from this block alone.
			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Create replication slots
			// Drop slots for members no longer following us, then create
			// missing slots for current followers.
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}
			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}
			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		// Requested role: standby following memberRole.Follow.
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			// We are a master but should be a standby: resync from scratch.
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))
			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))
			// Check that we can sync with master
			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}
			// TODO(sgotti) Check that the master has all the needed WAL segments
			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	// Record the clusterview version we have converged to.
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}
func TestParseConfig(t *testing.T) { tests := []struct { in string cfg *Config err error }{ { in: "{}", cfg: mergeDefaults(&NilConfig{}).ToConfig(), err: nil, }, // Test duration parsing { in: `{ "requesttimeout": "3s" }`, cfg: mergeDefaults(&NilConfig{RequestTimeout: DurationP(3 * time.Second)}).ToConfig(), err: nil, }, { in: `{ "requesttimeout": "3000ms" }`, cfg: mergeDefaults(&NilConfig{RequestTimeout: DurationP(3 * time.Second)}).ToConfig(), err: nil, }, { in: `{ "requesttimeout": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: RequestTimeout must be positive"), }, { in: `{ "requesttimeout": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: RequestTimeout must be positive"), }, { in: `{ "sleepinterval": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: SleepInterval must be positive"), }, { in: `{ "keeperfailinterval": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: KeeperFailInterval must be positive"), }, { in: `{ "pgrepluser": "" }`, cfg: nil, err: fmt.Errorf("config validation failed: PGReplUser cannot be empty"), }, { in: `{ "pgreplpassword": "" }`, cfg: nil, err: fmt.Errorf("config validation failed: PGReplPassword cannot be empty"), }, { in: `{ "maxstandbyspersender": 0 }`, cfg: nil, err: fmt.Errorf("config validation failed: MaxStandbysPerSender must be at least 1"), }, // All options defined { in: `{ "requestTimeout": "10s", "sleepInterval": "10s", "keeperFailInterval": "100s", "pgrepluser": "******", "pgreplpassword": "******", "maxstandbyspersender": 5, "synchronousreplication": true}`, cfg: mergeDefaults(&NilConfig{ RequestTimeout: DurationP(10 * time.Second), SleepInterval: DurationP(10 * time.Second), KeeperFailInterval: DurationP(100 * time.Second), PGReplUser: StringP("username"), PGReplPassword: StringP("password"), MaxStandbysPerSender: UintP(5), SynchronousReplication: BoolP(true), }).ToConfig(), err: nil, }, } for i, tt := range tests { var nilCfg *NilConfig err := 
json.Unmarshal([]byte(tt.in), &nilCfg) nilCfg.MergeDefaults() cfg := nilCfg.ToConfig() if tt.err != nil { if err == nil { t.Errorf("#%d: got no error, wanted error: %v", i, tt.err) } else if tt.err.Error() != err.Error() { t.Errorf("#%d: got error: %v, wanted error: %v", i, err, tt.err) } } else { if err != nil { t.Errorf("#%d: unexpected error: %v", i, err) } if !reflect.DeepEqual(cfg, tt.cfg) { t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg)) } } } }
func (s *Sentinel) clusterSentinelSM(pctx context.Context) { e := s.e // Update cluster config clusterConfig, _, err := e.GetClusterConfig() if err != nil { log.Errorf("cannot get cluster config: %v", err) return } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock s.clusterConfig = clusterConfig // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersPGState := getMembersPGState(ctx, membersInfo) cancel() log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var membersState cluster.MembersState if cd == 
nil { cv = cluster.NewClusterView() membersState = nil } else { cv = cd.ClusterView membersState = cd.MembersState } log.Debugf(spew.Sprintf("membersState: %#v", membersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) pv, res, err := e.GetProxyView() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("proxyview: %#v", pv)) var prevPVIndex uint64 if res != nil { prevPVIndex = res.Node.ModifiedIndex } newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState) log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState)) newcv, err := s.updateClusterView(cv, newMembersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil { log.Errorf("error updating proxyView: %v", err) return } } _, err = e.SetClusterData(newMembersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) { e := p.e pgm := p.pgm cv, _, err := e.GetClusterView() if err != nil { log.Errorf("error retrieving cluster view: %v", err) return } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) if cv == nil { log.Infof("no clusterview available, waiting for it to appear") return } followersIDs := cv.GetFollowersIDs(p.id) // Update cluster config clusterConfig := cv.Config.ToConfig() log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock p.clusterConfig = clusterConfig prevPGParameters := pgm.GetParameters() // create postgres parameteres pgParameters := p.createPGParameters(followersIDs) // update pgm postgres parameters pgm.SetParameters(pgParameters) keepersState, _, err := e.GetKeepersState() if err != nil { log.Errorf("err: %v", err) return } if keepersState == nil { keepersState = cluster.KeepersState{} } log.Debugf(spew.Sprintf("keepersState: %#v", keepersState)) keeper := keepersState[p.id] log.Debugf(spew.Sprintf("keeperState: %#v", keeper)) initialized, err := pgm.IsInitialized() if err != nil { log.Errorf("failed to detect if instance is initialized: %v", err) return } if len(cv.KeepersRole) == 0 { if !initialized { log.Infof("Initializing database") err = pgm.Init() if err != nil { log.Errorf("failed to initialized postgres instance: %v", err) return } initialized = true } } started := false if initialized { started, err = pgm.IsStarted() if err != nil { log.Errorf("failed to retrieve instance status: %v", err) } else if !started { err = pgm.Start() if err != nil { log.Errorf("failed to start postgres: %v", err) } else { started = true } } } if cv != nil { if !started && p.id == cv.Master { // If the clusterView says we are master but we cannot get // instance status or start then stop here, if we are standby then we can // recover return } } role, err := pgm.GetRole() if err != nil { log.Infof("error retrieving current pg role: %v", err) return } isMaster := false 
if role == common.MasterRole { log.Infof("current pg state: master") isMaster = true } else { log.Infof("current pg state: standby") } // publish ourself for discovery if err := p.publish(); err != nil { log.Errorf("failed to publish ourself to the cluster: %v", err) return } if cv == nil { return } // cv != nil masterID := cv.Master log.Debugf("masterID: %q", masterID) master := keepersState[masterID] log.Debugf(spew.Sprintf("masterState: %#v", master)) keeperRole, ok := cv.KeepersRole[p.id] if !ok { log.Infof("our keeper requested role is not available") return } if keeperRole.Follow == "" { log.Infof("our cluster requested state is master") if role != common.MasterRole { log.Infof("promoting to master") err := pgm.Promote() if err != nil { log.Errorf("err: %v", err) return } } else { log.Infof("already master") replSlots := []string{} replSlots, err = pgm.GetReplicatinSlots() if err != nil { log.Errorf("err: %v", err) return } // Create replication slots for _, slotName := range replSlots { if !util.StringInSlice(followersIDs, slotName) { log.Infof("dropping replication slot for keeper %q not marked as follower", slotName) err := pgm.DropReplicationSlot(slotName) if err != nil { log.Errorf("err: %v", err) } } } for _, followerID := range followersIDs { if followerID == p.id { continue } if !util.StringInSlice(replSlots, followerID) { err := pgm.CreateReplicationSlot(followerID) if err != nil { log.Errorf("err: %v", err) } } } } } else { log.Infof("our cluster requested state is standby following %q", keeperRole.Follow) if isMaster { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } else { log.Infof("already standby") curConnParams, err := pgm.GetPrimaryConninfo() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams)) replConnString := p.getReplConnString(master) newConnParams, err := pg.URLToConnParams(replConnString) if 
err != nil { log.Errorf("cannot get conn params: %v", err) return } log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams)) // Check that we can sync with master // Check timeline history // We need to update our pgState to avoid dealing with // an old pgState not reflecting the real state p.updatePGState(pctx) pgState := p.getLastPGState() if pgState == nil { log.Errorf("our pgstate is unknown: %v", err) return } mPGState := master.PGState if p.isDifferentTimelineBranch(mPGState, pgState) { if err := p.fullResync(master, initialized, started); err != nil { log.Errorf("failed to full resync from master: %v", err) return } } // TODO(sgotti) Check that the master has all the needed WAL segments // Update our primary_conninfo if replConnString changed if !curConnParams.Equals(newConnParams) { log.Infof("master connection parameters changed. Reconfiguring...") log.Infof("following %s with connection url %s", keeperRole.Follow, replConnString) err = pgm.BecomeStandby(replConnString) if err != nil { log.Errorf("err: %v", err) return } err = pgm.Restart(true) if err != nil { log.Errorf("err: %v", err) return } } } } // Log synchronous replication changes prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"] syncStandbyNames := pgParameters["synchronous_standby_names"] if p.clusterConfig.SynchronousReplication { if prevSyncStandbyNames != syncStandbyNames { log.Infof("needed synchronous_standby_names changed from %q to %q", prevSyncStandbyNames, syncStandbyNames) } } else { if prevSyncStandbyNames != "" { log.Infof("sync replication disabled, removing current synchronous_standby_names %q", prevSyncStandbyNames) } } if !pgParameters.Equals(prevPGParameters) { log.Infof("postgres parameters changed, reloading postgres instance") pgm.SetParameters(pgParameters) if err := pgm.Reload(); err != nil { log.Errorf("failed to reload postgres instance: %v", err) } } else { // for tests log.Debugf("postgres parameters not changed") } if err := 
p.saveCVVersion(cv.Version); err != nil { log.Errorf("err: %v", err) return } }
func TestParseConfig(t *testing.T) { tests := []struct { in string cfg *Config err error }{ { in: "{}", cfg: &DefaultConfig, err: nil, }, // Test duration parsing { in: `{ "requesttimeout": "3s" }`, cfg: mergeDefaultConfig(&Config{RequestTimeout: 3 * time.Second}), err: nil, }, { in: `{ "requesttimeout": "3000ms" }`, cfg: mergeDefaultConfig(&Config{RequestTimeout: 3 * time.Second}), err: nil, }, { in: `{ "requesttimeout": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: RequestTimeout must be positive"), }, { in: `{ "requesttimeout": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: RequestTimeout must be positive"), }, { in: `{ "sleepinterval": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: SleepInterval must be positive"), }, { in: `{ "memberfailinterval": "-3s" }`, cfg: nil, err: fmt.Errorf("config validation failed: MemberFailInterval must be positive"), }, { in: `{ "pgrepluser": "" }`, cfg: nil, err: fmt.Errorf("config validation failed: PGReplUser cannot be empty"), }, { in: `{ "pgreplpassword": "" }`, cfg: nil, err: fmt.Errorf("config validation failed: PGReplPassword cannot be empty"), }, { in: `{ "maxstandbyspersender": 0 }`, cfg: nil, err: fmt.Errorf("config validation failed: MaxStandbysPerSender must be at least 1"), }, // All options defined { in: `{ "requestTimeout": "10s", "sleepInterval": "10s", "memberFailInterval": "100s", "pgrepluser": "******", "pgreplpassword": "******", "maxstandbyspersender": 5, "synchronousreplication": true}`, cfg: mergeDefaultConfig(&Config{ RequestTimeout: 10 * time.Second, SleepInterval: 10 * time.Second, MemberFailInterval: 100 * time.Second, PGReplUser: "******", PGReplPassword: "******", MaxStandbysPerSender: 5, SynchronousReplication: true, }), err: nil, }, } for i, tt := range tests { cfg, err := ParseConfig([]byte(tt.in)) if tt.err != nil { if err == nil { t.Errorf("got no error, wanted error: %v", tt.err) } else if tt.err.Error() != err.Error() { t.Errorf("got error: %v, 
wanted error: %v", err, tt.err) } } else { if err != nil { t.Errorf("unexpected error: %v", err) } if !reflect.DeepEqual(cfg, tt.cfg) { t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg)) } } } }
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, membersState cluster.MembersState) (*cluster.ClusterView, error) { var wantedMasterID string // Cluster first initialization if cv.Version == 0 { log.Debugf("trying to find initial master") // Check for an initial master if len(membersState) < 1 { return nil, fmt.Errorf("cannot init cluster, no members registered") } if len(membersState) > 1 { return nil, fmt.Errorf("cannot init cluster, more than 1 member registered") } for id, m := range membersState { if m.PGState == nil { return nil, fmt.Errorf("cannot init cluster using member %q since its pg state is unknown", id) } log.Infof("Initializing cluster with master: %q", id) wantedMasterID = id break } } else { masterID := cv.Master masterOK := true master, ok := membersState[masterID] if !ok { return nil, fmt.Errorf("member state for master %q not available. This shouldn't happen!", masterID) } log.Debugf(spew.Sprintf("masterState: %#v", master)) if !s.isMemberHealthy(master) { log.Infof("master is failed") masterOK = false } // Check that the wanted master is in master state (i.e. 
check that promotion from standby to master happened) if !s.isMemberConverged(master, cv) { log.Infof("member %s not yet master", masterID) masterOK = false } wantedMasterID = masterID if !masterOK { log.Infof("trying to find a standby to replace failed master") bestStandby, err := s.GetBestStandby(cv, membersState, masterID) if err != nil { log.Errorf("error trying to find the best standby: %v", err) } else { if bestStandby != masterID { log.Infof("electing new master: %q", bestStandby) wantedMasterID = bestStandby } else { log.Infof("cannot find a good standby to replace failed master") } } } } newCV := cv.Copy() newMembersRole := newCV.MembersRole // Add new members from membersState for id, _ := range membersState { if _, ok := newMembersRole[id]; !ok { newMembersRole[id] = &cluster.MemberRole{} } } // Setup master role if cv.Master != wantedMasterID { newCV.Master = wantedMasterID newMembersRole[wantedMasterID] = &cluster.MemberRole{Follow: ""} } // Setup standbys if cv.Master == wantedMasterID { // wanted master is the previous one masterState := membersState[wantedMasterID] if s.isMemberHealthy(masterState) && s.isMemberConverged(masterState, cv) { for id, _ := range newMembersRole { if id == wantedMasterID { continue } newMembersRole[id] = &cluster.MemberRole{Follow: wantedMasterID} } } } if !newCV.Equals(cv) { newCV.Version = cv.Version + 1 newCV.ChangeTime = time.Now() } return newCV, nil }