func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } cd, _, err := e.GetClusterData() if err != nil { return nil, fmt.Errorf("error retrieving cluster data: %v", err) } var cv *cluster.ClusterView if cd == nil { cv = cluster.NewClusterView() } else { cv = cd.ClusterView } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) clusterConfig := cv.Config.ToConfig() log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) p := &PostgresKeeper{id: id, dataDir: cfg.dataDir, e: e, listenAddress: cfg.listenAddress, port: cfg.port, pgListenAddress: cfg.pgListenAddress, pgPort: cfg.pgPort, clusterConfig: clusterConfig, stop: stop, end: end, } followersIDs := cv.GetFollowersIDs(p.id) pgParameters := p.createPGParameters(followersIDs) pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create postgres manager: %v", err) } p.pgm = pgm return p, nil }
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) { etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName) e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout) if err != nil { return nil, fmt.Errorf("cannot create etcd manager: %v", err) } cd, _, err := e.GetClusterData() if err != nil { return nil, fmt.Errorf("error retrieving cluster data: %v", err) } var cv *cluster.ClusterView if cd == nil { cv = cluster.NewClusterView() } else { cv = cd.ClusterView } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) clusterConfig := cv.Config.ToConfig() if err != nil { return nil, fmt.Errorf("cannot get cluster config: %v", err) } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) lManager := e.NewLeaseManager() return &Sentinel{ id: id, e: e, listenAddress: cfg.listenAddress, port: cfg.port, lManager: lManager, clusterConfig: clusterConfig, stop: stop, end: end}, nil }
func (p *PostgresKeeper) Start() { endSMCh := make(chan struct{}) endPgStatecheckerCh := make(chan struct{}) endApiCh := make(chan error) var err error var cd *cluster.ClusterData // TODO(sgotti) make the postgres manager stateless and instantiate a // new one at every check loop, this will avoid the need to loop here // to get the clusterconfig for { cd, _, err = p.e.GetClusterData() if err == nil { break } log.Errorf("error retrieving cluster data: %v", err) time.Sleep(cluster.DefaultSleepInterval) } var cv *cluster.ClusterView if cd == nil { cv = cluster.NewClusterView() } else { cv = cd.ClusterView } log.Debugf(spew.Sprintf("clusterView: %#v", cv)) p.clusterConfig = cv.Config.ToConfig() log.Debugf(spew.Sprintf("clusterConfig: %#v", p.clusterConfig)) if err := p.loadCVVersion(); err != nil { p.end <- fmt.Errorf("failed to load cluster version file: %v", err) return } // TODO(sgotti) reconfigure the various configurations options (PGRepl* // and RequestTimeout) after a changed cluster config followersIDs := cv.GetFollowersIDs(p.id) pgParameters := p.createPGParameters(followersIDs) pgm := postgresql.NewManager(p.id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), p.clusterConfig.PGReplUser, p.clusterConfig.PGReplPassword, p.clusterConfig.RequestTimeout) p.pgm = pgm p.pgm.Stop(true) http.HandleFunc("/info", p.infoHandler) http.HandleFunc("/pgstate", p.pgStateHandler) go func() { endApiCh <- http.ListenAndServe(fmt.Sprintf("%s:%s", p.listenAddress, p.port), nil) }() ctx, cancel := context.WithCancel(context.Background()) smTimerCh := time.NewTimer(0).C updatePGStateTimerCh := time.NewTimer(0).C for true { select { case <-p.stop: log.Debugf("stopping stolon keeper") cancel() p.pgm.Stop(true) p.end <- nil return case <-smTimerCh: go func() { p.postgresKeeperSM(ctx) endSMCh <- struct{}{} }() case <-endSMCh: smTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C case <-updatePGStateTimerCh: go func() { p.updatePGState(ctx) endPgStatecheckerCh <- struct{}{} }() case <-endPgStatecheckerCh: updatePGStateTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C case err := <-endApiCh: if err != nil { log.Fatal("ListenAndServe: ", err) } close(p.stop) } } }
func TestUpdateClusterView(t *testing.T) { tests := []struct { cv *cluster.ClusterView keepersState cluster.KeepersState outCV *cluster.ClusterView err error }{ { cv: cluster.NewClusterView(), keepersState: nil, outCV: cluster.NewClusterView(), err: fmt.Errorf("cannot init cluster, no keepers registered"), }, // cluster initialization, one keeper { cv: cluster.NewClusterView(), keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{PGState: &cluster.PostgresState{Initialized: true}}, }, outCV: &cluster.ClusterView{ Version: 1, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, }, }, }, // cluster initialization, too many keepers { cv: cluster.NewClusterView(), keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{}, "02": &cluster.KeeperState{}, }, outCV: cluster.NewClusterView(), err: fmt.Errorf("cannot init cluster, more than 1 keeper registered"), }, // One master and one standby, both healthy: no change from previous cv { cv: &cluster.ClusterView{ Version: 1, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, "02": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, }, outCV: &cluster.ClusterView{ Version: 1, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, }, // One master and one standby, master not healthy: standby elected as new master { cv: &cluster.ClusterView{ Version: 1, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Unix(0, 0), Healthy: false, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, "02": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, }, outCV: &cluster.ClusterView{ Version: 2, Master: "02", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: ""}, }, }, }, // From the previous test, new master (02) converged. Old master setup to follow new master (02). { cv: &cluster.ClusterView{ Version: 2, Master: "02", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: ""}, }, }, keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Unix(0, 0), Healthy: false, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, "02": &cluster.KeeperState{ ClusterViewVersion: 2, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, }, outCV: &cluster.ClusterView{ Version: 3, Master: "02", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: "02"}, "02": &cluster.KeeperRole{ID: "02", Follow: ""}, }, }, }, // One master and one standby, master not healthy, standby with old // clusterview: no standby elected as new master. { cv: &cluster.ClusterView{ Version: 2, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{ ClusterViewVersion: 2, ErrorStartTime: time.Unix(0, 0), Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, "02": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, }, outCV: &cluster.ClusterView{ Version: 2, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, }, // One master and one standby, master not converged to current // cv: standby elected as new master. { cv: &cluster.ClusterView{ Version: 2, Master: "01", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: "01"}, }, }, keepersState: cluster.KeepersState{ "01": &cluster.KeeperState{ ClusterViewVersion: 1, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, "02": &cluster.KeeperState{ ClusterViewVersion: 2, ErrorStartTime: time.Time{}, Healthy: true, PGState: &cluster.PostgresState{ TimelineID: 0, }, }, }, outCV: &cluster.ClusterView{ Version: 3, Master: "02", KeepersRole: cluster.KeepersRole{ "01": &cluster.KeeperRole{ID: "01", Follow: ""}, "02": &cluster.KeeperRole{ID: "02", Follow: ""}, }, }, }, } s := &Sentinel{id: "id", clusterConfig: cluster.NewDefaultConfig()} for i, tt := range tests { outCV, err := s.updateClusterView(tt.cv, tt.keepersState) t.Logf("test #%d", i) t.Logf(spew.Sprintf("outCV: %#v", outCV)) if tt.err != nil { if err == nil { t.Errorf("got no error, wanted error: %v", tt.err) } else if tt.err.Error() != err.Error() { t.Errorf("got error: %v, wanted error: %v", err, tt.err) } } else { if err != nil { t.Errorf("unexpected error: %v", err) } if !outCV.Equals(tt.outCV) { t.Errorf(spew.Sprintf("#%d: wrong outCV: got: %#v, want: %#v", i, outCV, tt.outCV)) } } } }
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) { s.updateMutex.Lock() defer s.updateMutex.Unlock() e := s.e cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var keepersState cluster.KeepersState if cd == nil { cv = cluster.NewClusterView() keepersState = nil } else { cv = cd.ClusterView keepersState = cd.KeepersState } log.Debugf(spew.Sprintf("keepersState: %#v", keepersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) // Update cluster config // This shouldn't need a lock s.clusterConfig = cv.Config.ToConfig() if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersPGState := getKeepersPGState(ctx, keepersInfo) cancel() log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } if err := s.setLeaderSentinelInfo(leaseTTL); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } if cv.Version == 0 { // Cluster first initialization newcv := cluster.NewClusterView() newcv.Version = 1 _, err = e.SetClusterData(nil, newcv, 0) if err != nil { log.Errorf("error saving clusterdata: %v", err) } return } newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState) log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState)) newcv, err := s.updateClusterView(cv, newKeepersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") } _, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }
func (s *Sentinel) clusterSentinelSM(pctx context.Context) { e := s.e // Update cluster config clusterConfig, _, err := e.GetClusterConfig() if err != nil { log.Errorf("cannot get cluster config: %v", err) return } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock s.clusterConfig = clusterConfig // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersPGState := getMembersPGState(ctx, membersInfo) cancel() log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var membersState cluster.MembersState if cd == nil { cv = cluster.NewClusterView() membersState = nil } else { cv = cd.ClusterView membersState = cd.MembersState } log.Debugf(spew.Sprintf("membersState: %#v", membersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) pv, res, err := e.GetProxyView() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("proxyview: %#v", pv)) var prevPVIndex uint64 if res != nil { prevPVIndex = res.Node.ModifiedIndex } newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState) log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState)) newcv, err := s.updateClusterView(cv, newMembersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil { log.Errorf("error updating proxyView: %v", err) return } } _, err = e.SetClusterData(newMembersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }