func (s *Sentinel) updateProxyConf(prevCV *cluster.ClusterView, cv *cluster.ClusterView, keepersState cluster.KeepersState) {
	masterID := cv.Master
	if prevCV.Master != masterID {
		log.Infof("deleting proxyconf")
		// Tell proxy to close connection to old master
		cv.ProxyConf = nil
		return
	}

	master, _ := keepersState[masterID]
	if s.isKeeperConverged(master, prevCV) {
		pc := &cluster.ProxyConf{
			Host: master.PGListenAddress,
			Port: master.PGPort,
		}
		prevPC := prevCV.ProxyConf
		update := true
		if prevPC != nil {
			if prevPC.Host == pc.Host && prevPC.Port == pc.Port {
				update = false
			}
		}
		if update {
			log.Infof("updating proxyconf to %s:%s", pc.Host, pc.Port)
			cv.ProxyConf = pc
		}
	}
	return
}
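// For reference, a minimal sketch of the cluster.ProxyConf shape implied by the
// literal above. The field names come from the code; the string types are an
// assumption based on the %s formatting of Host and Port, and the real stolon
// definition may differ.
type proxyConfSketch struct {
	Host string // keeper's PGListenAddress
	Port string // keeper's PGPort
}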
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	cd, _, err := e.GetClusterData()
	if err != nil {
		return nil, fmt.Errorf("error retrieving cluster data: %v", err)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	p := &PostgresKeeper{
		id:              id,
		dataDir:         cfg.dataDir,
		e:               e,
		listenAddress:   cfg.listenAddress,
		port:            cfg.port,
		pgListenAddress: cfg.pgListenAddress,
		pgPort:          cfg.pgPort,
		clusterConfig:   clusterConfig,
		stop:            stop,
		end:             end,
	}

	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create postgres manager: %v", err)
	}
	p.pgm = pgm

	return p, nil
}
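// Hypothetical wiring of a keeper at process startup, based only on the
// constructor signature above; runKeeper and the "keeper01" id are illustrative
// names, not stolon's actual main() code.
func runKeeper(cfg config) error {
	stop := make(chan bool)
	end := make(chan error)
	keeper, err := NewPostgresKeeper("keeper01", cfg, stop, end)
	if err != nil {
		return fmt.Errorf("cannot create keeper: %v", err)
	}
	go keeper.Start()
	// Start reports its exit status (nil on clean shutdown) on the end channel.
	return <-end
}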
func printTree(id string, cv *cluster.ClusterView, level int, prefix string, tail bool) {
	out := prefix
	if level > 0 {
		if tail {
			out += "└─"
		} else {
			out += "├─"
		}
	}
	out += id
	if id == cv.Master {
		out += " (master)"
	}
	stdout(out)

	followersIDs := cv.GetFollowersIDs(id)
	c := len(followersIDs)
	for i, f := range followersIDs {
		emptyspace := ""
		if level > 0 {
			emptyspace = "  "
		}
		linespace := "│ "
		if i < c-1 {
			if tail {
				printTree(f, cv, level+1, prefix+emptyspace, false)
			} else {
				printTree(f, cv, level+1, prefix+linespace, false)
			}
		} else {
			if tail {
				printTree(f, cv, level+1, prefix+emptyspace, true)
			} else {
				printTree(f, cv, level+1, prefix+linespace, true)
			}
		}
	}
}
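// Hypothetical entry point for the tree printer: render the whole cluster
// starting at the current master with an empty prefix. The invocation shape is
// inferred from the signature above; the real stolonctl status command may differ.
func printClusterTree(cv *cluster.ClusterView) {
	printTree(cv.Master, cv, 0, "", true)
}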
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endApiCh := make(chan error)

	var err error
	var cd *cluster.ClusterData
	// TODO(sgotti) make the postgres manager stateless and instantiate a
	// new one at every check loop, this will avoid the need to loop here
	// to get the clusterconfig
	for {
		cd, _, err = p.e.GetClusterData()
		if err == nil {
			break
		}
		log.Errorf("error retrieving cluster data: %v", err)
		time.Sleep(cluster.DefaultSleepInterval)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	p.clusterConfig = cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", p.clusterConfig))

	if err := p.loadCVVersion(); err != nil {
		p.end <- fmt.Errorf("failed to load cluster version file: %v", err)
		return
	}

	// TODO(sgotti) reconfigure the various configurations options (PGRepl*
	// and RequestTimeout) after a changed cluster config
	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm := postgresql.NewManager(p.id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), p.clusterConfig.PGReplUser, p.clusterConfig.PGReplPassword, p.clusterConfig.RequestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	http.HandleFunc("/info", p.infoHandler)
	http.HandleFunc("/pgstate", p.pgStateHandler)
	go func() {
		endApiCh <- http.ListenAndServe(fmt.Sprintf("%s:%s", p.listenAddress, p.port), nil)
	}()

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	for {
		select {
		case <-p.stop:
			log.Debugf("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return
		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()
		case <-endSMCh:
			smTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case <-updatePGStateTimerCh:
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()
		case <-endPgStatecheckerCh:
			updatePGStateTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case err := <-endApiCh:
			if err != nil {
				log.Fatal("ListenAndServe: ", err)
			}
			close(p.stop)
		}
	}
}
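// Standalone sketch of the timer pattern used in the loop above: each step runs
// in its own goroutine, an "end" channel reports completion, and the timer is
// re-armed only after the step finishes, so consecutive steps never overlap.
// loopWithoutOverlap, runStep and the one-second interval are illustrative
// names, not stolon code.
func loopWithoutOverlap(runStep func(), stopCh chan struct{}) {
	stepTimerCh := time.NewTimer(0).C
	endStepCh := make(chan struct{})
	for {
		select {
		case <-stopCh:
			return
		case <-stepTimerCh:
			go func() {
				runStep()
				endStepCh <- struct{}{}
			}()
		case <-endStepCh:
			// The previous step is done; schedule the next one.
			stepTimerCh = time.NewTimer(time.Second).C
		}
	}
}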
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, keepersState cluster.KeepersState) (*cluster.ClusterView, error) {
	var wantedMasterID string
	if cv.Master == "" {
		log.Debugf("trying to find initial master")
		// Check for an initial master
		if len(keepersState) < 1 {
			return nil, fmt.Errorf("cannot init cluster, no keepers registered")
		}
		if len(keepersState) > 1 {
			return nil, fmt.Errorf("cannot init cluster, more than 1 keeper registered")
		}
		for id, k := range keepersState {
			if k.PGState == nil {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since its pg state is unknown", id)
			}
			if !k.PGState.Initialized {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since pg instance is not initialized", id)
			}
			log.Infof("initializing cluster with master: %q", id)
			wantedMasterID = id
			break
		}
	} else {
		masterID := cv.Master
		wantedMasterID = masterID

		masterOK := true
		master, ok := keepersState[masterID]
		if !ok {
			return nil, fmt.Errorf("keeper state for master %q not available. This shouldn't happen!", masterID)
		}
		log.Debugf(spew.Sprintf("masterState: %#v", master))

		if !master.Healthy {
			log.Infof("master is failed")
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if !s.isKeeperConverged(master, cv) {
			log.Infof("keeper %s not yet master", masterID)
			masterOK = false
		}

		if !masterOK {
			log.Infof("trying to find a standby to replace failed master")
			bestStandby, err := s.GetBestStandby(cv, keepersState, masterID)
			if err != nil {
				log.Errorf("error trying to find the best standby: %v", err)
			} else {
				if bestStandby != masterID {
					log.Infof("electing new master: %q", bestStandby)
					wantedMasterID = bestStandby
				} else {
					log.Infof("cannot find a good standby to replace failed master")
				}
			}
		}
	}

	newCV := cv.Copy()
	newKeepersRole := newCV.KeepersRole

	// Add new keepersRole from keepersState
	for id := range keepersState {
		if _, ok := newKeepersRole[id]; !ok {
			if err := newKeepersRole.Add(id, ""); err != nil {
				// This shouldn't happen
				panic(err)
			}
		}
	}

	// Setup master role
	if cv.Master != wantedMasterID {
		newCV.Master = wantedMasterID
		newKeepersRole[wantedMasterID].Follow = ""
	}

	// Setup standbys
	if cv.Master == wantedMasterID {
		// wanted master is the previous one
		masterState := keepersState[wantedMasterID]
		if masterState.Healthy && s.isKeeperConverged(masterState, cv) {
			for id := range newKeepersRole {
				if id == wantedMasterID {
					continue
				}
				newKeepersRole[id].Follow = wantedMasterID
			}
		}
	}

	s.updateProxyConf(cv, newCV, keepersState)

	if !newCV.Equals(cv) {
		newCV.Version = cv.Version + 1
		newCV.ChangeTime = time.Now()
	}
	return newCV, nil
}
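// Hypothetical use of updateClusterView inside the sentinel's check loop. Only
// updateClusterView and the cluster types above are taken from the code;
// checkOnce is an illustrative sketch, and publishing newCV back to the store
// (e.g. etcd) is intentionally left out.
func (s *Sentinel) checkOnce(cv *cluster.ClusterView, keepersState cluster.KeepersState) *cluster.ClusterView {
	newCV, err := s.updateClusterView(cv, keepersState)
	if err != nil {
		log.Errorf("cannot compute new cluster view: %v", err)
		return cv
	}
	if newCV.Version != cv.Version {
		// A version bump means something changed: master election, keeper roles or the proxyconf.
		log.Infof("cluster view changed, new version: %d", newCV.Version)
	}
	return newCV
}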
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, membersState cluster.MembersState) (*cluster.ClusterView, error) {
	var wantedMasterID string
	// Cluster first initialization
	if cv.Version == 0 {
		log.Debugf("trying to find initial master")
		// Check for an initial master
		if len(membersState) < 1 {
			return nil, fmt.Errorf("cannot init cluster, no members registered")
		}
		if len(membersState) > 1 {
			return nil, fmt.Errorf("cannot init cluster, more than 1 member registered")
		}
		for id, m := range membersState {
			if m.PGState == nil {
				return nil, fmt.Errorf("cannot init cluster using member %q since its pg state is unknown", id)
			}
			log.Infof("Initializing cluster with master: %q", id)
			wantedMasterID = id
			break
		}
	} else {
		masterID := cv.Master

		masterOK := true
		master, ok := membersState[masterID]
		if !ok {
			return nil, fmt.Errorf("member state for master %q not available. This shouldn't happen!", masterID)
		}
		log.Debugf(spew.Sprintf("masterState: %#v", master))

		if !s.isMemberHealthy(master) {
			log.Infof("master is failed")
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if !s.isMemberConverged(master, cv) {
			log.Infof("member %s not yet master", masterID)
			masterOK = false
		}

		wantedMasterID = masterID
		if !masterOK {
			log.Infof("trying to find a standby to replace failed master")
			bestStandby, err := s.GetBestStandby(cv, membersState, masterID)
			if err != nil {
				log.Errorf("error trying to find the best standby: %v", err)
			} else {
				if bestStandby != masterID {
					log.Infof("electing new master: %q", bestStandby)
					wantedMasterID = bestStandby
				} else {
					log.Infof("cannot find a good standby to replace failed master")
				}
			}
		}
	}

	newCV := cv.Copy()
	newMembersRole := newCV.MembersRole

	// Add new members from membersState
	for id := range membersState {
		if _, ok := newMembersRole[id]; !ok {
			newMembersRole[id] = &cluster.MemberRole{}
		}
	}

	// Setup master role
	if cv.Master != wantedMasterID {
		newCV.Master = wantedMasterID
		newMembersRole[wantedMasterID] = &cluster.MemberRole{Follow: ""}
	}

	// Setup standbys
	if cv.Master == wantedMasterID {
		// wanted master is the previous one
		masterState := membersState[wantedMasterID]
		if s.isMemberHealthy(masterState) && s.isMemberConverged(masterState, cv) {
			for id := range newMembersRole {
				if id == wantedMasterID {
					continue
				}
				newMembersRole[id] = &cluster.MemberRole{Follow: wantedMasterID}
			}
		}
	}

	if !newCV.Equals(cv) {
		newCV.Version = cv.Version + 1
		newCV.ChangeTime = time.Now()
	}
	return newCV, nil
}