Example #1: NewPostgresKeeper constructor
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	cd, _, err := e.GetClusterData()
	if err != nil {
		return nil, fmt.Errorf("error retrieving cluster data: %v", err)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	p := &PostgresKeeper{id: id,
		dataDir:         cfg.dataDir,
		e:               e,
		listenAddress:   cfg.listenAddress,
		port:            cfg.port,
		pgListenAddress: cfg.pgListenAddress,
		pgPort:          cfg.pgPort,
		clusterConfig:   clusterConfig,
		stop:            stop,
		end:             end,
	}

	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create postgres manager: %v", err)
	}
	p.pgm = pgm
	return p, nil
}
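For orientation, here is a minimal sketch of how a caller might wire this constructor up, reusing the Start method shown in Example #3. It assumes the snippet lives in the same package, that cfg is an already-populated config value, and that the keeper id and error handling are purely illustrative; none of this wiring comes from the examples themselves.

	stop := make(chan bool)
	end := make(chan error)
	// "keeper01" is a hypothetical id; cfg is assumed to be filled from flags or env.
	keeper, err := NewPostgresKeeper("keeper01", cfg, stop, end)
	if err != nil {
		log.Fatal("cannot create keeper: ", err)
	}
	go keeper.Start()
	// Block until the keeper reports termination on the end channel.
	if err := <-end; err != nil {
		log.Fatal("keeper exited with error: ", err)
	}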
Example #2: NewSentinel constructor
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	cd, _, err := e.GetClusterData()
	if err != nil {
		return nil, fmt.Errorf("error retrieving cluster data: %v", err)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	lManager := e.NewLeaseManager()

	return &Sentinel{
		id:            id,
		e:             e,
		listenAddress: cfg.listenAddress,
		port:          cfg.port,
		lManager:      lManager,
		clusterConfig: clusterConfig,
		stop:          stop,
		end:           end}, nil
}
Example #3: PostgresKeeper.Start main loop
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endApiCh := make(chan error)

	var err error
	var cd *cluster.ClusterData
	// TODO(sgotti) make the postgres manager stateless and instantiate a
	// new one at every check loop, this will avoid the need to loop here
	// to get the clusterconfig
	for {
		cd, _, err = p.e.GetClusterData()
		if err == nil {
			break
		}
		log.Errorf("error retrieving cluster data: %v", err)
		time.Sleep(cluster.DefaultSleepInterval)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	p.clusterConfig = cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", p.clusterConfig))

	if err := p.loadCVVersion(); err != nil {
		p.end <- fmt.Errorf("failed to load cluster version file: %v", err)
		return
	}

	// TODO(sgotti) reconfigure the various configurations options (PGRepl*
	// and RequestTimeout) after a changed cluster config
	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm := postgresql.NewManager(p.id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), p.clusterConfig.PGReplUser, p.clusterConfig.PGReplPassword, p.clusterConfig.RequestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	http.HandleFunc("/info", p.infoHandler)
	http.HandleFunc("/pgstate", p.pgStateHandler)
	go func() {
		endApiCh <- http.ListenAndServe(fmt.Sprintf("%s:%s", p.listenAddress, p.port), nil)
	}()

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	for {
		select {
		case <-p.stop:
			log.Debugf("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return
		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()
		case <-endSMCh:
			smTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case <-updatePGStateTimerCh:
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()
		case <-endPgStatecheckerCh:
			updatePGStateTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case err := <-endApiCh:
			if err != nil {
				log.Fatal("ListenAndServe: ", err)
			}
			close(p.stop)
		}
	}
}
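The select loop above implements a re-arming timer pattern: a timer fires once, the state machine (or PG state check) runs in its own goroutine, and the timer is re-created only after that goroutine signals completion on its end channel, so consecutive runs never overlap. A small self-contained sketch of the same pattern, with purely illustrative names and intervals:

package main

import (
	"fmt"
	"time"
)

// doWork stands in for postgresKeeperSM / updatePGState above.
func doWork() {
	fmt.Println("checking state...")
	time.Sleep(200 * time.Millisecond)
}

func main() {
	stop := time.After(3 * time.Second) // stand-in for the keeper's stop channel
	workTimerCh := time.NewTimer(0).C   // fire immediately on startup
	endWorkCh := make(chan struct{})
	for {
		select {
		case <-stop:
			return
		case <-workTimerCh:
			go func() {
				doWork()
				endWorkCh <- struct{}{}
			}()
		case <-endWorkCh:
			// Re-arm only after the previous run has finished.
			workTimerCh = time.NewTimer(time.Second).C
		}
	}
}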
Example #4: TestUpdateClusterView (sentinel table-driven test)
func TestUpdateClusterView(t *testing.T) {
	tests := []struct {
		cv           *cluster.ClusterView
		keepersState cluster.KeepersState
		outCV        *cluster.ClusterView
		err          error
	}{
		{
			cv:           cluster.NewClusterView(),
			keepersState: nil,
			outCV:        cluster.NewClusterView(),
			err:          fmt.Errorf("cannot init cluster, no keepers registered"),
		},
		// cluster initialization, one keeper
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{PGState: &cluster.PostgresState{Initialized: true}},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
				},
			},
		},
		// cluster initialization, too many keepers
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{},
				"02": &cluster.KeeperState{},
			},
			outCV: cluster.NewClusterView(),
			err:   fmt.Errorf("cannot init cluster, more than 1 keeper registered"),
		},
		// One master and one standby, both healthy: no change from previous cv
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not healthy: standby elected as new master
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
		// From the previous test, new master (02) converged. Old master setup to follow new master (02).
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: "02"},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},

		// One master and one standby, master not healthy, standby with old
		// clusterview: no standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not converged to current
		// cv: standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
	}

	s := &Sentinel{id: "id", clusterConfig: cluster.NewDefaultConfig()}
	for i, tt := range tests {
		outCV, err := s.updateClusterView(tt.cv, tt.keepersState)
		t.Logf("test #%d", i)
		t.Logf(spew.Sprintf("outCV: %#v", outCV))
		if tt.err != nil {
			if err == nil {
				t.Errorf("got no error, wanted error: %v", tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("got error: %v, wanted error: %v", err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if !outCV.Equals(tt.outCV) {
				t.Errorf(spew.Sprintf("#%d: wrong outCV: got: %#v, want: %#v", i, outCV, tt.outCV))
			}
		}
	}
}
Example #5: Sentinel.clusterSentinelCheck
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
	s.updateMutex.Lock()
	defer s.updateMutex.Unlock()
	e := s.e

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var keepersState cluster.KeepersState
	if cd == nil {
		cv = cluster.NewClusterView()
		keepersState = nil
	} else {
		cv = cd.ClusterView
		keepersState = cd.KeepersState
	}
	log.Debugf(spew.Sprintf("keepersState: %#v", keepersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	// Update cluster config
	// This shouldn't need a lock
	s.clusterConfig = cv.Config.ToConfig()

	if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersPGState := getKeepersPGState(ctx, keepersInfo)
	cancel()
	log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && s.l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", s.l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}
	if err := s.setLeaderSentinelInfo(leaseTTL); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	if cv.Version == 0 {
		// Cluster first initialization
		newcv := cluster.NewClusterView()
		newcv.Version = 1
		_, err = e.SetClusterData(nil, newcv, 0)
		if err != nil {
			log.Errorf("error saving clusterdata: %v", err)
		}
		return
	}

	newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState)
	log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState))

	newcv, err := s.updateClusterView(cv, newKeepersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
	}

	_, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}
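Note how prevCDIndex is threaded through: the sentinel remembers the etcd ModifiedIndex at which it read the cluster data and passes it to SetClusterData, so the write succeeds only if no other sentinel updated the data in between (optimistic concurrency). Below is a sketch of that index-based compare-and-swap done directly against the etcd v2 client; stolon's etcdm manager is assumed to wrap a call of this shape, and the key name, endpoint, and value are illustrative.

package main

import (
	"context"
	"log"

	"github.com/coreos/etcd/client"
)

func main() {
	c, err := client.New(client.Config{Endpoints: []string{"http://127.0.0.1:2379"}})
	if err != nil {
		log.Fatal(err)
	}
	kapi := client.NewKeysAPI(c)

	// Read the current value and remember the index it was last modified at.
	res, err := kapi.Get(context.Background(), "/example/clusterdata", nil)
	if err != nil {
		log.Fatal(err)
	}
	prevIndex := res.Node.ModifiedIndex

	// The write is accepted only if the key still has that ModifiedIndex,
	// i.e. nobody else updated it since we read it.
	_, err = kapi.Set(context.Background(), "/example/clusterdata", "new cluster data",
		&client.SetOptions{PrevIndex: prevIndex})
	if err != nil {
		log.Printf("lost the update race, will retry on the next check loop: %v", err)
	}
}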
Example #6: Sentinel.clusterSentinelSM
func (s *Sentinel) clusterSentinelSM(pctx context.Context) {
	e := s.e

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	s.clusterConfig = clusterConfig

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersPGState := getMembersPGState(ctx, membersInfo)
	cancel()
	log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && s.l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", s.l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var membersState cluster.MembersState
	if cd == nil {
		cv = cluster.NewClusterView()
		membersState = nil
	} else {
		cv = cd.ClusterView
		membersState = cd.MembersState
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	pv, res, err := e.GetProxyView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("proxyview: %#v", pv))

	var prevPVIndex uint64
	if res != nil {
		prevPVIndex = res.Node.ModifiedIndex
	}

	newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState)
	log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState))

	newcv, err := s.updateClusterView(cv, newMembersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
		if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil {
			log.Errorf("error updating proxyView: %v", err)
			return
		}
	}

	_, err = e.SetClusterData(newMembersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}