Example #1
func (s *Sentinel) updateConfigHandler(w http.ResponseWriter, req *http.Request) {
	defer req.Body.Close()

	vars := mux.Vars(req)
	configName := vars["name"]

	// only configName == "current" is currently supported
	if configName != "current" {
		log.Errorf("wrong config name %q", configName)
		http.Error(w, fmt.Sprintf("wrong config name %q", configName), http.StatusBadRequest)
		return
	}

	decoder := json.NewDecoder(req.Body)
	var config *cluster.NilConfig
	err := decoder.Decode(&config)
	if err != nil {
		log.Errorf("failed to decode config: %v", err)
		http.Error(w, fmt.Sprintf("failed to decode config: %v", err), http.StatusBadRequest)
		return
	}

	s.updateMutex.Lock()
	defer s.updateMutex.Unlock()

	if !s.isLeader() {
		log.Errorf("we aren't the sentinels leader. cannot process config update request.")
		http.Error(w, "we aren't the sentinels leader. cannot process config update request.", http.StatusBadRequest)
		return
	}

	log.Infof(spew.Sprintf("updating config to %#v", config))

	e := s.e

	cd, pair, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		http.Error(w, fmt.Sprintf("error retrieving cluster data: %v", err), http.StatusInternalServerError)
		return
	}

	if cd == nil {
		log.Errorf("empty cluster data")
		http.Error(w, "empty cluster data", http.StatusInternalServerError)
		return
	}
	if cd.ClusterView == nil {
		log.Errorf("empty cluster view")
		http.Error(w, "empty cluster view", http.StatusInternalServerError)
		return
	}
	log.Debugf(spew.Sprintf("keepersState: %#v", cd.KeepersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cd.ClusterView))

	newcv := cd.ClusterView.Copy()
	newcv.Config = config
	newcv.Version += 1
	if _, err := e.SetClusterData(cd.KeepersState, newcv, pair); err != nil {
		log.Errorf("error saving clusterdata: %v", err)
		http.Error(w, fmt.Sprintf("error saving clusterdata: %v", err), http.StatusInternalServerError)
	}
}
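
A minimal sketch of how this handler could be registered, assuming a gorilla/mux router (mux.Vars above implies one) and a hypothetical "/config/{name}" route; the path, HTTP method, and listen address are assumptions, not taken from the original code.

func (s *Sentinel) serveConfigAPI(listenAddress string) error {
	// Hypothetical wiring for updateConfigHandler; route path and method are assumptions.
	router := mux.NewRouter()
	router.HandleFunc("/config/{name}", s.updateConfigHandler).Methods("PUT")
	return http.ListenAndServe(listenAddress, router)
}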
Example #2
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		return nil, fmt.Errorf("cannot get cluster config: %v", err)
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	p := &PostgresKeeper{id: id,
		dataDir:         cfg.dataDir,
		e:               e,
		listenAddress:   cfg.listenAddress,
		port:            cfg.port,
		pgListenAddress: cfg.pgListenAddress,
		pgPort:          cfg.pgPort,
		clusterConfig:   clusterConfig,
		stop:            stop,
		end:             end,
	}

	serverParameters := p.createServerParameters()
	pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, serverParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create postgres manager: %v", err)
	}
	p.pgm = pgm
	return p, nil
}
Example #3
func (c *ClusterChecker) Check() {
	cv, _, err := c.e.GetClusterView()
	if err != nil {
		log.Errorf("cannot get clusterview: %v", err)
		c.C <- pollon.ConfData{DestAddr: nil}
		return
	}
	log.Debugf(spew.Sprintf("clusterview: %#v", cv))
	if cv == nil {
		log.Infof("no clusterview available, closing connections to previous master")
		c.C <- pollon.ConfData{DestAddr: nil}
		return
	}
	pc := cv.ProxyConf
	if pc == nil {
		log.Infof("no proxyconf available, closing connections to previous master")
		c.C <- pollon.ConfData{DestAddr: nil}
		if err := c.SetProxyInfo(c.e, cv.Version, 2*cluster.DefaultProxyCheckInterval); err != nil {
			log.Errorf("failed to update proxyInfo: %v", err)
		}
		return
	}
	addr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%s", pc.Host, pc.Port))
	if err != nil {
		log.Errorf("err: %v", err)
		c.C <- pollon.ConfData{DestAddr: nil}
		return
	}
	log.Infof("master address: %v", addr)
	if err = c.SetProxyInfo(c.e, cv.Version, 2*cluster.DefaultProxyCheckInterval); err != nil {
		log.Errorf("failed to update proxyInfo: %v", err)
	}
	c.C <- pollon.ConfData{DestAddr: addr}
}
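
A sketch of how Check could be driven in a loop, assuming the proxy polls at cluster.DefaultProxyCheckInterval (already referenced above) and stops via a channel; the Start method and the stop channel are assumptions.

func (c *ClusterChecker) Start(stop <-chan struct{}) {
	// Hypothetical polling loop around Check; interval and stop channel are assumptions.
	ticker := time.NewTicker(cluster.DefaultProxyCheckInterval)
	defer ticker.Stop()
	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
			c.Check()
		}
	}
}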
Example #4
func NewPostgresKeeper(id string, cfg config, stop chan bool, end chan error) (*PostgresKeeper, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	cd, _, err := e.GetClusterData()
	if err != nil {
		return nil, fmt.Errorf("error retrieving cluster data: %v", err)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	p := &PostgresKeeper{id: id,
		dataDir:         cfg.dataDir,
		e:               e,
		listenAddress:   cfg.listenAddress,
		port:            cfg.port,
		pgListenAddress: cfg.pgListenAddress,
		pgPort:          cfg.pgPort,
		clusterConfig:   clusterConfig,
		stop:            stop,
		end:             end,
	}

	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm, err := postgresql.NewManager(id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), clusterConfig.PGReplUser, clusterConfig.PGReplPassword, clusterConfig.RequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create postgres manager: %v", err)
	}
	p.pgm = pgm
	return p, nil
}
Example #5
func (s *Sentinel) setLeaderSentinelInfo(ttl time.Duration) error {
	sentinelInfo := &cluster.SentinelInfo{
		ID:            s.id,
		ListenAddress: s.listenAddress,
		Port:          s.port,
	}
	log.Debugf(spew.Sprintf("sentinelInfo: %#v", sentinelInfo))

	if _, err := s.e.SetLeaderSentinelInfo(sentinelInfo, ttl); err != nil {
		return err
	}
	return nil
}
Example #6
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	cd, _, err := e.GetClusterData()
	if err != nil {
		return nil, fmt.Errorf("error retrieving cluster data: %v", err)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	lManager := e.NewLeaseManager()

	return &Sentinel{
		id:            id,
		e:             e,
		listenAddress: cfg.listenAddress,
		port:          cfg.port,
		lManager:      lManager,
		clusterConfig: clusterConfig,
		stop:          stop,
		end:           end}, nil
}
Example #7
func (c *ClusterChecker) SetProxyInfo(e *store.StoreManager, cvVersion int, ttl time.Duration) error {
	proxyInfo := &cluster.ProxyInfo{
		ID:                 c.id,
		ListenAddress:      c.listenAddress,
		Port:               c.port,
		ClusterViewVersion: cvVersion,
	}
	log.Debugf(spew.Sprintf("proxyInfo: %#v", proxyInfo))

	if err := c.e.SetProxyInfo(proxyInfo, ttl); err != nil {
		return err
	}
	return nil
}
Example #8
func (p *PostgresKeeper) publish() error {
	if kubernetes.OnKubernetes() {
		log.Infof("running under kubernetes. Not using store discovery")
		return nil
	}
	discoveryInfo := &cluster.KeeperDiscoveryInfo{
		ListenAddress: p.listenAddress,
		Port:          p.port,
	}
	log.Debugf(spew.Sprintf("discoveryInfo: %#v", discoveryInfo))

	if err := p.e.SetKeeperDiscoveryInfo(p.id, discoveryInfo); err != nil {
		return err
	}
	return nil
}
Example #9
func (p *PostgresKeeper) publish() error {
	if kubernetes.OnKubernetes() {
		log.Infof("running under kubernetes. Not publishing ourself to etcd")
		return nil
	}
	discoveryInfo := &cluster.MemberDiscoveryInfo{
		Host: p.listenAddress,
		Port: p.port,
	}
	log.Debugf(spew.Sprintf("discoveryInfo: %#v", discoveryInfo))

	if _, err := p.e.SetMemberDiscoveryInfo(p.id, discoveryInfo); err != nil {
		return err
	}
	return nil
}
Example #10
func NewSentinel(id string, cfg config, stop chan bool, end chan bool) (*Sentinel, error) {
	etcdPath := filepath.Join(common.EtcdBasePath, cfg.clusterName)
	e, err := etcdm.NewEtcdManager(cfg.etcdEndpoints, etcdPath, common.DefaultEtcdRequestTimeout)
	if err != nil {
		return nil, fmt.Errorf("cannot create etcd manager: %v", err)
	}

	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		return nil, fmt.Errorf("cannot get cluster config: %v", err)
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))

	lManager := e.NewLeaseManager()

	return &Sentinel{id: id, e: e, lManager: lManager, clusterConfig: clusterConfig, stop: stop, end: end}, nil
}
Example #11
func TestParseTimeLineHistory(t *testing.T) {
	tests := []struct {
		contents string
		tlsh     cluster.PostgresTimeLinesHistory
		err      error
	}{
		{
			contents: "",
			tlsh:     cluster.PostgresTimeLinesHistory{},
			err:      nil,
		},
		{
			contents: `1	0/5000090	no recovery target specified`,
			tlsh: cluster.PostgresTimeLinesHistory{
				{
					TimelineID:  1,
					SwitchPoint: 83886224,
					Reason:      "no recovery target specified",
				},
			},
			err: nil,
		},
	}

	for i, tt := range tests {
		tlsh, err := parseTimeLinesHistory(tt.contents)
		t.Logf("test #%d", i)
		if tt.err != nil {
			if err == nil {
				t.Errorf("got no error, wanted error: %v", tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("got error: %v, wanted error: %v", err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if !reflect.DeepEqual(tlsh, tt.tlsh) {
				t.Errorf(spew.Sprintf("#%d: wrong timeline history: got: %#+v, want: %#+v", i, tlsh, tt.tlsh))
			}
		}
	}

}
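
The expected SwitchPoint of 83886224 is the decoded XLog location from the second column of the history line: assuming the usual high/low hexadecimal LSN notation, "0/5000090" is 0<<32 | 0x5000090 = 83886224. A small sketch of that arithmetic (parseTimeLinesHistory is assumed to do something equivalent):

// Hypothetical helper illustrating the LSN decoding behind the expected value.
func parseLSN(s string) (uint64, error) {
	var hi, lo uint64
	if _, err := fmt.Sscanf(s, "%X/%X", &hi, &lo); err != nil {
		return 0, err
	}
	return hi<<32 | lo, nil // "0/5000090" -> 0x5000090 == 83886224
}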
Example #12
func (s *Sentinel) GetBestStandby(cv *cluster.ClusterView, keepersState cluster.KeepersState, master string) (string, error) {
	var bestID string
	masterState := keepersState[master]
	for id, k := range keepersState {
		log.Debugf(spew.Sprintf("id: %s, k: %#v", id, k))
		if id == master {
			log.Debugf("ignoring node %q since it's the current master", id)
			continue
		}
		if !k.Healthy {
			log.Debugf("ignoring node %q since it's not healthy", id)
			continue
		}
		if k.ClusterViewVersion != cv.Version {
			log.Debugf("ignoring node since its clusterView version (%d) is different that the actual one (%d)", k.ClusterViewVersion, cv.Version)
			continue
		}
		if k.PGState == nil {
			log.Debugf("ignoring node since its pg state is unknown")
			continue
		}
		if masterState.PGState.TimelineID != k.PGState.TimelineID {
			log.Debugf("ignoring node since its pg timeline (%s) is different than master timeline (%d)", keepersState[id].PGState.TimelineID, masterState.PGState.TimelineID)
			continue
		}
		if bestID == "" {
			bestID = id
			continue
		}
		if k.PGState.XLogPos > keepersState[bestID].PGState.XLogPos {
			bestID = id
		}
	}
	if bestID == "" {
		return "", fmt.Errorf("no standbys available")
	}
	return bestID, nil
}
Example #13
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endApiCh := make(chan error)

	var err error
	var cd *cluster.ClusterData
	// TODO(sgotti) make the postgres manager stateless and instantiate a
	// new one at every check loop, this will avoid the need to loop here
	// to get the clusterconfig
	for {
		cd, _, err = p.e.GetClusterData()
		if err == nil {
			break
		}
		log.Errorf("error retrieving cluster data: %v", err)
		time.Sleep(cluster.DefaultSleepInterval)
	}

	var cv *cluster.ClusterView
	if cd == nil {
		cv = cluster.NewClusterView()
	} else {
		cv = cd.ClusterView
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	p.clusterConfig = cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", p.clusterConfig))

	if err := p.loadCVVersion(); err != nil {
		p.end <- fmt.Errorf("failed to load cluster version file: %v", err)
		return
	}

	// TODO(sgotti) reconfigure the various configurations options (PGRepl*
	// and RequestTimeout) after a changed cluster config
	followersIDs := cv.GetFollowersIDs(p.id)
	pgParameters := p.createPGParameters(followersIDs)
	pgm := postgresql.NewManager(p.id, cfg.pgBinPath, cfg.dataDir, cfg.pgConfDir, pgParameters, p.getOurConnString(), p.getOurReplConnString(), p.clusterConfig.PGReplUser, p.clusterConfig.PGReplPassword, p.clusterConfig.RequestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	http.HandleFunc("/info", p.infoHandler)
	http.HandleFunc("/pgstate", p.pgStateHandler)
	go func() {
		endApiCh <- http.ListenAndServe(fmt.Sprintf("%s:%s", p.listenAddress, p.port), nil)
	}()

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	for {
		select {
		case <-p.stop:
			log.Debugf("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return
		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()
		case <-endSMCh:
			smTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case <-updatePGStateTimerCh:
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()
		case <-endPgStatecheckerCh:
			updatePGStateTimerCh = time.NewTimer(p.clusterConfig.SleepInterval).C
		case err := <-endApiCh:
			if err != nil {
				log.Fatal("ListenAndServe: ", err)
			}
			close(p.stop)
		}
	}
}
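
A sketch of how a caller might run and stop this loop, assuming the stop and end channels handed to NewPostgresKeeper are the ones used for coordination; the keeper id and the cfg value are placeholders, not taken from the original main().

func runKeeper(cfg config) error {
	// Hypothetical caller-side wiring inferred from the Start loop above.
	stop := make(chan bool)
	end := make(chan error)
	keeper, err := NewPostgresKeeper("keeper01", cfg, stop, end)
	if err != nil {
		return err
	}
	go keeper.Start()
	// ... on shutdown, signal the loop and wait for it to exit:
	stop <- true
	return <-end
}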
Example #14
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, keepersState cluster.KeepersState) (*cluster.ClusterView, error) {
	var wantedMasterID string
	if cv.Master == "" {
		log.Debugf("trying to find initial master")
		// Check for an initial master
		if len(keepersState) < 1 {
			return nil, fmt.Errorf("cannot init cluster, no keepers registered")
		}
		if len(keepersState) > 1 {
			return nil, fmt.Errorf("cannot init cluster, more than 1 keeper registered")
		}
		for id, k := range keepersState {
			if k.PGState == nil {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since its pg state is unknown", id)
			}
			if !k.PGState.Initialized {
				return nil, fmt.Errorf("cannot init cluster using keeper %q since pg instance is not initializied", id)
			}
			log.Infof("initializing cluster with master: %q", id)
			wantedMasterID = id
			break
		}
	} else {
		masterID := cv.Master
		wantedMasterID = masterID

		masterOK := true
		master, ok := keepersState[masterID]
		if !ok {
			return nil, fmt.Errorf("keeper state for master %q not available. This shouldn't happen!", masterID)
		}
		log.Debugf(spew.Sprintf("masterState: %#v", master))

		if !master.Healthy {
			log.Infof("master is failed")
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if !s.isKeeperConverged(master, cv) {
			log.Infof("keeper %s not yet master", masterID)
			masterOK = false
		}

		if !masterOK {
			log.Infof("trying to find a standby to replace failed master")
			bestStandby, err := s.GetBestStandby(cv, keepersState, masterID)
			if err != nil {
				log.Errorf("error trying to find the best standby: %v", err)
			} else {
				if bestStandby != masterID {
					log.Infof("electing new master: %q", bestStandby)
					wantedMasterID = bestStandby
				} else {
					log.Infof("cannot find a good standby to replace failed master")
				}
			}
		}
	}

	newCV := cv.Copy()
	newKeepersRole := newCV.KeepersRole

	// Add new keepersRole from keepersState
	for id := range keepersState {
		if _, ok := newKeepersRole[id]; !ok {
			if err := newKeepersRole.Add(id, ""); err != nil {
				// This shouldn't happen
				panic(err)
			}
		}
	}

	// Setup master role
	if cv.Master != wantedMasterID {
		newCV.Master = wantedMasterID
		newKeepersRole[wantedMasterID].Follow = ""
	}

	// Setup standbys
	if cv.Master == wantedMasterID {
		// wanted master is the previous one
		masterState := keepersState[wantedMasterID]
		if masterState.Healthy && s.isKeeperConverged(masterState, cv) {
			for id := range newKeepersRole {
				if id == wantedMasterID {
					continue
				}
				newKeepersRole[id].Follow = wantedMasterID
			}
		}
	}

	s.updateProxyConf(cv, newCV, keepersState)

	if !newCV.Equals(cv) {
		newCV.Version = cv.Version + 1
		newCV.ChangeTime = time.Now()
	}
	return newCV, nil
}
Example #15
func TestParseConfig(t *testing.T) {
	tests := []struct {
		in  string
		cfg *Config
		err error
	}{
		{
			in:  "{}",
			cfg: mergeDefaults(&NilConfig{}).ToConfig(),
			err: nil,
		},
		// Test duration parsing
		{
			in:  `{ "request_timeout": "3s" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: &Duration{3 * time.Second}}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "request_timeout": "3000ms" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: &Duration{3 * time.Second}}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "request_timeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: request_timeout must be positive"),
		},
		{
			in:  `{ "request_timeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: request_timeout must be positive"),
		},
		{
			in:  `{ "sleep_interval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: sleep_interval must be positive"),
		},
		{
			in:  `{ "keeper_fail_interval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: keeper_fail_interval must be positive"),
		},
		{
			in:  `{ "pg_repl_user": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: pg_repl_user cannot be empty"),
		},
		{
			in:  `{ "pg_repl_password": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: pg_repl_password cannot be empty"),
		},
		{
			in:  `{ "max_standbys_per_sender": 0 }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: max_standbys_per_sender must be at least 1"),
		},
		// All options defined
		{
			in: `{ "request_timeout": "10s", "sleep_interval": "10s", "keeper_fail_interval": "100s", "pg_repl_user": "******", "pg_repl_password": "******", "max_standbys_per_sender": 5, "synchronous_replication": true, "init_with_multiple_keepers": true,
			       "pg_parameters": {
			         "param01": "value01"
				}
			     }`,
			cfg: mergeDefaults(&NilConfig{
				RequestTimeout:          &Duration{10 * time.Second},
				SleepInterval:           &Duration{10 * time.Second},
				KeeperFailInterval:      &Duration{100 * time.Second},
				PGReplUser:              StringP("username"),
				PGReplPassword:          StringP("password"),
				MaxStandbysPerSender:    UintP(5),
				SynchronousReplication:  BoolP(true),
				InitWithMultipleKeepers: BoolP(true),
				PGParameters: &map[string]string{
					"param01": "value01",
				},
			}).ToConfig(),
			err: nil,
		},
	}

	for i, tt := range tests {
		var nilCfg *NilConfig
		err := json.Unmarshal([]byte(tt.in), &nilCfg)
		if err != nil {
			if tt.err == nil {
				t.Errorf("#%d: unexpected error: %v", i, err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("#%d: got error: %v, wanted error: %v", i, err, tt.err)
			}
		} else {
			nilCfg.MergeDefaults()
			cfg := nilCfg.ToConfig()
			if tt.err != nil {
				t.Errorf("#%d: got no error, wanted error: %v", i, tt.err)
			}
			if !reflect.DeepEqual(cfg, tt.cfg) {
				t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg))
			}
		}

	}
}
Example #16
func TestUpdateClusterView(t *testing.T) {
	tests := []struct {
		cv           *cluster.ClusterView
		keepersState cluster.KeepersState
		outCV        *cluster.ClusterView
		err          error
	}{
		{
			cv:           cluster.NewClusterView(),
			keepersState: nil,
			outCV:        cluster.NewClusterView(),
			err:          fmt.Errorf("cannot init cluster, no keepers registered"),
		},
		// cluster initialization, one keeper
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{PGState: &cluster.PostgresState{Initialized: true}},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
				},
			},
		},
		// cluster initialization, too many keepers
		{
			cv: cluster.NewClusterView(),
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{},
				"02": &cluster.KeeperState{},
			},
			outCV: cluster.NewClusterView(),
			err:   fmt.Errorf("cannot init cluster, more than 1 keeper registered"),
		},
		// One master and one standby, both healthy: no change from previous cv
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not healthy: standby elected as new master
		{
			cv: &cluster.ClusterView{
				Version: 1,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
		// From the previous test, new master (02) converged. Old master setup to follow new master (02).
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            false,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: "02"},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},

		// One master and one standby, master not healthy, standby with old
		// clusterview: no standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Unix(0, 0),
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
		},
		// One master and one standby, master not converged to current
		// cv: standby elected as new master.
		{
			cv: &cluster.ClusterView{
				Version: 2,
				Master:  "01",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: "01"},
				},
			},
			keepersState: cluster.KeepersState{
				"01": &cluster.KeeperState{
					ClusterViewVersion: 1,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
				"02": &cluster.KeeperState{
					ClusterViewVersion: 2,
					ErrorStartTime:     time.Time{},
					Healthy:            true,
					PGState: &cluster.PostgresState{
						TimelineID: 0,
					},
				},
			},
			outCV: &cluster.ClusterView{
				Version: 3,
				Master:  "02",
				KeepersRole: cluster.KeepersRole{
					"01": &cluster.KeeperRole{ID: "01", Follow: ""},
					"02": &cluster.KeeperRole{ID: "02", Follow: ""},
				},
			},
		},
	}

	s := &Sentinel{id: "id", clusterConfig: cluster.NewDefaultConfig()}
	for i, tt := range tests {
		outCV, err := s.updateClusterView(tt.cv, tt.keepersState)
		t.Logf("test #%d", i)
		t.Logf(spew.Sprintf("outCV: %#v", outCV))
		if tt.err != nil {
			if err == nil {
				t.Errorf("got no error, wanted error: %v", tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("got error: %v, wanted error: %v", err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if !outCV.Equals(tt.outCV) {
				t.Errorf(spew.Sprintf("#%d: wrong outCV: got: %#v, want: %#v", i, outCV, tt.outCV))
			}
		}
	}
}
Example #17
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
	s.updateMutex.Lock()
	defer s.updateMutex.Unlock()
	e := s.e

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var keepersState cluster.KeepersState
	if cd == nil {
		cv = cluster.NewClusterView()
		keepersState = nil
	} else {
		cv = cd.ClusterView
		keepersState = cd.KeepersState
	}
	log.Debugf(spew.Sprintf("keepersState: %#v", keepersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	// Update cluster config
	// This shouldn't need a lock
	s.clusterConfig = cv.Config.ToConfig()

	if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersPGState := getKeepersPGState(ctx, keepersInfo)
	cancel()
	log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && s.l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", s.l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}
	if err := s.setLeaderSentinelInfo(leaseTTL); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	if cv.Version == 0 {
		// Cluster first initialization
		newcv := cluster.NewClusterView()
		newcv.Version = 1
		_, err = e.SetClusterData(nil, newcv, 0)
		if err != nil {
			log.Errorf("error saving clusterdata: %v", err)
		}
		return
	}

	newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState)
	log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState))

	newcv, err := s.updateClusterView(cv, newKeepersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
	}

	_, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}
Example #18
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))

	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	started := false

	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are the master but we cannot get the
			// instance status or start it, stop here; if we are a standby we can
			// still recover later
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}
	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	if cv == nil {
		return
	}

	// cv != nil

	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	followersIDs := cv.GetFollowersIDs(p.id)

	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	if memberRole.Follow == "" {
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Drop replication slots not assigned to a follower
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}
Example #19
func TestParseConfig(t *testing.T) {
	tests := []struct {
		in  string
		cfg *Config
		err error
	}{
		{
			in:  "{}",
			cfg: mergeDefaults(&NilConfig{}).ToConfig(),
			err: nil,
		},
		// Test duration parsing
		{
			in:  `{ "requesttimeout": "3s" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: DurationP(3 * time.Second)}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "requesttimeout": "3000ms" }`,
			cfg: mergeDefaults(&NilConfig{RequestTimeout: DurationP(3 * time.Second)}).ToConfig(),
			err: nil,
		},
		{
			in:  `{ "requesttimeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: RequestTimeout must be positive"),
		},
		{
			in:  `{ "requesttimeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: RequestTimeout must be positive"),
		},
		{
			in:  `{ "sleepinterval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: SleepInterval must be positive"),
		},
		{
			in:  `{ "keeperfailinterval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: KeeperFailInterval must be positive"),
		},
		{
			in:  `{ "pgrepluser": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: PGReplUser cannot be empty"),
		},
		{
			in:  `{ "pgreplpassword": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: PGReplPassword cannot be empty"),
		},
		{
			in:  `{ "maxstandbyspersender": 0 }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: MaxStandbysPerSender must be at least 1"),
		},
		// All options defined
		{
			in: `{ "requestTimeout": "10s", "sleepInterval": "10s", "keeperFailInterval": "100s", "pgrepluser": "******", "pgreplpassword": "******", "maxstandbyspersender": 5, "synchronousreplication": true}`,
			cfg: mergeDefaults(&NilConfig{
				RequestTimeout:         DurationP(10 * time.Second),
				SleepInterval:          DurationP(10 * time.Second),
				KeeperFailInterval:     DurationP(100 * time.Second),
				PGReplUser:             StringP("username"),
				PGReplPassword:         StringP("password"),
				MaxStandbysPerSender:   UintP(5),
				SynchronousReplication: BoolP(true),
			}).ToConfig(),
			err: nil,
		},
	}

	for i, tt := range tests {
		var nilCfg *NilConfig
		err := json.Unmarshal([]byte(tt.in), &nilCfg)
		nilCfg.MergeDefaults()
		cfg := nilCfg.ToConfig()
		if tt.err != nil {
			if err == nil {
				t.Errorf("#%d: got no error, wanted error: %v", i, tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("#%d: got error: %v, wanted error: %v", i, err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("#%d: unexpected error: %v", i, err)
			}
			if !reflect.DeepEqual(cfg, tt.cfg) {
				t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg))
			}
		}

	}
}
Example #20
func (s *Sentinel) clusterSentinelSM(pctx context.Context) {
	e := s.e

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	s.clusterConfig = clusterConfig

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersPGState := getMembersPGState(ctx, membersInfo)
	cancel()
	log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && s.l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", s.l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var membersState cluster.MembersState
	if cd == nil {
		cv = cluster.NewClusterView()
		membersState = nil
	} else {
		cv = cd.ClusterView
		membersState = cd.MembersState
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	pv, res, err := e.GetProxyView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("proxyview: %#v", pv))

	var prevPVIndex uint64
	if res != nil {
		prevPVIndex = res.Node.ModifiedIndex
	}

	newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState)
	log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState))

	newcv, err := s.updateClusterView(cv, newMembersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
		if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil {
			log.Errorf("error updating proxyView: %v", err)
			return
		}
	}

	_, err = e.SetClusterData(newMembersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}
Example #21
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("error retrieving cluster view: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	if cv == nil {
		log.Infof("no clusterview available, waiting for it to appear")
		return
	}

	followersIDs := cv.GetFollowersIDs(p.id)

	// Update cluster config
	clusterConfig := cv.Config.ToConfig()
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	prevPGParameters := pgm.GetParameters()
	// create postgres parameteres
	pgParameters := p.createPGParameters(followersIDs)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	keepersState, _, err := e.GetKeepersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if keepersState == nil {
		keepersState = cluster.KeepersState{}
	}
	log.Debugf(spew.Sprintf("keepersState: %#v", keepersState))

	keeper := keepersState[p.id]
	log.Debugf(spew.Sprintf("keeperState: %#v", keeper))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	if len(cv.KeepersRole) == 0 {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	started := false

	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("failed to start postgres: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are the master but we cannot get the
			// instance status or start it, stop here; if we are a standby we can
			// still recover later
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}
	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	if cv == nil {
		return
	}

	// cv != nil

	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := keepersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	keeperRole, ok := cv.KeepersRole[p.id]
	if !ok {
		log.Infof("our keeper requested role is not available")
		return
	}
	if keeperRole.Follow == "" {
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Drop replication slots not assigned to a follower
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for keeper %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

		}
	} else {
		log.Infof("our cluster requested state is standby following %q", keeperRole.Follow)
		if isMaster {
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			p.updatePGState(pctx)
			pgState := p.getLastPGState()
			if pgState == nil {
				log.Errorf("our pgstate is unknown: %v", err)
				return
			}
			mPGState := master.PGState
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", keeperRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if p.clusterConfig.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Infof("needed synchronous_standby_names changed from %q to %q", prevSyncStandbyNames, syncStandbyNames)
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Infof("sync replication disabled, removing current synchronous_standby_names %q", prevSyncStandbyNames)
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Infof("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Errorf("failed to reload postgres instance: %v", err)
		}
	} else {
		// for tests
		log.Debugf("postgres parameters not changed")
	}
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}
Example #22
func TestParseConfig(t *testing.T) {
	tests := []struct {
		in  string
		cfg *Config
		err error
	}{
		{
			in:  "{}",
			cfg: &DefaultConfig,
			err: nil,
		},
		// Test duration parsing
		{
			in:  `{ "requesttimeout": "3s" }`,
			cfg: mergeDefaultConfig(&Config{RequestTimeout: 3 * time.Second}),
			err: nil,
		},
		{
			in:  `{ "requesttimeout": "3000ms" }`,
			cfg: mergeDefaultConfig(&Config{RequestTimeout: 3 * time.Second}),
			err: nil,
		},
		{
			in:  `{ "requesttimeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: RequestTimeout must be positive"),
		},
		{
			in:  `{ "requesttimeout": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: RequestTimeout must be positive"),
		},
		{
			in:  `{ "sleepinterval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: SleepInterval must be positive"),
		},
		{
			in:  `{ "memberfailinterval": "-3s" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: MemberFailInterval must be positive"),
		},
		{
			in:  `{ "pgrepluser": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: PGReplUser cannot be empty"),
		},
		{
			in:  `{ "pgreplpassword": "" }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: PGReplPassword cannot be empty"),
		},
		{
			in:  `{ "maxstandbyspersender": 0 }`,
			cfg: nil,
			err: fmt.Errorf("config validation failed: MaxStandbysPerSender must be at least 1"),
		},
		// All options defined
		{
			in: `{ "requestTimeout": "10s", "sleepInterval": "10s", "memberFailInterval": "100s", "pgrepluser": "******", "pgreplpassword": "******", "maxstandbyspersender": 5, "synchronousreplication": true}`,
			cfg: mergeDefaultConfig(&Config{
				RequestTimeout:         10 * time.Second,
				SleepInterval:          10 * time.Second,
				MemberFailInterval:     100 * time.Second,
				PGReplUser:             "******",
				PGReplPassword:         "******",
				MaxStandbysPerSender:   5,
				SynchronousReplication: true,
			}),
			err: nil,
		},
	}

	for i, tt := range tests {
		cfg, err := ParseConfig([]byte(tt.in))
		if tt.err != nil {
			if err == nil {
				t.Errorf("got no error, wanted error: %v", tt.err)
			} else if tt.err.Error() != err.Error() {
				t.Errorf("got error: %v, wanted error: %v", err, tt.err)
			}
		} else {
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if !reflect.DeepEqual(cfg, tt.cfg) {
				t.Errorf(spew.Sprintf("#%d: wrong config: got: %#v, want: %#v", i, cfg, tt.cfg))
			}
		}

	}
}
Example #23
func (s *Sentinel) updateClusterView(cv *cluster.ClusterView, membersState cluster.MembersState) (*cluster.ClusterView, error) {
	var wantedMasterID string
	// Cluster first initialization
	if cv.Version == 0 {
		log.Debugf("trying to find initial master")
		// Check for an initial master
		if len(membersState) < 1 {
			return nil, fmt.Errorf("cannot init cluster, no members registered")
		}
		if len(membersState) > 1 {
			return nil, fmt.Errorf("cannot init cluster, more than 1 member registered")
		}
		for id, m := range membersState {
			if m.PGState == nil {
				return nil, fmt.Errorf("cannot init cluster using member %q since its pg state is unknown", id)
			}
			log.Infof("Initializing cluster with master: %q", id)
			wantedMasterID = id
			break
		}
	} else {
		masterID := cv.Master

		masterOK := true
		master, ok := membersState[masterID]
		if !ok {
			return nil, fmt.Errorf("member state for master %q not available. This shouldn't happen!", masterID)
		}
		log.Debugf(spew.Sprintf("masterState: %#v", master))

		if !s.isMemberHealthy(master) {
			log.Infof("master is failed")
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if !s.isMemberConverged(master, cv) {
			log.Infof("member %s not yet master", masterID)
			masterOK = false
		}

		wantedMasterID = masterID
		if !masterOK {
			log.Infof("trying to find a standby to replace failed master")
			bestStandby, err := s.GetBestStandby(cv, membersState, masterID)
			if err != nil {
				log.Errorf("error trying to find the best standby: %v", err)
			} else {
				if bestStandby != masterID {
					log.Infof("electing new master: %q", bestStandby)
					wantedMasterID = bestStandby
				} else {
					log.Infof("cannot find a good standby to replace failed master")
				}
			}
		}
	}

	newCV := cv.Copy()
	newMembersRole := newCV.MembersRole

	// Add new members from membersState
	for id := range membersState {
		if _, ok := newMembersRole[id]; !ok {
			newMembersRole[id] = &cluster.MemberRole{}
		}
	}

	// Setup master role
	if cv.Master != wantedMasterID {
		newCV.Master = wantedMasterID
		newMembersRole[wantedMasterID] = &cluster.MemberRole{Follow: ""}
	}

	// Setup standbys
	if cv.Master == wantedMasterID {
		// wanted master is the previous one
		masterState := membersState[wantedMasterID]
		if s.isMemberHealthy(masterState) && s.isMemberConverged(masterState, cv) {
			for id := range newMembersRole {
				if id == wantedMasterID {
					continue
				}
				newMembersRole[id] = &cluster.MemberRole{Follow: wantedMasterID}
			}
		}
	}

	if !newCV.Equals(cv) {
		newCV.Version = cv.Version + 1
		newCV.ChangeTime = time.Now()
	}
	return newCV, nil
}