Example #1
0
func (tk *TestKeeper) GetKeeperInfo(timeout time.Duration) (*cluster.KeeperInfo, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	req, err := http.NewRequest("GET", fmt.Sprintf("http://%s/info", net.JoinHostPort(tk.listenAddress, tk.port)), nil)
	if err != nil {
		return nil, err
	}
	var data cluster.KeeperInfo
	err = httpDo(ctx, req, nil, func(resp *http.Response, err error) error {
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			return fmt.Errorf("http error code: %d, error: %s", resp.StatusCode, resp.Status)
		}
		if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
			return err
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return &data, nil
}
Example #2
0
func (p *Manager) GetRoleFromDB() (common.Role, error) {
	db, err := sql.Open("postgres", p.connString)
	if err != nil {
		return 0, err
	}
	defer db.Close()

	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	rows, err := Query(ctx, db, "SELECT pg_is_in_recovery from pg_is_in_recovery()")
	cancel()
	if err != nil {
		return 0, err
	}
	defer rows.Close()
	for rows.Next() {
		var isInRecovery bool
		if err := rows.Scan(&isInRecovery); err != nil {
			return 0, err
		}
		if isInRecovery {
			return common.StandbyRole, nil
		}
		return common.MasterRole, nil
	}
	return 0, fmt.Errorf("cannot get pg role from db: no rows returned")
}
Example #3
0
func (p *Manager) GetReplicatinSlots() ([]string, error) {
	db, err := sql.Open("postgres", p.connString)
	if err != nil {
		return nil, err
	}
	defer db.Close()

	replSlots := []string{}

	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	rows, err := Query(ctx, db, "SELECT slot_name from pg_replication_slots")
	cancel()
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	for rows.Next() {
		var slotName string
		if err := rows.Scan(&slotName); err != nil {
			return nil, err
		}
		replSlots = append(replSlots, slotName)
	}

	return replSlots, nil
}
Example #4
0
func (te *TestEtcd) GetEtcdNode(timeout time.Duration, path string) (*etcd.Node, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	res, err := te.kAPI.Get(ctx, path, &etcd.GetOptions{Quorum: true})
	if err != nil {
		return nil, err
	}
	return res.Node, nil
}
Example #5
0
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	pgState := &cluster.PostgresState{}

	defer func() {
		p.lastPGState = pgState
		p.pgStateMutex.Unlock()
	}()

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		pgState = nil
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			pgState = nil
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				pgState = nil
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
}
Example #6
0
func (p *PostgresKeeper) pgStateHandler(w http.ResponseWriter, req *http.Request) {
	pgState := &cluster.PostgresState{}
	p.cvMutex.Lock()
	defer p.cvMutex.Unlock()

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
	if err := json.NewEncoder(w).Encode(&pgState); err != nil {
		w.WriteHeader(http.StatusInternalServerError)
	}
}
Example #7
0
func (p *Manager) CreateReplUser() error {
	db, err := sql.Open("postgres", p.connString)
	if err != nil {
		return err
	}
	defer db.Close()

	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	_, err = Exec(ctx, db, fmt.Sprintf(`CREATE USER "%s" WITH REPLICATION ENCRYPTED PASSWORD '%s';`, p.replUser, p.replPassword))
	cancel()
	return err
}
Example #8
0
func (p *Manager) DropReplicationSlot(name string) error {
	db, err := sql.Open("postgres", p.connString)
	if err != nil {
		return err
	}
	defer db.Close()

	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	_, err = Exec(ctx, db, fmt.Sprintf("select pg_drop_replication_slot('%s')", name))
	cancel()
	return err
}
Example #9
0
func (s *Sentinel) clusterSentinelSM(pctx context.Context) {
	e := s.e

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	s.clusterConfig = clusterConfig

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	membersPGState := getMembersPGState(ctx, membersInfo)
	cancel()
	log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var membersState cluster.MembersState
	if cd == nil {
		cv = cluster.NewClusterView()
		membersState = nil
	} else {
		cv = cd.ClusterView
		membersState = cd.MembersState
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	pv, res, err := e.GetProxyView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("proxyview: %#v", pv))

	var prevPVIndex uint64
	if res != nil {
		prevPVIndex = res.Node.ModifiedIndex
	}

	newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState)
	log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState))

	newcv, err := s.updateClusterView(cv, newMembersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
		if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil {
			log.Errorf("error updating proxyView: %v", err)
			return
		}
	}

	_, err = e.SetClusterData(newMembersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}
Example #10
0
func (p *Manager) GetRoleFromDB() (common.Role, error) {
	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	defer cancel()
	return GetRole(ctx, p.connString)
}
Example #11
0
func (p *Manager) DropReplicationSlot(name string) error {
	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	defer cancel()
	return DropReplicationSlot(ctx, p.connString, name)
}
Example #12
0
func (p *Manager) GetReplicatinSlots() ([]string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	defer cancel()
	return GetReplicatinSlots(ctx, p.connString)
}
Example #13
0
func (p *Manager) CreateReplRole() error {
	ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout)
	defer cancel()
	return CreateReplRole(ctx, p.connString, p.replUser, p.replPassword)
}
Example #14
0
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))

	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	started := false

	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are master but we cannot get
			// instance status or start then stop here, if we are standby then we can
			// recover
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}
	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	if cv == nil {
		return
	}

	// cv != nil

	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	followersIDs := cv.GetFollowersIDs(p.id)

	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	if memberRole.Follow == "" {
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Create replication slots
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}
Example #15
0
func (c *simpleHTTPClient) Do(ctx context.Context, act httpAction) (*http.Response, []byte, error) {
	req := act.HTTPRequest(c.endpoint)

	if err := printcURL(req); err != nil {
		return nil, nil, err
	}

	var hctx context.Context
	var hcancel context.CancelFunc
	if c.headerTimeout > 0 {
		hctx, hcancel = context.WithTimeout(ctx, c.headerTimeout)
	} else {
		hctx, hcancel = context.WithCancel(ctx)
	}
	defer hcancel()

	reqcancel := requestCanceler(c.transport, req)

	rtchan := make(chan roundTripResponse, 1)
	go func() {
		resp, err := c.transport.RoundTrip(req)
		rtchan <- roundTripResponse{resp: resp, err: err}
		close(rtchan)
	}()

	var resp *http.Response
	var err error

	select {
	case rtresp := <-rtchan:
		resp, err = rtresp.resp, rtresp.err
	case <-hctx.Done():
		// cancel and wait for request to actually exit before continuing
		reqcancel()
		rtresp := <-rtchan
		resp = rtresp.resp
		switch {
		case ctx.Err() != nil:
			err = ctx.Err()
		case hctx.Err() != nil:
			err = fmt.Errorf("client: endpoint %s exceeded header timeout", c.endpoint.String())
		default:
			panic("failed to get error from context")
		}
	}

	// always check for resp nil-ness to deal with possible
	// race conditions between channels above
	defer func() {
		if resp != nil {
			resp.Body.Close()
		}
	}()

	if err != nil {
		return nil, nil, err
	}

	var body []byte
	done := make(chan struct{})
	go func() {
		body, err = ioutil.ReadAll(resp.Body)
		done <- struct{}{}
	}()

	select {
	case <-ctx.Done():
		resp.Body.Close()
		<-done
		return nil, nil, ctx.Err()
	case <-done:
	}

	return resp, body, err
}
Example #16
0
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) {
	s.updateMutex.Lock()
	defer s.updateMutex.Unlock()
	e := s.e

	cd, res, err := e.GetClusterData()
	if err != nil {
		log.Errorf("error retrieving cluster data: %v", err)
		return
	}
	var prevCDIndex uint64
	if res != nil {
		prevCDIndex = res.Node.ModifiedIndex
	}

	var cv *cluster.ClusterView
	var keepersState cluster.KeepersState
	if cd == nil {
		cv = cluster.NewClusterView()
		keepersState = nil
	} else {
		cv = cd.ClusterView
		keepersState = cd.KeepersState
	}
	log.Debugf(spew.Sprintf("keepersState: %#v", keepersState))
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	// Update cluster config
	// This shouldn't need a lock
	s.clusterConfig = cv.Config.ToConfig()

	if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	// TODO(sgotti) better ways to calculate leaseTTL?
	leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4

	ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersDiscoveryInfo, err := s.discover(ctx)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo)
	cancel()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo))

	ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout)
	keepersPGState := getKeepersPGState(ctx, keepersInfo)
	cancel()
	log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState))

	var l lease.Lease
	if isLeader(s.l, s.id) {
		log.Infof("I'm the sentinels leader")
		l = renewLeadership(s.l, leaseTTL)
	} else {
		log.Infof("trying to acquire sentinels leadership")
		l = acquireLeadership(s.lManager, s.id, 1, leaseTTL)
	}

	// log all leadership changes
	if l != nil && s.l == nil && l.MachineID() != s.id {
		log.Infof("sentinel leader is %s", l.MachineID())
	} else if l != nil && s.l != nil && l.MachineID() != l.MachineID() {
		log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID())
	}

	s.l = l

	if !isLeader(s.l, s.id) {
		return
	}
	if err := s.setLeaderSentinelInfo(leaseTTL); err != nil {
		log.Errorf("cannot update leader sentinel info: %v", err)
		return
	}

	if cv.Version == 0 {
		// Cluster first initialization
		newcv := cluster.NewClusterView()
		newcv.Version = 1
		_, err = e.SetClusterData(nil, newcv, 0)
		if err != nil {
			log.Errorf("error saving clusterdata: %v", err)
		}
		return
	}

	newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState)
	log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState))

	newcv, err := s.updateClusterView(cv, newKeepersState)
	if err != nil {
		log.Errorf("failed to update clusterView: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("newcv: %#v", newcv))
	if cv.Version < newcv.Version {
		log.Debugf("newcv changed from previous cv")
	}

	_, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex)
	if err != nil {
		log.Errorf("error saving clusterdata: %v", err)
	}
}
Example #17
0
func (r *etcdLeaseManager) ctx() context.Context {
	ctx, _ := context.WithTimeout(context.Background(), r.reqTimeout)
	return ctx
}