func (tk *TestKeeper) GetKeeperInfo(timeout time.Duration) (*cluster.KeeperInfo, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() req, err := http.NewRequest("GET", fmt.Sprintf("http://%s/info", net.JoinHostPort(tk.listenAddress, tk.port)), nil) if err != nil { return nil, err } var data cluster.KeeperInfo err = httpDo(ctx, req, nil, func(resp *http.Response, err error) error { if err != nil { return err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return fmt.Errorf("http error code: %d, error: %s", resp.StatusCode, resp.Status) } if err := json.NewDecoder(resp.Body).Decode(&data); err != nil { return err } return nil }) if err != nil { return nil, err } return &data, nil }
func (p *Manager) GetRoleFromDB() (common.Role, error) { db, err := sql.Open("postgres", p.connString) if err != nil { return 0, err } defer db.Close() ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) rows, err := Query(ctx, db, "SELECT pg_is_in_recovery from pg_is_in_recovery()") cancel() if err != nil { return 0, err } defer rows.Close() for rows.Next() { var isInRecovery bool if err := rows.Scan(&isInRecovery); err != nil { return 0, err } if isInRecovery { return common.StandbyRole, nil } return common.MasterRole, nil } return 0, fmt.Errorf("cannot get pg role from db: no rows returned") }
func (p *Manager) GetReplicatinSlots() ([]string, error) { db, err := sql.Open("postgres", p.connString) if err != nil { return nil, err } defer db.Close() replSlots := []string{} ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) rows, err := Query(ctx, db, "SELECT slot_name from pg_replication_slots") cancel() if err != nil { return nil, err } defer rows.Close() for rows.Next() { var slotName string if err := rows.Scan(&slotName); err != nil { return nil, err } replSlots = append(replSlots, slotName) } return replSlots, nil }
func (te *TestEtcd) GetEtcdNode(timeout time.Duration, path string) (*etcd.Node, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() res, err := te.kAPI.Get(ctx, path, &etcd.GetOptions{Quorum: true}) if err != nil { return nil, err } return res.Node, nil }
// updatePGState refreshes the cached postgres state (p.lastPGState) under
// p.pgStateMutex. On any failure the cache is set to nil so stale state is
// never published. Each remote query is bounded by the cluster's
// RequestTimeout derived from pctx.
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	pgState := &cluster.PostgresState{}
	// The deferred closure publishes whatever pgState points to at return
	// time: the filled-in state on success, or nil after any error path
	// below assigns pgState = nil.
	defer func() {
		p.lastPGState = pgState
		p.pgStateMutex.Unlock()
	}()
	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		pgState = nil
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
		// Note: this replaces the pgState pointer entirely; the deferred
		// closure sees the new value.
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			pgState = nil
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(pctx, p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				pgState = nil
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
}
// pgStateHandler serves the keeper's current postgres state as JSON over
// HTTP. It holds p.cvMutex for the duration of the request and answers 500
// on any failure to gather or encode the state.
func (p *PostgresKeeper) pgStateHandler(w http.ResponseWriter, req *http.Request) {
	pgState := &cluster.PostgresState{}
	p.cvMutex.Lock()
	defer p.cvMutex.Unlock()
	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	if !initialized {
		pgState.Initialized = false
	} else {
		var err error
		ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
		// Replaces the pgState pointer with the freshly queried state.
		pgState, err = pg.GetPGState(ctx, p.getOurReplConnString())
		cancel()
		if err != nil {
			log.Errorf("error getting pg state: %v", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		pgState.Initialized = true

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimeLinesHistory{}
		if pgState.TimelineID > 1 {
			ctx, cancel = context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			tlsh, err := pg.GetTimelinesHistory(ctx, pgState.TimelineID, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("error getting timeline history: %v", err)
				w.WriteHeader(http.StatusInternalServerError)
				return
			}
			pgState.TimelinesHistory = tlsh
		}
	}
	// NOTE(review): if Encode fails after it has started writing the body,
	// this WriteHeader is a no-op (headers already sent) — the client gets a
	// truncated 200 response.
	if err := json.NewEncoder(w).Encode(&pgState); err != nil {
		w.WriteHeader(http.StatusInternalServerError)
	}
}
func (p *Manager) CreateReplUser() error { db, err := sql.Open("postgres", p.connString) if err != nil { return err } defer db.Close() ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) _, err = Exec(ctx, db, fmt.Sprintf(`CREATE USER "%s" WITH REPLICATION ENCRYPTED PASSWORD '%s';`, p.replUser, p.replPassword)) cancel() return err }
func (p *Manager) DropReplicationSlot(name string) error { db, err := sql.Open("postgres", p.connString) if err != nil { return err } defer db.Close() ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) _, err = Exec(ctx, db, fmt.Sprintf("select pg_drop_replication_slot('%s')", name)) cancel() return err }
func (s *Sentinel) clusterSentinelSM(pctx context.Context) { e := s.e // Update cluster config clusterConfig, _, err := e.GetClusterConfig() if err != nil { log.Errorf("cannot get cluster config: %v", err) return } log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig)) // This shouldn't need a lock s.clusterConfig = clusterConfig // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := clusterConfig.SleepInterval + clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersDiscoveryInfo: %#v", membersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersInfo, err := getMembersInfo(ctx, membersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("membersInfo: %#v", membersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) membersPGState := getMembersPGState(ctx, membersInfo) cancel() log.Debugf(spew.Sprintf("membersPGState: %#v", membersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var membersState cluster.MembersState if cd == 
nil { cv = cluster.NewClusterView() membersState = nil } else { cv = cd.ClusterView membersState = cd.MembersState } log.Debugf(spew.Sprintf("membersState: %#v", membersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) pv, res, err := e.GetProxyView() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("proxyview: %#v", pv)) var prevPVIndex uint64 if res != nil { prevPVIndex = res.Node.ModifiedIndex } newMembersState := s.updateMembersState(membersState, membersInfo, membersPGState) log.Debugf(spew.Sprintf("newMembersState: %#v", newMembersState)) newcv, err := s.updateClusterView(cv, newMembersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") if err := s.updateProxyView(cv, newcv, newMembersState, prevPVIndex); err != nil { log.Errorf("error updating proxyView: %v", err) return } } _, err = e.SetClusterData(newMembersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }
func (p *Manager) GetRoleFromDB() (common.Role, error) { ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) defer cancel() return GetRole(ctx, p.connString) }
func (p *Manager) DropReplicationSlot(name string) error { ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) defer cancel() return DropReplicationSlot(ctx, p.connString, name) }
func (p *Manager) GetReplicatinSlots() ([]string, error) { ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) defer cancel() return GetReplicatinSlots(ctx, p.connString) }
func (p *Manager) CreateReplRole() error { ctx, cancel := context.WithTimeout(context.Background(), p.requestTimeout) defer cancel() return CreateReplRole(ctx, p.connString, p.replUser, p.replPassword) }
// postgresKeeperSM runs one iteration of the keeper state machine: it reads
// the cluster view and members state from the store, makes the local
// postgres instance converge to its requested role (master or standby
// following another member), manages replication slots and synchronous
// replication settings, and finally records the cluster view version it
// converged to.
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	// Update cluster config
	clusterConfig, _, err := e.GetClusterConfig()
	if err != nil {
		log.Errorf("cannot get cluster config: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterConfig: %#v", clusterConfig))
	// This shouldn't need a lock
	p.clusterConfig = clusterConfig

	cv, _, err := e.GetClusterView()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	log.Debugf(spew.Sprintf("clusterView: %#v", cv))

	membersState, _, err := e.GetMembersState()
	if err != nil {
		log.Errorf("err: %v", err)
		return
	}
	if membersState == nil {
		membersState = cluster.MembersState{}
	}
	log.Debugf(spew.Sprintf("membersState: %#v", membersState))

	member := membersState[p.id]
	log.Debugf(spew.Sprintf("memberState: %#v", member))

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Errorf("failed to detect if instance is initialized: %v", err)
		return
	}

	// No cluster view yet: bootstrap a fresh database if needed.
	if cv == nil {
		if !initialized {
			log.Infof("Initializing database")
			err = pgm.Init()
			if err != nil {
				log.Errorf("failed to initialized postgres instance: %v", err)
				return
			}
			initialized = true
		}
	}

	// Make sure an initialized instance is running.
	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			log.Errorf("failed to retrieve instance status: %v", err)
		} else if !started {
			err = pgm.Start()
			if err != nil {
				log.Errorf("err: %v", err)
			} else {
				started = true
			}
		}
	}

	if cv != nil {
		if !started && p.id == cv.Master {
			// If the clusterView says we are master but we cannot get
			// instance status or start then stop here, if we are standby then we can
			// recover
			return
		}
	}

	role, err := pgm.GetRole()
	if err != nil {
		log.Infof("error retrieving current pg role: %v", err)
		return
	}

	isMaster := false
	if role == common.MasterRole {
		log.Infof("current pg state: master")
		isMaster = true
	} else {
		log.Infof("current pg state: standby")
	}

	// publish ourself for discovery
	if err := p.publish(); err != nil {
		log.Errorf("failed to publish ourself to the cluster: %v", err)
		return
	}

	if cv == nil {
		return
	}

	// cv != nil
	masterID := cv.Master
	log.Debugf("masterID: %q", masterID)

	master := membersState[masterID]
	log.Debugf(spew.Sprintf("masterState: %#v", master))

	followersIDs := cv.GetFollowersIDs(p.id)

	memberRole, ok := cv.MembersRole[p.id]
	if !ok {
		log.Infof("our member state is not available")
		return
	}
	if memberRole.Follow == "" {
		// Requested role: master. Promote if needed, otherwise reconcile
		// replication slots and synchronous replication settings.
		log.Infof("our cluster requested state is master")
		if role != common.MasterRole {
			log.Infof("promoting to master")
			err := pgm.Promote()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
		} else {
			log.Infof("already master")

			replSlots := []string{}
			replSlots, err = pgm.GetReplicatinSlots()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			// Create replication slots
			// Drop slots for members no longer marked as followers...
			for _, slotName := range replSlots {
				if !util.StringInSlice(followersIDs, slotName) {
					log.Infof("dropping replication slot for member %q not marked as follower", slotName)
					err := pgm.DropReplicationSlot(slotName)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			// ...and create missing slots for current followers.
			for _, followerID := range followersIDs {
				if followerID == p.id {
					continue
				}
				if !util.StringInSlice(replSlots, followerID) {
					err := pgm.CreateReplicationSlot(followerID)
					if err != nil {
						log.Errorf("err: %v", err)
					}
				}
			}

			// Setup synchronous replication
			syncStandbyNames, _ := pgm.GetServerParameter("synchronous_standby_names")
			if p.clusterConfig.SynchronousReplication {
				newSyncStandbyNames := strings.Join(followersIDs, ",")
				if syncStandbyNames != newSyncStandbyNames {
					log.Infof("needed synchronous_standby_names changed from %q to %q, reconfiguring", syncStandbyNames, newSyncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", newSyncStandbyNames)
					pgm.Reload()
				}
			} else {
				if syncStandbyNames != "" {
					log.Infof("sync replication disabled, removing current synchronous_standby_names %q", syncStandbyNames)
					pgm.SetServerParameter("synchronous_standby_names", "")
					pgm.Reload()
				}
			}
		}
	} else {
		// Requested role: standby following memberRole.Follow.
		log.Infof("our cluster requested state is standby following %q", memberRole.Follow)
		if isMaster {
			// We are a master but should be a standby: resync from the
			// current master.
			if err := p.fullResync(master, initialized, started); err != nil {
				log.Errorf("failed to full resync from master: %v", err)
				return
			}
		} else {
			log.Infof("already standby")
			curConnParams, err := pgm.GetPrimaryConninfo()
			if err != nil {
				log.Errorf("err: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("curConnParams: %v", curConnParams))

			replConnString := p.getReplConnString(master)
			newConnParams, err := pg.URLToConnParams(replConnString)
			if err != nil {
				log.Errorf("cannot get conn params: %v", err)
				return
			}
			log.Debugf(spew.Sprintf("newConnParams: %v", newConnParams))

			// Check that we can sync with master

			// Check timeline history
			ctx, cancel := context.WithTimeout(context.Background(), p.clusterConfig.RequestTimeout)
			pgState, err := pg.GetPGState(ctx, p.getOurReplConnString())
			cancel()
			if err != nil {
				log.Errorf("cannot get our pgstate: %v", err)
				return
			}
			mPGState := master.PGState
			// A diverged timeline cannot be followed; a full resync is the
			// only way back.
			if p.isDifferentTimelineBranch(mPGState, pgState) {
				if err := p.fullResync(master, initialized, started); err != nil {
					log.Errorf("failed to full resync from master: %v", err)
					return
				}
			}

			// TODO(sgotti) Check that the master has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			if !curConnParams.Equals(newConnParams) {
				log.Infof("master connection parameters changed. Reconfiguring...")
				log.Infof("following %s with connection url %s", memberRole.Follow, replConnString)
				err = pgm.BecomeStandby(replConnString)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
				err = pgm.Restart(true)
				if err != nil {
					log.Errorf("err: %v", err)
					return
				}
			}
		}
	}
	// Record the cluster view version we converged to.
	if err := p.saveCVVersion(cv.Version); err != nil {
		log.Errorf("err: %v", err)
		return
	}
}
// Do executes the HTTP action against this client's endpoint, honoring both
// the caller's ctx and the client's optional header timeout. The round trip
// runs in its own goroutine so it can be canceled; the body is also read in
// a goroutine so a ctx cancellation can abort a slow body read.
func (c *simpleHTTPClient) Do(ctx context.Context, act httpAction) (*http.Response, []byte, error) {
	req := act.HTTPRequest(c.endpoint)

	if err := printcURL(req); err != nil {
		return nil, nil, err
	}

	// hctx bounds only the wait for response headers; when headerTimeout is
	// unset it simply mirrors ctx.
	var hctx context.Context
	var hcancel context.CancelFunc
	if c.headerTimeout > 0 {
		hctx, hcancel = context.WithTimeout(ctx, c.headerTimeout)
	} else {
		hctx, hcancel = context.WithCancel(ctx)
	}
	defer hcancel()

	reqcancel := requestCanceler(c.transport, req)

	rtchan := make(chan roundTripResponse, 1)
	go func() {
		resp, err := c.transport.RoundTrip(req)
		rtchan <- roundTripResponse{resp: resp, err: err}
		close(rtchan)
	}()

	var resp *http.Response
	var err error

	select {
	case rtresp := <-rtchan:
		resp, err = rtresp.resp, rtresp.err
	case <-hctx.Done():
		// cancel and wait for request to actually exit before continuing
		reqcancel()
		rtresp := <-rtchan
		resp = rtresp.resp
		switch {
		case ctx.Err() != nil:
			err = ctx.Err()
		case hctx.Err() != nil:
			err = fmt.Errorf("client: endpoint %s exceeded header timeout", c.endpoint.String())
		default:
			panic("failed to get error from context")
		}
	}

	// always check for resp nil-ness to deal with possible
	// race conditions between channels above
	defer func() {
		if resp != nil {
			resp.Body.Close()
		}
	}()

	if err != nil {
		return nil, nil, err
	}

	// Read the body in a goroutine so a ctx cancellation can interrupt a
	// slow read by closing the body.
	var body []byte
	done := make(chan struct{})
	go func() {
		body, err = ioutil.ReadAll(resp.Body)
		done <- struct{}{}
	}()

	select {
	case <-ctx.Done():
		resp.Body.Close()
		// Wait for the reader goroutine to finish so body/err are not
		// written after we return.
		<-done
		return nil, nil, ctx.Err()
	case <-done:
	}

	return resp, body, err
}
func (s *Sentinel) clusterSentinelCheck(pctx context.Context) { s.updateMutex.Lock() defer s.updateMutex.Unlock() e := s.e cd, res, err := e.GetClusterData() if err != nil { log.Errorf("error retrieving cluster data: %v", err) return } var prevCDIndex uint64 if res != nil { prevCDIndex = res.Node.ModifiedIndex } var cv *cluster.ClusterView var keepersState cluster.KeepersState if cd == nil { cv = cluster.NewClusterView() keepersState = nil } else { cv = cd.ClusterView keepersState = cd.KeepersState } log.Debugf(spew.Sprintf("keepersState: %#v", keepersState)) log.Debugf(spew.Sprintf("clusterView: %#v", cv)) // Update cluster config // This shouldn't need a lock s.clusterConfig = cv.Config.ToConfig() if err := s.setSentinelInfo(2 * s.clusterConfig.SleepInterval); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } // TODO(sgotti) better ways to calculate leaseTTL? leaseTTL := s.clusterConfig.SleepInterval + s.clusterConfig.RequestTimeout*4 ctx, cancel := context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersDiscoveryInfo, err := s.discover(ctx) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersDiscoveryInfo: %#v", keepersDiscoveryInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersInfo, err := getKeepersInfo(ctx, keepersDiscoveryInfo) cancel() if err != nil { log.Errorf("err: %v", err) return } log.Debugf(spew.Sprintf("keepersInfo: %#v", keepersInfo)) ctx, cancel = context.WithTimeout(pctx, s.clusterConfig.RequestTimeout) keepersPGState := getKeepersPGState(ctx, keepersInfo) cancel() log.Debugf(spew.Sprintf("keepersPGState: %#v", keepersPGState)) var l lease.Lease if isLeader(s.l, s.id) { log.Infof("I'm the sentinels leader") l = renewLeadership(s.l, leaseTTL) } else { log.Infof("trying to acquire sentinels leadership") l = acquireLeadership(s.lManager, s.id, 1, leaseTTL) } // log all leadership changes if l != nil && s.l == nil && l.MachineID() != 
s.id { log.Infof("sentinel leader is %s", l.MachineID()) } else if l != nil && s.l != nil && l.MachineID() != l.MachineID() { log.Infof("sentinel leadership changed from %s to %s", l.MachineID(), l.MachineID()) } s.l = l if !isLeader(s.l, s.id) { return } if err := s.setLeaderSentinelInfo(leaseTTL); err != nil { log.Errorf("cannot update leader sentinel info: %v", err) return } if cv.Version == 0 { // Cluster first initialization newcv := cluster.NewClusterView() newcv.Version = 1 _, err = e.SetClusterData(nil, newcv, 0) if err != nil { log.Errorf("error saving clusterdata: %v", err) } return } newKeepersState := s.updateKeepersState(keepersState, keepersInfo, keepersPGState) log.Debugf(spew.Sprintf("newKeepersState: %#v", newKeepersState)) newcv, err := s.updateClusterView(cv, newKeepersState) if err != nil { log.Errorf("failed to update clusterView: %v", err) return } log.Debugf(spew.Sprintf("newcv: %#v", newcv)) if cv.Version < newcv.Version { log.Debugf("newcv changed from previous cv") } _, err = e.SetClusterData(newKeepersState, newcv, prevCDIndex) if err != nil { log.Errorf("error saving clusterdata: %v", err) } }
// ctx returns a context that expires after the configured request timeout.
//
// NOTE(review): the CancelFunc returned by WithTimeout is discarded (go
// vet's lostcancel flags this), so the context's timer resources are only
// released when the timeout fires and callers cannot cancel early — confirm
// this is acceptable or change the signature to also return the cancel func.
func (r *etcdLeaseManager) ctx() context.Context {
	ctx, _ := context.WithTimeout(context.Background(), r.reqTimeout)
	return ctx
}