func (r *Replica) leasePostApply( ctx context.Context, newLease *roachpb.Lease, replicaID roachpb.ReplicaID, prevLease *roachpb.Lease, ) { iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID if leaseChangingHands && iAmTheLeaseHolder { // If this replica is a new holder of the lease, update the low water // mark of the timestamp cache. Note that clock offset scenarios are // handled via a stasis period inherent in the lease which is documented // in on the Lease struct. // // The introduction of lease transfers implies that the previous lease // may have been shortened and we are now applying a formally overlapping // lease (since the old lease holder has promised not to serve any more // requests, this is kosher). This means that we don't use the old // lease's expiration but instead use the new lease's start to initialize // the timestamp cache low water. log.Infof(ctx, "new range lease %s following %s [physicalTime=%s]", newLease, prevLease, r.store.Clock().PhysicalTime()) r.mu.Lock() r.mu.tsCache.SetLowWater(newLease.Start) r.mu.Unlock() // Gossip the first range whenever its lease is acquired. We check to // make sure the lease is active so that a trailing replica won't process // an old lease request and attempt to gossip the first range. if r.IsFirstRange() && newLease.Covers(r.store.Clock().Now()) { r.gossipFirstRange(ctx) } } if leaseChangingHands && !iAmTheLeaseHolder { // We're not the lease holder, reset our timestamp cache, releasing // anything currently cached. The timestamp cache is only used by the // lease holder. Note that we'll call SetLowWater when we next acquire // the lease. r.mu.Lock() r.mu.tsCache.Clear(r.store.Clock().Now()) r.mu.Unlock() } if !iAmTheLeaseHolder && newLease.Covers(r.store.Clock().Now()) { // If this replica is the raft leader but it is not the new lease holder, // then try to transfer the raft leadership to match the lease. 
We like it // when leases and raft leadership are collocated because that facilitates // quick command application (requests generally need to make it to both the // lease holder and the raft leader before being applied by other replicas). // // TODO(andrei): We want to do this attempt when a lease changes hands, and // then periodically check that the collocation is fine. So we keep checking // it here on lease extensions, which happen periodically, but that's pretty // arbitrary. There might be a more natural place elsewhere where this // periodic check should happen. r.maybeTransferRaftLeadership(ctx, replicaID, newLease.Replica.ReplicaID) } }
// leaseStatus returns lease status. If the lease is epoch-based,
// the liveness field will be set to the liveness used to compute
// its state, unless state == leaseError.
//
// - The lease is considered valid if the timestamp is covered by the
//   supplied lease. This is determined differently depending on the
//   lease properties. For expiration-based leases, the timestamp is
//   covered if it's less than the expiration (minus the maximum
//   clock offset). For epoch-based "node liveness" leases, the lease
//   epoch must match the owner node's liveness epoch -AND- the
//   timestamp must be within the node's liveness expiration (also
//   minus the maximum clock offset).
//
//   To be valid, a lease which contains a valid ProposedTS must have
//   a proposed timestamp greater than the minimum proposed timestamp,
//   which prevents a restarted process from serving commands, since
//   the command queue has been wiped through the restart.
//
// - The lease is considered in stasis if the timestamp is within the
//   maximum clock offset window of the lease expiration.
//
// - The lease is considered expired in all other cases.
//
// The maximum clock offset must always be taken into consideration to
// avoid a failure of linearizability on a single register during
// lease changes. Without that stasis period, the following could
// occur:
//
// * a range lease gets committed on the new lease holder (but not the old).
// * client proposes and commits a write on new lease holder (with a
//   timestamp just greater than the expiration of the old lease).
// * client tries to read what it wrote, but hits a slow coordinator
//   (which assigns a timestamp covered by the old lease).
// * the read is served by the old lease holder (which has not
//   processed the change in lease holdership).
// * the client fails to read their own write.
func (r *Replica) leaseStatus(
	lease *roachpb.Lease, timestamp, minProposedTS hlc.Timestamp,
) LeaseStatus {
	status := LeaseStatus{timestamp: timestamp, lease: lease}
	if lease == nil {
		// No lease at all is treated like an expired one.
		status.state = leaseExpired
		return status
	}
	// Determine the effective expiration: the lease's own expiration for
	// expiration-based leases, or the owner's liveness expiration for
	// epoch-based leases.
	var expiration hlc.Timestamp
	if lease.Type() == roachpb.LeaseExpiration {
		expiration = lease.Expiration
	} else {
		var err error
		status.liveness, err = r.store.cfg.NodeLiveness.GetLiveness(lease.Replica.NodeID)
		if err != nil || status.liveness.Epoch < *lease.Epoch {
			// If lease validity can't be determined (e.g. gossip is down
			// and liveness info isn't available for owner), we can neither
			// use the lease nor do we want to attempt to acquire it.
			status.state = leaseError
			return status
		}
		if status.liveness.Epoch > *lease.Epoch {
			// The owner's liveness epoch has advanced past the lease's epoch,
			// so the lease can no longer be valid.
			status.state = leaseExpired
			return status
		}
		expiration = status.liveness.Expiration
	}
	// The stasis period is the max-clock-offset window immediately before
	// expiration; see the comment above for why it exists.
	stasis := expiration.Add(-int64(r.store.Clock().MaxOffset()), 0)
	if timestamp.Less(stasis) {
		status.state = leaseValid
		// If the replica owns the lease, additionally verify that the lease's
		// proposed timestamp is not earlier than the min proposed timestamp
		// (guards against a restarted process serving on a stale lease).
		if lease.Replica.StoreID == r.store.StoreID() && lease.ProposedTS != nil &&
			lease.ProposedTS.Less(minProposedTS) {
			status.state = leaseProscribed
		}
	} else if timestamp.Less(expiration) {
		status.state = leaseStasis
	} else {
		status.state = leaseExpired
	}
	return status
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	repl *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	status LeaseStatus,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress: it names a different replica
		// as the next lease holder. Report the conflict to the caller.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := repl.store.Clock().Now()
	// The requested lease starts at the evaluation timestamp and records the
	// current clock reading as its proposal timestamp (checked against
	// minProposedTS in leaseStatus).
	reqLease := roachpb.Lease{
		Start:      status.timestamp,
		Replica:    nextLeaseHolder,
		ProposedTS: &now,
	}
	if repl.requiresExpiringLease() {
		// Expiration-based lease: valid for the configured active duration.
		reqLease.Expiration = status.timestamp.Add(
			int64(repl.store.cfg.RangeLeaseActiveDuration), 0)
	} else {
		// Get the liveness for the next lease holder and set the epoch in the lease request.
		liveness, err := repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID)
		if err != nil {
			llChan <- roachpb.NewErrorf("couldn't request lease for %+v: %v", nextLeaseHolder, err)
			return llChan
		}
		reqLease.Epoch = proto.Int64(liveness.Epoch)
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	}
	if err := p.requestLeaseAsync(repl, nextLeaseHolder, reqLease, status, leaseReq); err != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, repl.store.StoreID(), repl.mu.state.Desc))
		return llChan
	}
	// TODO(andrei): document this subtlety.
	// NOTE(review): llChans/nextLease are mutated here without an explicit
	// lock; presumably the caller holds repl.mu, which requestLeaseAsync also
	// takes before draining llChans — TODO confirm against callers.
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
// requestLeaseAsync sends a transfer lease or lease request to the
// specified replica. The request is sent in an async task. On task
// completion (success or failure), the result is delivered to every
// channel in p.llChans and the pending-request state is reset.
func (p *pendingLeaseRequest) requestLeaseAsync(
	repl *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	reqLease roachpb.Lease,
	status LeaseStatus,
	leaseReq roachpb.Request,
) error {
	return repl.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = repl.AnnotateCtx(ctx)
		var pErr *roachpb.Error
		// If requesting an epoch-based lease & current state is expired,
		// potentially heartbeat our own liveness or increment epoch of
		// prior owner. Note we only do this if the previous lease was
		// epoch-based.
		if reqLease.Type() == roachpb.LeaseEpoch && status.state == leaseExpired &&
			status.lease.Type() == roachpb.LeaseEpoch {
			var err error
			// If this replica is previous & next lease holder, manually heartbeat to become live.
			if status.lease.OwnedBy(nextLeaseHolder.StoreID) &&
				repl.store.StoreID() == nextLeaseHolder.StoreID {
				if err = repl.store.cfg.NodeLiveness.Heartbeat(ctx, status.liveness); err != nil {
					log.Error(ctx, err)
				}
			} else if status.liveness.Epoch == *status.lease.Epoch {
				// If not owner, increment epoch if necessary to invalidate lease.
				if err = repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.liveness); err != nil {
					log.Error(ctx, err)
				}
			}
			// Set error for propagation to all waiters below. The heartbeat or
			// epoch increment failed, so the lease cannot be (re)acquired now.
			if err != nil {
				pErr = roachpb.NewError(newNotLeaseHolderError(status.lease, repl.store.StoreID(), repl.Desc()))
			}
		}
		// Propose a RequestLease command and wait for it to apply.
		if pErr == nil {
			ba := roachpb.BatchRequest{}
			ba.Timestamp = repl.store.Clock().Now()
			ba.RangeID = repl.RangeID
			ba.Add(leaseReq)
			_, pErr = repl.Send(ctx, ba)
		}
		// We reset our state below regardless of whether we've gotten an error or
		// not, but note that an error is ambiguous - there's no guarantee that the
		// transfer will not still apply. That's OK, however, as the "in transfer"
		// state maintained by the pendingLeaseRequest is not relied on for
		// correctness (see repl.mu.minLeaseProposedTS), and resetting the state
		// is beneficial as it'll allow the replica to attempt to transfer again or
		// extend the existing lease in the future.
		// Send result of lease to all waiter channels.
		repl.mu.Lock()
		defer repl.mu.Unlock()
		for _, llChan := range p.llChans {
			// Don't send the same transaction object twice; this can lead to races.
			if pErr != nil {
				pErrClone := *pErr
				pErrClone.SetTxn(pErr.GetTxn())
				llChan <- &pErrClone
			} else {
				llChan <- nil
			}
		}
		// Reset pending-request state; the buffered channels have all been
		// serviced, so no waiter is left behind.
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	})
}
func (r *Replica) leasePostApply( ctx context.Context, newLease *roachpb.Lease, replicaID roachpb.ReplicaID, prevLease *roachpb.Lease, ) { iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID if leaseChangingHands && iAmTheLeaseHolder { // If this replica is a new holder of the lease, update the low water // mark of the timestamp cache. Note that clock offset scenarios are // handled via a stasis period inherent in the lease which is documented // in on the Lease struct. // // The introduction of lease transfers implies that the previous lease // may have been shortened and we are now applying a formally overlapping // lease (since the old lease holder has promised not to serve any more // requests, this is kosher). This means that we don't use the old // lease's expiration but instead use the new lease's start to initialize // the timestamp cache low water. if log.V(1) { log.Infof(ctx, "new range lease %s following %s [physicalTime=%s]", newLease, prevLease, r.store.Clock().PhysicalTime()) } r.mu.Lock() r.mu.tsCache.SetLowWater(newLease.Start) r.mu.Unlock() // Gossip the first range whenever its lease is acquired. We check to // make sure the lease is active so that a trailing replica won't process // an old lease request and attempt to gossip the first range. if r.IsFirstRange() && r.IsLeaseValid(newLease, r.store.Clock().Now()) { r.gossipFirstRange(ctx) } } if leaseChangingHands && !iAmTheLeaseHolder { // We're not the lease holder, reset our timestamp cache, releasing // anything currently cached. The timestamp cache is only used by the // lease holder. Note that we'll call SetLowWater when we next acquire // the lease. 
r.mu.Lock() r.mu.tsCache.Clear(r.store.Clock().Now()) r.mu.Unlock() } if !iAmTheLeaseHolder && r.IsLeaseValid(newLease, r.store.Clock().Now()) { // If this replica is the raft leader but it is not the new lease holder, // then try to transfer the raft leadership to match the lease. We like it // when leases and raft leadership are collocated because that facilitates // quick command application (requests generally need to make it to both the // lease holder and the raft leader before being applied by other replicas). // Note that this condition is also checked periodically when computing // replica metrics. r.maybeTransferRaftLeadership(ctx, newLease.Replica.ReplicaID) } // Notify the store that a lease change occurred and it may need to // gossip the updated store descriptor (with updated capacity). if leaseChangingHands && (prevLease.OwnedBy(r.store.StoreID()) || newLease.OwnedBy(r.store.StoreID())) { r.store.maybeGossipOnCapacityChange(ctx, leaseChangeEvent) } }