Beispiel #1
0
func (r *Replica) leasePostApply(
	ctx context.Context,
	newLease *roachpb.Lease,
	replicaID roachpb.ReplicaID,
	prevLease *roachpb.Lease,
) {
	iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID
	leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID

	if leaseChangingHands && iAmTheLeaseHolder {
		// If this replica is a new holder of the lease, update the low water
		// mark of the timestamp cache. Note that clock offset scenarios are
		// handled via a stasis period inherent in the lease which is documented
		// in on the Lease struct.
		//
		// The introduction of lease transfers implies that the previous lease
		// may have been shortened and we are now applying a formally overlapping
		// lease (since the old lease holder has promised not to serve any more
		// requests, this is kosher). This means that we don't use the old
		// lease's expiration but instead use the new lease's start to initialize
		// the timestamp cache low water.
		log.Infof(ctx, "new range lease %s following %s [physicalTime=%s]",
			newLease, prevLease, r.store.Clock().PhysicalTime())
		r.mu.Lock()
		r.mu.tsCache.SetLowWater(newLease.Start)
		r.mu.Unlock()

		// Gossip the first range whenever its lease is acquired. We check to
		// make sure the lease is active so that a trailing replica won't process
		// an old lease request and attempt to gossip the first range.
		if r.IsFirstRange() && newLease.Covers(r.store.Clock().Now()) {
			r.gossipFirstRange(ctx)
		}
	}
	if leaseChangingHands && !iAmTheLeaseHolder {
		// We're not the lease holder, reset our timestamp cache, releasing
		// anything currently cached. The timestamp cache is only used by the
		// lease holder. Note that we'll call SetLowWater when we next acquire
		// the lease.
		r.mu.Lock()
		r.mu.tsCache.Clear(r.store.Clock().Now())
		r.mu.Unlock()
	}

	if !iAmTheLeaseHolder && newLease.Covers(r.store.Clock().Now()) {
		// If this replica is the raft leader but it is not the new lease holder,
		// then try to transfer the raft leadership to match the lease. We like it
		// when leases and raft leadership are collocated because that facilitates
		// quick command application (requests generally need to make it to both the
		// lease holder and the raft leader before being applied by other replicas).
		//
		// TODO(andrei): We want to do this attempt when a lease changes hands, and
		// then periodically check that the collocation is fine. So we keep checking
		// it here on lease extensions, which happen periodically, but that's pretty
		// arbitrary. There might be a more natural place elsewhere where this
		// periodic check should happen.
		r.maybeTransferRaftLeadership(ctx, replicaID, newLease.Replica.ReplicaID)
	}
}
// leaseStatus returns lease status. If the lease is epoch-based,
// the liveness field will be set to the liveness used to compute
// its state, unless state == leaseError.
//
// - The lease is considered valid if the timestamp is covered by the
//   supplied lease. This is determined differently depending on the
//   lease properties. For expiration-based leases, the timestamp is
//   covered if it's less than the expiration (minus the maximum
//   clock offset). For epoch-based "node liveness" leases, the lease
//   epoch must match the owner node's liveness epoch -AND- the
//   timestamp must be within the node's liveness expiration (also
//   minus the maximum clock offset).
//
//   To be valid, a lease which contains a valid ProposedTS must have
//   a proposed timestamp greater than the minimum proposed timestamp,
//   which prevents a restarted process from serving commands, since
//   the command queue has been wiped through the restart.
//
// - The lease is considered in stasis if the timestamp is within the
//   maximum clock offset window of the lease expiration.
//
// - The lease is considered expired in all other cases.
//
// The maximum clock offset must always be taken into consideration to
// avoid a failure of linearizability on a single register during
// lease changes. Without that stasis period, the following could
// occur:
//
// * a range lease gets committed on the new lease holder (but not the old).
// * client proposes and commits a write on new lease holder (with a
//   timestamp just greater than the expiration of the old lease).
// * client tries to read what it wrote, but hits a slow coordinator
//   (which assigns a timestamp covered by the old lease).
// * the read is served by the old lease holder (which has not
//   processed the change in lease holdership).
// * the client fails to read their own write.
func (r *Replica) leaseStatus(
	lease *roachpb.Lease, timestamp, minProposedTS hlc.Timestamp,
) LeaseStatus {
	status := LeaseStatus{timestamp: timestamp, lease: lease}
	if lease == nil {
		status.state = leaseExpired
		return status
	}
	var expiration hlc.Timestamp
	if lease.Type() == roachpb.LeaseExpiration {
		expiration = lease.Expiration
	} else {
		var err error
		status.liveness, err = r.store.cfg.NodeLiveness.GetLiveness(lease.Replica.NodeID)
		if err != nil || status.liveness.Epoch < *lease.Epoch {
			// If lease validity can't be determined (e.g. gossip is down
			// and liveness info isn't available for owner), we can neither
			// use the lease nor do we want to attempt to acquire it.
			status.state = leaseError
			return status
		}
		if status.liveness.Epoch > *lease.Epoch {
			status.state = leaseExpired
			return status
		}
		expiration = status.liveness.Expiration
	}
	stasis := expiration.Add(-int64(r.store.Clock().MaxOffset()), 0)
	if timestamp.Less(stasis) {
		status.state = leaseValid
		// If the replica owns the lease, additional verify that the lease's
		// proposed timestamp is not earlier than the min proposed timestamp.
		if lease.Replica.StoreID == r.store.StoreID() &&
			lease.ProposedTS != nil && lease.ProposedTS.Less(minProposedTS) {
			status.state = leaseProscribed
		}
	} else if timestamp.Less(expiration) {
		status.state = leaseStasis
	} else {
		status.state = leaseExpired
	}
	return status
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	repl *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	status LeaseStatus,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := repl.store.Clock().Now()
	reqLease := roachpb.Lease{
		Start:      status.timestamp,
		Replica:    nextLeaseHolder,
		ProposedTS: &now,
	}

	if repl.requiresExpiringLease() {
		reqLease.Expiration = status.timestamp.Add(int64(repl.store.cfg.RangeLeaseActiveDuration), 0)
	} else {
		// Get the liveness for the next lease holder and set the epoch in the lease request.
		liveness, err := repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID)
		if err != nil {
			llChan <- roachpb.NewErrorf("couldn't request lease for %+v: %v", nextLeaseHolder, err)
			return llChan
		}
		reqLease.Epoch = proto.Int64(liveness.Epoch)
	}

	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	}

	if err := p.requestLeaseAsync(repl, nextLeaseHolder, reqLease, status, leaseReq); err != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, repl.store.StoreID(), repl.mu.state.Desc))
		return llChan
	}
	// TODO(andrei): document this subtlety.
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
// requestLeaseAsync sends a transfer lease or lease request to the
// specified replica. The request is sent in an async task.
func (p *pendingLeaseRequest) requestLeaseAsync(
	repl *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	reqLease roachpb.Lease,
	status LeaseStatus,
	leaseReq roachpb.Request,
) error {
	return repl.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = repl.AnnotateCtx(ctx)
		var pErr *roachpb.Error

		// If requesting an epoch-based lease & current state is expired,
		// potentially heartbeat our own liveness or increment epoch of
		// prior owner. Note we only do this if the previous lease was
		// epoch-based.
		if reqLease.Type() == roachpb.LeaseEpoch && status.state == leaseExpired &&
			status.lease.Type() == roachpb.LeaseEpoch {
			var err error
			// If this replica is previous & next lease holder, manually heartbeat to become live.
			if status.lease.OwnedBy(nextLeaseHolder.StoreID) &&
				repl.store.StoreID() == nextLeaseHolder.StoreID {
				if err = repl.store.cfg.NodeLiveness.Heartbeat(ctx, status.liveness); err != nil {
					log.Error(ctx, err)
				}
			} else if status.liveness.Epoch == *status.lease.Epoch {
				// If not owner, increment epoch if necessary to invalidate lease.
				if err = repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.liveness); err != nil {
					log.Error(ctx, err)
				}
			}
			// Set error for propagation to all waiters below.
			if err != nil {
				pErr = roachpb.NewError(newNotLeaseHolderError(status.lease, repl.store.StoreID(), repl.Desc()))
			}
		}

		// Propose a RequestLease command and wait for it to apply.
		if pErr == nil {
			ba := roachpb.BatchRequest{}
			ba.Timestamp = repl.store.Clock().Now()
			ba.RangeID = repl.RangeID
			ba.Add(leaseReq)
			_, pErr = repl.Send(ctx, ba)
		}
		// We reset our state below regardless of whether we've gotten an error or
		// not, but note that an error is ambiguous - there's no guarantee that the
		// transfer will not still apply. That's OK, however, as the "in transfer"
		// state maintained by the pendingLeaseRequest is not relied on for
		// correctness (see repl.mu.minLeaseProposedTS), and resetting the state
		// is beneficial as it'll allow the replica to attempt to transfer again or
		// extend the existing lease in the future.

		// Send result of lease to all waiter channels.
		repl.mu.Lock()
		defer repl.mu.Unlock()
		for _, llChan := range p.llChans {
			// Don't send the same transaction object twice; this can lead to races.
			if pErr != nil {
				pErrClone := *pErr
				pErrClone.SetTxn(pErr.GetTxn())
				llChan <- &pErrClone
			} else {
				llChan <- nil
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	})
}
func (r *Replica) leasePostApply(
	ctx context.Context,
	newLease *roachpb.Lease,
	replicaID roachpb.ReplicaID,
	prevLease *roachpb.Lease,
) {
	iAmTheLeaseHolder := newLease.Replica.ReplicaID == replicaID
	leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID

	if leaseChangingHands && iAmTheLeaseHolder {
		// If this replica is a new holder of the lease, update the low water
		// mark of the timestamp cache. Note that clock offset scenarios are
		// handled via a stasis period inherent in the lease which is documented
		// in on the Lease struct.
		//
		// The introduction of lease transfers implies that the previous lease
		// may have been shortened and we are now applying a formally overlapping
		// lease (since the old lease holder has promised not to serve any more
		// requests, this is kosher). This means that we don't use the old
		// lease's expiration but instead use the new lease's start to initialize
		// the timestamp cache low water.
		if log.V(1) {
			log.Infof(ctx, "new range lease %s following %s [physicalTime=%s]",
				newLease, prevLease, r.store.Clock().PhysicalTime())
		}
		r.mu.Lock()
		r.mu.tsCache.SetLowWater(newLease.Start)
		r.mu.Unlock()

		// Gossip the first range whenever its lease is acquired. We check to
		// make sure the lease is active so that a trailing replica won't process
		// an old lease request and attempt to gossip the first range.
		if r.IsFirstRange() && r.IsLeaseValid(newLease, r.store.Clock().Now()) {
			r.gossipFirstRange(ctx)
		}
	}
	if leaseChangingHands && !iAmTheLeaseHolder {
		// We're not the lease holder, reset our timestamp cache, releasing
		// anything currently cached. The timestamp cache is only used by the
		// lease holder. Note that we'll call SetLowWater when we next acquire
		// the lease.
		r.mu.Lock()
		r.mu.tsCache.Clear(r.store.Clock().Now())
		r.mu.Unlock()
	}

	if !iAmTheLeaseHolder && r.IsLeaseValid(newLease, r.store.Clock().Now()) {
		// If this replica is the raft leader but it is not the new lease holder,
		// then try to transfer the raft leadership to match the lease. We like it
		// when leases and raft leadership are collocated because that facilitates
		// quick command application (requests generally need to make it to both the
		// lease holder and the raft leader before being applied by other replicas).
		// Note that this condition is also checked periodically when computing
		// replica metrics.
		r.maybeTransferRaftLeadership(ctx, newLease.Replica.ReplicaID)
	}

	// Notify the store that a lease change occurred and it may need to
	// gossip the updated store descriptor (with updated capacity).
	if leaseChangingHands && (prevLease.OwnedBy(r.store.StoreID()) ||
		newLease.OwnedBy(r.store.StoreID())) {
		r.store.maybeGossipOnCapacityChange(ctx, leaseChangeEvent)
	}
}