func replicaGCShouldQueueImpl(
	now, lastCheck, lastActivity hlc.Timestamp, isCandidate bool,
) (bool, float64) {
	timeout := ReplicaGCQueueInactivityThreshold
	priority := replicaGCPriorityDefault

	if isCandidate {
		// If the range is a candidate (which happens if its former replica set
		// ignores it), let it expire much earlier.
		timeout = ReplicaGCQueueCandidateTimeout
		priority = replicaGCPriorityCandidate
	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
		// Return false immediately if the previous check was less than the
		// check interval in the past. Note that we don't do this if the
		// replica is in candidate state, in which case we want to be more
		// aggressive - a failed rebalance attempt could have checked this
		// range, and candidate state suggests that a retry succeeded. See
		// #7489.
		return false, 0
	}

	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)
	if !shouldQ {
		return false, 0
	}

	return shouldQ, priority
}
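
// Illustrative sketch (not part of the package above): the same queueing
// decision expressed over time.Time values, with hypothetical threshold
// constants standing in for ReplicaGCQueueInactivityThreshold and
// ReplicaGCQueueCandidateTimeout. Requires only the standard library ("time").
const (
	exampleInactivityThreshold = 10 * 24 * time.Hour // hypothetical value
	exampleCandidateTimeout    = time.Second         // hypothetical value
)

func exampleShouldQueue(now, lastCheck, lastActivity time.Time, isCandidate bool) bool {
	timeout := exampleInactivityThreshold
	if isCandidate {
		// Candidates (replicas ignored by their former replica set) expire much
		// earlier.
		timeout = exampleCandidateTimeout
	} else if now.Before(lastCheck.Add(exampleInactivityThreshold)) {
		// Non-candidates are only re-examined once per inactivity threshold.
		return false
	}
	return lastActivity.Add(timeout).Before(now)
}
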
// leaseStatus returns the lease status. If the lease is epoch-based,
// the liveness field will be set to the liveness used to compute
// its state, unless state == leaseError.
//
// - The lease is considered valid if the timestamp is covered by the
//   supplied lease. This is determined differently depending on the
//   lease properties. For expiration-based leases, the timestamp is
//   covered if it's less than the expiration (minus the maximum
//   clock offset). For epoch-based "node liveness" leases, the lease
//   epoch must match the owner node's liveness epoch -AND- the
//   timestamp must be within the node's liveness expiration (also
//   minus the maximum clock offset).
//
//   To be valid, a lease which contains a valid ProposedTS must have
//   a proposed timestamp greater than the minimum proposed timestamp,
//   which prevents a restarted process from serving commands, since
//   the command queue has been wiped through the restart.
//
// - The lease is considered in stasis if the timestamp is within the
//   maximum clock offset window of the lease expiration.
//
// - The lease is considered expired in all other cases.
//
// The maximum clock offset must always be taken into consideration to
// avoid a failure of linearizability on a single register during
// lease changes. Without that stasis period, the following could
// occur:
//
// * a range lease gets committed on the new lease holder (but not the old).
// * client proposes and commits a write on the new lease holder (with a
//   timestamp just greater than the expiration of the old lease).
// * client tries to read what it wrote, but hits a slow coordinator
//   (which assigns a timestamp covered by the old lease).
// * the read is served by the old lease holder (which has not
//   processed the change in lease holdership).
// * the client fails to read their own write.
func (r *Replica) leaseStatus(
	lease *roachpb.Lease, timestamp, minProposedTS hlc.Timestamp,
) LeaseStatus {
	status := LeaseStatus{timestamp: timestamp, lease: lease}
	if lease == nil {
		status.state = leaseExpired
		return status
	}
	var expiration hlc.Timestamp
	if lease.Type() == roachpb.LeaseExpiration {
		expiration = lease.Expiration
	} else {
		var err error
		status.liveness, err = r.store.cfg.NodeLiveness.GetLiveness(lease.Replica.NodeID)
		if err != nil || status.liveness.Epoch < *lease.Epoch {
			// If lease validity can't be determined (e.g. gossip is down
			// and liveness info isn't available for owner), we can neither
			// use the lease nor do we want to attempt to acquire it.
			status.state = leaseError
			return status
		}
		if status.liveness.Epoch > *lease.Epoch {
			status.state = leaseExpired
			return status
		}
		expiration = status.liveness.Expiration
	}
	stasis := expiration.Add(-int64(r.store.Clock().MaxOffset()), 0)
	if timestamp.Less(stasis) {
		status.state = leaseValid
		// If the replica owns the lease, additionally verify that the lease's
		// proposed timestamp is not earlier than the min proposed timestamp.
		if lease.Replica.StoreID == r.store.StoreID() &&
			lease.ProposedTS != nil && lease.ProposedTS.Less(minProposedTS) {
			status.state = leaseProscribed
		}
	} else if timestamp.Less(expiration) {
		status.state = leaseStasis
	} else {
		status.state = leaseExpired
	}
	return status
}
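
// Illustrative sketch (not part of the package above): classifying a request
// timestamp against a lease expiration, using the standard library ("time")
// instead of hlc. The stasis window [expiration-maxOffset, expiration) exists
// so that a read whose timestamp was assigned by a node that still believes in
// the old lease holder cannot miss a write already committed under the new
// lease.
type exampleLeaseState int

const (
	exampleLeaseValid exampleLeaseState = iota
	exampleLeaseStasis
	exampleLeaseExpired
)

func exampleClassifyLease(ts, expiration time.Time, maxOffset time.Duration) exampleLeaseState {
	stasis := expiration.Add(-maxOffset)
	switch {
	case ts.Before(stasis):
		return exampleLeaseValid
	case ts.Before(expiration):
		return exampleLeaseStasis
	default:
		return exampleLeaseExpired
	}
}
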
// NewTransaction creates a new transaction. The transaction key is
// composed using the specified baseKey (for locality with data
// affected by the transaction) and a random ID to guarantee
// uniqueness. The specified user-level priority is combined with a
// randomly chosen value to yield a final priority, used to settle
// write conflicts in a way that avoids starvation of long-running
// transactions (see Replica.PushTxn).
func NewTransaction(
	name string,
	baseKey Key,
	userPriority UserPriority,
	isolation enginepb.IsolationType,
	now hlc.Timestamp,
	maxOffset int64,
) *Transaction {
	u := uuid.MakeV4()

	return &Transaction{
		TxnMeta: enginepb.TxnMeta{
			Key:       baseKey,
			ID:        &u,
			Isolation: isolation,
			Timestamp: now,
			Priority:  MakePriority(userPriority),
			Sequence:  1,
		},
		Name:          name,
		OrigTimestamp: now,
		MaxTimestamp:  now.Add(maxOffset, 0),
	}
}
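
// Illustrative sketch (not part of the package above): how a user-level
// priority can be mixed with randomness so that conflicting transactions don't
// systematically starve one another. The scaling below is hypothetical; the
// real MakePriority uses its own distribution. Requires "math/rand".
func exampleMakePriority(userPriority float64) int32 {
	if userPriority <= 0 {
		userPriority = 1
	}
	// Draw a random base priority, then scale it by the user priority: a
	// transaction with twice the user priority tends to win pushes more often,
	// but any transaction can still come out ahead on a lucky draw.
	return int32(float64(rand.Int31n(100000)) * userPriority)
}
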
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := replica.store.Clock().Now()
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
		ProposedTS:  &now,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)
		// We reset our state below regardless of whether we've gotten an error or
		// not, but note that an error is ambiguous - there's no guarantee that the
		// transfer will not still apply. That's OK, however, as the "in transfer"
		// state maintained by the pendingLeaseRequest is not relied on for
		// correctness (see replica.mu.minLeaseProposedTS), and resetting the state
		// is beneficial as it'll allow the replica to attempt to transfer again or
		// extend the existing lease in the future.
		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for _, llChan := range p.llChans {
			// Don't send the same transaction object twice; this can lead to races.
			if pErr != nil {
				pErrClone := *pErr
				pErrClone.SetTxn(pErr.GetTxn())
				llChan <- &pErrClone
			} else {
				llChan <- nil
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
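
// Illustrative sketch (not part of the package above): the "init or join"
// pattern used by InitOrJoinRequest, reduced to its core. The first caller
// starts the asynchronous work; later callers register a buffered channel and
// wait for the single result to be fanned out under the same mutex. Requires
// "sync".
type exampleCoalescer struct {
	mu      sync.Mutex
	pending bool
	waiters []chan error
}

func (c *exampleCoalescer) initOrJoin(do func() error) <-chan error {
	c.mu.Lock()
	defer c.mu.Unlock()
	ch := make(chan error, 1) // buffered so the fan-out never blocks
	c.waiters = append(c.waiters, ch)
	if !c.pending {
		c.pending = true
		go func() {
			err := do()
			c.mu.Lock()
			defer c.mu.Unlock()
			for _, w := range c.waiters {
				w <- err
			}
			c.waiters = c.waiters[:0]
			c.pending = false
		}()
	}
	return ch
}
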
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)
		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- pErr
			} else {
				llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
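
// Illustrative sketch (not part of the package above): fanning one result out
// to several waiters without sharing a mutable object. All but the last waiter
// receive a copy; the last receives the original, which saves one clone while
// still preventing two receivers from mutating the same value. A nil result
// needs no cloning at all.
type exampleResult struct{ msg string }

func exampleFanOut(waiters []chan *exampleResult, res *exampleResult) {
	for i, w := range waiters {
		if res == nil || i == len(waiters)-1 {
			w <- res
			continue
		}
		clone := *res // shallow copy; sufficient for a value with no shared pointers
		w <- &clone
	}
}
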
// RunGC runs garbage collection for the specified descriptor on the provided
// Engine (which is not mutated). It uses the provided functions pushTxnFn and
// resolveIntentsFn to clarify the true status of and clean up after encountered
// transactions. It returns a slice of gc'able keys from the data, transaction,
// and abort spans.
func RunGC(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	snap engine.Reader,
	now hlc.Timestamp,
	policy config.GCPolicy,
	pushTxnFn pushFunc,
	resolveIntentsFn resolveFunc,
) ([]roachpb.GCRequest_GCKey, GCInfo, error) {
	iter := NewReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()

	var infoMu = lockableGCInfo{}
	infoMu.Policy = policy
	infoMu.Now = now

	{
		realResolveIntentsFn := resolveIntentsFn
		resolveIntentsFn = func(intents []roachpb.Intent, poison bool, wait bool) (err error) {
			defer func() {
				infoMu.Lock()
				infoMu.ResolveTotal += len(intents)
				if err == nil {
					infoMu.ResolveSuccess += len(intents)
				}
				infoMu.Unlock()
			}()
			return realResolveIntentsFn(intents, poison, wait)
		}
		realPushTxnFn := pushTxnFn
		pushTxnFn = func(ts hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			infoMu.Lock()
			infoMu.PushTxn++
			infoMu.Unlock()
			realPushTxnFn(ts, txn, typ)
		}
	}

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()
	abortSpanGCThreshold := now.Add(-int64(abortCacheAgeThreshold), 0)

	gc := engine.MakeGarbageCollector(now, policy)
	infoMu.Threshold = gc.Threshold
	infoMu.TxnSpanGCThreshold = txnExp

	var gcKeys []roachpb.GCRequest_GCKey
	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &enginepb.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf(ctx, "unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						infoMu.IntentsConsidered++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(hlc.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcKeys = append(gcKeys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return nil, GCInfo{}, iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	infoMu.IntentTxns = len(txnMap)
	infoMu.NumKeysAffected = len(gcKeys)

	txnKeys, err := processTransactionTable(ctx, snap, desc, txnMap, txnExp, &infoMu, resolveIntentsFn)
	if err != nil {
		return nil, GCInfo{}, err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcKeys = append(gcKeys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	sem := make(chan struct{}, gcTaskLimit)
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		sem <- struct{}{}
		// Avoid passing loop variable into closure.
		txnCopy := txn
		go func() {
			defer func() {
				<-sem
				wg.Done()
			}()
			pushTxnFn(now, txnCopy, roachpb.PUSH_ABORT)
		}()
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}

	// resolveIntentsFn takes (intents, poison, wait): don't poison the abort
	// cache, but do wait for resolution to complete.
	if err := resolveIntentsFn(intents, false /* poison */, true /* wait */); err != nil {
		return nil, GCInfo{}, err
	}

	// Clean up the abort cache.
	gcKeys = append(gcKeys, processAbortCache(
		ctx, snap, desc.RangeID, abortSpanGCThreshold, &infoMu, pushTxnFn)...)
	return gcKeys, infoMu.GCInfo, nil
}
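
// Illustrative sketch (not part of the package above): the bounded-parallelism
// pattern used above for pushing transactions. A buffered channel acts as a
// counting semaphore and a WaitGroup waits for all workers to drain. The limit
// is hypothetical, standing in for gcTaskLimit's role. Requires "sync".
const exampleTaskLimit = 25 // hypothetical limit

func exampleForEachLimited(items []string, work func(string)) {
	var wg sync.WaitGroup
	sem := make(chan struct{}, exampleTaskLimit)
	for _, it := range items {
		wg.Add(1)
		sem <- struct{}{} // blocks once exampleTaskLimit workers are in flight
		// Avoid capturing the loop variable in the goroutine below.
		it := it
		go func() {
			defer func() {
				<-sem
				wg.Done()
			}()
			work(it)
		}()
	}
	wg.Wait()
}
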