func testPut() roachpb.BatchRequest { var ba roachpb.BatchRequest ba.Timestamp = testTS put := &roachpb.PutRequest{} put.Key = testKey ba.Add(put) return ba }
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and abort cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new abort cache entry // * obtaining the transaction for a abort cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // abort cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the abort cache table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return errors.Errorf("could not find zone config for range %s: %s", repl, err) } gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC, func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) { pushTxn(ctx, gcq.store.DB(), now, txn, typ) }, func(intents []roachpb.Intent, poison bool, wait bool) error { return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait) }) if err != nil { return err } log.VEventf(ctx, 1, "completed with stats %+v", info) info.updateMetrics(gcq.store.metrics) var ba roachpb.BatchRequest var gcArgs roachpb.GCRequest // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() gcArgs.Keys = gcKeys gcArgs.Threshold = info.Threshold gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(&gcArgs) if _, pErr := repl.Send(ctx, ba); pErr != nil { log.ErrEvent(ctx, pErr.String()) return pErr.GoError() } return nil }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request now := replica.store.Clock().Now() reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, ProposedTS: &now, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // We reset our state below regardless of whether we've gotten an error or // not, but note that an error is ambiguous - there's no guarantee that the // transfer will not still apply. That's OK, however, as the "in transfer" // state maintained by the pendingLeaseRequest is not relied on for // correctness (see replica.mu.minLeaseProposedTS), and resetting the state // is beneficial as it'll allow the replica to attempt to transfer again or // extend the existing lease in the future. // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for _, llChan := range p.llChans { // Don't send the same transaction object twice; this can lead to races. if pErr != nil { pErrClone := *pErr pErrClone.SetTxn(pErr.GetTxn()) llChan <- &pErrClone } else { llChan <- nil } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for i, llChan := range p.llChans { // Don't send the same pErr object twice; this can lead to races. We could // clone every time but it's more efficient to send pErr itself to one of // the channels (the last one; if we send it earlier the race can still // happen). if i == len(p.llChans)-1 { llChan <- pErr } else { llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil` } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
// initAndVerifyBatch initializes timestamp-related information and // verifies batch constraints before splitting. func (ds *DistSender) initAndVerifyBatch( ctx context.Context, ba *roachpb.BatchRequest, ) *roachpb.Error { // In the event that timestamp isn't set and read consistency isn't // required, set the timestamp using the local clock. if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) { ba.Timestamp = ds.clock.Now() } if ba.Txn != nil { // Make a copy here since the code below modifies it in different places. // TODO(tschottdorf): be smarter about this - no need to do it for // requests that don't get split. txnClone := ba.Txn.Clone() ba.Txn = &txnClone if len(ba.Txn.ObservedTimestamps) == 0 { // Ensure the local NodeID is marked as free from clock offset; // the transaction's timestamp was taken off the local clock. if nDesc := ds.getNodeDescriptor(); nDesc != nil { // TODO(tschottdorf): future refactoring should move this to txn // creation in TxnCoordSender, which is currently unaware of the // NodeID (and wraps *DistSender through client.Sender since it // also needs test compatibility with *LocalSender). // // Taking care below to not modify any memory referenced from // our BatchRequest which may be shared with others. // // We already have a clone of our txn (see above), so we can // modify it freely. // // Zero the existing data. That makes sure that if we had // something of size zero but with capacity, we don't re-use the // existing space (which others may also use). This is just to // satisfy paranoia/OCD and not expected to matter in practice. ba.Txn.ResetObservedTimestamps() // OrigTimestamp is the HLC timestamp at which the Txn started, so // this effectively means no more uncertainty on this node. ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp) } } } if len(ba.Requests) < 1 { return roachpb.NewErrorf("empty batch") } if ba.MaxSpanRequestKeys != 0 { // Verify that the batch contains only specific range requests or the // Begin/EndTransactionRequest. Verify that a batch with a ReverseScan // only contains ReverseScan range requests. isReverse := ba.IsReverse() for _, req := range ba.Requests { inner := req.GetInner() switch inner.(type) { case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest: // Accepted range requests. All other range requests are still // not supported. // TODO(vivek): don't enumerate all range requests. if isReverse { return roachpb.NewErrorf("batch with limit contains both forward and reverse scans") } case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest: continue default: return roachpb.NewErrorf("batch with limit contains %T request", inner) } } } return nil }
// requestLeaseAsync sends a transfer lease or lease request to the // specified replica. The request is sent in an async task. func (p *pendingLeaseRequest) requestLeaseAsync( repl *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, reqLease roachpb.Lease, status LeaseStatus, leaseReq roachpb.Request, ) error { return repl.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = repl.AnnotateCtx(ctx) var pErr *roachpb.Error // If requesting an epoch-based lease & current state is expired, // potentially heartbeat our own liveness or increment epoch of // prior owner. Note we only do this if the previous lease was // epoch-based. if reqLease.Type() == roachpb.LeaseEpoch && status.state == leaseExpired && status.lease.Type() == roachpb.LeaseEpoch { var err error // If this replica is previous & next lease holder, manually heartbeat to become live. if status.lease.OwnedBy(nextLeaseHolder.StoreID) && repl.store.StoreID() == nextLeaseHolder.StoreID { if err = repl.store.cfg.NodeLiveness.Heartbeat(ctx, status.liveness); err != nil { log.Error(ctx, err) } } else if status.liveness.Epoch == *status.lease.Epoch { // If not owner, increment epoch if necessary to invalidate lease. if err = repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.liveness); err != nil { log.Error(ctx, err) } } // Set error for propagation to all waiters below. if err != nil { pErr = roachpb.NewError(newNotLeaseHolderError(status.lease, repl.store.StoreID(), repl.Desc())) } } // Propose a RequestLease command and wait for it to apply. if pErr == nil { ba := roachpb.BatchRequest{} ba.Timestamp = repl.store.Clock().Now() ba.RangeID = repl.RangeID ba.Add(leaseReq) _, pErr = repl.Send(ctx, ba) } // We reset our state below regardless of whether we've gotten an error or // not, but note that an error is ambiguous - there's no guarantee that the // transfer will not still apply. That's OK, however, as the "in transfer" // state maintained by the pendingLeaseRequest is not relied on for // correctness (see repl.mu.minLeaseProposedTS), and resetting the state // is beneficial as it'll allow the replica to attempt to transfer again or // extend the existing lease in the future. // Send result of lease to all waiter channels. repl.mu.Lock() defer repl.mu.Unlock() for _, llChan := range p.llChans { // Don't send the same transaction object twice; this can lead to races. if pErr != nil { pErrClone := *pErr pErrClone.SetTxn(pErr.GetTxn()) llChan <- &pErrClone } else { llChan <- nil } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) }