// sendRPC sends one or more RPCs to replicas from the supplied // roachpb.Replica slice. Returns an RPC error if the request could // not be sent. Note that the reply may contain a higher level error // and must be checked in addition to the RPC error. // // The replicas are assumed to be ordered by preference, with closer // ones (i.e. expected lowest latency) first. func (ds *DistSender) sendRPC( ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, error) { if len(replicas) == 0 { return nil, roachpb.NewSendError( fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID)) } // TODO(pmattis): This needs to be tested. If it isn't set we'll // still route the request appropriately by key, but won't receive // RangeNotFoundErrors. ba.RangeID = rangeID // Set RPC opts with stipulation that one of N RPCs must succeed. rpcOpts := SendOptions{ ctx: ctx, SendNextTimeout: ds.sendNextTimeout, transportFactory: ds.transportFactory, } tracing.AnnotateTrace() defer tracing.AnnotateTrace() reply, err := ds.sendToReplicas(rpcOpts, rangeID, replicas, ba, ds.rpcContext) if err != nil { return nil, err } return reply, nil }
// sendRPC sends one or more RPCs to replicas from the supplied // roachpb.Replica slice. Returns an RPC error if the request could // not be sent. Note that the reply may contain a higher level error // and must be checked in addition to the RPC error. // // The replicas are assumed to be ordered by preference, with closer // ones (i.e. expected lowest latency) first. func (ds *DistSender) sendRPC( ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, error) { if len(replicas) == 0 { return nil, roachpb.NewSendError( fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID)) } // TODO(pmattis): This needs to be tested. If it isn't set we'll // still route the request appropriately by key, but won't receive // RangeNotFoundErrors. ba.RangeID = rangeID // A given RPC may generate retries to multiple replicas, but as soon as we // get a response from one we want to cancel those other RPCs. ctx, cancel := context.WithCancel(ctx) defer cancel() // Set RPC opts with stipulation that one of N RPCs must succeed. rpcOpts := SendOptions{ SendNextTimeout: ds.sendNextTimeout, transportFactory: ds.transportFactory, metrics: &ds.metrics, } tracing.AnnotateTrace() defer tracing.AnnotateTrace() reply, err := ds.sendToReplicas(ctx, rpcOpts, rangeID, replicas, ba, ds.rpcContext) if err != nil { return nil, err } return reply, nil }
// Send implements the client.Sender interface. The store is looked up from the // store map if specified by the request; otherwise, the command is being // executed locally, and the replica is determined via lookup through each // store's LookupRange method. The latter path is taken only by unit tests. func (ls *Stores) Send( ctx context.Context, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, *roachpb.Error) { // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. if ba.RangeID == 0 || ba.Replica.StoreID == 0 { rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err) } rangeID, repDesc, err := ls.LookupReplica(rs.Key, rs.EndKey) if err != nil { return nil, roachpb.NewError(err) } ba.RangeID = rangeID ba.Replica = repDesc } store, err := ls.GetStore(ba.Replica.StoreID) if err != nil { return nil, roachpb.NewError(err) } if ba.Txn != nil { // For calls that read data within a txn, we keep track of timestamps // observed from the various participating nodes' HLC clocks. If we have // a timestamp on file for this Node which is smaller than MaxTimestamp, // we can lower MaxTimestamp accordingly. If MaxTimestamp drops below // OrigTimestamp, we effectively can't see uncertainty restarts any // more. // Note that it's not an issue if MaxTimestamp propagates back out to // the client via a returned Transaction update - when updating a Txn // from another, the larger MaxTimestamp wins. if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) { // Copy-on-write to protect others we might be sharing the Txn with. shallowTxn := *ba.Txn // The uncertainty window is [OrigTimestamp, maxTS), so if that window // is empty, there won't be any uncertainty restarts. if !ba.Txn.OrigTimestamp.Less(maxTS) { log.Event(ctx, "read has no clock uncertainty") } shallowTxn.MaxTimestamp.Backward(maxTS) ba.Txn = &shallowTxn } } br, pErr := store.Send(ctx, ba) if br != nil && br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(store, br)) } return br, pErr }
// process iterates through all keys in a replica's range, calling the garbage // collector for each key and associated set of values. GC'd keys are batched // into GC calls. Extant intents are resolved if intents are older than // intentAgeThreshold. The transaction and abort cache records are also // scanned and old entries evicted. During normal operation, both of these // records are cleaned up when their respective transaction finishes, so the // amount of work done here is expected to be small. // // Some care needs to be taken to avoid cyclic recreation of entries during GC: // * a Push initiated due to an intent may recreate a transaction entry // * resolving an intent may write a new abort cache entry // * obtaining the transaction for a abort cache entry requires a Push // // The following order is taken below: // 1) collect all intents with sufficiently old txn record // 2) collect these intents' transactions // 3) scan the transaction table, collecting abandoned or completed txns // 4) push all of these transactions (possibly recreating entries) // 5) resolve all intents (unless the txn is still PENDING), which will recreate // abort cache entries (but with the txn timestamp; i.e. likely gc'able) // 6) scan the abort cache table for old entries // 7) push these transactions (again, recreating txn entries). // 8) send a GCRequest. func (gcq *gcQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { snap := repl.store.Engine().NewSnapshot() desc := repl.Desc() defer snap.Close() // Lookup the GC policy for the zone containing this key range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return errors.Errorf("could not find zone config for range %s: %s", repl, err) } gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC, func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) { pushTxn(ctx, gcq.store.DB(), now, txn, typ) }, func(intents []roachpb.Intent, poison bool, wait bool) error { return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait) }) if err != nil { return err } log.VEventf(ctx, 1, "completed with stats %+v", info) info.updateMetrics(gcq.store.metrics) var ba roachpb.BatchRequest var gcArgs roachpb.GCRequest // TODO(tschottdorf): This is one of these instances in which we want // to be more careful that the request ends up on the correct Replica, // and we might have to worry about mixing range-local and global keys // in a batch which might end up spanning Ranges by the time it executes. gcArgs.Key = desc.StartKey.AsRawKey() gcArgs.EndKey = desc.EndKey.AsRawKey() gcArgs.Keys = gcKeys gcArgs.Threshold = info.Threshold gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold // Technically not needed since we're talking directly to the Range. ba.RangeID = desc.RangeID ba.Timestamp = now ba.Add(&gcArgs) if _, pErr := repl.Send(ctx, ba); pErr != nil { log.ErrEvent(ctx, pErr.String()) return pErr.GoError() } return nil }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request now := replica.store.Clock().Now() reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, ProposedTS: &now, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // We reset our state below regardless of whether we've gotten an error or // not, but note that an error is ambiguous - there's no guarantee that the // transfer will not still apply. That's OK, however, as the "in transfer" // state maintained by the pendingLeaseRequest is not relied on for // correctness (see replica.mu.minLeaseProposedTS), and resetting the state // is beneficial as it'll allow the replica to attempt to transfer again or // extend the existing lease in the future. // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for _, llChan := range p.llChans { // Don't send the same transaction object twice; this can lead to races. if pErr != nil { pErrClone := *pErr pErrClone.SetTxn(pErr.GetTxn()) llChan <- &pErrClone } else { llChan <- nil } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for i, llChan := range p.llChans { // Don't send the same pErr object twice; this can lead to races. We could // clone every time but it's more efficient to send pErr itself to one of // the channels (the last one; if we send it earlier the race can still // happen). if i == len(p.llChans)-1 { llChan <- pErr } else { llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil` } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
// requestLeaseAsync sends a transfer lease or lease request to the // specified replica. The request is sent in an async task. func (p *pendingLeaseRequest) requestLeaseAsync( repl *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, reqLease roachpb.Lease, status LeaseStatus, leaseReq roachpb.Request, ) error { return repl.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = repl.AnnotateCtx(ctx) var pErr *roachpb.Error // If requesting an epoch-based lease & current state is expired, // potentially heartbeat our own liveness or increment epoch of // prior owner. Note we only do this if the previous lease was // epoch-based. if reqLease.Type() == roachpb.LeaseEpoch && status.state == leaseExpired && status.lease.Type() == roachpb.LeaseEpoch { var err error // If this replica is previous & next lease holder, manually heartbeat to become live. if status.lease.OwnedBy(nextLeaseHolder.StoreID) && repl.store.StoreID() == nextLeaseHolder.StoreID { if err = repl.store.cfg.NodeLiveness.Heartbeat(ctx, status.liveness); err != nil { log.Error(ctx, err) } } else if status.liveness.Epoch == *status.lease.Epoch { // If not owner, increment epoch if necessary to invalidate lease. if err = repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.liveness); err != nil { log.Error(ctx, err) } } // Set error for propagation to all waiters below. if err != nil { pErr = roachpb.NewError(newNotLeaseHolderError(status.lease, repl.store.StoreID(), repl.Desc())) } } // Propose a RequestLease command and wait for it to apply. if pErr == nil { ba := roachpb.BatchRequest{} ba.Timestamp = repl.store.Clock().Now() ba.RangeID = repl.RangeID ba.Add(leaseReq) _, pErr = repl.Send(ctx, ba) } // We reset our state below regardless of whether we've gotten an error or // not, but note that an error is ambiguous - there's no guarantee that the // transfer will not still apply. That's OK, however, as the "in transfer" // state maintained by the pendingLeaseRequest is not relied on for // correctness (see repl.mu.minLeaseProposedTS), and resetting the state // is beneficial as it'll allow the replica to attempt to transfer again or // extend the existing lease in the future. // Send result of lease to all waiter channels. repl.mu.Lock() defer repl.mu.Unlock() for _, llChan := range p.llChans { // Don't send the same transaction object twice; this can lead to races. if pErr != nil { pErrClone := *pErr pErrClone.SetTxn(pErr.GetTxn()) llChan <- &pErrClone } else { llChan <- nil } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) }