// JoinRequest adds one more waiter to the currently pending request.
// It is the caller's responsibility to ensure that there is a pending request,
// and that the request is compatible with whatever the caller is currently
// wanting to do (i.e. the request is naming the intended node as the next
// lease holder).
func (p *pendingLeaseRequest) JoinRequest() <-chan *roachpb.Error {
	llChan := make(chan *roachpb.Error, 1)
	if len(p.llChans) == 0 {
		llChan <- roachpb.NewErrorf("no request in progress")
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	return llChan
}
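// For illustration, a minimal caller-side sketch of the intended pattern
// (mirroring InitOrJoinRequest further below): a waiter only joins when a
// request is already pending and names the replica it wants as the next
// lease holder. It assumes p and nextLeaseHolder are in scope and that the
// same locking the real callers hold is in effect.
if nextLease, ok := p.RequestPending(); ok &&
	nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
	llChan := p.JoinRequest()
	// The shared result of the in-flight RequestLease command will be
	// delivered on llChan once the pending request completes.
	_ = llChan
}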
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example revscan [a,g), first desc lookup for "g"
		// returns descriptor [c,d) -> [d,g) is never scanned.
		// We evict and retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
// TestInconsistentReads tests that the methods that generate inconsistent reads
// generate outgoing requests with an INCONSISTENT read consistency.
func TestInconsistentReads(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// Mock out DistSender's sender function to check the read consistency for
	// outgoing BatchRequests and return an empty reply.
	var senderFn client.SenderFunc
	senderFn = func(_ context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
		if ba.ReadConsistency != roachpb.INCONSISTENT {
			return nil, roachpb.NewErrorf("BatchRequest has unexpected ReadConsistency %s",
				ba.ReadConsistency)
		}
		return ba.CreateReply(), nil
	}
	db := client.NewDB(senderFn)
	ctx := context.TODO()

	prepInconsistent := func() *client.Batch {
		b := &client.Batch{}
		b.Header.ReadConsistency = roachpb.INCONSISTENT
		return b
	}

	// Perform inconsistent reads through the mocked sender function.
	{
		key := roachpb.Key([]byte("key"))
		b := prepInconsistent()
		b.Get(key)
		if err := db.Run(ctx, b); err != nil {
			t.Fatal(err)
		}
	}

	{
		b := prepInconsistent()
		key1 := roachpb.Key([]byte("key1"))
		key2 := roachpb.Key([]byte("key2"))
		b.Scan(key1, key2)
		if err := db.Run(ctx, b); err != nil {
			t.Fatal(err)
		}
	}

	{
		key := roachpb.Key([]byte("key"))
		b := &client.Batch{}
		b.Header.ReadConsistency = roachpb.INCONSISTENT
		b.Get(key)
		if err := db.Run(ctx, b); err != nil {
			t.Fatal(err)
		}
	}
}
// handleRaftRequest proxies a request to the listening server interface.
func (t *RaftTransport) handleRaftRequest(
	ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
) *roachpb.Error {
	t.recvMu.Lock()
	handler, ok := t.recvMu.handlers[req.ToReplica.StoreID]
	t.recvMu.Unlock()

	if !ok {
		return roachpb.NewErrorf("unable to accept Raft message from %+v: no handler registered for %+v",
			req.FromReplica, req.ToReplica)
	}

	return handler.HandleRaftRequest(ctx, req, respStream)
}
func (s channelServer) HandleRaftRequest(
	ctx context.Context, req *storage.RaftMessageRequest, _ storage.RaftMessageResponseStream,
) *roachpb.Error {
	if s.maxSleep != 0 {
		// maxSleep simulates goroutine scheduling delays that could
		// result in messages being processed out of order (in previous
		// transport implementations).
		time.Sleep(time.Duration(rand.Int63n(int64(s.maxSleep))))
	}
	if s.brokenRange != 0 && s.brokenRange == req.RangeID {
		return roachpb.NewErrorf(channelServerBrokenRangeMessage)
	}
	s.ch <- req
	return nil
}
func (db *DB) prepareToSend(ba *roachpb.BatchRequest) *roachpb.Error {
	if ba.ReadConsistency == roachpb.INCONSISTENT {
		for _, ru := range ba.Requests {
			req := ru.GetInner()
			if req.Method() != roachpb.Get &&
				req.Method() != roachpb.Scan &&
				req.Method() != roachpb.ReverseScan {
				return roachpb.NewErrorf("method %s not allowed with INCONSISTENT batch", req.Method())
			}
		}
	}
	if db.ctx.UserPriority != 1 {
		ba.UserPriority = db.ctx.UserPriority
	}
	tracing.AnnotateTrace()
	return nil
}
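// For illustration, a minimal client-side sketch of the rule enforced above,
// using the client.Batch helpers seen in TestInconsistentReads; db and ctx
// are assumed to be in scope, and the commented-out Put shows the kind of
// request an INCONSISTENT batch may not contain.
b := &client.Batch{}
b.Header.ReadConsistency = roachpb.INCONSISTENT
b.Get(roachpb.Key("k"))                    // allowed: Get
b.Scan(roachpb.Key("a"), roachpb.Key("z")) // allowed: Scan
// b.Put(roachpb.Key("k"), "v")            // a write would be rejected by prepareToSend
if err := db.Run(ctx, b); err != nil {
	// handle the error
}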
// TestTxnRequestTxnTimestamp verifies response txn timestamp is
// always upgraded on successive requests.
func TestTxnRequestTxnTimestamp(t *testing.T) {
	defer leaktest.AfterTest(t)()
	makeTS := func(walltime int64, logical int32) hlc.Timestamp {
		return hlc.ZeroTimestamp.Add(walltime, logical)
	}
	ba := testPut()

	testCases := []struct {
		expRequestTS, responseTS hlc.Timestamp
	}{
		{makeTS(0, 0), makeTS(10, 0)},
		{makeTS(10, 0), makeTS(10, 1)},
		{makeTS(10, 1), makeTS(10, 0)},
		{makeTS(10, 1), makeTS(20, 1)},
		{makeTS(20, 1), makeTS(20, 1)},
		{makeTS(20, 1), makeTS(0, 0)},
		{makeTS(20, 1), makeTS(20, 1)},
	}

	var testIdx int
	db := NewDB(newTestSender(nil, func(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
		test := testCases[testIdx]
		if !test.expRequestTS.Equal(ba.Txn.Timestamp) {
			return nil, roachpb.NewErrorf("%d: expected ts %s got %s",
				testIdx, test.expRequestTS, ba.Txn.Timestamp)
		}
		br := &roachpb.BatchResponse{}
		br.Txn = &roachpb.Transaction{}
		br.Txn.Update(ba.Txn) // copy
		br.Txn.Timestamp = test.responseTS
		return br, nil
	}))
	txn := NewTxn(context.Background(), *db)

	for testIdx = range testCases {
		if _, pErr := txn.sendInternal(ba); pErr != nil {
			t.Fatal(pErr)
		}
	}
}
// TestSetPriority verifies that the batch UserPriority is correctly set
// depending on the transaction priority.
func TestSetPriority(t *testing.T) {
	defer leaktest.AfterTest(t)()
	var expected roachpb.UserPriority
	db := NewDB(newTestSender(
		func(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
			if ba.UserPriority != expected {
				pErr := roachpb.NewErrorf("Priority not set correctly in the batch! "+
					"(expected: %s, value: %s)", expected, ba.UserPriority)
				return nil, pErr
			}
			br := &roachpb.BatchResponse{}
			br.Txn = &roachpb.Transaction{}
			br.Txn.Update(ba.Txn) // copy
			return br, nil
		}, nil))

	// Verify the normal priority setting path.
	expected = roachpb.HighUserPriority
	txn := NewTxn(context.Background(), *db)
	if err := txn.SetUserPriority(expected); err != nil {
		t.Fatal(err)
	}
	if _, pErr := txn.sendInternal(roachpb.BatchRequest{}); pErr != nil {
		t.Fatal(pErr)
	}

	// Verify the internal (fixed value) priority setting path.
	expected = roachpb.UserPriority(-13)
	txn = NewTxn(context.Background(), *db)
	txn.InternalSetPriority(13)
	if _, pErr := txn.sendInternal(roachpb.BatchRequest{}); pErr != nil {
		t.Fatal(pErr)
	}
}
// Test that an error encountered by a read-only "NonKV" command is not
// swallowed, and doesn't otherwise cause a panic.
// We had a bug caused by the fact that errors for these commands aren't passed
// through the epilogue returned by replica.beginCommands() and were getting
// swallowed.
func TestErrorHandlingForNonKVCommand(t *testing.T) {
	defer leaktest.AfterTest(t)()
	cmdFilter := func(fArgs storagebase.FilterArgs) *roachpb.Error {
		if fArgs.Hdr.UserPriority == 42 {
			return roachpb.NewErrorf("injected error")
		}
		return nil
	}
	srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{
		Knobs: base.TestingKnobs{
			Store: &storage.StoreTestingKnobs{
				TestingCommandFilter: cmdFilter,
			},
		},
	})
	s := srv.(*server.TestServer)
	defer s.Stopper().Stop()

	// Send the lease request.
	key := roachpb.Key("a")
	leaseReq := roachpb.LeaseInfoRequest{
		Span: roachpb.Span{
			Key: key,
		},
	}
	_, pErr := client.SendWrappedWith(
		context.Background(),
		s.DistSender(),
		roachpb.Header{UserPriority: 42},
		&leaseReq,
	)
	if !testutils.IsPError(pErr, "injected error") {
		t.Fatalf("expected error %q, got: %s", "injected error", pErr)
	}
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := replica.store.Clock().Now()
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
		ProposedTS:  &now,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)
		// We reset our state below regardless of whether we've gotten an error or
		// not, but note that an error is ambiguous - there's no guarantee that the
		// transfer will not still apply. That's OK, however, as the "in transfer"
		// state maintained by the pendingLeaseRequest is not relied on for
		// correctness (see replica.mu.minLeaseProposedTS), and resetting the state
		// is beneficial as it'll allow the replica to attempt to transfer again or
		// extend the existing lease in the future.

		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for _, llChan := range p.llChans {
			// Don't send the same transaction object twice; this can lead to races.
			if pErr != nil {
				pErrClone := *pErr
				pErrClone.SetTxn(pErr.GetTxn())
				llChan <- &pErrClone
			} else {
				llChan <- nil
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	repl *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	status LeaseStatus,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := repl.store.Clock().Now()
	reqLease := roachpb.Lease{
		Start:      status.timestamp,
		Replica:    nextLeaseHolder,
		ProposedTS: &now,
	}
	if repl.requiresExpiringLease() {
		reqLease.Expiration = status.timestamp.Add(int64(repl.store.cfg.RangeLeaseActiveDuration), 0)
	} else {
		// Get the liveness for the next lease holder and set the epoch in the lease request.
		liveness, err := repl.store.cfg.NodeLiveness.GetLiveness(nextLeaseHolder.NodeID)
		if err != nil {
			llChan <- roachpb.NewErrorf("couldn't request lease for %+v: %v", nextLeaseHolder, err)
			return llChan
		}
		reqLease.Epoch = proto.Int64(liveness.Epoch)
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:      reqSpan,
			Lease:     reqLease,
			PrevLease: status.lease,
		}
	}
	if err := p.requestLeaseAsync(repl, nextLeaseHolder, reqLease, status, leaseReq); err != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, repl.store.StoreID(), repl.mu.state.Desc))
		return llChan
	}
	// TODO(andrei): document this subtlety.
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp := opentracing.SpanFromContext(ctx)
	var tracer opentracing.Tracer
	if sp == nil {
		tracer = tc.AmbientContext.Tracer
		sp = tracer.StartSpan(opTxnCoordSender)
		defer sp.Finish()
		ctx = opentracing.ContextWithSpan(ctx, sp)
	} else {
		tracer = sp.Tracer()
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		txnID := *ba.Txn.ID
		// Associate the txnID with the trace. We need to do this after the
		// maybeBeginTxn call. We set both a baggage item and a tag because only
		// tags show up in the Lightstep UI.
		txnIDStr := txnID.String()
		sp.SetTag("txnID", txnIDStr)
		sp.SetBaggageItem("txnID", txnIDStr)

		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[txnID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			// We can't pass in a batch response here to better limit the key
			// spans as we don't know what is going to be affected. This will
			// affect queries such as `DELETE FROM my.table LIMIT 10` when
			// executed as a 1PC transaction. e.g.: a (BeginTransaction,
			// DeleteRange, EndTransaction) batch.
			ba.IntentSpanIterate(nil, func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			var distinct bool
			// The request might already be used by an outgoing goroutine, so
			// we can't safely mutate anything in-place (as MergeSpans does).
			et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...)
			et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans)
			ba.Header.DistinctSpans = distinct && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Eventf(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
			}
		}
	}

	// Embed the trace metadata into the header for use by RPC recipients. We need
	// to do this after the maybeBeginTxn call above.
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.TraceContext == nil {
		ba.TraceContext = &tracing.SpanContextCarrier{}
	} else {
		// We didn't make this object but are about to mutate it, so we
		// have to take a copy - the original might already have been
		// passed to the RPC layer.
		ba.TraceContext = protoutil.Clone(ba.TraceContext).(*tracing.SpanContextCarrier)
	}
	if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.TraceContext); err != nil {
		return nil, roachpb.NewError(err)
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, startNS, ba, br, pErr); pErr != nil {
			log.Eventf(ctx, "error: %s", pErr)
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability",
					br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}
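// Worked example of the linearizability wait above, using hypothetical
// numbers (the MaxOffset and timestamps are illustrative, not from a real
// run): with a 500ms MaxOffset and a commit timestamp 100ms older than the
// time Send() started, startNS is lowered to the commit timestamp, so the
// coordinator sleeps for 500ms minus the time already elapsed since the
// commit timestamp before returning to the client.
const maxOffset = 500 * time.Millisecond
commitWall := time.Now().Add(-100 * time.Millisecond) // stand-in for the commit timestamp
sleep := maxOffset - time.Since(commitWall)
if sleep > 0 {
	time.Sleep(sleep)
}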
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we backoff and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead as long as they are different
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)
		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- pErr
			} else {
				llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
// initAndVerifyBatch initializes timestamp-related information and
// verifies batch constraints before splitting.
func (ds *DistSender) initAndVerifyBatch(
	ctx context.Context, ba *roachpb.BatchRequest,
) *roachpb.Error {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil {
		// Make a copy here since the code below modifies it in different places.
		// TODO(tschottdorf): be smarter about this - no need to do it for
		// requests that don't get split.
		txnClone := ba.Txn.Clone()
		ba.Txn = &txnClone

		if len(ba.Txn.ObservedTimestamps) == 0 {
			// Ensure the local NodeID is marked as free from clock offset;
			// the transaction's timestamp was taken off the local clock.
			if nDesc := ds.getNodeDescriptor(); nDesc != nil {
				// TODO(tschottdorf): future refactoring should move this to txn
				// creation in TxnCoordSender, which is currently unaware of the
				// NodeID (and wraps *DistSender through client.Sender since it
				// also needs test compatibility with *LocalSender).
				//
				// Taking care below to not modify any memory referenced from
				// our BatchRequest which may be shared with others.
				//
				// We already have a clone of our txn (see above), so we can
				// modify it freely.
				//
				// Zero the existing data. That makes sure that if we had
				// something of size zero but with capacity, we don't re-use the
				// existing space (which others may also use). This is just to
				// satisfy paranoia/OCD and not expected to matter in practice.
				ba.Txn.ResetObservedTimestamps()
				// OrigTimestamp is the HLC timestamp at which the Txn started, so
				// this effectively means no more uncertainty on this node.
				ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp)
			}
		}
	}

	if len(ba.Requests) < 1 {
		return roachpb.NewErrorf("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 {
		// Verify that the batch contains only specific range requests or the
		// Begin/EndTransactionRequest. Verify that a batch with a ReverseScan
		// only contains ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
				// Accepted range requests. All other range requests are still
				// not supported.
				// TODO(vivek): don't enumerate all range requests.
				if isReverse {
					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}

			case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
				continue

			default:
				return roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}

	return nil
}
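// For illustration, a minimal sketch of a limited batch that passes the
// verification above: a single forward ScanRequest with MaxSpanRequestKeys
// set. The keys and the direct call to initAndVerifyBatch are hypothetical
// and assume package-internal access to a DistSender ds and a context ctx.
ba := roachpb.BatchRequest{}
ba.MaxSpanRequestKeys = 10
ba.Add(&roachpb.ScanRequest{
	Span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")},
})
if pErr := ds.initAndVerifyBatch(ctx, &ba); pErr != nil {
	// A mixed forward/reverse batch, or any request type other than those
	// accepted above, would be rejected here when a limit is set.
}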
// errNo1PCTxn indicates that a batch cannot be sent as a 1 phase
// commit because it spans multiple ranges and must be split into at
// least two parts, with the final part containing the EndTransaction
// request.
var errNo1PCTxn = roachpb.NewErrorf("cannot send 1PC txn to multiple ranges")

// Send implements the batch.Sender interface. It subdivides the Batch
// into batches admissible for sending (preventing certain illegal
// mixtures of requests), executes each individual part (which may
// span multiple ranges), and recombines the response.
//
// When the request spans ranges, it is split by range and a partial
// subset of the batch request is sent to affected ranges in parallel.
//
// The first write in a transaction may not arrive before writes to
// other ranges. This is relevant in the case of a BeginTransaction
// request. Intents written to other ranges before the transaction
// record is created will cause the transaction to abort early.
func (ds *DistSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
// processWriteIntentError tries to push the conflicting
// transaction(s) responsible for the given WriteIntentError, and to
// resolve those intents if possible. Returns a new error to be used
// in place of the original.
//
// The returned error may be a copy of the original WriteIntentError,
// with or without the Resolved flag set, which governs the client's
// retry behavior (if the transaction is pushed, the Resolved flag is
// set to tell the client to retry immediately; otherwise it is false
// to cause the client to back off).
func (ir *intentResolver) processWriteIntentError(
	ctx context.Context,
	wiPErr *roachpb.Error,
	args roachpb.Request,
	h roachpb.Header,
	pushType roachpb.PushTxnType,
) *roachpb.Error {
	wiErr, ok := wiPErr.GetDetail().(*roachpb.WriteIntentError)
	if !ok {
		return roachpb.NewErrorf("not a WriteIntentError: %v", wiPErr)
	}

	if log.V(6) {
		log.Infof(ctx, "resolving write intent %s", wiErr)
	}

	method := args.Method()
	readOnly := roachpb.IsReadOnly(args) // TODO(tschottdorf): pass as param

	resolveIntents, pushErr := ir.maybePushTransactions(ctx, wiErr.Intents, h, pushType, false)

	if resErr := ir.resolveIntents(ctx, resolveIntents,
		false /* !wait */, pushType == roachpb.PUSH_ABORT /* poison */); resErr != nil {
		// When resolving without waiting, errors should not
		// usually be returned here, although there are some cases
		// when they may be (especially when a test cluster is in
		// the process of shutting down).
		log.Warningf(ctx, "asynchronous resolveIntents failed: %s", resErr)
	}

	if pushErr != nil {
		if log.V(1) {
			log.Infof(ctx, "on %s: %s", method, pushErr)
		}

		if _, isExpected := pushErr.GetDetail().(*roachpb.TransactionPushError); !isExpected {
			// If an unexpected error occurred, make sure it bubbles up to the
			// client. Examples are timeouts and logic errors.
			return pushErr
		}

		// For write/write conflicts within a transaction, propagate the
		// push failure, not the original write intent error. The push
		// failure will instruct the client to restart the transaction
		// with a backoff.
		if h.Txn != nil && h.Txn.ID != nil && !readOnly {
			return pushErr
		}

		// For read/write conflicts, and non-transactional write/write
		// conflicts, return the write intent error which engages
		// backoff/retry (with !Resolved). We don't need to restart the
		// txn, only resend the read with a backoff.
		return wiPErr
	}

	// We pushed all transactions, so tell the client everything's
	// resolved and it can retry immediately.
	wiErr.Resolved = true
	return wiPErr // references wiErr
}
// Error returns the error the iterator encountered, if any. If
// the iterator has not been initialized via Seek(), an error to
// that effect is returned.
func (ri *RangeIterator) Error() *roachpb.Error {
	if !ri.init {
		return roachpb.NewErrorf("range iterator not initialized with Seek()")
	}
	return ri.pErr
}
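// For illustration, a hypothetical usage sketch of the Seek/Error pattern,
// assuming a RangeIterator ri, a context ctx, and a start key are in scope
// (Descending is the scan direction used elsewhere in this code):
ri.Seek(ctx, startKey, Descending)
if pErr := ri.Error(); pErr != nil {
	// Either the descriptor lookup failed after retries, or Seek was
	// never called on this iterator.
	return pErr
}
// The iterator is now positioned at the descriptor covering startKey.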
// send runs the specified calls synchronously in a single batch and
// returns any errors. If the transaction is read-only or has already
// been successfully committed or aborted, a potential trailing
// EndTransaction call is silently dropped, allowing the caller to
// always commit or clean-up explicitly even when that may not be
// required (or even erroneous). Returns (nil, nil) for an empty batch.
func (txn *Txn) send(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if txn.Proto.Status != roachpb.PENDING || txn.IsFinalized() {
		return nil, roachpb.NewErrorf(
			"attempting to use transaction with wrong status or finalized: %s", txn.Proto.Status)
	}

	// It doesn't make sense to use inconsistent reads in a transaction. However,
	// we still need to accept it as a parameter for this to compile.
	if ba.ReadConsistency != roachpb.CONSISTENT {
		return nil, roachpb.NewErrorf("cannot use %s ReadConsistency in txn", ba.ReadConsistency)
	}

	lastIndex := len(ba.Requests) - 1
	if lastIndex < 0 {
		return nil, nil
	}

	// firstWriteIndex is set to the index of the first command which is
	// a transactional write. If != -1, this indicates an intention to
	// write. This is in contrast to txn.Proto.Writing, which is set by
	// the coordinator when the first intent has been created, and which
	// lives for the life of the transaction.
	firstWriteIndex := -1
	var firstWriteKey roachpb.Key

	for i, ru := range ba.Requests {
		args := ru.GetInner()
		if i < lastIndex {
			if _, ok := args.(*roachpb.EndTransactionRequest); ok {
				return nil, roachpb.NewErrorf("%s sent as non-terminal call", args.Method())
			}
		}
		if roachpb.IsTransactionWrite(args) && firstWriteIndex == -1 {
			firstWriteKey = args.Header().Key
			firstWriteIndex = i
		}
	}

	haveTxnWrite := firstWriteIndex != -1
	endTxnRequest, haveEndTxn := ba.Requests[lastIndex].GetInner().(*roachpb.EndTransactionRequest)
	needBeginTxn := !txn.Proto.Writing && haveTxnWrite
	needEndTxn := txn.Proto.Writing || haveTxnWrite
	elideEndTxn := haveEndTxn && !needEndTxn

	// If we're not yet writing in this txn, but intend to, insert a
	// begin transaction request before the first write command.
	if needBeginTxn {
		// If the transaction already has a key (we're in a restart), make
		// sure we set the key in the begin transaction request to the original.
		bt := &roachpb.BeginTransactionRequest{
			Span: roachpb.Span{
				Key: firstWriteKey,
			},
		}
		if txn.Proto.Key != nil {
			bt.Key = txn.Proto.Key
		}
		// Inject the new request before position firstWriteIndex, taking
		// care to avoid unnecessary allocations.
		oldRequests := ba.Requests
		ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests)+1)
		copy(ba.Requests, oldRequests[:firstWriteIndex])
		ba.Requests[firstWriteIndex].MustSetInner(bt)
		copy(ba.Requests[firstWriteIndex+1:], oldRequests[firstWriteIndex:])
	}

	if elideEndTxn {
		ba.Requests = ba.Requests[:lastIndex]
	}

	br, pErr := txn.sendInternal(ba)
	if elideEndTxn && pErr == nil {
		// Check that read only transactions do not violate their deadline. This can NOT
		// happen since the txn deadline is normally updated when it is about to expire
		// or expired. We will just keep the code for safety (see TestReacquireLeaseOnRestart).
		if endTxnRequest.Deadline != nil {
			if endTxnRequest.Deadline.Less(txn.Proto.Timestamp) {
				return nil, roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(), &txn.Proto)
			}
		}
		// This normally happens on the server and is sent back in response
		// headers, but this transaction was optimized away. The caller may
		// still inspect the transaction struct, so we manually update it
		// here to emulate a true transaction.
		if endTxnRequest.Commit {
			txn.Proto.Status = roachpb.COMMITTED
		} else {
			txn.Proto.Status = roachpb.ABORTED
		}
		txn.finalized = true
	}

	// If we inserted a begin transaction request, remove it here.
	if needBeginTxn {
		if br != nil && br.Responses != nil {
			br.Responses = append(br.Responses[:firstWriteIndex], br.Responses[firstWriteIndex+1:]...)
		}
		// Handle case where inserted begin txn confused an indexed error.
		if pErr != nil && pErr.Index != nil {
			idx := pErr.Index.Index
			if idx == int32(firstWriteIndex) {
				// An error was encountered on begin txn; disallow the indexing.
				pErr.Index = nil
			} else if idx > int32(firstWriteIndex) {
				// An error was encountered after begin txn; decrement index.
				pErr.SetErrorIndex(idx - 1)
			}
		}
	}

	return br, pErr
}