// RangeLookup implements the RangeDescriptorDB interface.
// RangeLookup dispatches a RangeLookup request for the given metadata
// key to the replicas of the given range. Note that we allow
// inconsistent reads when doing range lookups for efficiency. Getting
// stale data is not a correctness problem but instead may
// infrequently result in additional latency as additional range
// lookups may be required. Note also that RangeLookup bypasses the
// DistSender's Send() method, so there is no error inspection and
// retry logic here; this is not an issue since the lookup performs a
// single inconsistent read only.
func (ds *DistSender) RangeLookup(
	ctx context.Context, key roachpb.RKey, desc *roachpb.RangeDescriptor, useReverseScan bool,
) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, *roachpb.Error) {
	ba := roachpb.BatchRequest{}
	ba.ReadConsistency = roachpb.INCONSISTENT
	ba.Add(&roachpb.RangeLookupRequest{
		Span: roachpb.Span{
			// We can interpret the RKey as a Key here since it's a metadata
			// lookup; those are never local.
			Key: key.AsRawKey(),
		},
		MaxRanges: ds.rangeLookupMaxRanges,
		Reverse:   useReverseScan,
	})
	replicas := newReplicaSlice(ds.gossip, desc)
	replicas.Shuffle()
	br, err := ds.sendRPC(ctx, desc.RangeID, replicas, ba)
	if err != nil {
		return nil, nil, roachpb.NewError(err)
	}
	if br.Error != nil {
		return nil, nil, br.Error
	}
	resp := br.Responses[0].GetInner().(*roachpb.RangeLookupResponse)
	return resp.Ranges, resp.PrefetchedRanges, nil
}
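// Example (hypothetical): a self-contained sketch of why inconsistent range
// lookups are acceptable - a stale descriptor can never corrupt anything; it
// only forces the caller to look the key up again. The types and the lookup
// helper below are illustrative, not the real roachpb/DistSender machinery.
package main

import "fmt"

type rangeDesc struct{ start, end string } // covers [start, end)

func (d rangeDesc) contains(key string) bool { return d.start <= key && key < d.end }

// lookup stands in for an INCONSISTENT range lookup that may return stale data.
func lookup(key string, stale bool) rangeDesc {
	if stale {
		// Stale answer: the range was [a, m) before a split and this
		// descriptor no longer covers keys >= "g".
		return rangeDesc{"a", "g"}
	}
	if key < "g" {
		return rangeDesc{"a", "g"}
	}
	return rangeDesc{"g", "m"}
}

func descriptorFor(key string) rangeDesc {
	desc := lookup(key, true) // a stale answer costs latency, never correctness
	for !desc.contains(key) {
		desc = lookup(key, false) // retry with a fresh lookup
	}
	return desc
}

func main() {
	fmt.Println(descriptorFor("h")) // {g m}
}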
// sendAndFill is a helper which sends the given batch and fills its results,
// returning the appropriate error which is either from the first failing call,
// or an "internal" error.
func sendAndFill(
	send func(roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error), b *Batch,
) error {
	// Errors here will be attached to the results, so we will get them from
	// the call to fillResults in the regular case in which an individual call
	// fails. But send() also returns its own errors, so there's some dancing
	// here to do because we want to run fillResults() so that the individual
	// result gets initialized with an error from the corresponding call.
	var ba roachpb.BatchRequest
	// TODO(tschottdorf): this nonsensical copy is required since, at least at
	// the time of writing, the chunking and masking in DistSender operate on
	// the original data (as attested to by a whole bunch of test failures).
	ba.Requests = append([]roachpb.RequestUnion(nil), b.reqs...)
	ba.Header = b.Header
	b.response, b.pErr = send(ba)
	if b.pErr != nil {
		// Discard errors from fillResults.
		_ = b.fillResults()
		return b.pErr.GoError()
	}
	if err := b.fillResults(); err != nil {
		b.pErr = roachpb.NewError(err)
		return err
	}
	return nil
}
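// Example (hypothetical): a stripped-down, self-contained sketch of the
// sendAndFill "dance" above - even when send() fails, per-call results are
// still filled so each one carries its own error. None of these types are the
// real client.Batch; they only mirror its shape.
package main

import (
	"errors"
	"fmt"
)

type result struct{ err error }

type toyBatch struct {
	results []result
}

// fillResults initializes one result per request, attaching the batch error.
func (b *toyBatch) fillResults(n int, pErr error) {
	b.results = make([]result, n)
	for i := range b.results {
		b.results[i] = result{err: pErr}
	}
}

func sendAndFill(send func() error, b *toyBatch, numReqs int) error {
	pErr := send()
	// Fill results regardless of the outcome so callers can inspect them.
	b.fillResults(numReqs, pErr)
	return pErr
}

func main() {
	var b toyBatch
	err := sendAndFill(func() error { return errors.New("boom") }, &b, 2)
	fmt.Println(err, len(b.results), b.results[0].err) // boom 2 boom
}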
func testPut() roachpb.BatchRequest {
	var ba roachpb.BatchRequest
	ba.Timestamp = testTS
	put := &roachpb.PutRequest{}
	put.Key = testKey
	ba.Add(put)
	return ba
}
// TestBatchPrevNext tests batch.{Prev,Next}. func TestBatchPrevNext(t *testing.T) { defer leaktest.AfterTest(t)() loc := func(s string) string { return string(keys.RangeDescriptorKey(roachpb.RKey(s))) } span := func(strs ...string) []roachpb.Span { var r []roachpb.Span for i, str := range strs { if i%2 == 0 { r = append(r, roachpb.Span{Key: roachpb.Key(str)}) } else { r[len(r)-1].EndKey = roachpb.Key(str) } } return r } max, min := string(roachpb.RKeyMax), string(roachpb.RKeyMin) abc := span("a", "", "b", "", "c", "") testCases := []struct { spans []roachpb.Span key, expFW, expBW string }{ {spans: span("a", "c", "b", ""), key: "b", expFW: "b", expBW: "b"}, {spans: span("a", "c", "b", ""), key: "a", expFW: "a", expBW: "a"}, {spans: span("a", "c", "d", ""), key: "c", expFW: "d", expBW: "c"}, {spans: span("a", "c\x00", "d", ""), key: "c", expFW: "c", expBW: "c"}, {spans: abc, key: "b", expFW: "b", expBW: "b"}, {spans: abc, key: "b\x00", expFW: "c", expBW: "b\x00"}, {spans: abc, key: "bb", expFW: "c", expBW: "b"}, {spans: span(), key: "whatevs", expFW: max, expBW: min}, {spans: span(loc("a"), loc("c")), key: "c", expFW: "c", expBW: "c"}, {spans: span(loc("a"), loc("c")), key: "c\x00", expFW: max, expBW: "c\x00"}, } for i, test := range testCases { var ba roachpb.BatchRequest for _, span := range test.spans { args := &roachpb.ScanRequest{} args.Key, args.EndKey = span.Key, span.EndKey ba.Add(args) } if next, err := next(ba, roachpb.RKey(test.key)); err != nil { t.Errorf("%d: %v", i, err) } else if !bytes.Equal(next, roachpb.Key(test.expFW)) { t.Errorf("%d: next: expected %q, got %q", i, test.expFW, next) } if prev, err := prev(ba, roachpb.RKey(test.key)); err != nil { t.Errorf("%d: %v", i, err) } else if !bytes.Equal(prev, roachpb.Key(test.expBW)) { t.Errorf("%d: prev: expected %q, got %q", i, test.expBW, prev) } } }
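// Example (hypothetical): a self-contained sketch of the forward "next"
// semantics the test above exercises - the smallest key covered by the batch's
// spans at or after k, defaulting to KeyMax. This is a simplified string-based
// analogue, not the real batch implementation.
package main

import "fmt"

const keyMax = "\xff\xff"

type span struct{ key, endKey string }

func next(spans []span, k string) string {
	candidate := keyMax
	for _, s := range spans {
		end := s.endKey
		if end == "" {
			end = s.key + "\x00" // treat point requests as [key, key\x00)
		}
		if end <= k {
			continue // span lies entirely before k
		}
		start := s.key
		if start < k {
			start = k // k is contained in this span
		}
		if start < candidate {
			candidate = start
		}
	}
	return candidate
}

func main() {
	spans := []span{{"a", ""}, {"b", ""}, {"c", ""}}
	fmt.Printf("%q %q\n", next(spans, "bb"), next(spans, "b")) // "c" "b"
}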
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries)
// 8) send a GCRequest
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Look up the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(ctx, gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})
	if err != nil {
		return err
	}

	log.VEventf(ctx, 1, "completed with stats %+v", info)

	info.updateMetrics(gcq.store.metrics)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold
	gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		log.ErrEvent(ctx, pErr.String())
		return pErr.GoError()
	}
	return nil
}
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// If we aren't given a Replica, we have to bend over backwards a little
	// here. This case applies exclusively to unit tests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		rs, err := keys.Range(ba)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		rangeID, repDesc, err := ls.LookupReplica(rs.Key, rs.EndKey)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		ba.RangeID = rangeID
		ba.Replica = repDesc
	}

	store, err := ls.GetStore(ba.Replica.StoreID)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				log.Event(ctx, "read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
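// Example (hypothetical): a self-contained sketch of the MaxTimestamp clamping
// described above, using plain int64 nanos instead of hlc.Timestamp. If the
// timestamp previously observed from the serving node is below MaxTimestamp,
// the uncertainty window [OrigTimestamp, MaxTimestamp) shrinks accordingly.
// These toy types are illustrative only.
package main

import "fmt"

type toyTxn struct {
	origTimestamp int64
	maxTimestamp  int64
	observed      map[int32]int64 // nodeID -> first timestamp observed from that node
}

// clampMaxTimestamp returns a copy of txn with MaxTimestamp lowered to the
// timestamp observed from nodeID, if that is smaller.
func clampMaxTimestamp(txn toyTxn, nodeID int32) toyTxn {
	if obs, ok := txn.observed[nodeID]; ok && obs < txn.maxTimestamp {
		// Copy-on-write: txn was passed by value, so the caller's copy is untouched.
		if obs <= txn.origTimestamp {
			fmt.Println("read has no clock uncertainty")
		}
		txn.maxTimestamp = obs
	}
	return txn
}

func main() {
	txn := toyTxn{origTimestamp: 100, maxTimestamp: 150, observed: map[int32]int64{1: 120}}
	fmt.Println(clampMaxTimestamp(txn, 1).maxTimestamp) // 120
}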
// TestTxnCoordSenderSingleRoundtripTxn checks that a batch which completely // holds the writing portion of a Txn (including EndTransaction) does not // launch a heartbeat goroutine at all. func TestTxnCoordSenderSingleRoundtripTxn(t *testing.T) { defer leaktest.AfterTest(t)() stopper := stop.NewStopper() manual := hlc.NewManualClock(123) clock := hlc.NewClock(manual.UnixNano, 20*time.Nanosecond) senderFunc := func(_ context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { br := ba.CreateReply() txnClone := ba.Txn.Clone() br.Txn = &txnClone br.Txn.Writing = true return br, nil } ambient := log.AmbientContext{Tracer: tracing.NewTracer()} ts := NewTxnCoordSender( ambient, senderFn(senderFunc), clock, false, stopper, MakeTxnMetrics(metric.TestSampleInterval), ) // Stop the stopper manually, prior to trying the transaction. This has the // effect of returning a NodeUnavailableError for any attempts at launching // a heartbeat goroutine. stopper.Stop() var ba roachpb.BatchRequest key := roachpb.Key("test") ba.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}}) ba.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}}) ba.Add(&roachpb.EndTransactionRequest{}) ba.Txn = &roachpb.Transaction{Name: "test"} _, pErr := ts.Send(context.Background(), ba) if pErr != nil { t.Fatal(pErr) } }
// TestTxnCoordSenderErrorWithIntent validates that if a transactional request // returns an error but also indicates a Writing transaction, the coordinator // tracks it just like a successful request. func TestTxnCoordSenderErrorWithIntent(t *testing.T) { defer leaktest.AfterTest(t)() stopper := stop.NewStopper() defer stopper.Stop() manual := hlc.NewManualClock(0) clock := hlc.NewClock(manual.UnixNano) clock.SetMaxOffset(20) testCases := []struct { roachpb.Error errMsg string }{ {*roachpb.NewError(roachpb.NewTransactionRetryError()), "retry txn"}, {*roachpb.NewError(roachpb.NewTransactionPushError(roachpb.Transaction{ TxnMeta: enginepb.TxnMeta{ ID: uuid.NewV4(), }})), "failed to push"}, {*roachpb.NewErrorf("testError"), "testError"}, } for i, test := range testCases { func() { senderFunc := func(_ context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { txn := ba.Txn.Clone() txn.Writing = true pErr := &roachpb.Error{} *pErr = test.Error pErr.SetTxn(&txn) return nil, pErr } ambient := log.AmbientContext{Tracer: tracing.NewTracer()} ts := NewTxnCoordSender( ambient, senderFn(senderFunc), clock, false, stopper, MakeTxnMetrics(metric.TestSampleInterval), ) var ba roachpb.BatchRequest key := roachpb.Key("test") ba.Add(&roachpb.BeginTransactionRequest{Span: roachpb.Span{Key: key}}) ba.Add(&roachpb.PutRequest{Span: roachpb.Span{Key: key}}) ba.Add(&roachpb.EndTransactionRequest{}) ba.Txn = &roachpb.Transaction{Name: "test"} _, pErr := ts.Send(context.Background(), ba) if !testutils.IsPError(pErr, test.errMsg) { t.Errorf("%d: error did not match %s: %v", i, test.errMsg, pErr) } defer teardownHeartbeats(ts) ts.Lock() defer ts.Unlock() if len(ts.txns) != 1 { t.Errorf("%d: expected transaction to be tracked", i) } }() } }
// sendRPC sends one or more RPCs to replicas from the supplied // roachpb.Replica slice. Returns an RPC error if the request could // not be sent. Note that the reply may contain a higher level error // and must be checked in addition to the RPC error. // // The replicas are assumed to be ordered by preference, with closer // ones (i.e. expected lowest latency) first. func (ds *DistSender) sendRPC( ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, error) { if len(replicas) == 0 { return nil, roachpb.NewSendError( fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID)) } // TODO(pmattis): This needs to be tested. If it isn't set we'll // still route the request appropriately by key, but won't receive // RangeNotFoundErrors. ba.RangeID = rangeID // Set RPC opts with stipulation that one of N RPCs must succeed. rpcOpts := SendOptions{ ctx: ctx, SendNextTimeout: ds.sendNextTimeout, transportFactory: ds.transportFactory, } tracing.AnnotateTrace() defer tracing.AnnotateTrace() reply, err := ds.sendToReplicas(rpcOpts, rangeID, replicas, ba, ds.rpcContext) if err != nil { return nil, err } return reply, nil }
// sendRPC sends one or more RPCs to replicas from the supplied // roachpb.Replica slice. Returns an RPC error if the request could // not be sent. Note that the reply may contain a higher level error // and must be checked in addition to the RPC error. // // The replicas are assumed to be ordered by preference, with closer // ones (i.e. expected lowest latency) first. func (ds *DistSender) sendRPC( ctx context.Context, rangeID roachpb.RangeID, replicas ReplicaSlice, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, error) { if len(replicas) == 0 { return nil, roachpb.NewSendError( fmt.Sprintf("no replica node addresses available via gossip for range %d", rangeID)) } // TODO(pmattis): This needs to be tested. If it isn't set we'll // still route the request appropriately by key, but won't receive // RangeNotFoundErrors. ba.RangeID = rangeID // A given RPC may generate retries to multiple replicas, but as soon as we // get a response from one we want to cancel those other RPCs. ctx, cancel := context.WithCancel(ctx) defer cancel() // Set RPC opts with stipulation that one of N RPCs must succeed. rpcOpts := SendOptions{ SendNextTimeout: ds.sendNextTimeout, transportFactory: ds.transportFactory, metrics: &ds.metrics, } tracing.AnnotateTrace() defer tracing.AnnotateTrace() reply, err := ds.sendToReplicas(ctx, rpcOpts, rangeID, replicas, ba, ds.rpcContext) if err != nil { return nil, err } return reply, nil }
// SendWrappedWith is a convenience function which wraps the request in a batch
// and sends it via the provided Sender and headers. It returns the unwrapped
// response or an error. It's valid to pass a `nil` context; an empty one is
// used in that case.
func SendWrappedWith(
	ctx context.Context, sender Sender, h roachpb.Header, args roachpb.Request,
) (roachpb.Response, *roachpb.Error) {
	ba := roachpb.BatchRequest{}
	ba.Header = h
	ba.Add(args)

	br, pErr := sender.Send(ctx, ba)
	if pErr != nil {
		return nil, pErr
	}
	unwrappedReply := br.Responses[0].GetInner()
	header := unwrappedReply.Header()
	header.Txn = br.Txn
	unwrappedReply.SetHeader(header)
	return unwrappedReply, nil
}
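// Example (hypothetical usage): sending a single GetRequest through a Sender
// via SendWrappedWith and unwrapping the typed response. This sketch assumes
// the surrounding client/roachpb packages and some Sender implementation; the
// helper name getValue and the INCONSISTENT header are illustrative choices,
// not code from the repository.
func getValue(ctx context.Context, sender Sender, key roachpb.Key) (*roachpb.GetResponse, *roachpb.Error) {
	get := &roachpb.GetRequest{}
	get.Key = key
	reply, pErr := SendWrappedWith(ctx, sender, roachpb.Header{
		ReadConsistency: roachpb.INCONSISTENT, // e.g. a best-effort read
	}, get)
	if pErr != nil {
		return nil, pErr
	}
	return reply.(*roachpb.GetResponse), nil
}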
func TestBatchPrevNextWithNoop(t *testing.T) { defer leaktest.AfterTest(t)() leftKey := roachpb.Key("a") middleKey := roachpb.RKey("b") rightKey := roachpb.Key("c") var ba roachpb.BatchRequest ba.Add(&roachpb.GetRequest{Span: roachpb.Span{Key: leftKey}}) ba.Add(&roachpb.NoopRequest{}) ba.Add(&roachpb.GetRequest{Span: roachpb.Span{Key: rightKey}}) t.Run("prev", func(t *testing.T) { rk, err := prev(ba, middleKey) if err != nil { t.Fatal(err) } if !rk.Equal(leftKey) { t.Errorf("got %s, expected %s", rk, leftKey) } }) t.Run("next", func(t *testing.T) { rk, err := next(ba, middleKey) if err != nil { t.Fatal(err) } if !rk.Equal(rightKey) { t.Errorf("got %s, expected %s", rk, rightKey) } }) }
func TestBatchRequestString(t *testing.T) { br := roachpb.BatchRequest{} br.Txn = new(roachpb.Transaction) for i := 0; i < 100; i++ { br.Requests = append(br.Requests, roachpb.RequestUnion{Get: &roachpb.GetRequest{}}) } br.Requests = append(br.Requests, roachpb.RequestUnion{EndTransaction: &roachpb.EndTransactionRequest{}}) e := `[txn: <nil>], Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), ... 76 skipped ..., Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), Get [/Min,/Min), EndTransaction [/Min,/Min)` if e != br.String() { t.Fatalf("e = %s, v = %s", e, br.String()) } }
// tryAsyncAbort (synchronously) grabs a copy of the txn proto and the intents // (which it then clears from txnMeta), and asynchronously tries to abort the // transaction. func (tc *TxnCoordSender) tryAsyncAbort(txnID uuid.UUID) { tc.Lock() txnMeta := tc.txns[txnID] // Clone the intents and the txn to avoid data races. intentSpans, _ := roachpb.MergeSpans(append([]roachpb.Span(nil), txnMeta.keys...)) txnMeta.keys = nil txn := txnMeta.txn.Clone() tc.Unlock() // Since we don't hold the lock continuously, it's possible that two aborts // raced here. That's fine (and probably better than the alternative, which // is missing new intents sometimes). if txn.Status != roachpb.PENDING { return } ba := roachpb.BatchRequest{} ba.Txn = &txn et := &roachpb.EndTransactionRequest{ Span: roachpb.Span{ Key: txn.Key, }, Commit: false, IntentSpans: intentSpans, } ba.Add(et) ctx := tc.AnnotateCtx(context.TODO()) if err := tc.stopper.RunAsyncTask(ctx, func(ctx context.Context) { // Use the wrapped sender since the normal Sender does not allow // clients to specify intents. if _, pErr := tc.wrapped.Send(ctx, ba); pErr != nil { if log.V(1) { log.Warningf(ctx, "abort due to inactivity failed for %s: %s ", txn, pErr) } } }); err != nil { log.Warning(ctx, err) } }
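// Example (hypothetical): a self-contained sketch of what merging intent spans
// accomplishes before the async abort - overlapping or adjacent spans collapse
// into one, shrinking the EndTransaction request. This is a simplified
// string-keyed analogue of roachpb.MergeSpans, not the actual implementation.
package main

import (
	"fmt"
	"sort"
)

type span struct{ key, endKey string }

func mergeSpans(spans []span) []span {
	sort.Slice(spans, func(i, j int) bool { return spans[i].key < spans[j].key })
	var merged []span
	for _, s := range spans {
		if n := len(merged); n > 0 && s.key <= merged[n-1].endKey {
			if s.endKey > merged[n-1].endKey {
				merged[n-1].endKey = s.endKey // extend the previous span
			}
			continue
		}
		merged = append(merged, s)
	}
	return merged
}

func main() {
	fmt.Println(mergeSpans([]span{{"c", "e"}, {"a", "b"}, {"b", "d"}})) // [{a e}]
}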
// TestBatchError verifies that Range returns an error if a request has an invalid range. func TestBatchError(t *testing.T) { testCases := []struct { req [2]string errMsg string }{ { req: [2]string{"\xff\xff\xff\xff", "a"}, errMsg: "must be less than KeyMax", }, { req: [2]string{"a", "\xff\xff\xff\xff"}, errMsg: "must be less than or equal to KeyMax", }, } for i, c := range testCases { var ba roachpb.BatchRequest ba.Add(&roachpb.ScanRequest{Span: roachpb.Span{Key: roachpb.Key(c.req[0]), EndKey: roachpb.Key(c.req[1])}}) if _, err := Range(ba); !testutils.IsError(err, c.errMsg) { t.Errorf("%d: unexpected error %v", i, err) } } // Test a case where a non-range request has an end key. var ba roachpb.BatchRequest ba.Add(&roachpb.GetRequest{Span: roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}}) if _, err := Range(ba); !testutils.IsError(err, "end key specified for non-range operation") { t.Errorf("unexpected error %v", err) } }
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call. func (ds *DistSender) sendSingleRange( ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor, ) (*roachpb.BatchResponse, *roachpb.Error) { // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. ds.optimizeReplicaOrder(replicas) // If this request needs to go to a lease holder and we know who that is, move // it to the front. if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) { if leaseHolder, ok := ds.leaseHolderCache.Lookup(ctx, desc.RangeID); ok { if i := replicas.FindReplica(leaseHolder.StoreID); i >= 0 { replicas.MoveToFront(i) } } } // TODO(tschottdorf): should serialize the trace here, not higher up. br, err := ds.sendRPC(ctx, desc.RangeID, replicas, ba) if err != nil { return nil, roachpb.NewError(err) } // If the reply contains a timestamp, update the local HLC with it. if br.Error != nil && br.Error.Now != hlc.ZeroTimestamp { ds.clock.Update(br.Error.Now) } else if br.Now != hlc.ZeroTimestamp { ds.clock.Update(br.Now) } // Untangle the error from the received response. pErr := br.Error br.Error = nil // scrub the response error return br, pErr }
// maybeBeginTxn begins a new transaction if a txn has been specified
// in the request but has a nil ID. The new transaction is initialized
// using the name and isolation in the otherwise uninitialized txn.
// The Priority, if non-zero, is used as a minimum.
//
// No transactional writes are allowed unless preceded by a begin
// transaction request within the same batch. The exception is if the
// transaction is already in state txn.Writing=true.
func (tc *TxnCoordSender) maybeBeginTxn(ba *roachpb.BatchRequest) error {
	if len(ba.Requests) == 0 {
		return errors.Errorf("empty batch with txn")
	}
	if ba.Txn.ID == nil {
		// Create transaction without a key. The key is set when a begin
		// transaction request is received.
		// The initial timestamp may be communicated by a higher layer.
		// If so, use that. Otherwise make up a new one.
		timestamp := ba.Txn.OrigTimestamp
		if timestamp == hlc.ZeroTimestamp {
			timestamp = tc.clock.Now()
		}
		newTxn := roachpb.NewTransaction(ba.Txn.Name, nil, ba.UserPriority,
			ba.Txn.Isolation, timestamp, tc.clock.MaxOffset().Nanoseconds())
		// Use existing priority as a minimum. This is used on transaction
		// aborts to ratchet priority when creating a successor transaction.
		if newTxn.Priority < ba.Txn.Priority {
			newTxn.Priority = ba.Txn.Priority
		}
		ba.Txn = newTxn
	}

	// Check for a begin transaction to set txn key based on the key of
	// the first transactional write. Also enforce that no transactional
	// writes occur before a begin transaction.
	var haveBeginTxn bool
	for _, req := range ba.Requests {
		args := req.GetInner()
		if bt, ok := args.(*roachpb.BeginTransactionRequest); ok {
			if haveBeginTxn || ba.Txn.Writing {
				return errors.Errorf("begin transaction requested twice in the same transaction: %s", ba.Txn)
			}
			haveBeginTxn = true
			if ba.Txn.Key == nil {
				ba.Txn.Key = bt.Key
			}
		}
		if roachpb.IsTransactionWrite(args) && !haveBeginTxn && !ba.Txn.Writing {
			return errors.Errorf("transactional write before begin transaction")
		}
	}
	return nil
}
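// Example (hypothetical): a self-contained sketch of the ordering rule enforced
// above - within one batch, a transactional write is only legal after a
// BeginTransaction (or when the txn is already Writing). Ops are simplified to
// strings; this is not the real request plumbing.
package main

import (
	"errors"
	"fmt"
)

func validateBatch(ops []string, alreadyWriting bool) error {
	haveBeginTxn := false
	for _, op := range ops {
		switch op {
		case "BeginTransaction":
			if haveBeginTxn || alreadyWriting {
				return errors.New("begin transaction requested twice in the same transaction")
			}
			haveBeginTxn = true
		case "Put", "Delete", "Increment": // transactional writes
			if !haveBeginTxn && !alreadyWriting {
				return errors.New("transactional write before begin transaction")
			}
		}
	}
	return nil
}

func main() {
	fmt.Println(validateBatch([]string{"Put", "BeginTransaction"}, false)) // error
	fmt.Println(validateBatch([]string{"BeginTransaction", "Put"}, false)) // <nil>
}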
func (db *DB) prepareToSend(ba *roachpb.BatchRequest) *roachpb.Error {
	if ba.ReadConsistency == roachpb.INCONSISTENT {
		for _, ru := range ba.Requests {
			req := ru.GetInner()
			if req.Method() != roachpb.Get &&
				req.Method() != roachpb.Scan &&
				req.Method() != roachpb.ReverseScan {
				return roachpb.NewErrorf("method %s not allowed with INCONSISTENT batch", req.Method())
			}
		}
	}
	if db.ctx.UserPriority != 1 {
		ba.UserPriority = db.ctx.UserPriority
	}
	tracing.AnnotateTrace()
	return nil
}
// Send implements the batch.Sender interface. It subdivides the Batch // into batches admissible for sending (preventing certain illegal // mixtures of requests), executes each individual part (which may // span multiple ranges), and recombines the response. // // When the request spans ranges, it is split by range and a partial // subset of the batch request is sent to affected ranges in parallel. // // The first write in a transaction may not arrive before writes to // other ranges. This is relevant in the case of a BeginTransaction // request. Intents written to other ranges before the transaction // record is created will cause the transaction to abort early. func (ds *DistSender) Send( ctx context.Context, ba roachpb.BatchRequest, ) (*roachpb.BatchResponse, *roachpb.Error) { tracing.AnnotateTrace() if pErr := ds.initAndVerifyBatch(ctx, &ba); pErr != nil { return nil, pErr } ctx = ds.AnnotateCtx(ctx) ctx, cleanup := tracing.EnsureContext(ctx, ds.AmbientContext.Tracer) defer cleanup() var rplChunks []*roachpb.BatchResponse parts := ba.Split(false /* don't split ET */) if len(parts) > 1 && ba.MaxSpanRequestKeys != 0 { // We already verified above that the batch contains only scan requests of the same type. // Such a batch should never need splitting. panic("batch with MaxSpanRequestKeys needs splitting") } for len(parts) > 0 { part := parts[0] ba.Requests = part // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err) } rpl, pErr := ds.divideAndSendBatchToRanges(ctx, ba, rs, true /* isFirst */) if pErr == errNo1PCTxn { // If we tried to send a single round-trip EndTransaction but // it looks like it's going to hit multiple ranges, split it // here and try again. if len(parts) != 1 { panic("EndTransaction not in last chunk of batch") } parts = ba.Split(true /* split ET */) if len(parts) != 2 { panic("split of final EndTransaction chunk resulted in != 2 parts") } continue } if pErr != nil { return nil, pErr } // Propagate transaction from last reply to next request. The final // update is taken and put into the response's main header. ba.UpdateTxn(rpl.Txn) rplChunks = append(rplChunks, rpl) parts = parts[1:] } reply := rplChunks[0] for _, rpl := range rplChunks[1:] { reply.Responses = append(reply.Responses, rpl.Responses...) reply.CollectedSpans = append(reply.CollectedSpans, rpl.CollectedSpans...) } reply.BatchResponse_Header = rplChunks[len(rplChunks)-1].BatchResponse_Header return reply, nil }
func TestBatchRange(t *testing.T) { testCases := []struct { req [][2]string exp [2]string }{ { // Boring single request. req: [][2]string{{"a", "b"}}, exp: [2]string{"a", "b"}, }, { // Request with invalid range. It's important that this still // results in a valid range. req: [][2]string{{"b", "a"}}, exp: [2]string{"b", "b\x00"}, }, { // Two overlapping ranges. req: [][2]string{{"a", "c"}, {"b", "d"}}, exp: [2]string{"a", "d"}, }, { // Two disjoint ranges. req: [][2]string{{"a", "b"}, {"c", "d"}}, exp: [2]string{"a", "d"}, }, { // Range and disjoint point request. req: [][2]string{{"a", "b"}, {"c", ""}}, exp: [2]string{"a", "c\x00"}, }, { // Three disjoint point requests. req: [][2]string{{"a", ""}, {"b", ""}, {"c", ""}}, exp: [2]string{"a", "c\x00"}, }, { // Disjoint range request and point request. req: [][2]string{{"a", "b"}, {"b", ""}}, exp: [2]string{"a", "b\x00"}, }, { // Range-local point request. req: [][2]string{{string(RangeDescriptorKey(roachpb.RKeyMax)), ""}}, exp: [2]string{"\xff\xff", "\xff\xff\x00"}, }, { // Range-local to global such that the key ordering flips. // Important that we get a valid range back. req: [][2]string{{string(RangeDescriptorKey(roachpb.RKeyMax)), "x"}}, exp: [2]string{"\xff\xff", "\xff\xff\x00"}, }, { // Range-local to global without order messed up. req: [][2]string{{string(RangeDescriptorKey(roachpb.RKey("a"))), "x"}}, exp: [2]string{"a", "x"}, }, } for i, c := range testCases { var ba roachpb.BatchRequest for _, pair := range c.req { ba.Add(&roachpb.ScanRequest{Span: roachpb.Span{Key: roachpb.Key(pair[0]), EndKey: roachpb.Key(pair[1])}}) } if rs, err := Range(ba); err != nil { t.Errorf("%d: %v", i, err) } else if actPair := [2]string{string(rs.Key), string(rs.EndKey)}; !reflect.DeepEqual(actPair, c.exp) { t.Errorf("%d: expected [%q,%q), got [%q,%q)", i, c.exp[0], c.exp[1], actPair[0], actPair[1]) } } }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request now := replica.store.Clock().Now() reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, ProposedTS: &now, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // We reset our state below regardless of whether we've gotten an error or // not, but note that an error is ambiguous - there's no guarantee that the // transfer will not still apply. That's OK, however, as the "in transfer" // state maintained by the pendingLeaseRequest is not relied on for // correctness (see replica.mu.minLeaseProposedTS), and resetting the state // is beneficial as it'll allow the replica to attempt to transfer again or // extend the existing lease in the future. // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for _, llChan := range p.llChans { // Don't send the same transaction object twice; this can lead to races. 
if pErr != nil { pErrClone := *pErr pErrClone.SetTxn(pErr.GetTxn()) llChan <- &pErrClone } else { llChan <- nil } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a // channel on which the result will be posted. If there's already a request in // progress, we join in waiting for the results of that request. // It is an error to call InitOrJoinRequest() while a request is in progress // naming another replica as lease holder. // // replica is used to schedule and execute async work (proposing a RequestLease // command). replica.mu is locked when delivering results, so calls from the // replica happen either before or after a result for a pending request has // happened. // // transfer needs to be set if the request represents a lease transfer (as // opposed to an extension, or acquiring the lease when none is held). // // Note: Once this function gets a context to be used for cancellation, instead // of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling // the Raft command, similar to replica.addWriteCmd. func (p *pendingLeaseRequest) InitOrJoinRequest( replica *Replica, nextLeaseHolder roachpb.ReplicaDescriptor, timestamp hlc.Timestamp, startKey roachpb.Key, transfer bool, ) <-chan *roachpb.Error { if nextLease, ok := p.RequestPending(); ok { if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID { // Join a pending request asking for the same replica to become lease // holder. return p.JoinRequest() } llChan := make(chan *roachpb.Error, 1) // We can't join the request in progress. llChan <- roachpb.NewErrorf("request for different replica in progress "+ "(requesting: %+v, in progress: %+v)", nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID) return llChan } llChan := make(chan *roachpb.Error, 1) // No request in progress. Let's propose a Lease command asynchronously. // TODO(tschottdorf): get duration from configuration, either as a // config flag or, later, dynamically adjusted. startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0) expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0) reqSpan := roachpb.Span{ Key: startKey, } var leaseReq roachpb.Request reqLease := roachpb.Lease{ Start: timestamp, StartStasis: startStasis, Expiration: expiration, Replica: nextLeaseHolder, } if transfer { leaseReq = &roachpb.TransferLeaseRequest{ Span: reqSpan, Lease: reqLease, } } else { leaseReq = &roachpb.RequestLeaseRequest{ Span: reqSpan, Lease: reqLease, } } if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) { ctx = replica.AnnotateCtx(ctx) // Propose a RequestLease command and wait for it to apply. ba := roachpb.BatchRequest{} ba.Timestamp = replica.store.Clock().Now() ba.RangeID = replica.RangeID ba.Add(leaseReq) if log.V(2) { log.Infof(ctx, "sending lease request %v", leaseReq) } _, pErr := replica.Send(ctx, ba) // Send result of lease to all waiter channels. replica.mu.Lock() defer replica.mu.Unlock() for i, llChan := range p.llChans { // Don't send the same pErr object twice; this can lead to races. We could // clone every time but it's more efficient to send pErr itself to one of // the channels (the last one; if we send it earlier the race can still // happen). if i == len(p.llChans)-1 { llChan <- pErr } else { llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil` } } p.llChans = p.llChans[:0] p.nextLease = roachpb.Lease{} }) != nil { // We failed to start the asynchronous task. Send a blank NotLeaseHolderError // back to indicate that we have no idea who the range lease holder might // be; we've withdrawn from active duty. 
llChan <- roachpb.NewError( newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc)) return llChan } p.llChans = append(p.llChans, llChan) p.nextLease = reqLease return llChan }
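// Example (hypothetical): a self-contained sketch of the waiter fan-out used in
// InitOrJoinRequest above - every waiting channel receives its own copy of the
// result except the last one, which may take the original value since nobody
// else will touch it. Plain pointers stand in for *roachpb.Error.
package main

import "fmt"

type result struct{ msg string }

func broadcast(waiters []chan *result, res *result) {
	for i, ch := range waiters {
		if i == len(waiters)-1 || res == nil {
			ch <- res // last waiter (or nil result) can take the original safely
			continue
		}
		clone := *res // everyone else gets a copy to avoid data races
		ch <- &clone
	}
}

func main() {
	waiters := []chan *result{make(chan *result, 1), make(chan *result, 1)}
	broadcast(waiters, &result{msg: "lease applied"})
	a, b := <-waiters[0], <-waiters[1]
	fmt.Println(a.msg, b.msg, a != b) // lease applied lease applied true
}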
func TestTruncate(t *testing.T) { defer leaktest.AfterTest(t)() loc := func(s string) string { return string(keys.RangeDescriptorKey(roachpb.RKey(s))) } locPrefix := func(s string) string { return string(keys.MakeRangeKeyPrefix(roachpb.RKey(s))) } testCases := []struct { keys [][2]string expKeys [][2]string from, to string desc [2]string // optional, defaults to {from,to} err string }{ { // Keys inside of active range. keys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, expKeys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, from: "a", to: "q\x00", }, { // Keys outside of active range. keys: [][2]string{{"a"}, {"a", "b"}, {"q"}, {"q", "z"}}, expKeys: [][2]string{{}, {}, {}, {}}, from: "b", to: "q", }, { // Range-local keys inside of active range. keys: [][2]string{{loc("b")}, {loc("c")}}, expKeys: [][2]string{{loc("b")}, {loc("c")}}, from: "b", to: "e", }, { // Range-local key outside of active range. keys: [][2]string{{loc("a")}}, expKeys: [][2]string{{}}, from: "b", to: "e", }, { // Range-local range contained in active range. keys: [][2]string{{loc("b"), loc("e") + "\x00"}}, expKeys: [][2]string{{loc("b"), loc("e") + "\x00"}}, from: "b", to: "e\x00", }, { // Range-local range not contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{}}, from: "c", to: "e", }, { // Range-local range not contained in active range. keys: [][2]string{{loc("a"), locPrefix("b")}, {loc("e"), loc("f")}}, expKeys: [][2]string{{}, {}}, from: "b", to: "e", }, { // Range-local range partially contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{loc("a"), locPrefix("b")}}, from: "a", to: "b", }, { // Range-local range partially contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{locPrefix("b"), loc("b")}}, from: "b", to: "e", }, { // Range-local range contained in active range. keys: [][2]string{{locPrefix("b"), loc("b")}}, expKeys: [][2]string{{locPrefix("b"), loc("b")}}, from: "b", to: "c", }, { // Mixed range-local vs global key range. keys: [][2]string{{loc("c"), "d\x00"}}, from: "b", to: "e", err: "local key mixed with global key", }, { // Key range touching and intersecting active range. keys: [][2]string{{"a", "b"}, {"a", "c"}, {"p", "q"}, {"p", "r"}, {"a", "z"}}, expKeys: [][2]string{{}, {"b", "c"}, {"p", "q"}, {"p", "q"}, {"b", "q"}}, from: "b", to: "q", }, // Active key range is intersection of descriptor and [from,to). 
{ keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "a", to: "z", desc: [2]string{"d", "p"}, }, { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "d", to: "p", desc: [2]string{"a", "z"}, }, } for i, test := range testCases { goldenOriginal := roachpb.BatchRequest{} for _, ks := range test.keys { if len(ks[1]) > 0 { u := uuid.MakeV4() goldenOriginal.Add(&roachpb.ResolveIntentRangeRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0]), EndKey: roachpb.Key(ks[1])}, IntentTxn: enginepb.TxnMeta{ID: &u}, }) } else { goldenOriginal.Add(&roachpb.GetRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0])}, }) } } original := roachpb.BatchRequest{Requests: make([]roachpb.RequestUnion, len(goldenOriginal.Requests))} for i, request := range goldenOriginal.Requests { original.Requests[i].SetValue(request.GetInner().ShallowCopy()) } desc := &roachpb.RangeDescriptor{ StartKey: roachpb.RKey(test.desc[0]), EndKey: roachpb.RKey(test.desc[1]), } if len(desc.StartKey) == 0 { desc.StartKey = roachpb.RKey(test.from) } if len(desc.EndKey) == 0 { desc.EndKey = roachpb.RKey(test.to) } rs := roachpb.RSpan{Key: roachpb.RKey(test.from), EndKey: roachpb.RKey(test.to)} rs, err := rs.Intersect(desc) if err != nil { t.Errorf("%d: intersection failure: %v", i, err) continue } ba, num, err := truncate(original, rs) if err != nil || test.err != "" { if !testutils.IsError(err, test.err) { t.Errorf("%d: %v (expected: %q)", i, err, test.err) } continue } var reqs int for j, arg := range ba.Requests { req := arg.GetInner() if _, ok := req.(*roachpb.NoopRequest); ok { continue } if h := req.Header(); !bytes.Equal(h.Key, roachpb.Key(test.expKeys[j][0])) || !bytes.Equal(h.EndKey, roachpb.Key(test.expKeys[j][1])) { t.Errorf("%d.%d: range mismatch: actual [%q,%q), wanted [%q,%q)", i, j, h.Key, h.EndKey, test.expKeys[j][0], test.expKeys[j][1]) } else if _, ok := req.(*roachpb.NoopRequest); ok != (len(h.Key) == 0) { t.Errorf("%d.%d: expected NoopRequest, got %T", i, j, req) } else if len(h.Key) != 0 { reqs++ } } if reqs != num { t.Errorf("%d: counted %d requests, but truncation indicated %d", i, reqs, num) } if !reflect.DeepEqual(original, goldenOriginal) { t.Errorf("%d: truncation mutated original:\nexpected: %s\nactual: %s", i, goldenOriginal, original) } } }
// send runs the specified calls synchronously in a single batch and // returns any errors. If the transaction is read-only or has already // been successfully committed or aborted, a potential trailing // EndTransaction call is silently dropped, allowing the caller to // always commit or clean-up explicitly even when that may not be // required (or even erroneous). Returns (nil, nil) for an empty batch. func (txn *Txn) send(ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { if txn.Proto.Status != roachpb.PENDING || txn.IsFinalized() { return nil, roachpb.NewErrorf( "attempting to use transaction with wrong status or finalized: %s", txn.Proto.Status) } // It doesn't make sense to use inconsistent reads in a transaction. However, // we still need to accept it as a parameter for this to compile. if ba.ReadConsistency != roachpb.CONSISTENT { return nil, roachpb.NewErrorf("cannot use %s ReadConsistency in txn", ba.ReadConsistency) } lastIndex := len(ba.Requests) - 1 if lastIndex < 0 { return nil, nil } // firstWriteIndex is set to the index of the first command which is // a transactional write. If != -1, this indicates an intention to // write. This is in contrast to txn.Proto.Writing, which is set by // the coordinator when the first intent has been created, and which // lives for the life of the transaction. firstWriteIndex := -1 var firstWriteKey roachpb.Key for i, ru := range ba.Requests { args := ru.GetInner() if i < lastIndex { if _, ok := args.(*roachpb.EndTransactionRequest); ok { return nil, roachpb.NewErrorf("%s sent as non-terminal call", args.Method()) } } if roachpb.IsTransactionWrite(args) && firstWriteIndex == -1 { firstWriteKey = args.Header().Key firstWriteIndex = i } } haveTxnWrite := firstWriteIndex != -1 endTxnRequest, haveEndTxn := ba.Requests[lastIndex].GetInner().(*roachpb.EndTransactionRequest) needBeginTxn := !txn.Proto.Writing && haveTxnWrite needEndTxn := txn.Proto.Writing || haveTxnWrite elideEndTxn := haveEndTxn && !needEndTxn // If we're not yet writing in this txn, but intend to, insert a // begin transaction request before the first write command. if needBeginTxn { // If the transaction already has a key (we're in a restart), make // sure we set the key in the begin transaction request to the original. bt := &roachpb.BeginTransactionRequest{ Span: roachpb.Span{ Key: firstWriteKey, }, } if txn.Proto.Key != nil { bt.Key = txn.Proto.Key } // Inject the new request before position firstWriteIndex, taking // care to avoid unnecessary allocations. oldRequests := ba.Requests ba.Requests = make([]roachpb.RequestUnion, len(ba.Requests)+1) copy(ba.Requests, oldRequests[:firstWriteIndex]) ba.Requests[firstWriteIndex].MustSetInner(bt) copy(ba.Requests[firstWriteIndex+1:], oldRequests[firstWriteIndex:]) } if elideEndTxn { ba.Requests = ba.Requests[:lastIndex] } br, pErr := txn.sendInternal(ba) if elideEndTxn && pErr == nil { // Check that read only transactions do not violate their deadline. This can NOT // happen since the txn deadline is normally updated when it is about to expire // or expired. We will just keep the code for safety (see TestReacquireLeaseOnRestart). if endTxnRequest.Deadline != nil { if endTxnRequest.Deadline.Less(txn.Proto.Timestamp) { return nil, roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(), &txn.Proto) } } // This normally happens on the server and sent back in response // headers, but this transaction was optimized away. 
		// The caller may still inspect the transaction struct, so we manually
		// update it here to emulate a true transaction.
		if endTxnRequest.Commit {
			txn.Proto.Status = roachpb.COMMITTED
		} else {
			txn.Proto.Status = roachpb.ABORTED
		}
		txn.finalized = true
	}

	// If we inserted a begin transaction request, remove it here.
	if needBeginTxn {
		if br != nil && br.Responses != nil {
			br.Responses = append(br.Responses[:firstWriteIndex], br.Responses[firstWriteIndex+1:]...)
		}
		// Handle case where inserted begin txn confused an indexed error.
		if pErr != nil && pErr.Index != nil {
			idx := pErr.Index.Index
			if idx == int32(firstWriteIndex) {
				// An error was encountered on begin txn; disallow the indexing.
				pErr.Index = nil
			} else if idx > int32(firstWriteIndex) {
				// An error was encountered after begin txn; decrement index.
				pErr.SetErrorIndex(idx - 1)
			}
		}
	}
	return br, pErr
}
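// Example (hypothetical): a self-contained sketch of the slice surgery used in
// Txn.send above - injecting one element before index i with a single
// allocation, and later mapping an error index in the expanded batch back to
// the caller's original batch. Ints stand in for requests.
package main

import "fmt"

func injectAt(reqs []int, i, injected int) []int {
	out := make([]int, len(reqs)+1)
	copy(out, reqs[:i])
	out[i] = injected
	copy(out[i+1:], reqs[i:])
	return out
}

// adjustErrIndex maps an error index in the expanded batch back to the
// caller's batch; the injected element lives at position i.
func adjustErrIndex(idx, i int) (int, bool) {
	switch {
	case idx == i:
		return 0, false // error was on the injected request; no caller index
	case idx > i:
		return idx - 1, true
	default:
		return idx, true
	}
}

func main() {
	fmt.Println(injectAt([]int{10, 20, 30}, 1, 99)) // [10 99 20 30]
	fmt.Println(adjustErrIndex(2, 1))               // 1 true
}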
// sendToReplicas sends one or more RPCs to clients specified by the // slice of replicas. On success, Send returns the first successful // reply. If an error occurs which is not specific to a single // replica, it's returned immediately. Otherwise, when all replicas // have been tried and failed, returns a send error. func (ds *DistSender) sendToReplicas( opts SendOptions, rangeID roachpb.RangeID, replicas ReplicaSlice, args roachpb.BatchRequest, rpcContext *rpc.Context, ) (*roachpb.BatchResponse, error) { if len(replicas) < 1 { return nil, roachpb.NewSendError( fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d", len(replicas), 1)) } var ambiguousResult bool var haveCommit bool // We only check for committed txns, not aborts because aborts may // be retried without any risk of inconsistencies. if etArg, ok := args.GetArg(roachpb.EndTransaction); ok && etArg.(*roachpb.EndTransactionRequest).Commit { haveCommit = true } done := make(chan BatchCall, len(replicas)) transportFactory := opts.transportFactory if transportFactory == nil { transportFactory = grpcTransportFactory } transport, err := transportFactory(opts, rpcContext, replicas, args) if err != nil { return nil, err } defer transport.Close() if transport.IsExhausted() { return nil, roachpb.NewSendError( fmt.Sprintf("sending to all %d replicas failed", len(replicas))) } // Send the first request. pending := 1 log.VEventf(opts.ctx, 2, "sending RPC for batch: %s", args.Summary()) transport.SendNext(done) // Wait for completions. This loop will retry operations that fail // with errors that reflect per-replica state and may succeed on // other replicas. var sendNextTimer timeutil.Timer defer sendNextTimer.Stop() for { sendNextTimer.Reset(opts.SendNextTimeout) select { case <-sendNextTimer.C: sendNextTimer.Read = true // On successive RPC timeouts, send to additional replicas if available. if !transport.IsExhausted() { log.VEventf(opts.ctx, 2, "timeout, trying next peer") pending++ transport.SendNext(done) } case call := <-done: pending-- err := call.Err if err == nil { if log.V(2) { log.Infof(opts.ctx, "RPC reply: %s", call.Reply) } else if log.V(1) && call.Reply.Error != nil { log.Infof(opts.ctx, "application error: %s", call.Reply.Error) } if call.Reply.Error == nil { return call.Reply, nil } else if !ds.handlePerReplicaError(opts.ctx, transport, rangeID, call.Reply.Error) { // The error received is not specific to this replica, so we // should return it instead of trying other replicas. However, // if we're trying to commit a transaction and there are // still other RPCs outstanding or an ambiguous RPC error // was already received, we must return an ambiguous commit // error instead of returned error. if haveCommit && (pending > 0 || ambiguousResult) { return nil, roachpb.NewAmbiguousResultError() } return call.Reply, nil } // Extract the detail so it can be included in the error // message if this is our last replica. // // TODO(bdarnell): The last error is not necessarily the best // one to return; we may want to remember the "best" error // we've seen (for example, a NotLeaseHolderError conveys more // information than a RangeNotFound). err = call.Reply.Error.GoError() } else { if log.V(1) { log.Warningf(opts.ctx, "RPC error: %s", err) } // All connection errors except for an unavailable node (this // is GRPC's fail-fast error), may mean that the request // succeeded on the remote server, but we were unable to // receive the reply. Set the ambiguous commit flag. 
// // We retry ambiguous commit batches to avoid returning the // unrecoverable AmbiguousResultError. This is safe because // repeating an already-successfully applied batch is // guaranteed to return either a TransactionReplayError (in // case the replay happens at the original leader), or a // TransactionRetryError (in case the replay happens at a new // leader). If the original attempt merely timed out or was // lost, then the batch will succeed and we can be assured the // commit was applied just once. // // The Unavailable code is used by GRPC to indicate that a // request fails fast and is not sent, so we can be sure there // is no ambiguity on these errors. Note that these are common // if a node is down. // See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182 // See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158 if haveCommit && grpc.Code(err) != codes.Unavailable { ambiguousResult = true } } // Send to additional replicas if available. if !transport.IsExhausted() { log.VEventf(opts.ctx, 2, "error, trying next peer: %s", err) pending++ transport.SendNext(done) } if pending == 0 { if ambiguousResult { err = roachpb.NewAmbiguousResultError() } else { err = roachpb.NewSendError( fmt.Sprintf("sending to all %d replicas failed; last error: %v", len(replicas), err), ) } if log.V(2) { log.ErrEvent(opts.ctx, err.Error()) } return nil, err } } } }
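// Example (hypothetical): a self-contained sketch of the send-next pattern used
// in sendToReplicas above - start with one replica, fan out to another when a
// per-RPC timeout fires or a replica-specific error comes back, and return the
// first success. The "RPCs" here are plain functions; no real transport or
// ambiguity tracking is involved.
package main

import (
	"errors"
	"fmt"
	"time"
)

func sendToReplicas(replicas []func() error, sendNextTimeout time.Duration) error {
	done := make(chan error, len(replicas))
	send := func(i int) { go func() { done <- replicas[i]() }() }

	next := 0
	send(next)
	next++
	pending := 1

	timer := time.NewTimer(sendNextTimeout)
	defer timer.Stop()
	var lastErr error
	for {
		select {
		case <-timer.C:
			if next < len(replicas) { // timeout: hedge with the next replica
				send(next)
				next++
				pending++
			}
			timer.Reset(sendNextTimeout)
		case err := <-done:
			pending--
			if err == nil {
				return nil // first success wins
			}
			lastErr = err
			if next < len(replicas) { // error: try the next replica
				send(next)
				next++
				pending++
			}
			if pending == 0 && next == len(replicas) {
				return fmt.Errorf("sending to all %d replicas failed; last error: %v", len(replicas), lastErr)
			}
		}
	}
}

func main() {
	slow := func() error { time.Sleep(50 * time.Millisecond); return errors.New("unreachable") }
	fast := func() error { return nil }
	fmt.Println(sendToReplicas([]func() error{slow, fast}, 10*time.Millisecond)) // <nil>
}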
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the // transaction record. func TestTxnCoordSenderHeartbeat(t *testing.T) { defer leaktest.AfterTest(t)() s, sender := createTestDB(t) defer s.Stop() defer teardownHeartbeats(sender) // Set heartbeat interval to 1ms for testing. sender.heartbeatInterval = 1 * time.Millisecond initialTxn := client.NewTxn(context.Background(), *s.DB) if err := initialTxn.Put(roachpb.Key("a"), []byte("value")); err != nil { t.Fatal(err) } // Verify 3 heartbeats. var heartbeatTS hlc.Timestamp for i := 0; i < 3; i++ { util.SucceedsSoon(t, func() error { txn, pErr := getTxn(sender, &initialTxn.Proto) if pErr != nil { t.Fatal(pErr) } // Advance clock by 1ns. // Locking the TxnCoordSender to prevent a data race. sender.Lock() s.Manual.Increment(1) sender.Unlock() if txn.LastHeartbeat != nil && heartbeatTS.Less(*txn.LastHeartbeat) { heartbeatTS = *txn.LastHeartbeat return nil } return errors.Errorf("expected heartbeat") }) } // Sneakily send an ABORT right to DistSender (bypassing TxnCoordSender). { var ba roachpb.BatchRequest ba.Add(&roachpb.EndTransactionRequest{ Commit: false, Span: roachpb.Span{Key: initialTxn.Proto.Key}, }) ba.Txn = &initialTxn.Proto if _, pErr := sender.wrapped.Send(context.Background(), ba); pErr != nil { t.Fatal(pErr) } } util.SucceedsSoon(t, func() error { sender.Lock() defer sender.Unlock() if txnMeta, ok := sender.txns[*initialTxn.Proto.ID]; !ok { t.Fatal("transaction unregistered prematurely") } else if txnMeta.txn.Status != roachpb.ABORTED { return fmt.Errorf("transaction is not aborted") } return nil }) // Trying to do something else should give us a TransactionAbortedError. _, err := initialTxn.Get("a") assertTransactionAbortedError(t, err) }
// fillSkippedResponses fills in responses and ResumeSpans for requests that
// were skipped after meeting the batch key max limit for range requests.
func fillSkippedResponses(ba roachpb.BatchRequest, br *roachpb.BatchResponse, nextKey roachpb.RKey) {
	// Some requests might have NoopResponses; we must replace them with empty
	// responses of the proper type.
	for i, req := range ba.Requests {
		if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok {
			continue
		}
		var reply roachpb.Response
		switch t := req.GetInner().(type) {
		case *roachpb.ScanRequest:
			reply = &roachpb.ScanResponse{}
		case *roachpb.ReverseScanRequest:
			reply = &roachpb.ReverseScanResponse{}
		case *roachpb.DeleteRangeRequest:
			reply = &roachpb.DeleteRangeResponse{}
		case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest:
			continue
		default:
			panic(fmt.Sprintf("bad type %T", t))
		}
		union := roachpb.ResponseUnion{}
		union.MustSetInner(reply)
		br.Responses[i] = union
	}
	// Set the ResumeSpan for future batch requests.
	isReverse := ba.IsReverse()
	for i, resp := range br.Responses {
		req := ba.Requests[i].GetInner()
		if !roachpb.IsRange(req) {
			continue
		}
		hdr := resp.GetInner().Header()
		origSpan := req.Header()
		if isReverse {
			if hdr.ResumeSpan != nil {
				// The ResumeSpan.Key might be set to the StartKey of a range;
				// correctly set it to the Key of the original request span.
				hdr.ResumeSpan.Key = origSpan.Key
			} else if roachpb.RKey(origSpan.Key).Less(nextKey) {
				// Some keys have yet to be processed.
				hdr.ResumeSpan = &origSpan
				if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
					// The original span has been partially processed.
					hdr.ResumeSpan.EndKey = nextKey.AsRawKey()
				}
			}
		} else {
			if hdr.ResumeSpan != nil {
				// The ResumeSpan.EndKey might be set to the EndKey of a
				// range; correctly set it to the EndKey of the original
				// request span.
				hdr.ResumeSpan.EndKey = origSpan.EndKey
			} else if nextKey.Less(roachpb.RKey(origSpan.EndKey)) {
				// Some keys have yet to be processed.
				hdr.ResumeSpan = &origSpan
				if roachpb.RKey(origSpan.Key).Less(nextKey) {
					// The original span has been partially processed.
					hdr.ResumeSpan.Key = nextKey.AsRawKey()
				}
			}
		}
		br.Responses[i].GetInner().SetHeader(hdr)
	}
}
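// Example (hypothetical): a self-contained sketch of the forward-scan
// ResumeSpan computation above. Given the original request span and the next
// key the batch stopped at, the resume span covers whatever was not yet
// processed. String keys stand in for roachpb keys.
package main

import "fmt"

type span struct{ key, endKey string }

// resumeSpan returns the unprocessed remainder of orig for a forward scan that
// stopped at nextKey, or nil if orig was fully processed.
func resumeSpan(orig span, nextKey string) *span {
	if nextKey >= orig.endKey {
		return nil // fully processed
	}
	resume := orig
	if nextKey > orig.key {
		resume.key = nextKey // partially processed
	}
	return &resume
}

func main() {
	fmt.Println(resumeSpan(span{"a", "f"}, "c")) // &{c f}
	fmt.Println(resumeSpan(span{"a", "f"}, "g")) // <nil>
}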
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we back off and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead as long as they are different
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}

	return response{pErr: pErr}
}
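// retrySketch is a minimal, self-contained sketch of the backoff-and-retry
// idiom used by sendPartialBatch above. It assumes the same util/retry
// package (retry.StartWithCtx / Next); the attempt closure, the option values,
// and the field names in retry.Options are illustrative assumptions, not
// taken from the original source.
func retrySketch(ctx context.Context, attempt func() error) error {
	opts := retry.Options{
		InitialBackoff: 50 * time.Millisecond,
		MaxBackoff:     time.Second,
		Multiplier:     2,
		MaxRetries:     5,
	}
	var err error
	for r := retry.StartWithCtx(ctx, opts); r.Next(); {
		if err = attempt(); err == nil {
			// Success: stop retrying.
			return nil
		}
		// Fall through: r.Next() sleeps with exponential backoff and returns
		// false once ctx is done or the retry budget is exhausted.
	}
	if err == nil {
		// Mirroring deduceRetryEarlyExitError in spirit: if no attempt ever
		// ran, surface the context error rather than nil.
		err = ctx.Err()
	}
	return err
}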
// divideAndSendBatchToRanges sends the supplied batch to all of the
// ranges which comprise the span specified by rs. The batch request
// is trimmed against each range which is part of the span and sent
// either serially or in parallel, if possible. isFirst indicates
// whether this is the first time this method has been called on the
// batch. It's specified false where this method is invoked recursively.
func (ds *DistSender) divideAndSendBatchToRanges(
	ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, isFirst bool,
) (br *roachpb.BatchResponse, pErr *roachpb.Error) {
	// This function builds a channel of responses for each range
	// implicated in the span (rs) and combines them into a single
	// BatchResponse when finished.
	var responseChs []chan response
	defer func() {
		for _, responseCh := range responseChs {
			resp := <-responseCh
			if resp.pErr != nil {
				if pErr == nil {
					pErr = resp.pErr
				}
				continue
			}
			if br == nil {
				// First response from a Range.
				br = resp.reply
			} else {
				// This was the second or later call in a cross-Range request.
				// Combine the new response with the existing one.
				if err := br.Combine(resp.reply); err != nil {
					pErr = roachpb.NewError(err)
					return
				}
				br.Txn.Update(resp.reply.Txn)
			}
		}
		// If we experienced an error, don't neglect to update the error's
		// attached transaction with any responses which were received.
		if pErr != nil {
			if br != nil {
				pErr.UpdateTxn(br.Txn)
			}
		}
	}()

	// Get initial seek key depending on direction of iteration.
	var seekKey roachpb.RKey
	isReverse := ba.IsReverse()
	if isReverse {
		seekKey = rs.EndKey
	} else {
		seekKey = rs.Key
	}

	// Send the request to one range per iteration.
	ri := NewRangeIterator(ds, isReverse)
	for ri.Seek(ctx, seekKey); ri.Valid(); ri.Seek(ctx, seekKey) {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to
		// for each RPC individually). On RPC errors, there's no guarantee
		// that the request hasn't made its way to the target regardless
		// of the error; we'd like the second execution to be caught by
		// the sequence cache if that happens. There is a small chance
		// that we address a range twice in this chunk (stale/suboptimal
		// descriptors due to splits/merges) which leads to a transaction
		// retry.
		//
		// TODO(tschottdorf): it's possible that if we don't evict from
		// the cache we could be in for a busy loop.
		ba.SetNewRequest()

		responseCh := make(chan response, 1)
		responseChs = append(responseChs, responseCh)

		if isFirst && ri.NeedAnother(rs) {
			// TODO(tschottdorf): we should have a mechanism for discovering
			// range merges (descriptor staleness will mostly go unnoticed),
			// or we'll be turning single-range queries into multi-range
			// queries for no good reason.
			//
			// If there's no transaction and op spans ranges, possibly
			// re-run as part of a transaction for consistency. The
			// case where we don't need to re-run is if the read
			// consistency is not required.
			if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT {
				responseCh <- response{pErr: roachpb.NewError(&roachpb.OpRequiresTxnError{})}
				return
			}
			// If the request is more than but ends with EndTransaction, we
			// want the caller to come again with the EndTransaction in an
			// extra call.
			if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
				responseCh <- response{pErr: errNo1PCTxn}
				return
			}
		}

		// Determine next seek key, taking a potentially sparse batch into
		// consideration.
		var err error
		nextRS := rs
		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			seekKey, err = prev(ba, ri.Desc().StartKey)
			nextRS.EndKey = seekKey
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			seekKey, err = next(ba, ri.Desc().EndKey)
			nextRS.Key = seekKey
		}
		if err != nil {
			responseCh <- response{pErr: roachpb.NewError(err)}
			return
		}

		// Send the next partial batch to the first range in the "rs" span.
		// If we're not handling a request which limits responses and we
		// can reserve one of the limited goroutines available for parallel
		// batch RPCs, send asynchronously.
		if ba.MaxSpanRequestKeys == 0 && ri.NeedAnother(rs) && ds.rpcContext != nil &&
			ds.sendPartialBatchAsync(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst, responseCh) {
			// Note that we pass the batch request by value to the parallel
			// goroutine to avoid using the cloned txn.

			// Clone the txn to preserve the current txn sequence for the
			// async call.
			if ba.Txn != nil {
				txnClone := ba.Txn.Clone()
				ba.Txn = &txnClone
			}
		} else {
			// Send synchronously if there is no parallel capacity left, there's a
			// max results limit, or this is the final request in the span.
			resp := ds.sendPartialBatch(ctx, ba, rs, ri.Desc(), ri.Token(), isFirst)
			responseCh <- resp
			if resp.pErr != nil {
				return
			}
			ba.UpdateTxn(resp.reply.Txn)

			// Check whether we've received enough responses to exit query loop.
			if ba.MaxSpanRequestKeys > 0 {
				var numResults int64
				for _, r := range resp.reply.Responses {
					numResults += r.GetInner().Header().NumKeys
				}
				if numResults > ba.MaxSpanRequestKeys {
					panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys))
				}
				ba.MaxSpanRequestKeys -= numResults
				// Exiting; fill in missing responses.
				if ba.MaxSpanRequestKeys == 0 {
					fillSkippedResponses(ba, resp.reply, seekKey)
					return
				}
			}
		}

		// Check for completion.
		if !ri.NeedAnother(rs) {
			return
		}
		isFirst = false // next range will not be first!
		rs = nextRS
	}

	// We've exited early. Return the range iterator error.
	responseCh := make(chan response, 1)
	responseCh <- response{pErr: ri.Error()}
	responseChs = append(responseChs, responseCh)
	return
}
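// The two sketches below are a simplified, self-contained stand-in for the
// ordered channel fan-in used by divideAndSendBatchToRanges: one buffered
// channel per piece of work, drained in issue order by a deferred loop so
// results combine deterministically and the first error wins even when some
// work ran asynchronously. The resultSketch type, the process closure, and
// the use of addition as the "combine" step are illustrative assumptions,
// not the DistSender's own types.
type resultSketch struct {
	v   int
	err error
}

func fanInSketch(parts []int, process func(int) (int, error)) (sum int, err error) {
	var resultChs []chan resultSketch
	defer func() {
		// Drain every channel in issue order, keeping the first error.
		for _, ch := range resultChs {
			r := <-ch
			if r.err != nil {
				if err == nil {
					err = r.err
				}
				continue
			}
			sum += r.v // the "Combine" step, simplified to addition
		}
	}()
	for _, p := range parts {
		ch := make(chan resultSketch, 1)
		resultChs = append(resultChs, ch)
		p := p // capture loop variable for the goroutine
		go func() {
			v, e := process(p)
			ch <- resultSketch{v: v, err: e}
		}()
	}
	return
}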
// initAndVerifyBatch initializes timestamp-related information and
// verifies batch constraints before splitting.
func (ds *DistSender) initAndVerifyBatch(
	ctx context.Context, ba *roachpb.BatchRequest,
) *roachpb.Error {
	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if ba.ReadConsistency == roachpb.INCONSISTENT && ba.Timestamp.Equal(hlc.ZeroTimestamp) {
		ba.Timestamp = ds.clock.Now()
	}

	if ba.Txn != nil {
		// Make a copy here since the code below modifies it in different places.
		// TODO(tschottdorf): be smarter about this - no need to do it for
		// requests that don't get split.
		txnClone := ba.Txn.Clone()
		ba.Txn = &txnClone

		if len(ba.Txn.ObservedTimestamps) == 0 {
			// Ensure the local NodeID is marked as free from clock offset;
			// the transaction's timestamp was taken off the local clock.
			if nDesc := ds.getNodeDescriptor(); nDesc != nil {
				// TODO(tschottdorf): future refactoring should move this to txn
				// creation in TxnCoordSender, which is currently unaware of the
				// NodeID (and wraps *DistSender through client.Sender since it
				// also needs test compatibility with *LocalSender).
				//
				// Taking care below to not modify any memory referenced from
				// our BatchRequest which may be shared with others.
				//
				// We already have a clone of our txn (see above), so we can
				// modify it freely.
				//
				// Zero the existing data. That makes sure that if we had
				// something of size zero but with capacity, we don't re-use the
				// existing space (which others may also use). This is just to
				// satisfy paranoia/OCD and not expected to matter in practice.
				ba.Txn.ResetObservedTimestamps()
				// OrigTimestamp is the HLC timestamp at which the Txn started, so
				// this effectively means no more uncertainty on this node.
				ba.Txn.UpdateObservedTimestamp(nDesc.NodeID, ba.Txn.OrigTimestamp)
			}
		}
	}

	if len(ba.Requests) < 1 {
		return roachpb.NewErrorf("empty batch")
	}

	if ba.MaxSpanRequestKeys != 0 {
		// Verify that the batch contains only specific range requests or the
		// Begin/EndTransactionRequest. Verify that a batch with a ReverseScan
		// only contains ReverseScan range requests.
		isReverse := ba.IsReverse()
		for _, req := range ba.Requests {
			inner := req.GetInner()
			switch inner.(type) {
			case *roachpb.ScanRequest, *roachpb.DeleteRangeRequest:
				// Accepted range requests. All other range requests are still
				// not supported.
				// TODO(vivek): don't enumerate all range requests.
				if isReverse {
					return roachpb.NewErrorf("batch with limit contains both forward and reverse scans")
				}
			case *roachpb.BeginTransactionRequest, *roachpb.EndTransactionRequest, *roachpb.ReverseScanRequest:
				continue
			default:
				return roachpb.NewErrorf("batch with limit contains %T request", inner)
			}
		}
	}
	return nil
}
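// limitedBatchSketch is a hedged illustration (not from the original source)
// of the constraint initAndVerifyBatch enforces on limited batches: with
// MaxSpanRequestKeys set, only Scan/ReverseScan/DeleteRange and
// Begin/EndTransaction requests are accepted, so a batch like the one below
// would be rejected with
// "batch with limit contains *roachpb.PutRequest request".
// The keys and the limit value are illustrative.
func limitedBatchSketch() roachpb.BatchRequest {
	var ba roachpb.BatchRequest
	ba.MaxSpanRequestKeys = 100 // enables the per-request type check

	scan := &roachpb.ScanRequest{}
	scan.Key, scan.EndKey = roachpb.Key("a"), roachpb.Key("z")
	ba.Add(scan) // allowed with a limit

	put := &roachpb.PutRequest{}
	put.Key = roachpb.Key("b")
	ba.Add(put) // not allowed with a limit; initAndVerifyBatch would error

	return ba
}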