// send runs the specified calls synchronously in a single batch and // returns any errors. func (db *DB) send(calls ...proto.Call) (pErr *proto.Error) { if len(calls) == 0 { return nil } if len(calls) == 1 { c := calls[0] // We only send BatchRequest. Everything else needs to go into one. if _, ok := calls[0].Args.(*proto.BatchRequest); ok { if c.Args.Header().UserPriority == nil && db.userPriority != 0 { c.Args.Header().UserPriority = gogoproto.Int32(db.userPriority) } resetClientCmdID(c.Args) _ = SendCall(db.sender, c) pErr = c.Reply.Header().Error if pErr != nil { if log.V(1) { log.Infof("failed %s: %s", c.Method(), pErr) } } else if c.Post != nil { pErr = proto.NewError(c.Post()) } return pErr } } ba, br := &proto.BatchRequest{}, &proto.BatchResponse{} for _, call := range calls { ba.Add(call.Args) } pErr = db.send(proto.Call{Args: ba, Reply: br}) // Recover from protobuf merge panics. defer func() { if r := recover(); r != nil { // Take care to log merge error and to return it if no error has // already been set. mergeErr := util.Errorf("unable to merge response: %s", r) log.Error(mergeErr) if pErr == nil { pErr = proto.NewError(mergeErr) } } }() // Transfer individual responses from batch response to prepared replies. for i, reply := range br.Responses { c := calls[i] gogoproto.Merge(c.Reply, reply.GetInner()) if c.Post != nil { if e := c.Post(); e != nil && pErr != nil { pErr = proto.NewError(e) } } } return }
// send runs the specified calls synchronously in a single batch and // returns any errors. If the transaction is read-only or has already // been successfully committed or aborted, a potential trailing // EndTransaction call is silently dropped, allowing the caller to // always commit or clean-up explicitly even when that may not be // required (or even erroneous). func (txn *Txn) send(calls ...proto.Call) *proto.Error { if txn.Proto.Status != proto.PENDING { return proto.NewError(util.Errorf("attempting to use %s transaction", txn.Proto.Status)) } lastIndex := len(calls) - 1 if lastIndex < 0 { return nil } lastReq := calls[lastIndex].Args // haveTxnWrite tracks intention to write. This is in contrast to // txn.Proto.Writing, which is set by the coordinator when the first // intent has been created, and which lives for the life of the // transaction. haveTxnWrite := proto.IsTransactionWrite(lastReq) for _, call := range calls[:lastIndex] { request := call.Args if req, ok := request.(*proto.EndTransactionRequest); ok { return proto.NewError(util.Errorf("%s sent as non-terminal call", req.Method())) } if !haveTxnWrite { haveTxnWrite = proto.IsTransactionWrite(request) } } endTxnRequest, haveEndTxn := lastReq.(*proto.EndTransactionRequest) needEndTxn := txn.Proto.Writing || haveTxnWrite elideEndTxn := haveEndTxn && !needEndTxn if elideEndTxn { calls = calls[:lastIndex] } pErr := txn.db.send(calls...) if elideEndTxn && pErr == nil { // This normally happens on the server and sent back in response // headers, but this transaction was optimized away. The caller may // still inspect the transaction struct, so we manually update it // here to emulate a true transaction. if endTxnRequest.Commit { txn.Proto.Status = proto.COMMITTED } else { txn.Proto.Status = proto.ABORTED } } return pErr }
// sendAttempt gathers and rearranges the replicas, and makes an RPC call. func (ds *DistSender) sendAttempt(trace *tracer.Trace, ba proto.BatchRequest, desc *proto.RangeDescriptor) (*proto.BatchResponse, *proto.Error) { defer trace.Epoch("sending RPC")() leader := ds.leaderCache.Lookup(proto.RangeID(desc.RangeID)) // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. order := ds.optimizeReplicaOrder(replicas) // If this request needs to go to a leader and we know who that is, move // it to the front. if !(proto.IsReadOnly(&ba) && ba.ReadConsistency == proto.INCONSISTENT) && leader.StoreID > 0 { if i := replicas.FindReplica(leader.StoreID); i >= 0 { replicas.MoveToFront(i) order = rpc.OrderStable } } // TODO(tschottdorf) &ba -> ba resp, err := ds.sendRPC(trace, desc.RangeID, replicas, order, &ba) if err != nil { return nil, proto.NewError(err) } // Untangle the error from the received response. br := resp.(*proto.BatchResponse) pErr := br.Error br.Error = nil // scrub the response error return br, pErr }
// getDescriptors looks up the range descriptor to use for a query over the // key range [from,to), with the given lookupOptions. The range descriptor // which contains the range in which the request should start its query is // returned first; the returned bool is true in case the given range reaches // outside the first descriptor. // In case either of the descriptors is discovered stale, the returned closure // should be called; it evicts the cache appropriately. // Note that `from` and `to` are not necessarily Key and EndKey from a // RequestHeader; it's assumed that they've been translated to key addresses // already (via KeyAddress). func (ds *DistSender) getDescriptors(from, to proto.Key, options lookupOptions) (*proto.RangeDescriptor, bool, func(), *proto.Error) { var desc *proto.RangeDescriptor var err error var descKey proto.Key if !options.useReverseScan { descKey = from } else { descKey = to } desc, err = ds.rangeCache.LookupRangeDescriptor(descKey, options) if err != nil { return nil, false, nil, proto.NewError(err) } // Checks whether need to get next range descriptor. If so, returns true. needAnother := func(desc *proto.RangeDescriptor, isReverse bool) bool { if isReverse { return from.Less(desc.StartKey) } return desc.EndKey.Less(to) } evict := func() { ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, options.useReverseScan) } return desc, needAnother(desc, options.useReverseScan), evict, nil }
// Batch sends a request to Cockroach via RPC. Errors which are retryable are // retried with backoff in a loop using the default retry options. Other errors // sending the request are retried indefinitely using the same client command // ID to avoid reporting failure when in fact the command may have gone through // and been executed successfully. We retry here to eventually get through with // the same client command ID and be given the cached response. func (s *rpcSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { var err error var br proto.BatchResponse for r := retry.Start(s.retryOpts); r.Next(); { select { case <-s.client.Healthy(): default: err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method) log.Warning(err) continue } if err = s.client.Call(method, &ba, &br); err != nil { br.Reset() // don't trust anyone. // Assume all errors sending request are retryable. The actual // number of things that could go wrong is vast, but we don't // want to miss any which should in theory be retried with the // same client command ID. We log the error here as a warning so // there's visiblity that this is happening. Some of the errors // we'll sweep up in this net shouldn't be retried, but we can't // really know for sure which. log.Warningf("failed to send RPC request %s: %s", method, err) continue } // On successful post, we're done with retry loop. break } if err != nil { return nil, proto.NewError(err) } pErr := br.Error br.Error = nil return &br, pErr }
// Send implements the client.Sender interface. The store is looked up from the // store map if specified by the request; otherwise, the command is being // executed locally, and the replica is determined via lookup through each // store's LookupRange method. The latter path is taken only by unit tests. func (ls *LocalSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { trace := tracer.FromCtx(ctx) var store *storage.Store var err error // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. if ba.RangeID == 0 || ba.Replica.StoreID == 0 { var repl *proto.Replica var rangeID proto.RangeID rangeID, repl, err = ls.lookupReplica(ba.Key, ba.EndKey) if err == nil { ba.RangeID = rangeID ba.Replica = *repl } } ctx = log.Add(ctx, log.Method, ba.Method(), // TODO(tschottdorf): Method() always `Batch`. log.Key, ba.Key, log.RangeID, ba.RangeID) if err == nil { store, err = ls.GetStore(ba.Replica.StoreID) } var br *proto.BatchResponse if err != nil { return nil, proto.NewError(err) } // For calls that read data within a txn, we can avoid uncertainty // related retries in certain situations. If the node is in // "CertainNodes", we need not worry about uncertain reads any // more. Setting MaxTimestamp=Timestamp for the operation // accomplishes that. See proto.Transaction.CertainNodes for details. if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) { // MaxTimestamp = Timestamp corresponds to no clock uncertainty. trace.Event("read has no clock uncertainty") ba.Txn.MaxTimestamp = ba.Txn.Timestamp } // TODO(tschottdorf): &ba -> ba tmpR, pErr := store.ExecuteCmd(ctx, &ba) // TODO(tschottdorf): remove this dance once BatchResponse is returned. if tmpR != nil { br = tmpR.(*proto.BatchResponse) if br.Error != nil { panic(proto.ErrorUnexpectedlySet(store, br)) } } return br, pErr }
// TestRunTransactionRetryOnErrors verifies that the transaction // is retried on the correct errors. func TestRunTransactionRetryOnErrors(t *testing.T) { defer leaktest.AfterTest(t) testCases := []struct { err error retry bool // Expect retry? }{ {&proto.ReadWithinUncertaintyIntervalError{}, true}, {&proto.TransactionAbortedError{}, true}, {&proto.TransactionPushError{}, true}, {&proto.TransactionRetryError{}, true}, {&proto.RangeNotFoundError{}, false}, {&proto.RangeKeyMismatchError{}, false}, {&proto.TransactionStatusError{}, false}, } for i, test := range testCases { count := 0 db := newDB(newTestSender(func(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { if _, ok := ba.GetArg(proto.Put); ok { count++ if count == 1 { return nil, proto.NewError(test.err) } } return &proto.BatchResponse{}, nil }, nil)) db.txnRetryOptions.InitialBackoff = 1 * time.Millisecond err := db.Txn(func(txn *Txn) error { return txn.Put("a", "b") }) if test.retry { if count != 2 { t.Errorf("%d: expected one retry; got %d", i, count-1) } if err != nil { t.Errorf("%d: expected success on retry; got %s", i, err) } } else { if count != 1 { t.Errorf("%d: expected no retries; got %d", i, count) } if reflect.TypeOf(err) != reflect.TypeOf(test.err) { t.Errorf("%d: expected error of type %T; got %T", i, test.err, err) } } } }
// sendAndFill is a helper which sends the given batch and fills its results, // returning the appropriate error which is either from the first failing call, // or an "internal" error. func sendAndFill(send func(...proto.Call) *proto.Error, b *Batch) *proto.Error { // Errors here will be attached to the results, so we will get them from // the call to fillResults in the regular case in which an individual call // fails. But send() also returns its own errors, so there's some dancing // here to do because we want to run fillResults() so that the individual // result gets initialized with an error from the corresponding call. pErr := send(b.calls...) if pErr != nil { // TODO(tschottdorf): give the error to fillResults or make sure in // some other way that fillResults knows it's only called to set the // keys. _ = b.fillResults() return pErr } return proto.NewError(b.fillResults()) }
// TestClientCommandID verifies that client command ID is set // on call. func TestClientCommandID(t *testing.T) { defer leaktest.AfterTest(t) count := 0 db := NewDB(newTestSender(func(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { count++ if ba.CmdID.WallTime == 0 { return nil, proto.NewError(util.Errorf("expected client command ID to be initialized")) } return ba.CreateReply().(*proto.BatchResponse), nil }, nil)) if err := db.Put("a", "b"); err != nil { t.Error(err) } if count != 1 { t.Errorf("expected test sender to be invoked once; got %d", count) } }
// TestAbortTransactionOnCommitErrors verifies that non-exec transactions are // aborted on the correct errors. func TestAbortTransactionOnCommitErrors(t *testing.T) { defer leaktest.AfterTest(t) testCases := []struct { err error abort bool }{ {&proto.ReadWithinUncertaintyIntervalError{}, true}, {&proto.TransactionAbortedError{}, false}, {&proto.TransactionPushError{}, true}, {&proto.TransactionRetryError{}, true}, {&proto.RangeNotFoundError{}, true}, {&proto.RangeKeyMismatchError{}, true}, {&proto.TransactionStatusError{}, true}, } for _, test := range testCases { var commit, abort bool db := NewDB(newTestSender(func(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { switch t := ba.Requests[0].GetInner().(type) { case *proto.EndTransactionRequest: if t.Commit { commit = true return nil, proto.NewError(test.err) } abort = true } return &proto.BatchResponse{}, nil }, nil)) txn := NewTxn(*db) _ = txn.Put("a", "b") _ = txn.Commit() if !commit { t.Errorf("%T: failed to find commit", test.err) } if test.abort && !abort { t.Errorf("%T: failed to find abort", test.err) } else if !test.abort && abort { t.Errorf("%T: found unexpected abort", test.err) } } }
// TestTxnResetTxnOnAbort verifies transaction is reset on abort. func TestTxnResetTxnOnAbort(t *testing.T) { defer leaktest.AfterTest(t) db := newDB(newTestSender(func(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { return nil, proto.NewError(&proto.TransactionAbortedError{ Txn: *gogoproto.Clone(ba.Txn).(*proto.Transaction), }) }, nil)) txn := NewTxn(*db) _, pErr := txn.db.sender.Send(context.Background(), testPut()) if _, ok := pErr.GoError().(*proto.TransactionAbortedError); !ok { t.Fatalf("expected TransactionAbortedError, got %v", pErr) } if len(txn.Proto.ID) != 0 { t.Errorf("expected txn to be cleared") } }
// TestTxnRequestTxnTimestamp verifies response txn timestamp is // always upgraded on successive requests. func TestTxnRequestTxnTimestamp(t *testing.T) { defer leaktest.AfterTest(t) makeTS := func(walltime int64, logical int32) proto.Timestamp { return proto.ZeroTimestamp.Add(walltime, logical) } testPutReq := gogoproto.Clone(testPutReq).(*proto.PutRequest) testCases := []struct { expRequestTS, responseTS proto.Timestamp }{ {makeTS(0, 0), makeTS(10, 0)}, {makeTS(10, 0), makeTS(10, 1)}, {makeTS(10, 1), makeTS(10, 0)}, {makeTS(10, 1), makeTS(20, 1)}, {makeTS(20, 1), makeTS(20, 1)}, {makeTS(20, 1), makeTS(0, 0)}, {makeTS(20, 1), makeTS(20, 1)}, } var testIdx int db := NewDB(newTestSender(nil, func(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { test := testCases[testIdx] if !test.expRequestTS.Equal(ba.Txn.Timestamp) { return nil, proto.NewError(util.Errorf("%d: expected ts %s got %s", testIdx, test.expRequestTS, ba.Txn.Timestamp)) } br := &proto.BatchResponse{} br.Txn = &proto.Transaction{} br.Txn.Update(ba.Txn) // copy br.Txn.Timestamp = test.responseTS return br, nil })) txn := NewTxn(*db) for testIdx = range testCases { if _, err := send(txn.db.sender, testPutReq); err != nil { t.Fatal(err) } } }
// TODO(tschottdorf): this method is somewhat awkward but unless we want to // give this error back to the client, our options are limited. We'll have to // run the whole thing for them, or any restart will still end up at the client // which will not be prepared to be handed a Txn. func (tc *TxnCoordSender) resendWithTxn(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { // Run a one-off transaction with that single command. if log.V(1) { log.Infof("%s: auto-wrapping in txn and re-executing: ", ba) } tmpDB := client.NewDBWithPriority(tc, ba.GetUserPriority()) br := &proto.BatchResponse{} if err := tmpDB.Txn(func(txn *client.Txn) error { txn.SetDebugName("auto-wrap", 0) b := &client.Batch{} for _, arg := range ba.Requests { req := arg.GetInner() call := proto.Call{Args: req, Reply: req.CreateReply()} b.InternalAddCall(call) br.Add(call.Reply) } return txn.CommitInBatch(b) }); err != nil { return nil, proto.NewError(err) } return br, nil }
// TODO(tschottdorf): this method is somewhat awkward but unless we want to // give this error back to the client, our options are limited. We'll have to // run the whole thing for them, or any restart will still end up at the client // which will not be prepared to be handed a Txn. func (tc *TxnCoordSender) resendWithTxn(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { // Run a one-off transaction with that single command. if log.V(1) { log.Infof("%s: auto-wrapping in txn and re-executing: ", ba) } tmpDB := client.NewDBWithPriority(tc, ba.GetUserPriority()) var br *proto.BatchResponse err := tmpDB.Txn(func(txn *client.Txn) error { txn.SetDebugName("auto-wrap", 0) b := &client.Batch{} for _, arg := range ba.Requests { req := arg.GetInner() b.InternalAddRequest(req) } var err error br, err = txn.CommitInBatchWithResponse(b) return err }) if err != nil { return nil, proto.NewError(err) } br.Txn = nil // hide the evidence return br, nil }
// TestNodeEventFeed publishes one event of each kind through a
// NodeEventFeed and verifies that the subscriber receives exactly the
// expected event values, in order.
func TestNodeEventFeed(t *testing.T) {
	defer leaktest.AfterTest(t)

	nodeDesc := proto.NodeDescriptor{
		NodeID: proto.NodeID(99),
	}

	// A testCase corresponds to a single Store event type. Each case contains a
	// method which publishes a single event to the given storeEventPublisher,
	// and an expected result interface which should match the produced
	// event.
	testCases := []struct {
		publishTo func(status.NodeEventFeed)
		expected  interface{}
	}{
		{
			publishTo: func(nef status.NodeEventFeed) {
				nef.StartNode(nodeDesc, 100)
			},
			expected: &status.StartNodeEvent{
				Desc:      nodeDesc,
				StartedAt: 100,
			},
		},
		{
			publishTo: func(nef status.NodeEventFeed) {
				nef.CallComplete(wrap(proto.NewGet(proto.Key("abc"))), nil)
			},
			expected: &status.CallSuccessEvent{
				NodeID: proto.NodeID(1),
				Method: proto.Get,
			},
		},
		{
			publishTo: func(nef status.NodeEventFeed) {
				nef.CallComplete(wrap(proto.NewPut(proto.Key("abc"), proto.Value{Bytes: []byte("def")})), nil)
			},
			expected: &status.CallSuccessEvent{
				NodeID: proto.NodeID(1),
				Method: proto.Put,
			},
		},
		{
			// NOTE(review): an error without an index apparently cannot be
			// attributed to an individual request, so the expected method is
			// Batch rather than Get — confirm against NodeEventFeed.CallComplete.
			publishTo: func(nef status.NodeEventFeed) {
				nef.CallComplete(wrap(proto.NewGet(proto.Key("abc"))), proto.NewError(util.Errorf("error")))
			},
			expected: &status.CallErrorEvent{
				NodeID: proto.NodeID(1),
				Method: proto.Batch,
			},
		},
		{
			// An error carrying an index is attributed to the request at
			// that position (here: the Get).
			publishTo: func(nef status.NodeEventFeed) {
				nef.CallComplete(wrap(proto.NewGet(proto.Key("abc"))), &proto.Error{
					Index:   &proto.ErrPosition{Index: 0},
					Message: "boo",
				})
			},
			expected: &status.CallErrorEvent{
				NodeID: proto.NodeID(1),
				Method: proto.Get,
			},
		},
	}

	// Compile expected events into a single slice.
	expectedEvents := make([]interface{}, len(testCases))
	for i := range testCases {
		expectedEvents[i] = testCases[i].expected
	}

	events := make([]interface{}, 0, len(expectedEvents))

	// Run test cases directly through a feed.
	stopper := stop.NewStopper()
	defer stopper.Stop()
	feed := util.NewFeed(stopper)
	feed.Subscribe(func(event interface{}) {
		events = append(events, event)
	})

	nodefeed := status.NewNodeEventFeed(proto.NodeID(1), feed)
	for _, tc := range testCases {
		tc.publishTo(nodefeed)
	}
	// Flush ensures all published events have reached the subscriber
	// before comparing.
	feed.Flush()

	if a, e := events, expectedEvents; !reflect.DeepEqual(a, e) {
		t.Errorf("received incorrect events.\nexpected: %v\nactual: %v", e, a)
	}
}
// Run executes the operations queued up within a batch. Before executing any // of the operations the batch is first checked to see if there were any errors // during its construction (e.g. failure to marshal a proto message). // // The operations within a batch are run in parallel and the order is // non-deterministic. It is an unspecified behavior to modify and retrieve the // same key within a batch. // // Upon completion, Batch.Results will contain the results for each // operation. The order of the results matches the order the operations were // added to the batch. func (db *DB) Run(b *Batch) *proto.Error { if err := b.prepare(); err != nil { return proto.NewError(err) } return sendAndFill(db.send, b) }
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
//
// The first half dispatches on the error type, folding any
// server-supplied transaction updates into newTxn and, for restartable
// errors, calling Restart and writing newTxn back into the error so the
// client can retry with it. The second half (the immediately-invoked
// closure) updates the coordinator's txnMeta bookkeeping under lock.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba proto.BatchRequest, br *proto.BatchResponse, pErr *proto.Error) *proto.Error {
	trace := tracer.FromCtx(ctx)
	newTxn := &proto.Transaction{}
	newTxn.Update(ba.GetTxn())
	err := pErr.GoError()
	switch t := err.(type) {
	case nil:
		newTxn.Update(br.GetTxn())
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// TODO(tschottdorf): range-spanning autowrap currently broken.
		panic("TODO(tschottdorf): disabled")
	case *proto.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case *proto.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Priority = t.Txn.Priority
		t.Txn = *newTxn
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
		// NOTE(review): unlike the sibling cases this assigns the pointer
		// itself — presumably TransactionPushError.Txn is a *Transaction;
		// confirm the field type.
		t.Txn = newTxn
	case *proto.TransactionRetryError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case proto.TransactionRestartError:
		// Assertion: The above cases should exhaust all ErrorDetails which
		// carry a Transaction.
		if pErr.Detail != nil {
			panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
		}
	}

	// Everything below mutates coordinator-wide state and therefore runs
	// under tc's lock; the closure shape allows early returns while still
	// holding the deferred Unlock.
	return func() *proto.Error {
		// Non-transactional requests carry no txn ID; nothing to track.
		if len(newTxn.ID) <= 0 {
			return pErr
		}
		id := string(newTxn.ID)
		tc.Lock()
		defer tc.Unlock()
		txnMeta := tc.txns[id]
		// For successful transactional requests, keep the written intents and
		// the updated transaction record to be sent along with the reply.
		// The transaction metadata is created with the first writing operation
		// TODO(tschottdorf): already computed the intents prior to sending,
		// consider re-using those.
		if intents := ba.GetIntents(); len(intents) > 0 && err == nil {
			if txnMeta == nil {
				// First write: the txn is now writing and gets tracked.
				newTxn.Writing = true
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[id] = txnMeta
				// If the transaction is already over, there's no point in
				// launching a one-off coordinator which will shut down right
				// away.
				if _, isEnding := ba.GetArg(proto.EndTransaction); !isEnding {
					trace.Event("coordinator spawns")
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						tc.unregisterTxnLocked(id)
						return proto.NewError(&proto.NodeUnavailableError{})
					}
				}
			}
			// Record every intent's key range for eventual resolution.
			for _, intent := range intents {
				txnMeta.addKeyRange(intent.Key, intent.EndKey)
			}
		}
		// Update our record of this transaction, even on error.
		if txnMeta != nil {
			txnMeta.txn.Update(newTxn) // better to replace after #2300
			if !txnMeta.txn.Writing {
				panic("tracking a non-writing txn")
			}
			txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		}
		if err == nil {
			// For successful transactional requests, always send the updated txn
			// record back.
			if br.Txn == nil {
				br.Txn = &proto.Transaction{}
			}
			*br.Txn = *newTxn
		}
		return pErr
	}()
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
	tc.maybeBeginTxn(&ba)
	ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow())
	var startNS int64

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(&ba)
	defer trace.Finalize()
	// TODO(tschottdorf): always "Batch"
	defer trace.Epoch(fmt.Sprintf("sending %s", ba.Method()))()
	ctx = tracer.ToCtx(ctx, trace)

	// TODO(tschottdorf): No looping through the batch will be necessary once
	// we've eliminated all the redundancies.
	for _, arg := range ba.Requests {
		trace.Event(fmt.Sprintf("%T", arg.GetValue()))
		if err := updateForBatch(arg.GetInner(), ba.RequestHeader); err != nil {
			return nil, proto.NewError(err)
		}
	}

	var id string // optional transaction ID
	if ba.Txn != nil {
		// If this request is part of a transaction...
		id = string(ba.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				return nil, proto.NewError(util.Errorf("transaction must not write on multiple coordinators"))
			}
		}

		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if ba.IsReadOnly() {
			ba.Timestamp = ba.Txn.OrigTimestamp
		} else {
			ba.Timestamp = ba.Txn.Timestamp
		}

		if rArgs, ok := ba.GetArg(proto.EndTransaction); ok {
			et := rArgs.(*proto.EndTransactionRequest)
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			// The coordinator owns the intent list; the client supplying one
			// (or a Key) would conflict with that bookkeeping.
			if len(et.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, proto.NewError(util.Errorf("client must not pass intents to EndTransaction"))
			}
			if len(et.Key) != 0 {
				return nil, proto.NewError(util.Errorf("EndTransaction must not have a Key set"))
			}
			et.Key = ba.Txn.Key

			// Attach the intents tracked so far for this transaction.
			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				et.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if intents := ba.GetIntents(); len(intents) > 0 {
				// Writes in Batch, so EndTransaction is fine. Should add
				// outstanding intents to EndTransaction, though.
				// TODO(tschottdorf): possible issues when the batch fails,
				// but the intents have been added anyways.
				// TODO(tschottdorf): some of these intents may be covered
				// by others, for example {[a,b), a}). This can lead to
				// some extra requests when those are non-local to the txn
				// record. But it doesn't seem worth optimizing now.
				et.Intents = append(et.Intents, intents...)
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, proto.NewError(util.Errorf("transaction is already committed or aborted"))
			}
			if len(et.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, proto.NewError(util.Errorf("cannot commit a read-only transaction"))
			}
			// TODO(tschottdorf): V(1)
			for _, intent := range et.Intents {
				trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *proto.BatchResponse
	{
		var pErr *proto.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		// A non-transactional request that needs a txn is automatically
		// re-run wrapped in one.
		if _, ok := pErr.GoError().(*proto.OpRequiresTxnError); ok {
			br, pErr = tc.resendWithTxn(ba)
		}

		// updateState folds the response/error into coordinator state and
		// may substitute its own error.
		if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil {
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	// Only EndTransaction responses need the linearizability wait and
	// cleanup below.
	if _, ok := ba.GetArg(proto.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	// A txn that is no longer PENDING has ended; stop tracking it.
	if br.Txn.Status != proto.PENDING {
		tc.cleanupTxn(trace, *br.Txn)
	}
	return br, nil
}
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). func (ds *DistSender) sendChunk(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { // TODO(tschottdorf): prepare for removing Key and EndKey from BatchRequest, // making sure that anything that relies on them goes bust. ba.Key, ba.EndKey = nil, nil isReverse := ba.IsReverse() trace := tracer.FromCtx(ctx) // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). from, to := keys.Range(ba) var br *proto.BatchResponse // Send the request to one range per iteration. for { options := lookupOptions{ useReverseScan: isReverse, } var curReply *proto.BatchResponse var desc *proto.RangeDescriptor var needAnother bool var pErr *proto.Error for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. descDone := trace.Epoch("meta descriptor lookup") var evictDesc func() desc, needAnother, evictDesc, pErr = ds.getDescriptors(from, to, options) descDone() // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. 
if needAnother && ba.Txn == nil && ba.IsRange() && ba.ReadConsistency != proto.INCONSISTENT { return nil, proto.NewError(&proto.OpRequiresTxnError{}) } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. if (isReverse && !desc.ContainsKeyRange(desc.StartKey, to)) || (!isReverse && !desc.ContainsKeyRange(from, desc.EndKey)) { evictDesc() continue } curReply, pErr = func() (*proto.BatchResponse, *proto.Error) { // Truncate the request to our current key range. untruncate, numActive, trErr := truncate(&ba, desc, from, to) if numActive == 0 && trErr == nil { untruncate() // This shouldn't happen in the wild, but some tests // exercise it. return nil, proto.NewError(util.Errorf("truncation resulted in empty batch on [%s,%s): %s", from, to, ba)) } defer untruncate() if trErr != nil { return nil, proto.NewError(trErr) } // TODO(tschottdorf): make key range on batch redundant. The // requests within dictate it anyways. ba.Key, ba.EndKey = keys.Range(ba) reply, err := ds.sendAttempt(trace, ba, desc) ba.Key, ba.EndKey = nil, nil if err != nil { if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } } return reply, err }() // If sending succeeded, break this loop. if pErr == nil { break } // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GoError().(type) { case *proto.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). 
// We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. // TODO(tschottdorf): If a replica group goes dead, this // will cause clients to put high read pressure on the first // range, so there should be some rate limiting here. evictDesc() if tErr.CanRetry() { continue } case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError: trace.Event(fmt.Sprintf("reply error: %T", tErr)) // Range descriptor might be out of date - evict it. evictDesc() // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } // On retries, allow [uncommitted] intents on range descriptor // lookups to be returned 50% of the time in order to succeed // at finding the transaction record pointed to by the intent // itself. The 50% probability of returning either the current // intent or the previously committed value balances between // the two cases where the intent's txn hasn't yet been // committed (the previous value is correct), or the intent's // txn has been committed (the intent value is correct). options.considerIntents = true continue case *proto.NotLeaderError: trace.Event(fmt.Sprintf("reply error: %T", tErr)) newLeader := tErr.GetLeader() // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale replica; evict cache. // Next, cache the new leader. if newLeader != nil { if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } evictDesc() } } else { newLeader = &proto.Replica{} } ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } trace.Event(fmt.Sprintf("reply error: %T", tErr)) continue } } break } // Immediately return if querying a range failed non-retryably. 
if pErr != nil { return nil, pErr } first := br == nil if first { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { // TODO(tschottdorf): return nil, proto.NewError(err) panic(err) } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. // Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if first { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]proto.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*proto.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(proto.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(proto.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. 
The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). ba.Requests[i].Reset() // necessary (no one-of?) if !ba.Requests[i].SetValue(&proto.NoopRequest{}) { panic("RequestUnion excludes NoopRequest") } continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. to = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. from = next(ba, desc.EndKey) } trace.Event("querying next range") } }
// TestTxnCoordSenderTxnUpdatedOnError verifies that errors adjust the
// response transaction's timestamp and priority as appropriate.
func TestTxnCoordSenderTxnUpdatedOnError(t *testing.T) {
	defer leaktest.AfterTest(t)
	t.Skip("TODO(tschottdorf): fix up and re-enable. It depends on each logical clock tick, so not fun.")
	manual := hlc.NewManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	clock.SetMaxOffset(20)

	// Each case injects err from the wrapped sender and checks the expected
	// post-error transaction state: epoch, priority, (orig) timestamp, and
	// whether the replying node was recorded in CertainNodes.
	testCases := []struct {
		err       error
		expEpoch  int32
		expPri    int32
		expTS     proto.Timestamp
		expOrigTS proto.Timestamp
		nodeSeen  bool
	}{
		// No error: txn proceeds untouched at its original timestamp.
		{nil, 0, 1, makeTS(0, 1), makeTS(0, 1), false},
		// Uncertainty restart: epoch bumps, timestamp moves past the
		// existing value, and the node is marked as certain.
		{&proto.ReadWithinUncertaintyIntervalError{
			ExistingTimestamp: makeTS(10, 10)}, 1, 1, makeTS(10, 11), makeTS(10, 11), true},
		// Abort: fresh txn (epoch 0) inheriting the aborter's priority and
		// timestamp; orig timestamp resets.
		{&proto.TransactionAbortedError{Txn: proto.Transaction{
			Timestamp: makeTS(20, 10), Priority: 10}}, 0, 10, makeTS(20, 10), makeTS(0, 1), false},
		// Push: epoch bumps, priority drops just below the pushee's, and
		// the timestamp advances past the pushee's.
		{&proto.TransactionPushError{PusheeTxn: proto.Transaction{
			Timestamp: makeTS(10, 10), Priority: int32(10)}}, 1, 9, makeTS(10, 11), makeTS(10, 11), false},
		// Retry: epoch bumps and the txn adopts the error's timestamp and
		// priority as-is.
		{&proto.TransactionRetryError{Txn: proto.Transaction{
			Timestamp: makeTS(10, 10), Priority: int32(10)}}, 1, 10, makeTS(10, 10), makeTS(10, 10), false},
	}

	// Template request; cloned per iteration so header mutations don't leak
	// between cases.
	var testPutReq = &proto.PutRequest{
		RequestHeader: proto.RequestHeader{
			Key:          proto.Key("test-key"),
			UserPriority: gogoproto.Int32(-1),
			Txn: &proto.Transaction{
				Name: "test txn",
			},
			Replica: proto.Replica{
				NodeID: 12345,
			},
		},
	}

	for i, test := range testCases {
		stopper := stop.NewStopper()
		// The wrapped sender always responds with the injected error.
		ts := NewTxnCoordSender(senderFn(func(_ context.Context, _ proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
			return nil, proto.NewError(test.err)
		}), clock, false, nil, stopper)
		var reply *proto.PutResponse
		if r, err := batchutil.SendWrapped(ts, gogoproto.Clone(testPutReq).(proto.Request)); err != nil {
			t.Fatal(err)
		} else {
			reply = r.(*proto.PutResponse)
		}
		// Tear down heartbeats before stopping so no goroutines leak.
		teardownHeartbeats(ts)
		stopper.Stop()

		// The error type must round-trip through the coordinator unchanged.
		if reflect.TypeOf(test.err) != reflect.TypeOf(reply.GoError()) {
			t.Fatalf("%d: expected %T; got %T: %v", i, test.err,
				reply.GoError(), reply.GoError())
		}
		if reply.Txn.Epoch != test.expEpoch {
			t.Errorf("%d: expected epoch = %d; got %d",
				i, test.expEpoch, reply.Txn.Epoch)
		}
		if reply.Txn.Priority != test.expPri {
			t.Errorf("%d: expected priority = %d; got %d",
				i, test.expPri, reply.Txn.Priority)
		}
		if !reply.Txn.Timestamp.Equal(test.expTS) {
			t.Errorf("%d: expected timestamp to be %s; got %s",
				i, test.expTS, reply.Txn.Timestamp)
		}
		// NOTE(review): the "+ 1" in this message looks like a leftover from
		// an earlier expectation scheme — the comparison is plain equality.
		// Confirm and drop the suffix when re-enabling this test.
		if !reply.Txn.OrigTimestamp.Equal(test.expOrigTS) {
			t.Errorf("%d: expected orig timestamp to be %s + 1; got %s",
				i, test.expOrigTS, reply.Txn.OrigTimestamp)
		}
		if nodes := reply.Txn.CertainNodes.Nodes; (len(nodes) != 0) != test.nodeSeen {
			t.Errorf("%d: expected nodeSeen=%t, but list of hosts is %v",
				i, test.nodeSeen, nodes)
		}
	}
}
// TestTxnCoordSenderBatchTransaction tests that it is possible to send // one-off transactional calls within a batch under certain circumstances. func TestTxnCoordSenderBatchTransaction(t *testing.T) { defer leaktest.AfterTest(t) t.Skip("TODO(tschottdorf): remove this test; behavior is more transparent now") defer leaktest.AfterTest(t) stopper := stop.NewStopper() defer stopper.Stop() clock := hlc.NewClock(hlc.UnixNano) var called bool var alwaysError = errors.New("success") ts := NewTxnCoordSender(senderFn(func(_ context.Context, _ proto.BatchRequest) (*proto.BatchResponse, *proto.Error) { called = true // Returning this error is an easy way of preventing heartbeats // to be started for otherwise "successful" calls. return nil, proto.NewError(alwaysError) }), clock, false, nil, stopper) pushArg := &proto.PushTxnRequest{} putArg := &proto.PutRequest{} getArg := &proto.GetRequest{} testCases := []struct { req proto.Request batch, arg, ok bool }{ // Lays intents: can't have this on individual calls at all. {putArg, false, false, true}, {putArg, true, false, true}, {putArg, true, true, false}, {putArg, false, true, false}, // No intents: all ok, except when batch and arg have different txns. {pushArg, false, false, true}, {pushArg, true, false, true}, {pushArg, true, true, false}, {pushArg, false, true, true}, {getArg, false, false, true}, {getArg, true, false, true}, {getArg, true, true, false}, {getArg, false, true, true}, } txn1 := &proto.Transaction{ID: []byte("txn1")} txn2 := &proto.Transaction{ID: []byte("txn2")} for i, tc := range testCases { called = false tc.req.Reset() ba := &proto.BatchRequest{} if tc.arg { tc.req.Header().Txn = txn1 } ba.Add(tc.req) if tc.batch { ba.Txn = txn2 } called = false _, err := batchutil.SendWrapped(ts, ba) if !tc.ok && err == alwaysError { t.Fatalf("%d: expected error%s", i, err) } else if tc.ok != called { t.Fatalf("%d: wanted call: %t, got call: %t", i, tc.ok, called) } } }