// Send sends a batch request to Cockroach via RPC. Errors which are retryable
// are retried with backoff in a loop using the default retry options. Other
// errors sending the request are retried indefinitely using the same client
// command ID to avoid reporting failure when in fact the command may have
// gone through and been executed successfully. We retry here to eventually
// get through with the same client command ID and be given the cached
// response.
func (s *rpcSender) Send(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
	var err error
	var br proto.BatchResponse
	for r := retry.Start(s.retryOpts); r.Next(); {
		select {
		case <-s.client.Healthy():
		default:
			// method is the RPC method name, defined elsewhere in this package.
			err = fmt.Errorf("failed to send RPC request %s: client is unhealthy", method)
			log.Warning(err)
			continue
		}

		if err = s.client.Call(method, &ba, &br); err != nil {
			br.Reset() // don't trust anyone.
			// Assume all errors sending request are retryable. The actual
			// number of things that could go wrong is vast, but we don't
			// want to miss any which should in theory be retried with the
			// same client command ID. We log the error here as a warning so
			// there's visibility that this is happening. Some of the errors
			// we'll sweep up in this net shouldn't be retried, but we can't
			// really know for sure which.
			log.Warningf("failed to send RPC request %s: %s", method, err)
			continue
		}

		// On successful post, we're done with the retry loop.
		break
	}
	if err != nil {
		return nil, proto.NewError(err)
	}
	pErr := br.Error
	br.Error = nil
	return &br, pErr
}
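// The idempotency argument above rests on reusing one client command ID
// across attempts. A minimal sketch of that pattern, assuming a hypothetical
// sendOnce helper and using only the standard library ("errors", "time");
// this is illustrative, not the real client code:
func sendWithSameID(commandID string, sendOnce func(id string) error) error {
	backoff := 50 * time.Millisecond
	for attempt := 0; attempt < 10; attempt++ {
		if err := sendOnce(commandID); err == nil {
			return nil
		}
		// Back off before retrying. The command ID stays the same, so a
		// server which already executed the command can recognize the
		// replay and serve its cached response rather than re-executing.
		time.Sleep(backoff)
		if backoff *= 2; backoff > 2*time.Second {
			backoff = 2 * time.Second
		}
	}
	return errors.New("all retries exhausted")
}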
func newTestSender(pre, post func(proto.BatchRequest) (*proto.BatchResponse, *proto.Error)) SenderFunc {
	txnKey := proto.Key("test-txn")
	txnID := []byte(uuid.NewUUID4())

	return func(_ context.Context, ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
		ba.UserPriority = gogoproto.Int32(-1)
		if ba.Txn != nil && len(ba.Txn.ID) == 0 {
			ba.Txn.Key = txnKey
			ba.Txn.ID = txnID
		}

		var br *proto.BatchResponse
		var pErr *proto.Error
		if pre != nil {
			br, pErr = pre(ba)
		} else {
			br = &proto.BatchResponse{}
		}
		if pErr != nil {
			return nil, pErr
		}

		var writing bool
		status := proto.PENDING
		if _, ok := ba.GetArg(proto.Put); ok {
			br.Add(gogoproto.Clone(testPutResp).(proto.Response))
			writing = true
		}
		if args, ok := ba.GetArg(proto.EndTransaction); ok {
			et := args.(*proto.EndTransactionRequest)
			writing = true
			if et.Commit {
				status = proto.COMMITTED
			} else {
				status = proto.ABORTED
			}
		}
		br.Txn = gogoproto.Clone(ba.Txn).(*proto.Transaction)
		if br.Txn != nil && pErr == nil {
			br.Txn.Writing = writing
			br.Txn.Status = status
		}

		if post != nil {
			br, pErr = post(ba)
		}
		return br, pErr
	}
}
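// A sketch of how the helper above might be exercised in a test; the test
// name and assertions are illustrative (assuming the "testing" and context
// packages), not copied from the real test suite:
func TestNewTestSenderAssignsTxnID(t *testing.T) {
	sender := newTestSender(nil, nil)
	ba := proto.BatchRequest{}
	ba.Txn = &proto.Transaction{} // no ID yet; the sender should assign one
	br, pErr := sender(context.Background(), ba)
	if pErr != nil {
		t.Fatal(pErr)
	}
	if br.Txn == nil || len(br.Txn.ID) == 0 {
		t.Fatal("expected test sender to assign a transaction ID")
	}
}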
// sendBatch unrolls a batched command and sends each constituent command one
// at a time (truly parallel sends are still a TODO below).
// TODO(tschottdorf): modify sendBatch so that it sends truly parallel requests
// when outside of a Transaction. This can then be used to address the TODO in
// (*TxnCoordSender).resolve().
func (tc *TxnCoordSender) sendBatch(ctx context.Context, batchArgs *proto.BatchRequest, batchReply *proto.BatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		if err := updateForBatch(args, batchArgs.RequestHeader); err != nil {
			batchReply.Header().SetGoError(err)
			return
		}
		call := proto.Call{Args: args}

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(ctx, call)

		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
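// The two TODOs above both want truly parallel sends. A sketch of the
// fan-out shape using sync.WaitGroup; sendBatchParallel is a hypothetical
// name, and it assumes sendOne is safe for concurrent use, which the code
// above does not yet guarantee:
func (tc *TxnCoordSender) sendBatchParallel(ctx context.Context, calls []proto.Call) {
	var wg sync.WaitGroup
	for i := range calls {
		wg.Add(1)
		go func(call proto.Call) {
			defer wg.Done()
			tc.sendOne(ctx, call)
		}(calls[i])
	}
	// Wait for all calls; txn updates and error propagation would still
	// have to be amalgamated in request order after this barrier.
	wg.Wait()
}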
// TODO(tschottdorf): this method is somewhat awkward but unless we want to
// give this error back to the client, our options are limited. We'll have to
// run the whole thing for them, or any restart will still end up at the client
// which will not be prepared to be handed a Txn.
func (tc *TxnCoordSender) resendWithTxn(ba proto.BatchRequest) (*proto.BatchResponse, *proto.Error) {
	// Run a one-off transaction with that single command.
	if log.V(1) {
		log.Infof("%s: auto-wrapping in txn and re-executing", ba)
	}
	tmpDB := client.NewDBWithPriority(tc, ba.GetUserPriority())
	var br *proto.BatchResponse
	err := tmpDB.Txn(func(txn *client.Txn) error {
		txn.SetDebugName("auto-wrap", 0)
		b := &client.Batch{}
		for _, arg := range ba.Requests {
			req := arg.GetInner()
			b.InternalAddRequest(req)
		}
		var err error
		br, err = txn.CommitInBatchWithResponse(b)
		return err
	})
	if err != nil {
		return nil, proto.NewError(err)
	}
	br.Txn = nil // hide the evidence
	return br, nil
}
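// For reference, the auto-wrap above gives the client the same semantics as
// if it had issued the batch inside an explicit transaction itself, roughly
// like the sketch below (runBatchInTxn is a hypothetical helper; request
// construction is elided):
func runBatchInTxn(db *client.DB, reqs []proto.Request) (*proto.BatchResponse, error) {
	var br *proto.BatchResponse
	err := db.Txn(func(txn *client.Txn) error {
		b := &client.Batch{}
		for _, req := range reqs {
			b.InternalAddRequest(req)
		}
		var err error
		br, err = txn.CommitInBatchWithResponse(b)
		return err
	})
	return br, err
}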
// sendBatch unrolls a batched command and sends each constituent command one
// at a time (parallel sends remain a TODO below).
func (tc *TxnCoordSender) sendBatch(batchArgs *proto.BatchRequest, batchReply *proto.BatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		// Initialize args header values where appropriate.
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		method, err := proto.MethodForRequest(args)
		if err != nil {
			batchReply.SetGoError(err)
			return
		}
		call := &client.Call{Method: method, Args: args}
		if args.Header().User == "" {
			args.Header().User = batchArgs.User
		}
		if args.Header().UserPriority == nil {
			args.Header().UserPriority = batchArgs.UserPriority
		}
		args.Header().Txn = batchArgs.Txn

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			if call.Reply, err = proto.CreateReply(method); err != nil {
				batchReply.SetGoError(util.Errorf("unsupported method in batch: %s", method))
				return
			}
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(call)

		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object where appropriate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba proto.BatchRequest, br *proto.BatchResponse, pErr *proto.Error) *proto.Error {
	trace := tracer.FromCtx(ctx)
	newTxn := &proto.Transaction{}
	newTxn.Update(ba.GetTxn())
	err := pErr.GoError()
	switch t := err.(type) {
	case nil:
		newTxn.Update(br.GetTxn())
		// Move txn timestamp forward to response timestamp if applicable.
		// TODO(tschottdorf): see (*Replica).executeBatch and comments within.
		// Looks like this isn't necessary any more, nor did it prevent a bug
		// referenced in a TODO there.
		newTxn.Timestamp.Forward(br.Timestamp)
	case *proto.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.OpRequiresTxnError:
		// TODO(tschottdorf): range-spanning autowrap currently broken.
		panic("TODO(tschottdorf): disabled")
	case *proto.ReadWithinUncertaintyIntervalError:
		// Mark the host as certain. See the protobuf comment for
		// Transaction.CertainNodes for details.
		if t.NodeID == 0 {
			panic("no replica set in header on uncertainty restart")
		}
		newTxn.CertainNodes.Add(t.NodeID)
		// If the reader encountered a newer write within the uncertainty
		// interval, move the timestamp forward, just past that write or
		// up to MaxTimestamp, whichever comes first.
		candidateTS := newTxn.MaxTimestamp
		candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
		newTxn.Timestamp.Forward(candidateTS)
		newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case *proto.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Priority = t.Txn.Priority
		t.Txn = *newTxn
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(trace, t.Txn)
	case *proto.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
		newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
		t.Txn = newTxn
	case *proto.TransactionRetryError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(t.Txn.Timestamp)
		newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
		t.Txn = *newTxn
	case proto.TransactionRestartError:
		// Assertion: The above cases should exhaust all ErrorDetails which
		// carry a Transaction.
		if pErr.Detail != nil {
			panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
		}
	}

	return func() *proto.Error {
		if len(newTxn.ID) <= 0 {
			return pErr
		}
		id := string(newTxn.ID)
		tc.Lock()
		defer tc.Unlock()
		txnMeta := tc.txns[id]
		// For successful transactional requests, keep the written intents and
		// the updated transaction record to be sent along with the reply.
		// The transaction metadata is created with the first writing operation.
		// TODO(tschottdorf): already computed the intents prior to sending,
		// consider re-using those.
		if intents := ba.GetIntents(); len(intents) > 0 && err == nil {
			if txnMeta == nil {
				newTxn.Writing = true
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
					firstUpdateNanos: tc.clock.PhysicalNow(),
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[id] = txnMeta
				// If the transaction is already over, there's no point in
				// launching a one-off coordinator which will shut down right
				// away.
				if _, isEnding := ba.GetArg(proto.EndTransaction); !isEnding {
					trace.Event("coordinator spawns")
					if !tc.stopper.RunAsyncTask(func() {
						tc.heartbeatLoop(id)
					}) {
						// The system is already draining and we can't start the
						// heartbeat. We refuse new transactions for now because
						// they're likely not going to have all intents committed.
						// In principle, we can relax this as needed though.
						tc.unregisterTxnLocked(id)
						return proto.NewError(&proto.NodeUnavailableError{})
					}
				}
			}
			for _, intent := range intents {
				txnMeta.addKeyRange(intent.Key, intent.EndKey)
			}
		}
		// Update our record of this transaction, even on error.
		if txnMeta != nil {
			txnMeta.txn.Update(newTxn) // better to replace after #2300
			if !txnMeta.txn.Writing {
				panic("tracking a non-writing txn")
			}
			txnMeta.setLastUpdate(tc.clock.PhysicalNow())
		}
		if err == nil {
			// For successful transactional requests, always send the updated txn
			// record back.
			if br.Txn == nil {
				br.Txn = &proto.Transaction{}
			}
			*br.Txn = *newTxn
		}
		return pErr
	}()
}
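// The uncertainty-restart arithmetic in updateState above computes
// min(MaxTimestamp, existing+1) and then ratchets the txn timestamp up to
// it. A standalone sketch of just that computation, reusing the Forward,
// Backward, and Add helpers seen above (uncertaintyRestartTimestamp is a
// hypothetical name):
func uncertaintyRestartTimestamp(txnTS, maxTS, existingTS proto.Timestamp) proto.Timestamp {
	candidate := maxTS
	// Backward takes the minimum: don't jump past the txn's MaxTimestamp.
	candidate.Backward(existingTS.Add(0, 1))
	// Forward takes the maximum: never move the txn timestamp backwards.
	txnTS.Forward(candidate)
	return txnTS
}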
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc).
func (ds *DistSender) sendChunk(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, error) {
	// TODO(tschottdorf): prepare for removing Key and EndKey from BatchRequest,
	// making sure that anything that relies on them goes bust.
	ba.Key, ba.EndKey = nil, nil

	isReverse := ba.IsReverse()
	trace := tracer.FromCtx(ctx)

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	from, to := keys.Range(ba)
	var br *proto.BatchResponse

	// Send the request to one range per iteration.
	for {
		options := lookupOptions{
			useReverseScan: isReverse,
		}

		var curReply *proto.BatchResponse
		var desc *proto.RangeDescriptor
		var needAnother bool
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get the range descriptor (or, when the batch spans ranges,
			// descriptors). Our error handling below may clear them on
			// certain errors, so we refresh (likely from the cache) on
			// every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			var evictDesc func()
			desc, needAnother, evictDesc, err = ds.getDescriptors(from, to, options)
			descDone()

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}

			// If there's no transaction and the op spans ranges, possibly
			// re-run as part of a transaction for consistency. The one case
			// where we don't need to re-run is when the read is INCONSISTENT,
			// i.e. consistency is not required.
			if needAnother && ba.Txn == nil && ba.IsRange() &&
				ba.ReadConsistency != proto.INCONSISTENT {
				return nil, &proto.OpRequiresTxnError{}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example: for a revscan over [a,g), the first desc
			// lookup for "g" returns descriptor [c,d), so [d,g) is never
			// scanned. We evict and retry in such a case.
			if (isReverse && !desc.ContainsKeyRange(desc.StartKey, to)) ||
				(!isReverse && !desc.ContainsKeyRange(from, desc.EndKey)) {
				evictDesc()
				continue
			}

			curReply, err = func() (*proto.BatchResponse, error) {
				// Truncate the request to our current key range.
				untruncate, numActive, trErr := truncate(&ba, desc, from, to)
				if numActive == 0 {
					untruncate()
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, util.Errorf("truncation resulted in empty batch on [%s,%s): %s",
						from, to, ba)
				}
				defer untruncate()
				if trErr != nil {
					return nil, trErr
				}
				// TODO(tschottdorf): make key range on batch redundant. The
				// requests within dictate it anyways.
				ba.Key, ba.EndKey = keys.Range(ba)
				reply, err := ds.sendAttempt(trace, ba, desc)
				ba.Key, ba.EndKey = nil, nil
				if err != nil {
					if log.V(0 /* TODO(tschottdorf): 1 */) {
						log.Warningf("failed to invoke %s: %s", ba, err)
					}
				}
				return reply, err
			}()

			// If sending succeeded, break this loop.
			if err == nil {
				break
			}

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't back off on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := err.(type) {
			case *rpc.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				// TODO(tschottdorf): If a replica group goes dead, this
				// will cause clients to put high read pressure on the first
				// range, so there should be some rate limiting here.
				evictDesc()
				if tErr.CanRetry() {
					continue
				}
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// Range descriptor might be out of date - evict it.
				evictDesc()
				// On addressing errors, don't back off; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				// For the remainder of this call, we'll assume that intents
				// are fair game. This replaces more complex logic based on
				// the type of request.
				options.considerIntents = true
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that the leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s",
								newLeader, desc)
						}
						evictDesc()
					}
				} else {
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if err != nil {
			return nil, err
		}

		first := br == nil
		if first {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				panic(err) // TODO(tschottdorf): return nil, err
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false
			if first {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]proto.RequestUnion(nil), ba.Requests...)
			}
			for i, union := range ba.Requests {
				args := union.GetValue()
				if _, ok := args.(*proto.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(proto.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetValue().(proto.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the
					// reply isn't countable occurs when the request wasn't
					// active for that range (since it didn't apply to it), so
					// the response is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					ba.Requests[i].Reset() // necessary (no one-of?)
					if !ba.Requests[i].SetValue(&proto.NoopRequest{}) {
						panic("RequestUnion excludes NoopRequest")
					}
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil
		}

		if isReverse {
			// In the next iteration, query the previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			to = prev(ba, desc.StartKey)
		} else {
			// In the next iteration, query the next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			from = next(ba, desc.EndKey)
		}
		trace.Event("querying next range")
	}
}
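// The saturation bookkeeping above boils down to: for each bounded request,
// subtract the rows this range returned from its remaining bound, and mask
// the request out (turn it into a noop) once the bound hits zero. A
// self-contained sketch over plain int64 bounds (maskSaturated and the
// masked sentinel are hypothetical; a bound of 0 means "unbounded"):
const masked = int64(-1) // stand-in for replacing the request with a NoopRequest

func maskSaturated(bounds, counts []int64) (needAnother bool) {
	for i := range bounds {
		switch {
		case bounds[i] == masked:
			// Already masked out on an earlier range; skip it, like the
			// NoopRequest skip in the real loop.
		case bounds[i] == 0:
			// Unbounded request: it has to visit every remaining range.
			needAnother = true
		default:
			if bounds[i] -= counts[i]; bounds[i] <= 0 {
				bounds[i] = masked // saturated: no further work for it
			} else {
				needAnother = true // rows still outstanding
			}
		}
	}
	return needAnother
}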