// sendBatch unrolls a batched command and sends each constituent
// command sequentially.
// TODO(tschottdorf): modify sendBatch so that it sends truly parallel requests
// when outside of a Transaction. This can then be used to address the TODO in
// (*TxnCoordSender).resolve().
func (tc *TxnCoordSender) sendBatch(ctx context.Context, batchArgs *proto.BatchRequest, batchReply *proto.BatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		if err := updateForBatch(args, batchArgs.RequestHeader); err != nil {
			batchReply.Header().SetGoError(err)
			return
		}
		call := proto.Call{Args: args}
		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(ctx, call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}

// sendBatch unrolls a batched command and sends each constituent
// command sequentially.
func (tc *TxnCoordSender) sendBatch(batchArgs *proto.InternalBatchRequest, batchReply *proto.InternalBatchResponse) {
	// Prepare the calls by unrolling the batch. If the batchReply is
	// pre-initialized with replies, use those; otherwise create replies
	// as needed.
	// TODO(spencer): send calls in parallel.
	batchReply.Txn = batchArgs.Txn
	for i := range batchArgs.Requests {
		args := batchArgs.Requests[i].GetValue().(proto.Request)
		call := proto.Call{Args: args}

		// Disallow transaction, user and priority on individual calls, unless
		// equal.
		if args.Header().User != "" && args.Header().User != batchArgs.User {
			batchReply.Header().SetGoError(util.Error("cannot have individual user on call in batch"))
			return
		}
		args.Header().User = batchArgs.User
		if args.Header().UserPriority != nil && args.Header().GetUserPriority() != batchArgs.GetUserPriority() {
			batchReply.Header().SetGoError(util.Error("cannot have individual user priority on call in batch"))
			return
		}
		args.Header().UserPriority = batchArgs.UserPriority
		if txn := args.Header().Txn; txn != nil && !txn.Equal(batchArgs.Txn) {
			batchReply.Header().SetGoError(util.Error("cannot have individual transactional call in batch"))
			return
		}
		// Propagate batch Txn to each call.
		args.Header().Txn = batchArgs.Txn

		// Create a reply from the method type and add to batch response.
		if i >= len(batchReply.Responses) {
			call.Reply = args.CreateReply()
			batchReply.Add(call.Reply)
		} else {
			call.Reply = batchReply.Responses[i].GetValue().(proto.Response)
		}
		tc.sendOne(call)
		// Amalgamate transaction updates and propagate first error, if applicable.
		if batchReply.Txn != nil {
			batchReply.Txn.Update(call.Reply.Header().Txn)
		}
		if call.Reply.Header().Error != nil {
			batchReply.Error = call.Reply.Header().Error
			return
		}
	}
}

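// The sketch below is not from the original source; it is a minimal,
// standard-library-only illustration of the unroll-and-validate shape shared
// by the two sendBatch variants above: each sub-request inherits the
// batch-level header fields, conflicting per-call values are rejected, and
// sending stops at the first error. The types (batch, request, reply) and
// sendOne here are hypothetical stand-ins for the proto definitions above.
package main

import (
	"errors"
	"fmt"
)

type request struct {
	user string
	key  string
}

type reply struct {
	err error
}

type batch struct {
	user     string
	requests []request
}

// sendOne is a stand-in for TxnCoordSender.sendOne; it pretends the call succeeded.
func sendOne(r request) reply {
	return reply{}
}

// sendBatchSketch unrolls the batch and sends each request in order,
// propagating the batch user and stopping at the first error.
func sendBatchSketch(b batch) ([]reply, error) {
	replies := make([]reply, 0, len(b.requests))
	for _, req := range b.requests {
		// Disallow a conflicting per-call user, mirroring the checks above.
		if req.user != "" && req.user != b.user {
			return replies, errors.New("cannot have individual user on call in batch")
		}
		req.user = b.user
		rep := sendOne(req)
		replies = append(replies, rep)
		if rep.err != nil {
			// Propagate the first error and stop, like the batch reply above.
			return replies, rep.err
		}
	}
	return replies, nil
}

func main() {
	b := batch{user: "root", requests: []request{{key: "a"}, {key: "b"}}}
	replies, err := sendBatchSketch(b)
	fmt.Println(len(replies), err)
}
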
// MaybeWrapCall returns a new call which wraps the original Args and Reply
// in a batch, if necessary.
// TODO(tschottdorf): will go when proto.Call does.
func MaybeWrapCall(call proto.Call) (proto.Call, func(proto.Call) proto.Call) {
	var unwrap func(proto.Response) proto.Response
	call.Args, unwrap = MaybeWrap(call.Args)
	newUnwrap := func(origReply proto.Response) func(proto.Call) proto.Call {
		return func(newCall proto.Call) proto.Call {
			origReply.Reset()
			gogoproto.Merge(origReply, unwrap(newCall.Reply))
			*origReply.Header() = *newCall.Reply.Header()
			newCall.Reply = origReply
			return newCall
		}
	}(call.Reply)
	call.Reply = call.Args.CreateReply()
	return call, newUnwrap
}

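// MaybeWrapCall captures the caller's original reply in a closure so that,
// after the wrapped (batched) call completes, the result can be merged back
// into the object the caller already holds. The following is a stripped-down
// sketch of that wrap/unwrap pairing, not the original code; reply, call and
// wrap are hypothetical stand-ins for proto.Response, proto.Call and
// MaybeWrapCall.
package main

import "fmt"

type reply struct{ value string }

type call struct {
	args  string
	reply *reply
}

// wrap swaps the call's reply for a fresh scratch reply and returns a
// closure that copies the scratch result back into the original reply.
func wrap(c call) (call, func(call) call) {
	orig := c.reply
	unwrap := func(done call) call {
		*orig = *done.reply // merge the wrapped result into the original
		done.reply = orig
		return done
	}
	c.reply = &reply{} // the wrapped call writes its result here
	return c, unwrap
}

func main() {
	mine := &reply{}
	c, unwrap := wrap(call{args: "get", reply: mine})
	c.reply.value = "result" // pretend the wrapped call filled this in
	c = unwrap(c)
	fmt.Println(mine.value, c.reply == mine) // "result true"
}
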
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)
	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those replicas with long common
				// prefix of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) && leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}
				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()
			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		// In next iteration, query next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}

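// The inner loop above distinguishes three outcomes: non-retryable errors
// break out, generic retryable errors back off, and addressing errors
// (stale descriptor, wrong leader) retry immediately by resetting the
// backoff. Below is a standard-library-only sketch of that control flow,
// not the original retry package; sendWithRetry, errStaleDescriptor and
// errUnavailable are hypothetical names standing in for the proto error
// types and retry.Start/Reset used above.
package main

import (
	"errors"
	"fmt"
	"time"
)

var (
	errStaleDescriptor = errors.New("stale range descriptor") // hypothetical addressing error
	errUnavailable     = errors.New("replica unavailable")    // hypothetical retryable error
)

func sendWithRetry(attempt func() error) error {
	const maxBackoff = time.Second
	backoff := time.Millisecond
	for tries := 0; tries < 10; tries++ {
		err := attempt()
		switch {
		case err == nil:
			return nil
		case errors.Is(err, errStaleDescriptor):
			// Addressing error: retry immediately, i.e. reset the backoff
			// (the analogue of r.Reset() above).
			backoff = time.Millisecond
			continue
		case errors.Is(err, errUnavailable):
			// Retryable error: back off before the next attempt.
			time.Sleep(backoff)
			if backoff *= 2; backoff > maxBackoff {
				backoff = maxBackoff
			}
			continue
		default:
			// Non-retryable: give up and report to the caller.
			return err
		}
	}
	return errors.New("retry budget exhausted")
}

func main() {
	n := 0
	err := sendWithRetry(func() error {
		if n++; n < 3 {
			return errStaleDescriptor
		}
		return nil
	})
	fmt.Println(n, err) // 3 <nil>
}
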
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply
	endKey := args.Header().EndKey

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, _ := args.(proto.Bounded)
	if boundedArgs != nil {
		defer func(n int64) {
			boundedArgs.SetBound(n)
		}(boundedArgs.GetBound())
	}

	// Retry logic for lookup of range by key and RPCs to range replicas.
	retryOpts := ds.rpcRetryOptions
	retryOpts.Tag = "routing " + call.Method().String() + " rpc"

	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		err := retry.WithBackoff(retryOpts, func() (retry.Status, error) {
			var err error
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					return retry.Continue, err
				}
				return retry.Break, err
			}
			// Truncate the request to our current range, making sure not to
			// touch it unless we have to (it is illegal to send EndKey on
			// commands which do not operate on ranges).
			if descNext != nil {
				args.Header().EndKey = desc.EndKey
				defer func() {
					// "Untruncate" EndKey to original.
					args.Header().EndKey = endKey
				}()
			}
			return ds.sendAttempt(desc, call)
		})

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if boundedArgs != nil {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		if curReply == finalReply {
			// This is the end of the first iteration in a multi-range query,
			// so it's a convenient place to clean up changes to the args in
			// the case of multi-range requests.
			// Reset original start key (the EndKey is taken care of without
			// defer above).
			defer func(k proto.Key) {
				args.Header().Key = k
			}(args.Header().Key)
		}

		// In next iteration, query next range.
		args.Header().Key = descNext.StartKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}

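// Both Send versions above serve a multi-range request one range at a time:
// the request is truncated to the current range's end key, the per-range
// reply is combined into the running result, and the start key advances to
// the next range. The sketch below illustrates that slicing with plain
// string keys and a fixed list of hypothetical range boundaries in place of
// descriptor lookups; ranges, scanRange and scan are invented names.
package main

import "fmt"

// ranges is a hypothetical sorted list of range end keys (exclusive).
var ranges = []string{"c", "f", "z"}

// scanRange pretends to scan [key, endKey) within a single range and just
// reports the boundaries it was asked to cover.
func scanRange(key, endKey string) []string {
	return []string{fmt.Sprintf("[%s,%s)", key, endKey)}
}

// scan splits [key, endKey) along range boundaries, scans each piece
// sequentially, and combines the partial results, mirroring the loop above.
func scan(key, endKey string) []string {
	var combined []string
	for _, rangeEnd := range ranges {
		if rangeEnd <= key {
			continue // this range ends before our span starts
		}
		truncEnd := endKey
		last := true
		if rangeEnd < endKey {
			truncEnd = rangeEnd // truncate the request to the current range
			last = false
		}
		combined = append(combined, scanRange(key, truncEnd)...)
		if last {
			break
		}
		// Advance to the next range; like the older version above, use the
		// current range's end key rather than the next range's start key.
		key = rangeEnd
	}
	return combined
}

func main() {
	fmt.Println(scan("a", "g")) // [[a,c) [c,f) [f,g)]
}
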
// close sends resolve intent commands for all key ranges this
// transaction has covered, clears the keys cache and closes the
// metadata heartbeat. Any keys listed in the resolved slice have
// already been resolved and do not receive resolve intent commands.
func (tm *txnMetadata) close(txn *proto.Transaction, resolved []proto.Key, sender client.Sender, stopper *util.Stopper) {
	close(tm.txnEnd) // stop heartbeat
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      storage.UserRoot,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      storage.UserRoot,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		// We don't care about the reply channel; these are best
		// effort. We simply fire and forget, each in its own goroutine.
		if stopper.StartTask() {
			go func() {
				if log.V(2) {
					log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
				}
				sender.Send(context.TODO(), call)
				if call.Reply.Header().Error != nil {
					log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
				}
				stopper.FinishTask()
			}()
		}
	}
	tm.keys.Clear()
}

// close sends resolve intent commands for all key ranges this
// transaction has covered, clears the keys cache and closes the
// metadata heartbeat. Any keys listed in the resolved slice have
// already been resolved and do not receive resolve intent commands.
func (tm *txnMetadata) close(trace *tracer.Trace, txn *proto.Transaction, resolved []proto.Key, sender client.Sender, stopper *stop.Stopper) {
	close(tm.txnEnd) // stop heartbeat
	trace.Event("coordinator stops")
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	// TODO(tschottdorf): Should create a Batch here.
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		// We don't care about the reply channel; these are best
		// effort. We simply fire and forget, each in its own goroutine.
		ctx := tracer.ToCtx(context.Background(), trace.Fork())
		stopper.RunAsyncTask(func() {
			if log.V(2) {
				log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
			}
			sender.Send(ctx, call)
			if call.Reply.Header().Error != nil {
				log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
			}
		})
	}
	tm.keys.Clear()
}

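// The cleanup above is best effort: each intent resolution is fired off in
// its own goroutine and nobody waits for the result, but the goroutines are
// registered with a stopper so server shutdown can drain them. The sketch
// below is a deliberately simplified, hypothetical stopper with that shape;
// it is not the real util.Stopper or stop.Stopper API.
package main

import (
	"fmt"
	"sync"
)

// stopper tracks async tasks and refuses new ones once stopping.
type stopper struct {
	mu       sync.Mutex
	stopping bool
	tasks    sync.WaitGroup
}

// runAsyncTask starts f in a goroutine unless the stopper is shutting down.
func (s *stopper) runAsyncTask(f func()) bool {
	s.mu.Lock()
	if s.stopping {
		s.mu.Unlock()
		return false
	}
	s.tasks.Add(1)
	s.mu.Unlock()
	go func() {
		defer s.tasks.Done()
		f()
	}()
	return true
}

// stop blocks new tasks and waits for in-flight ones to finish.
func (s *stopper) stop() {
	s.mu.Lock()
	s.stopping = true
	s.mu.Unlock()
	s.tasks.Wait()
}

func main() {
	s := &stopper{}
	for i := 0; i < 3; i++ {
		i := i
		s.runAsyncTask(func() { fmt.Println("resolving intent", i) }) // fire and forget
	}
	s.stop() // drain in-flight cleanup before shutdown
}
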
// resolve sends resolve intent commands for all key ranges this transaction
// has covered. Any keys listed in the resolved slice have already been
// resolved and are skipped.
func (tm *txnMetadata) resolve(trace *tracer.Trace, resolved []proto.Key, sender client.Sender) {
	txn := &tm.txn
	if tm.keys.Len() > 0 {
		if log.V(2) {
			log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn)
		}
	}
	// TODO(tschottdorf): Should create a Batch here. However, we're resolving
	// intents and if those are on meta records, there may be a certain order
	// in which they need to be resolved so that they can get routed to the
	// correct range. Since a batch runs its commands one by one and we don't
	// know the correct order, we prefer to fire them off in parallel.
	var wg sync.WaitGroup
	for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) {
		// If the op was range based, end key != start key: resolve a range.
		var call proto.Call
		key := o.Key.Start().(proto.Key)
		endKey := o.Key.End().(proto.Key)
		if !key.Next().Equal(endKey) {
			call.Args = &proto.InternalResolveIntentRangeRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					EndKey:    endKey,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentRangeResponse{}
		} else {
			// Check if the key has already been resolved; skip if yes.
			found := false
			for _, k := range resolved {
				if key.Equal(k) {
					if log.V(2) {
						log.Warningf("skipping previously resolved intent at %q", k)
					}
					found = true
				}
			}
			if found {
				continue
			}
			call.Args = &proto.InternalResolveIntentRequest{
				RequestHeader: proto.RequestHeader{
					Timestamp: txn.Timestamp,
					Key:       key,
					User:      security.RootUser,
					Txn:       txn,
				},
			}
			call.Reply = &proto.InternalResolveIntentResponse{}
		}
		ctx := tracer.ToCtx(context.Background(), trace.Fork())
		if log.V(2) {
			log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn)
		}
		// Each operation gets its own goroutine. We only want to return to
		// the caller after the operations have finished.
		wg.Add(1)
		go func() {
			sender.Send(ctx, call)
			wg.Done()
			if call.Reply.Header().Error != nil {
				log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError())
			}
		}()
	}
	defer trace.Epoch("waiting for intent resolution")()
	wg.Wait()
	tm.keys.Clear()
}
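
// Unlike the fire-and-forget close above, resolve must not return until
// every intent command has come back, so it fans the calls out to goroutines
// and blocks on a sync.WaitGroup. Below is a minimal, self-contained sketch
// of that fan-out-and-wait shape; resolveIntent and resolveAll are
// hypothetical stand-ins for sender.Send and the loop above.
package main

import (
	"fmt"
	"sync"
	"time"
)

// resolveIntent is a stand-in for sending one resolve-intent call.
func resolveIntent(key string) error {
	time.Sleep(10 * time.Millisecond) // simulate an RPC
	return nil
}

func resolveAll(keys []string) {
	var wg sync.WaitGroup
	for _, key := range keys {
		key := key // capture a per-iteration copy for the goroutine
		wg.Add(1)
		go func() {
			defer wg.Done()
			if err := resolveIntent(key); err != nil {
				// Errors are only logged; resolution remains best effort.
				fmt.Printf("failed to cleanup %q intent: %v\n", key, err)
			}
		}()
	}
	// Only return to the caller once all resolutions have finished.
	wg.Wait()
}

func main() {
	resolveAll([]string{"a", "b", "c"})
	fmt.Println("all intents resolved")
}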