// sendAttempt gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor) (*roachpb.BatchResponse, *roachpb.Error) {
	defer trace.Epoch("sending RPC")()

	leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}

	br, err := ds.sendRPC(trace, desc.RangeID, replicas, order, ba)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	// Untangle the error from the received response.
	pErr := br.Error
	br.Error = nil // scrub the response error
	return br, pErr
}
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendSingleRange(
	ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// Hack: avoid formatting the message passed to Span.LogEvent for
	// opentracing.noopSpans. We can't actually tell if we have a noopSpan, but
	// we can see if the span has a NoopTracer. Note that this particular
	// invocation is expensive because we're pretty-printing keys.
	//
	// TODO(tschottdorf): This hack can go away when something like
	// Span.LogEventf is added.
	sp := opentracing.SpanFromContext(ctx)
	if sp != nil && sp.Tracer() != (opentracing.NoopTracer{}) {
		sp.LogEvent(fmt.Sprintf("sending RPC to [%s, %s)", desc.StartKey, desc.EndKey))
	}

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) {
		if leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)); leader.StoreID > 0 {
			if i := replicas.FindReplica(leader.StoreID); i >= 0 {
				replicas.MoveToFront(i)
				order = orderStable
			}
		}
	}

	// TODO(tschottdorf): should serialize the trace here, not higher up.
	br, pErr := ds.sendRPC(ctx, desc.RangeID, replicas, order, ba)
	if pErr != nil {
		return nil, pErr
	}

	// If the reply contains a timestamp, update the local HLC with it.
	if br.Error != nil && br.Error.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Error.Now)
	} else if br.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Now)
	}

	// Untangle the error from the received response.
	pErr = br.Error
	br.Error = nil // scrub the response error
	return br, pErr
}
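// Illustrative sketch (not part of DistSender): the NoopTracer check above
// exists so the expensive pretty-printing of range keys only happens when
// someone is actually recording trace events. The eventLogger interface and
// noopLogger/stdoutLogger types below are hypothetical stand-ins for the
// opentracing Span/NoopTracer pair used in the real code; this only shows the
// "don't format if nobody is listening" pattern.
package main

import "fmt"

type eventLogger interface {
	// Recording reports whether logged events go anywhere.
	Recording() bool
	LogEvent(msg string)
}

type noopLogger struct{}

func (noopLogger) Recording() bool     { return false }
func (noopLogger) LogEvent(msg string) {}

type stdoutLogger struct{}

func (stdoutLogger) Recording() bool     { return true }
func (stdoutLogger) LogEvent(msg string) { fmt.Println("event:", msg) }

// logRPC builds the message only when the logger is recording, mirroring the
// noopSpan guard above.
func logRPC(l eventLogger, startKey, endKey string) {
	if l.Recording() {
		l.LogEvent(fmt.Sprintf("sending RPC to [%s, %s)", startKey, endKey))
	}
}

func main() {
	logRPC(noopLogger{}, "a", "b")   // no formatting work, no output
	logRPC(stdoutLogger{}, "a", "b") // event: sending RPC to [a, b)
}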
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendSingleRange(
	ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor,
) (*roachpb.BatchResponse, *roachpb.Error) {
	log.Trace(ctx, fmt.Sprintf("sending RPC to [%s, %s)", desc.StartKey, desc.EndKey))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) {
		if leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)); leader.StoreID > 0 {
			if i := replicas.FindReplica(leader.StoreID); i >= 0 {
				replicas.MoveToFront(i)
				order = orderStable
			}
		}
	}

	// TODO(tschottdorf): should serialize the trace here, not higher up.
	br, pErr := ds.sendRPC(ctx, desc.RangeID, replicas, order, ba)
	if pErr != nil {
		return nil, pErr
	}

	// If the reply contains a timestamp, update the local HLC with it.
	if br.Error != nil && br.Error.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Error.Now)
	} else if br.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Now)
	}

	// Untangle the error from the received response.
	pErr = br.Error
	br.Error = nil // scrub the response error
	return br, pErr
}
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendSingleRange(trace opentracing.Span, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor) (*roachpb.BatchResponse, *roachpb.Error) {
	trace.LogEvent(fmt.Sprintf("sending RPC to [%s, %s)", desc.StartKey, desc.EndKey))

	leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = orderStable
		}
	}

	// Increase the sequence counter in the per-range loop (not
	// outside) since we might hit the same range twice by
	// accident. For example, we might send multiple requests to
	// the same Replica if (1) the descriptor cache has post-split
	// descriptors that are still write intents and (2) the split
	// has not yet been completed.
	ba.SetNewRequest()

	// TODO(tschottdorf): should serialize the trace here, not higher up.
	br, pErr := ds.sendRPC(trace, desc.RangeID, replicas, order, ba)
	if pErr != nil {
		return nil, pErr
	}

	// Untangle the error from the received response.
	pErr = br.Error
	br.Error = nil // scrub the response error
	return br, pErr
}
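// Illustrative sketch (not part of DistSender): the leader handling above
// amounts to "if we know which store holds the leader, move its replica to
// the front of the candidate list and pin that order". The replicaInfo type
// and moveToFront helper are hypothetical stand-ins for the replica
// descriptors and replicas.FindReplica / replicas.MoveToFront used above.
package main

import "fmt"

type replicaInfo struct {
	storeID int
	node    string
}

// moveToFront swaps the replica at index i with the first element.
func moveToFront(rs []replicaInfo, i int) {
	rs[0], rs[i] = rs[i], rs[0]
}

func main() {
	replicas := []replicaInfo{{2, "n2"}, {7, "n7"}, {4, "n4"}}
	leaderStoreID := 4 // pretend the leader cache returned store 4

	for i := range replicas {
		if replicas[i].storeID == leaderStoreID {
			moveToFront(replicas, i)
			break
		}
	}
	fmt.Println(replicas) // [{4 n4} {7 n7} {2 n2}]: leader's replica is tried first
}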
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if err := tc.maybeBeginTxn(&ba); err != nil {
		return nil, roachpb.NewError(err)
	}
	ba.CmdID = ba.GetOrCreateCmdID(tc.clock.PhysicalNow())
	var startNS int64

	// This is the earliest point at which the request has a ClientCmdID and/or
	// TxnID (if applicable). Begin a Trace which follows this request.
	trace := tc.tracer.NewTrace(tracer.Coord, &ba)
	defer trace.Finalize()
	defer trace.Epoch("sending batch")()
	ctx = tracer.ToCtx(ctx, trace)

	var id string // optional transaction ID
	if ba.Txn != nil {
		// If this request is part of a transaction...
		id = string(ba.Txn.ID)
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[id]
			tc.Unlock()
			if !ok {
				return nil, roachpb.NewError(util.Errorf("transaction must not write on multiple coordinators"))
			}
		}
		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if ba.IsReadOnly() {
			ba.Timestamp = ba.Txn.OrigTimestamp
		} else {
			ba.Timestamp = ba.Txn.Timestamp
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewError(util.Errorf("EndTransaction must not have a Key set"))
			}
			et.Key = ba.Txn.Key
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			if len(et.Intents) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewError(util.Errorf("client must not pass intents to EndTransaction"))
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[id]
			if id != "" && metaOK {
				et.Intents = txnMeta.intents()
			}
			tc.Unlock()

			if intents := ba.GetIntents(); len(intents) > 0 {
				// Writes in Batch, so EndTransaction is fine. Should add
				// outstanding intents to EndTransaction, though.
				// TODO(tschottdorf): possible issues when the batch fails,
				// but the intents have been added anyways.
				// TODO(tschottdorf): some of these intents may be covered
				// by others, for example {[a,b), a}). This can lead to
				// some extra requests when those are non-local to the txn
				// record. But it doesn't seem worth optimizing now.
				et.Intents = append(et.Intents, intents...)
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewError(util.Errorf("transaction is already committed or aborted"))
			}
			if len(et.Intents) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewError(util.Errorf("cannot commit a read-only transaction"))
			}
			if log.V(1) {
				for _, intent := range et.Intents {
					trace.Event(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GoError().(*roachpb.OpRequiresTxnError); ok {
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr := tc.updateState(ctx, ba, br, pErr); pErr != nil {
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(trace, *br.Txn)
	}
	return br, nil
}
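// Illustrative sketch (not part of TxnCoordSender): the linearizability wait
// above sleeps for whatever remains of the clock's maximum offset after
// subtracting the time already spent since the earlier of (a) when
// EndTransaction started and (b) the commit timestamp. The numbers below are
// made up; maxOffset, startNS, commitNS and nowNS stand in for
// tc.clock.MaxOffset(), startNS, br.Txn.Timestamp.WallTime and
// tc.clock.PhysicalNow() in the code above.
package main

import (
	"fmt"
	"time"
)

// linearizableSleep mirrors the sleepNS arithmetic above: clamp startNS to the
// commit timestamp if that is earlier, then subtract the elapsed time from the
// maximum clock offset.
func linearizableSleep(maxOffset time.Duration, startNS, commitNS, nowNS int64) time.Duration {
	if startNS > commitNS {
		startNS = commitNS
	}
	sleep := maxOffset - time.Duration(nowNS-startNS)
	if sleep < 0 {
		return 0
	}
	return sleep
}

func main() {
	start := time.Now().UnixNano()
	commit := start + int64(2*time.Millisecond) // commit timestamp shortly after start
	now := start + int64(100*time.Millisecond)  // 100ms already spent in Send

	// With a 250ms max offset and 100ms elapsed, 150ms of waiting remains.
	fmt.Println(linearizableSleep(250*time.Millisecond, start, commit, now)) // 150ms
}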
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace and embed its trace metadata into
	// header for use by RPC recipients. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp, cleanupSp := tracing.SpanFromContext(opTxnCoordSender, tc.tracer, ctx)
	defer cleanupSp()
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.Header.Trace == nil {
		ba.Header.Trace = &tracing.Span{}
	}
	if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil {
		return nil, roachpb.NewError(err)
	}

	if err := tc.maybeBeginTxn(&ba); err != nil {
		return nil, roachpb.NewError(err)
	}
	var startNS int64
	ba.SetNewRequest()

	// This is the earliest point at which the request has an ID (if
	// applicable). Begin a Trace which follows this request.
	ctx = opentracing.ContextWithSpan(ctx, sp)

	if ba.Txn != nil {
		// If this request is part of a transaction...
		txnID := *ba.Txn.ID
		// Verify that if this Transaction is not read-only, we have it on
		// file. If not, refuse writes - the client must have issued a write on
		// another coordinator previously.
		if ba.Txn.Writing && ba.IsTransactionWrite() {
			tc.Lock()
			_, ok := tc.txns[txnID]
			tc.Unlock()
			if !ok {
				return nil, roachpb.NewErrorf("transaction must not write on multiple coordinators")
			}
		}
		// Set the timestamp to the original timestamp for read-only
		// commands and to the transaction timestamp for read/write
		// commands.
		if ba.IsReadOnly() {
			ba.Timestamp = ba.Txn.OrigTimestamp
		} else {
			ba.Timestamp = ba.Txn.Timestamp
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
			}
			et.Key = ba.Txn.Key
			// Remember when EndTransaction started in case we want to
			// be linearizable.
			startNS = tc.clock.PhysicalNow()
			if len(et.IntentSpans) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[txnID]
			if metaOK {
				et.IntentSpans = txnMeta.intentSpans()
			}
			tc.Unlock()

			if intentSpans := ba.GetIntentSpans(); len(intentSpans) > 0 {
				// Writes in Batch, so EndTransaction is fine. Should add
				// outstanding intents to EndTransaction, though.
				// TODO(tschottdorf): possible issues when the batch fails,
				// but the intents have been added anyways.
				// TODO(tschottdorf): some of these intents may be covered
				// by others, for example {[a,b), a}). This can lead to
				// some extra requests when those are non-local to the txn
				// record. But it doesn't seem worth optimizing now.
				et.IntentSpans = append(et.IntentSpans, intentSpans...)
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewErrorf("transaction is already committed or aborted")
			}
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if log.V(1) {
				for _, intent := range et.IntentSpans {
					sp.LogEvent(fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, ba, br, pErr); pErr != nil {
			sp.LogEvent(fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(sp, *br.Txn)
	}
	return br, nil
}
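// Illustrative sketch (not part of TxnCoordSender): at EndTransaction the
// coordinator attaches every key span the transaction may have written, so the
// receiving replica knows which intents to resolve. The span type and
// attachIntentSpans helper are hypothetical; txnMeta.intentSpans() and
// ba.GetIntentSpans() play these roles in the code above.
package main

import "fmt"

type span struct {
	key, endKey string
}

// attachIntentSpans combines the spans the coordinator has tracked for earlier
// batches of the transaction with the spans written by the final batch itself.
func attachIntentSpans(tracked, inFinalBatch []span) []span {
	out := append([]span(nil), tracked...)
	return append(out, inFinalBatch...)
}

func main() {
	tracked := []span{{"a", "c"}, {"x", ""}} // spans from earlier batches
	final := []span{{"m", ""}}               // write in the same batch as EndTransaction
	fmt.Println(attachIntentSpans(tracked, final))
}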