// sendAttempt gathers and rearranges the replicas, and makes an RPC call. func (ds *DistSender) sendAttempt(trace *tracer.Trace, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor) (*roachpb.BatchResponse, *roachpb.Error) { defer trace.Epoch("sending RPC")() leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)) // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. order := ds.optimizeReplicaOrder(replicas) // If this request needs to go to a leader and we know who that is, move // it to the front. if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) && leader.StoreID > 0 { if i := replicas.FindReplica(leader.StoreID); i >= 0 { replicas.MoveToFront(i) order = rpc.OrderStable } } br, err := ds.sendRPC(trace, desc.RangeID, replicas, order, ba) if err != nil { return nil, roachpb.NewError(err) } // Untangle the error from the received response. pErr := br.Error br.Error = nil // scrub the response error return br, pErr }
// redirectOnOrAcquireLeaderLease checks whether this replica has the // leader lease at the specified timestamp. If it does, returns // success. If another replica currently holds the lease, redirects by // returning NotLeaderError. If the lease is expired, a renewal is // synchronously requested. This method uses the leader lease mutex // to guarantee only one request to grant the lease is pending. // // TODO(spencer): implement threshold regrants to avoid latency in // the presence of read or write pressure sufficiently close to the // current lease's expiration. // // TODO(spencer): for write commands, don't wait while requesting // the leader lease. If the lease acquisition fails, the write cmd // will fail as well. If it succeeds, as is likely, then the write // will not incur latency waiting for the command to complete. // Reads, however, must wait. func (r *Range) redirectOnOrAcquireLeaderLease(trace *tracer.Trace, timestamp proto.Timestamp) error { r.llMu.Lock() defer r.llMu.Unlock() raftNodeID := r.rm.RaftNodeID() if lease := r.getLease(); lease.Covers(timestamp) { if lease.OwnedBy(raftNodeID) { // Happy path: We have an active lease, nothing to do. return nil } // If lease is currently held by another, redirect to holder. return r.newNotLeaderError(lease, raftNodeID) } defer trace.Epoch("request leader lease")() // Otherwise, no active lease: Request renewal. err := r.requestLeaderLease(timestamp) // Getting a LeaseRejectedError back means someone else got there first; // we can redirect if they cover our timestamp. Note that it can't be us, // since we're holding a lock here, and even if it were it would be a rare // extra round-trip. if _, ok := err.(*proto.LeaseRejectedError); ok { if lease := r.getLease(); lease.Covers(timestamp) { return r.newNotLeaderError(lease, raftNodeID) } } return err }
func (tc *TxnCoordSender) heartbeat(id string, trace *tracer.Trace, ctx context.Context) bool { tc.Lock() proceed := true txnMeta := tc.txns[id] // Before we send a heartbeat, determine whether this transaction // should be considered abandoned. If so, exit heartbeat. if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) { // TODO(tschottdorf): should we be more proactive here? // The client might be continuing the transaction // through another coordinator, but in the most likely // case it's just gone and the open transaction record // could block concurrent operations. if log.V(1) { log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn) } proceed = false } // txnMeta.txn is possibly replaced concurrently, // so grab a copy before unlocking. txn := txnMeta.txn tc.Unlock() if !proceed { return false } request := &proto.HeartbeatTxnRequest{ RequestHeader: proto.RequestHeader{ Key: txn.Key, Txn: &txn, }, } request.Header().Timestamp = tc.clock.Now() reply := &proto.HeartbeatTxnResponse{} call := proto.Call{ Args: request, Reply: reply, } epochEnds := trace.Epoch("heartbeat") tc.wrapped.Send(ctx, call) epochEnds() // If the transaction is not in pending state, then we can stop // the heartbeat. It's either aborted or committed, and we resolve // write intents accordingly. if reply.GoError() != nil { log.Warningf("heartbeat to %s failed: %s", txn, reply.GoError()) } // TODO(bdarnell): once we have gotten a heartbeat response with // Status != PENDING, future heartbeats are useless. However, we // need to continue the heartbeatLoop until the client either // commits or abandons the transaction. We could save a little // pointless work by restructuring this loop to stop sending // heartbeats between the time that the transaction is aborted and // the client finds out. Furthermore, we could use this information // to send TransactionAbortedErrors to the client so it can restart // immediately instead of running until its EndTransaction. return true }
// sendAttempt is invoked by Send. It temporarily truncates the arguments to // match the descriptor's EndKey (if necessary) and gathers and rearranges the // replicas before making a single attempt at sending the request. It returns // the result of sending the RPC; a potential error contained in the reply has // to be handled separately by the caller. func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error { defer trace.Epoch("sending RPC")() // Truncate the request to our current range, making sure not to // touch it unless we have to (it is illegal to send EndKey on // commands which do not operate on ranges). if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) { defer func(k proto.Key) { args.Header().EndKey = k }(endKey) args.Header().EndKey = desc.EndKey } leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID)) // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. order := ds.optimizeReplicaOrder(replicas) // If this request needs to go to a leader and we know who that is, move // it to the front. if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) && leader.StoreID > 0 { if i := replicas.FindReplica(leader.StoreID); i >= 0 { replicas.MoveToFront(i) order = rpc.OrderStable } } return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply) }
// sendOne invokes the specified RPC on the supplied client when the // client is ready. On success, the reply is sent on the channel; // otherwise an error is sent. // // Do not call directly, but instead use sendOneFn. Tests mock out this method // via sendOneFn in order to test various error cases. func sendOne(client *rpc.Client, timeout time.Duration, method string, getArgs func(addr net.Addr) proto.Message, getReply func() proto.Message, context *rpc.Context, trace *tracer.Trace, done chan *netrpc.Call) { addr := client.RemoteAddr() args := getArgs(addr) if args == nil { done <- &netrpc.Call{Error: newRPCError( util.Errorf("nil arguments returned for client %s", addr))} return } if log.V(2) { log.Infof("%s: sending request to %s: %+v", method, addr, args) } trace.Event(fmt.Sprintf("sending to %s", addr)) if enableLocalCalls && context.LocalServer != nil && addr.String() == context.LocalAddr { if context.LocalServer.LocalCall(method, args, done) { return } } reply := getReply() // Don't bother firing off a goroutine in the common case where a client // is already healthy. select { case <-client.Healthy(): client.Go(method, args, reply, done) return default: } go func() { var timeoutChan <-chan time.Time if timeout != 0 { timeoutChan = time.After(timeout) } select { case <-client.Healthy(): client.Go(method, args, reply, done) case <-client.Closed: done <- &netrpc.Call{Error: newRPCError( util.Errorf("rpc to %s failed as client connection was closed", method))} case <-timeoutChan: done <- &netrpc.Call{Error: newRPCError( util.Errorf("rpc to %s: client not ready after %s", method, timeout))} } }() }
// cleanupTxn is called when a transaction ends. The transaction record is // updated and the heartbeat goroutine signaled to clean up the transaction // gracefully. func (tc *TxnCoordSender) cleanupTxn(trace *tracer.Trace, txn roachpb.Transaction) { trace.Event("coordinator stops") tc.Lock() defer tc.Unlock() txnMeta, ok := tc.txns[string(txn.ID)] // The heartbeat might've already removed the record. if !ok { return } // The supplied txn may be newer than the one in txnMeta, which is relevant // for stats. txnMeta.txn = txn // Trigger heartbeat shutdown. close(txnMeta.txnEnd) }
// cleanupTxn is called when a transaction ends. The transaction record is // updated and the heartbeat goroutine signaled to clean up the transaction // gracefully. func (tc *TxnCoordSender) cleanupTxn(trace *tracer.Trace, txn proto.Transaction, resolved []proto.Key) { tc.Lock() defer tc.Unlock() txnMeta, ok := tc.txns[string(txn.ID)] if !ok { return } // The supplied txn may be newed than the one in txnMeta, which is relevant // for stats. txnMeta.txn = txn // Trigger intent resolution and heartbeat shutdown. trace.Event("coordinator stops") txnMeta.txnEnd <- resolved // buffered, so does not block close(txnMeta.txnEnd) }
// cleanupTxn is called when a transaction ends. The transaction record is // updated and the heartbeat goroutine signaled to clean up the transaction // gracefully. func (tc *TxnCoordSender) cleanupTxn(trace *tracer.Trace, txn proto.Transaction) { tc.Lock() defer tc.Unlock() txnMeta, ok := tc.txns[string(txn.ID)] // Only clean up once per transaction. if !ok || txnMeta.txnEnd == nil { return } // The supplied txn may be newed than the one in txnMeta, which is relevant // for stats. txnMeta.txn = txn // Trigger heartbeat shutdown. trace.Event("coordinator stops") close(txnMeta.txnEnd) txnMeta.txnEnd = nil // for idempotency; checked above }
// heartbeatLoop periodically sends a HeartbeatTxn RPC to an extant // transaction, stopping in the event the transaction is aborted or // committed after attempting to resolve the intents. When the // heartbeat stops, the transaction is unregistered from the // coordinator, func (tc *TxnCoordSender) heartbeatLoop(id string) { var tickChan <-chan time.Time { ticker := time.NewTicker(tc.heartbeatInterval) tickChan = ticker.C defer ticker.Stop() } defer func() { tc.Lock() tc.unregisterTxnLocked(id) tc.Unlock() }() var closer <-chan struct{} var trace *tracer.Trace { tc.Lock() txnMeta := tc.txns[id] // do not leak to outer scope closer = txnMeta.txnEnd trace = tc.tracer.NewTrace(tracer.Coord, &txnMeta.txn) defer trace.Finalize() tc.Unlock() } if closer == nil { // Avoid race in which a Txn is cleaned up before the heartbeat // goroutine gets a chance to start. return } ctx := tracer.ToCtx(context.Background(), trace) // Loop with ticker for periodic heartbeats. for { select { case <-tickChan: if !tc.heartbeat(id, trace, ctx) { return } case <-closer: // Transaction finished normally. return case <-tc.stopper.ShouldDrain(): return } } }
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call. func (ds *DistSender) sendSingleRange(trace *tracer.Trace, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor) (*roachpb.BatchResponse, *roachpb.Error) { defer trace.Epoch("sending RPC")() leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)) // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. order := ds.optimizeReplicaOrder(replicas) // If this request needs to go to a leader and we know who that is, move // it to the front. if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) && leader.StoreID > 0 { if i := replicas.FindReplica(leader.StoreID); i >= 0 { replicas.MoveToFront(i) order = rpc.OrderStable } } // Increase the sequence counter in the per-range loop (not // outside) since we might hit the same range twice by // accident. For example, we might send multiple requests to // the same Replica if (1) the descriptor cache has post-split // descriptors that are still write intents and (2) the split // has not yet been completed. ba.SetNewRequest() br, pErr := ds.sendRPC(trace, desc.RangeID, replicas, order, ba) if pErr != nil { return nil, pErr } // Untangle the error from the received response. pErr = br.Error br.Error = nil // scrub the response error return br, pErr }
// heartbeat periodically sends a HeartbeatTxn RPC to an extant // transaction, stopping in the event the transaction is aborted or // committed after attempting to resolve the intents. When the // heartbeat stops, the transaction is unregistered from the // coordinator, func (tc *TxnCoordSender) heartbeat(id string) { var tickChan <-chan time.Time { ticker := time.NewTicker(tc.heartbeatInterval) tickChan = ticker.C defer ticker.Stop() } defer tc.unregisterTxn(id) var closer <-chan struct{} var trace *tracer.Trace { tc.Lock() txnMeta := tc.txns[id] // do not leak to outer scope closer = txnMeta.txnEnd trace = tc.tracer.NewTrace(&txnMeta.txn) tc.Unlock() } if closer == nil { // Avoid race in which a Txn is cleaned up before the heartbeat // goroutine gets a chance to start. return } ctx := tracer.ToCtx(context.Background(), trace) defer trace.Finalize() // Loop with ticker for periodic heartbeats. for { select { case <-tickChan: tc.Lock() proceed := true txnMeta := tc.txns[id] // Before we send a heartbeat, determine whether this transaction // should be considered abandoned. If so, exit heartbeat. if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) { // TODO(tschottdorf): should we be more proactive here? // The client might be continuing the transaction // through another coordinator, but in the most likely // case it's just gone and the open transaction record // could block concurrent operations. if log.V(1) { log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn) } proceed = false } // txnMeta.txn is possibly replaced concurrently, // so grab a copy before unlocking. txn := txnMeta.txn tc.Unlock() if !proceed { return } request := &proto.HeartbeatTxnRequest{ RequestHeader: proto.RequestHeader{ Key: txn.Key, User: security.RootUser, Txn: &txn, }, } request.Header().Timestamp = tc.clock.Now() reply := &proto.HeartbeatTxnResponse{} call := proto.Call{ Args: request, Reply: reply, } epochEnds := trace.Epoch("heartbeat") tc.wrapped.Send(ctx, call) epochEnds() // If the transaction is not in pending state, then we can stop // the heartbeat. It's either aborted or committed, and we resolve // write intents accordingly. if reply.GoError() != nil { log.Warningf("heartbeat to %s failed: %s", txn, reply.GoError()) } else if reply.Txn != nil && reply.Txn.Status != proto.PENDING { // Signal cleanup. Doesn't do much but stop this goroutine, but // let's be future-proof. tc.cleanupTxn(trace, *reply.Txn) return } case <-closer: // Transaction finished normally. return } } }
// close sends resolve intent commands for all key ranges this // transaction has covered, clears the keys cache and closes the // metadata heartbeat. Any keys listed in the resolved slice have // already been resolved and do not receive resolve intent commands. func (tm *txnMetadata) close(trace *tracer.Trace, txn *proto.Transaction, resolved []proto.Key, sender client.Sender, stopper *stop.Stopper) { close(tm.txnEnd) // stop heartbeat trace.Event("coordinator stops") if tm.keys.Len() > 0 { if log.V(2) { log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn) } } // TODO(tschottdorf): Should create a Batch here. for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) { // If the op was range based, end key != start key: resolve a range. var call proto.Call key := o.Key.Start().(proto.Key) endKey := o.Key.End().(proto.Key) if !key.Next().Equal(endKey) { call.Args = &proto.InternalResolveIntentRangeRequest{ RequestHeader: proto.RequestHeader{ Timestamp: txn.Timestamp, Key: key, EndKey: endKey, User: security.RootUser, Txn: txn, }, } call.Reply = &proto.InternalResolveIntentRangeResponse{} } else { // Check if the key has already been resolved; skip if yes. found := false for _, k := range resolved { if key.Equal(k) { found = true } } if found { continue } call.Args = &proto.InternalResolveIntentRequest{ RequestHeader: proto.RequestHeader{ Timestamp: txn.Timestamp, Key: key, User: security.RootUser, Txn: txn, }, } call.Reply = &proto.InternalResolveIntentResponse{} } // We don't care about the reply channel; these are best // effort. We simply fire and forget, each in its own goroutine. ctx := tracer.ToCtx(context.Background(), trace.Fork()) stopper.RunAsyncTask(func() { if log.V(2) { log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn) } sender.Send(ctx, call) if call.Reply.Header().Error != nil { log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError()) } }) } tm.keys.Clear() }
// resolve sends resolve intent commands for all key ranges this transaction // has covered. Any keys listed in the resolved slice have already been // resolved and are skipped. func (tm *txnMetadata) resolve(trace *tracer.Trace, resolved []proto.Key, sender client.Sender) { txn := &tm.txn if tm.keys.Len() > 0 { if log.V(2) { log.Infof("cleaning up %d intent(s) for transaction %s", tm.keys.Len(), txn) } } // TODO(tschottdorf): Should create a Batch here. However, we're resolving // intents and if those are on meta records, there may be a certain order // in which they need to be resolved so that they can get routed to the // correct range. Since a batch runs its commands one by one and we don't // know the correct order, we prefer to fire them off in parallel. var wg sync.WaitGroup for _, o := range tm.keys.GetOverlaps(proto.KeyMin, proto.KeyMax) { // If the op was range based, end key != start key: resolve a range. var call proto.Call key := o.Key.Start().(proto.Key) endKey := o.Key.End().(proto.Key) if !key.Next().Equal(endKey) { call.Args = &proto.InternalResolveIntentRangeRequest{ RequestHeader: proto.RequestHeader{ Timestamp: txn.Timestamp, Key: key, EndKey: endKey, User: security.RootUser, Txn: txn, }, } call.Reply = &proto.InternalResolveIntentRangeResponse{} } else { // Check if the key has already been resolved; skip if yes. found := false for _, k := range resolved { if key.Equal(k) { if log.V(2) { log.Warningf("skipping previously resolved intent at %q", k) } found = true } } if found { continue } call.Args = &proto.InternalResolveIntentRequest{ RequestHeader: proto.RequestHeader{ Timestamp: txn.Timestamp, Key: key, User: security.RootUser, Txn: txn, }, } call.Reply = &proto.InternalResolveIntentResponse{} } ctx := tracer.ToCtx(context.Background(), trace.Fork()) if log.V(2) { log.Infof("cleaning up intent %q for txn %s", call.Args.Header().Key, txn) } // Each operation gets their own goroutine. We only want to return to // the caller after the operations have finished. wg.Add(1) go func() { sender.Send(ctx, call) wg.Done() if call.Reply.Header().Error != nil { log.Warningf("failed to cleanup %q intent: %s", call.Args.Header().Key, call.Reply.Header().GoError()) } }() } defer trace.Epoch("waiting for intent resolution")() wg.Wait() tm.keys.Clear() }