// AddCmd adds a command for execution on this range. The command's // affected keys are verified to be contained within the range and the // range's leadership is confirmed. The command is then dispatched // either along the read-only execution path or the read-write Raft // command queue. func (r *Range) AddCmd(ctx context.Context, call proto.Call) error { args := call.Args // TODO(tschottdorf) Some (internal) requests go here directly, so they // won't be traced. trace := tracer.FromCtx(ctx) // Differentiate between admin, read-only and read-write. var reply proto.Response var err error if proto.IsAdmin(args) { defer trace.Epoch("admin path")() reply, err = r.addAdminCmd(ctx, args) } else if proto.IsReadOnly(args) { defer trace.Epoch("read-only path")() reply, err = r.addReadOnlyCmd(ctx, args) } else if proto.IsWrite(args) { defer trace.Epoch("read-write path")() reply, err = r.addWriteCmd(ctx, args, nil) } else { panic(fmt.Sprintf("don't know how to handle command %T", args)) } if reply != nil { gogoproto.Merge(call.Reply, reply) } if err != nil { replyHeader := call.Reply.Header() if replyHeader.Error != nil { panic("the world is on fire") } replyHeader.SetGoError(err) } return err }
// getLeaseForGossip tries to obtain a leader lease. Only one of the replicas // should gossip; the bool returned indicates whether it's us. func (r *Range) getLeaseForGossip(ctx context.Context) (bool, error) { // If no Gossip available (some tests) or range too fresh, noop. if r.rm.Gossip() == nil || !r.isInitialized() { return false, util.Errorf("no gossip or range not initialized") } var hasLease bool var err error if !r.rm.Stopper().RunTask(func() { timestamp := r.rm.Clock().Now() // Check for or obtain the lease, if none active. err = r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), timestamp) hasLease = err == nil if err != nil { switch e := err.(type) { // NotLeaderError means there is an active lease, leaseRejectedError // means we tried to get one but someone beat us to it. case *proto.NotLeaderError, *proto.LeaseRejectedError: err = nil default: // Any other error is worth being logged visibly. log.Warningc(ctx, "could not acquire lease for range gossip: %s", e) } } }) { err = util.Errorf("node is stopping") } return hasLease, err }
// SendBatch implements batch.Sender. func (ls *LocalSender) SendBatch(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, error) { trace := tracer.FromCtx(ctx) var store *storage.Store var err error // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. if ba.RangeID == 0 || ba.Replica.StoreID == 0 { var repl *proto.Replica var rangeID proto.RangeID rangeID, repl, err = ls.lookupReplica(ba.Key, ba.EndKey) if err == nil { ba.RangeID = rangeID ba.Replica = *repl } } ctx = log.Add(ctx, log.Method, ba.Method(), // TODO(tschottdorf): Method() always `Batch`. log.Key, ba.Key, log.RangeID, ba.RangeID) if err == nil { store, err = ls.GetStore(ba.Replica.StoreID) } var br *proto.BatchResponse if err == nil { // For calls that read data within a txn, we can avoid uncertainty // related retries in certain situations. If the node is in // "CertainNodes", we need not worry about uncertain reads any // more. Setting MaxTimestamp=Timestamp for the operation // accomplishes that. See proto.Transaction.CertainNodes for details. if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) { // MaxTimestamp = Timestamp corresponds to no clock uncertainty. trace.Event("read has no clock uncertainty") ba.Txn.MaxTimestamp = ba.Txn.Timestamp } { var tmpR proto.Response // TODO(tschottdorf): &ba -> ba tmpR, err = store.ExecuteCmd(ctx, &ba) // TODO(tschottdorf): remove this dance once BatchResponse is returned. if tmpR != nil { br = tmpR.(*proto.BatchResponse) if br.Error != nil { panic(proto.ErrorUnexpectedlySet) } } } } // TODO(tschottdorf): Later error needs to be associated to an index // and ideally individual requests don't even have an error in their // header. See #1891. return br, err }
// Send implements the client.Sender interface. The store is looked // up from the store map if specified by header.Replica; otherwise, // the command is being executed locally, and the replica is // determined via lookup through each store's LookupRange method. func (ls *LocalSender) Send(ctx context.Context, call proto.Call) { var err error var store *storage.Store trace := tracer.FromCtx(ctx) // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. header := call.Args.Header() if header.RaftID == 0 || header.Replica.StoreID == 0 { var repl *proto.Replica var raftID proto.RaftID raftID, repl, err = ls.lookupReplica(header.Key, header.EndKey) if err == nil { header.RaftID = raftID header.Replica = *repl } } ctx = log.Add(ctx, log.Method, call.Method(), log.Key, header.Key, log.RaftID, header.RaftID) if err == nil { store, err = ls.GetStore(header.Replica.StoreID) } var reply proto.Response if err == nil { // For calls that read data within a txn, we can avoid uncertainty // related retries in certain situations. If the node is in // "CertainNodes", we need not worry about uncertain reads any // more. Setting MaxTimestamp=Timestamp for the operation // accomplishes that. See proto.Transaction.CertainNodes for details. if header.Txn != nil && header.Txn.CertainNodes.Contains(header.Replica.NodeID) { // MaxTimestamp = Timestamp corresponds to no clock uncertainty. trace.Event("read has no clock uncertainty") header.Txn.MaxTimestamp = header.Txn.Timestamp } reply, err = store.ExecuteCmd(ctx, call.Args) } if reply != nil { gogoproto.Merge(call.Reply, reply) } if call.Reply.Header().Error != nil { panic(proto.ErrorUnexpectedlySet) } if err != nil { call.Reply.Header().SetGoError(err) } }
// addReadOnlyCmd updates the read timestamp cache and waits for any // overlapping writes currently processing through Raft ahead of us to // clear via the read queue. func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error { header := args.Header() if err := r.checkCmdHeader(header); err != nil { reply.Header().SetGoError(err) return err } // If read-consistency is set to INCONSISTENT, run directly. if header.ReadConsistency == proto.INCONSISTENT { // But disallow any inconsistent reads within txns. if header.Txn != nil { reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction")) return reply.Header().GoError() } if header.Timestamp.Equal(proto.ZeroTimestamp) { header.Timestamp = r.rm.Clock().Now() } intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply) if err == nil { r.handleSkippedIntents(args, intents) } return err } else if header.ReadConsistency == proto.CONSENSUS { reply.Header().SetGoError(util.Error("consensus reads not implemented")) return reply.Header().GoError() } // Add the read to the command queue to gate subsequent // overlapping commands until this command completes. cmdKey := r.beginCmd(header, true) // This replica must have leader lease to process a consistent read. if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil { r.endCmd(cmdKey, args, err, true /* readOnly */) reply.Header().SetGoError(err) return err } // Execute read-only command. intents, err := r.executeCmd(r.rm.Engine(), nil, args, reply) // Only update the timestamp cache if the command succeeded. r.endCmd(cmdKey, args, err, true /* readOnly */) if err == nil { r.handleSkippedIntents(args, intents) } return err }
// AddCmd adds a command for execution on this range. The command's // affected keys are verified to be contained within the range and the // range's leadership is confirmed. The command is then dispatched // either along the read-only execution path or the read-write Raft // command queue. func (r *Range) AddCmd(ctx context.Context, call proto.Call) error { args, reply := call.Args, call.Reply // TODO(tschottdorf) Some (internal) requests go here directly, so they // won't be traced. trace := tracer.FromCtx(ctx) // Differentiate between admin, read-only and read-write. if proto.IsAdmin(args) { defer trace.Epoch("admin path")() return r.addAdminCmd(ctx, args, reply) } else if proto.IsReadOnly(args) { defer trace.Epoch("read path")() return r.addReadOnlyCmd(ctx, args, reply) } return r.addWriteCmd(ctx, args, reply, nil) }
// Send implements the client.Sender interface. The store is looked up from the // store map if specified by the request; otherwise, the command is being // executed locally, and the replica is determined via lookup through each // store's LookupRange method. The latter path is taken only by unit tests. func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { trace := tracer.FromCtx(ctx) var store *Store var pErr *roachpb.Error // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. if ba.RangeID == 0 || ba.Replica.StoreID == 0 { var repl *roachpb.ReplicaDescriptor var rangeID roachpb.RangeID rs := keys.Range(ba) rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey) if pErr == nil { ba.RangeID = rangeID ba.Replica = *repl } } ctx = log.Add(ctx, log.RangeID, ba.RangeID) if pErr == nil { store, pErr = ls.GetStore(ba.Replica.StoreID) } var br *roachpb.BatchResponse if pErr != nil { return nil, pErr } // For calls that read data within a txn, we can avoid uncertainty // related retries in certain situations. If the node is in // "CertainNodes", we need not worry about uncertain reads any // more. Setting MaxTimestamp=Timestamp for the operation // accomplishes that. See roachpb.Transaction.CertainNodes for details. if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) { // MaxTimestamp = Timestamp corresponds to no clock uncertainty. trace.Event("read has no clock uncertainty") ba.Txn.MaxTimestamp = ba.Txn.Timestamp } br, pErr = store.Send(ctx, ba) if br != nil && br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(store, br)) } return br, pErr }
// processRaftCommand processes a raft command by unpacking the command // struct to get args and reply and then applying the command to the // state machine via applyRaftCommand(). The error result is sent on // the command's done channel, if available. func (r *Range) processRaftCommand(idKey cmdIDKey, index uint64, raftCmd proto.InternalRaftCommand) error { if index == 0 { log.Fatalc(r.context(), "processRaftCommand requires a non-zero index") } r.Lock() cmd := r.pendingCmds[idKey] delete(r.pendingCmds, idKey) r.Unlock() args := raftCmd.Cmd.GetValue().(proto.Request) var reply proto.Response var ctx context.Context if cmd != nil { // We initiated this command, so use the caller-supplied reply. reply = cmd.Reply ctx = cmd.ctx } else { // This command originated elsewhere so we must create a new reply buffer. reply = args.CreateReply() // TODO(tschottdorf): consider the Trace situation here. ctx = r.context() } execDone := tracer.FromCtx(ctx).Epoch(fmt.Sprintf("applying %s", args.Method())) // applyRaftCommand will return "expected" errors, but may also indicate // replica corruption (as of now, signaled by a replicaCorruptionError). // We feed its return through maybeSetCorrupt to act when that happens. err := r.maybeSetCorrupt( r.applyRaftCommand(ctx, index, proto.RaftNodeID(raftCmd.OriginNodeID), args, reply), ) execDone() if cmd != nil { cmd.done <- err } else if err != nil && log.V(1) { log.Errorc(r.context(), "error executing raft command %s: %s", args.Method(), err) } return err }
// AddCmd adds a command for execution on this range. The command's // affected keys are verified to be contained within the range and the // range's leadership is confirmed. The command is then dispatched // either along the read-only execution path or the read-write Raft // command queue. func (r *Replica) AddCmd(ctx context.Context, args proto.Request) (proto.Response, error) { // TODO(tschottdorf) Some (internal) requests go here directly, so they // won't be traced. trace := tracer.FromCtx(ctx) // Differentiate between admin, read-only and read-write. var reply proto.Response var err error if proto.IsAdmin(args) { defer trace.Epoch("admin path")() reply, err = r.addAdminCmd(ctx, args) } else if proto.IsReadOnly(args) { defer trace.Epoch("read-only path")() reply, err = r.addReadOnlyCmd(ctx, args) } else if proto.IsWrite(args) { defer trace.Epoch("read-write path")() reply, err = r.addWriteCmd(ctx, args, nil) } else { panic(fmt.Sprintf("don't know how to handle command %T", args)) } return reply, err }
// addAdminCmd executes the command directly. There is no interaction // with the command queue or the timestamp cache, as admin commands // are not meant to consistently access or modify the underlying data. // Admin commands must run on the leader replica. func (r *Range) addAdminCmd(ctx context.Context, args proto.Request) (proto.Response, error) { header := args.Header() if err := r.checkCmdHeader(header); err != nil { return nil, err } // Admin commands always require the leader lease. if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil { return nil, err } switch tArgs := args.(type) { case *proto.AdminSplitRequest: resp, err := r.AdminSplit(tArgs) return &resp, err case *proto.AdminMergeRequest: resp, err := r.AdminMerge(tArgs) return &resp, err default: return nil, util.Error("unrecognized admin command") } }
// addAdminCmd executes the command directly. There is no interaction // with the command queue or the timestamp cache, as admin commands // are not meant to consistently access or modify the underlying data. // Admin commands must run on the leader replica. func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error { header := args.Header() if err := r.checkCmdHeader(header); err != nil { reply.Header().SetGoError(err) return err } // Admin commands always require the leader lease. if err := r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), header.Timestamp); err != nil { reply.Header().SetGoError(err) return err } switch args.(type) { case *proto.AdminSplitRequest: r.AdminSplit(args.(*proto.AdminSplitRequest), reply.(*proto.AdminSplitResponse)) case *proto.AdminMergeRequest: r.AdminMerge(args.(*proto.AdminMergeRequest), reply.(*proto.AdminMergeResponse)) default: return util.Error("unrecognized admin command") } return reply.Header().GoError() }
// sendOne sends a single call via the wrapped sender. If the call is // part of a transaction, the TxnCoordSender adds the transaction to a // map of active transactions and begins heartbeating it. Every // subsequent call for the same transaction updates the lastUpdate // timestamp to prevent live transactions from being considered // abandoned and garbage collected. Read/write mutating requests have // their key or key range added to the transaction's interval tree of // key ranges for eventual cleanup via resolved write intents. // // On success, and if the call is part of a transaction, the affected // key range is recorded as live intents for eventual cleanup upon // transaction commit. Upon successful txn commit, initiates cleanup // of intents. func (tc *TxnCoordSender) sendOne(ctx context.Context, call proto.Call) { var startNS int64 header := call.Args.Header() trace := tracer.FromCtx(ctx) var id string // optional transaction ID if header.Txn != nil { // If this call is part of a transaction... id = string(header.Txn.ID) // Verify that if this Transaction is not read-only, we have it on // file. If not, refuse writes - the client must have issued a write on // another coordinator previously. if header.Txn.Writing && proto.IsTransactionWrite(call.Args) { tc.Lock() _, ok := tc.txns[id] tc.Unlock() if !ok { call.Reply.Header().SetGoError(util.Errorf( "transaction must not write on multiple coordinators")) return } } // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if proto.IsReadOnly(call.Args) { header.Timestamp = header.Txn.OrigTimestamp } else { header.Timestamp = header.Txn.Timestamp } if args, ok := call.Args.(*proto.EndTransactionRequest); ok { // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() // EndTransaction must have its key set to that of the txn. header.Key = header.Txn.Key if len(args.Intents) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. call.Reply.Header().SetGoError(util.Errorf( "client must not pass intents to EndTransaction")) return } tc.Lock() txnMeta, metaOK := tc.txns[id] if id != "" && metaOK { args.Intents = txnMeta.intents() } tc.Unlock() if !metaOK { // If we don't have the transaction, then this must be a retry // by the client. We can no longer reconstruct a correct // request so we must fail. // // TODO(bdarnell): if we had a GetTransactionStatus API then // we could lookup the transaction and return either nil or // TransactionAbortedError instead of this ambivalent error. call.Reply.Header().SetGoError(util.Errorf( "transaction is already committed or aborted")) return } else if len(args.Intents) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state in // the client. call.Reply.Header().SetGoError(util.Errorf( "cannot commit a read-only transaction")) return } } } // Send the command through wrapped sender. tc.wrapped.Send(ctx, call) // For transactional calls, need to track & update the transaction. if header.Txn != nil { respHeader := call.Reply.Header() if respHeader.Txn == nil { // When empty, simply use the request's transaction. // This is expected: the Range doesn't bother copying unless the // object changes. 
respHeader.Txn = gogoproto.Clone(header.Txn).(*proto.Transaction) } tc.updateResponseTxn(header, respHeader) } if txn := call.Reply.Header().Txn; txn != nil { if !header.Txn.Equal(txn) { panic("transaction ID changed") } tc.Lock() txnMeta := tc.txns[id] // If this transactional command leaves transactional intents, add the key // or key range to the intents map. If the transaction metadata doesn't yet // exist, create it. if call.Reply.Header().GoError() == nil { if proto.IsTransactionWrite(call.Args) { if txnMeta == nil { txn.Writing = true trace.Event("coordinator spawns") txnMeta = &txnMetadata{ txn: *txn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[id] = txnMeta if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(id) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. call.Reply.Header().SetGoError(&proto.NodeUnavailableError{}) tc.Unlock() tc.unregisterTxn(id) return } } txnMeta.addKeyRange(header.Key, header.EndKey) } // Update our record of this transaction. if txnMeta != nil { txnMeta.txn = *txn txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } } tc.Unlock() } // Cleanup intents and transaction map if end of transaction. switch t := call.Reply.Header().GoError().(type) { case *proto.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider it dead. tc.cleanupTxn(trace, t.Txn) case *proto.TransactionAbortedError: // If already aborted, cleanup the txn on this TxnCoordSender. tc.cleanupTxn(trace, t.Txn) case *proto.OpRequiresTxnError: // Run a one-off transaction with that single command. if log.V(1) { log.Infof("%s: auto-wrapping in txn and re-executing", call.Method()) } // TODO(tschottdorf): this part is awkward. Consider resending here // without starting a new call, which is hard to trace. Plus, the // below depends on default configuration. tmpDB, err := client.Open( fmt.Sprintf("//%s?priority=%d", call.Args.Header().User, call.Args.Header().GetUserPriority()), client.SenderOpt(tc)) if err != nil { log.Warning(err) return } call.Reply.Reset() if err := tmpDB.Txn(func(txn *client.Txn) error { txn.SetDebugName("auto-wrap", 0) b := &client.Batch{} b.InternalAddCall(call) return txn.CommitInBatch(b) }); err != nil { log.Warning(err) } case nil: if txn := call.Reply.Header().Txn; txn != nil { if _, ok := call.Args.(*proto.EndTransactionRequest); ok { // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. 
if tsNS := txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if txn.Status != proto.PENDING { tc.cleanupTxn(trace, *txn) } } } } }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() trace := tracer.FromCtx(ctx) // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs := keys.Range(ba) var br *roachpb.BatchResponse // Send the request to one range per iteration. for { considerIntents := false var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var needAnother bool var pErr *roachpb.Error for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. descDone := trace.Epoch("meta descriptor lookup") var evictDesc func() desc, needAnother, evictDesc, pErr = ds.getDescriptors(rs, considerIntents, isReverse) descDone() // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. if (isReverse && !desc.ContainsKeyRange(desc.StartKey, rs.EndKey)) || (!isReverse && !desc.ContainsKeyRange(rs.Key, desc.EndKey)) { evictDesc() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. 
return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(trace, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } trace.Event(fmt.Sprintf("reply error: %T", pErr.GoError())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GoError().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. // TODO(tschottdorf): If a replica group goes dead, this // will cause clients to put high read pressure on the first // range, so there should be some rate limiting here. evictDesc() if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError, *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. evictDesc() // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } // On retries, allow [uncommitted] intents on range descriptor // lookups to be returned 50% of the time in order to succeed // at finding the transaction record pointed to by the intent // itself. The 50% probability of returning either the current // intent or the previously committed value balances between // the two cases where the intent's txn hasn't yet been // committed (the previous value is correct), or the intent's // txn has been committed (the intent value is correct). considerIntents = true continue case *roachpb.NotLeaderError: newLeader := tErr.Leader // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale replica; evict cache. // Next, cache the new leader. if newLeader != nil { if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } evictDesc() } } else { newLeader = &roachpb.ReplicaDescriptor{} } ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. 
// Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) } for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). ba.Requests[i].Reset() // necessary (no one-of?) if !ba.Requests[i].SetValue(&roachpb.NoopRequest{}) { panic("RequestUnion excludes NoopRequest") } continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key = next(ba, desc.EndKey) } trace.Event("querying next range") } }
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState(ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error { trace := tracer.FromCtx(ctx) newTxn := &roachpb.Transaction{} newTxn.Update(ba.GetTxn()) // TODO(tamird): remove this clone. It's currently needed to avoid race conditions. pErr = proto.Clone(pErr).(*roachpb.Error) err := pErr.GoError() // TODO(bdarnell): We're writing to errors here (and where using ErrorWithIndex); // since there's no concept of ownership copy-on-write is always preferable. switch t := err.(type) { case nil: newTxn.Update(br.Txn) // Move txn timestamp forward to response timestamp if applicable. // TODO(tschottdorf): see (*Replica).executeBatch and comments within. // Looks like this isn't necessary any more, nor did it prevent a bug // referenced in a TODO there. newTxn.Timestamp.Forward(br.Timestamp) case *roachpb.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider txn dead. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // Mark the host as certain. See the protobuf comment for // Transaction.CertainNodes for details. if t.NodeID == 0 { panic("no replica set in header on uncertainty restart") } newTxn.Update(&t.Txn) newTxn.CertainNodes.Add(t.NodeID) // If the reader encountered a newer write within the uncertainty // interval, move the timestamp forward, just past that write or // up to MaxTimestamp, whichever comes first. candidateTS := newTxn.MaxTimestamp candidateTS.Backward(t.ExistingTimestamp.Add(0, 1)) newTxn.Timestamp.Forward(candidateTS) newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp) t.Txn = *newTxn case *roachpb.TransactionAbortedError: trace.SetError() newTxn.Update(&t.Txn) // Increase timestamp if applicable. newTxn.Timestamp.Forward(t.Txn.Timestamp) newTxn.Priority = t.Txn.Priority t.Txn = *newTxn // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxn(trace, t.Txn) case *roachpb.TransactionPushError: newTxn.Update(t.Txn) // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1)) newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp) t.Txn = newTxn case *roachpb.TransactionRetryError: newTxn.Update(&t.Txn) newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp) t.Txn = *newTxn case roachpb.TransactionRestartError: // Assertion: The above cases should exhaust all ErrorDetails which // carry a Transaction. if pErr.Detail != nil { panic(fmt.Sprintf("unhandled TransactionRestartError %T", err)) } default: trace.SetError() } return func() *roachpb.Error { if len(newTxn.ID) <= 0 { return pErr } id := string(newTxn.ID) tc.Lock() defer tc.Unlock() txnMeta := tc.txns[id] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. 
// A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). // consider re-using those. if intents := ba.GetIntents(); len(intents) > 0 && (err == nil || newTxn.Writing) { if txnMeta == nil { if !newTxn.Writing { panic("txn with intents marked as non-writing") } txnMeta = &txnMetadata{ txn: *newTxn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[id] = txnMeta // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); err != nil || !isEnding { trace.Event("coordinator spawns") if !tc.stopper.RunAsyncTask(func() { tc.heartbeatLoop(id) }) { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(id) return roachpb.NewError(&roachpb.NodeUnavailableError{}) } } } for _, intent := range intents { txnMeta.addKeyRange(intent.Key, intent.EndKey) } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn = *newTxn if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if err == nil { // For successful transactional requests, always send the updated txn // record back. br.Txn = newTxn } return pErr }() }
// addWriteCmd first adds the keys affected by this command as pending writes // to the command queue. Next, the timestamp cache is checked to determine if // any newer accesses to this command's affected keys have been made. If so, // the command's timestamp is moved forward. Finally, the command is submitted // to Raft. Upon completion, the write is removed from the read queue and any // error returned. If a WaitGroup is supplied, it is signaled when the command // enters Raft or the function returns with a preprocessing error, whichever // happens earlier. func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, wg *sync.WaitGroup) (proto.Response, error) { signal := func() { if wg != nil { wg.Done() wg = nil } } // This happens more eagerly below, but it's important to guarantee that // early returns do not skip this. defer signal() header := args.Header() if err := r.checkCmdHeader(args.Header()); err != nil { return nil, err } trace := tracer.FromCtx(ctx) // Add the write to the command queue to gate subsequent overlapping // Commands until this command completes. Note that this must be // done before getting the max timestamp for the key(s), as // timestamp cache is only updated after preceding commands have // been run to successful completion. qDone := trace.Epoch("command queue") cmdKey := r.beginCmd(header, false) qDone() // This replica must have leader lease to process a write. if err := r.redirectOnOrAcquireLeaderLease(trace, header.Timestamp); err != nil { r.endCmd(cmdKey, args, err, false /* !readOnly */) return nil, err } // Two important invariants of Cockroach: 1) encountering a more // recently written value means transaction restart. 2) values must // be written with a greater timestamp than the most recent read to // the same key. Check the timestamp cache for reads/writes which // are at least as recent as the timestamp of this write. For // writes, send WriteTooOldError; for reads, update the write's // timestamp. When the write returns, the updated timestamp will // inform the final commit timestamp. if usesTimestampCache(args) { r.Lock() rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID()) r.Unlock() // Always push the timestamp forward if there's been a read which // occurred after our txn timestamp. if !rTS.Less(header.Timestamp) { header.Timestamp = rTS.Next() } // If there's a newer write timestamp... if !wTS.Less(header.Timestamp) { // If we're in a txn, we still go ahead and try the write since // we want to avoid restarting the transaction in the event that // there isn't an intent or the intent can be pushed by us. // // If we're not in a txn, it's trivial to just advance our timestamp. if header.Txn == nil { header.Timestamp = wTS.Next() } } } defer trace.Epoch("raft")() errChan, pendingCmd := r.proposeRaftCommand(ctx, args) signal() // First wait for raft to commit or abort the command. var err error var reply proto.Response if err = <-errChan; err == nil { // Next if the command was committed, wait for the range to apply it. respWithErr := <-pendingCmd.done reply, err = respWithErr.reply, respWithErr.err } else if err == multiraft.ErrGroupDeleted { // This error needs to be converted appropriately so that // clients will retry. err = proto.NewRangeNotFoundError(r.Desc().RaftID) } // As for reads, update timestamp cache with the timestamp // of this write on success. This ensures a strictly higher // timestamp for successive writes to the same key or key range. 
r.endCmd(cmdKey, args, err, false /* !readOnly */) return reply, err }
// Send implements the client.Sender interface. It verifies // permissions and looks up the appropriate range based on the // supplied key and sends the RPC according to the specified options. // // If the request spans multiple ranges (which is possible for Scan or // DeleteRange requests), Send sends requests to the individual ranges // sequentially and combines the results transparently. // // This may temporarily adjust the request headers, so the proto.Call // must not be used concurrently until Send has returned. func (ds *DistSender) Send(ctx context.Context, call proto.Call) { args := call.Args finalReply := call.Reply // Verify permissions. if err := ds.verifyPermissions(call.Args); err != nil { call.Reply.Header().SetGoError(err) return } trace := tracer.FromCtx(ctx) // In the event that timestamp isn't set and read consistency isn't // required, set the timestamp using the local clock. if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) { // Make sure that after the call, args hasn't changed. defer func(timestamp proto.Timestamp) { args.Header().Timestamp = timestamp }(args.Header().Timestamp) args.Header().Timestamp = ds.clock.Now() } // If this is a bounded request, we will change its bound as we receive // replies. This undoes that when we return. boundedArgs, argsBounded := args.(proto.Bounded) if argsBounded { defer func(bound int64) { boundedArgs.SetBound(bound) }(boundedArgs.GetBound()) } defer func(key proto.Key) { args.Header().Key = key }(args.Header().Key) // Retry logic for lookup of range by key and RPCs to range replicas. curReply := finalReply for { call.Reply = curReply call.Reply.Header().Reset() var desc, descNext *proto.RangeDescriptor var err error for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. descDone := trace.Epoch("meta descriptor lookup") desc, descNext, err = ds.getDescriptors(call) descDone() // getDescriptors may fail retryably if the first range isn't // available via Gossip. if err != nil { if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() { if log.V(1) { log.Warning(err) } continue } break } err = ds.sendAttempt(trace, args, curReply, desc) if err != nil { trace.Event(fmt.Sprintf("send error: %T", err)) // For an RPC error to occur, we must've been unable to contact any // replicas. In this case, likely all nodes are down (or not getting back // to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date replicas, so // clearing the descriptor here should be a good idea. // TODO(tschottdorf): If a replica group goes dead, this will cause clients // to put high read pressure on the first range, so there should be some // rate limiting here. ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc) } else { err = curReply.Header().GoError() } if err == nil { break } if log.V(1) { log.Warningf("failed to invoke %s: %s", call.Method(), err) } // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := err.(type) { case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError: trace.Event(fmt.Sprintf("reply error: %T", err)) // Range descriptor might be out of date - evict it. 
ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc) // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(err) } continue case *proto.NotLeaderError: trace.Event(fmt.Sprintf("reply error: %T", err)) newLeader := tErr.GetLeader() // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale replica; evict cache. // Next, cache the new leader. if newLeader != nil { if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc) } } else { newLeader = &proto.Replica{} } ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader) if log.V(1) { log.Warning(err) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(err) } trace.Event(fmt.Sprintf("reply error: %T", err)) continue } } break } // Immediately return if querying a range failed non-retryably. // For multi-range requests, we return the failing range's reply. if err != nil { call.Reply.Header().SetGoError(err) return } if finalReply != curReply { // This was the second or later call in a multi-range request. // Combine the new response with the existing one. if cFinalReply, ok := finalReply.(proto.Combinable); ok { cFinalReply.Combine(curReply) } else { // This should never apply in practice, as we'll only end up here // for range-spanning requests. call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type")) return } } // If this request has a bound, such as MaxResults in // ScanRequest, check whether enough rows have been retrieved. if argsBounded { if prevBound := boundedArgs.GetBound(); prevBound > 0 { if cReply, ok := curReply.(proto.Countable); ok { if nextBound := prevBound - cReply.Count(); nextBound > 0 { // Update bound for the next round. // We've deferred restoring the original bound earlier. boundedArgs.SetBound(nextBound) } else { // Set flag to break the loop. descNext = nil } } } } // If this was the last range accessed by this call, exit loop. if descNext == nil { break } // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. args.Header().Key = desc.EndKey // This is a multi-range request, make a new reply object for // subsequent iterations of the loop. curReply = args.CreateReply() trace.Event("querying next range") } call.Reply = finalReply }
// sendOne sends a single call via the wrapped sender. If the call is // part of a transaction, the TxnCoordSender adds the transaction to a // map of active transactions and begins heartbeating it. Every // subsequent call for the same transaction updates the lastUpdate // timestamp to prevent live transactions from being considered // abandoned and garbage collected. Read/write mutating requests have // their key or key range added to the transaction's interval tree of // key ranges for eventual cleanup via resolved write intents. // // On success, and if the call is part of a transaction, the affected // key range is recorded as live intents for eventual cleanup upon // transaction commit. Upon successful txn commit, initiates cleanup // of intents. func (tc *TxnCoordSender) sendOne(ctx context.Context, call proto.Call) { var startNS int64 header := call.Args.Header() trace := tracer.FromCtx(ctx) // If this call is part of a transaction... if header.Txn != nil { // Set the timestamp to the original timestamp for read-only // commands and to the transaction timestamp for read/write // commands. if proto.IsReadOnly(call.Args) { header.Timestamp = header.Txn.OrigTimestamp } else { header.Timestamp = header.Txn.Timestamp } // EndTransaction must have its key set to that of the txn. if _, ok := call.Args.(*proto.EndTransactionRequest); ok { header.Key = header.Txn.Key // Remember when EndTransaction started in case we want to // be linearizable. startNS = tc.clock.PhysicalNow() } } // Send the command through wrapped sender. tc.wrapped.Send(ctx, call) if header.Txn != nil { // If not already set, copy the request txn. if call.Reply.Header().Txn == nil { call.Reply.Header().Txn = gogoproto.Clone(header.Txn).(*proto.Transaction) } tc.updateResponseTxn(header, call.Reply.Header()) } if txn := call.Reply.Header().Txn; txn != nil { tc.Lock() txnMeta := tc.txns[string(txn.ID)] // If this transactional command leaves transactional intents, add the key // or key range to the intents map. If the transaction metadata doesn't yet // exist, create it. if call.Reply.Header().GoError() == nil { if proto.IsTransactionWrite(call.Args) { if txnMeta == nil { trace.Event("coordinator spawns") txnMeta = &txnMetadata{ txn: *txn, keys: cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}), firstUpdateNanos: tc.clock.PhysicalNow(), lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } id := string(txn.ID) tc.txns[id] = txnMeta tc.heartbeat(id) } txnMeta.addKeyRange(header.Key, header.EndKey) } // Update our record of this transaction. if txnMeta != nil { txnMeta.txn = *txn txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } } tc.Unlock() } // Cleanup intents and transaction map if end of transaction. switch t := call.Reply.Header().GoError().(type) { case *proto.TransactionStatusError: // Likely already committed or more obscure errors such as epoch or // timestamp regressions; consider it dead. tc.cleanupTxn(trace, t.Txn, nil) case *proto.TransactionAbortedError: // If already aborted, cleanup the txn on this TxnCoordSender. tc.cleanupTxn(trace, t.Txn, nil) case *proto.OpRequiresTxnError: // Run a one-off transaction with that single command. if log.V(1) { log.Infof("%s: auto-wrapping in txn and re-executing", call.Method()) } // TODO(tschottdorf): this part is awkward. Consider resending here // without starting a new call, which is hard to trace. Plus, the // below depends on default configuration. 
tmpDB, err := client.Open( fmt.Sprintf("//%s?priority=%d", call.Args.Header().User, call.Args.Header().GetUserPriority()), client.SenderOpt(tc)) if err != nil { log.Warning(err) return } call.Reply.Reset() if err := tmpDB.Txn(func(txn *client.Txn) error { txn.SetDebugName("auto-wrap") b := &client.Batch{} b.InternalAddCall(call) return txn.Commit(b) }); err != nil { log.Warning(err) } case nil: var resolved []proto.Key if txn := call.Reply.Header().Txn; txn != nil { if _, ok := call.Args.(*proto.EndTransactionRequest); ok { // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof("%v: waiting %s on EndTransaction for linearizability", txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } resolved = call.Reply.(*proto.EndTransactionResponse).Resolved if txn.Status != proto.PENDING { tc.cleanupTxn(trace, *txn, resolved) } } } } }
// resolveIntents resolves the given intents. For those which are local to the // range, we submit directly to the range-local Raft instance; the call returns // as soon as all resolve commands have been **proposed** (not executed). This // ensures that if a waiting client retries immediately after conflict // resolution, it will not hit the same intents again. All non-local intents // are resolved asynchronously in a batch. // TODO(tschottdorf): once Txn records have a list of possibly open intents, // resolveIntents should send an RPC to update the transaction(s) as well (for // those intents with non-pending Txns). func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) { trace := tracer.FromCtx(ctx) tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces trace.Event("resolving intents [async]") var wg sync.WaitGroup bArgs := &proto.BatchRequest{} bArgs.User = security.RootUser for i := range intents { intent := intents[i] // avoids a race in `i, intent := range ...` var resolveArgs proto.Request var local bool // whether this intent lives on this Range { header := proto.RequestHeader{ // Use the pushee's timestamp, which might be lower than the // pusher's request timestamp. No need to push the intent higher // than the pushee's txn! Timestamp: intent.Txn.Timestamp, Key: intent.Key, EndKey: intent.EndKey, User: security.RootUser, Txn: &intent.Txn, } if len(intent.EndKey) == 0 { resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header} local = r.ContainsKey(intent.Key) } else { resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header} local = r.ContainsKeyRange(intent.Key, intent.EndKey) } } // If the intent isn't (completely) local, we'll need to send an external request. // We'll batch them all up and send at the end. if !local { bArgs.Add(resolveArgs) continue } // If it is local, it goes directly into Raft. // TODO(tschottdorf): this may be premature optimization. Consider just // treating everything as an external request. This means having to // wait for complete execution of the command (whereas now we just wait // for proposition) and some more overhead sending things around. wg.Add(1) action := func() { // Trace this under the ID of the intent owner. ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn)) if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) { log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err) } } if !r.rm.Stopper().RunAsyncTask(action) { // Still run the task. Our caller already has a task and going async // here again is merely for performance, but some intents need to // be resolved because they might block other tasks. See #1684. // Note that handleSkippedIntents has a TODO in case #1684 comes // back. action() } } // Resolve all of the intents which aren't local to the Range. This is a // no-op if all are local. b := &client.Batch{} b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}}) action := func() { // TODO(tschottdorf): no tracing here yet. Probably useful at some point, // but needs a) the corresponding interface and b) facilities for tracing // multiple tracees at the same time (batch full of possibly individual // txns). if err := r.rm.DB().Run(b); err != nil { if log.V(1) { log.Infoc(ctx, "%s", err) } } } if !r.rm.Stopper().RunAsyncTask(action) { // As with local intents, try async to not keep the caller waiting, but // when draining just go ahead and do it synchronously. See #1684. 
action() } // Wait until all the local `ResolveIntent`s have been submitted to raft. // No-op if all were external. wg.Wait() }
// updateState updates the transaction state in both the success and error
// cases, applying those updates to the corresponding txnMeta object when
// adequate. It also updates certain errors with the updated transaction for
// use by client restarts.
func (tc *TxnCoordSender) updateState(ctx context.Context, ba proto.BatchRequest, br *proto.BatchResponse, pErr *proto.Error) *proto.Error {
    trace := tracer.FromCtx(ctx)
    newTxn := &proto.Transaction{}
    newTxn.Update(ba.GetTxn())
    err := pErr.GoError()
    switch t := err.(type) {
    case nil:
        newTxn.Update(br.GetTxn())
        // Move txn timestamp forward to response timestamp if applicable.
        // TODO(tschottdorf): see (*Replica).executeBatch and comments within.
        // Looks like this isn't necessary any more, nor did it prevent a bug
        // referenced in a TODO there.
        newTxn.Timestamp.Forward(br.Timestamp)
    case *proto.TransactionStatusError:
        // Likely already committed or more obscure errors such as epoch or
        // timestamp regressions; consider txn dead.
        defer tc.cleanupTxn(trace, t.Txn)
    case *proto.OpRequiresTxnError:
        // TODO(tschottdorf): range-spanning autowrap currently broken.
        panic("TODO(tschottdorf): disabled")
    case *proto.ReadWithinUncertaintyIntervalError:
        // Mark the host as certain. See the protobuf comment for
        // Transaction.CertainNodes for details.
        if t.NodeID == 0 {
            panic("no replica set in header on uncertainty restart")
        }
        newTxn.CertainNodes.Add(t.NodeID)
        // If the reader encountered a newer write within the uncertainty
        // interval, move the timestamp forward, just past that write or
        // up to MaxTimestamp, whichever comes first.
        candidateTS := newTxn.MaxTimestamp
        candidateTS.Backward(t.ExistingTimestamp.Add(0, 1))
        newTxn.Timestamp.Forward(candidateTS)
        newTxn.Restart(ba.GetUserPriority(), newTxn.Priority, newTxn.Timestamp)
        t.Txn = *newTxn
    case *proto.TransactionAbortedError:
        // Increase timestamp if applicable.
        newTxn.Timestamp.Forward(t.Txn.Timestamp)
        newTxn.Priority = t.Txn.Priority
        t.Txn = *newTxn
        // Clean up the freshly aborted transaction in defer(), avoiding a
        // race with the state update below.
        defer tc.cleanupTxn(trace, t.Txn)
    case *proto.TransactionPushError:
        // Increase timestamp if applicable, ensuring that we're just ahead
        // of the pushee.
        newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp.Add(0, 1))
        newTxn.Restart(ba.GetUserPriority(), t.PusheeTxn.Priority-1, newTxn.Timestamp)
        t.Txn = newTxn
    case *proto.TransactionRetryError:
        // Increase timestamp if applicable.
        newTxn.Timestamp.Forward(t.Txn.Timestamp)
        newTxn.Restart(ba.GetUserPriority(), t.Txn.Priority, newTxn.Timestamp)
        t.Txn = *newTxn
    case proto.TransactionRestartError:
        // Assertion: the above cases should exhaust all ErrorDetails which
        // carry a Transaction.
        if pErr.Detail != nil {
            panic(fmt.Sprintf("unhandled TransactionRestartError %T", err))
        }
    }

    return func() *proto.Error {
        if len(newTxn.ID) == 0 {
            return pErr
        }
        id := string(newTxn.ID)
        tc.Lock()
        defer tc.Unlock()
        txnMeta := tc.txns[id]
        // For successful transactional requests, keep the written intents and
        // the updated transaction record to be sent along with the reply.
        // The transaction metadata is created with the first writing operation.
        // TODO(tschottdorf): already computed the intents prior to sending,
        // consider re-using those.
        if intents := ba.GetIntents(); len(intents) > 0 && err == nil {
            if txnMeta == nil {
                newTxn.Writing = true
                txnMeta = &txnMetadata{
                    txn:              *newTxn,
                    keys:             cache.NewIntervalCache(cache.Config{Policy: cache.CacheNone}),
                    firstUpdateNanos: tc.clock.PhysicalNow(),
                    lastUpdateNanos:  tc.clock.PhysicalNow(),
                    timeoutDuration:  tc.clientTimeout,
                    txnEnd:           make(chan struct{}),
                }
                tc.txns[id] = txnMeta
                // If the transaction is already over, there's no point in
                // launching a one-off coordinator which will shut down right
                // away.
                if _, isEnding := ba.GetArg(proto.EndTransaction); !isEnding {
                    trace.Event("coordinator spawns")
                    if !tc.stopper.RunAsyncTask(func() {
                        tc.heartbeatLoop(id)
                    }) {
                        // The system is already draining and we can't start the
                        // heartbeat. We refuse new transactions for now because
                        // they're likely not going to have all intents committed.
                        // In principle, we can relax this as needed though.
                        tc.unregisterTxnLocked(id)
                        return proto.NewError(&proto.NodeUnavailableError{})
                    }
                }
            }
            for _, intent := range intents {
                txnMeta.addKeyRange(intent.Key, intent.EndKey)
            }
        }
        // Update our record of this transaction, even on error.
        if txnMeta != nil {
            txnMeta.txn.Update(newTxn) // better to replace after #2300
            if !txnMeta.txn.Writing {
                panic("tracking a non-writing txn")
            }
            txnMeta.setLastUpdate(tc.clock.PhysicalNow())
        }
        if err == nil {
            // For successful transactional requests, always send the updated
            // txn record back.
            if br.Txn == nil {
                br.Txn = &proto.Transaction{}
            }
            *br.Txn = *newTxn
        }
        return pErr
    }()
}
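// The following stand-alone sketch works through the timestamp arithmetic of
// the ReadWithinUncertaintyIntervalError case above: restart just past the
// conflicting write, but never beyond the transaction's MaxTimestamp. The `ts`
// struct and its Add/Forward/Backward methods are simplified stand-ins for
// proto.Timestamp (an assumption about their semantics, for illustration only).
package main

import "fmt"

type ts struct {
    Wall    int64
    Logical int32
}

func (t ts) Less(o ts) bool {
    return t.Wall < o.Wall || (t.Wall == o.Wall && t.Logical < o.Logical)
}

func (t ts) Add(wall int64, logical int32) ts { return ts{t.Wall + wall, t.Logical + logical} }

// Forward moves t up to o if o is larger.
func (t *ts) Forward(o ts) {
    if t.Less(o) {
        *t = o
    }
}

// Backward moves t down to o if o is smaller.
func (t *ts) Backward(o ts) {
    if o.Less(*t) {
        *t = o
    }
}

func main() {
    txnTS := ts{10, 0}    // transaction's current timestamp
    maxTS := ts{15, 0}    // upper bound of the uncertainty interval (MaxTimestamp)
    existing := ts{12, 3} // newer write encountered within the interval

    candidate := maxTS
    candidate.Backward(existing.Add(0, 1)) // just past the write, capped at maxTS
    txnTS.Forward(candidate)

    fmt.Println(txnTS) // {12 4}: the timestamp at which the txn restarts
}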
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc).
func (ds *DistSender) sendChunk(ctx context.Context, ba proto.BatchRequest) (*proto.BatchResponse, error) {
    // TODO(tschottdorf): prepare for removing Key and EndKey from BatchRequest,
    // making sure that anything that relies on them goes bust.
    ba.Key, ba.EndKey = nil, nil

    isReverse := ba.IsReverse()

    trace := tracer.FromCtx(ctx)

    // The minimal key range encompassing all requests contained within.
    // Local addressing has already been resolved.
    // TODO(tschottdorf): consider rudimentary validation of the batch here
    // (for example, non-range requests with EndKey, or empty key ranges).
    from, to := keys.Range(ba)
    var br *proto.BatchResponse
    // Send the request to one range per iteration.
    for {
        options := lookupOptions{
            useReverseScan: isReverse,
        }

        var curReply *proto.BatchResponse
        var desc *proto.RangeDescriptor
        var needAnother bool
        var err error
        for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
            // Get range descriptor (or, when spanning range, descriptors). Our
            // error handling below may clear them on certain errors, so we
            // refresh (likely from the cache) on every retry.
            descDone := trace.Epoch("meta descriptor lookup")
            var evictDesc func()
            desc, needAnother, evictDesc, err = ds.getDescriptors(from, to, options)
            descDone()

            // getDescriptors may fail retryably if the first range isn't
            // available via Gossip.
            if err != nil {
                if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
                    if log.V(1) {
                        log.Warning(err)
                    }
                    continue
                }
                break
            }

            // If there's no transaction and the op spans ranges, possibly
            // re-run as part of a transaction for consistency. The case in
            // which we don't need to re-run is when read consistency is not
            // required.
            if needAnother && ba.Txn == nil && ba.IsRange() && ba.ReadConsistency != proto.INCONSISTENT {
                return nil, &proto.OpRequiresTxnError{}
            }

            // It's possible that the returned descriptor misses parts of the
            // keys it's supposed to scan after it's truncated to match the
            // descriptor. Example: for a reverse scan of [a,g), the first
            // descriptor lookup for "g" returns the descriptor [c,d), so
            // [d,g) would never be scanned. We evict and retry in such a case.
            if (isReverse && !desc.ContainsKeyRange(desc.StartKey, to)) || (!isReverse && !desc.ContainsKeyRange(from, desc.EndKey)) {
                evictDesc()
                continue
            }

            curReply, err = func() (*proto.BatchResponse, error) {
                // Truncate the request to our current key range.
                untruncate, numActive, trErr := truncate(&ba, desc, from, to)
                if numActive == 0 {
                    untruncate()
                    // This shouldn't happen in the wild, but some tests
                    // exercise it.
                    return nil, util.Errorf("truncation resulted in empty batch on [%s,%s): %s", from, to, ba)
                }
                defer untruncate()
                if trErr != nil {
                    return nil, trErr
                }
                // TODO(tschottdorf): make key range on batch redundant. The
                // requests within dictate it anyways.
                ba.Key, ba.EndKey = keys.Range(ba)
                reply, err := ds.sendAttempt(trace, ba, desc)
                ba.Key, ba.EndKey = nil, nil

                if err != nil {
                    if log.V(0 /* TODO(tschottdorf): 1 */) {
                        log.Warningf("failed to invoke %s: %s", ba, err)
                    }
                }
                return reply, err
            }()
            // If sending succeeded, break this loop.
            if err == nil {
                break
            }

            // Error handling below. If retryable, allow a retry. For range not
            // found or range key mismatch errors, we don't back off on the
            // retry, but reset the backoff loop so we can retry immediately.
            switch tErr := err.(type) {
            case *rpc.SendError:
                // For an RPC error to occur, we must have been unable to
                // contact any replicas. In this case, likely all nodes are
                // down (or not getting back to us within a reasonable amount
                // of time). We may simply not be trying to talk to the
                // up-to-date replicas, so clearing the descriptor here should
                // be a good idea.
                // TODO(tschottdorf): if a replica group goes dead, this
                // will cause clients to put high read pressure on the first
                // range, so there should be some rate limiting here.
                evictDesc()
                if tErr.CanRetry() {
                    continue
                }
            case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
                trace.Event(fmt.Sprintf("reply error: %T", err))
                // Range descriptor might be out of date - evict it.
                evictDesc()
                // On addressing errors, don't backoff; retry immediately.
                r.Reset()
                if log.V(1) {
                    log.Warning(err)
                }
                // For the remainder of this call, we'll assume that intents
                // are fair game. This replaces more complex logic based on
                // the type of request.
                options.considerIntents = true
                continue
            case *proto.NotLeaderError:
                trace.Event(fmt.Sprintf("reply error: %T", err))
                newLeader := tErr.GetLeader()
                // Verify that the leader is a known replica according to the
                // descriptor. If not, we've got a stale replica; evict cache.
                // Next, cache the new leader.
                if newLeader != nil {
                    if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
                        if log.V(1) {
                            log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
                        }
                        evictDesc()
                    }
                } else {
                    newLeader = &proto.Replica{}
                }
                ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
                if log.V(1) {
                    log.Warning(err)
                }
                r.Reset()
                continue
            case retry.Retryable:
                if tErr.CanRetry() {
                    if log.V(1) {
                        log.Warning(err)
                    }
                    trace.Event(fmt.Sprintf("reply error: %T", err))
                    continue
                }
            }
            break
        }

        // Immediately return if querying a range failed non-retryably.
        if err != nil {
            return nil, err
        }

        first := br == nil
        if first {
            // First response from a Range.
            br = curReply
        } else {
            // This was the second or later call in a cross-Range request.
            // Combine the new response with the existing one.
            if err := br.Combine(curReply); err != nil {
                panic(err) // TODO(tschottdorf): return nil, err
            }
        }

        // If this request has a bound (such as MaxResults in ScanRequest) and
        // we are going to query at least one more range, check whether enough
        // rows have been retrieved.
        // TODO(tschottdorf): need tests for executing a multi-range batch
        // with various bounded requests which saturate at different times.
        if needAnother {
            // Start with the assumption that all requests are saturated.
            // Below, we look at each and decide whether that's true.
            // Everything that is indeed saturated is "masked out" from the
            // batch request; only if that holds for all requests does
            // needAnother remain false.
            needAnother = false
            if first {
                // Clone ba.Requests. This is because we're multi-range, and
                // some requests may be bounded, which could lead to them being
                // masked out once they're saturated. We don't want to risk
                // removing requests that way in the "master copy" since that
                // could lead to omitting requests in certain retry scenarios.
                ba.Requests = append([]proto.RequestUnion(nil), ba.Requests...)
            }
            for i, union := range ba.Requests {
                args := union.GetValue()
                if _, ok := args.(*proto.NoopRequest); ok {
                    // NoopRequests are skipped.
                    continue
                }
                boundedArg, ok := args.(proto.Bounded)
                if !ok {
                    // Non-bounded request. We will have to query all ranges.
                    needAnother = true
                    continue
                }
                prevBound := boundedArg.GetBound()
                cReply, ok := curReply.Responses[i].GetValue().(proto.Countable)
                if !ok || prevBound <= 0 {
                    // Request bounded, but without max results. Again, we will
                    // need to query everything we can. The case in which the
                    // reply isn't countable occurs when the request wasn't
                    // active for that range (since it didn't apply to it), so
                    // the response is a NoopResponse.
                    needAnother = true
                    continue
                }
                nextBound := prevBound - cReply.Count()
                if nextBound <= 0 {
                    // We've hit max results for this piece of the batch. Mask
                    // it out (we've copied the requests slice above, so this
                    // is kosher).
                    ba.Requests[i].Reset() // necessary (no one-of?)
                    if !ba.Requests[i].SetValue(&proto.NoopRequest{}) {
                        panic("RequestUnion excludes NoopRequest")
                    }
                    continue
                }
                // The request isn't saturated yet.
                needAnother = true
                boundedArg.SetBound(nextBound)
            }
        }

        // If this was the last range accessed by this call, exit loop.
        if !needAnother {
            return br, nil
        }

        if isReverse {
            // In the next iteration, query the previous range.
            // We use the StartKey of the current descriptor as opposed to the
            // EndKey of the previous one since that doesn't have bugs when
            // stale descriptors come into play.
            to = prev(ba, desc.StartKey)
        } else {
            // In the next iteration, query the next range.
            // It's important that we use the EndKey of the current descriptor
            // as opposed to the StartKey of the next one: if the former is
            // stale, it's possible that the next range has since merged the
            // subsequent one, and unless both descriptors are stale, the next
            // descriptor's StartKey would move us to the beginning of the
            // current range, resulting in a duplicate scan.
            from = next(ba, desc.EndKey)
        }
        trace.Event("querying next range")
    }
}
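// A stripped-down sketch (plain integers instead of proto.Bounded and
// proto.Countable; purely illustrative and not part of the DistSender code) of
// the bound bookkeeping in the loop above: after each range's partial reply,
// subtract the rows returned from the remaining bound and mask the request out
// once it is saturated.
package main

import "fmt"

func main() {
    bound := int64(25)                    // e.g. MaxResults on a Scan
    perRangeCounts := []int64{10, 10, 10} // rows returned by successive ranges

    for i, count := range perRangeCounts {
        bound -= count
        if bound <= 0 {
            // Saturated: the real code swaps the request for a NoopRequest so
            // later ranges skip it.
            fmt.Printf("saturated after range %d; mask request out\n", i+1)
            break
        }
        fmt.Printf("after range %d: %d results still needed, query next range\n", i+1, bound)
    }
}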