func (q *consistencyQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ config.SystemConfig,
) (bool, float64) {
	shouldQ, priority := true, float64(0)
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			log.ErrEventf(ctx, "consistency queue last processed timestamp: %s", err)
		}
		if shouldQ, priority = shouldQueueAgain(now, lpTS, q.interval); !shouldQ {
			return false, 0
		}
	}
	// Check if all replicas are live. Some tests run without a NodeLiveness configured.
	if repl.store.cfg.NodeLiveness != nil {
		for _, rep := range repl.Desc().Replicas {
			if live, err := repl.store.cfg.NodeLiveness.IsLive(rep.NodeID); err != nil {
				log.ErrEventf(ctx, "node %d liveness failed: %s", rep.NodeID, err)
				return false, 0
			} else if !live {
				return false, 0
			}
		}
	}
	return true, priority
}
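// shouldQueueAgain is used by the queue code above (and by the time
// series maintenance queue below) but is not part of this excerpt. A
// minimal sketch follows, assuming it queues unconditionally when no
// last-processed timestamp exists and otherwise scales the priority by
// how far past the interval the replica is; the exact priority formula
// is an assumption.
func shouldQueueAgain(now, last hlc.Timestamp, minInterval time.Duration) (bool, float64) {
	if minInterval == 0 || last == hlc.ZeroTimestamp {
		// Never processed (or no interval configured): queue with default priority.
		return true, 0
	}
	if elapsed := now.GoTime().Sub(last.GoTime()); elapsed >= minInterval {
		// Overdue: priority grows with the number of intervals elapsed.
		return true, float64(elapsed.Nanoseconds()) / float64(minInterval.Nanoseconds())
	}
	return false, 0
}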
// process() is called on every range for which this node is a lease holder.
func (q *consistencyQueue) process(ctx context.Context, repl *Replica, _ config.SystemConfig) error {
	req := roachpb.CheckConsistencyRequest{}
	if _, pErr := repl.CheckConsistency(ctx, req); pErr != nil {
		log.Error(ctx, pErr.GoError())
	}
	// Update the last processed time for this queue.
	if err := repl.setQueueLastProcessed(ctx, q.name, repl.store.Clock().Now()); err != nil {
		log.ErrEventf(ctx, "failed to update last processed time: %v", err)
	}
	return nil
}
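// getQueueLastProcessed and setQueueLastProcessed (used above and in
// the time series maintenance queue below) are not shown in this
// excerpt. A hedged sketch of the write side, assuming the timestamp
// lives under a replica-local key derived from the queue name and is
// written inline with MVCCPutProto; the key helper and the exact
// storage call are assumptions, not the verified originals.
func (r *Replica) setQueueLastProcessed(ctx context.Context, queue string, ts hlc.Timestamp) error {
	// Replica-local key, so the record travels with the range.
	key := keys.QueueLastProcessedKey(r.Desc().StartKey, queue)
	// Inline write (zero timestamp), outside of any transaction or stats tracking.
	return engine.MVCCPutProto(ctx, r.store.Engine(), nil, key, hlc.ZeroTimestamp, nil, &ts)
}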
func (q *timeSeriesMaintenanceQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	snap := repl.store.Engine().NewSnapshot()
	defer snap.Close()
	if err := q.tsData.PruneTimeSeries(ctx, snap, desc.StartKey, desc.EndKey, q.db, now); err != nil {
		return err
	}
	// Update the last processed time for this queue.
	if err := repl.setQueueLastProcessed(ctx, q.name, now); err != nil {
		log.ErrEventf(ctx, "failed to update last processed time: %v", err)
	}
	return nil
}
func (q *timeSeriesMaintenanceQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.cfg.TestingKnobs.DisableLastProcessedCheck {
		lpTS, err := repl.getQueueLastProcessed(ctx, q.name)
		if err != nil {
			log.ErrEventf(ctx, "time series maintenance queue last processed timestamp: %s", err)
		}
		shouldQ, priority = shouldQueueAgain(now, lpTS, TimeSeriesMaintenanceInterval)
		if !shouldQ {
			return
		}
	} else {
		// With the last-processed check disabled (in tests), default to
		// queueing; otherwise the named return would stay false and the
		// ContainsTimeSeries check below could never queue anything.
		shouldQ = true
	}
	desc := repl.Desc()
	if q.tsData.ContainsTimeSeries(desc.StartKey, desc.EndKey) {
		return
	}
	return false, 0
}
// sendPartialBatch sends the supplied batch to the range specified by
// desc. The batch request is first truncated so that it contains only
// requests which intersect the range descriptor and keys for each
// request are limited to the range's key span. The send occurs in a
// retry loop to handle send failures. On failure to send to any
// replicas, we backoff and retry by refetching the range
// descriptor. If the underlying range seems to have split, we
// recursively invoke divideAndSendBatchToRanges to re-enumerate the
// ranges in the span and resend to each.
func (ds *DistSender) sendPartialBatch(
	ctx context.Context,
	ba roachpb.BatchRequest,
	rs roachpb.RSpan,
	desc *roachpb.RangeDescriptor,
	evictToken *EvictionToken,
	isFirst bool,
) response {
	var reply *roachpb.BatchResponse
	var pErr *roachpb.Error
	isReverse := ba.IsReverse()

	// Truncate the request to the range descriptor.
	intersected, err := rs.Intersect(desc)
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}
	truncBA, numActive, err := truncate(ba, intersected)
	if numActive == 0 && err == nil {
		// This shouldn't happen in the wild, but some tests exercise it.
		return response{
			pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba),
		}
	}
	if err != nil {
		return response{pErr: roachpb.NewError(err)}
	}

	// Start a retry loop for sending the batch to the range.
	for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
		// If we've cleared the descriptor on a send failure, re-lookup.
		if desc == nil {
			var descKey roachpb.RKey
			if isReverse {
				descKey = intersected.EndKey
			} else {
				descKey = intersected.Key
			}
			desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse)
			if err != nil {
				log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err)
				continue
			}
		}

		reply, pErr = ds.sendSingleRange(ctx, truncBA, desc)

		// If sending succeeded, return immediately.
		if pErr == nil {
			return response{reply: reply}
		}

		log.ErrEventf(ctx, "reply error %s: %s", ba, pErr)

		// Error handling: If the error indicates that our range
		// descriptor is out of date, evict it from the cache and try
		// again. Errors that apply only to a single replica were
		// handled in send().
		//
		// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
		// row and the range descriptor hasn't changed, return the error
		// to our caller.
		switch tErr := pErr.GetDetail().(type) {
		case *roachpb.SendError:
			// We've tried all the replicas without success. Either
			// they're all down, or we're using an out-of-date range
			// descriptor. Invalidate the cache and try again with the new
			// metadata.
			log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup")
			if err := evictToken.Evict(ctx); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// Clear the descriptor to reload on the next attempt.
			desc = nil
			continue
		case *roachpb.RangeKeyMismatchError:
			// The range descriptor might be out of date, so evict it. This is
			// likely the result of a range split. If we have new range
			// descriptors, insert them instead, as long as they differ
			// from the last descriptor to avoid endless loops.
			var replacements []roachpb.RangeDescriptor
			different := func(rd *roachpb.RangeDescriptor) bool {
				return !desc.RSpan().Equal(rd.RSpan())
			}
			if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
				replacements = append(replacements, *tErr.MismatchedRange)
			}
			if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
				if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) {
					replacements = append(replacements, *tErr.SuggestedRange)
				}
			}
			// Same as Evict() if replacements is empty.
			if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
				return response{pErr: roachpb.NewError(err)}
			}
			// On addressing errors (likely a split), we need to re-invoke
			// the range descriptor lookup machinery, so we recurse by
			// sending the batch to just the partial span this descriptor was
			// supposed to cover.
			log.VEventf(ctx, 1, "likely split; resending batch to span %s: %s", intersected, tErr)
			reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst)
			return response{reply: reply, pErr: pErr}
		}
		break
	}

	// Propagate the error if either the retry closer or context done
	// channels were closed.
	if pErr == nil {
		if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil {
			log.Fatal(ctx, "exited retry loop without an error")
		}
	}
	return response{pErr: pErr}
}
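// includesFrontOfCurSpan is referenced above but not defined in this
// excerpt. A minimal sketch, assuming it checks whether the suggested
// descriptor covers the next key the scan needs (the span's end for
// reverse scans, its start otherwise); the descriptor convenience
// methods used here are assumptions about the roachpb API.
func includesFrontOfCurSpan(isReverse bool, rd *roachpb.RangeDescriptor, rs roachpb.RSpan) bool {
	if isReverse {
		// A reverse scan proceeds from rs.EndKey, so the descriptor must
		// contain the key immediately preceding it.
		return rd.ContainsExclusiveEndKey(rs.EndKey)
	}
	return rd.ContainsKey(rs.Key)
}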
// execStmtInOpenTxn executes one statement in the context
// of the planner's transaction (which is assumed to exist).
// It handles statements that affect the transaction state (BEGIN, COMMIT)
// and delegates everything else to `execStmt`.
// It binds placeholders.
//
// The current transaction might be committed/rolled back when this returns.
// It might also have transitioned to the aborted or RestartWait state.
//
// Args:
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
//  COMMIT/ROLLBACK statements are rejected if set. Also, the transaction
//  might be auto-committed in this function.
// firstInTxn: set for the first statement in a transaction. Used
//  so that nested BEGIN statements are caught.
//
// Returns:
// - a Result
// - an error, if any. In case of error, the result returned also reflects this error.
func (e *Executor) execStmtInOpenTxn(
	stmt parser.Statement,
	planMaker *planner,
	implicitTxn bool,
	firstInTxn bool,
	txnState *txnState,
) (Result, error) {
	if txnState.State != Open {
		panic("execStmtInOpenTxn called outside of an open txn")
	}
	if planMaker.txn == nil {
		panic("execStmtInOpenTxn called with a txn not set on the planner")
	}

	planMaker.evalCtx.SetTxnTimestamp(txnState.sqlTimestamp)
	planMaker.evalCtx.SetStmtTimestamp(e.cfg.Clock.PhysicalTime())

	session := planMaker.session
	log.Eventf(session.context, "%s", stmt)

	// TODO(cdo): Figure out how to not double count on retries.
	e.updateStmtCounts(stmt)
	switch s := stmt.(type) {
	case *parser.BeginTransaction:
		if !firstInTxn {
			txnState.updateStateAndCleanupOnErr(errTransactionInProgress, e)
			return Result{Err: errTransactionInProgress}, errTransactionInProgress
		}
	case *parser.CommitTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// CommitTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, commit, e)
		return res, err
	case *parser.ReleaseSavepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Savepoint); err != nil {
			return Result{Err: err}, err
		}
		// ReleaseSavepoint is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, release, e)
		return res, err
	case *parser.RollbackTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// RollbackTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		// Notice that we don't return any errors on rollback.
		return rollbackSQLTransaction(txnState, planMaker), nil
	case *parser.SetTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
	case *parser.Savepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Name); err != nil {
			return Result{Err: err}, err
		}
		// We want to disallow SAVEPOINTs to be issued after a transaction has
		// started running, but such enforcement is problematic in the
		// presence of transaction retries (since the transaction proto is
		// necessarily reused). To work around this, we keep track of the
		// transaction's retrying state and special-case SAVEPOINT when it is
		// set.
		//
		// TODO(andrei): the check for retrying is a hack - we erroneously
		// allow SAVEPOINT to be issued at any time during a retry, not just
		// in the beginning. We should figure out how to track whether we
		// started using the transaction during a retry.
		if txnState.txn.Proto.IsInitialized() && !txnState.retrying {
			err := fmt.Errorf("SAVEPOINT %s needs to be the first statement in a transaction",
				parser.RestartSavepointName)
			txnState.updateStateAndCleanupOnErr(err, e)
			return Result{Err: err}, err
		}
		// Note that Savepoint doesn't have a corresponding plan node.
		// This here is all the execution there is.
		txnState.retryIntent = true
		return Result{}, nil
	case *parser.RollbackToSavepoint:
		err := parser.ValidateRestartCheckpoint(s.Savepoint)
		if err == nil {
			// Can't restart if we didn't get an error first, which would've put the
			// txn in a different state.
			err = errNotRetriable
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Prepare:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Prepared statements are supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Execute:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Executing prepared statements is supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Deallocate:
		if s.Name == "" {
			planMaker.session.PreparedStatements.DeleteAll()
		} else {
			if found := planMaker.session.PreparedStatements.Delete(string(s.Name)); !found {
				err := fmt.Errorf("prepared statement %s does not exist", s.Name)
				txnState.updateStateAndCleanupOnErr(err, e)
				return Result{Err: err}, err
			}
		}
		return Result{PGTag: s.StatementTag()}, nil
	}

	autoCommit := implicitTxn && !e.cfg.TestingKnobs.DisableAutoCommit
	result, err := e.execStmt(stmt, planMaker, autoCommit)
	if err != nil {
		if result.Rows != nil {
			result.Rows.Close()
			result.Rows = nil
		}
		if traceSQL {
			log.ErrEventf(txnState.txn.Context, "ERROR: %v", err)
		}
		log.ErrEventf(session.context, "ERROR: %v", err)
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	}

	tResult := &traceResult{tag: result.PGTag, count: -1}
	switch result.Type {
	case parser.RowsAffected:
		tResult.count = result.RowsAffected
	case parser.Rows:
		tResult.count = result.Rows.Len()
	}
	if traceSQL {
		log.Eventf(txnState.txn.Context, "%s done", tResult)
	}
	log.Eventf(session.context, "%s done", tResult)
	return result, nil
}
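// noTransactionHelper is what the transaction-control cases above fall
// back to when COMMIT/ROLLBACK/SAVEPOINT and friends arrive in an
// implicit transaction. It is not part of this excerpt; below is a
// minimal sketch, assuming it reports the usual "no transaction in
// progress" error and runs the same cleanup path as the other error
// cases. The sentinel error name is a hypothetical placeholder.
func (e *Executor) noTransactionHelper(txnState *txnState) (Result, error) {
	// Hypothetical sentinel, analogous to errTransactionInProgress above.
	err := errNoTransactionInProgress
	txnState.updateStateAndCleanupOnErr(err, e)
	return Result{Err: err}, err
}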
func (n *Node) batchInternal(
	ctx context.Context, args *roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	// TODO(marc): grpc's authentication model (which gives credential access in
	// the request handler) doesn't really fit with the current design of the
	// security package (which assumes that TLS state is only given at connection
	// time) - that should be fixed.
	if peer, ok := peer.FromContext(ctx); ok {
		if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
			certUser, err := security.GetCertificateUser(&tlsInfo.State)
			if err != nil {
				return nil, err
			}
			if certUser != security.NodeUser {
				return nil, errors.Errorf("user %s is not allowed", certUser)
			}
		}
	}

	var br *roachpb.BatchResponse

	type snowballInfo struct {
		syncutil.Mutex
		collectedSpans [][]byte
		done           bool
	}
	var snowball *snowballInfo

	if err := n.stopper.RunTaskWithErr(func() error {
		const opName = "node.Batch"
		sp, err := tracing.JoinOrNew(n.storeCfg.AmbientCtx.Tracer, args.TraceContext, opName)
		if err != nil {
			return err
		}
		// If this is a snowball span, it gets special treatment: It skips the
		// regular tracing machinery, and we instead send the collected spans
		// back with the response. This is more expensive, but then again,
		// those are individual requests traced by users, so they can be.
		if sp.BaggageItem(tracing.Snowball) != "" {
			sp.LogEvent("delegating to snowball tracing")
			sp.Finish()

			snowball = new(snowballInfo)
			recorder := func(rawSpan basictracer.RawSpan) {
				snowball.Lock()
				defer snowball.Unlock()
				if snowball.done {
					// This is a late span that we must discard because the request was
					// already completed.
					return
				}
				encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
				if err != nil {
					log.Warning(ctx, err)
				}
				snowball.collectedSpans = append(snowball.collectedSpans, encSp)
			}

			if sp, err = tracing.JoinOrNewSnowball(opName, args.TraceContext, recorder); err != nil {
				return err
			}
		}
		defer sp.Finish()
		traceCtx := opentracing.ContextWithSpan(ctx, sp)
		log.Event(traceCtx, args.Summary())

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(traceCtx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.ErrEventf(traceCtx, "%T", pErr.GetDetail())
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
		return nil
	}); err != nil {
		return nil, err
	}

	if snowball != nil {
		snowball.Lock()
		br.CollectedSpans = snowball.collectedSpans
		snowball.done = true
		snowball.Unlock()
	}

	return br, nil
}
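// For context: a span becomes a "snowball" span when the client sets
// the tracing.Snowball baggage item, which propagates in
// args.TraceContext and triggers the branch above. A hedged sketch of
// the client side; the span name and wiring here are illustrative
// assumptions, not code from this package.
func startSnowballSpan(tracer opentracing.Tracer) (opentracing.Span, context.Context) {
	sp := tracer.StartSpan("client-request")
	// Any non-empty value marks the trace for snowball collection.
	sp.SetBaggageItem(tracing.Snowball, "1")
	return sp, opentracing.ContextWithSpan(context.Background(), sp)
}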
func (rq *replicateQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !repl.store.splitQueue.Disabled() && repl.needsSplitBySize() {
		// If the range exceeds the split threshold, let that finish first.
		// Ranges must fit in memory on both sender and receiver nodes while
		// being replicated. This supplements the check provided by
		// acceptsUnsplitRanges, which looks at zone config boundaries rather
		// than data size.
		//
		// This check is ignored if the split queue is disabled, since in that
		// case, the split will never come.
		return
	}

	// Find the zone config for this range.
	desc := repl.Desc()
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	action, priority := rq.allocator.ComputeAction(zone, desc)
	if action != AllocatorNoop {
		if log.V(2) {
			log.Infof(ctx, "%s repair needed (%s), enqueuing", repl, action)
		}
		return true, priority
	}

	// If we hold the lease, check to see if we should transfer it.
	var leaseStoreID roachpb.StoreID
	if lease, _ := repl.getLease(); lease != nil && lease.Covers(now) {
		leaseStoreID = lease.Replica.StoreID
		if rq.canTransferLease() &&
			rq.allocator.ShouldTransferLease(zone.Constraints, leaseStoreID, desc.RangeID) {
			if log.V(2) {
				log.Infof(ctx, "%s lease transfer needed, enqueuing", repl)
			}
			return true, 0
		}
	}

	// Check for a rebalancing opportunity. Note that leaseStoreID will be 0 if
	// the range doesn't currently have a lease, which will allow the current
	// replica to be considered a rebalancing source.
	target, err := rq.allocator.RebalanceTarget(
		zone.Constraints,
		desc.Replicas,
		leaseStoreID,
		desc.RangeID,
	)
	if err != nil {
		log.ErrEventf(ctx, "rebalance target failed: %s", err)
		return false, 0
	}
	if log.V(2) {
		if target != nil {
			log.Infof(ctx, "%s rebalance target found, enqueuing", repl)
		} else {
			log.Infof(ctx, "%s no rebalance target found, not enqueuing", repl)
		}
	}
	return target != nil, 0
}
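// canTransferLease is used above and in processOneChange below but is
// not part of this excerpt. A minimal sketch, assuming it simply
// rate-limits lease transfers via the lastLeaseTransfer timestamp that
// processOneChange stores after each transfer; minLeaseTransferInterval
// is an assumed constant.
func (rq *replicateQueue) canTransferLease() bool {
	if last := rq.lastLeaseTransfer.Load(); last != nil {
		return timeutil.Since(last.(time.Time)) > minLeaseTransferInterval
	}
	return true
}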
func (rq *replicateQueue) processOneChange(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()

	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}

	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes")
	}

	switch action {
	case AllocatorAdd:
		log.Event(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(
			zone.Constraints,
			desc.Replicas,
			desc.RangeID,
			true,
		)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}
		log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica)
		if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Event(ctx, "removing a replica")
		// If the lease holder (our local store) is an overfull store (in terms of
		// leases), allow transferring the lease away.
		leaseHolderStoreID := repl.store.StoreID()
		if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) {
			leaseHolderStoreID = 0
		}
		removeReplica, err := rq.allocator.RemoveTarget(
			zone.Constraints,
			desc.Replicas,
			leaseHolderStoreID,
		)
		if err != nil {
			return err
		}
		if removeReplica.StoreID == repl.store.StoreID() {
			// The local replica was selected as the removal target, but that replica
			// is the leaseholder, so transfer the lease instead. We don't check that
			// the current store has too many leases in this case under the
			// assumption that replica balance is a greater concern. Also note that
			// the AllocatorRemove action takes precedence over AllocatorNoop
			// (rebalancing), which is where lease transfer would otherwise occur. We
			// need to be able to transfer leases in AllocatorRemove in order to get
			// out of situations where this store is overfull and yet holds all the
			// leases.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				false /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		} else {
			log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica)
			if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil {
				return err
			}
		}
	case AllocatorRemoveDead:
		log.Event(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "range of replica %s was identified as having dead replicas, but no dead replicas were found", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
		if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		log.Event(ctx, "considering a rebalance")

		if rq.canTransferLease() {
			// We require the lease in order to process replicas, so
			// repl.store.StoreID() corresponds to the lease-holder's store ID.
			candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
			target := rq.allocator.TransferLeaseTarget(
				zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID,
				true /* checkTransferLeaseSource */)
			if target != (roachpb.ReplicaDescriptor{}) {
				log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
				if err := repl.AdminTransferLease(target.StoreID); err != nil {
					return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
				}
				rq.lastLeaseTransfer.Store(timeutil.Now())
				// Do not requeue as we transferred our lease away.
				return nil
			}
		}

		rebalanceStore, err := rq.allocator.RebalanceTarget(
			zone.Constraints,
			desc.Replicas,
			repl.store.StoreID(),
			desc.RangeID,
		)
		if err != nil {
			log.ErrEventf(ctx, "rebalance target failed: %s", err)
			return nil
		}
		if rebalanceStore == nil {
			log.VEventf(ctx, 1, "no suitable rebalance target")
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica)
		if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	return nil
}
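// computeQuorum is used in the quorum check above but is not shown in
// this excerpt. The arithmetic is the standard majority-quorum rule: a
// strict majority of the replica count. A minimal sketch:
func computeQuorum(nodes int) int {
	// E.g. 3 replicas -> quorum of 2; 4 -> 3; 5 -> 3.
	return (nodes / 2) + 1
}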
// sendToReplicas sends one or more RPCs to clients specified by the
// slice of replicas. On success, Send returns the first successful
// reply. If an error occurs which is not specific to a single
// replica, it's returned immediately. Otherwise, when all replicas
// have been tried and failed, returns a send error.
func (ds *DistSender) sendToReplicas(
	ctx context.Context,
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	var ambiguousResult bool
	var haveCommit bool
	// We only check for committed txns, not aborts, because aborts may
	// be retried without any risk of inconsistencies.
	if etArg, ok := args.GetArg(roachpb.EndTransaction); ok &&
		etArg.(*roachpb.EndTransactionRequest).Commit {
		haveCommit = true
	}
	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	log.VEventf(ctx, 2, "sending batch %s to range %d", args.Summary(), rangeID)
	transport.SendNext(ctx, done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	slowTimer := time.After(base.SlowRequestThreshold)
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				ds.metrics.SendNextTimeoutCount.Inc(1)
				log.VEventf(ctx, 2, "timeout, trying next peer")
				pending++
				transport.SendNext(ctx, done)
			}

		case <-slowTimer:
			slowTimer = nil
			log.Warningf(ctx, "have been waiting %s sending RPC for batch: %s",
				base.SlowRequestThreshold, args.Summary())
			ds.metrics.SlowRequestsCount.Inc(1)
			defer ds.metrics.SlowRequestsCount.Dec(1)

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(ctx, "RPC reply: %s", call.Reply)
				}

				if call.Reply.Error == nil {
					return call.Reply, nil
				} else if !ds.handlePerReplicaError(ctx, transport, rangeID, call.Reply.Error) {
					// The error received is not specific to this replica, so we
					// should return it instead of trying other replicas. However,
					// if we're trying to commit a transaction and there are
					// still other RPCs outstanding or an ambiguous RPC error
					// was already received, we must return an ambiguous commit
					// error instead of the returned error.
					log.ErrEventf(ctx, "application error: %s", call.Reply.Error)

					if haveCommit && (pending > 0 || ambiguousResult) {
						log.ErrEventf(ctx, "returning ambiguous result (pending=%d)", pending)
						return nil, roachpb.NewAmbiguousResultError(
							fmt.Sprintf("error=%s, pending RPCs=%d", call.Reply.Error, pending))
					}
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				log.ErrEventf(ctx, "application error: %s", call.Reply.Error)
				err = call.Reply.Error.GoError()
			} else {
				// All connection errors except for an unavailable node (this
				// is GRPC's fail-fast error) may mean that the request
				// succeeded on the remote server, but we were unable to
				// receive the reply. Set the ambiguous commit flag.
				//
				// We retry ambiguous commit batches to avoid returning the
				// unrecoverable AmbiguousResultError. This is safe because
				// repeating an already-successfully applied batch is
				// guaranteed to return either a TransactionReplayError (in
				// case the replay happens at the original leader), or a
				// TransactionRetryError (in case the replay happens at a new
				// leader). If the original attempt merely timed out or was
				// lost, then the batch will succeed and we can be assured the
				// commit was applied just once.
				//
				// The Unavailable code is used by GRPC to indicate that a
				// request fails fast and is not sent, so we can be sure there
				// is no ambiguity on these errors. Note that these are common
				// if a node is down.
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/call.go#L182
				// See https://github.com/grpc/grpc-go/blob/52f6504dc290bd928a8139ba94e3ab32ed9a6273/stream.go#L158
				if haveCommit && grpc.Code(err) != codes.Unavailable {
					log.ErrEventf(ctx, "txn may have committed despite RPC error: %s", err)
					ambiguousResult = true
				} else {
					log.ErrEventf(ctx, "RPC error: %s", err)
				}
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				ds.metrics.NextReplicaErrCount.Inc(1)
				log.VEventf(ctx, 2, "error, trying next peer")
				pending++
				transport.SendNext(ctx, done)
			}
			if pending == 0 {
				if ambiguousResult {
					err = roachpb.NewAmbiguousResultError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v, "+
							"but RPC failure may have masked txn commit", len(replicas), err))
				} else {
					err = roachpb.NewSendError(
						fmt.Sprintf("sending to all %d replicas failed; last error: %v",
							len(replicas), err),
					)
				}
				log.ErrEvent(ctx, err.Error())
				return nil, err
			}
		}
	}
}
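// handlePerReplicaError decides, in the loop above, whether an error is
// scoped to one replica (so the next one should be tried) or to the
// whole range. It is not part of this excerpt; below is a minimal
// sketch, assuming the usual per-replica cases and that the transport
// can reprioritize a known lease holder. The MoveToFront call on the
// NotLeaseHolderError path is an assumption about the Transport API.
func (ds *DistSender) handlePerReplicaError(
	ctx context.Context, transport Transport, rangeID roachpb.RangeID, pErr *roachpb.Error,
) bool {
	switch tErr := pErr.GetDetail().(type) {
	case *roachpb.RangeNotFoundError:
		// This replica hasn't caught up with the range yet; another may have.
		return true
	case *roachpb.NodeUnavailableError:
		return true
	case *roachpb.NotLeaseHolderError:
		if tErr.LeaseHolder != nil {
			// Try the indicated lease holder next.
			transport.MoveToFront(*tErr.LeaseHolder)
		}
		return true
	}
	return false
}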