// processReplica processes a single replica. This should not be // called externally to the queue. bq.mu.Lock should not be held // while calling this method. func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error { bq.processMu.Lock() defer bq.processMu.Unlock() // Load the system config. cfg, ok := bq.gossip.GetSystemConfig() if !ok { log.VEventf(1, bq.ctx, "no system config available. skipping") return nil } if bq.requiresSplit(cfg, repl) { // Range needs to be split due to zone configs, but queue does // not accept unsplit ranges. log.VEventf(3, bq.ctx, "%s: split needed; skipping", repl) return nil } sp := repl.store.Tracer().StartSpan(bq.name) ctx := opentracing.ContextWithSpan(context.Background(), sp) defer sp.Finish() log.Tracef(ctx, "processing replica %s", repl) // If the queue requires a replica to have the range lease in // order to be processed, check whether this replica has range lease // and renew or acquire if necessary. if bq.needsLease { // Create a "fake" get request in order to invoke redirectOnOrAcquireLease. if err := repl.redirectOnOrAcquireLease(ctx); err != nil { if _, harmless := err.GetDetail().(*roachpb.NotLeaseHolderError); harmless { log.VEventf(3, bq.ctx, "%s: not holding lease; skipping", repl) return nil } return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl) } log.Trace(ctx, "got range lease") } log.VEventf(3, bq.ctx, "%s: processing", repl) start := timeutil.Now() if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil { return err } log.VEventf(2, bq.ctx, "%s: done: %s", repl, timeutil.Since(start)) log.Trace(ctx, "done") return nil }
// InitSenderForLocalTestCluster initializes a TxnCoordSender that can be used // with LocalTestCluster. func InitSenderForLocalTestCluster( nodeDesc *roachpb.NodeDescriptor, tracer opentracing.Tracer, clock *hlc.Clock, latency time.Duration, stores client.Sender, stopper *stop.Stopper, gossip *gossip.Gossip, ) client.Sender { var rpcSend rpcSendFn = func(_ SendOptions, _ ReplicaSlice, args roachpb.BatchRequest, _ *rpc.Context) (*roachpb.BatchResponse, error) { if latency > 0 { time.Sleep(latency) } sp := tracer.StartSpan("node") defer sp.Finish() ctx := opentracing.ContextWithSpan(context.Background(), sp) log.Trace(ctx, args.String()) br, pErr := stores.Send(ctx, args) if br == nil { br = &roachpb.BatchResponse{} } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(stores, br)) } br.Error = pErr if pErr != nil { log.Trace(ctx, "error: "+pErr.String()) } return br, nil } retryOpts := GetDefaultDistSenderRetryOptions() retryOpts.Closer = stopper.ShouldDrain() distSender := NewDistSender(&DistSenderContext{ Clock: clock, RangeDescriptorCacheSize: defaultRangeDescriptorCacheSize, RangeLookupMaxRanges: defaultRangeLookupMaxRanges, LeaderCacheSize: defaultLeaderCacheSize, RPCRetryOptions: &retryOpts, nodeDescriptor: nodeDesc, RPCSend: rpcSend, // defined above RangeDescriptorDB: stores.(RangeDescriptorDB), // for descriptor lookup }, gossip) return NewTxnCoordSender(distSender, clock, false /* !linearizable */, tracer, stopper, NewTxnMetrics(metric.NewRegistry())) }
// processReplica processes a single replica. This should not be // called externally to the queue. bq.mu.Lock should not be held // while calling this method. func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error { // Load the system config. cfg, ok := bq.gossip.GetSystemConfig() if !ok { bq.eventLog.VInfof(log.V(1), "no system config available. skipping") return nil } desc := repl.Desc() if !bq.acceptsUnsplitRanges && cfg.NeedsSplit(desc.StartKey, desc.EndKey) { // Range needs to be split due to zone configs, but queue does // not accept unsplit ranges. bq.eventLog.VInfof(log.V(3), "%s: split needed; skipping", repl) return nil } sp := repl.store.Tracer().StartSpan(bq.name) ctx := opentracing.ContextWithSpan(repl.context(context.Background()), sp) log.Trace(ctx, fmt.Sprintf("queue start for range %d", repl.RangeID)) defer sp.Finish() // If the queue requires a replica to have the range leader lease in // order to be processed, check whether this replica has leader lease // and renew or acquire if necessary. if bq.needsLeaderLease { // Create a "fake" get request in order to invoke redirectOnOrAcquireLease. if err := repl.redirectOnOrAcquireLeaderLease(ctx); err != nil { if _, harmless := err.GetDetail().(*roachpb.NotLeaderError); harmless { bq.eventLog.VInfof(log.V(3), "%s: not holding lease; skipping", repl) return nil } return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl) } log.Trace(ctx, "got range lease") } bq.eventLog.VInfof(log.V(3), "%s: processing", repl) start := timeutil.Now() if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil { return err } bq.eventLog.VInfof(log.V(2), "%s: done: %s", repl, timeutil.Since(start)) log.Trace(ctx, "done") return nil }
// EvictAndReplace instructs the evictionToken to evict the RangeDescriptor it was // created with from the rangeDescriptorCache. It also allows the user to provide // new RangeDescriptors to insert into the cache, all atomically. When called without // arguments, EvictAndReplace will behave the same as Evict. func (et *evictionToken) EvictAndReplace(ctx context.Context, newDescs ...roachpb.RangeDescriptor) error { var err error et.doOnce.Do(func() { et.doLocker.Lock() defer et.doLocker.Unlock() err = et.do() if err == nil { if len(newDescs) > 0 { err = et.doReplace(newDescs...) log.Trace(ctx, fmt.Sprintf("evicting cached range descriptor with %d replacements", len(newDescs))) } else { log.Trace(ctx, "evicting cached range descriptor") } } }) return err }
// process synchronously invokes admin split for each proposed split key. func (sq *splitQueue) process( ctx context.Context, now hlc.Timestamp, rng *Replica, sysCfg config.SystemConfig, ) error { // First handle case of splitting due to zone config maps. desc := rng.Desc() splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey) if len(splitKeys) > 0 { log.Infof("splitting %s at keys %v", rng, splitKeys) log.Trace(ctx, fmt.Sprintf("splitting at keys %v", splitKeys)) for _, splitKey := range splitKeys { if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil { return errors.Errorf("unable to split %s at key %q: %s", rng, splitKey, err) } } return nil } // Next handle case of splitting due to size. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } size := rng.GetMVCCStats().Total() // FIXME: why is this implementation not the same as the one above? if float64(size)/float64(zone.RangeMaxBytes) > 1 { log.Infof("splitting %s size=%d max=%d", rng, size, zone.RangeMaxBytes) log.Trace(ctx, fmt.Sprintf("splitting size=%d max=%d", size, zone.RangeMaxBytes)) if _, pErr := client.SendWrappedWith(rng, ctx, roachpb.Header{ Timestamp: now, }, &roachpb.AdminSplitRequest{ Span: roachpb.Span{Key: desc.StartKey.AsRawKey()}, }); pErr != nil { return pErr.GoError() } } return nil }
// Send implements the client.Sender interface. The store is looked up from the // store map if specified by the request; otherwise, the command is being // executed locally, and the replica is determined via lookup through each // store's LookupRange method. The latter path is taken only by unit tests. func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { // If we aren't given a Replica, then a little bending over // backwards here. This case applies exclusively to unittests. if ba.RangeID == 0 || ba.Replica.StoreID == 0 { rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err) } rangeID, repl, err := ls.lookupReplica(rs.Key, rs.EndKey) if err != nil { return nil, roachpb.NewError(err) } ba.RangeID = rangeID ba.Replica = *repl } ctx = log.Add(ctx, log.RangeID, ba.RangeID) store, err := ls.GetStore(ba.Replica.StoreID) if err != nil { return nil, roachpb.NewError(err) } if ba.Txn != nil { // For calls that read data within a txn, we keep track of timestamps // observed from the various participating nodes' HLC clocks. If we have // a timestamp on file for this Node which is smaller than MaxTimestamp, // we can lower MaxTimestamp accordingly. If MaxTimestamp drops below // OrigTimestamp, we effectively can't see uncertainty restarts any // more. // Note that it's not an issue if MaxTimestamp propagates back out to // the client via a returned Transaction update - when updating a Txn // from another, the larger MaxTimestamp wins. if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) { // Copy-on-write to protect others we might be sharing the Txn with. shallowTxn := *ba.Txn // The uncertainty window is [OrigTimestamp, maxTS), so if that window // is empty, there won't be any uncertainty restarts. 
if !ba.Txn.OrigTimestamp.Less(maxTS) { log.Trace(ctx, "read has no clock uncertainty") } shallowTxn.MaxTimestamp.Backward(maxTS) ba.Txn = &shallowTxn } } br, pErr := store.Send(ctx, ba) if br != nil && br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(store, br)) } return br, pErr }
func (s *senderTransport) SendNext(done chan BatchCall) { if s.called { panic("called an exhausted transport") } s.called = true sp := s.tracer.StartSpan("node") defer sp.Finish() ctx := opentracing.ContextWithSpan(context.Background(), sp) log.Trace(ctx, s.args.String()) br, pErr := s.sender.Send(ctx, s.args) if br == nil { br = &roachpb.BatchResponse{} } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(s.sender, br)) } br.Error = pErr if pErr != nil { log.Trace(ctx, "error: "+pErr.String()) } done <- BatchCall{Reply: br} }
// cleanupTxnLocked is called when a transaction ends. The transaction record // is updated and the heartbeat goroutine signaled to clean up the transaction // gracefully. func (tc *TxnCoordSender) cleanupTxnLocked(ctx context.Context, txn roachpb.Transaction) { log.Trace(ctx, "coordinator stops") txnMeta, ok := tc.txns[*txn.ID] // The heartbeat might've already removed the record. Or we may have already // closed txnEnd but we are racing with the heartbeat cleanup. if !ok || txnMeta.txnEnd == nil { return } // The supplied txn may be newer than the one in txnMeta, which is relevant // for stats. txnMeta.txn = txn // Trigger heartbeat shutdown. close(txnMeta.txnEnd) txnMeta.txnEnd = nil }
// cleanupTxn is called when a transaction ends. The transaction record is // updated and the heartbeat goroutine signaled to clean up the transaction // gracefully. func (tc *TxnCoordSender) cleanupTxn(ctx context.Context, txn roachpb.Transaction) { log.Trace(ctx, "coordinator stops") tc.Lock() defer tc.Unlock() txnMeta, ok := tc.txns[*txn.ID] // The heartbeat might've already removed the record. if !ok { return } // The supplied txn may be newer than the one in txnMeta, which is relevant // for stats. txnMeta.txn = txn // Trigger heartbeat shutdown. close(txnMeta.txnEnd) txnMeta.txnEnd = nil }
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool { tc.Lock() txnMeta := tc.txns[txnID] txn := txnMeta.txn.Clone() tc.Unlock() // Before we send a heartbeat, determine whether this transaction should be // considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then // it is a cancellable Context and we skip this check and use the ctx lifetime // instead of a timeout. if ctx.Done() == nil && txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) { tc.clientHasAbandoned(txnID) return false } ba := roachpb.BatchRequest{} ba.Txn = &txn hb := &roachpb.HeartbeatTxnRequest{ Now: tc.clock.Now(), } hb.Key = txn.Key ba.Add(hb) log.Trace(ctx, "heartbeat") _, err := tc.wrapped.Send(ctx, ba) // If the transaction is not in pending state, then we can stop // the heartbeat. It's either aborted or committed, and we resolve // write intents accordingly. if err != nil { log.Warningf("heartbeat to %s failed: %s", txn, err) } // TODO(bdarnell): once we have gotten a heartbeat response with // Status != PENDING, future heartbeats are useless. However, we // need to continue the heartbeatLoop until the client either // commits or abandons the transaction. We could save a little // pointless work by restructuring this loop to stop sending // heartbeats between the time that the transaction is aborted and // the client finds out. Furthermore, we could use this information // to send TransactionAbortedErrors to the client so it can restart // immediately instead of running until its EndTransaction. return true }
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call. func (ds *DistSender) sendSingleRange( ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor, ) (*roachpb.BatchResponse, *roachpb.Error) { log.Trace(ctx, fmt.Sprintf("sending RPC to [%s, %s)", desc.StartKey, desc.EndKey)) // Try to send the call. replicas := newReplicaSlice(ds.gossip, desc) // Rearrange the replicas so that those replicas with long common // prefix of attributes end up first. If there's no prefix, this is a // no-op. order := ds.optimizeReplicaOrder(replicas) // If this request needs to go to a leader and we know who that is, move // it to the front. if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) { if leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)); leader.StoreID > 0 { if i := replicas.FindReplica(leader.StoreID); i >= 0 { replicas.MoveToFront(i) order = orderStable } } } // TODO(tschottdorf): should serialize the trace here, not higher up. br, pErr := ds.sendRPC(ctx, desc.RangeID, replicas, order, ba) if pErr != nil { return nil, pErr } // If the reply contains a timestamp, update the local HLC with it. if br.Error != nil && br.Error.Now != roachpb.ZeroTimestamp { ds.clock.Update(br.Error.Now) } else if br.Now != roachpb.ZeroTimestamp { ds.clock.Update(br.Now) } // Untangle the error from the received response. pErr = br.Error br.Error = nil // scrub the response error return br, pErr }
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
//
// The function proceeds in four stages:
//  1. Ensure a tracing span exists and inject its metadata into ba.Trace.
//  2. For transactional batches, validate the request and, for an
//     EndTransaction, attach the collected intent spans.
//  3. Send the batch through the wrapped sender and update coordinator state.
//  4. For a batch containing EndTransaction, optionally schedule the
//     linearizability wait and clean up a transaction that is no longer
//     PENDING.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		if sp == nil {
			sp = tc.tracer.StartSpan(opTxnCoordSender)
			// Deferred calls are function-scoped, so the span is finished
			// when Send returns, not at the end of this block.
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		}
		if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	// Taken before the request is sent; used by updateState and by the
	// linearizability wait computation at the end.
	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		txnID := *ba.Txn.ID
		// Verify that if this Transaction is not read-only, we have it on file.
		// If not, refuse further operations - the transaction was aborted due
		// to a timeout or the client must have issued a write on another
		// coordinator previously.
		if ba.Txn.Writing {
			tc.Lock()
			_, ok := tc.txns[txnID]
			tc.Unlock()
			if !ok {
				pErr := roachpb.NewErrorf("writing transaction timed out, was aborted, " +
					"or ran on multiple coordinators")
				return nil, pErr
			}
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			// The key is filled in here from the transaction; a client-set
			// key is rejected.
			if len(et.Key) != 0 {
				return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
			}
			et.Key = ba.Txn.Key
			if len(et.IntentSpans) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[txnID]
			{
				// Populate et.IntentSpans, taking into account both existing
				// writes (if any) and new writes in this batch, and taking
				// care to perform proper deduplication.
				var keys interval.RangeGroup
				if metaOK {
					// Reuses the group stored in txnMeta, so the spans of
					// this batch are added to it directly.
					keys = txnMeta.keys
				} else {
					keys = interval.NewRangeTree()
				}
				ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
					addKeyRange(keys, key, endKey)
				})
				et.IntentSpans = collectIntentSpans(keys)
			}
			tc.Unlock()

			if len(et.IntentSpans) > 0 {
				// All good, proceed.
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewErrorf("transaction is already committed or aborted")
			}
			// Reached with no intent spans only when the transaction is on
			// file (metaOK); the unknown-transaction case returned above.
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if log.V(1) {
				for _, intent := range et.IntentSpans {
					log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		// NOTE(review): GetDetail is invoked even when pErr is nil, so it is
		// presumably safe on a nil receiver - confirm.
		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Trace(ctx, fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	// Only responses that carry a transaction and belong to a batch with an
	// EndTransaction need the handling below.
	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		// The sleep is deferred, so it runs on the way out of Send, after the
		// cleanup below.
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	// A transaction that is no longer PENDING has ended; signal its heartbeat
	// to shut down.
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(ctx, *br.Txn)
	}
	return br, nil
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool { tc.Lock() txnMeta := tc.txns[txnID] txn := txnMeta.txn.Clone() hasAbandoned := txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) tc.Unlock() if txn.Status != roachpb.PENDING { // A previous iteration has already determined that the transaction is // already finalized, so we wait for the client to realize that and // want to keep our state for the time being (to dish out the right // error once it returns). return true } // Before we send a heartbeat, determine whether this transaction should be // considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then // it is a cancellable Context and we skip this check and use the ctx lifetime // instead of a timeout. if ctx.Done() == nil && hasAbandoned { if log.V(1) { log.Infof(ctx, "transaction %s abandoned; stopping heartbeat", txnMeta.txn) } tc.tryAsyncAbort(txnID) return false } ba := roachpb.BatchRequest{} ba.Txn = &txn hb := &roachpb.HeartbeatTxnRequest{ Now: tc.clock.Now(), } hb.Key = txn.Key ba.Add(hb) log.Trace(ctx, "heartbeat") br, pErr := tc.wrapped.Send(ctx, ba) // Correctness mandates that when we can't heartbeat the transaction, we // make sure the client doesn't keep going. This is particularly relevant // in the case of an ABORTED transaction, but if we can't reach the // transaction record at all, we're going to have to assume we're aborted // as well. if pErr != nil { log.Warningf(ctx, "heartbeat to %s failed: %s", txn, pErr) // We're not going to let the client carry out additional requests, so // try to clean up. tc.tryAsyncAbort(*txn.ID) txn.Status = roachpb.ABORTED } else { txn.Update(br.Responses[0].GetInner().(*roachpb.HeartbeatTxnResponse).Txn) } // Give the news to the txn in the txns map. 
This will update long-running // transactions (which may find out that they have to restart in that way), // but in particular makes sure that they notice when they've been aborted // (in which case we'll give them an error on their next request). tc.Lock() tc.txns[txnID].txn.Update(&txn) tc.Unlock() return true }
// maybePushTransactions tries to push the conflicting transaction(s) // responsible for the given intents: either move its // timestamp forward on a read/write conflict, abort it on a // write/write conflict, or do nothing if the transaction is no longer // pending. // // Returns a slice of intents which can now be resolved, and an error. // The returned intents should be resolved via intentResolver.resolveIntents. // // If skipIfInFlight is true, then no PushTxns will be sent and no // intents will be returned for any transaction for which there is // another push in progress. This should only be used by callers who // are not relying on the side effect of a push (i.e. only // pushType==PUSH_TOUCH), and who also don't need to synchronize with // the resolution of those intents (e.g. asynchronous resolutions of // intents skipped on inconsistent reads). // // Callers are involved with // a) conflict resolution for commands being executed at the Store with the // client waiting, // b) resolving intents encountered during inconsistent operations, and // c) resolving intents upon EndTransaction which are not local to the given // range. This is the only path in which the transaction is going to be // in non-pending state and doesn't require a push. func (ir *intentResolver) maybePushTransactions(ctx context.Context, intents []roachpb.Intent, h roachpb.Header, pushType roachpb.PushTxnType, skipIfInFlight bool) ( []roachpb.Intent, *roachpb.Error) { now := ir.store.Clock().Now() partialPusherTxn := h.Txn // If there's no pusher, we communicate a priority by sending an empty // txn with only the priority set. This is official usage of PushTxn. if partialPusherTxn == nil { partialPusherTxn = &roachpb.Transaction{ TxnMeta: roachpb.TxnMeta{ Priority: roachpb.MakePriority(h.UserPriority), }, } } log.Trace(ctx, "intent resolution") // Split intents into those we need to push and those which are good to // resolve. 
ir.mu.Lock() // TODO(tschottdorf): can optimize this and use same underlying slice. var pushIntents, nonPendingIntents []roachpb.Intent var pErr *roachpb.Error for _, intent := range intents { if intent.Status != roachpb.PENDING { // The current intent does not need conflict resolution // because the transaction is already finalized. // This shouldn't happen as all intents created are in // the PENDING status. nonPendingIntents = append(nonPendingIntents, intent) } else if _, ok := ir.mu.inFlight[*intent.Txn.ID]; ok && skipIfInFlight { // Another goroutine is working on this transaction so we can // skip it. if log.V(1) { log.Infof("skipping PushTxn for %s; attempt already in flight", intent.Txn.ID) } continue } else { pushIntents = append(pushIntents, intent) ir.mu.inFlight[*intent.Txn.ID]++ } } ir.mu.Unlock() if len(nonPendingIntents) > 0 { return nil, roachpb.NewErrorf("unexpected aborted/resolved intents: %s", nonPendingIntents) } // Attempt to push the transaction(s) which created the conflicting intent(s). var pushReqs []roachpb.Request for _, intent := range pushIntents { pushReqs = append(pushReqs, &roachpb.PushTxnRequest{ Span: roachpb.Span{ Key: intent.Txn.Key, }, PusherTxn: *partialPusherTxn, PusheeTxn: intent.Txn, PushTo: h.Timestamp, // The timestamp is used by PushTxn for figuring out whether the // transaction is abandoned. If we used the argument's timestamp // here, we would run into busy loops because that timestamp // usually stays fixed among retries, so it will never realize // that a transaction has timed out. See #877. Now: now, PushType: pushType, }) } // TODO(kaneda): Set the transaction in the header so that the // txn is correctly propagated in an error response. b := &client.Batch{} b.InternalAddRequest(pushReqs...) 
br, pErr := ir.store.db.RunWithResponse(b) ir.mu.Lock() for _, intent := range pushIntents { ir.mu.inFlight[*intent.Txn.ID]-- if ir.mu.inFlight[*intent.Txn.ID] == 0 { delete(ir.mu.inFlight, *intent.Txn.ID) } } ir.mu.Unlock() if pErr != nil { return nil, pErr } var resolveIntents []roachpb.Intent for i, intent := range pushIntents { pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn intent.Txn = pushee.TxnMeta intent.Status = pushee.Status resolveIntents = append(resolveIntents, intent) } return resolveIntents, nil }
// resolveIntents resolves the given intents. For those which are
// local to the range, we submit directly to the local Raft instance;
// all non-local intents are resolved asynchronously in a batch. If
// `wait` is true, all operations are carried out synchronously and an
// error is returned. Otherwise, the call returns without error as
// soon as all local resolve commands have been **proposed** (not
// executed). This ensures that if a waiting client retries
// immediately after calling this function, it will not hit the same
// intents again.
//
// The poison flag is copied verbatim into every resolve request that is
// built. Errors from work that ended up running asynchronously are only
// logged, never returned.
func (ir *intentResolver) resolveIntents(ctx context.Context, r *Replica,
	intents []roachpb.Intent, wait bool, poison bool) error {
	// We're doing async stuff below; those need new traces.
	ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer())
	defer cleanup()
	log.Trace(ctx, fmt.Sprintf("resolving intents [wait=%t]", wait))

	// Partition the intents: requests for intents contained in r go into
	// baLocal, everything else into reqsRemote.
	var reqsRemote []roachpb.Request
	baLocal := roachpb.BatchRequest{}
	baLocal.Timestamp = ir.store.Clock().Now()
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		var local bool // whether this intent lives on this Range
		{
			// An empty EndKey denotes a point intent; otherwise the intent
			// covers a key range and needs the ranged request type.
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if local {
			baLocal.Add(resolveArgs)
		} else {
			reqsRemote = append(reqsRemote, resolveArgs)
		}
	}

	// The local batch goes directly to Raft.
	var wg sync.WaitGroup
	if len(baLocal.Requests) > 0 {
		action := func() error {
			// Trace this under the ID of the intent owner.
			// Create a new span though, since we do not want to pass a span
			// between goroutines or we risk use-after-finish.
			sp := r.store.Tracer().StartSpan("resolve intents")
			defer sp.Finish()
			// NOTE(review): this assigns to the ctx variable of the enclosing
			// function, and action may run on another goroutine. Nothing after
			// this point in the enclosing function reads ctx, but a local
			// variable would be safer - confirm before relying on ctx later.
			ctx = opentracing.ContextWithSpan(ctx, sp)
			// Always operate with a timeout when resolving intents: this
			// prevents rare shutdown timeouts in tests.
			ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
			defer cancel()
			// NOTE(review): wg is handed to addWriteCmd, which presumably
			// calls Done once the command has been proposed - confirm; the
			// wg.Wait() at the end of this function depends on it.
			_, pErr := r.addWriteCmd(ctxWithTimeout, baLocal, &wg)
			return pErr.GoError()
		}
		wg.Add(1)
		// Run synchronously when wait is set or when the async task could not
		// be started; otherwise the async task only logs failures.
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve local intents; %s", err)
			}
		}) {
			// Still run the task when draining. Our caller already has a task and
			// going async here again is merely for performance, but some intents
			// need to be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes back.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Resolve all of the intents which aren't local to the Range.
	if len(reqsRemote) > 0 {
		b := &client.Batch{}
		b.InternalAddRequest(reqsRemote...)
		action := func() error {
			// TODO(tschottdorf): no tracing here yet.
			return r.store.DB().Run(b).GoError()
		}
		// Same sync/async decision as for the local batch above.
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve external intents: %s", err)
			}
		}) {
			// As with local intents, try async to not keep the caller waiting, but
			// when draining just go ahead and do it synchronously. See #1684.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Wait until the local ResolveIntents batch has been submitted to
	// raft. No-op if all were non-local.
	wg.Wait()
	return nil
}
// SnapshotWithContext is main implementation for Snapshot() but it takes a
// context to allow tracing.
//
// Snapshots are generated by an asynchronous task whose result is delivered
// through r.mu.snapshotChan. A call either picks up a finished snapshot from
// a previously started task, or starts a new task and (if
// BlockingSnapshotDuration is positive) waits up to that long for its result.
// Whenever no snapshot can be returned, the result is an empty snapshot
// together with raft.ErrSnapshotTemporarilyUnavailable.
//
// NOTE(review): fields under r.mu are read and written here without taking a
// lock, and a helper with a "Locked" suffix is called; presumably the caller
// must hold the replica mutex - confirm.
func (r *Replica) SnapshotWithContext(ctx context.Context) (raftpb.Snapshot, error) {
	rangeID := r.RangeID

	// If a snapshot is in progress, see if it's ready.
	if r.mu.snapshotChan != nil {
		select {
		case snapData, ok := <-r.mu.snapshotChan:
			if ok {
				return snapData, nil
			}
			// If the old channel was closed, fall through to start a new task.
		default:
			// If the result is not ready, return immediately.
			log.Trace(ctx, "snapshot not yet ready")
			return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
		}
	}

	// Refuse to snapshot replicas that have grown too large.
	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx, "%s: not generating snapshot because replica is too large: %d > 2 * %d", r, size, maxBytes)
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	if !r.store.AcquireRaftSnapshot() {
		log.Trace(ctx, "snapshot already running")
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// Captured here so the async task does not read r.mu itself.
	startKey := r.mu.state.Desc.StartKey

	// Use an unbuffered channel so the worker stays alive until someone
	// reads from the channel, and can abandon the snapshot if it gets stale.
	ch := make(chan (raftpb.Snapshot))
	if r.store.Stopper().RunAsyncTask(func() {
		// Deferred calls run in reverse order: the snapshot slot is released
		// first, then the engine snapshot is closed, the span finished, and
		// finally the channel is closed.
		defer close(ch)
		sp := r.store.Tracer().StartSpan("snapshot async")
		ctxInner := opentracing.ContextWithSpan(context.Background(), sp)
		defer sp.Finish()
		snap := r.store.NewSnapshot()
		log.Tracef(ctxInner, "new engine snapshot for replica %s", r)
		defer snap.Close()
		defer r.store.ReleaseRaftSnapshot()
		// Delegate to a static function to make sure that we do not depend
		// on any indirect calls to r.store.Engine() (or other in-memory
		// state of the Replica). Everything must come from the snapshot.
		// NOTE(review): snapshot() is given context.Background() rather than
		// ctxInner, so it does not run under the span created above - confirm
		// this is intended.
		snapData, err := snapshot(context.Background(), snap, rangeID, r.store.raftEntryCache, startKey)
		if err != nil {
			log.Errorf(ctxInner, "%s: error generating snapshot: %s", r, err)
		} else {
			log.Trace(ctxInner, "snapshot generated")
			r.store.metrics.RangeSnapshotsGenerated.Inc(1)
			// Hand the snapshot over, giving up if nobody takes it in time
			// or the stopper signals quiescence.
			select {
			case ch <- snapData:
				log.Trace(ctxInner, "snapshot accepted")
			case <-time.After(r.store.ctx.AsyncSnapshotMaxAge):
				// If raft decides it doesn't need this snapshot any more (or
				// just takes too long to use it), abandon it to save memory.
				log.Infof(ctxInner, "%s: abandoning snapshot after %s", r, r.store.ctx.AsyncSnapshotMaxAge)
			case <-r.store.Stopper().ShouldQuiesce():
			}
		}
	}) == nil {
		// The task was started; publish its channel for this and later calls.
		r.mu.snapshotChan = ch
	} else {
		// The task did not start, so release the slot acquired above.
		r.store.ReleaseRaftSnapshot()
	}

	// Optionally wait a bounded amount of time for the result.
	if r.store.ctx.BlockingSnapshotDuration > 0 {
		select {
		case snap, ok := <-r.mu.snapshotChan:
			if ok {
				return snap, nil
			}
		case <-time.After(r.store.ctx.BlockingSnapshotDuration):
			log.Trace(ctx, "snapshot blocking duration exceeded")
		}
	}
	return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
}
// Batch implements the roachpb.KVServer interface. func (n *Node) Batch(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) { // TODO(marc): this code is duplicated in kv/db.go, which should be fixed. // Also, grpc's authentication model (which gives credential access in the // request handler) doesn't really fit with the current design of the // security package (which assumes that TLS state is only given at connection // time) - that should be fixed. if peer, ok := peer.FromContext(ctx); ok { if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok { certUser, err := security.GetCertificateUser(&tlsInfo.State) if err != nil { return nil, err } if certUser != security.NodeUser { return nil, util.Errorf("user %s is not allowed", certUser) } } } var br *roachpb.BatchResponse opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here fail := func(err error) { br = &roachpb.BatchResponse{} br.Error = roachpb.NewError(err) } f := func() { sp, err := tracing.JoinOrNew(n.ctx.Tracer, args.Trace, opName) if err != nil { fail(err) return } // If this is a snowball span, it gets special treatment: It skips the // regular tracing machinery, and we instead send the collected spans // back with the response. This is more expensive, but then again, // those are individual requests traced by users, so they can be. 
if sp.BaggageItem(tracing.Snowball) != "" { sp.LogEvent("delegating to snowball tracing") sp.Finish() if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) { encSp, err := tracing.EncodeRawSpan(&rawSpan, nil) if err != nil { log.Warning(err) } br.CollectedSpans = append(br.CollectedSpans, encSp) }); err != nil { fail(err) return } } defer sp.Finish() traceCtx := opentracing.ContextWithSpan(n.context(ctx), sp) tStart := timeutil.Now() var pErr *roachpb.Error br, pErr = n.stores.Send(traceCtx, *args) if pErr != nil { br = &roachpb.BatchResponse{} log.Trace(traceCtx, fmt.Sprintf("error: %T", pErr.GetDetail())) } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(n.stores, br)) } n.metrics.callComplete(timeutil.Since(tStart), pErr) br.Error = pErr } if !n.stopper.RunTask(f) { return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID) } return br, nil }
// Send implements the batch.Sender interface.
//
// The request is first tagged with tracing metadata (a span is started if the
// context does not carry one). For transactional batches (ba.Txn != nil) the
// sender then:
//   - calls maybeBeginTxn on the batch;
//   - validates an EndTransaction request, if present (it must not carry a
//     Key or IntentSpans; the Key is filled in from the transaction);
//   - under the coordinator lock, gives maybeRejectClientLocked a chance to
//     refuse the request and, when committing, populates et.IntentSpans from
//     the spans already recorded in tc.txns plus those written by this batch.
//
// The batch is then forwarded to the wrapped sender and the outcome is run
// through updateState. After a successful EndTransaction the call may sleep
// before returning (when tc.linearizable is set), and the transaction is
// cleaned up if it is no longer PENDING.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		if sp == nil {
			sp = tc.tracer.StartSpan(opTxnCoordSender)
			// Deferred to the end of Send (not of this inner block), so the
			// span covers the whole call.
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		}
		if err := tc.tracer.Inject(sp.Context(), basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	// Start of the request as seen by this coordinator; handed to updateState
	// and reused below for the linearizability wait.
	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		// The closure scopes the coordinator lock: it is released (via defer)
		// on every return path before the request is sent.
		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[*ba.Txn.ID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			ba.Header.DistinctSpans = roachpb.MergeSpans(&et.IntentSpans) && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				// Store the merged spans back on the tracked transaction.
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		// pErr may be nil here; GetDetail is called on it regardless.
		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Trace(ctx, fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	// Non-transactional response: nothing further to do.
	if br.Txn == nil {
		return br, nil
	}

	// The remainder applies only to batches that ended the transaction.
	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		// Deferred so that the cleanup below happens before the wait.
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}
// Start starts the test cluster by bootstrapping an in-memory store // (defaults to maximum of 50M). The server is started, launching the // node RPC server and all HTTP endpoints. Use the value of // TestServer.Addr after Start() for client connections. Use Stop() // to shutdown the server after the test completes. func (ltc *LocalTestCluster) Start(t util.Tester) { nodeID := roachpb.NodeID(1) nodeDesc := &roachpb.NodeDescriptor{NodeID: nodeID} ltc.tester = t ltc.Manual = hlc.NewManualClock(0) ltc.Clock = hlc.NewClock(ltc.Manual.UnixNano) ltc.Stopper = stop.NewStopper() rpcContext := rpc.NewContext(testutils.NewNodeTestBaseContext(), ltc.Clock, ltc.Stopper) ltc.Gossip = gossip.New(rpcContext, nil, ltc.Stopper) ltc.Eng = engine.NewInMem(roachpb.Attributes{}, 50<<20, ltc.Stopper) ltc.stores = storage.NewStores(ltc.Clock) tracer := tracing.NewTracer() var rpcSend rpcSendFn = func(_ SendOptions, _ ReplicaSlice, args roachpb.BatchRequest, _ *rpc.Context) (*roachpb.BatchResponse, error) { if ltc.Latency > 0 { time.Sleep(ltc.Latency) } sp := tracer.StartSpan("node") defer sp.Finish() ctx := opentracing.ContextWithSpan(context.Background(), sp) log.Trace(ctx, args.String()) br, pErr := ltc.stores.Send(ctx, args) if br == nil { br = &roachpb.BatchResponse{} } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(ltc.stores, br)) } br.Error = pErr if pErr != nil { log.Trace(ctx, "error: "+pErr.String()) } return br, nil } retryOpts := GetDefaultDistSenderRetryOptions() retryOpts.Closer = ltc.Stopper.ShouldDrain() ltc.distSender = NewDistSender(&DistSenderContext{ Clock: ltc.Clock, RangeDescriptorCacheSize: defaultRangeDescriptorCacheSize, RangeLookupMaxRanges: defaultRangeLookupMaxRanges, LeaderCacheSize: defaultLeaderCacheSize, RPCRetryOptions: &retryOpts, nodeDescriptor: nodeDesc, RPCSend: rpcSend, // defined above RangeDescriptorDB: ltc.stores, // for descriptor lookup }, ltc.Gossip) ltc.Sender = NewTxnCoordSender(ltc.distSender, ltc.Clock, false /* !linearizable */, 
tracer, ltc.Stopper, NewTxnMetrics(metric.NewRegistry())) ltc.DB = client.NewDB(ltc.Sender) transport := storage.NewDummyRaftTransport() ctx := storage.TestStoreContext() ctx.Clock = ltc.Clock ctx.DB = ltc.DB ctx.Gossip = ltc.Gossip ctx.Transport = transport ctx.Tracer = tracer ltc.Store = storage.NewStore(ctx, ltc.Eng, nodeDesc) if err := ltc.Store.Bootstrap(roachpb.StoreIdent{NodeID: nodeID, StoreID: 1}, ltc.Stopper); err != nil { t.Fatalf("unable to start local test cluster: %s", err) } ltc.stores.AddStore(ltc.Store) if err := ltc.Store.BootstrapRange(nil); err != nil { t.Fatalf("unable to start local test cluster: %s", err) } if err := ltc.Store.Start(ltc.Stopper); err != nil { t.Fatalf("unable to start local test cluster: %s", err) } ltc.Gossip.SetNodeID(nodeDesc.NodeID) if err := ltc.Gossip.SetNodeDescriptor(nodeDesc); err != nil { t.Fatalf("unable to set node descriptor: %s", err) } }
// Send sends one or more RPCs to clients specified by the slice of // replicas. On success, Send returns the first successful reply. Otherwise, // Send returns an error if and as soon as the number of failed RPCs exceeds // the available endpoints less the number of required replies. func send(opts SendOptions, replicas ReplicaSlice, args roachpb.BatchRequest, rpcContext *rpc.Context) (*roachpb.BatchResponse, error) { if len(replicas) < 1 { return nil, roachpb.NewSendError( fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d", len(replicas), 1), false) } done := make(chan batchCall, len(replicas)) clients := make([]batchClient, 0, len(replicas)) for _, replica := range replicas { conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String()) if err != nil { return nil, err } argsCopy := args argsCopy.Replica = replica.ReplicaDescriptor clients = append(clients, batchClient{ remoteAddr: replica.NodeDesc.Address.String(), conn: conn, client: roachpb.NewInternalClient(conn), args: argsCopy, }) } // Put known-unhealthy clients last. nHealthy, err := splitHealthy(clients) if err != nil { return nil, err } var orderedClients []batchClient switch opts.Ordering { case orderStable: orderedClients = clients case orderRandom: // Randomly permute order, but keep known-unhealthy clients last. shuffleClients(clients[:nHealthy]) shuffleClients(clients[nHealthy:]) orderedClients = clients } // TODO(spencer): going to need to also sort by affinity; closest // ping time should win. Makes sense to have the rpc client/server // heartbeat measure ping times. With a bit of seasoning, each // node will be able to order the healthy replicas based on latency. // Send the first request. sendOneFn(opts, rpcContext, orderedClients[0], done) orderedClients = orderedClients[1:] var errors, retryableErrors int // Wait for completions. 
var sendNextTimer util.Timer defer sendNextTimer.Stop() for { sendNextTimer.Reset(opts.SendNextTimeout) select { case <-sendNextTimer.C: sendNextTimer.Read = true // On successive RPC timeouts, send to additional replicas if available. if len(orderedClients) > 0 { log.Trace(opts.Context, "timeout, trying next peer") sendOneFn(opts, rpcContext, orderedClients[0], done) orderedClients = orderedClients[1:] } case call := <-done: err := call.err if err == nil { if log.V(2) { log.Infof("successful reply: %+v", call.reply) } return call.reply, nil } // Error handling. if log.V(1) { log.Warningf("error reply: %s", err) } errors++ // Since we have a reconnecting client here, disconnect errors are retryable. disconnected := err == io.ErrUnexpectedEOF if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) { retryableErrors++ } if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 { return nil, roachpb.NewSendError( fmt.Sprintf("too many errors encountered (%d of %d total): %v", errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1) } // Send to additional replicas if available. if len(orderedClients) > 0 { log.Trace(opts.Context, "error, trying next peer") sendOneFn(opts, rpcContext, orderedClients[0], done) orderedClients = orderedClients[1:] } } } }
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
//
// startNS is the coordinator-side start time of the request; ba and br are
// the request and (possibly nil on error) response; pErr is the error
// returned by the wrapped sender. The returned error is pErr, a shallow copy
// of it carrying the updated transaction, or a replacement error (missing
// observed timestamp, or the stopper refusing to start the heartbeat task).
// On success br.Txn is overwritten with the updated transaction.
//
// The function panics on states that it treats as impossible at this level
// (see the individual panic messages).
func (tc *TxnCoordSender) updateState(
	startNS int64, ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	// Build the updated transaction from the request's txn plus whatever
	// came back (from the response on success, from the error otherwise).
	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else {
		newTxn.Update(pErr.GetTxn())
	}

	// NOTE: the cleanupTxn calls deferred in this switch are registered
	// before the `defer tc.Unlock()` further down, so they run after the
	// coordinator lock has been released.
	switch t := pErr.GetDetail().(type) {
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(ctx, *pErr.GetTxn())
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(ctx, *newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp)
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		// Increase timestamp so on restart, we're ahead of any timestamp
		// cache entries or newer versions which caused the restart.
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	case *roachpb.WriteTooOldError:
		// Restart at the timestamp reported by the error.
		newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp)
	case nil:
		// Nothing to do here, avoid the default case.
	default:
		if pErr.GetTxn() != nil {
			if pErr.CanRetry() {
				panic("Retryable internal error must not happen at this level")
			} else {
				// Do not clean up the transaction here since the client might still
				// want to continue the transaction. For example, a client might
				// continue its transaction after receiving ConditionFailedError, which
				// can come from a unique index violation.
			}
		}
	}

	if pErr != nil && pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and then there are races. Fairly sure there isn't one
		// here, but better safe than sorry.
		pErrShallow := *pErr
		pErrShallow.SetTxn(newTxn)
		pErr = &pErrShallow
	}

	// Without a transaction ID there is no coordinator state to update.
	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	var intentGroup interval.RangeGroup
	if txnMeta != nil {
		intentGroup = txnMeta.keys
	} else if pErr == nil || newTxn.Writing {
		intentGroup = interval.NewRangeTree()
	}
	if intentGroup != nil {
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
			addKeyRange(intentGroup, key, endKey)
		})

		if txnMeta == nil && intentGroup.Len() > 0 {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				log.Trace(ctx, "coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             intentGroup,
					firstUpdateNanos: startNS,
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(ctx, txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			} else {
				// If this was a successful one phase commit, update stats
				// directly as they won't otherwise be updated on heartbeat
				// loop shutdown.
				// This branch is only reached with pErr == nil and an
				// EndTransaction in the batch; the last response is checked
				// for being the EndTransaction response.
				etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse)
				tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit)
			}
		}
	}

	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
	}
	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}
	return pErr
}
// initStores initializes the Stores map from ID to Store. Stores are // added to the local sender if already bootstrapped. A bootstrapped // Store has a valid ident with cluster, node and Store IDs set. If // the Store doesn't yet have a valid ident, it's added to the // bootstraps list for initialization once the cluster and node IDs // have been determined. func (n *Node) initStores( ctx context.Context, engines []engine.Engine, stopper *stop.Stopper, ) error { var bootstraps []*storage.Store if len(engines) == 0 { return errors.Errorf("no engines") } for _, e := range engines { s := storage.NewStore(n.ctx, e, &n.Descriptor) log.Tracef(ctx, "created store for engine: %s", e) // Initialize each store in turn, handling un-bootstrapped errors by // adding the store to the bootstraps list. if err := s.Start(ctx, stopper); err != nil { if _, ok := err.(*storage.NotBootstrappedError); ok { log.Infof(ctx, "store %s not bootstrapped", s) bootstraps = append(bootstraps, s) continue } return errors.Errorf("failed to start store: %s", err) } if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 { return errors.Errorf("unidentified store: %s", s) } capacity, err := s.Capacity() if err != nil { return errors.Errorf("could not query store capacity: %s", err) } log.Infof(ctx, "initialized store %s: %+v", s, capacity) n.addStore(s) } // If there are no initialized stores and no gossip resolvers, // bootstrap this node as the seed of a new cluster. if n.stores.GetStoreCount() == 0 { resolvers := n.ctx.Gossip.GetResolvers() // Check for the case of uninitialized node having only itself specified as join host. switch len(resolvers) { case 0: return errNeedsBootstrap case 1: if resolvers[0].Addr() == n.Descriptor.Address.String() { return errCannotJoinSelf } } } // Verify all initialized stores agree on cluster and node IDs. 
if err := n.validateStores(); err != nil { return err } log.Trace(ctx, "validated stores") // Set the stores map as the gossip persistent storage, so that // gossip can bootstrap using the most recently persisted set of // node addresses. if err := n.ctx.Gossip.SetStorage(n.stores); err != nil { return fmt.Errorf("failed to initialize the gossip interface: %s", err) } // Connect gossip before starting bootstrap. For new nodes, connecting // to the gossip network is necessary to get the cluster ID. n.connectGossip() log.Trace(ctx, "connected to gossip") // If no NodeID has been assigned yet, allocate a new node ID by // supplying 0 to initNodeID. if n.Descriptor.NodeID == 0 { n.initNodeID(0) n.initialBoot = true log.Tracef(ctx, "allocated node ID %d", n.Descriptor.NodeID) } // Bootstrap any uninitialized stores asynchronously. if len(bootstraps) > 0 { if err := stopper.RunAsyncTask(func() { n.bootstrapStores(n.Ctx(), bootstraps, stopper) }); err != nil { return err } } return nil }
// sendToReplicas sends one or more RPCs to clients specified by the slice of // replicas. On success, Send returns the first successful reply. Otherwise, // Send returns an error if and as soon as the number of failed RPCs exceeds // the available endpoints less the number of required replies. func (ds *DistSender) sendToReplicas( opts SendOptions, rangeID roachpb.RangeID, replicas ReplicaSlice, args roachpb.BatchRequest, rpcContext *rpc.Context, ) (*roachpb.BatchResponse, error) { if len(replicas) < 1 { return nil, roachpb.NewSendError( fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d", len(replicas), 1)) } done := make(chan BatchCall, len(replicas)) transportFactory := opts.transportFactory if transportFactory == nil { transportFactory = grpcTransportFactory } transport, err := transportFactory(opts, rpcContext, replicas, args) if err != nil { return nil, err } defer transport.Close() if transport.IsExhausted() { return nil, roachpb.NewSendError( fmt.Sprintf("sending to all %d replicas failed", len(replicas))) } // Send the first request. pending := 1 transport.SendNext(done) // Wait for completions. This loop will retry operations that fail // with errors that reflect per-replica state and may succeed on // other replicas. var sendNextTimer timeutil.Timer defer sendNextTimer.Stop() for { sendNextTimer.Reset(opts.SendNextTimeout) select { case <-sendNextTimer.C: sendNextTimer.Read = true // On successive RPC timeouts, send to additional replicas if available. 
if !transport.IsExhausted() { log.Trace(opts.Context, "timeout, trying next peer") pending++ transport.SendNext(done) } case call := <-done: pending-- err := call.Err if err == nil { if log.V(2) { log.Infof(opts.Context, "RPC reply: %+v", call.Reply) } else if log.V(1) && call.Reply.Error != nil { log.Infof(opts.Context, "application error: %s", call.Reply.Error) } if !ds.handlePerReplicaError(rangeID, call.Reply.Error) { return call.Reply, nil } // Extract the detail so it can be included in the error // message if this is our last replica. // // TODO(bdarnell): The last error is not necessarily the best // one to return; we may want to remember the "best" error // we've seen (for example, a NotLeaseHolderError conveys more // information than a RangeNotFound). err = call.Reply.Error.GoError() } else if log.V(1) { log.Warningf(opts.Context, "RPC error: %s", err) } // Send to additional replicas if available. if !transport.IsExhausted() { log.Tracef(opts.Context, "error, trying next peer: %s", err) pending++ transport.SendNext(done) } if pending == 0 { return nil, roachpb.NewSendError( fmt.Sprintf("sending to all %d replicas failed; last error: %v", len(replicas), err)) } } } }
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// In order, it: opens the RPC/pgwire listener (s.ctx.Addr) and the HTTP
// listener (s.ctx.HTTPAddr), launches the serving workers on the stopper,
// optionally serves pgwire on a unix socket, starts gossip and the node,
// starts metrics/status recording and the schema change manager, begins
// accepting connections on the multiplexed RPC port, and finally wires up the
// grpc-gateway and UI HTTP handlers before signalling readiness via sdnotify.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation, and is different from
// contexts used at runtime for server's background work (like `s.Ctx()`).
func (s *Server) Start(ctx context.Context) error {
	// Copy log tags from s.Ctx()
	ctx = log.WithLogTagsFromCtx(ctx, s.Ctx())

	tlsConfig, err := s.ctx.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	// Answers every request with a permanent redirect to the same host and
	// URI under https; used for cleartext HTTP when TLS is configured.
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. The main port (s.ctx.Addr) serves pgwire and
	// gRPC via the following construction:
	//
	//   net.Listen -> cmux.New
	//                   |
	//                   - -> pgwire.Match -> pgwire.Server.ServeConn
	//                   - -> cmux.Any     -> grpc.(*Server).Serve
	//
	// HTTP is served from a separate listener (s.ctx.HTTPAddr). When a TLS
	// config is present that listener is itself multiplexed:
	//
	//   net.Listen -> cmux.New
	//                   |
	//                   - -> cmux.HTTP1 -> plainRedirectServer (redirect to https)
	//                   - -> cmux.Any   -> tls.NewListener -> httpServer
	//
	// NOTE(review): an earlier revision of this comment showed identical TLS
	// and non-TLS diagrams and attributed a difference to Go's lack of h2c;
	// the construction on the main port here does not depend on TLS.

	ln, err := net.Listen("tcp", s.ctx.Addr)
	if err != nil {
		return err
	}
	log.Tracef(ctx, "listening on port %s", s.ctx.Addr)
	unresolvedAddr, err := officialAddr(s.ctx.Addr, ln.Addr())
	if err != nil {
		return err
	}
	// Record the address derived from the actual listener.
	s.ctx.Addr = unresolvedAddr.String()
	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.ctx.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.ctx.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.ctx.HTTPAddr = unresolvedHTTPAddr.String()

	// Close the HTTP listener once the stopper begins quiescing.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(s.Ctx(), err)
		}
	})

	if tlsConfig != nil {
		// Split the HTTP port: HTTP1 cleartext goes to the redirect server,
		// everything else is wrapped in TLS and handed to httpServer below.
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	// Shut down gRPC in two steps: stop accepting on quiesce, stop the
	// server once the stopper signals stop.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	// Serve pgwire connections matched on the main port; closed-connection
	// errors are not logged.
	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			if err := s.pgServer.ServeConn(conn); err != nil && !netutil.IsClosedConnection(err) {
				log.Error(s.Ctx(), err)
			}
		}))
	})

	if len(s.ctx.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.ctx.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(s.Ctx(), err)
			}
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				if err := s.pgServer.ServeConn(conn); err != nil && !netutil.IsClosedConnection(err) {
					log.Error(s.Ctx(), err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAddr)
	log.Trace(ctx, "started gossip")

	if err := s.node.start(ctx, unresolvedAddr, s.ctx.Engines, s.ctx.NodeAttributes); err != nil {
		return err
	}
	log.Trace(ctx, "started node")

	// Set the NodeID in the base context (which was inherited by the
	// various components of the server).
	s.nodeLogTagVal.Set(int64(s.node.Descriptor.NodeID))

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.ctx.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(s.recorder, s.ctx.MetricsSampleInterval, ts.Resolution10s, s.stopper)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.ctx.MetricsSampleInterval)

	s.sqlExecutor.SetNodeID(s.node.Descriptor.NodeID)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := new(sql.SchemaChangeManagerTestingKnobs)
	if s.ctx.TestingKnobs.SQLSchemaChangeManager != nil {
		testingKnobs = s.ctx.TestingKnobs.SQLSchemaChangeManager.(*sql.SchemaChangeManagerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	log.Infof(s.Ctx(), "starting %s server at %s", s.ctx.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(s.Ctx(), "starting grpc/postgres server at %s", unresolvedAddr)
	if len(s.ctx.SocketFile) != 0 {
		log.Infof(s.Ctx(), "starting postgres server at unix:%s", s.ctx.SocketFile)
	}

	// Only now start the multiplexer on the main port; the pgwire and gRPC
	// listeners registered on it above are served from this point on.
	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})
	log.Trace(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &util.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       " ",
	}
	protopb := new(util.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(util.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(util.AltProtoContentType, protopb),
	)
	// The gateway context is cancelled when the stopper runs its closers.
	gwCtx, gwCancel := context.WithCancel(s.Ctx())
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	// The gateway dials this server's own RPC address.
	conn, err := s.rpcContext.GRPCDial(s.ctx.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{&s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	// UI assets come from the local "ui" directory when COCKROACH_DEBUG_UI
	// is set, and from the embedded asset bundle otherwise.
	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Map the root path onto the UI's entry page.
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminEndpoint, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, s.status)
	s.mux.Handle(healthEndpoint, s.status)
	log.Trace(ctx, "added http endpoints")

	// A failure to notify systemd is logged but does not fail startup.
	if err := sdnotify.Ready(); err != nil {
		log.Errorf(s.Ctx(), "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Trace(ctx, "server ready")

	return nil
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool { tc.Lock() proceed := true txnMeta := tc.txns[txnID] var intentSpans []roachpb.Span // Before we send a heartbeat, determine whether this transaction // should be considered abandoned. If so, exit heartbeat. if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) { // TODO(tschottdorf): should we be more proactive here? // The client might be continuing the transaction // through another coordinator, but in the most likely // case it's just gone and the open transaction record // could block concurrent operations. if log.V(1) { log.Infof("transaction %s abandoned; stopping heartbeat", txnMeta.txn) } proceed = false // Grab the intents here to avoid potential race. intentSpans = collectIntentSpans(txnMeta.keys) txnMeta.keys.Clear() } // txnMeta.txn is possibly replaced concurrently, // so grab a copy before unlocking. txn := txnMeta.txn.Clone() tc.Unlock() ba := roachpb.BatchRequest{} ba.Txn = &txn if !proceed { // Actively abort the transaction and its intents since we assume it's abandoned. et := &roachpb.EndTransactionRequest{ Span: roachpb.Span{ Key: txn.Key, }, Commit: false, IntentSpans: intentSpans, } ba.Add(et) tc.stopper.RunAsyncTask(func() { // Use the wrapped sender since the normal Sender // does not allow clients to specify intents. // TODO(tschottdorf): not using the existing context here since that // leads to use-after-finish of the contained trace. Should fork off // before the goroutine. if _, pErr := tc.wrapped.Send(context.Background(), ba); pErr != nil { if log.V(1) { log.Warningf("abort due to inactivity failed for %s: %s ", txn, pErr) } } }) return false } hb := &roachpb.HeartbeatTxnRequest{ Now: tc.clock.Now(), } hb.Key = txn.Key ba.Add(hb) log.Trace(ctx, "heartbeat") _, err := tc.wrapped.Send(ctx, ba) // If the transaction is not in pending state, then we can stop // the heartbeat. It's either aborted or committed, and we resolve // write intents accordingly. 
if err != nil { log.Warningf("heartbeat to %s failed: %s", txn, err) } // TODO(bdarnell): once we have gotten a heartbeat response with // Status != PENDING, future heartbeats are useless. However, we // need to continue the heartbeatLoop until the client either // commits or abandons the transaction. We could save a little // pointless work by restructuring this loop to stop sending // heartbeats between the time that the transaction is aborted and // the client finds out. Furthermore, we could use this information // to send TransactionAbortedErrors to the client so it can restart // immediately instead of running until its EndTransaction. return true }
// lookupRangeDescriptorInternal is called from LookupRangeDescriptor or from tests. // // If a WaitGroup is supplied, it is signaled when the request is // added to the inflight request map (with or without merging) or the // function finishes. Used for testing. func (rdc *rangeDescriptorCache) lookupRangeDescriptorInternal( ctx context.Context, key roachpb.RKey, evictToken *evictionToken, considerIntents bool, useReverseScan bool, wg *sync.WaitGroup, ) (*roachpb.RangeDescriptor, *evictionToken, error) { rdc.rangeCache.RLock() doneWg := func() { if wg != nil { wg.Done() } wg = nil } defer doneWg() if _, desc, err := rdc.getCachedRangeDescriptorLocked(key, useReverseScan); err != nil { rdc.rangeCache.RUnlock() return nil, nil, err } else if desc != nil { rdc.rangeCache.RUnlock() returnToken := rdc.makeEvictionToken(desc, func() error { return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan) }) log.Trace(ctx, "looked up range descriptor from cache") return desc, returnToken, nil } if log.V(3) { log.Infof(ctx, "lookup range descriptor: key=%s\n%s", key, rdc.stringLocked()) } else if log.V(2) { log.Infof(ctx, "lookup range descriptor: key=%s", key) } var res lookupResult requestKey := makeLookupRequestKey(key, evictToken, considerIntents, useReverseScan) rdc.lookupRequests.Lock() if req, inflight := rdc.lookupRequests.inflight[requestKey]; inflight { resC := make(chan lookupResult, 1) req.observers = append(req.observers, resC) rdc.lookupRequests.inflight[requestKey] = req rdc.lookupRequests.Unlock() rdc.rangeCache.RUnlock() doneWg() res = <-resC log.Trace(ctx, "looked up range descriptor with shared request") } else { rdc.lookupRequests.inflight[requestKey] = req rdc.lookupRequests.Unlock() rdc.rangeCache.RUnlock() doneWg() rs, preRs, err := rdc.performRangeLookup(ctx, key, considerIntents, useReverseScan) if err != nil { res = lookupResult{err: err} } else { switch len(rs) { case 0: res = lookupResult{err: fmt.Errorf("no range descriptors returned for 
%s", key)} case 1: desc := &rs[0] res = lookupResult{ desc: desc, evictToken: rdc.makeEvictionToken(desc, func() error { return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan) }), } case 2: if !considerIntents { panic(fmt.Sprintf("more than 1 matching range descriptor returned for %s when not considering intents: %v", key, rs)) } desc := &rs[0] nextDesc := rs[1] res = lookupResult{ desc: desc, evictToken: rdc.makeEvictionToken(desc, func() error { return rdc.insertRangeDescriptorsLocked(nextDesc) }), } default: panic(fmt.Sprintf("more than 2 matching range descriptors returned for %s: %v", key, rs)) } } // We want to be assured that all goroutines which experienced a cache miss // have joined our in-flight request, and all others will experience a // cache hit. This requires atomicity across cache population and // notification, hence this exclusive lock. rdc.rangeCache.Lock() if res.err == nil { // These need to be separate because we need to preserve the pointer to rs[0] // so that the seenDesc logic works correctly in EvictCachedRangeDescriptor. An // append could cause a copy, which would change the address of rs[0]. We insert // the prefetched descriptors first to avoid any unintended overwriting. if err := rdc.insertRangeDescriptorsLocked(preRs...); err != nil { log.Warningf(ctx, "range cache inserting prefetched descriptors failed: %v", err) } if err := rdc.insertRangeDescriptorsLocked(rs...); err != nil { res = lookupResult{err: err} } } // rdc.lookupRequests does not need to be locked here because we hold an exclusive // write lock on rdc.rangeCache. However, we do anyway for clarity and future proofing. 
rdc.lookupRequests.Lock() for _, observer := range rdc.lookupRequests.inflight[requestKey].observers { observer <- res } delete(rdc.lookupRequests.inflight, requestKey) rdc.lookupRequests.Unlock() rdc.rangeCache.Unlock() log.Trace(ctx, "looked up range descriptor") } // It rarely may be possible that we somehow got grouped in with the // wrong RangeLookup (eg. from a double split), so if we did, return // a retryable lookupMismatchError with an unmodified eviction token. if res.desc != nil { if (!useReverseScan && !res.desc.ContainsKey(key)) || (useReverseScan && !res.desc.ContainsExclusiveEndKey(key)) { return nil, evictToken, lookupMismatchError{ desiredKey: key, mismatchedDesc: res.desc, } } } return res.desc, res.evictToken, res.err }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. 
ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken evictionToken var needAnother bool var pErr *roachpb.Error var finished bool for r := retry.Start(ds.rpcRetryOptions); r.Next(); { // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") desc, needAnother, evictToken, pErr = ds.getDescriptors(rs, evictToken, isReverse) // getDescriptors may fail retryably if the first range isn't // available via Gossip. if pErr != nil { log.Trace(ctx, "range descriptor lookup failed: "+pErr.String()) if pErr.Retryable { if log.V(1) { log.Warning(pErr) } continue } break } else { log.Trace(ctx, "looked up range descriptor") } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. 
Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { // This approach is needed because rs.EndKey is exclusive. return desc.ContainsKeyRange(desc.StartKey, rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } if log.V(1) { log.Warningf("failed to invoke %s: %s", ba, pErr) } log.Trace(ctx, fmt.Sprintf("reply error: %T", pErr.GetDetail())) // Error handling below. // If retryable, allow retry. For range not found or range // key mismatch errors, we don't backoff on the retry, // but reset the backoff loop so we can retry immediately. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // For an RPC error to occur, we must've been unable to contact // any replicas. In this case, likely all nodes are down (or // not getting back to us within a reasonable amount of time). // We may simply not be trying to talk to the up-to-date // replicas, so clearing the descriptor here should be a good // idea. 
if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } if tErr.CanRetry() { continue } case *roachpb.RangeNotFoundError: // Range descriptor might be out of date - evict it. This is // likely the result of a rebalance. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(tErr) } continue case *roachpb.NotLeaderError: newLeader := tErr.Leader if newLeader != nil { // Verify that leader is a known replica according to the // descriptor. If not, we've got a stale range descriptor; // evict cache. 
if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 { if log.V(1) { log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc) } if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } } } else { // If the new leader is unknown, we were talking to a // replica that is partitioned away from the majority. Our // range descriptor may be stale, so clear the cache. // // TODO(bdarnell): An unknown-leader error doesn't // necessarily mean our descriptor is stale. Ideally we // would treat these errors more like SendError: retry on // another node (at a lower level), and then if it reaches // this level then we know we've exhausted our options and // must clear the cache. if err := evictToken.Evict(); err != nil { return nil, roachpb.NewError(err), false } newLeader = &roachpb.ReplicaDescriptor{} } // Next, cache the new leader. ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader) if log.V(1) { log.Warning(tErr) } r.Reset() continue case retry.Retryable: if tErr.CanRetry() { if log.V(1) { log.Warning(tErr) } continue } } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false default: log.Fatal("exited retry loop with nil error but finished=false") } } ba.Txn.Update(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if ba.MaxScanResults > 0 { // Count how many results we received. 
var numResults int64 for _, resp := range curReply.Responses { if cResp, ok := resp.GetInner().(roachpb.Countable); ok { numResults += cResp.Count() } } if numResults > ba.MaxScanResults { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults)) } ba.MaxScanResults -= numResults if ba.MaxScanResults == 0 { // We are done with this batch. Some requests might have NoopResponses; we must // replace them with empty responses of the proper type. for i, req := range ba.Requests { if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok { continue } union := roachpb.ResponseUnion{} var reply roachpb.Response if _, ok := req.GetInner().(*roachpb.ScanRequest); ok { reply = &roachpb.ScanResponse{} } else { _ = req.GetInner().(*roachpb.ReverseScanRequest) reply = &roachpb.ReverseScanResponse{} } union.MustSetInner(reply) br.Responses[i] = union } return br, nil, false } } // If this request has a bound (such as MaxResults in // ScanRequest) and we are going to query at least one more range, // check whether enough rows have been retrieved. // TODO(tschottdorf): need tests for executing a multi-range batch // with various bounded requests which saturate at different times. if needAnother { // Start with the assumption that all requests are saturated. // Below, we look at each and decide whether that's true. // Everything that is indeed saturated is "masked out" from the // batch request; only if that's all requests does needAnother // remain false. needAnother = false if br == nil { // Clone ba.Requests. This is because we're multi-range, and // some requests may be bounded, which could lead to them being // masked out once they're saturated. We don't want to risk // removing requests that way in the "master copy" since that // could lead to omitting requests in certain retry scenarios. ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...) 
} for i, union := range ba.Requests { args := union.GetInner() if _, ok := args.(*roachpb.NoopRequest); ok { // NoopRequests are skipped. continue } boundedArg, ok := args.(roachpb.Bounded) if !ok { // Non-bounded request. We will have to query all ranges. needAnother = true continue } prevBound := boundedArg.GetBound() cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable) if !ok || prevBound <= 0 { // Request bounded, but without max results. Again, will // need to query everything we can. The case in which the reply // isn't countable occurs when the request wasn't active for // that range (since it didn't apply to it), so the response // is a NoopResponse. needAnother = true continue } nextBound := prevBound - cReply.Count() if nextBound <= 0 { // We've hit max results for this piece of the batch. Mask // it out (we've copied the requests slice above, so this // is kosher). union := &ba.Requests[i] // avoid working on copy union.MustSetInner(&noopRequest) continue } // The request isn't saturated yet. needAnother = true boundedArg.SetBound(nextBound) } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. 
rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } log.Trace(ctx, "querying next range") } }
func (rq *replicateQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { desc := repl.Desc() // Find the zone config for this range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } action, _ := rq.allocator.ComputeAction(zone, desc) // Avoid taking action if the range has too many dead replicas to make // quorum. deadReplicas := rq.allocator.storePool.deadReplicas(repl.RangeID, desc.Replicas) quorum := computeQuorum(len(desc.Replicas)) liveReplicaCount := len(desc.Replicas) - len(deadReplicas) if liveReplicaCount < quorum { return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.") } switch action { case AllocatorAdd: log.Trace(ctx, "adding a new replica") newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true) if err != nil { return err } newReplica := roachpb.ReplicaDescriptor{ NodeID: newStore.Node.NodeID, StoreID: newStore.StoreID, } log.VTracef(1, ctx, "%s: adding replica to %+v due to under-replication", repl, newReplica) if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil { return err } case AllocatorRemove: log.Trace(ctx, "removing a replica") // We require the lease in order to process replicas, so // repl.store.StoreID() corresponds to the lease-holder's store ID. removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID()) if err != nil { return err } log.VTracef(1, ctx, "%s: removing replica %+v due to over-replication", repl, removeReplica) if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil { return err } // Do not requeue if we removed ourselves. 
if removeReplica.StoreID == repl.store.StoreID() { return nil } case AllocatorRemoveDead: log.Trace(ctx, "removing a dead replica") if len(deadReplicas) == 0 { if log.V(1) { log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl) } break } deadReplica := deadReplicas[0] log.VTracef(1, ctx, "%s: removing dead replica %+v from store", repl, deadReplica) if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil { return err } case AllocatorNoop: log.Trace(ctx, "considering a rebalance") // The Noop case will result if this replica was queued in order to // rebalance. Attempt to find a rebalancing target. // // We require the lease in order to process replicas, so // repl.store.StoreID() corresponds to the lease-holder's store ID. rebalanceStore := rq.allocator.RebalanceTarget( zone.ReplicaAttrs[0], desc.Replicas, repl.store.StoreID()) if rebalanceStore == nil { log.VTracef(1, ctx, "%s: no suitable rebalance target", repl) // No action was necessary and no rebalance target was found. Return // without re-queuing this replica. return nil } rebalanceReplica := roachpb.ReplicaDescriptor{ NodeID: rebalanceStore.Node.NodeID, StoreID: rebalanceStore.StoreID, } log.VTracef(1, ctx, "%s: rebalancing to %+v", repl, rebalanceReplica) if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil { return err } } // Enqueue this replica again to see if there are more changes to be made. rq.MaybeAdd(repl, rq.clock.Now()) return nil }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() // TODO(radu): when contexts are properly plumbed, we should be able to get // the tracer from ctx, not from the DistSender. ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx)) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. 
ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool var numAttempts int for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { numAttempts++ { const magicLogCurAttempt = 20 var seq int32 if ba.Txn != nil { seq = ba.Txn.Sequence } if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 { // Log a message if a request appears to get stuck for a long // time or, potentially, forever. See #8975. // The local counter captures this loop here; the Sequence number // should capture anything higher up (as it needs to be // incremented every time this method is called). log.Warningf( ctx, "%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s", numAttempts, seq, pErr, rs, ba, ) } } // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(ctx, err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. 
// If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. 
return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } log.VTracef(1, ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. 
r.Reset() if log.V(1) { log.Warning(ctx, tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false case <-ctx.Done(): return nil, roachpb.NewError(ctx.Err()), false default: log.Fatal(ctx, "exited retry loop with nil error but finished=false") } } ba.UpdateTxn(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } if ba.MaxSpanRequestKeys > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { numResults += resp.GetInner().Header().NumKeys } if numResults > ba.MaxSpanRequestKeys { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys)) } ba.MaxSpanRequestKeys -= numResults if ba.MaxSpanRequestKeys == 0 { // prepare the batch response after meeting the max key limit. 
fillSkippedResponses(ba, br, rs) // done, exit loop. return br, nil, false } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } // key cannot be less that the end key. if !rs.Key.Less(rs.EndKey) { panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey)) } log.Trace(ctx, "querying next range") } }
// process performs a consistent lookup on the range descriptor to see if we are // still a member of the range. func (q *replicaGCQueue) process( ctx context.Context, now hlc.Timestamp, rng *Replica, _ config.SystemConfig, ) error { // Note that the Replicas field of desc is probably out of date, so // we should only use `desc` for its static fields like RangeID and // StartKey (and avoid rng.GetReplica() for the same reason). desc := rng.Desc() // Calls to RangeLookup typically use inconsistent reads, but we // want to do a consistent read here. This is important when we are // considering one of the metadata ranges: we must not do an // inconsistent lookup in our own copy of the range. b := &client.Batch{} b.AddRawRequest(&roachpb.RangeLookupRequest{ Span: roachpb.Span{ Key: keys.RangeMetaKey(desc.StartKey), }, MaxRanges: 1, }) if err := q.db.Run(b); err != nil { return err } br := b.RawResponse() reply := br.Responses[0].GetInner().(*roachpb.RangeLookupResponse) if len(reply.Ranges) != 1 { return errors.Errorf("expected 1 range descriptor, got %d", len(reply.Ranges)) } replyDesc := reply.Ranges[0] if _, currentMember := replyDesc.GetReplicaDescriptor(rng.store.StoreID()); !currentMember { // We are no longer a member of this range; clean up our local data. if log.V(1) { log.Infof("destroying local data from range %d", desc.RangeID) } log.Trace(ctx, "destroying local data") if err := rng.store.RemoveReplica(rng, replyDesc, true); err != nil { return err } } else if desc.RangeID != replyDesc.RangeID { // If we get a different range ID back, then the range has been merged // away. But currentMember is true, so we are still a member of the // subsuming range. Shut down raft processing for the former range // and delete any remaining metadata, but do not delete the data. 
if log.V(1) { log.Infof("removing merged range %d", desc.RangeID) } log.Trace(ctx, "removing merged range") if err := rng.store.RemoveReplica(rng, replyDesc, false); err != nil { return err } // TODO(bdarnell): remove raft logs and other metadata (while leaving a // tombstone). Add tests for GC of merged ranges. } else { // This replica is a current member of the raft group. Set the last replica // GC check time to avoid re-processing for another check interval. if err := rng.setLastReplicaGCTimestamp(now); err != nil { return err } } return nil }