// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
func (bq *baseQueue) processReplica(
    queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
    bq.processMu.Lock()
    defer bq.processMu.Unlock()

    // Load the system config.
    cfg, ok := bq.gossip.GetSystemConfig()
    if !ok {
        log.VEventf(queueCtx, 1, "no system config available, skipping")
        return nil
    }

    if bq.requiresSplit(cfg, repl) {
        // Range needs to be split due to zone configs, but queue does
        // not accept unsplit ranges.
        log.VEventf(queueCtx, 3, "split needed; skipping")
        return nil
    }

    // Putting a span in a context means that events will no longer go to the
    // event log. Use queueCtx for events that are intended for the event log.
    ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
    defer span.Finish()
    // Also add the Replica annotations to ctx.
    ctx = repl.AnnotateCtx(ctx)
    ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
    defer cancel()
    log.Eventf(ctx, "processing replica")

    if err := repl.IsDestroyed(); err != nil {
        log.VEventf(queueCtx, 3, "replica destroyed (%s); skipping", err)
        return nil
    }

    // If the queue requires a replica to have the range lease in
    // order to be processed, check whether this replica has range lease
    // and renew or acquire if necessary.
    if bq.needsLease {
        // Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
        if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
            switch v := err.GetDetail().(type) {
            case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
                log.VEventf(queueCtx, 3, "%s; skipping", v)
                return nil
            default:
                return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
            }
        }
        log.Event(ctx, "got range lease")
    }

    log.VEventf(queueCtx, 3, "processing")
    if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
        return err
    }
    log.Event(ctx, "done")
    bq.successes.Inc(1)
    return nil
}
// maybeRejectClientLocked checks whether the (transactional) request is in a
// state that prevents it from continuing, such as the coordinator having
// considered the client abandoned, or a heartbeat having reported an error.
func (tc *TxnCoordSender) maybeRejectClientLocked(
    ctx context.Context, txn roachpb.Transaction,
) *roachpb.Error {
    if !txn.Writing {
        return nil
    }
    txnMeta, ok := tc.txns[*txn.ID]
    // Check whether the transaction is still tracked and has a chance of
    // completing. It's possible that the coordinator learns about the
    // transaction having terminated from a heartbeat, and GC queue correctness
    // (along with common sense) mandates that we don't let the client
    // continue.
    switch {
    case !ok:
        log.VEventf(ctx, 2, "rejecting unknown txn: %s", txn.ID)
        // TODO(spencerkimball): Could add coordinator node ID to the
        // transaction session so that we can definitively return the right
        // error between these possible errors. Or update the code to make an
        // educated guess based on the incoming transaction timestamp.
        return roachpb.NewError(errNoState)
    case txnMeta.txn.Status == roachpb.ABORTED:
        txn := txnMeta.txn.Clone()
        tc.cleanupTxnLocked(ctx, txn)
        return roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(), &txn)
    case txnMeta.txn.Status == roachpb.COMMITTED:
        txn := txnMeta.txn.Clone()
        tc.cleanupTxnLocked(ctx, txn)
        return roachpb.NewErrorWithTxn(roachpb.NewTransactionStatusError(
            "transaction is already committed"), &txn)
    default:
        return nil
    }
}
// process truncates the raft log of the range if the replica is the raft
// leader and if the total number of the range's raft log's stale entries
// exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) process(ctx context.Context, r *Replica, _ config.SystemConfig) error {
    truncatableIndexes, oldestIndex, err := getTruncatableIndexes(ctx, r)
    if err != nil {
        return err
    }

    // Can and should the raft logs be truncated?
    if truncatableIndexes >= RaftLogQueueStaleThreshold {
        r.mu.Lock()
        raftLogSize := r.mu.raftLogSize
        r.mu.Unlock()
        log.VEventf(ctx, 1, "truncating raft log %d-%d: size=%d",
            oldestIndex-truncatableIndexes, oldestIndex, raftLogSize)
        b := &client.Batch{}
        b.AddRawRequest(&roachpb.TruncateLogRequest{
            Span:    roachpb.Span{Key: r.Desc().StartKey.AsRawKey()},
            Index:   oldestIndex,
            RangeID: r.RangeID,
        })
        if err := rlq.db.Run(ctx, b); err != nil {
            return err
        }
        r.store.metrics.RaftLogTruncated.Inc(int64(truncatableIndexes))
    }
    return nil
}
func (ds *ServerImpl) flowStreamInt(ctx context.Context, stream DistSQL_FlowStreamServer) error {
    // Receive the first message.
    msg, err := stream.Recv()
    if err != nil {
        if err == io.EOF {
            return errors.Errorf("empty stream")
        }
        return err
    }
    if msg.Header == nil {
        return errors.Errorf("no header in first message")
    }
    flowID := msg.Header.FlowID
    streamID := msg.Header.StreamID
    if log.V(1) {
        log.Infof(ctx, "connecting inbound stream %s/%d", flowID.Short(), streamID)
    }
    f, streamInfo, err := ds.flowRegistry.ConnectInboundStream(flowID, streamID)
    if err != nil {
        return err
    }
    log.VEventf(ctx, 1, "connected inbound stream %s/%d", flowID.Short(), streamID)
    defer ds.flowRegistry.FinishInboundStream(streamInfo)
    return ProcessInboundStream(&f.FlowCtx, stream, msg, streamInfo.receiver)
}
// Heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness *Liveness) error {
    defer func(start time.Time) {
        if dur := timeutil.Now().Sub(start); dur > time.Second {
            log.Warningf(ctx, "slow heartbeat took %0.1fs", dur.Seconds())
        }
    }(timeutil.Now())

    // Allow only one heartbeat at a time.
    select {
    case nl.heartbeatSem <- struct{}{}:
    case <-ctx.Done():
        return ctx.Err()
    }
    defer func() {
        <-nl.heartbeatSem
    }()

    nodeID := nl.gossip.NodeID.Get()
    var newLiveness Liveness
    if liveness == nil {
        newLiveness = Liveness{
            NodeID: nodeID,
            Epoch:  1,
        }
    } else {
        newLiveness = *liveness
    }
    // We need to add the maximum clock offset to the expiration because it's
    // used when determining liveness for a node.
    newLiveness.Expiration = nl.clock.Now().Add(
        (nl.livenessThreshold + nl.clock.MaxOffset()).Nanoseconds(), 0)
    if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
        // Update liveness to actual value on mismatch.
        nl.mu.Lock()
        nl.mu.self = actual
        nl.mu.Unlock()
        // If the actual liveness is different than expected, but is
        // considered live, treat the heartbeat as a success. This can
        // happen when the periodic heartbeater races with a concurrent
        // lease acquisition.
        if actual.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
            return errNodeAlreadyLive
        }
        // Otherwise, return error.
        return errSkippedHeartbeat
    }); err != nil {
        if err == errNodeAlreadyLive {
            return nil
        }
        nl.metrics.HeartbeatFailures.Inc(1)
        return err
    }

    log.VEventf(ctx, 1, "heartbeat %+v", newLiveness.Expiration)
    nl.mu.Lock()
    nl.mu.self = newLiveness
    nl.mu.Unlock()
    nl.metrics.HeartbeatSuccesses.Inc(1)
    return nil
}
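The "allow only one heartbeat at a time" block above uses a buffered channel of capacity 1 as a semaphore whose acquisition also respects context cancellation. Below is a minimal, self-contained sketch of that generic Go pattern; the worker type and names are illustrative only, not part of NodeLiveness.

package main

import (
    "context"
    "fmt"
    "time"
)

// worker admits at most one call to do at a time via a 1-slot channel semaphore.
type worker struct {
    sem chan struct{}
}

func newWorker() *worker {
    return &worker{sem: make(chan struct{}, 1)}
}

// do acquires the semaphore (or gives up if ctx is canceled first), does its
// work, and releases the semaphore on the way out.
func (w *worker) do(ctx context.Context) error {
    select {
    case w.sem <- struct{}{}:
    case <-ctx.Done():
        return ctx.Err()
    }
    defer func() { <-w.sem }()

    time.Sleep(10 * time.Millisecond) // stand-in for the conditional-put work
    return nil
}

func main() {
    w := newWorker()
    fmt.Println(w.do(context.Background())) // <nil>
}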
// ReplicaInfo returns information about the replica that has been picked for
// the current range.
// A RangeUnavailableError is returned if there's no information in gossip
// about any of the replicas.
func (it *SpanResolverIterator) ReplicaInfo(ctx context.Context) (kv.ReplicaInfo, error) {
    if !it.Valid() {
        panic(it.Error())
    }

    resolvedLH := false
    var repl kv.ReplicaInfo
    if lh, ok := it.it.LeaseHolder(ctx); ok {
        repl.ReplicaDescriptor = lh
        // Fill in the node descriptor.
        nd, err := it.gossip.GetNodeDescriptor(repl.NodeID)
        if err != nil {
            // Ignore the error; ask the oracle to pick another replica below.
            log.VEventf(ctx, 2, "failed to resolve node %d: %s", repl.NodeID, err)
        } else {
            repl.NodeDesc = nd
            resolvedLH = true
        }
    }
    if !resolvedLH {
        leaseHolder, err := it.oracle.ChoosePreferredLeaseHolder(
            *it.it.Desc(), it.queryState)
        if err != nil {
            return kv.ReplicaInfo{}, err
        }
        repl = leaseHolder
    }
    it.queryState.rangesPerNode[repl.NodeID]++
    return repl, nil
}
func (rq *replicateQueue) transferLease(
    ctx context.Context,
    repl *Replica,
    desc *roachpb.RangeDescriptor,
    zone config.ZoneConfig,
    checkTransferLeaseSource bool,
) (bool, error) {
    candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
    if target := rq.allocator.TransferLeaseTarget(
        zone.Constraints,
        candidates,
        repl.store.StoreID(),
        desc.RangeID,
        checkTransferLeaseSource,
    ); target != (roachpb.ReplicaDescriptor{}) {
        rq.metrics.TransferLeaseCount.Inc(1)
        log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
        if err := repl.AdminTransferLease(ctx, target.StoreID); err != nil {
            return false, errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
        }
        rq.lastLeaseTransfer.Store(timeutil.Now())
        return true, nil
    }
    return false, nil
}
// IncrementEpoch is called to increment the current liveness epoch,
// thereby invalidating anything relying on the liveness of the
// previous epoch. This method does a conditional put on the node
// liveness record, and if successful, stores the updated liveness
// record in the nodes map. If this method is called on a node ID
// which is considered live according to the most recent information
// gathered through gossip, an error is returned.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *Liveness) error {
    if liveness.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
        return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
    }
    newLiveness := *liveness
    newLiveness.Epoch++
    if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
        if actual.Epoch > liveness.Epoch {
            newLiveness = actual
            return nil
        } else if actual.Epoch < liveness.Epoch {
            return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
        }
        nl.mu.Lock()
        defer nl.mu.Unlock()
        nl.mu.nodes[actual.NodeID] = actual
        return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
    }); err != nil {
        return err
    }

    log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d",
        newLiveness.NodeID, newLiveness.Epoch)
    nl.mu.Lock()
    defer nl.mu.Unlock()
    if nodeID := nl.gossip.NodeID.Get(); nodeID == liveness.NodeID {
        nl.mu.self = newLiveness
    } else {
        nl.mu.nodes[newLiveness.NodeID] = newLiveness
    }
    nl.metrics.EpochIncrements.Inc(1)
    return nil
}
// Run is part of the processor interface.
func (tr *tableReader) Run(wg *sync.WaitGroup) {
    if wg != nil {
        defer wg.Done()
    }

    ctx, span := tracing.ChildSpan(tr.ctx, "table reader")
    defer tracing.FinishSpan(span)

    txn := tr.flowCtx.setupTxn(ctx)

    log.VEventf(ctx, 1, "starting (filter: %s)", &tr.filter)
    if log.V(1) {
        defer log.Infof(ctx, "exiting")
    }

    if err := tr.fetcher.StartScan(
        txn, tr.spans, true /* limit batches */, tr.getLimitHint(),
    ); err != nil {
        log.Errorf(ctx, "scan error: %s", err)
        tr.output.Close(err)
        return
    }
    var rowIdx int64
    for {
        outRow, err := tr.nextRow()
        if err != nil || outRow == nil {
            tr.output.Close(err)
            return
        }
        if log.V(3) {
            log.Infof(ctx, "pushing row %s", outRow)
        }
        // Push the row to the output RowReceiver; stop if they don't need more
        // rows.
        if !tr.output.PushRow(outRow) {
            log.VEventf(ctx, 1, "no more rows required")
            tr.output.Close(nil)
            return
        }
        rowIdx++
        if tr.hardLimit != 0 && rowIdx == tr.hardLimit {
            // We sent tr.hardLimit rows.
            tr.output.Close(nil)
            return
        }
    }
}
// MaybeAdd adds the specified replica if bq.shouldQueue specifies it
// should be queued. Replicas are added to the queue using the priority
// returned by bq.shouldQueue. If the queue is too full, the replica may
// not be added, as the replica with the lowest priority will be
// dropped.
func (bq *baseQueue) MaybeAdd(repl *Replica, now hlc.Timestamp) {
    // Load the system config.
    cfg, cfgOk := bq.gossip.GetSystemConfig()
    requiresSplit := cfgOk && bq.requiresSplit(cfg, repl)

    bq.mu.Lock()
    defer bq.mu.Unlock()

    if bq.mu.stopped {
        return
    }

    if !repl.IsInitialized() {
        return
    }

    ctx := repl.AnnotateCtx(bq.AnnotateCtx(context.TODO()))

    if !cfgOk {
        log.VEvent(ctx, 1, "no system config available. skipping")
        return
    }

    if requiresSplit {
        // Range needs to be split due to zone configs, but queue does
        // not accept unsplit ranges.
        log.VEventf(ctx, 1, "split needed; not adding")
        return
    }

    if bq.needsLease {
        // Check to see if either we own the lease or do not know who the lease
        // holder is.
        if lease, _ := repl.getLease(); repl.IsLeaseValid(lease, now) &&
            !lease.OwnedBy(repl.store.StoreID()) {
            log.VEventf(ctx, 1, "needs lease; not adding: %+v", lease)
            return
        }
    }

    should, priority := bq.impl.shouldQueue(ctx, now, repl, cfg)
    if _, err := bq.addInternal(ctx, repl.Desc(), should, priority); !isExpectedQueueError(err) {
        log.Errorf(ctx, "unable to add: %s", err)
    }
}
// addInternal adds the replica to the queue with the specified priority. If
// the replica is already queued, updates the existing
// priority. Expects the queue lock to be held by caller.
func (bq *baseQueue) addInternal(
    ctx context.Context, desc *roachpb.RangeDescriptor, should bool, priority float64,
) (bool, error) {
    if bq.mu.stopped {
        return false, errQueueStopped
    }

    if bq.mu.disabled {
        log.Event(ctx, "queue disabled")
        return false, errQueueDisabled
    }

    if !desc.IsInitialized() {
        // We checked this above in MaybeAdd(), but we need to check it
        // again for Add().
        return false, errors.New("replica not initialized")
    }

    // If the replica is currently in purgatory, don't re-add it.
    if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
        return false, nil
    }

    item, ok := bq.mu.replicas[desc.RangeID]
    if !should {
        if ok {
            log.Eventf(ctx, "%s: removing from queue", item.value)
            bq.remove(item)
        }
        return false, errReplicaNotAddable
    } else if ok {
        if item.priority != priority {
            log.Eventf(ctx, "%s: updating priority: %0.3f -> %0.3f",
                desc, item.priority, priority)
        }
        // Replica has already been added; update priority.
        bq.mu.priorityQ.update(item, priority)
        return false, nil
    }

    log.VEventf(ctx, 3, "%s: adding: priority=%0.3f", desc, priority)
    item = &replicaItem{value: desc.RangeID, priority: priority}
    bq.add(item)

    // If adding this replica has pushed the queue past its maximum size,
    // remove the lowest priority element.
    if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
        bq.remove(bq.mu.priorityQ[pqLen-1])
    }
    // Signal the processLoop that a replica has been added.
    select {
    case bq.incoming <- struct{}{}:
    default:
        // No need to signal again.
    }
    return true, nil
}
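addInternal relies on a priority queue that supports in-place priority updates (bq.mu.priorityQ.update) and removal of a low-priority element once the queue grows past maxSize. Below is a self-contained sketch of such a structure built on the standard container/heap package; it only illustrates the bookkeeping and is not the queue's actual implementation.

package main

import (
    "container/heap"
    "fmt"
)

// item mirrors the idea of a replicaItem: an ID with a mutable priority.
type item struct {
    id       int64
    priority float64
    index    int // maintained by the heap methods
}

// prioQ is a max-heap of items ordered by priority.
type prioQ []*item

func (pq prioQ) Len() int           { return len(pq) }
func (pq prioQ) Less(i, j int) bool { return pq[i].priority > pq[j].priority }
func (pq prioQ) Swap(i, j int) {
    pq[i], pq[j] = pq[j], pq[i]
    pq[i].index = i
    pq[j].index = j
}

func (pq *prioQ) Push(x interface{}) {
    it := x.(*item)
    it.index = len(*pq)
    *pq = append(*pq, it)
}

func (pq *prioQ) Pop() interface{} {
    old := *pq
    n := len(old)
    it := old[n-1]
    *pq = old[:n-1]
    return it
}

// update changes an item's priority and restores heap order, analogous to
// bq.mu.priorityQ.update in the original.
func (pq *prioQ) update(it *item, priority float64) {
    it.priority = priority
    heap.Fix(pq, it.index)
}

func main() {
    pq := &prioQ{}
    heap.Init(pq)
    a := &item{id: 1, priority: 0.5}
    b := &item{id: 2, priority: 0.9}
    heap.Push(pq, a)
    heap.Push(pq, b)
    pq.update(a, 1.5)
    fmt.Println(heap.Pop(pq).(*item).id) // 1: the updated item now has top priority
}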
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
    log.Eventf(ctx, "querying next range at %s", key)
    ri.scanDir = scanDir
    ri.init = true // the iterator is now initialized
    ri.pErr = nil  // clear any prior error
    ri.key = key   // set the key

    // Retry loop for looking up next range in the span. The retry loop
    // deals with retryable range descriptor lookups.
    for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
        log.Event(ctx, "meta descriptor lookup")
        var err error
        ri.desc, ri.token, err = ri.ds.getDescriptor(
            ctx, ri.key, ri.token, ri.scanDir == Descending)

        // getDescriptor may fail retryably if, for example, the first
        // range isn't available via Gossip. Assume that all errors at
        // this level are retryable. Non-retryable errors would be for
        // things like malformed requests which we should have checked
        // for before reaching this point.
        if err != nil {
            log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
            continue
        }

        // It's possible that the returned descriptor misses parts of the
        // keys it's supposed to include after it's truncated to match the
        // descriptor. Example revscan [a,g), first desc lookup for "g"
        // returns descriptor [c,d) -> [d,g) is never scanned.
        // We evict and retry in such a case.
        // TODO: this code is subject to removal. See
        // https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
        reverse := ri.scanDir == Descending
        if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
            (!reverse && !ri.desc.ContainsKey(ri.key)) {
            log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
            if err := ri.token.Evict(ctx); err != nil {
                ri.pErr = roachpb.NewError(err)
                return
            }
            // On addressing errors, don't backoff; retry immediately.
            r.Reset()
            continue
        }
        return
    }

    // Check for an early exit from the retry loop.
    if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
        ri.pErr = pErr
    } else {
        ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
    }
}
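Seek distinguishes two failure modes inside its retry loop: ordinary lookup errors fall through to the next backed-off attempt, while addressing errors (a stale cached descriptor) evict the cache entry and retry immediately via r.Reset(). The following is a generic sketch of that loop shape in plain Go, with a hypothetical lookup helper standing in for getDescriptor and the backoff written out explicitly; it is an illustration, not the DistSender code.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// errStaleDescriptor is an illustrative sentinel for an addressing error.
var errStaleDescriptor = errors.New("stale descriptor")

// lookup is a stand-in for a range-descriptor lookup; it fails transiently,
// then with a stale descriptor, then succeeds.
func lookup(ctx context.Context, attempt int) (string, error) {
    switch attempt {
    case 0:
        return "", errors.New("transient lookup failure")
    case 1:
        return "", errStaleDescriptor
    default:
        return "descriptor [a,g)", nil
    }
}

func seek(ctx context.Context) (string, error) {
    backoff := 10 * time.Millisecond
    for attempt := 0; ; attempt++ {
        desc, err := lookup(ctx, attempt)
        switch {
        case err == nil:
            return desc, nil
        case errors.Is(err, errStaleDescriptor):
            // Addressing error: evict the cached entry (elided) and retry
            // immediately, mirroring r.Reset() in the original loop.
            continue
        default:
            // Transient error: wait out the backoff (or give up on
            // cancellation) before the next lookup attempt.
            select {
            case <-time.After(backoff):
                backoff *= 2
            case <-ctx.Done():
                return "", ctx.Err()
            }
        }
    }
}

func main() {
    desc, err := seek(context.Background())
    fmt.Println(desc, err) // descriptor [a,g) <nil>
}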
// increaseBudget requests more memory from the pool.
func (mm *MemoryMonitor) increaseBudget(ctx context.Context, minExtra int64) error {
    // NB: mm.mu is already locked by reserveMemory().
    if mm.pool == nil {
        return errors.Errorf("%s: memory budget exceeded: %d bytes requested, %d bytes in budget",
            mm.name, minExtra, mm.reserved.curAllocated)
    }
    minExtra = mm.roundSize(minExtra)
    log.VEventf(ctx, 2, "%s: requesting %d bytes from the pool", mm.name, minExtra)
    return mm.pool.GrowAccount(ctx, &mm.mu.curBudget, minExtra)
}
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
    ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
    snap := repl.store.Engine().NewSnapshot()
    desc := repl.Desc()
    defer snap.Close()

    // Lookup the GC policy for the zone containing this key range.
    zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
    if err != nil {
        return errors.Errorf("could not find zone config for range %s: %s", repl, err)
    }

    gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
        func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
            pushTxn(ctx, gcq.store.DB(), now, txn, typ)
        },
        func(intents []roachpb.Intent, poison bool, wait bool) error {
            return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
        })
    if err != nil {
        return err
    }

    log.VEventf(ctx, 1, "completed with stats %+v", info)

    info.updateMetrics(gcq.store.metrics)

    var ba roachpb.BatchRequest
    var gcArgs roachpb.GCRequest
    // TODO(tschottdorf): This is one of these instances in which we want
    // to be more careful that the request ends up on the correct Replica,
    // and we might have to worry about mixing range-local and global keys
    // in a batch which might end up spanning Ranges by the time it executes.
    gcArgs.Key = desc.StartKey.AsRawKey()
    gcArgs.EndKey = desc.EndKey.AsRawKey()
    gcArgs.Keys = gcKeys
    gcArgs.Threshold = info.Threshold
    gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

    // Technically not needed since we're talking directly to the Range.
    ba.RangeID = desc.RangeID
    ba.Timestamp = now
    ba.Add(&gcArgs)
    if _, pErr := repl.Send(ctx, ba); pErr != nil {
        log.ErrEvent(ctx, pErr.String())
        return pErr.GoError()
    }
    return nil
}
// MaybeRemove removes the specified replica from the queue if enqueued.
func (bq *baseQueue) MaybeRemove(rangeID roachpb.RangeID) {
    bq.mu.Lock()
    defer bq.mu.Unlock()

    if bq.mu.stopped {
        return
    }

    if item, ok := bq.mu.replicas[rangeID]; ok {
        ctx := bq.AnnotateCtx(context.TODO())
        log.VEventf(ctx, 3, "%s: removing", item.value)
        bq.remove(item)
    }
}
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start() {
    log.VEventf(
        f.Context, 1, "starting (%d processors, %d outboxes)", len(f.processors), len(f.outboxes),
    )
    f.status = FlowRunning
    f.flowRegistry.RegisterFlow(f.id, f)
    for _, o := range f.outboxes {
        o.start(&f.waitGroup)
    }
    f.waitGroup.Add(len(f.processors))
    for _, p := range f.processors {
        go p.Run(&f.waitGroup)
    }
}
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start(doneFn func()) {
    f.doneFn = doneFn
    log.VEventf(
        f.Context, 1, "starting (%d processors, %d outboxes)", len(f.processors), len(f.outboxes),
    )
    f.status = FlowRunning
    f.flowRegistry.RegisterFlow(f.id, f, f.inboundStreams)
    f.waitGroup.Add(len(f.inboundStreams))
    f.waitGroup.Add(len(f.outboxes))
    f.waitGroup.Add(len(f.processors))
    for _, o := range f.outboxes {
        o.start(&f.waitGroup)
    }
    for _, p := range f.processors {
        go p.Run(&f.waitGroup)
    }
}
// IncrementEpoch is called to increment the current liveness epoch,
// thereby invalidating anything relying on the liveness of the
// previous epoch. This method does a conditional put on the node
// liveness record, and if successful, stores the updated liveness
// record in the nodes map. If this method is called on a node ID
// which is considered live according to the most recent information
// gathered through gossip, an error is returned.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *Liveness) error {
    // Allow only one increment at a time.
    select {
    case nl.incrementSem <- struct{}{}:
    case <-ctx.Done():
        return ctx.Err()
    }
    defer func() {
        <-nl.incrementSem
    }()

    if liveness.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
        return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
    }
    newLiveness := *liveness
    newLiveness.Epoch++
    update := func(l Liveness) {
        nl.mu.Lock()
        defer nl.mu.Unlock()
        if nodeID := nl.gossip.NodeID.Get(); nodeID == l.NodeID {
            nl.mu.self = l
        } else {
            nl.mu.nodes[l.NodeID] = l
        }
    }
    if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
        defer update(actual)
        if actual.Epoch > liveness.Epoch {
            return errEpochAlreadyIncremented
        } else if actual.Epoch < liveness.Epoch {
            return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
        }
        return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
    }); err != nil {
        if err == errEpochAlreadyIncremented {
            return nil
        }
        return err
    }

    log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d",
        newLiveness.NodeID, newLiveness.Epoch)
    update(newLiveness)
    nl.metrics.EpochIncrements.Inc(1)
    return nil
}
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the reply is sent on the channel;
// otherwise an error is sent.
func (gt *grpcTransport) SendNext(ctx context.Context, done chan<- BatchCall) {
    client := gt.orderedClients[gt.clientIndex]
    gt.clientIndex++
    gt.setPending(client.args.Replica, true)

    // Fork the original context as this async send may outlast the
    // caller's context.
    ctx, sp := tracing.ForkCtxSpan(ctx, "grpcTransport SendNext")

    go func() {
        defer tracing.FinishSpan(sp)
        gt.opts.metrics.SentCount.Inc(1)
        reply, err := func() (*roachpb.BatchResponse, error) {
            if enableLocalCalls {
                if localServer := gt.rpcContext.GetLocalInternalServerForAddr(client.remoteAddr); localServer != nil {
                    // Clone the request. At the time of writing, Replica may mutate it
                    // during command execution which can lead to data races.
                    //
                    // TODO(tamird): we should clone all of client.args.Header, but the
                    // assertions in protoutil.Clone fire and there seems to be no
                    // reasonable workaround.
                    origTxn := client.args.Txn
                    if origTxn != nil {
                        clonedTxn := origTxn.Clone()
                        client.args.Txn = &clonedTxn
                    }
                    gt.opts.metrics.LocalSentCount.Inc(1)
                    log.VEvent(ctx, 2, "sending request to local server")
                    return localServer.Batch(ctx, &client.args)
                }
            }
            log.VEventf(ctx, 2, "sending request to %s", client.remoteAddr)
            reply, err := client.client.Batch(ctx, &client.args)
            if reply != nil {
                for i := range reply.Responses {
                    if err := reply.Responses[i].GetInner().Verify(client.args.Requests[i].GetInner()); err != nil {
                        log.Error(ctx, err)
                    }
                }
            }
            return reply, err
        }()
        gt.setPending(client.args.Replica, false)
        done <- BatchCall{Reply: reply, Err: err}
    }()
}
// tightenNetwork "tightens" the network by starting a new gossip client to
// the most distant node to which we don't already have an outgoing
// connection. Does nothing if we don't have room for any more outgoing
// connections.
func (g *Gossip) tightenNetwork(ctx context.Context) {
    g.mu.Lock()
    defer g.mu.Unlock()
    if g.outgoing.hasSpace() {
        distantNodeID, distantHops := g.mu.is.mostDistant(g.hasOutgoingLocked)
        log.VEventf(ctx, 2, "distantHops: %d from %d", distantHops, distantNodeID)
        if distantHops <= maxHops {
            return
        }
        if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil {
            log.Errorf(ctx, "unable to get address for distant node %d: %s", distantNodeID, err)
        } else {
            log.Infof(ctx, "starting client to distant node %d (%d > %d) to tighten network graph",
                distantNodeID, distantHops, maxHops)
            log.Eventf(ctx, "tightening network with new client to %s", nodeAddr)
            g.startClientLocked(nodeAddr)
        }
    }
}
// Heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness *Liveness) error {
    nodeID := nl.gossip.NodeID.Get()
    var newLiveness Liveness
    if liveness == nil {
        newLiveness = Liveness{
            NodeID: nodeID,
            Epoch:  1,
        }
    } else {
        newLiveness = *liveness
    }
    // We need to add the maximum clock offset to the expiration because it's
    // used when determining liveness for a node.
    newLiveness.Expiration = nl.clock.Now().Add(
        (nl.livenessThreshold + nl.clock.MaxOffset()).Nanoseconds(), 0)
    err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
        // Update liveness to actual value on mismatch.
        defer func() {
            newLiveness = actual
        }()
        // If the actual liveness is further ahead than expected, we're
        // done. This can happen when the periodic heartbeater races with
        // a concurrent lease acquisition.
        if !actual.Expiration.Less(newLiveness.Expiration) && actual.Epoch == newLiveness.Epoch {
            return nil
        }
        // Otherwise, return error.
        return errSkippedHeartbeat
    })

    log.VEventf(ctx, 1, "heartbeat %+v", newLiveness.Expiration)
    nl.mu.Lock()
    nl.mu.self = newLiveness
    nl.mu.Unlock()
    if err != nil {
        nl.metrics.HeartbeatFailures.Inc(1)
    } else {
        nl.metrics.HeartbeatSuccesses.Inc(1)
    }
    return err
}
// IncrementEpoch is called by nodes on other nodes in order to
// increment the current liveness epoch, thereby invalidating anything
// relying on the liveness of the previous epoch. This method does a
// conditional put on the node liveness record, and if successful,
// stores the updated liveness record in the nodes map.
//
// TODO(spencer): make sure calls to this method are captured in
// range lease metrics.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, nodeID roachpb.NodeID) error {
    liveness, err := nl.GetLiveness(nodeID)
    if err != nil {
        return err
    }
    if liveness.isLive(nl.clock) {
        return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
    }
    newLiveness := liveness
    newLiveness.Epoch++
    if err := nl.updateLiveness(ctx, nodeID, &newLiveness, &liveness, nil); err != nil {
        return err
    }

    log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d", nodeID, newLiveness.Epoch)
    nl.mu.Lock()
    defer nl.mu.Unlock()
    nl.mu.nodes[nodeID] = newLiveness
    nl.metrics.EpochIncrements.Inc(1)
    return nil
}
// heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) heartbeat(ctx context.Context) error {
    nodeID := nl.gossip.NodeID.Get()

    var newLiveness Liveness
    var oldLiveness *Liveness
    liveness, err := nl.GetLiveness(nodeID)
    if err == nil {
        oldLiveness = &liveness
        newLiveness = liveness
    } else {
        newLiveness = Liveness{
            NodeID: nodeID,
            Epoch:  1,
        }
    }

    // Retry heartbeat in the event the conditional put fails.
    for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
        newLiveness.Expiration = nl.clock.Now().Add(nl.livenessThreshold.Nanoseconds(), 0)
        tryAgain := false
        if err := nl.updateLiveness(ctx, nodeID, &newLiveness, oldLiveness, func(actual Liveness) {
            oldLiveness = &actual
            newLiveness = actual
            tryAgain = true
        }); err != nil {
            nl.metrics.HeartbeatFailures.Inc(1)
            return err
        }
        if !tryAgain {
            break
        }
    }

    log.VEventf(ctx, 1, "heartbeat node %d liveness with expiration %s", nodeID, newLiveness.Expiration)
    nl.mu.Lock()
    defer nl.mu.Unlock()
    nl.mu.self = newLiveness
    nl.metrics.HeartbeatSuccesses.Inc(1)
    return nil
}
// process truncates the raft log of the range if the replica is the raft
// leader and if the total number of the range's raft log's stale entries
// exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) process(
    ctx context.Context, now hlc.Timestamp, r *Replica, _ config.SystemConfig,
) error {
    truncatableIndexes, oldestIndex, err := getTruncatableIndexes(ctx, r)
    if err != nil {
        return err
    }

    // Can and should the raft logs be truncated?
    if truncatableIndexes >= RaftLogQueueStaleThreshold {
        log.VEventf(ctx, 1, "truncating raft log %d-%d",
            oldestIndex-truncatableIndexes, oldestIndex)
        b := &client.Batch{}
        b.AddRawRequest(&roachpb.TruncateLogRequest{
            Span:    roachpb.Span{Key: r.Desc().StartKey.AsRawKey()},
            Index:   oldestIndex,
            RangeID: r.RangeID,
        })
        return rlq.db.Run(ctx, b)
    }
    return nil
}
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start(doneFn func()) {
    f.doneFn = doneFn
    log.VEventf(
        f.Context, 1, "starting (%d processors, %d outboxes)", len(f.processors), len(f.outboxes),
    )
    f.status = FlowRunning
    // Once we call RegisterFlow, the inbound streams become accessible; we must
    // set up the WaitGroup counter before.
    f.waitGroup.Add(len(f.inboundStreams) + len(f.outboxes) + len(f.processors))
    f.flowRegistry.RegisterFlow(f.id, f, f.inboundStreams)
    if log.V(1) {
        log.Infof(f.Context, "registered flow %s", f.id.Short())
    }
    for _, o := range f.outboxes {
        o.start(&f.waitGroup)
    }
    for _, p := range f.processors {
        go p.Run(&f.waitGroup)
    }
}
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
    log.VEventf(ctx, 1, "starting liveness heartbeat")
    retryOpts := base.DefaultRetryOptions()
    retryOpts.Closer = stopper.ShouldQuiesce()

    stopper.RunWorker(func() {
        ambient := nl.ambientCtx
        ambient.AddLogTag("hb", nil)
        ticker := time.NewTicker(nl.heartbeatInterval)
        defer ticker.Stop()
        for {
            if !nl.pauseHeartbeat.Load().(bool) {
                ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
                ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
                // Retry heartbeat in the event the conditional put fails.
                for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
                    liveness, err := nl.Self()
                    if err != nil && err != ErrNoLivenessRecord {
                        log.Errorf(ctx, "unexpected error getting liveness: %v", err)
                    }
                    if err := nl.Heartbeat(ctx, liveness); err != nil {
                        if err == errSkippedHeartbeat {
                            continue
                        }
                        log.Errorf(ctx, "failed liveness heartbeat: %v", err)
                    }
                    break
                }
                cancel()
                sp.Finish()
            }
            select {
            case <-ticker.C:
            case <-stopper.ShouldStop():
                return
            }
        }
    })
}
func (ta *TxnAborter) statementFilter(ctx context.Context, stmt string, res *sql.Result) {
    ta.mu.Lock()
    ri, ok := ta.mu.stmtsToAbort[stmt]
    shouldAbort := false
    if ok {
        ri.execCount++
        if ri.abortCount == 0 {
            log.VEventf(ctx, 1, "TxnAborter sees satisfied statement %q", stmt)
            ri.satisfied = true
        }
        if ri.abortCount > 0 && res.Err == nil {
            log.Infof(ctx, "TxnAborter aborting txn for statement %q", stmt)
            ri.abortCount--
            shouldAbort = true
        }
    }
    ta.mu.Unlock()
    if shouldAbort {
        if err := ta.abortTxn(ri.key); err != nil {
            res.Err = errors.Wrap(err, "TxnAborter failed to abort")
        }
    }
}
// maybeTransferRaftLeadership attempts to transfer the leadership
// away from this node to target, if this node is the current raft
// leader. We don't attempt to transfer leadership if the transferee
// is behind on applying the log.
func (r *Replica) maybeTransferRaftLeadership(ctx context.Context, target roachpb.ReplicaID) {
    err := r.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
        // Only the raft leader can attempt a leadership transfer.
        if status := raftGroup.Status(); status.RaftState == raft.StateLeader {
            // Only attempt this if the target has all the log entries.
            if pr, ok := status.Progress[uint64(target)]; ok && pr.Match == r.mu.lastIndex {
                log.VEventf(ctx, 1, "transferring raft leadership to replica ID %v", target)
                r.store.metrics.RangeRaftLeaderTransfers.Inc(1)
                raftGroup.TransferLeader(uint64(target))
            }
        }
        return true, nil
    })
    if err != nil {
        // An error here indicates that this Replica has been destroyed
        // while lacking the necessary synchronization (or even worse, it
        // fails spuriously - could be a storage error), and so we avoid
        // sweeping that under the rug.
        //
        // TODO(tschottdorf): this error is not handled any more
        // at this level.
        log.Fatal(ctx, NewReplicaCorruptionError(err))
    }
}
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
    log.VEventf(ctx, 1, "starting liveness heartbeat")
    stopper.RunWorker(func() {
        ambient := nl.ambientCtx
        ambient.AddLogTag("hb", nil)
        ticker := time.NewTicker(nl.heartbeatInterval)
        defer ticker.Stop()
        for {
            ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
            if err := nl.heartbeat(ctx); err != nil {
                log.Errorf(ctx, "failed liveness heartbeat: %s", err)
            }
            sp.Finish()
            select {
            case <-ticker.C:
            case <-nl.stopHeartbeat:
                return
            case <-stopper.ShouldStop():
                return
            }
        }
    })
}
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
    stopper.RunWorker(func() {
        ctx := bq.AnnotateCtx(context.Background())
        defer func() {
            bq.mu.Lock()
            bq.mu.stopped = true
            bq.mu.Unlock()
            bq.AmbientContext.FinishEventLog()
        }()

        // nextTime is initially nil; we don't start any timers until the queue
        // becomes non-empty.
        var nextTime <-chan time.Time

        immediately := make(chan time.Time)
        close(immediately)

        for {
            select {
            // Exit on stopper.
            case <-stopper.ShouldStop():
                return

            // Incoming signal sets the next time to process if there were previously
            // no replicas in the queue.
            case <-bq.incoming:
                if nextTime == nil {
                    // When a replica is added, wake up immediately. This is mainly
                    // to facilitate testing without unnecessary sleeps.
                    nextTime = immediately

                    // In case we're in a test, still block on the impl.
                    bq.impl.timer(0)
                }

            // Process replicas as the timer expires.
            case <-nextTime:
                repl := bq.pop()
                var duration time.Duration
                if repl != nil {
                    if stopper.RunTask(func() {
                        annotatedCtx := repl.AnnotateCtx(ctx)
                        start := timeutil.Now()
                        if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
                            // Maybe add failing replica to purgatory if the queue supports it.
                            bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
                        }
                        duration = timeutil.Since(start)
                        log.VEventf(annotatedCtx, 2, "done %s", duration)
                        bq.processingNanos.Inc(duration.Nanoseconds())
                    }) != nil {
                        return
                    }
                }
                if bq.Length() == 0 {
                    nextTime = nil
                } else {
                    nextTime = time.After(bq.impl.timer(duration))
                }
            }
        }
    })
}
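The idle/wake-up handling in processLoop relies on two channel properties: receiving from a nil channel blocks forever, which disables the timer case while the queue is empty, and receiving from a closed channel never blocks, which is what makes the `immediately` channel fire on the very next loop iteration. The following is a small standalone demonstration of those two behaviors, not the queue itself.

package main

import (
    "fmt"
    "time"
)

func main() {
    // A closed chan time.Time behaves like a timer that has already fired.
    immediately := make(chan time.Time)
    close(immediately)

    var next <-chan time.Time // nil: its select case is disabled (queue idle)
    work := make(chan struct{}, 1)
    work <- struct{}{}

    for i := 0; i < 2; i++ {
        select {
        case <-work:
            // Work arrived while idle: arm the "timer" to fire immediately.
            next = immediately
            fmt.Println("queued work; will process immediately")
        case <-next:
            fmt.Println("processing")
            next = nil // back to idle until more work arrives
        }
    }
}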