// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
func (bq *baseQueue) processReplica(
	queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(queueCtx, 1, "no system config available, skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(queueCtx, 3, "split needed; skipping")
		return nil
	}

	// Putting a span in a context means that events will no longer go to the
	// event log. Use queueCtx for events that are intended for the event log.
	ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
	defer span.Finish()
	// Also add the Replica annotations to ctx.
	ctx = repl.AnnotateCtx(ctx)
	ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
	defer cancel()
	log.Eventf(ctx, "processing replica")

	if err := repl.IsDestroyed(); err != nil {
		log.VEventf(queueCtx, 3, "replica destroyed (%s); skipping", err)
		return nil
	}

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has the range
	// lease and renew or acquire it if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			switch v := err.GetDetail().(type) {
			case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
				log.VEventf(queueCtx, 3, "%s; skipping", v)
				return nil
			default:
				return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
			}
		}
		log.Event(ctx, "got range lease")
	}

	log.VEventf(queueCtx, 3, "processing")
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.Event(ctx, "done")
	bq.successes.Inc(1)
	return nil
}
// snapshotWithContext is the main implementation for Snapshot() but it takes
// a context to allow tracing. If this method returns without error, callers
// must eventually call CloseOutSnap to ready this replica for more snapshots.
// r.mu must be held.
func (r *Replica) snapshotWithContext(
	ctx context.Context, snapType string,
) (*OutgoingSnapshot, error) {
	r.mu.AssertHeld()
	rangeID := r.RangeID

	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx,
			"not generating %s snapshot because replica is too large: %d > 2 * %d",
			snapType, size, maxBytes)
		return &OutgoingSnapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	select {
	case <-r.mu.outSnapDone:
	default:
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}
	if !r.store.AcquireRaftSnapshot() {
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()
	snap := r.store.NewSnapshot()
	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	snapData, err := snapshot(ctx, snapType, snap, rangeID, r.store.raftEntryCache, startKey)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %s", err)
		return nil, err
	}
	log.Event(ctx, "snapshot generated")
	r.store.metrics.RangeSnapshotsGenerated.Inc(1)
	r.mu.outSnap = snapData
	r.mu.outSnapDone = make(chan struct{})
	return &r.mu.outSnap, nil
}
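// The doc comment above establishes a contract: r.mu is held while the
// snapshot is generated and, on success, the caller must later call
// CloseOutSnap so the replica can generate further snapshots. The following
// is only an illustrative sketch of that pairing; it assumes CloseOutSnap
// takes no arguments, and the snapshot type string and the streaming step
// are placeholders.
func exampleSnapshotUsage(ctx context.Context, r *Replica) error {
	r.mu.Lock()
	snap, err := r.snapshotWithContext(ctx, "Raft")
	r.mu.Unlock()
	if err != nil {
		return err
	}
	// Ready the replica for future snapshots once this one has been consumed.
	defer r.CloseOutSnap()
	_ = snap // ... stream snap to the recipient here ...
	return nil
}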
// addInternal adds the replica to the queue with the specified priority. If
// the replica is already queued, its existing priority is updated. Expects
// the queue lock to be held by the caller.
func (bq *baseQueue) addInternal(
	ctx context.Context, desc *roachpb.RangeDescriptor, should bool, priority float64,
) (bool, error) {
	if bq.mu.stopped {
		return false, errQueueStopped
	}

	if bq.mu.disabled {
		log.Event(ctx, "queue disabled")
		return false, errQueueDisabled
	}

	if !desc.IsInitialized() {
		// We checked this above in MaybeAdd(), but we need to check it
		// again for Add().
		return false, errors.New("replica not initialized")
	}

	// If the replica is currently in purgatory, don't re-add it.
	if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
		return false, nil
	}

	item, ok := bq.mu.replicas[desc.RangeID]
	if !should {
		if ok {
			log.Eventf(ctx, "%s: removing from queue", item.value)
			bq.remove(item)
		}
		return false, errReplicaNotAddable
	} else if ok {
		if item.priority != priority {
			log.Eventf(ctx, "%s: updating priority: %0.3f -> %0.3f",
				desc, item.priority, priority)
		}
		// Replica has already been added; update priority.
		bq.mu.priorityQ.update(item, priority)
		return false, nil
	}

	log.VEventf(ctx, 3, "%s: adding: priority=%0.3f", desc, priority)
	item = &replicaItem{value: desc.RangeID, priority: priority}
	bq.add(item)

	// If adding this replica has pushed the queue past its maximum size,
	// remove the lowest priority element.
	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
		bq.remove(bq.mu.priorityQ[pqLen-1])
	}
	// Signal the processLoop that a replica has been added.
	select {
	case bq.incoming <- struct{}{}:
	default:
		// No need to signal again.
	}
	return true, nil
}
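// addInternal's contract is that the queue lock (bq.mu) is already held; a
// caller such as MaybeAdd or Add acquires the lock, decides whether the
// replica should be queued and at what priority, and then calls addInternal.
// A rough, hypothetical sketch of that calling pattern follows (shouldQ and
// priority are placeholders for whatever the queue implementation computed).
func exampleAdd(ctx context.Context, bq *baseQueue, repl *Replica, shouldQ bool, priority float64) {
	bq.mu.Lock()
	defer bq.mu.Unlock()
	if _, err := bq.addInternal(ctx, repl.Desc(), shouldQ, priority); err != nil {
		log.VEventf(ctx, 3, "unable to add %s: %s", repl, err)
	}
}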
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up the next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example: a reverse scan of [a,g) does a first
		// descriptor lookup for "g" which returns the descriptor [c,d);
		// the span [d,g) would then never be scanned. We evict and retry
		// in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
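// Seek is one half of the iterator's contract; the usual driving pattern (as
// in DistSender when walking the ranges of a span) seeks to the start of the
// span and advances until the span is covered. This is only a sketch and
// assumes the iterator's Valid/Desc/NeedAnother/Next/Error accessors; it is
// not taken from the surrounding code.
func exampleIterate(ctx context.Context, ri *RangeIterator, rs roachpb.RSpan) *roachpb.Error {
	for ri.Seek(ctx, rs.Key, Ascending); ri.Valid(); ri.Next(ctx) {
		log.Eventf(ctx, "visiting range %s", ri.Desc())
		if !ri.NeedAnother(rs) {
			return nil // the span is fully covered
		}
	}
	// The iterator stopped before covering the span; report its error.
	return ri.Error()
}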
func (s *senderTransport) SendNext(done chan<- BatchCall) {
	if s.called {
		panic("called an exhausted transport")
	}
	s.called = true
	sp := s.tracer.StartSpan("node")
	defer sp.Finish()
	ctx := opentracing.ContextWithSpan(context.TODO(), sp)
	log.Event(ctx, s.args.String())
	br, pErr := s.sender.Send(ctx, s.args)
	if br == nil {
		br = &roachpb.BatchResponse{}
	}
	if br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(s.sender, br))
	}
	br.Error = pErr
	if pErr != nil {
		log.Event(ctx, "error: "+pErr.String())
	}
	done <- BatchCall{Reply: br}
}
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// If we aren't given a Replica, we have to bend over backwards a little
	// here to find one. This case applies exclusively to unit tests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		rs, err := keys.Range(ba)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		rangeID, repDesc, err := ls.LookupReplica(rs.Key, rs.EndKey)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		ba.RangeID = rangeID
		ba.Replica = repDesc
	}

	store, err := ls.GetStore(ba.Replica.StoreID)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				log.Event(ctx, "read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
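// The observed-timestamp logic above is easiest to see with concrete numbers.
// Suppose a txn has OrigTimestamp=10 and MaxTimestamp=15 (wall time, logical
// part zero), and this node was previously observed at 12. Since 12 < 15, the
// uncertainty window [OrigTimestamp, MaxTimestamp) can shrink to [10, 12).
// A minimal, self-contained illustration of the hlc.Timestamp arithmetic used
// above (values are made up for the example):
func exampleUncertaintyWindow() {
	orig := hlc.Timestamp{WallTime: 10}
	maxTS := hlc.Timestamp{WallTime: 15}
	observed := hlc.Timestamp{WallTime: 12}
	if observed.Less(maxTS) {
		// Mirrors shallowTxn.MaxTimestamp.Backward(maxTS) in Send above.
		maxTS.Backward(observed)
	}
	// maxTS is now 12; had it dropped to or below orig, the read would have
	// no clock uncertainty at all.
	_ = orig
}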
// cleanupTxnLocked is called when a transaction ends. The transaction record
// is updated and the heartbeat goroutine signaled to clean up the transaction
// gracefully.
func (tc *TxnCoordSender) cleanupTxnLocked(ctx context.Context, txn roachpb.Transaction) {
	log.Event(ctx, "coordinator stops")
	txnMeta, ok := tc.txns[*txn.ID]
	// The heartbeat might've already removed the record. Or we may have already
	// closed txnEnd but we are racing with the heartbeat cleanup.
	if !ok || txnMeta.txnEnd == nil {
		return
	}

	// The supplied txn may be newer than the one in txnMeta, which is relevant
	// for stats.
	txnMeta.txn = txn
	// Trigger heartbeat shutdown.
	close(txnMeta.txnEnd)
	txnMeta.txnEnd = nil
}
// EvictAndReplace instructs the EvictionToken to evict the RangeDescriptor it
// was created with from the rangeDescriptorCache. It also allows the user to
// provide new RangeDescriptors to insert into the cache, all atomically. When
// called without arguments, EvictAndReplace will behave the same as Evict.
func (et *EvictionToken) EvictAndReplace(
	ctx context.Context, newDescs ...roachpb.RangeDescriptor,
) error {
	var err error
	et.doOnce.Do(func() {
		et.doLocker.Lock()
		defer et.doLocker.Unlock()
		err = et.do()
		if err == nil {
			if len(newDescs) > 0 {
				err = et.doReplace(newDescs...)
				log.Eventf(ctx, "evicting cached range descriptor with %d replacements", len(newDescs))
			} else {
				log.Event(ctx, "evicting cached range descriptor")
			}
		}
	})
	return err
}
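// A typical caller (see sendPartialBatch below) gathers candidate descriptors
// from a RangeKeyMismatchError and hands them to EvictAndReplace; with no
// replacements the call degenerates to a plain eviction. A condensed sketch of
// that pattern, with the deduplication checks from the real call site omitted:
func exampleEvictAndReplace(
	ctx context.Context, evictToken *EvictionToken, tErr *roachpb.RangeKeyMismatchError,
) error {
	var replacements []roachpb.RangeDescriptor
	if tErr.MismatchedRange != nil {
		replacements = append(replacements, *tErr.MismatchedRange)
	}
	if tErr.SuggestedRange != nil {
		replacements = append(replacements, *tErr.SuggestedRange)
	}
	// Same as Evict() if replacements is empty.
	return evictToken.EvictAndReplace(ctx, replacements...)
}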
// maybeCleanupBootstrapAddressesLocked cleans up the stored bootstrap
// addresses to include only those currently available via gossip. The gossip
// mutex must be held by the caller.
func (g *Gossip) maybeCleanupBootstrapAddressesLocked() {
	if g.storage == nil || g.hasCleanedBS {
		return
	}
	defer func() { g.hasCleanedBS = true }()
	ctx := g.AnnotateCtx(context.TODO())
	log.Event(ctx, "cleaning up bootstrap addresses")

	g.resolvers = g.resolvers[:0]
	g.resolverIdx = 0
	g.bootstrapInfo.Addresses = g.bootstrapInfo.Addresses[:0]
	g.bootstrapAddrs = map[util.UnresolvedAddr]struct{}{}
	g.resolverAddrs = map[util.UnresolvedAddr]resolver.Resolver{}
	g.resolversTried = map[int]struct{}{}

	var desc roachpb.NodeDescriptor
	if err := g.mu.is.visitInfos(func(key string, i *Info) error {
		if strings.HasPrefix(key, KeyNodeIDPrefix) {
			if err := i.Value.GetProto(&desc); err != nil {
				return err
			}
			if desc.Address == g.mu.is.NodeAddr {
				return nil
			}
			g.maybeAddResolver(desc.Address)
			g.maybeAddBootstrapAddress(desc.Address)
		}
		return nil
	}); err != nil {
		log.Error(ctx, err)
		return
	}

	if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
		log.Error(ctx, err)
	}
}
// sendPartialBatch sends the supplied batch to the range specified by // desc. The batch request is first truncated so that it contains only // requests which intersect the range descriptor and keys for each // request are limited to the range's key span. The send occurs in a // retry loop to handle send failures. On failure to send to any // replicas, we backoff and retry by refetching the range // descriptor. If the underlying range seems to have split, we // recursively invoke divideAndSendBatchToRanges to re-enumerate the // ranges in the span and resend to each. func (ds *DistSender) sendPartialBatch( ctx context.Context, ba roachpb.BatchRequest, rs roachpb.RSpan, desc *roachpb.RangeDescriptor, evictToken *EvictionToken, isFirst bool, ) response { var reply *roachpb.BatchResponse var pErr *roachpb.Error isReverse := ba.IsReverse() // Truncate the request to range descriptor. intersected, err := rs.Intersect(desc) if err != nil { return response{pErr: roachpb.NewError(err)} } truncBA, numActive, err := truncate(ba, intersected) if numActive == 0 && err == nil { // This shouldn't happen in the wild, but some tests exercise it. return response{ pErr: roachpb.NewErrorf("truncation resulted in empty batch on %s: %s", intersected, ba), } } if err != nil { return response{pErr: roachpb.NewError(err)} } // Start a retry loop for sending the batch to the range. for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { // If we've cleared the descriptor on a send failure, re-lookup. if desc == nil { var descKey roachpb.RKey if isReverse { descKey = intersected.EndKey } else { descKey = intersected.Key } desc, evictToken, err = ds.getDescriptor(ctx, descKey, nil, isReverse) if err != nil { log.ErrEventf(ctx, "range descriptor re-lookup failed: %s", err) continue } } reply, pErr = ds.sendSingleRange(ctx, truncBA, desc) // If sending succeeded, return immediately. if pErr == nil { return response{reply: reply} } log.ErrEventf(ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. log.Event(ctx, "evicting range descriptor on send error and backoff for re-lookup") if err := evictToken.Evict(ctx); err != nil { return response{pErr: roachpb.NewError(err)} } // Clear the descriptor to reload on the next attempt. desc = nil continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. 
var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(isReverse, tErr.SuggestedRange, rs) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return response{pErr: roachpb.NewError(err)} } // On addressing errors (likely a split), we need to re-invoke // the range descriptor lookup machinery, so we recurse by // sending batch to just the partial span this descriptor was // supposed to cover. log.VEventf(ctx, 1, "likely split; resending batch to span: %s", tErr) reply, pErr = ds.divideAndSendBatchToRanges(ctx, ba, intersected, isFirst) return response{reply: reply, pErr: pErr} } break } // Propagate error if either the retry closer or context done // channels were closed. if pErr == nil { if pErr = ds.deduceRetryEarlyExitError(ctx); pErr == nil { log.Fatal(ctx, "exited retry loop without an error") } } return response{pErr: pErr} }
// maybePushTransactions tries to push the conflicting transaction(s)
// responsible for the given intents: either move their timestamps forward on
// a read/write conflict, abort them on a write/write conflict, or do nothing
// if the transaction is no longer pending.
//
// Returns a slice of intents which can now be resolved, and an error.
// The returned intents should be resolved via intentResolver.resolveIntents.
//
// If skipIfInFlight is true, then no PushTxns will be sent and no
// intents will be returned for any transaction for which there is
// another push in progress. This should only be used by callers who
// are not relying on the side effect of a push (i.e. only
// pushType==PUSH_TOUCH), and who also don't need to synchronize with
// the resolution of those intents (e.g. asynchronous resolutions of
// intents skipped on inconsistent reads).
//
// Callers are involved with
// a) conflict resolution for commands being executed at the Store with the
//    client waiting,
// b) resolving intents encountered during inconsistent operations, and
// c) resolving intents upon EndTransaction which are not local to the given
//    range. This is the only path in which the transaction is going to be
//    in non-pending state and doesn't require a push.
func (ir *intentResolver) maybePushTransactions(
	ctx context.Context,
	intents []roachpb.Intent,
	h roachpb.Header,
	pushType roachpb.PushTxnType,
	skipIfInFlight bool,
) ([]roachpb.Intent, *roachpb.Error) {
	now := ir.store.Clock().Now()

	partialPusherTxn := h.Txn
	// If there's no pusher, we communicate a priority by sending an empty
	// txn with only the priority set. This is official usage of PushTxn.
	if partialPusherTxn == nil {
		partialPusherTxn = &roachpb.Transaction{
			TxnMeta: enginepb.TxnMeta{
				Priority: roachpb.MakePriority(h.UserPriority),
			},
		}
	}

	log.Event(ctx, "pushing transaction")

	// Split intents into those we need to push and those which are good to
	// resolve.
	ir.mu.Lock()
	// TODO(tschottdorf): can optimize this and use same underlying slice.
	var pushIntents, nonPendingIntents []roachpb.Intent
	for _, intent := range intents {
		if intent.Status != roachpb.PENDING {
			// The current intent does not need conflict resolution
			// because the transaction is already finalized.
			// This shouldn't happen as all intents created are in
			// the PENDING status.
			nonPendingIntents = append(nonPendingIntents, intent)
		} else if _, ok := ir.mu.inFlight[*intent.Txn.ID]; ok && skipIfInFlight {
			// Another goroutine is working on this transaction so we can
			// skip it.
			if log.V(1) {
				log.Infof(ctx, "skipping PushTxn for %s; attempt already in flight", intent.Txn.ID)
			}
			continue
		} else {
			pushIntents = append(pushIntents, intent)
			ir.mu.inFlight[*intent.Txn.ID]++
		}
	}
	ir.mu.Unlock()
	if len(nonPendingIntents) > 0 {
		return nil, roachpb.NewError(errors.Errorf("unexpected aborted/resolved intents: %+v",
			nonPendingIntents))
	}

	// Attempt to push the transaction(s) which created the conflicting intent(s).
	var pushReqs []roachpb.Request
	for _, intent := range pushIntents {
		pushReqs = append(pushReqs, &roachpb.PushTxnRequest{
			Span: roachpb.Span{
				Key: intent.Txn.Key,
			},
			PusherTxn: *partialPusherTxn,
			PusheeTxn: intent.Txn,
			PushTo:    h.Timestamp,
			// The timestamp is used by PushTxn for figuring out whether the
			// transaction is abandoned. If we used the argument's timestamp
			// here, we would run into busy loops because that timestamp
			// usually stays fixed among retries, so it will never realize
			// that a transaction has timed out. See #877.
			Now:      now,
			PushType: pushType,
		})
	}
	b := &client.Batch{}
	b.AddRawRequest(pushReqs...)
	var pErr *roachpb.Error
	if err := ir.store.db.Run(ctx, b); err != nil {
		pErr = b.MustPErr()
	}
	ir.mu.Lock()
	for _, intent := range pushIntents {
		ir.mu.inFlight[*intent.Txn.ID]--
		if ir.mu.inFlight[*intent.Txn.ID] == 0 {
			delete(ir.mu.inFlight, *intent.Txn.ID)
		}
	}
	ir.mu.Unlock()
	if pErr != nil {
		return nil, pErr
	}
	br := b.RawResponse()

	var resolveIntents []roachpb.Intent
	for i, intent := range pushIntents {
		pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn
		intent.Txn = pushee.TxnMeta
		intent.Status = pushee.Status
		resolveIntents = append(resolveIntents, intent)
	}
	return resolveIntents, nil
}
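// The function's contract (push first, then resolve whatever became
// resolvable) is easiest to see from a caller's perspective. The sketch below
// only exercises maybePushTransactions itself; the exact resolveIntents
// signature is not shown above, so that final step is left as a comment
// rather than guessed at.
func examplePushThenResolve(
	ctx context.Context, ir *intentResolver, intents []roachpb.Intent, h roachpb.Header,
) *roachpb.Error {
	// PUSH_TOUCH only checks whether the pushees are still alive, which is the
	// mode allowed to skip transactions with pushes already in flight.
	resolvable, pErr := ir.maybePushTransactions(ctx, intents, h, roachpb.PUSH_TOUCH, true /* skipIfInFlight */)
	if pErr != nil {
		return pErr
	}
	// Hand `resolvable` to ir.resolveIntents as described in the doc comment
	// above (call elided here).
	_ = resolvable
	return nil
}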
// Start starts the server on the specified port, starts gossip and initializes // the node using the engines from the server's context. // // The passed context can be used to trace the server startup. The context // should represent the general startup operation. func (s *Server) Start(ctx context.Context) error { ctx = s.AnnotateCtx(ctx) startTime := timeutil.Now() tlsConfig, err := s.cfg.GetServerTLSConfig() if err != nil { return err } httpServer := netutil.MakeServer(s.stopper, tlsConfig, s) plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect) })) // The following code is a specialization of util/net.go's ListenAndServe // which adds pgwire support. A single port is used to serve all protocols // (pg, http, h2) via the following construction: // // non-TLS case: // net.Listen -> cmux.New // | // - -> pgwire.Match -> pgwire.Server.ServeConn // - -> cmux.Any -> grpc.(*Server).Serve // // TLS case: // net.Listen -> cmux.New // | // - -> pgwire.Match -> pgwire.Server.ServeConn // - -> cmux.Any -> grpc.(*Server).Serve // // Note that the difference between the TLS and non-TLS cases exists due to // Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments // in util.ListenAndServe for an explanation of how h2c is implemented there // and here. ln, err := net.Listen("tcp", s.cfg.Addr) if err != nil { return err } log.Eventf(ctx, "listening on port %s", s.cfg.Addr) unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr()) if err != nil { return err } s.cfg.Addr = unresolvedListenAddr.String() unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr()) if err != nil { return err } s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String() s.rpcContext.SetLocalInternalServer(s.node) m := cmux.New(ln) pgL := m.Match(pgwire.Match) anyL := m.Match(cmux.Any()) httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr) if err != nil { return err } unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr()) if err != nil { return err } s.cfg.HTTPAddr = unresolvedHTTPAddr.String() workersCtx := s.AnnotateCtx(context.Background()) s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() if err := httpLn.Close(); err != nil { log.Fatal(workersCtx, err) } }) if tlsConfig != nil { httpMux := cmux.New(httpLn) clearL := httpMux.Match(cmux.HTTP1()) tlsL := httpMux.Match(cmux.Any()) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(httpMux.Serve()) }) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL)) }) httpLn = tls.NewListener(tlsL, tlsConfig) } s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(httpServer.Serve(httpLn)) }) s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() netutil.FatalIfUnexpected(anyL.Close()) <-s.stopper.ShouldStop() s.grpc.Stop() }) s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(s.grpc.Serve(anyL)) }) s.stopper.RunWorker(func() { pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background()) netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) { connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String()) if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) { // Report the error on this connection's context, so that we // know which remote client caused the error when looking at // the logs. 
log.Error(connCtx, err) } })) }) if len(s.cfg.SocketFile) != 0 { // Unix socket enabled: postgres protocol only. unixLn, err := net.Listen("unix", s.cfg.SocketFile) if err != nil { return err } s.stopper.RunWorker(func() { <-s.stopper.ShouldQuiesce() if err := unixLn.Close(); err != nil { log.Fatal(workersCtx, err) } }) s.stopper.RunWorker(func() { pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background()) netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) { connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String()) if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) { // Report the error on this connection's context, so that we // know which remote client caused the error when looking at // the logs. log.Error(connCtx, err) } })) }) } // Enable the debug endpoints first to provide an earlier window // into what's going on with the node in advance of exporting node // functionality. // TODO(marc): when cookie-based authentication exists, // apply it for all web endpoints. s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug)) s.gossip.Start(unresolvedAdvertAddr) log.Event(ctx, "started gossip") s.engines, err = s.cfg.CreateEngines() if err != nil { return errors.Wrap(err, "failed to create engines") } s.stopper.AddCloser(&s.engines) // We might have to sleep a bit to protect against this node producing non- // monotonic timestamps. Before restarting, its clock might have been driven // by other nodes' fast clocks, but when we restarted, we lost all this // information. For example, a client might have written a value at a // timestamp that's in the future of the restarted node's clock, and if we // don't do something, the same client's read would not return the written // value. So, we wait up to MaxOffset; we couldn't have served timestamps more // than MaxOffset in the future (assuming that MaxOffset was not changed, see // #9733). // // As an optimization for tests, we don't sleep if all the stores are brand // new. In this case, the node will not serve anything anyway until it // synchronizes with other nodes. { anyStoreBootstrapped := false for _, e := range s.engines { if _, err := storage.ReadStoreIdent(ctx, e); err != nil { // NotBootstrappedError is expected. if _, ok := err.(*storage.NotBootstrappedError); !ok { return err } } else { anyStoreBootstrapped = true break } } if anyStoreBootstrapped { sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime) if sleepDuration > 0 { log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration) time.Sleep(sleepDuration) } } } // Now that we have a monotonic HLC wrt previous incarnations of the process, // init all the replicas. err = s.node.start( ctx, unresolvedAdvertAddr, s.engines, s.cfg.NodeAttributes, s.cfg.Locality, ) if err != nil { return err } log.Event(ctx, "started node") s.nodeLiveness.StartHeartbeat(ctx, s.stopper) // We can now add the node registry. s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt) // Begin recording runtime statistics. s.startSampleEnvironment(s.cfg.MetricsSampleInterval) // Begin recording time series data collected by the status monitor. s.tsDB.PollSource( s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper, ) // Begin recording status summaries. s.node.startWriteSummaries(s.cfg.MetricsSampleInterval) // Create and start the schema change manager only after a NodeID // has been assigned. 
testingKnobs := &sql.SchemaChangerTestingKnobs{} if s.cfg.TestingKnobs.SQLSchemaChanger != nil { testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs) } sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper) s.distSQLServer.Start() log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr) log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr) log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr) if len(s.cfg.SocketFile) != 0 { log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile) } s.stopper.RunWorker(func() { netutil.FatalIfUnexpected(m.Serve()) }) log.Event(ctx, "accepting connections") // Initialize grpc-gateway mux and context. jsonpb := &protoutil.JSONPb{ EnumsAsInts: true, EmitDefaults: true, Indent: " ", } protopb := new(protoutil.ProtoPb) gwMux := gwruntime.NewServeMux( gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb), gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb), gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb), gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb), gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb), ) gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background())) s.stopper.AddCloser(stop.CloserFn(gwCancel)) // Setup HTTP<->gRPC handlers. conn, err := s.rpcContext.GRPCDial(s.cfg.Addr) if err != nil { return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err) } for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} { if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil { return err } } var uiFileSystem http.FileSystem uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false) if uiDebug { uiFileSystem = http.Dir("pkg/ui") } else { uiFileSystem = &assetfs.AssetFS{ Asset: ui.Asset, AssetDir: ui.AssetDir, AssetInfo: ui.AssetInfo, } } uiFileServer := http.FileServer(uiFileSystem) s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if r.URL.Path == "/" { if uiDebug { r.URL.Path = "debug.html" } else { r.URL.Path = "release.html" } } uiFileServer.ServeHTTP(w, r) })) // TODO(marc): when cookie-based authentication exists, // apply it for all web endpoints. s.mux.Handle(adminPrefix, gwMux) s.mux.Handle(ts.URLPrefix, gwMux) s.mux.Handle(statusPrefix, gwMux) s.mux.Handle("/health", gwMux) s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars)) log.Event(ctx, "added http endpoints") if err := sdnotify.Ready(); err != nil { log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err) } log.Event(ctx, "server ready") return nil }
func (n *Node) batchInternal( ctx context.Context, args *roachpb.BatchRequest, ) (*roachpb.BatchResponse, error) { // TODO(marc): grpc's authentication model (which gives credential access in // the request handler) doesn't really fit with the current design of the // security package (which assumes that TLS state is only given at connection // time) - that should be fixed. if peer, ok := peer.FromContext(ctx); ok { if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok { certUser, err := security.GetCertificateUser(&tlsInfo.State) if err != nil { return nil, err } if certUser != security.NodeUser { return nil, errors.Errorf("user %s is not allowed", certUser) } } } var br *roachpb.BatchResponse type snowballInfo struct { syncutil.Mutex collectedSpans [][]byte done bool } var snowball *snowballInfo if err := n.stopper.RunTaskWithErr(func() error { const opName = "node.Batch" sp, err := tracing.JoinOrNew(n.storeCfg.AmbientCtx.Tracer, args.TraceContext, opName) if err != nil { return err } // If this is a snowball span, it gets special treatment: It skips the // regular tracing machinery, and we instead send the collected spans // back with the response. This is more expensive, but then again, // those are individual requests traced by users, so they can be. if sp.BaggageItem(tracing.Snowball) != "" { sp.LogEvent("delegating to snowball tracing") sp.Finish() snowball = new(snowballInfo) recorder := func(rawSpan basictracer.RawSpan) { snowball.Lock() defer snowball.Unlock() if snowball.done { // This is a late span that we must discard because the request was // already completed. return } encSp, err := tracing.EncodeRawSpan(&rawSpan, nil) if err != nil { log.Warning(ctx, err) } snowball.collectedSpans = append(snowball.collectedSpans, encSp) } if sp, err = tracing.JoinOrNewSnowball(opName, args.TraceContext, recorder); err != nil { return err } } defer sp.Finish() traceCtx := opentracing.ContextWithSpan(ctx, sp) log.Event(traceCtx, args.Summary()) tStart := timeutil.Now() var pErr *roachpb.Error br, pErr = n.stores.Send(traceCtx, *args) if pErr != nil { br = &roachpb.BatchResponse{} log.ErrEventf(traceCtx, "%T", pErr.GetDetail()) } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(n.stores, br)) } n.metrics.callComplete(timeutil.Since(tStart), pErr) br.Error = pErr return nil }); err != nil { return nil, err } if snowball != nil { snowball.Lock() br.CollectedSpans = snowball.collectedSpans snowball.done = true snowball.Unlock() } return br, nil }
// initStores initializes the Stores map from ID to Store. Stores are // added to the local sender if already bootstrapped. A bootstrapped // Store has a valid ident with cluster, node and Store IDs set. If // the Store doesn't yet have a valid ident, it's added to the // bootstraps list for initialization once the cluster and node IDs // have been determined. func (n *Node) initStores( ctx context.Context, engines []engine.Engine, stopper *stop.Stopper, bootstrapped bool, ) error { var bootstraps []*storage.Store if len(engines) == 0 { return errors.Errorf("no engines") } for _, e := range engines { s := storage.NewStore(n.storeCfg, e, &n.Descriptor) log.Eventf(ctx, "created store for engine: %s", e) if bootstrapped { s.NotifyBootstrapped() } // Initialize each store in turn, handling un-bootstrapped errors by // adding the store to the bootstraps list. if err := s.Start(ctx, stopper); err != nil { if _, ok := err.(*storage.NotBootstrappedError); ok { log.Infof(ctx, "store %s not bootstrapped", s) bootstraps = append(bootstraps, s) continue } return errors.Errorf("failed to start store: %s", err) } if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 { return errors.Errorf("unidentified store: %s", s) } capacity, err := s.Capacity() if err != nil { return errors.Errorf("could not query store capacity: %s", err) } log.Infof(ctx, "initialized store %s: %+v", s, capacity) n.addStore(s) } // If there are no initialized stores and no gossip resolvers, // bootstrap this node as the seed of a new cluster. if n.stores.GetStoreCount() == 0 { resolvers := n.storeCfg.Gossip.GetResolvers() // Check for the case of uninitialized node having only itself specified as join host. switch len(resolvers) { case 0: return errNeedsBootstrap case 1: if resolvers[0].Addr() == n.Descriptor.Address.String() { return errCannotJoinSelf } } } // Verify all initialized stores agree on cluster and node IDs. if err := n.validateStores(); err != nil { return err } log.Event(ctx, "validated stores") // Set the stores map as the gossip persistent storage, so that // gossip can bootstrap using the most recently persisted set of // node addresses. if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil { return fmt.Errorf("failed to initialize the gossip interface: %s", err) } // Connect gossip before starting bootstrap. For new nodes, connecting // to the gossip network is necessary to get the cluster ID. n.connectGossip(ctx) log.Event(ctx, "connected to gossip") // If no NodeID has been assigned yet, allocate a new node ID by // supplying 0 to initNodeID. if n.Descriptor.NodeID == 0 { n.initNodeID(0) n.initialBoot = true log.Eventf(ctx, "allocated node ID %d", n.Descriptor.NodeID) } // Bootstrap any uninitialized stores asynchronously. if len(bootstraps) > 0 { if err := stopper.RunAsyncTask(ctx, func(ctx context.Context) { n.bootstrapStores(ctx, bootstraps, stopper) }); err != nil { return err } } return nil }
func (rq *replicateQueue) processOneChange( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { desc := repl.Desc() // Find the zone config for this range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } action, _ := rq.allocator.ComputeAction(zone, desc) // Avoid taking action if the range has too many dead replicas to make // quorum. deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas) quorum := computeQuorum(len(desc.Replicas)) liveReplicaCount := len(desc.Replicas) - len(deadReplicas) if liveReplicaCount < quorum { return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.") } switch action { case AllocatorAdd: log.Event(ctx, "adding a new replica") newStore, err := rq.allocator.AllocateTarget( zone.Constraints, desc.Replicas, desc.RangeID, true, ) if err != nil { return err } newReplica := roachpb.ReplicaDescriptor{ NodeID: newStore.Node.NodeID, StoreID: newStore.StoreID, } log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica) if err := rq.addReplica(ctx, repl, newReplica, desc); err != nil { return err } case AllocatorRemove: log.Event(ctx, "removing a replica") // If the lease holder (our local store) is an overfull store (in terms of // leases) allow transferring the lease away. leaseHolderStoreID := repl.store.StoreID() if rq.allocator.ShouldTransferLease(zone.Constraints, leaseHolderStoreID, desc.RangeID) { leaseHolderStoreID = 0 } removeReplica, err := rq.allocator.RemoveTarget( zone.Constraints, desc.Replicas, leaseHolderStoreID, ) if err != nil { return err } if removeReplica.StoreID == repl.store.StoreID() { // The local replica was selected as the removal target, but that replica // is the leaseholder, so transfer the lease instead. We don't check that // the current store has too many leases in this case under the // assumption that replica balance is a greater concern. Also note that // AllocatorRemove action takes preference over AllocatorNoop // (rebalancing) which is where lease transfer would otherwise occur. We // need to be able to transfer leases in AllocatorRemove in order to get // out of situations where this store is overfull and yet holds all the // leases. candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas) target := rq.allocator.TransferLeaseTarget( zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID, false /* checkTransferLeaseSource */) if target != (roachpb.ReplicaDescriptor{}) { log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID) if err := repl.AdminTransferLease(target.StoreID); err != nil { return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID) } rq.lastLeaseTransfer.Store(timeutil.Now()) // Do not requeue as we transferred our lease away. 
return nil } } else { log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica) if err := rq.removeReplica(ctx, repl, removeReplica, desc); err != nil { return err } } case AllocatorRemoveDead: log.Event(ctx, "removing a dead replica") if len(deadReplicas) == 0 { if log.V(1) { log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl) } break } deadReplica := deadReplicas[0] log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica) if err := repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil { return err } case AllocatorNoop: // The Noop case will result if this replica was queued in order to // rebalance. Attempt to find a rebalancing target. log.Event(ctx, "considering a rebalance") if rq.canTransferLease() { // We require the lease in order to process replicas, so // repl.store.StoreID() corresponds to the lease-holder's store ID. candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas) target := rq.allocator.TransferLeaseTarget( zone.Constraints, candidates, repl.store.StoreID(), desc.RangeID, true /* checkTransferLeaseSource */) if target != (roachpb.ReplicaDescriptor{}) { log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID) if err := repl.AdminTransferLease(target.StoreID); err != nil { return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID) } rq.lastLeaseTransfer.Store(timeutil.Now()) // Do not requeue as we transferred our lease away. return nil } } rebalanceStore, err := rq.allocator.RebalanceTarget( zone.Constraints, desc.Replicas, repl.store.StoreID(), desc.RangeID, ) if err != nil { log.ErrEventf(ctx, "rebalance target failed %s", err) return nil } if rebalanceStore == nil { log.VEventf(ctx, 1, "no suitable rebalance target") // No action was necessary and no rebalance target was found. Return // without re-queuing this replica. return nil } rebalanceReplica := roachpb.ReplicaDescriptor{ NodeID: rebalanceStore.Node.NodeID, StoreID: rebalanceStore.StoreID, } log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica) if err := rq.addReplica(ctx, repl, rebalanceReplica, desc); err != nil { return err } } return nil }
func (rq *replicateQueue) process( ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig, ) error { desc := repl.Desc() // Find the zone config for this range. zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) if err != nil { return err } action, _ := rq.allocator.ComputeAction(zone, desc) // Avoid taking action if the range has too many dead replicas to make // quorum. deadReplicas := rq.allocator.storePool.deadReplicas(desc.RangeID, desc.Replicas) quorum := computeQuorum(len(desc.Replicas)) liveReplicaCount := len(desc.Replicas) - len(deadReplicas) if liveReplicaCount < quorum { return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.") } switch action { case AllocatorAdd: log.Event(ctx, "adding a new replica") newStore, err := rq.allocator.AllocateTarget( zone.Constraints, desc.Replicas, desc.RangeID, true, ) if err != nil { return err } newReplica := roachpb.ReplicaDescriptor{ NodeID: newStore.Node.NodeID, StoreID: newStore.StoreID, } log.VEventf(ctx, 1, "adding replica to %+v due to under-replication", newReplica) if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil { return err } case AllocatorRemove: log.Event(ctx, "removing a replica") // We require the lease in order to process replicas, so // repl.store.StoreID() corresponds to the lease-holder's store ID. removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID()) if err != nil { return err } log.VEventf(ctx, 1, "removing replica %+v due to over-replication", removeReplica) if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil { return err } // Do not requeue if we removed ourselves. if removeReplica.StoreID == repl.store.StoreID() { return nil } case AllocatorRemoveDead: log.Event(ctx, "removing a dead replica") if len(deadReplicas) == 0 { if log.V(1) { log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl) } break } deadReplica := deadReplicas[0] log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica) if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil { return err } case AllocatorNoop: log.Event(ctx, "considering a rebalance") // The Noop case will result if this replica was queued in order to // rebalance. Attempt to find a rebalancing target. // // We require the lease in order to process replicas, so // repl.store.StoreID() corresponds to the lease-holder's store ID. rebalanceStore := rq.allocator.RebalanceTarget( zone.Constraints, desc.Replicas, repl.store.StoreID(), desc.RangeID, ) if rebalanceStore == nil { log.VEventf(ctx, 1, "no suitable rebalance target") // No action was necessary and no rebalance target was found. Return // without re-queuing this replica. return nil } rebalanceReplica := roachpb.ReplicaDescriptor{ NodeID: rebalanceStore.Node.NodeID, StoreID: rebalanceStore.StoreID, } log.VEventf(ctx, 1, "rebalancing to %+v", rebalanceReplica) if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil { return err } } // Enqueue this replica again to see if there are more changes to be made. rq.MaybeAdd(repl, rq.clock.Now()) return nil }
// lookupRangeDescriptorInternal is called from LookupRangeDescriptor or from tests. // // If a WaitGroup is supplied, it is signaled when the request is // added to the inflight request map (with or without merging) or the // function finishes. Used for testing. func (rdc *rangeDescriptorCache) lookupRangeDescriptorInternal( ctx context.Context, key roachpb.RKey, evictToken *EvictionToken, useReverseScan bool, wg *sync.WaitGroup, ) (*roachpb.RangeDescriptor, *EvictionToken, error) { rdc.rangeCache.RLock() doneWg := func() { if wg != nil { wg.Done() } wg = nil } defer doneWg() if _, desc, err := rdc.getCachedRangeDescriptorLocked(key, useReverseScan); err != nil { rdc.rangeCache.RUnlock() return nil, nil, err } else if desc != nil { rdc.rangeCache.RUnlock() returnToken := rdc.makeEvictionToken(desc, func() error { return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan) }) log.Event(ctx, "looked up range descriptor from cache") return desc, returnToken, nil } if log.V(3) { log.Infof(ctx, "lookup range descriptor: key=%s\n%s", key, rdc.stringLocked()) } else if log.V(2) { log.Infof(ctx, "lookup range descriptor: key=%s", key) } var res lookupResult requestKey := makeLookupRequestKey(key, evictToken, useReverseScan) rdc.lookupRequests.Lock() if req, inflight := rdc.lookupRequests.inflight[requestKey]; inflight { resC := make(chan lookupResult, 1) req.observers = append(req.observers, resC) rdc.lookupRequests.inflight[requestKey] = req rdc.lookupRequests.Unlock() rdc.rangeCache.RUnlock() doneWg() res = <-resC log.Event(ctx, "looked up range descriptor with shared request") } else { rdc.lookupRequests.inflight[requestKey] = req rdc.lookupRequests.Unlock() rdc.rangeCache.RUnlock() doneWg() rs, preRs, err := rdc.performRangeLookup(ctx, key, useReverseScan) if err != nil { res = lookupResult{err: err} } else { switch len(rs) { case 0: res = lookupResult{err: fmt.Errorf("no range descriptors returned for %s", key)} case 1: desc := &rs[0] res = lookupResult{ desc: desc, evictToken: rdc.makeEvictionToken(desc, func() error { return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan) }), } case 2: desc := &rs[0] nextDesc := rs[1] res = lookupResult{ desc: desc, evictToken: rdc.makeEvictionToken(desc, func() error { return rdc.insertRangeDescriptorsLocked(nextDesc) }), } default: panic(fmt.Sprintf("more than 2 matching range descriptors returned for %s: %v", key, rs)) } } // We want to be assured that all goroutines which experienced a cache miss // have joined our in-flight request, and all others will experience a // cache hit. This requires atomicity across cache population and // notification, hence this exclusive lock. rdc.rangeCache.Lock() if res.err == nil { // These need to be separate because we need to preserve the pointer to rs[0] // so that the seenDesc logic works correctly in EvictCachedRangeDescriptor. An // append could cause a copy, which would change the address of rs[0]. We insert // the prefetched descriptors first to avoid any unintended overwriting. if err := rdc.insertRangeDescriptorsLocked(preRs...); err != nil { log.Warningf(ctx, "range cache inserting prefetched descriptors failed: %v", err) } if err := rdc.insertRangeDescriptorsLocked(rs...); err != nil { res = lookupResult{err: err} } } // rdc.lookupRequests does not need to be locked here because we hold an exclusive // write lock on rdc.rangeCache. However, we do anyway for clarity and future proofing. 
rdc.lookupRequests.Lock() for _, observer := range rdc.lookupRequests.inflight[requestKey].observers { observer <- res } delete(rdc.lookupRequests.inflight, requestKey) rdc.lookupRequests.Unlock() rdc.rangeCache.Unlock() log.Event(ctx, "looked up range descriptor") } // It rarely may be possible that we got grouped in with the wrong // RangeLookup (eg. from a double split), so if we did, return an error with // an unmodified eviction token. if desc := res.desc; desc != nil { containsFn := (*roachpb.RangeDescriptor).ContainsKey if useReverseScan { containsFn = (*roachpb.RangeDescriptor).ContainsExclusiveEndKey } if !containsFn(desc, key) { return nil, evictToken, errors.Errorf("key %q not contained in range lookup's resulting descriptor %v", key, desc) } } return res.desc, res.evictToken, res.err }
// process performs a consistent lookup on the range descriptor to see if we
// are still a member of the range.
func (rgcq *replicaGCQueue) process(
	ctx context.Context, repl *Replica, _ config.SystemConfig,
) error {
	// Note that the Replicas field of desc is probably out of date, so
	// we should only use `desc` for its static fields like RangeID and
	// StartKey (and avoid rng.GetReplica() for the same reason).
	desc := repl.Desc()

	// Calls to RangeLookup typically use inconsistent reads, but we
	// want to do a consistent read here. This is important when we are
	// considering one of the metadata ranges: we must not do an
	// inconsistent lookup in our own copy of the range.
	b := &client.Batch{}
	b.AddRawRequest(&roachpb.RangeLookupRequest{
		Span: roachpb.Span{
			Key: keys.RangeMetaKey(desc.StartKey),
		},
		MaxRanges: 1,
	})
	if err := rgcq.db.Run(ctx, b); err != nil {
		return err
	}
	br := b.RawResponse()
	reply := br.Responses[0].GetInner().(*roachpb.RangeLookupResponse)

	if len(reply.Ranges) != 1 {
		return errors.Errorf("expected 1 range descriptor, got %d", len(reply.Ranges))
	}

	replyDesc := reply.Ranges[0]
	if _, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID()); !currentMember {
		// We are no longer a member of this range; clean up our local data.
		rgcq.metrics.RemoveReplicaCount.Inc(1)
		log.VEventf(ctx, 1, "destroying local data")
		if err := repl.store.RemoveReplica(ctx, repl, replyDesc, true); err != nil {
			return err
		}
	} else if desc.RangeID != replyDesc.RangeID {
		// If we get a different range ID back, then the range has been merged
		// away. But currentMember is true, so we are still a member of the
		// subsuming range. Shut down raft processing for the former range
		// and delete any remaining metadata, but do not delete the data.
		rgcq.metrics.RemoveReplicaCount.Inc(1)
		log.VEventf(ctx, 1, "removing merged range")
		if err := repl.store.RemoveReplica(ctx, repl, replyDesc, false); err != nil {
			return err
		}
		// TODO(bdarnell): remove raft logs and other metadata (while leaving a
		// tombstone). Add tests for GC of merged ranges.
	} else {
		// This replica is a current member of the raft group. Set the last
		// replica GC check time to avoid re-processing for another check
		// interval.
		//
		// TODO(tschottdorf): should keep stats in particular on this outcome
		// but also on how good a job the queue does at inspecting every
		// Replica (see #8111) when inactive ones can be starved by
		// event-driven additions.
		log.Event(ctx, "not gc'able")
		if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil {
			return err
		}
	}
	return nil
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool { tc.Lock() txnMeta := tc.txns[txnID] txn := txnMeta.txn.Clone() hasAbandoned := txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) tc.Unlock() if txn.Status != roachpb.PENDING { // A previous iteration has already determined that the transaction is // already finalized, so we wait for the client to realize that and // want to keep our state for the time being (to dish out the right // error once it returns). return true } // Before we send a heartbeat, determine whether this transaction should be // considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then // it is a cancellable Context and we skip this check and use the ctx lifetime // instead of a timeout. // // TODO(andrei): We should disallow non-cancellable contexts in the heartbeat // goroutine and enforce that our kv client cancels the context when it's // done. We get non-cancellable contexts from remote clients // (roachpb.ExternalClient) because we override the gRPC context to make it // non-cancellable in DBServer.Batch (as that context is not tied to a txn // lifetime). // Further note that, unfortunately, the Sender interface generally makes it // difficult for the TxnCoordSender to get a context with the same lifetime as // the transaction (the TxnCoordSender associates the context of the txn's // first write with the txn). We should move to using only use local clients // (i.e. merge, or at least co-locate client.Txn and the TxnCoordSender). At // that point, we probably don't even need to deal with context cancellation // any more; the client will be trusted to always send an EndRequest when it's // done with a transaction. if ctx.Done() == nil && hasAbandoned { if log.V(1) { log.Infof(ctx, "transaction %s abandoned; stopping heartbeat", txnMeta.txn) } tc.tryAsyncAbort(txnID) return false } ba := roachpb.BatchRequest{} ba.Txn = &txn hb := &roachpb.HeartbeatTxnRequest{ Now: tc.clock.Now(), } hb.Key = txn.Key ba.Add(hb) log.Event(ctx, "heartbeat") br, pErr := tc.wrapped.Send(ctx, ba) // Correctness mandates that when we can't heartbeat the transaction, we // make sure the client doesn't keep going. This is particularly relevant // in the case of an ABORTED transaction, but if we can't reach the // transaction record at all, we're going to have to assume we're aborted // as well. if pErr != nil { log.Warningf(ctx, "heartbeat to %s failed: %s", txn, pErr) // We're not going to let the client carry out additional requests, so // try to clean up. tc.tryAsyncAbort(*txn.ID) txn.Status = roachpb.ABORTED } else { txn.Update(br.Responses[0].GetInner().(*roachpb.HeartbeatTxnResponse).Txn) } // Give the news to the txn in the txns map. This will update long-running // transactions (which may find out that they have to restart in that way), // but in particular makes sure that they notice when they've been aborted // (in which case we'll give them an error on their next request). tc.Lock() tc.txns[txnID].txn.Update(&txn) tc.Unlock() return true }
// updateState updates the transaction state in both the success and // error cases, applying those updates to the corresponding txnMeta // object when adequate. It also updates certain errors with the // updated transaction for use by client restarts. func (tc *TxnCoordSender) updateState( ctx context.Context, startNS int64, ba roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error, ) *roachpb.Error { tc.Lock() defer tc.Unlock() if ba.Txn == nil { // Not a transactional request. return pErr } var newTxn roachpb.Transaction newTxn.Update(ba.Txn) if pErr == nil { newTxn.Update(br.Txn) } else if errTxn := pErr.GetTxn(); errTxn != nil { newTxn.Update(errTxn) } switch t := pErr.GetDetail().(type) { case *roachpb.OpRequiresTxnError: panic("OpRequiresTxnError must not happen at this level") case *roachpb.ReadWithinUncertaintyIntervalError: // If the reader encountered a newer write within the uncertainty // interval, we advance the txn's timestamp just past the last observed // timestamp from the node. restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode) if !ok { pErr = roachpb.NewError(errors.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode)) } else { newTxn.Timestamp.Forward(restartTS) newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp) } case *roachpb.TransactionAbortedError: // Increase timestamp if applicable. newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp) newTxn.Priority = pErr.GetTxn().Priority // Clean up the freshly aborted transaction in defer(), avoiding a // race with the state update below. defer tc.cleanupTxnLocked(ctx, newTxn) case *roachpb.TransactionPushError: // Increase timestamp if applicable, ensuring that we're // just ahead of the pushee. newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp) newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp) case *roachpb.TransactionRetryError: // Increase timestamp so on restart, we're ahead of any timestamp // cache entries or newer versions which caused the restart. newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp) case *roachpb.WriteTooOldError: newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp) case nil: // Nothing to do here, avoid the default case. default: // Do not clean up the transaction since we're leaving cancellation of // the transaction up to the client. For example, on seeing an error, // like TransactionStatusError or ConditionFailedError, the client // will call Txn.CleanupOnError() which will cleanup the transaction // and its intents. Therefore leave the transaction in the PENDING // state and do not call cleanTxnLocked(). } txnID := *newTxn.ID txnMeta := tc.txns[txnID] // For successful transactional requests, keep the written intents and // the updated transaction record to be sent along with the reply. // The transaction metadata is created with the first writing operation. // A tricky edge case is that of a transaction which "fails" on the // first writing request, but actually manages to write some intents // (for example, due to being multi-range). In this case, there will // be an error, but the transaction will be marked as Writing and the // coordinator must track the state, for the client's retry will be // performed with a Writing transaction which the coordinator rejects // unless it is tracking it (on top of it making sense to track it; // after all, it **has** laid down intents and only the coordinator // can augment a potential EndTransaction call). See #3303. 
if txnMeta != nil || pErr == nil || newTxn.Writing { // Adding the intents even on error reduces the likelihood of dangling // intents blocking concurrent writers for extended periods of time. // See #3346. var keys []roachpb.Span if txnMeta != nil { keys = txnMeta.keys } ba.IntentSpanIterate(br, func(key, endKey roachpb.Key) { keys = append(keys, roachpb.Span{ Key: key, EndKey: endKey, }) }) if txnMeta != nil { txnMeta.keys = keys } else if len(keys) > 0 { if !newTxn.Writing { panic("txn with intents marked as non-writing") } // If the transaction is already over, there's no point in // launching a one-off coordinator which will shut down right // away. If we ended up here with an error, we'll always start // the coordinator - the transaction has laid down intents, so // we expect it to be committed/aborted at some point in the // future. if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding { log.Event(ctx, "coordinator spawns") txnMeta = &txnMetadata{ txn: newTxn, keys: keys, firstUpdateNanos: startNS, lastUpdateNanos: tc.clock.PhysicalNow(), timeoutDuration: tc.clientTimeout, txnEnd: make(chan struct{}), } tc.txns[txnID] = txnMeta if err := tc.stopper.RunAsyncTask(ctx, func(ctx context.Context) { tc.heartbeatLoop(ctx, txnID) }); err != nil { // The system is already draining and we can't start the // heartbeat. We refuse new transactions for now because // they're likely not going to have all intents committed. // In principle, we can relax this as needed though. tc.unregisterTxnLocked(txnID) return roachpb.NewError(err) } } else { // If this was a successful one phase commit, update stats // directly as they won't otherwise be updated on heartbeat // loop shutdown. etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse) tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit) } } } // Update our record of this transaction, even on error. if txnMeta != nil { txnMeta.txn.Update(&newTxn) if !txnMeta.txn.Writing { panic("tracking a non-writing txn") } txnMeta.setLastUpdate(tc.clock.PhysicalNow()) } if pErr == nil { // For successful transactional requests, always send the updated txn // record back. Note that we make sure not to share data with newTxn // (which may have made it into txnMeta). if br.Txn != nil { br.Txn.Update(&newTxn) } else { clonedTxn := newTxn.Clone() br.Txn = &clonedTxn } } else if pErr.GetTxn() != nil { // Avoid changing existing errors because sometimes they escape into // goroutines and data races can occur. pErrShallow := *pErr pErrShallow.SetTxn(&newTxn) // SetTxn clones newTxn pErr = &pErrShallow } return pErr }