// NewTxnCoordSender creates a new TxnCoordSender for use from a KV // distributed DB instance. // ctx is the base context and is used for logs and traces when there isn't a // more specific context available; it must have a Tracer set. func NewTxnCoordSender( ctx context.Context, wrapped client.Sender, clock *hlc.Clock, linearizable bool, stopper *stop.Stopper, txnMetrics TxnMetrics, ) *TxnCoordSender { if ctx == nil || tracing.TracerFromCtx(ctx) == nil { panic("ctx with tracer must be supplied") } if ctx.Done() != nil { panic("context with cancel or deadline") } tc := &TxnCoordSender{ ctx: ctx, wrapped: wrapped, clock: clock, heartbeatInterval: base.DefaultHeartbeatInterval, clientTimeout: defaultClientTimeout, txns: map[uuid.UUID]*txnMetadata{}, linearizable: linearizable, stopper: stopper, metrics: txnMetrics, } tc.stopper.RunWorker(tc.startStats) return tc }
// heartbeatLoop periodically sends a HeartbeatTxn RPC to an extant transaction, // stopping in the event the transaction is aborted or committed after // attempting to resolve the intents. When the heartbeat stops, the transaction // is unregistered from the coordinator. // // TODO(dan): The Context we use for this is currently the one from the first // request in a Txn, but the semantics of this aren't good. Each context has its // own associated lifetime and we're ignoring all but the first. It happens now // that we pass the same one in every request, but it's brittle to rely on this // forever. // TODO(wiz): Update (*DBServer).Batch to not use context.TODO(). func (tc *TxnCoordSender) heartbeatLoop(ctx context.Context, txnID uuid.UUID) { var tickChan <-chan time.Time { ticker := time.NewTicker(tc.heartbeatInterval) tickChan = ticker.C defer ticker.Stop() } defer func() { tc.Lock() duration, restarts, status := tc.unregisterTxnLocked(txnID) tc.Unlock() tc.updateStats(duration, restarts, status, false) }() var closer <-chan struct{} // TODO(tschottdorf): this should join to the trace of the request // which starts this goroutine. sp := tracing.TracerFromCtx(tc.ctx).StartSpan(opHeartbeatLoop) defer sp.Finish() ctx = opentracing.ContextWithSpan(ctx, sp) { tc.Lock() txnMeta := tc.txns[txnID] // do not leak to outer scope closer = txnMeta.txnEnd tc.Unlock() } if closer == nil { // Avoid race in which a Txn is cleaned up before the heartbeat // goroutine gets a chance to start. return } // Loop with ticker for periodic heartbeats. for { select { case <-tickChan: if !tc.heartbeat(ctx, txnID) { return } case <-closer: // Transaction finished normally. return case <-ctx.Done(): // Note that if ctx is not cancellable, then ctx.Done() returns a nil // channel, which blocks forever. In this case, the heartbeat loop is // responsible for timing out transactions. If ctx.Done() is not nil, then // then heartbeat loop ignores the timeout check and this case is // responsible for client timeouts. tc.tryAsyncAbort(txnID) return case <-tc.stopper.ShouldQuiesce(): return } } }
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one // which doesn't need to be subdivided further before going to a range (so no // mixing of forward and reverse scans, etc). The parameters and return values // correspond to client.Sender with the exception of the returned boolean, // which is true when indicating that the caller should retry but needs to send // EndTransaction in a separate request. func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) { isReverse := ba.IsReverse() // TODO(radu): when contexts are properly plumbed, we should be able to get // the tracer from ctx, not from the DistSender. ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx)) defer cleanup() // The minimal key range encompassing all requests contained within. // Local addressing has already been resolved. // TODO(tschottdorf): consider rudimentary validation of the batch here // (for example, non-range requests with EndKey, or empty key ranges). rs, err := keys.Range(ba) if err != nil { return nil, roachpb.NewError(err), false } var br *roachpb.BatchResponse // Send the request to one range per iteration. for { // Increase the sequence counter only once before sending RPCs to // the ranges involved in this chunk of the batch (as opposed to for // each RPC individually). On RPC errors, there's no guarantee that // the request hasn't made its way to the target regardless of the // error; we'd like the second execution to be caught by the sequence // cache if that happens. There is a small chance that that we address // a range twice in this chunk (stale/suboptimal descriptors due to // splits/merges) which leads to a transaction retry. // TODO(tschottdorf): it's possible that if we don't evict from the // cache we could be in for a busy loop. ba.SetNewRequest() var curReply *roachpb.BatchResponse var desc *roachpb.RangeDescriptor var evictToken *evictionToken var needAnother bool var pErr *roachpb.Error var finished bool var numAttempts int for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); { numAttempts++ { const magicLogCurAttempt = 20 var seq int32 if ba.Txn != nil { seq = ba.Txn.Sequence } if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 { // Log a message if a request appears to get stuck for a long // time or, potentially, forever. See #8975. // The local counter captures this loop here; the Sequence number // should capture anything higher up (as it needs to be // incremented every time this method is called). log.Warningf( ctx, "%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s", numAttempts, seq, pErr, rs, ba, ) } } // Get range descriptor (or, when spanning range, descriptors). Our // error handling below may clear them on certain errors, so we // refresh (likely from the cache) on every retry. log.Trace(ctx, "meta descriptor lookup") var err error desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse) // getDescriptors may fail retryably if, for example, the first // range isn't available via Gossip. Assume that all errors at // this level are retryable. Non-retryable errors would be for // things like malformed requests which we should have checked // for before reaching this point. if err != nil { log.Trace(ctx, "range descriptor lookup failed: "+err.Error()) if log.V(1) { log.Warning(ctx, err) } pErr = roachpb.NewError(err) continue } if needAnother && br == nil { // TODO(tschottdorf): we should have a mechanism for discovering // range merges (descriptor staleness will mostly go unnoticed), // or we'll be turning single-range queries into multi-range // queries for no good reason. // If there's no transaction and op spans ranges, possibly // re-run as part of a transaction for consistency. The // case where we don't need to re-run is if the read // consistency is not required. if ba.Txn == nil && ba.IsPossibleTransaction() && ba.ReadConsistency != roachpb.INCONSISTENT { return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false } // If the request is more than but ends with EndTransaction, we // want the caller to come again with the EndTransaction in an // extra call. if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction { return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */ } } // It's possible that the returned descriptor misses parts of the // keys it's supposed to scan after it's truncated to match the // descriptor. Example revscan [a,g), first desc lookup for "g" // returns descriptor [c,d) -> [d,g) is never scanned. // We evict and retry in such a case. includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool { if isReverse { return desc.ContainsExclusiveEndKey(rs.EndKey) } return desc.ContainsKey(rs.Key) } if !includesFrontOfCurSpan(desc) { if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() continue } curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) { // Truncate the request to our current key range. intersected, iErr := rs.Intersect(desc) if iErr != nil { return nil, roachpb.NewError(iErr) } truncBA, numActive, trErr := truncate(ba, intersected) if numActive == 0 && trErr == nil { // This shouldn't happen in the wild, but some tests // exercise it. return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s", rs.Key, rs.EndKey, ba) } if trErr != nil { return nil, roachpb.NewError(trErr) } return ds.sendSingleRange(ctx, truncBA, desc) }() // If sending succeeded, break this loop. if pErr == nil { finished = true break } log.VTracef(1, ctx, "reply error %s: %s", ba, pErr) // Error handling: If the error indicates that our range // descriptor is out of date, evict it from the cache and try // again. Errors that apply only to a single replica were // handled in send(). // // TODO(bdarnell): Don't retry endlessly. If we fail twice in a // row and the range descriptor hasn't changed, return the error // to our caller. switch tErr := pErr.GetDetail().(type) { case *roachpb.SendError: // We've tried all the replicas without success. Either // they're all down, or we're using an out-of-date range // descriptor. Invalidate the cache and try again with the new // metadata. if err := evictToken.Evict(ctx); err != nil { return nil, roachpb.NewError(err), false } continue case *roachpb.RangeKeyMismatchError: // Range descriptor might be out of date - evict it. This is // likely the result of a range split. If we have new range // descriptors, insert them instead as long as they are different // from the last descriptor to avoid endless loops. var replacements []roachpb.RangeDescriptor different := func(rd *roachpb.RangeDescriptor) bool { return !desc.RSpan().Equal(rd.RSpan()) } if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) { replacements = append(replacements, *tErr.MismatchedRange) } if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) { if includesFrontOfCurSpan(tErr.SuggestedRange) { replacements = append(replacements, *tErr.SuggestedRange) } } // Same as Evict() if replacements is empty. if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil { return nil, roachpb.NewError(err), false } // On addressing errors, don't backoff; retry immediately. r.Reset() if log.V(1) { log.Warning(ctx, tErr) } continue } break } // Immediately return if querying a range failed non-retryably. if pErr != nil { return nil, pErr, false } else if !finished { select { case <-ds.rpcRetryOptions.Closer: return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false case <-ctx.Done(): return nil, roachpb.NewError(ctx.Err()), false default: log.Fatal(ctx, "exited retry loop with nil error but finished=false") } } ba.UpdateTxn(curReply.Txn) if br == nil { // First response from a Range. br = curReply } else { // This was the second or later call in a cross-Range request. // Combine the new response with the existing one. if err := br.Combine(curReply); err != nil { return nil, roachpb.NewError(err), false } } if isReverse { // In next iteration, query previous range. // We use the StartKey of the current descriptor as opposed to the // EndKey of the previous one since that doesn't have bugs when // stale descriptors come into play. rs.EndKey, err = prev(ba, desc.StartKey) } else { // In next iteration, query next range. // It's important that we use the EndKey of the current descriptor // as opposed to the StartKey of the next one: if the former is stale, // it's possible that the next range has since merged the subsequent // one, and unless both descriptors are stale, the next descriptor's // StartKey would move us to the beginning of the current range, // resulting in a duplicate scan. rs.Key, err = next(ba, desc.EndKey) } if err != nil { return nil, roachpb.NewError(err), false } if ba.MaxSpanRequestKeys > 0 { // Count how many results we received. var numResults int64 for _, resp := range curReply.Responses { numResults += resp.GetInner().Header().NumKeys } if numResults > ba.MaxSpanRequestKeys { panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys)) } ba.MaxSpanRequestKeys -= numResults if ba.MaxSpanRequestKeys == 0 { // prepare the batch response after meeting the max key limit. fillSkippedResponses(ba, br, rs) // done, exit loop. return br, nil, false } } // If this was the last range accessed by this call, exit loop. if !needAnother { return br, nil, false } // key cannot be less that the end key. if !rs.Key.Less(rs.EndKey) { panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey)) } log.Trace(ctx, "querying next range") } }
// NewDistSender returns a batch.Sender instance which connects to the // Cockroach cluster via the supplied gossip instance. Supplying a // DistSenderContext or the fields within is optional. For omitted values, sane // defaults will be used. func NewDistSender(cfg *DistSenderConfig, g *gossip.Gossip) *DistSender { if cfg == nil { cfg = &DistSenderConfig{} } ds := &DistSender{gossip: g} ds.Ctx = cfg.Ctx if ds.Ctx == nil { ds.Ctx = context.Background() } if ds.Ctx.Done() != nil { panic("context with cancel or deadline") } if tracing.TracerFromCtx(ds.Ctx) == nil { ds.Ctx = tracing.WithTracer(ds.Ctx, tracing.NewTracer()) } ds.clock = cfg.Clock if ds.clock == nil { ds.clock = hlc.NewClock(hlc.UnixNano) } if cfg.nodeDescriptor != nil { atomic.StorePointer(&ds.nodeDescriptor, unsafe.Pointer(cfg.nodeDescriptor)) } rcSize := cfg.RangeDescriptorCacheSize if rcSize <= 0 { rcSize = defaultRangeDescriptorCacheSize } rdb := cfg.RangeDescriptorDB if rdb == nil { rdb = ds } ds.rangeCache = newRangeDescriptorCache(rdb, int(rcSize)) lcSize := cfg.LeaseHolderCacheSize if lcSize <= 0 { lcSize = defaultLeaseHolderCacheSize } ds.leaseHolderCache = newLeaseHolderCache(int(lcSize)) if cfg.RangeLookupMaxRanges <= 0 { ds.rangeLookupMaxRanges = defaultRangeLookupMaxRanges } if cfg.TransportFactory != nil { ds.transportFactory = cfg.TransportFactory } ds.rpcRetryOptions = base.DefaultRetryOptions() if cfg.RPCRetryOptions != nil { ds.rpcRetryOptions = *cfg.RPCRetryOptions } if cfg.RPCContext != nil { ds.rpcContext = cfg.RPCContext if ds.rpcRetryOptions.Closer == nil { ds.rpcRetryOptions.Closer = ds.rpcContext.Stopper.ShouldQuiesce() } } if cfg.SendNextTimeout != 0 { ds.sendNextTimeout = cfg.SendNextTimeout } else { ds.sendNextTimeout = defaultSendNextTimeout } if g != nil { g.RegisterCallback(gossip.KeyFirstRangeDescriptor, func(_ string, value roachpb.Value) { if log.V(1) { var desc roachpb.RangeDescriptor if err := value.GetProto(&desc); err != nil { log.Errorf(ds.Ctx, "unable to parse gossipped first range descriptor: %s", err) } else { log.Infof(ds.Ctx, "gossipped first range descriptor: %+v", desc.Replicas) } } err := ds.rangeCache.EvictCachedRangeDescriptor(roachpb.RKeyMin, nil, false) if err != nil { log.Warningf(ds.Ctx, "failed to evict first range descriptor: %s", err) } }) } return ds }
// Send implements the batch.Sender interface. If the request is part of a // transaction, the TxnCoordSender adds the transaction to a map of active // transactions and begins heartbeating it. Every subsequent request for the // same transaction updates the lastUpdate timestamp to prevent live // transactions from being considered abandoned and garbage collected. // Read/write mutating requests have their key or key range added to the // transaction's interval tree of key ranges for eventual cleanup via resolved // write intents; they're tagged to an outgoing EndTransaction request, with // the receiving replica in charge of resolving them. func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) { { // Start new or pick up active trace and embed its trace metadata into // header for use by RPC recipients. From here on, there's always an active // Trace, though its overhead is small unless it's sampled. sp := opentracing.SpanFromContext(ctx) // TODO(radu): once contexts are plumbed correctly, we should use the Tracer // from ctx. tracer := tracing.TracerFromCtx(tc.ctx) if sp == nil { sp = tracer.StartSpan(opTxnCoordSender) defer sp.Finish() ctx = opentracing.ContextWithSpan(ctx, sp) } // TODO(tschottdorf): To get rid of the spurious alloc below we need to // implement the carrier interface on ba.Header or make Span non-nullable, // both of which force all of ba on the Heap. It's already there, so may // not be a big deal, but ba should live on the stack. Also not easy to use // a buffer pool here since anything that goes into the RPC layer could be // used by goroutines we didn't wait for. if ba.Header.Trace == nil { ba.Header.Trace = &tracing.Span{} } else { // We didn't make this object but are about to mutate it, so we // have to take a copy - the original might already have been // passed to the RPC layer. ba.Header.Trace = protoutil.Clone(ba.Header.Trace).(*tracing.Span) } if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.Trace); err != nil { return nil, roachpb.NewError(err) } } startNS := tc.clock.PhysicalNow() if ba.Txn != nil { // If this request is part of a transaction... if err := tc.maybeBeginTxn(&ba); err != nil { return nil, roachpb.NewError(err) } var et *roachpb.EndTransactionRequest var hasET bool { var rArgs roachpb.Request rArgs, hasET = ba.GetArg(roachpb.EndTransaction) if hasET { et = rArgs.(*roachpb.EndTransactionRequest) if len(et.Key) != 0 { return nil, roachpb.NewErrorf("EndTransaction must not have a Key set") } et.Key = ba.Txn.Key if len(et.IntentSpans) > 0 { // TODO(tschottdorf): it may be useful to allow this later. // That would be part of a possible plan to allow txns which // write on multiple coordinators. return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction") } } } if pErr := func() *roachpb.Error { tc.Lock() defer tc.Unlock() if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil { return pErr } if !hasET { return nil } // Everything below is carried out only when trying to commit. // Populate et.IntentSpans, taking into account both any existing // and new writes, and taking care to perform proper deduplication. txnMeta := tc.txns[*ba.Txn.ID] distinctSpans := true if txnMeta != nil { et.IntentSpans = txnMeta.keys // Defensively set distinctSpans to false if we had any previous // requests in this transaction. This effectively limits the distinct // spans optimization to 1pc transactions. distinctSpans = len(txnMeta.keys) == 0 } ba.IntentSpanIterate(func(key, endKey roachpb.Key) { et.IntentSpans = append(et.IntentSpans, roachpb.Span{ Key: key, EndKey: endKey, }) }) // TODO(peter): Populate DistinctSpans on all batches, not just batches // which contain an EndTransactionRequest. var distinct bool // The request might already be used by an outgoing goroutine, so // we can't safely mutate anything in-place (as MergeSpans does). et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...) et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans) ba.Header.DistinctSpans = distinct && distinctSpans if len(et.IntentSpans) == 0 { // If there aren't any intents, then there's factually no // transaction to end. Read-only txns have all of their state // in the client. return roachpb.NewErrorf("cannot commit a read-only transaction") } if txnMeta != nil { txnMeta.keys = et.IntentSpans } return nil }(); pErr != nil { return nil, pErr } if hasET && log.V(1) { for _, intent := range et.IntentSpans { log.Tracef(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey) } } } // Send the command through wrapped sender, taking appropriate measures // on error. var br *roachpb.BatchResponse { var pErr *roachpb.Error br, pErr = tc.wrapped.Send(ctx, ba) if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok { // TODO(tschottdorf): needs to keep the trace. br, pErr = tc.resendWithTxn(ba) } if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil { log.Tracef(ctx, "error: %s", pErr) return nil, pErr } } if br.Txn == nil { return br, nil } if _, ok := ba.GetArg(roachpb.EndTransaction); !ok { return br, nil } // If the --linearizable flag is set, we want to make sure that // all the clocks in the system are past the commit timestamp // of the transaction. This is guaranteed if either // - the commit timestamp is MaxOffset behind startNS // - MaxOffset ns were spent in this function // when returning to the client. Below we choose the option // that involves less waiting, which is likely the first one // unless a transaction commits with an odd timestamp. if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS { startNS = tsNS } sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS) if tc.linearizable && sleepNS > 0 { defer func() { if log.V(1) { log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond)) } time.Sleep(sleepNS) }() } if br.Txn.Status != roachpb.PENDING { tc.Lock() tc.cleanupTxnLocked(ctx, *br.Txn) tc.Unlock() } return br, nil }
// NewServer creates a Server from a server.Context. func NewServer(srvCtx Context, stopper *stop.Stopper) (*Server, error) { if _, err := net.ResolveTCPAddr("tcp", srvCtx.Addr); err != nil { return nil, errors.Errorf("unable to resolve RPC address %q: %v", srvCtx.Addr, err) } if srvCtx.Ctx == nil { srvCtx.Ctx = context.Background() } if srvCtx.Ctx.Done() != nil { panic("context with cancel or deadline") } if tracing.TracerFromCtx(srvCtx.Ctx) == nil { // TODO(radu): instead of modifying srvCtx.Ctx, we should have a separate // context.Context inside Server. We will need to rename server.Context // though. srvCtx.Ctx = tracing.WithTracer(srvCtx.Ctx, tracing.NewTracer()) } if srvCtx.Insecure { log.Warning(srvCtx.Ctx, "running in insecure mode, this is strongly discouraged. See --insecure.") } // Try loading the TLS configs before anything else. if _, err := srvCtx.GetServerTLSConfig(); err != nil { return nil, err } if _, err := srvCtx.GetClientTLSConfig(); err != nil { return nil, err } s := &Server{ mux: http.NewServeMux(), clock: hlc.NewClock(hlc.UnixNano), stopper: stopper, } // Add a dynamic log tag value for the node ID. // // We need to pass the server's Ctx as a base context for the various server // components, but we won't know the node ID until we Start(). At that point // it's too late to change the contexts in the components (various background // processes will have already started using the contexts). // // The dynamic value allows us to add the log tag to the context now and // update the value asynchronously. It's not significantly more expensive than // a regular tag since it's just doing an (atomic) load when a log/trace // message is constructed. s.nodeLogTagVal.Set(log.DynamicIntValueUnknown) srvCtx.Ctx = log.WithLogTag(srvCtx.Ctx, "n", &s.nodeLogTagVal) s.ctx = srvCtx s.clock.SetMaxOffset(srvCtx.MaxOffset) s.rpcContext = rpc.NewContext(srvCtx.Context, s.clock, s.stopper) s.rpcContext.HeartbeatCB = func() { if err := s.rpcContext.RemoteClocks.VerifyClockOffset(); err != nil { log.Fatal(s.Ctx(), err) } } s.grpc = rpc.NewServer(s.rpcContext) s.registry = metric.NewRegistry() s.gossip = gossip.New( s.Ctx(), s.rpcContext, s.grpc, s.ctx.GossipBootstrapResolvers, s.stopper, s.registry) s.storePool = storage.NewStorePool( s.gossip, s.clock, s.rpcContext, srvCtx.ReservationsEnabled, srvCtx.TimeUntilStoreDead, s.stopper, ) // A custom RetryOptions is created which uses stopper.ShouldQuiesce() as // the Closer. This prevents infinite retry loops from occurring during // graceful server shutdown // // Such a loop loop occurs with the DistSender attempts a connection to the // local server during shutdown, and receives an internal server error (HTTP // Code 5xx). This is the correct error for a server to return when it is // shutting down, and is normally retryable in a cluster environment. // However, on a single-node setup (such as a test), retries will never // succeed because the only server has been shut down; thus, thus the // DistSender needs to know that it should not retry in this situation. retryOpts := base.DefaultRetryOptions() retryOpts.Closer = s.stopper.ShouldQuiesce() distSenderCfg := kv.DistSenderConfig{ Ctx: s.Ctx(), Clock: s.clock, RPCContext: s.rpcContext, RPCRetryOptions: &retryOpts, } s.distSender = kv.NewDistSender(&distSenderCfg, s.gossip) txnMetrics := kv.MakeTxnMetrics() s.registry.AddMetricStruct(txnMetrics) s.txnCoordSender = kv.NewTxnCoordSender(s.Ctx(), s.distSender, s.clock, srvCtx.Linearizable, s.stopper, txnMetrics) s.db = client.NewDB(s.txnCoordSender) s.raftTransport = storage.NewRaftTransport(storage.GossipAddressResolver(s.gossip), s.grpc, s.rpcContext) s.kvDB = kv.NewDBServer(s.ctx.Context, s.txnCoordSender, s.stopper) roachpb.RegisterExternalServer(s.grpc, s.kvDB) // Set up Lease Manager var lmKnobs sql.LeaseManagerTestingKnobs if srvCtx.TestingKnobs.SQLLeaseManager != nil { lmKnobs = *srvCtx.TestingKnobs.SQLLeaseManager.(*sql.LeaseManagerTestingKnobs) } s.leaseMgr = sql.NewLeaseManager(0, *s.db, s.clock, lmKnobs, s.stopper) s.leaseMgr.RefreshLeases(s.stopper, s.db, s.gossip) // Set up the DistSQL server distSQLCfg := distsql.ServerConfig{ Context: s.Ctx(), DB: s.db, RPCContext: s.rpcContext, } s.distSQLServer = distsql.NewServer(distSQLCfg) distsql.RegisterDistSQLServer(s.grpc, s.distSQLServer) // Set up Executor execCfg := sql.ExecutorConfig{ Context: s.Ctx(), DB: s.db, Gossip: s.gossip, LeaseManager: s.leaseMgr, Clock: s.clock, DistSQLSrv: s.distSQLServer, } if srvCtx.TestingKnobs.SQLExecutor != nil { execCfg.TestingKnobs = srvCtx.TestingKnobs.SQLExecutor.(*sql.ExecutorTestingKnobs) } else { execCfg.TestingKnobs = &sql.ExecutorTestingKnobs{} } s.sqlExecutor = sql.NewExecutor(execCfg, s.stopper) s.registry.AddMetricStruct(s.sqlExecutor) s.pgServer = pgwire.MakeServer(s.ctx.Context, s.sqlExecutor) s.registry.AddMetricStruct(s.pgServer.Metrics()) // TODO(bdarnell): make StoreConfig configurable. nCtx := storage.StoreContext{ Ctx: s.Ctx(), Clock: s.clock, DB: s.db, Gossip: s.gossip, Transport: s.raftTransport, RaftTickInterval: s.ctx.RaftTickInterval, ScanInterval: s.ctx.ScanInterval, ScanMaxIdleTime: s.ctx.ScanMaxIdleTime, ConsistencyCheckInterval: s.ctx.ConsistencyCheckInterval, ConsistencyCheckPanicOnFailure: s.ctx.ConsistencyCheckPanicOnFailure, StorePool: s.storePool, SQLExecutor: sql.InternalExecutor{ LeaseManager: s.leaseMgr, }, LogRangeEvents: true, AllocatorOptions: storage.AllocatorOptions{ AllowRebalance: true, }, } if srvCtx.TestingKnobs.Store != nil { nCtx.TestingKnobs = *srvCtx.TestingKnobs.Store.(*storage.StoreTestingKnobs) } s.recorder = status.NewMetricsRecorder(s.clock) s.registry.AddMetricStruct(s.rpcContext.RemoteClocks.Metrics()) s.runtime = status.MakeRuntimeStatSampler(s.clock) s.registry.AddMetricStruct(s.runtime) s.node = NewNode(nCtx, s.recorder, s.registry, s.stopper, txnMetrics, sql.MakeEventLogger(s.leaseMgr)) roachpb.RegisterInternalServer(s.grpc, s.node) storage.RegisterStoresServer(s.grpc, s.node.storesServer) s.tsDB = ts.NewDB(s.db) s.tsServer = ts.MakeServer(s.tsDB) s.admin = makeAdminServer(s) s.status = newStatusServer(s.db, s.gossip, s.recorder, s.ctx.Context, s.rpcContext, s.node.stores) for _, gw := range []grpcGatewayServer{&s.admin, s.status, &s.tsServer} { gw.RegisterService(s.grpc) } return s, nil }
// Batch implements the roachpb.InternalServer interface. func (n *Node) Batch( ctx context.Context, args *roachpb.BatchRequest, ) (br *roachpb.BatchResponse, err error) { // TODO(marc,bdarnell): this code is duplicated in server/node.go, // which should be fixed. defer func() { // We always return errors via BatchResponse.Error so structure is // preserved; plain errors are presumed to be from the RPC // framework and not from cockroach. if err != nil { if br == nil { br = &roachpb.BatchResponse{} } if br.Error != nil { panic(fmt.Sprintf( "attempting to return both a plain error (%s) and roachpb.Error (%s)", err, br.Error)) } br.Error = roachpb.NewError(err) err = nil } }() // TODO(marc): grpc's authentication model (which gives credential access in // the request handler) doesn't really fit with the current design of the // security package (which assumes that TLS state is only given at connection // time) - that should be fixed. if peer, ok := peer.FromContext(ctx); ok { if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok { certUser, err := security.GetCertificateUser(&tlsInfo.State) if err != nil { return nil, err } if certUser != security.NodeUser { return nil, errors.Errorf("user %s is not allowed", certUser) } } } const opName = "node.Batch" fail := func(err error) { br = &roachpb.BatchResponse{} br.Error = roachpb.NewError(err) } f := func() { sp, err := tracing.JoinOrNew(tracing.TracerFromCtx(n.Ctx()), args.Trace, opName) if err != nil { fail(err) return } // If this is a snowball span, it gets special treatment: It skips the // regular tracing machinery, and we instead send the collected spans // back with the response. This is more expensive, but then again, // those are individual requests traced by users, so they can be. if sp.BaggageItem(tracing.Snowball) != "" { sp.LogEvent("delegating to snowball tracing") sp.Finish() if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) { encSp, err := tracing.EncodeRawSpan(&rawSpan, nil) if err != nil { log.Warning(ctx, err) } br.CollectedSpans = append(br.CollectedSpans, encSp) }); err != nil { fail(err) return } } defer sp.Finish() traceCtx := opentracing.ContextWithSpan(ctx, sp) log.Tracef(traceCtx, "node "+strconv.Itoa(int(n.Descriptor.NodeID))) // could save allocs here. tStart := timeutil.Now() var pErr *roachpb.Error br, pErr = n.stores.Send(traceCtx, *args) if pErr != nil { br = &roachpb.BatchResponse{} log.Tracef(traceCtx, "error: %T", pErr.GetDetail()) } if br.Error != nil { panic(roachpb.ErrorUnexpectedlySet(n.stores, br)) } n.metrics.callComplete(timeutil.Since(tStart), pErr) br.Error = pErr } if err := n.stopper.RunTask(f); err != nil { return nil, err } return br, nil }