// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held; it blocks instead of returning
// ErrSnapshotTemporarilyUnavailable.
func (r *Replica) GetSnapshot(ctx context.Context) (raftpb.Snapshot, error) {
	retryOptions := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     50 * time.Millisecond,
		Multiplier:     2,
		Closer:         r.store.Stopper().ShouldQuiesce(),
	}
	for retry := retry.Start(retryOptions); retry.Next(); {
		log.Tracef(ctx, "snapshot retry loop pass %d", retry.CurrentAttempt())

		r.mu.Lock()
		snap, err := r.SnapshotWithContext(ctx)
		snapshotChan := r.mu.snapshotChan
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			if snapshotChan == nil {
				// The call to Snapshot() didn't start an async process due to
				// rate limiting. Try again later.
				continue
			}
			var ok bool
			snap, ok = <-snapshotChan
			if ok {
				return snap, nil
			}
			// Each snapshot worker's output can only be consumed once.
			// We could be racing with raft itself, so if we get a closed
			// channel, loop back and try again.
		} else {
			return snap, err
		}
	}
	return raftpb.Snapshot{}, &roachpb.NodeUnavailableError{}
}
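// Illustrative sketch (not part of the package): the shape of the retry loop
// above, reduced to the standard library. Backoff doubles up to a cap, and a
// quit channel stands in for the stopper's ShouldQuiesce(). The names
// retryUntilQuit, doWork, and quit are hypothetical.
func retryUntilQuit(quit <-chan struct{}, doWork func() (done bool)) {
	backoff := 1 * time.Millisecond
	const maxBackoff = 50 * time.Millisecond
	for {
		if doWork() {
			return
		}
		select {
		case <-time.After(backoff):
			// Back off before the next attempt.
		case <-quit:
			// Give up when asked to quiesce.
			return
		}
		if backoff *= 2; backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}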
// Prepare returns the result types of the given statement. pinfo may
// contain partial type information for placeholders. Prepare will
// populate the missing types. The column result types are returned (or
// nil if there are no results).
func (e *Executor) Prepare(
	query string, session *Session, pinfo parser.PlaceholderTypes,
) ([]ResultColumn, error) {
	if log.V(2) {
		log.Infof(session.Ctx(), "preparing: %s", query)
	} else if traceSQL {
		log.Tracef(session.Ctx(), "preparing: %s", query)
	}
	stmt, err := parser.ParseOne(query, parser.Syntax(session.Syntax))
	if err != nil {
		return nil, err
	}
	if err = pinfo.ProcessPlaceholderAnnotations(stmt); err != nil {
		return nil, err
	}
	protoTS, err := isAsOf(&session.planner, stmt, e.ctx.Clock.Now())
	if err != nil {
		return nil, err
	}
	session.planner.resetForBatch(e)
	session.planner.semaCtx.Placeholders.SetTypes(pinfo)
	session.planner.evalCtx.PrepareOnly = true

	// Prepare needs a transaction because it needs to retrieve db/table
	// descriptors for type checking.
	txn := client.NewTxn(session.Ctx(), *e.ctx.DB)
	txn.Proto.Isolation = session.DefaultIsolationLevel
	session.planner.setTxn(txn)
	defer session.planner.setTxn(nil)

	if protoTS != nil {
		session.planner.asOf = true
		defer func() {
			session.planner.asOf = false
		}()
		setTxnTimestamps(txn, *protoTS)
	}

	plan, err := session.planner.prepare(stmt)
	if err != nil {
		return nil, err
	}
	if plan == nil {
		return nil, nil
	}
	cols := plan.Columns()
	for _, c := range cols {
		if err := checkResultDatum(c.Typ); err != nil {
			return nil, err
		}
	}
	return cols, nil
}
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock should not be held
// while calling this method.
func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error {
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(1, bq.ctx, "no system config available. skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(3, bq.ctx, "%s: split needed; skipping", repl)
		return nil
	}

	sp := repl.store.Tracer().StartSpan(bq.name)
	ctx := opentracing.ContextWithSpan(context.Background(), sp)
	defer sp.Finish()
	log.Tracef(ctx, "processing replica %s", repl)

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has the range
	// lease, renewing or acquiring it if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			if _, harmless := err.GetDetail().(*roachpb.NotLeaseHolderError); harmless {
				log.VEventf(3, bq.ctx, "%s: not holding lease; skipping", repl)
				return nil
			}
			return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
		}
		log.Trace(ctx, "got range lease")
	}

	log.VEventf(3, bq.ctx, "%s: processing", repl)
	start := timeutil.Now()
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.VEventf(2, bq.ctx, "%s: done: %s", repl, timeutil.Since(start))
	log.Trace(ctx, "done")
	return nil
}
// EvictAndReplace instructs the evictionToken to evict the RangeDescriptor it was
// created with from the rangeDescriptorCache. It also allows the user to provide
// new RangeDescriptors to insert into the cache, all atomically. When called without
// arguments, EvictAndReplace will behave the same as Evict.
func (et *evictionToken) EvictAndReplace(
	ctx context.Context, newDescs ...roachpb.RangeDescriptor,
) error {
	var err error
	et.doOnce.Do(func() {
		et.doLocker.Lock()
		defer et.doLocker.Unlock()
		err = et.do()
		if err == nil {
			if len(newDescs) > 0 {
				err = et.doReplace(newDescs...)
				log.Tracef(ctx, "evicting cached range descriptor with %d replacements", len(newDescs))
			} else {
				log.Trace(ctx, "evicting cached range descriptor")
			}
		}
	})
	return err
}
// sendToReplicas sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Trace(opts.Context, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.Context, "RPC reply: %+v", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.Context, "application error: %s", call.Reply.Error)
				}

				if !ds.handlePerReplicaError(rangeID, call.Reply.Error) {
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else if log.V(1) {
				log.Warningf(opts.Context, "RPC error: %s", err)
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Tracef(opts.Context, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("sending to all %d replicas failed; last error: %v",
						len(replicas), err))
			}
		}
	}
}
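// Illustrative sketch (not part of the package): the hedged-send pattern used
// above, reduced to plain channels and a timeout. After `timeout` without a
// reply, the next replica is tried; the first success wins, and the last
// error is returned once every outstanding attempt has failed. The names
// hedgedSend, replicas, and sendTo are hypothetical stand-ins.
func hedgedSend(replicas []string, timeout time.Duration, sendTo func(string) error) error {
	done := make(chan error, len(replicas))
	next := 0
	send := func() {
		r := replicas[next]
		next++
		go func() { done <- sendTo(r) }()
	}
	pending := 0
	send()
	pending++
	var lastErr error
	for {
		select {
		case <-time.After(timeout):
			// Timed out waiting: hedge by trying another replica, if any remain.
			if next < len(replicas) {
				send()
				pending++
			}
		case err := <-done:
			pending--
			if err == nil {
				return nil
			}
			lastErr = err
			// Per-replica failure: try the next replica, if any remain.
			if next < len(replicas) {
				send()
				pending++
			}
			if pending == 0 {
				return lastErr
			}
		}
	}
}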
// SnapshotWithContext is the main implementation for Snapshot(), but it takes
// a context to allow tracing.
func (r *Replica) SnapshotWithContext(ctx context.Context) (raftpb.Snapshot, error) {
	rangeID := r.RangeID

	// If a snapshot is in progress, see if it's ready.
	if r.mu.snapshotChan != nil {
		select {
		case snapData, ok := <-r.mu.snapshotChan:
			if ok {
				return snapData, nil
			}
			// If the old channel was closed, fall through to start a new task.

		default:
			// If the result is not ready, return immediately.
			log.Trace(ctx, "snapshot not yet ready")
			return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
		}
	}

	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx,
			"%s: not generating snapshot because replica is too large: %d > 2 * %d",
			r, size, maxBytes)
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	if !r.store.AcquireRaftSnapshot() {
		log.Trace(ctx, "snapshot already running")
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	startKey := r.mu.state.Desc.StartKey

	// Use an unbuffered channel so the worker stays alive until someone
	// reads from the channel, and can abandon the snapshot if it gets stale.
	ch := make(chan (raftpb.Snapshot))

	if r.store.Stopper().RunAsyncTask(func() {
		defer close(ch)
		sp := r.store.Tracer().StartSpan("snapshot async")
		ctxInner := opentracing.ContextWithSpan(context.Background(), sp)
		defer sp.Finish()
		snap := r.store.NewSnapshot()
		log.Tracef(ctxInner, "new engine snapshot for replica %s", r)
		defer snap.Close()
		defer r.store.ReleaseRaftSnapshot()
		// Delegate to a static function to make sure that we do not depend
		// on any indirect calls to r.store.Engine() (or other in-memory
		// state of the Replica). Everything must come from the snapshot.
		snapData, err := snapshot(context.Background(), snap, rangeID, r.store.raftEntryCache, startKey)
		if err != nil {
			log.Errorf(ctxInner, "%s: error generating snapshot: %s", r, err)
		} else {
			log.Trace(ctxInner, "snapshot generated")
			r.store.metrics.RangeSnapshotsGenerated.Inc(1)
			select {
			case ch <- snapData:
				log.Trace(ctxInner, "snapshot accepted")
			case <-time.After(r.store.ctx.AsyncSnapshotMaxAge):
				// If raft decides it doesn't need this snapshot any more (or
				// just takes too long to use it), abandon it to save memory.
				log.Infof(ctxInner, "%s: abandoning snapshot after %s", r, r.store.ctx.AsyncSnapshotMaxAge)
			case <-r.store.Stopper().ShouldQuiesce():
			}
		}
	}) == nil {
		r.mu.snapshotChan = ch
	} else {
		r.store.ReleaseRaftSnapshot()
	}

	if r.store.ctx.BlockingSnapshotDuration > 0 {
		select {
		case snap, ok := <-r.mu.snapshotChan:
			if ok {
				return snap, nil
			}
		case <-time.After(r.store.ctx.BlockingSnapshotDuration):
			log.Trace(ctx, "snapshot blocking duration exceeded")
		}
	}
	return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
}
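// Illustrative sketch (not part of the package): the producer side of the
// unbuffered-channel handoff used above. The worker blocks until a consumer
// reads the result, and abandons it after maxAge or when asked to quit, so
// stale results never pile up in memory. The names produce, result, maxAge,
// and quit are hypothetical.
func produce(ch chan<- string, result string, maxAge time.Duration, quit <-chan struct{}) {
	defer close(ch)
	select {
	case ch <- result:
		// A consumer picked up the result.
	case <-time.After(maxAge):
		// Nobody wanted the result in time; abandon it.
	case <-quit:
		// Shutting down.
	}
}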
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		// TODO(radu): once contexts are plumbed correctly, we should use the Tracer
		// from ctx.
		tracer := tracing.TracerFromCtx(tc.ctx)
		if sp == nil {
			sp = tracer.StartSpan(opTxnCoordSender)
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		} else {
			// We didn't make this object but are about to mutate it, so we
			// have to take a copy - the original might already have been
			// passed to the RPC layer.
			ba.Header.Trace = protoutil.Clone(ba.Header.Trace).(*tracing.Span)
		}
		if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[*ba.Txn.ID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			var distinct bool
			// The request might already be used by an outgoing goroutine, so
			// we can't safely mutate anything in-place (as MergeSpans does).
			et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...)
			et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans)
			ba.Header.DistinctSpans = distinct && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Tracef(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Tracef(ctx, "error: %s", pErr)
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability",
					br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation, and is different from
// contexts used at runtime for server's background work (like `s.Ctx()`).
func (s *Server) Start(ctx context.Context) error {
	// Copy log tags from s.Ctx().
	ctx = log.WithLogTagsFromCtx(ctx, s.Ctx())

	tlsConfig, err := s.ctx.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.
	ln, err := net.Listen("tcp", s.ctx.Addr)
	if err != nil {
		return err
	}
	log.Tracef(ctx, "listening on port %s", s.ctx.Addr)
	unresolvedAddr, err := officialAddr(s.ctx.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.ctx.Addr = unresolvedAddr.String()
	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.ctx.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.ctx.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.ctx.HTTPAddr = unresolvedHTTPAddr.String()

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(s.Ctx(), err)
		}
	})

	if tlsConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			if err := s.pgServer.ServeConn(conn); err != nil && !netutil.IsClosedConnection(err) {
				log.Error(s.Ctx(), err)
			}
		}))
	})

	if len(s.ctx.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.ctx.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(s.Ctx(), err)
			}
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				if err := s.pgServer.ServeConn(conn); err != nil && !netutil.IsClosedConnection(err) {
					log.Error(s.Ctx(), err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAddr)
	log.Trace(ctx, "started gossip")

	if err := s.node.start(ctx, unresolvedAddr, s.ctx.Engines, s.ctx.NodeAttributes); err != nil {
		return err
	}
	log.Trace(ctx, "started node")

	// Set the NodeID in the base context (which was inherited by the
	// various components of the server).
	s.nodeLogTagVal.Set(int64(s.node.Descriptor.NodeID))

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.ctx.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(s.recorder, s.ctx.MetricsSampleInterval, ts.Resolution10s, s.stopper)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.ctx.MetricsSampleInterval)

	s.sqlExecutor.SetNodeID(s.node.Descriptor.NodeID)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := new(sql.SchemaChangeManagerTestingKnobs)
	if s.ctx.TestingKnobs.SQLSchemaChangeManager != nil {
		testingKnobs = s.ctx.TestingKnobs.SQLSchemaChangeManager.(*sql.SchemaChangeManagerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	log.Infof(s.Ctx(), "starting %s server at %s", s.ctx.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(s.Ctx(), "starting grpc/postgres server at %s", unresolvedAddr)
	if len(s.ctx.SocketFile) != 0 {
		log.Infof(s.Ctx(), "starting postgres server at unix:%s", s.ctx.SocketFile)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})
	log.Trace(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &util.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(util.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(util.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(util.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.Ctx())
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	conn, err := s.rpcContext.GRPCDial(s.ctx.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{&s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminEndpoint, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, s.status)
	s.mux.Handle(healthEndpoint, s.status)
	log.Trace(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(s.Ctx(), "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Trace(ctx, "server ready")

	return nil
}
// Batch implements the roachpb.InternalServer interface.
func (n *Node) Batch(
	ctx context.Context, args *roachpb.BatchRequest,
) (br *roachpb.BatchResponse, err error) {
	// TODO(marc,bdarnell): this code is duplicated in server/node.go,
	// which should be fixed.
	defer func() {
		// We always return errors via BatchResponse.Error so structure is
		// preserved; plain errors are presumed to be from the RPC
		// framework and not from cockroach.
		if err != nil {
			if br == nil {
				br = &roachpb.BatchResponse{}
			}
			if br.Error != nil {
				panic(fmt.Sprintf(
					"attempting to return both a plain error (%s) and roachpb.Error (%s)", err, br.Error))
			}
			br.Error = roachpb.NewError(err)
			err = nil
		}
	}()

	// TODO(marc): grpc's authentication model (which gives credential access in
	// the request handler) doesn't really fit with the current design of the
	// security package (which assumes that TLS state is only given at connection
	// time) - that should be fixed.
	if peer, ok := peer.FromContext(ctx); ok {
		if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
			certUser, err := security.GetCertificateUser(&tlsInfo.State)
			if err != nil {
				return nil, err
			}
			if certUser != security.NodeUser {
				return nil, errors.Errorf("user %s is not allowed", certUser)
			}
		}
	}

	opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here

	fail := func(err error) {
		br = &roachpb.BatchResponse{}
		br.Error = roachpb.NewError(err)
	}

	f := func() {
		sp, err := tracing.JoinOrNew(n.ctx.Tracer, args.Trace, opName)
		if err != nil {
			fail(err)
			return
		}
		// If this is a snowball span, it gets special treatment: It skips the
		// regular tracing machinery, and we instead send the collected spans
		// back with the response. This is more expensive, but then again,
		// those are individual requests traced by users, so they can be.
		if sp.BaggageItem(tracing.Snowball) != "" {
			sp.LogEvent("delegating to snowball tracing")
			sp.Finish()
			if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) {
				encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
				if err != nil {
					log.Warning(ctx, err)
				}
				br.CollectedSpans = append(br.CollectedSpans, encSp)
			}); err != nil {
				fail(err)
				return
			}
		}
		defer sp.Finish()
		traceCtx := opentracing.ContextWithSpan(ctx, sp)

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(traceCtx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.Tracef(traceCtx, "error: %T", pErr.GetDetail())
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
	}

	if err := n.stopper.RunTask(f); err != nil {
		return nil, err
	}
	return br, nil
}
// resolveIntents resolves the given intents. `wait` is currently a
// no-op; all intents are resolved synchronously.
//
// TODO(bdarnell): Restore the wait=false optimization when/if #8360
// is fixed. `wait=false` requests a semi-synchronous operation,
// returning when all local commands have been *proposed* but not yet
// committed or executed. This ensures that if a waiting client
// retries immediately after calling this function, it will not hit
// the same intents again (in the absence of #8360, we provide this
// guarantee by resolving the intents synchronously regardless of the
// `wait` argument).
func (ir *intentResolver) resolveIntents(
	ctx context.Context, intents []roachpb.Intent, wait bool, poison bool,
) error {
	// Force synchronous operation; see above TODO.
	wait = true
	if len(intents) == 0 {
		return nil
	}
	// We're doing async stuff below; those need new traces.
	ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer())
	defer cleanup()
	log.Tracef(ctx, "resolving intents [wait=%t]", wait)

	var reqs []roachpb.Request
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		{
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			}
		}
		reqs = append(reqs, resolveArgs)
	}

	// Resolve all of the intents.
	if len(reqs) > 0 {
		b := &client.Batch{}
		b.AddRawRequest(reqs...)
		action := func() error {
			// TODO(tschottdorf): no tracing here yet.
			return ir.store.DB().Run(b)
		}
		if wait || ir.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf(ctx, "unable to resolve external intents: %s", err)
			}
		}) != nil {
			// Try async to not keep the caller waiting, but when draining
			// just go ahead and do it synchronously. See #1684.
			// TODO(tschottdorf): This is ripe for removal.
			if err := action(); err != nil {
				return err
			}
		}
	}
	return nil
}
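// Illustrative sketch (not part of the package): the "try async, fall back to
// synchronous" pattern used above. If the async runner refuses the task (for
// example while draining), the work is performed inline instead so the caller
// never silently loses it. The names runPreferAsync, runAsync, and action are
// hypothetical stand-ins.
func runPreferAsync(runAsync func(func()) error, action func() error) error {
	if runAsync(func() {
		if err := action(); err != nil {
			// On the async path the error can only be logged; it is dropped
			// here to keep the sketch minimal.
			_ = err
		}
	}) != nil {
		// Async submission failed; do the work synchronously.
		return action()
	}
	return nil
}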
// execStmtInOpenTxn executes one statement in the context
// of the planner's transaction (which is assumed to exist).
// It handles statements that affect the transaction state (BEGIN, COMMIT)
// and delegates everything else to `execStmt`.
// It binds placeholders.
//
// The current transaction might be committed/rolled back when this returns.
// It might also have transitioned to the aborted or RestartWait state.
//
// Args:
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
//  COMMIT/ROLLBACK statements are rejected if set. Also, the transaction
//  might be auto-committed in this function.
// firstInTxn: set for the first statement in a transaction. Used
//  so that nested BEGIN statements are caught.
// stmtTimestamp: Used as the statement_timestamp().
//
// Returns:
// - a Result
// - an error, if any. In case of error, the result returned also reflects this error.
func (e *Executor) execStmtInOpenTxn(
	stmt parser.Statement,
	planMaker *planner,
	implicitTxn bool,
	firstInTxn bool,
	txnState *txnState,
) (Result, error) {
	if txnState.State != Open {
		panic("execStmtInOpenTxn called outside of an open txn")
	}
	if planMaker.txn == nil {
		panic("execStmtInOpenTxn called with a txn not set on the planner")
	}

	planMaker.evalCtx.SetTxnTimestamp(txnState.sqlTimestamp)
	planMaker.evalCtx.SetStmtTimestamp(e.ctx.Clock.PhysicalTime())

	// TODO(cdo): Figure out how to not double count on retries.
	e.updateStmtCounts(stmt)
	switch s := stmt.(type) {
	case *parser.BeginTransaction:
		if !firstInTxn {
			txnState.updateStateAndCleanupOnErr(errTransactionInProgress, e)
			return Result{Err: errTransactionInProgress}, errTransactionInProgress
		}
	case *parser.CommitTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// CommitTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, commit, e)
		return res, err
	case *parser.ReleaseSavepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Savepoint); err != nil {
			return Result{Err: err}, err
		}
		// ReleaseSavepoint is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, release, e)
		return res, err
	case *parser.RollbackTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// RollbackTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		// Notice that we don't return any errors on rollback.
		return rollbackSQLTransaction(txnState, planMaker), nil
	case *parser.SetTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
	case *parser.Savepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Name); err != nil {
			return Result{Err: err}, err
		}
		// We want to disallow SAVEPOINTs to be issued after a transaction has
		// started running, but such enforcement is problematic in the
		// presence of transaction retries (since the transaction proto is
		// necessarily reused). To work around this, we keep track of the
		// transaction's retrying state and special-case SAVEPOINT when it is
		// set.
		//
		// TODO(andrei): the check for retrying is a hack - we erroneously
		// allow SAVEPOINT to be issued at any time during a retry, not just
		// in the beginning. We should figure out how to track whether we
		// started using the transaction during a retry.
		if txnState.txn.Proto.IsInitialized() && !txnState.retrying {
			err := fmt.Errorf("SAVEPOINT %s needs to be the first statement in a transaction",
				parser.RestartSavepointName)
			txnState.updateStateAndCleanupOnErr(err, e)
			return Result{Err: err}, err
		}
		// Note that Savepoint doesn't have a corresponding plan node.
		// This here is all the execution there is.
		txnState.retryIntent = true
		return Result{}, nil
	case *parser.RollbackToSavepoint:
		err := parser.ValidateRestartCheckpoint(s.Savepoint)
		if err == nil {
			// Can't restart if we didn't get an error first, which would've put the
			// txn in a different state.
			err = errNotRetriable
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Prepare:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Prepared statements are supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Execute:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Executing prepared statements is supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Deallocate:
		if s.Name == "" {
			planMaker.session.PreparedStatements.DeleteAll()
		} else {
			if found := planMaker.session.PreparedStatements.Delete(string(s.Name)); !found {
				err := fmt.Errorf("prepared statement %s does not exist", s.Name)
				txnState.updateStateAndCleanupOnErr(err, e)
				return Result{Err: err}, err
			}
		}
		return Result{PGTag: s.StatementTag()}, nil
	}

	if txnState.tr != nil {
		txnState.tr.LazyLog(stmt, true /* sensitive */)
	}
	result, err := e.execStmt(stmt, planMaker, implicitTxn /* autoCommit */)
	if err != nil {
		if traceSQL {
			log.Tracef(txnState.txn.Context, "ERROR: %v", err)
		}
		if txnState.tr != nil {
			txnState.tr.LazyPrintf("ERROR: %v", err)
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		result = Result{Err: err}
	} else if txnState.tr != nil {
		tResult := &traceResult{tag: result.PGTag, count: -1}
		switch result.Type {
		case parser.RowsAffected:
			tResult.count = result.RowsAffected
		case parser.Rows:
			tResult.count = len(result.Rows)
		}
		txnState.tr.LazyLog(tResult, false)
		if traceSQL {
			log.Tracef(txnState.txn.Context, "%s done", tResult)
		}
	}
	return result, err
}
// execStmtsInCurrentTxn consumes a prefix of stmts, namely the
// statements belonging to a single SQL transaction. It executes in
// the planner's transaction, which is assumed to exist.
//
// COMMIT/ROLLBACK statements can end the current transaction. If that happens,
// this method returns, and the remaining statements are returned.
//
// If an error occurs while executing a statement, the SQL txn will be
// considered aborted and subsequent statements will be discarded (they will
// not be executed, they will not be returned for future execution, they will
// not generate results). Note that this also includes COMMIT/ROLLBACK
// statements. Further note that errTransactionAborted is no exception -
// encountering it will discard subsequent statements. This means that, to
// recover from an aborted txn, a COMMIT/ROLLBACK statement needs to be the
// first one in stmts.
//
// Args:
// txnState: Specifies whether we're executing inside a txn, or inside an aborted txn.
//  The state is updated.
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
// Returns:
// - the list of results (one per executed statement).
// - the statements that haven't been executed because the transaction has
//  been committed or rolled back. In returning an error, this will be nil.
// - the error encountered while executing statements, if any. If an error
//  occurred, it is also the last result returned. Subsequent statements
//  have not been executed.
func (e *Executor) execStmtsInCurrentTxn(
	stmts parser.StatementList,
	planMaker *planner,
	txnState *txnState,
	implicitTxn bool,
	txnBeginning bool,
) ([]Result, parser.StatementList, error) {
	var results []Result
	if txnState.State == NoTxn {
		panic("execStmtsInCurrentTransaction called outside of a txn")
	}
	if txnState.State == Open && planMaker.txn == nil {
		panic(fmt.Sprintf("inconsistent planMaker txn state. txnState: %+v", txnState))
	}

	for i, stmt := range stmts {
		ctx := planMaker.session.Ctx()
		if log.V(2) {
			log.Infof(ctx, "executing %d/%d: %s", i+1, len(stmts), stmt)
		} else if traceSQL {
			log.Tracef(ctx, "executing %d/%d: %s", i+1, len(stmts), stmt)
		}
		txnState.schemaChangers.curStatementIdx = i

		var stmtStrBefore string
		// TODO(nvanbenschoten): Constant literals can change their representation
		// (1.0000 -> 1) when type checking, so we need to reconsider how this works.
		if e.ctx.TestingKnobs.CheckStmtStringChange && false {
			stmtStrBefore = stmt.String()
		}
		var res Result
		var err error
		switch txnState.State {
		case Open:
			res, err = e.execStmtInOpenTxn(
				stmt, planMaker, implicitTxn,
				txnBeginning && (i == 0), /* firstInTxn */
				txnState)
		case Aborted, RestartWait:
			res, err = e.execStmtInAbortedTxn(stmt, txnState, planMaker)
		case CommitWait:
			res, err = e.execStmtInCommitWaitTxn(stmt, txnState)
		default:
			panic(fmt.Sprintf("unexpected txn state: %s", txnState.State))
		}
		if e.ctx.TestingKnobs.CheckStmtStringChange && false {
			if after := stmt.String(); after != stmtStrBefore {
				panic(fmt.Sprintf("statement changed after exec; before:\n    %s\nafter:\n    %s",
					stmtStrBefore, after))
			}
		}
		res.Err = convertToErrWithPGCode(res.Err)
		results = append(results, res)
		if err != nil {
			// After an error happened, skip executing all the remaining statements
			// in this batch. This is Postgres behavior, and it makes sense as the
			// protocol doesn't let you return results after an error.
			return results, nil, err
		}
		if txnState.State == NoTxn {
			// If the transaction is done, return the remaining statements to
			// be executed as a different group.
			return results, stmts[i+1:], nil
		}
	}
	// If we got here, we've managed to consume all statements and we're still in a txn.
	return results, nil, nil
}
// initStores initializes the Stores map from ID to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(
	ctx context.Context, engines []engine.Engine, stopper *stop.Stopper,
) error {
	var bootstraps []*storage.Store

	if len(engines) == 0 {
		return errors.Errorf("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.ctx, e, &n.Descriptor)
		log.Tracef(ctx, "created store for engine: %s", e)
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(ctx, stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof(ctx, "store %s not bootstrapped", s)
				bootstraps = append(bootstraps, s)
				continue
			}
			return errors.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 {
			return errors.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
		n.addStore(s)
	}

	// If there are no initialized stores and no gossip resolvers,
	// bootstrap this node as the seed of a new cluster.
	if n.stores.GetStoreCount() == 0 {
		resolvers := n.ctx.Gossip.GetResolvers()
		// Check for the case of an uninitialized node having only itself specified as join host.
		switch len(resolvers) {
		case 0:
			return errNeedsBootstrap
		case 1:
			if resolvers[0].Addr() == n.Descriptor.Address.String() {
				return errCannotJoinSelf
			}
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}
	log.Trace(ctx, "validated stores")

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.ctx.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()
	log.Trace(ctx, "connected to gossip")

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
		n.initialBoot = true
		log.Tracef(ctx, "allocated node ID %d", n.Descriptor.NodeID)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if len(bootstraps) > 0 {
		if err := stopper.RunAsyncTask(func() {
			n.bootstrapStores(n.Ctx(), bootstraps, stopper)
		}); err != nil {
			return err
		}
	}

	return nil
}