// maybeSignalStatusChangeLocked checks whether gossip should transition its
// internal state from connected to stalled or vice versa. The caller must
// hold g.mu (the "Locked" suffix convention used throughout this file).
func (g *Gossip) maybeSignalStatusChangeLocked() {
	ctx := g.AnnotateCtx(context.TODO())
	// Orphaned: no gossip connections at all, in either direction.
	orphaned := g.outgoing.len()+g.mu.incoming.len() == 0
	// Stalled: either orphaned, or the sentinel info (evidence of
	// connectivity to the rest of the cluster) is missing.
	stalled := orphaned || g.mu.is.getInfo(KeySentinel) == nil
	if stalled {
		// We employ the stalled boolean to avoid filling logs with warnings.
		// Warnings are only emitted on the connected->stalled transition.
		if !g.stalled {
			log.Eventf(ctx, "now stalled")
			if orphaned {
				if len(g.resolvers) == 0 {
					log.Warningf(ctx, "no resolvers found; use --join to specify a connected node")
				} else {
					log.Warningf(ctx, "no incoming or outgoing connections")
				}
			} else if len(g.resolversTried) == len(g.resolvers) {
				log.Warningf(ctx, "first range unavailable; resolvers exhausted")
			} else {
				log.Warningf(ctx, "first range unavailable; trying remaining resolvers")
			}
		}
		// Only signal the bootstrapper if there are resolvers it could
		// actually try; otherwise signaling would be futile.
		if len(g.resolvers) > 0 {
			g.signalStalledLocked()
		}
	} else {
		// Stalled -> connected transition: log once and wake any waiters.
		if g.stalled {
			log.Eventf(ctx, "connected")
			log.Infof(ctx, "node has connected to cluster via gossip")
			g.signalConnectedLocked()
		}
		g.maybeCleanupBootstrapAddressesLocked()
	}
	// Record the new state for edge-detection on the next call.
	g.stalled = stalled
}
// addInternal adds the replica the queue with specified priority. If
// the replica is already queued, updates the existing
// priority. Expects the queue lock to be held by caller.
//
// Returns true if the replica was newly added, false if it was already
// present (or rejected), along with any error explaining a rejection.
func (bq *baseQueue) addInternal(
	ctx context.Context, desc *roachpb.RangeDescriptor, should bool, priority float64,
) (bool, error) {
	if bq.mu.stopped {
		return false, errQueueStopped
	}
	if bq.mu.disabled {
		log.Event(ctx, "queue disabled")
		return false, errQueueDisabled
	}
	if !desc.IsInitialized() {
		// We checked this above in MaybeAdd(), but we need to check it
		// again for Add().
		return false, errors.New("replica not initialized")
	}
	// If the replica is currently in purgatory, don't re-add it.
	if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
		return false, nil
	}
	item, ok := bq.mu.replicas[desc.RangeID]
	if !should {
		// The queue implementation decided against queueing this replica;
		// drop any stale entry so it isn't processed later.
		if ok {
			log.Eventf(ctx, "%s: removing from queue", item.value)
			bq.remove(item)
		}
		return false, errReplicaNotAddable
	} else if ok {
		if item.priority != priority {
			log.Eventf(ctx, "%s: updating priority: %0.3f -> %0.3f", desc, item.priority, priority)
		}
		// Replica has already been added; update priority.
		bq.mu.priorityQ.update(item, priority)
		return false, nil
	}

	log.VEventf(ctx, 3, "%s: adding: priority=%0.3f", desc, priority)
	item = &replicaItem{value: desc.RangeID, priority: priority}
	bq.add(item)

	// If adding this replica has pushed the queue past its maximum size,
	// remove the lowest priority element.
	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
		bq.remove(bq.mu.priorityQ[pqLen-1])
	}
	// Signal the processLoop that a replica has been added.
	select {
	case bq.incoming <- struct{}{}:
	default:
		// No need to signal again.
	}
	return true, nil
}
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := g.AnnotateCtx(context.Background())
		ctx = log.WithLogTag(ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			// RunTask returns a non-nil error when the stopper is draining,
			// in which case the worker exits.
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.NodeID.Get())
					} else {
						// Collect the addresses currently being bootstrapped,
						// purely for the event log below.
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
// Seek positions the iterator at the specified key, looking up the range
// descriptor containing (or, for descending scans, exclusively ending at)
// the key. On failure, the error is recorded in ri.pErr rather than
// returned; callers are expected to consult the iterator's error state.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example revscan [a,g), first desc lookup for "g"
		// returns descriptor [c,d) -> [d,g) is never scanned.
		// We evict and retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			// Evict the stale cache entry so the next lookup refetches.
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
//
// A nil return means the replica was either processed successfully or
// deliberately skipped (no system config, pending split, destroyed
// replica, or lease not obtainable); only genuine processing failures
// are returned as errors.
func (bq *baseQueue) processReplica(
	queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
	// Serialize processing: only one replica is processed at a time.
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(queueCtx, 1, "no system config available, skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(queueCtx, 3, "split needed; skipping")
		return nil
	}

	// Putting a span in a context means that events will no longer go to the
	// event log. Use queueCtx for events that are intended for the event log.
	ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
	defer span.Finish()
	// Also add the Replica annotations to ctx.
	ctx = repl.AnnotateCtx(ctx)
	// Bound the total processing time for this replica.
	ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
	defer cancel()
	log.Eventf(ctx, "processing replica")

	if err := repl.IsDestroyed(); err != nil {
		log.VEventf(queueCtx, 3, "replica destroyed (%s); skipping", err)
		return nil
	}

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has range lease
	// and renew or acquire if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			switch v := err.GetDetail().(type) {
			case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
				// Another replica holds (or will hold) the lease; not an error.
				log.VEventf(queueCtx, 3, "%s; skipping", v)
				return nil
			default:
				return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
			}
		}
		log.Event(ctx, "got range lease")
	}

	log.VEventf(queueCtx, 3, "processing")
	// Delegate the actual work to the queue implementation.
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.Event(ctx, "done")
	bq.successes.Inc(1)
	return nil
}
// GetSnapshot returns a snapshot of the replica appropriate for sending to a
// replica. If this method returns without error, callers must eventually call
// OutgoingSnapshot.Close.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	rangeID := r.RangeID

	// Refuse to snapshot a replica that has grown past twice its split
	// size; sending such a snapshot would be wasteful since the range is
	// about to be split anyway.
	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		err := errors.Errorf(
			"%s: not generating %s snapshot because replica is too large: %d > 2 * %d",
			r, snapType, size, maxBytes)
		return &OutgoingSnapshot{}, err
	}

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()
	snap := r.store.NewSnapshot()
	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	snapData, err := snapshot(ctx, snapType, snap, rangeID, r.store.raftEntryCache, startKey)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %s", err)
		return nil, err
	}
	return &snapData, nil
}
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporaryUnavailable. The caller is directly responsible for
// calling r.CloseOutSnap.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	// Use shorter-than-usual backoffs because this rarely succeeds on
	// the first attempt and this method is used a lot in tests.
	// Unsuccessful attempts are cheap, so we can have a low MaxBackoff.
	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     100 * time.Millisecond,
		Multiplier:     2,
	}
	// retry.StartWithCtx terminates the loop when ctx is canceled, which
	// is why the post-loop return reports ctx.Err().
	for retryObj := retry.StartWithCtx(ctx, retryOpts); retryObj.Next(); {
		log.Eventf(ctx, "snapshot retry loop pass %d", retryObj.CurrentAttempt())

		// Grab the done channel under lock, then wait for any in-flight
		// snapshot to be closed without holding the lock.
		r.mu.Lock()
		doneChan := r.mu.outSnapDone
		r.mu.Unlock()

		<-doneChan

		r.mu.Lock()
		snap, err := r.snapshotWithContext(ctx, snapType)
		if err == nil {
			// Mark the snapshot as claimed by this caller; the caller must
			// release it via r.CloseOutSnap.
			r.mu.outSnap.claimed = true
		}
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			// Another snapshot raced us in; back off and retry.
			continue
		} else {
			return snap, err
		}
	}
	return nil, ctx.Err() // the only loop exit condition
}
// maybeAddBootstrapAddress adds the specified address to the list of // bootstrap addresses if not already present. Returns whether a new // bootstrap address was added. The caller must hold the gossip mutex. func (g *Gossip) maybeAddBootstrapAddress(addr util.UnresolvedAddr) bool { if _, ok := g.bootstrapAddrs[addr]; ok { return false } g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr) g.bootstrapAddrs[addr] = struct{}{} ctx := g.AnnotateCtx(context.TODO()) log.Eventf(ctx, "add bootstrap %s", addr) return true }
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding the MaxHops
// threshold. If the number of outgoing clients doesn't exceed
// maxPeers(), a new gossip client is connected to a randomly selected
// peer beyond MaxHops threshold. Otherwise, the least useful peer
// node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled conditional variable.
func (g *Gossip) manage() {
	g.server.stopper.RunWorker(func() {
		ctx := g.AnnotateCtx(context.Background())
		// Jitter the tickers so that nodes don't all cull/stall-check in
		// lock step.
		cullTicker := time.NewTicker(g.jitteredInterval(g.cullInterval))
		stallTicker := time.NewTicker(g.jitteredInterval(g.stallInterval))
		defer cullTicker.Stop()
		defer stallTicker.Stop()
		for {
			select {
			case <-g.server.stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(c)
			case nodeID := <-g.tighten:
				g.tightenNetwork(nodeID)
			case <-cullTicker.C:
				func() {
					g.mu.Lock()
					if !g.outgoing.hasSpace() {
						// At capacity: drop the least useful client so the
						// tightening machinery can connect to a better peer.
						leastUsefulID := g.mu.is.leastUseful(g.outgoing)

						if c := g.findClient(func(c *client) bool {
							return c.peerID == leastUsefulID
						}); c != nil {
							if log.V(1) {
								log.Infof(ctx, "closing least useful client %+v to tighten network graph", c)
							}
							log.Eventf(ctx, "culling %s", c.addr)
							c.close()

							// After releasing the lock, block until the client disconnects.
							// (The defer runs after g.mu.Unlock() below.)
							defer func() {
								g.doDisconnected(<-g.disconnected)
							}()
						} else {
							if log.V(1) {
								g.clientsMu.Lock()
								log.Infof(ctx, "couldn't find least useful client among %+v", g.clientsMu.clients)
								g.clientsMu.Unlock()
							}
						}
					}
					g.mu.Unlock()
				}()
			case <-stallTicker.C:
				g.mu.Lock()
				g.maybeSignalStatusChangeLocked()
				g.mu.Unlock()
			}
		}
	})
}
// maybeAddBootstrapAddress adds the specified address to the list of // bootstrap addresses if not already present. Returns whether a new // bootstrap address was added. The caller must hold the gossip mutex. func (g *Gossip) maybeAddBootstrapAddress(addr util.UnresolvedAddr, nodeID roachpb.NodeID) bool { if existingNodeID, ok := g.bootstrapAddrs[addr]; ok { if existingNodeID == unknownNodeID || existingNodeID != nodeID { g.bootstrapAddrs[addr] = nodeID } return false } g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr) g.bootstrapAddrs[addr] = nodeID ctx := g.AnnotateCtx(context.TODO()) log.Eventf(ctx, "add bootstrap %s", addr) return true }
// removeClient removes the specified client. Called when a client // disconnects. func (g *Gossip) removeClient(target *client) { g.clientsMu.Lock() defer g.clientsMu.Unlock() for i, candidate := range g.clientsMu.clients { if candidate == target { ctx := g.AnnotateCtx(context.TODO()) log.Eventf(ctx, "client %s disconnected", candidate.addr) g.clientsMu.clients = append(g.clientsMu.clients[:i], g.clientsMu.clients[i+1:]...) delete(g.bootstrapping, candidate.addr.String()) g.outgoing.removeNode(candidate.peerID) break } } }
// startClient launches a new client connected to remote address. // The client is added to the outgoing address set and launched in // a goroutine. func (g *Gossip) startClient(addr net.Addr, nodeID roachpb.NodeID) { g.clientsMu.Lock() defer g.clientsMu.Unlock() breaker, ok := g.clientsMu.breakers[addr.String()] if !ok { breaker = g.rpcContext.NewBreaker() g.clientsMu.breakers[addr.String()] = breaker } ctx := g.AnnotateCtx(context.TODO()) log.Eventf(ctx, "starting new client to %s", addr) c := newClient(g.server.AmbientContext, addr, g.serverMetrics) g.clientsMu.clients = append(g.clientsMu.clients, c) c.start(g, g.disconnected, g.rpcContext, g.server.stopper, nodeID, breaker) }
// tightenNetwork "tightens" the network by starting a new gossip // client to the most distant node as measured in required gossip hops // to propagate info from the distant node to this node. func (g *Gossip) tightenNetwork(distantNodeID roachpb.NodeID) { g.mu.Lock() defer g.mu.Unlock() if g.outgoing.hasSpace() { ctx := g.AnnotateCtx(context.TODO()) if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil { log.Errorf(ctx, "unable to get address for node %d: %s", distantNodeID, err) } else { log.Infof(ctx, "starting client to distant node %d to tighten network graph", distantNodeID) log.Eventf(ctx, "tightening network with new client to %s", nodeAddr) g.startClient(nodeAddr, g.NodeID.Get()) } } }
// maybeAddResolver creates and adds a resolver for the specified // address if one does not already exist. Returns whether a new // resolver was added. The caller must hold the gossip mutex. func (g *Gossip) maybeAddResolver(addr util.UnresolvedAddr) bool { if _, ok := g.resolverAddrs[addr]; ok { return false } ctx := g.AnnotateCtx(context.TODO()) r, err := resolver.NewResolverFromUnresolvedAddr(addr) if err != nil { log.Warningf(ctx, "bad address %s: %s", addr, err) return false } g.resolvers = append(g.resolvers, r) g.resolverAddrs[addr] = r log.Eventf(ctx, "add resolver %s", r) return true }
// snapshotWithContext is the main implementation for Snapshot() but it takes
// a context to allow tracing. If this method returns without error, callers
// must eventually call CloseOutSnap to ready this replica for more snapshots.
// r.mu must be held.
func (r *Replica) snapshotWithContext(
	ctx context.Context, snapType string,
) (*OutgoingSnapshot, error) {
	r.mu.AssertHeld()
	rangeID := r.RangeID

	// Refuse to snapshot an oversized replica; report the condition as
	// temporarily unavailable so callers retry rather than fail hard.
	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx, "not generating %s snapshot because replica is too large: %d > 2 * %d", snapType, size, maxBytes)
		return &OutgoingSnapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	// outSnapDone is closed when no snapshot is outstanding, so a
	// successful receive means we may proceed; hitting default means a
	// snapshot is still in flight.
	select {
	case <-r.mu.outSnapDone:
	default:
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}
	if !r.store.AcquireRaftSnapshot() {
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()
	snap := r.store.NewSnapshot()
	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	snapData, err := snapshot(ctx, snapType, snap, rangeID, r.store.raftEntryCache, startKey)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %s", err)
		return nil, err
	}
	log.Event(ctx, "snapshot generated")
	r.store.metrics.RangeSnapshotsGenerated.Inc(1)
	// Publish the new outstanding snapshot; a fresh (open) outSnapDone
	// channel marks it as in-flight until CloseOutSnap is called.
	r.mu.outSnap = snapData
	r.mu.outSnapDone = make(chan struct{})
	return &r.mu.outSnap, nil
}
// tightenNetwork "tightens" the network by starting a new gossip client to the // client to the most distant node to which we don't already have an outgoing // connection. Does nothing if we don't have room for any more outgoing // connections. func (g *Gossip) tightenNetwork(ctx context.Context) { g.mu.Lock() defer g.mu.Unlock() if g.outgoing.hasSpace() { distantNodeID, distantHops := g.mu.is.mostDistant(g.hasOutgoingLocked) log.VEventf(ctx, 2, "distantHops: %d from %d", distantHops, distantNodeID) if distantHops <= maxHops { return } if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil { log.Errorf(ctx, "unable to get address for distant node %d: %s", distantNodeID, err) } else { log.Infof(ctx, "starting client to distant node %d (%d > %d) to tighten network graph", distantNodeID, distantHops, maxHops) log.Eventf(ctx, "tightening network with new client to %s", nodeAddr) g.startClientLocked(nodeAddr) } } }
// EvictAndReplace instructs the EvictionToken to evict the RangeDescriptor it was // created with from the rangeDescriptorCache. It also allows the user to provide // new RangeDescriptors to insert into the cache, all atomically. When called without // arguments, EvictAndReplace will behave the same as Evict. func (et *EvictionToken) EvictAndReplace( ctx context.Context, newDescs ...roachpb.RangeDescriptor, ) error { var err error et.doOnce.Do(func() { et.doLocker.Lock() defer et.doLocker.Unlock() err = et.do() if err == nil { if len(newDescs) > 0 { err = et.doReplace(newDescs...) log.Eventf(ctx, "evicting cached range descriptor with %d replacements", len(newDescs)) } else { log.Event(ctx, "evicting cached range descriptor") } } }) return err }
// GetSnapshot wraps Snapshot() but does not require the replica lock // to be held and it will block instead of returning // ErrSnapshotTemporaryUnavailable. The caller is directly responsible for // calling r.CloseOutSnap. func (r *Replica) GetSnapshot(ctx context.Context) (*OutgoingSnapshot, error) { for i := 0; ; i++ { log.Eventf(ctx, "snapshot retry loop pass %d", i) r.mu.Lock() doneChan := r.mu.outSnapDone r.mu.Unlock() <-doneChan r.mu.Lock() snap, err := r.SnapshotWithContext(ctx) if err == nil { r.mu.outSnap.claimed = true } r.mu.Unlock() if err == raft.ErrSnapshotTemporarilyUnavailable { continue } else { return snap, err } } }
// Exec executes fn in the context of a distributed transaction.
// Execution is controlled by opt (see comments in TxnExecOptions).
//
// opt is passed to fn, and it's valid for fn to modify opt as it sees
// fit during each execution attempt.
//
// It's valid for txn to be nil (meaning the txn has already aborted) if fn
// can handle that. This is useful for continuing transactions that have been
// aborted because of an error in a previous batch of statements in the hope
// that a ROLLBACK will reset the state. Neither opt.AutoRetry not opt.AutoCommit
// can be set in this case.
//
// When this method returns, txn might be in any state; Exec does not attempt
// to clean up the transaction before returning an error. In case of
// TransactionAbortedError, txn is reset to a fresh transaction, ready to be
// used.
func (txn *Txn) Exec(opt TxnExecOptions, fn func(txn *Txn, opt *TxnExecOptions) error) (err error) {
	// Run fn in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var retryOptions retry.Options
	if txn == nil && (opt.AutoRetry || opt.AutoCommit) {
		panic("asked to retry or commit a txn that is already aborted")
	}
	// Ensure that a RetryableTxnError escaping this function is not used by
	// another (higher-level) Exec() invocation to restart its unrelated
	// transaction. Technically, setting TxnID to nil here is best-effort and
	// doesn't ensure that (the error will be wrongly used if the outer txn also
	// has a nil TxnID).
	// TODO(andrei): set TxnID to a bogus non-nil value once we get rid of the
	// retErr.Transaction field.
	defer func() {
		if retErr, ok := err.(*roachpb.RetryableTxnError); ok {
			retErr.TxnID = nil
			retErr.Transaction = nil
		}
	}()

	if opt.AutoRetry {
		retryOptions = txn.db.ctx.TxnRetryOptions
	}

	for r := retry.Start(retryOptions); r.Next(); {
		if txn != nil {
			// If we're looking at a brand new transaction, then communicate
			// what should be used as initial timestamp for the KV txn created
			// by TxnCoordSender.
			if opt.Clock != nil && !txn.Proto.IsInitialized() {
				// Control the KV timestamp, such that the value returned by
				// `cluster_logical_timestamp()` is consistent with the commit
				// (serializable) ordering.
				txn.Proto.OrigTimestamp = opt.Clock.Now()
			}
		}

		err = fn(txn, &opt)

		// TODO(andrei): Until 7881 is fixed.
		if err == nil && opt.AutoCommit && txn.Proto.Status == roachpb.ABORTED {
			log.Errorf(txn.Context, "#7881: no err but aborted txn proto. opt: %+v, txn: %+v", opt, txn)
		}

		if err == nil && opt.AutoCommit && txn.Proto.Status == roachpb.PENDING {
			// fn succeeded, but didn't commit.
			err = txn.Commit()
			log.Eventf(txn.Context, "client.Txn did AutoCommit. err: %v\ntxn: %+v", err, txn.Proto)
			if err != nil {
				if _, retryable := err.(*roachpb.RetryableTxnError); !retryable {
					// We can't retry, so let the caller know we tried to
					// autocommit.
					err = &AutoCommitError{cause: err}
				}
			}
		}

		if !opt.AutoRetry {
			break
		}

		// A nil err also fails the type assertion, so success exits here too.
		if retErr, retryable := err.(*roachpb.RetryableTxnError); !retryable {
			break
		} else {
			// Make sure the txn record that err carries is for this txn.
			// If it's not, we terminate the "retryable" character of the error.
			if txn.Proto.ID != nil && (retErr.TxnID == nil || *retErr.TxnID != *txn.Proto.ID) {
				return errors.New(retErr.Error())
			}
			if !retErr.Backoff {
				// Retry immediately rather than backing off.
				r.Reset()
			}
		}
		// Drop any commit triggers registered by the failed attempt.
		txn.commitTriggers = nil

		log.VEventf(txn.Context, 2, "automatically retrying transaction: %s because of error: %s",
			txn.DebugName(), err)
	}

	return err
}
// execStmtInOpenTxn executes one statement in the context
// of the planner's transaction (which is assumed to exist).
// It handles statements that affect the transaction state (BEGIN, COMMIT)
// and delegates everything else to `execStmt`.
// It binds placeholders.
//
// The current transaction might be committed/rolled back when this returns.
// It might also have transitioned to the aborted or RestartWait state.
//
// Args:
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
//  COMMIT/ROLLBACK statements are rejected if set. Also, the transaction
//  might be auto-committed in this function.
// firstInTxn: set for the first statement in a transaction. Used
//  so that nested BEGIN statements are caught.
// stmtTimestamp: Used as the statement_timestamp().
//
// Returns:
// - a Result
// - an error, if any. In case of error, the result returned also reflects this error.
func (e *Executor) execStmtInOpenTxn(
	stmt parser.Statement,
	planMaker *planner,
	implicitTxn bool,
	firstInTxn bool,
	txnState *txnState,
) (Result, error) {
	if txnState.State != Open {
		panic("execStmtInOpenTxn called outside of an open txn")
	}
	if planMaker.txn == nil {
		panic("execStmtInOpenTxn called with a txn not set on the planner")
	}

	// Bind the txn and statement timestamps used by now()/statement_timestamp().
	planMaker.evalCtx.SetTxnTimestamp(txnState.sqlTimestamp)
	planMaker.evalCtx.SetStmtTimestamp(e.cfg.Clock.PhysicalTime())

	session := planMaker.session
	log.Eventf(session.context, "%s", stmt)

	// TODO(cdo): Figure out how to not double count on retries.
	e.updateStmtCounts(stmt)
	switch s := stmt.(type) {
	case *parser.BeginTransaction:
		// Nested BEGIN: only legal as the first statement of a transaction.
		if !firstInTxn {
			txnState.updateStateAndCleanupOnErr(errTransactionInProgress, e)
			return Result{Err: errTransactionInProgress}, errTransactionInProgress
		}
	case *parser.CommitTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// CommitTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, commit, e)
		return res, err
	case *parser.ReleaseSavepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Savepoint); err != nil {
			return Result{Err: err}, err
		}
		// ReleaseSavepoint is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, release, e)
		return res, err
	case *parser.RollbackTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// RollbackTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		// Notice that we don't return any errors on rollback.
		return rollbackSQLTransaction(txnState, planMaker), nil
	case *parser.SetTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
	case *parser.Savepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Name); err != nil {
			return Result{Err: err}, err
		}
		// We want to disallow SAVEPOINTs to be issued after a transaction has
		// started running, but such enforcement is problematic in the
		// presence of transaction retries (since the transaction proto is
		// necessarily reused). To work around this, we keep track of the
		// transaction's retrying state and special-case SAVEPOINT when it is
		// set.
		//
		// TODO(andrei): the check for retrying is a hack - we erroneously
		// allow SAVEPOINT to be issued at any time during a retry, not just
		// in the beginning. We should figure out how to track whether we
		// started using the transaction during a retry.
		if txnState.txn.Proto.IsInitialized() && !txnState.retrying {
			err := fmt.Errorf("SAVEPOINT %s needs to be the first statement in a transaction",
				parser.RestartSavepointName)
			txnState.updateStateAndCleanupOnErr(err, e)
			return Result{Err: err}, err
		}
		// Note that Savepoint doesn't have a corresponding plan node.
		// This here is all the execution there is.
		txnState.retryIntent = true
		return Result{}, nil
	case *parser.RollbackToSavepoint:
		err := parser.ValidateRestartCheckpoint(s.Savepoint)
		if err == nil {
			// Can't restart if we didn't get an error first, which would've put the
			// txn in a different state.
			err = errNotRetriable
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Prepare:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Prepared statements are supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Execute:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Executing prepared statements is supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Deallocate:
		if s.Name == "" {
			// DEALLOCATE ALL.
			planMaker.session.PreparedStatements.DeleteAll()
		} else {
			if found := planMaker.session.PreparedStatements.Delete(string(s.Name)); !found {
				err := fmt.Errorf("prepared statement %s does not exist", s.Name)
				txnState.updateStateAndCleanupOnErr(err, e)
				return Result{Err: err}, err
			}
		}
		return Result{PGTag: s.StatementTag()}, nil
	}

	// Everything else is delegated to the regular statement execution path.
	autoCommit := implicitTxn && !e.cfg.TestingKnobs.DisableAutoCommit
	result, err := e.execStmt(stmt, planMaker, autoCommit)
	if err != nil {
		if result.Rows != nil {
			result.Rows.Close()
			result.Rows = nil
		}
		if traceSQL {
			log.ErrEventf(txnState.txn.Context, "ERROR: %v", err)
		}
		log.ErrEventf(session.context, "ERROR: %v", err)
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	}

	// Record a summary of the result (tag plus row/affected count) in the trace.
	tResult := &traceResult{tag: result.PGTag, count: -1}
	switch result.Type {
	case parser.RowsAffected:
		tResult.count = result.RowsAffected
	case parser.Rows:
		tResult.count = result.Rows.Len()
	}
	if traceSQL {
		log.Eventf(txnState.txn.Context, "%s done", tResult)
	}
	log.Eventf(session.context, "%s done", tResult)
	return result, nil
}
// execRequest executes the request using the provided planner.
// It parses the sql into statements, iterates through the statements, creates
// KV transactions and automatically retries them when possible, and executes
// the (synchronous attempt of) schema changes.
// It will accumulate a result in Response for each statement.
// It will resume a SQL transaction, if one was previously open for this client.
//
// execRequest handles the mismatch between the SQL interface that the Executor
// provides, based on statements being streamed from the client in the context
// of a session, and the KV client.Txn interface, based on (possibly-retriable)
// callbacks passed to be executed in the context of a transaction. Actual
// execution of statements in the context of a KV txn is delegated to
// runTxnAttempt().
//
// Args:
// txnState: State about the ongoing transaction (if any). The state will be
// updated.
func (e *Executor) execRequest(session *Session, sql string, copymsg copyMsg) StatementResults {
	var res StatementResults
	txnState := &session.TxnState
	planMaker := &session.planner
	var stmts parser.StatementList
	var err error

	log.VEventf(session.Ctx(), 2, "execRequest: %s", sql)

	// Three input modes: COPY data continuation, an unexpected copy message,
	// or a regular SQL string to parse.
	if session.planner.copyFrom != nil {
		stmts, err = session.planner.ProcessCopyData(sql, copymsg)
	} else if copymsg != copyMsgNone {
		err = fmt.Errorf("unexpected copy command")
	} else {
		stmts, err = planMaker.parser.Parse(sql, parser.Syntax(session.Syntax))
	}
	if err != nil {
		// A parse error occurred: we can't determine if there were multiple
		// statements or only one, so just pretend there was one.
		if txnState.txn != nil {
			// Rollback the txn.
			txnState.updateStateAndCleanupOnErr(err, e)
		}
		res.ResultList = append(res.ResultList, Result{Err: err})
		return res
	}
	if len(stmts) == 0 {
		res.Empty = true
		return res
	}

	// If the planMaker wants config updates to be blocked, then block them.
	// Note the immediate call: the returned function is what gets deferred.
	defer planMaker.blockConfigUpdatesMaybe(e)()

	for len(stmts) > 0 {
		// Each iteration consumes a transaction's worth of statements.

		inTxn := txnState.State != NoTxn
		execOpt := client.TxnExecOptions{
			Clock: e.cfg.Clock,
		}
		// Figure out the statements out of which we're going to try to consume
		// this iteration. If we need to create an implicit txn, only one statement
		// can be consumed.
		stmtsToExec := stmts
		// If protoTS is set, the transaction proto sets its Orig and Max timestamps
		// to it each retry.
		var protoTS *hlc.Timestamp
		// We can AutoRetry the next batch of statements if we're in a clean state
		// (i.e. the next statements we're going to see are the first statements in
		// a transaction).
		if !inTxn {
			// Detect implicit transactions.
			if _, isBegin := stmts[0].(*parser.BeginTransaction); !isBegin {
				execOpt.AutoCommit = true
				stmtsToExec = stmtsToExec[:1]
				// Check for AS OF SYSTEM TIME. If it is present but not detected here,
				// it will raise an error later on.
				protoTS, err = isAsOf(planMaker, stmtsToExec[0], e.cfg.Clock.Now())
				if err != nil {
					res.ResultList = append(res.ResultList, Result{Err: err})
					return res
				}
				if protoTS != nil {
					planMaker.avoidCachedDescriptors = true
					// NOTE(review): this defer runs at function exit, not at the end of
					// this loop iteration; with AutoCommit set only one statement is
					// consumed per iteration, so the flag stays set for the remainder
					// of the request — confirm this is intended.
					defer func() {
						planMaker.avoidCachedDescriptors = false
					}()
				}
			}
			txnState.resetForNewSQLTxn(e, session)
			txnState.autoRetry = true
			txnState.sqlTimestamp = e.cfg.Clock.PhysicalTime()
			if execOpt.AutoCommit {
				txnState.txn.SetDebugName(sqlImplicitTxnName, 0)
			} else {
				txnState.txn.SetDebugName(sqlTxnName, 0)
			}
		} else {
			txnState.autoRetry = false
		}
		execOpt.AutoRetry = txnState.autoRetry
		if txnState.State == NoTxn {
			panic("we failed to initialize a txn")
		}
		// Now actually run some statements.
		// results/remainingStmts are captured by txnClosure and overwritten on
		// each (auto-)retry attempt; earlier attempts' results are Close()d.
		var remainingStmts parser.StatementList
		var results []Result
		origState := txnState.State

		txnClosure := func(txn *client.Txn, opt *client.TxnExecOptions) error {
			if txnState.State == Open && txnState.txn != txn {
				panic(fmt.Sprintf("closure wasn't called in the txn we set up for it."+
					"\ntxnState.txn:%+v\ntxn:%+v\ntxnState:%+v", txnState.txn, txn, txnState))
			}
			txnState.txn = txn
			if protoTS != nil {
				setTxnTimestamps(txnState.txn, *protoTS)
			}
			var err error
			if results != nil {
				// Some results were produced by a previous attempt. Discard them.
				ResultList(results).Close()
			}
			results, remainingStmts, err = runTxnAttempt(e, planMaker, origState, txnState, opt, stmtsToExec)
			// TODO(andrei): Until #7881 fixed.
			if err == nil && txnState.State == Aborted {
				log.Errorf(session.Ctx(),
					"7881: txnState is Aborted without an error propagating. stmtsToExec: %s, "+
						"results: %+v, remainingStmts: %s, txnState: %+v", stmtsToExec, results,
					remainingStmts, txnState)
			}
			return err
		}
		// This is where the magic happens - we ask db to run a KV txn and possibly retry it.
		txn := txnState.txn // this might be nil if the txn was already aborted.
		// NOTE(review): if txn is nil here, this relies on (*client.Txn).Exec
		// tolerating a nil receiver — confirm against the client package.
		err := txn.Exec(execOpt, txnClosure)

		// Update the Err field of the last result if the error was coming from
		// auto commit. The error was generated outside of the txn closure, so it was not
		// set in any result.
		if err != nil {
			lastResult := &results[len(results)-1]
			if aErr, ok := err.(*client.AutoCommitError); ok {
				// TODO(andrei): Until #7881 fixed.
				{
					log.Eventf(session.Ctx(), "executor got AutoCommitError: %s\n"+
						"txn: %+v\nexecOpt.AutoRetry %t, execOpt.AutoCommit:%t, stmts %+v, remaining %+v",
						aErr, txnState.txn.Proto, execOpt.AutoRetry, execOpt.AutoCommit, stmts,
						remainingStmts)
					if txnState.txn == nil {
						log.Errorf(session.Ctx(), "7881: AutoCommitError on nil txn: %s, "+
							"txnState %+v, execOpt %+v, stmts %+v, remaining %+v",
							aErr, txnState, execOpt, stmts, remainingStmts)
						txnState.sp.SetBaggageItem(keyFor7881Sample, "sample me please")
					}
				}
				lastResult.Err = aErr
				e.TxnAbortCount.Inc(1)
				txn.CleanupOnError(err)
			}
			if lastResult.Err == nil {
				log.Fatalf(session.Ctx(),
					"error (%s) was returned, but it was not set in the last result (%v)",
					err, lastResult)
			}
		}

		res.ResultList = append(res.ResultList, results...)
		// Now make sense of the state we got into and update txnState.
		if (txnState.State == RestartWait || txnState.State == Aborted) &&
			txnState.commitSeen {
			// A COMMIT got an error (retryable or not). Too bad, this txn is toast.
			// After we return a result for COMMIT (with the COMMIT pgwire tag), the
			// user can't send any more commands.
			e.TxnAbortCount.Inc(1)
			txn.CleanupOnError(err)
			txnState.resetStateAndTxn(NoTxn)
		}

		if execOpt.AutoCommit {
			// If execOpt.AutoCommit was set, then the txn no longer exists at this point.
			txnState.resetStateAndTxn(NoTxn)
		}
		// If we're no longer in a transaction, finish the trace.
		if txnState.State == NoTxn {
			txnState.finishSQLTxn(session.context)
		}

		// If the txn is in any state but Open, exec the schema changes. They'll
		// short-circuit themselves if the mutation that queued them has been
		// rolled back from the table descriptor.
		stmtsExecuted := stmts[:len(stmtsToExec)-len(remainingStmts)]
		if txnState.State != Open {
			planMaker.checkTestingVerifyMetadataInitialOrDie(e, stmts)
			planMaker.checkTestingVerifyMetadataOrDie(e, stmtsExecuted)
			// Exec the schema changers (if the txn rolled back, the schema changers
			// will short-circuit because the corresponding descriptor mutation is not
			// found).
			planMaker.releaseLeases()
			txnState.schemaChangers.execSchemaChanges(e, planMaker, res.ResultList)
		} else {
			// We're still in a txn, so we only check that the verifyMetadata callback
			// fails the first time it's run. The gossip update that will make the
			// callback succeed only happens when the txn is done.
			planMaker.checkTestingVerifyMetadataInitialOrDie(e, stmtsExecuted)
		}

		// Figure out what statements to run on the next iteration.
		if err != nil {
			// Don't execute anything further.
			stmts = nil
		} else if execOpt.AutoCommit {
			stmts = stmts[1:]
		} else {
			stmts = remainingStmts
		}
	}

	return res
}
// resolveIntents resolves the given intents. `wait` is currently a
// no-op; all intents are resolved synchronously.
//
// TODO(bdarnell): Restore the wait=false optimization when/if #8360
// is fixed. `wait=false` requests a semi-synchronous operation,
// returning when all local commands have been *proposed* but not yet
// committed or executed. This ensures that if a waiting client
// retries immediately after calling this function, it will not hit
// the same intents again (in the absence of #8360, we provide this
// guarantee by resolving the intents synchronously regardless of the
// `wait` argument).
func (ir *intentResolver) resolveIntents(
	ctx context.Context, intents []roachpb.Intent, wait bool, poison bool,
) error {
	// Force synchronous operation; see above TODO.
	wait = true
	if len(intents) == 0 {
		return nil
	}
	// We're doing async stuff below; those need new traces.
	ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer())
	defer cleanup()
	log.Eventf(ctx, "resolving intents [wait=%t]", wait)

	// Build one point- or range-resolution request per intent, keyed off
	// whether the intent's span has an EndKey.
	var reqs []roachpb.Request
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		{
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			}
		}

		reqs = append(reqs, resolveArgs)
	}

	// Resolve all of the intents.
	if len(reqs) > 0 {
		b := &client.Batch{}
		b.AddRawRequest(reqs...)
		action := func() error {
			// TODO(tschottdorf): no tracing here yet.
			return ir.store.DB().Run(ctx, b)
		}
		// Because wait is forced true above, the short-circuit `||` means the
		// async task is never started and the batch always runs synchronously
		// via action() below. The async branch is kept for when the TODO above
		// is resolved; RunLimitedAsyncTask returning non-nil (e.g. draining)
		// also falls through to the synchronous path.
		if wait || ir.store.Stopper().RunLimitedAsyncTask(ctx, ir.sem, true /* wait */, func(ctx context.Context) {
			if err := action(); err != nil {
				log.Warningf(ctx, "unable to resolve external intents: %s", err)
			}
		}) != nil {
			// Try async to not keep the caller waiting, but when draining
			// just go ahead and do it synchronously. See #1684.
			// TODO(tschottdorf): This is ripe for removal.
			if err := action(); err != nil {
				return err
			}
		}
	}

	return nil
}
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
func (s *Server) Start(ctx context.Context) error {
	ctx = s.AnnotateCtx(ctx)

	startTime := timeutil.Now()

	tlsConfig, err := s.cfg.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	// Redirects plain-HTTP requests to the HTTPS equivalent (TLS case only;
	// only served when tlsConfig != nil below).
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.
	ln, err := net.Listen("tcp", s.cfg.Addr)
	if err != nil {
		return err
	}
	// NOTE(review): message says "port" but logs the full address — consider
	// rewording on a future cleanup.
	log.Eventf(ctx, "listening on port %s", s.cfg.Addr)
	unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.Addr = unresolvedListenAddr.String()
	unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

	s.rpcContext.SetLocalInternalServer(s.node)

	// Multiplex pgwire and everything else (gRPC/HTTP) on the single listener.
	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

	workersCtx := s.AnnotateCtx(context.Background())

	// Close the HTTP listener on quiesce so Serve() below returns.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(workersCtx, err)
		}
	})

	if tlsConfig != nil {
		// Split the HTTP port: plain HTTP1 gets redirected to https; everything
		// else is treated as TLS.
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	// On quiesce, close the combined listener (stops accepting new gRPC/pg
	// connections); once fully stopping, hard-stop gRPC.
	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	// Serve pgwire connections off the multiplexed listener.
	s.stopper.RunWorker(func() {
		pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
			if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
				// Report the error on this connection's context, so that we
				// know which remote client caused the error when looking at
				// the logs.
				log.Error(connCtx, err)
			}
		}))
	})

	if len(s.cfg.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.cfg.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(workersCtx, err)
			}
		})

		s.stopper.RunWorker(func() {
			pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
					// Report the error on this connection's context, so that we
					// know which remote client caused the error when looking at
					// the logs.
					log.Error(connCtx, err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAdvertAddr)
	log.Event(ctx, "started gossip")

	s.engines, err = s.cfg.CreateEngines()
	if err != nil {
		return errors.Wrap(err, "failed to create engines")
	}
	s.stopper.AddCloser(&s.engines)

	// We might have to sleep a bit to protect against this node producing non-
	// monotonic timestamps. Before restarting, its clock might have been driven
	// by other nodes' fast clocks, but when we restarted, we lost all this
	// information. For example, a client might have written a value at a
	// timestamp that's in the future of the restarted node's clock, and if we
	// don't do something, the same client's read would not return the written
	// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
	// than MaxOffset in the future (assuming that MaxOffset was not changed, see
	// #9733).
	//
	// As an optimization for tests, we don't sleep if all the stores are brand
	// new. In this case, the node will not serve anything anyway until it
	// synchronizes with other nodes.
	{
		anyStoreBootstrapped := false
		for _, e := range s.engines {
			if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
				// NotBootstrappedError is expected.
				if _, ok := err.(*storage.NotBootstrappedError); !ok {
					return err
				}
			} else {
				anyStoreBootstrapped = true
				break
			}
		}
		if anyStoreBootstrapped {
			sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
			if sleepDuration > 0 {
				log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
				time.Sleep(sleepDuration)
			}
		}
	}

	// Now that we have a monotonic HLC wrt previous incarnations of the process,
	// init all the replicas.
	err = s.node.start(
		ctx,
		unresolvedAdvertAddr,
		s.engines,
		s.cfg.NodeAttributes,
		s.cfg.Locality,
	)
	if err != nil {
		return err
	}
	log.Event(ctx, "started node")

	s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(
		s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
	)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := &sql.SchemaChangerTestingKnobs{}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	s.distSQLServer.Start()

	log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
	log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)
	if len(s.cfg.SocketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)
	}

	// Start accepting connections on the multiplexed main listener.
	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})

	log.Event(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &protoutil.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       " ",
	}
	protopb := new(protoutil.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers. The gateway dials back into this same
	// node's gRPC endpoint.
	conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	// Serve UI assets either from disk (debug) or from compiled-in assets.
	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("pkg/ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminPrefix, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, gwMux)
	s.mux.Handle("/health", gwMux)
	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
	log.Event(ctx, "added http endpoints")

	// Tell systemd (if running under it) that the server is ready.
	if err := sdnotify.Ready(); err != nil {
		log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Event(ctx, "server ready")

	return nil
}
// initStores initializes the Stores map from ID to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(
	ctx context.Context, engines []engine.Engine, stopper *stop.Stopper, bootstrapped bool,
) error {
	var bootstraps []*storage.Store

	if len(engines) == 0 {
		return errors.Errorf("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.storeCfg, e, &n.Descriptor)
		log.Eventf(ctx, "created store for engine: %s", e)
		if bootstrapped {
			s.NotifyBootstrapped()
		}
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(ctx, stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof(ctx, "store %s not bootstrapped", s)
				bootstraps = append(bootstraps, s)
				continue
			}
			return errors.Errorf("failed to start store: %s", err)
		}
		// A started store must carry a fully-populated ident.
		if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 {
			return errors.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
		n.addStore(s)
	}

	// If there are no initialized stores and no gossip resolvers,
	// bootstrap this node as the seed of a new cluster.
	if n.stores.GetStoreCount() == 0 {
		resolvers := n.storeCfg.Gossip.GetResolvers()
		// Check for the case of uninitialized node having only itself specified as join host.
		switch len(resolvers) {
		case 0:
			return errNeedsBootstrap
		case 1:
			if resolvers[0].Addr() == n.Descriptor.Address.String() {
				return errCannotJoinSelf
			}
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}
	log.Event(ctx, "validated stores")

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip(ctx)
	log.Event(ctx, "connected to gossip")

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
		n.initialBoot = true
		log.Eventf(ctx, "allocated node ID %d", n.Descriptor.NodeID)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if len(bootstraps) > 0 {
		if err := stopper.RunAsyncTask(ctx, func(ctx context.Context) {
			n.bootstrapStores(ctx, bootstraps, stopper)
		}); err != nil {
			return err
		}
	}

	return nil
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp := opentracing.SpanFromContext(ctx)
	var tracer opentracing.Tracer
	if sp == nil {
		tracer = tc.AmbientContext.Tracer
		sp = tracer.StartSpan(opTxnCoordSender)
		defer sp.Finish()
		ctx = opentracing.ContextWithSpan(ctx, sp)
	} else {
		tracer = sp.Tracer()
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		txnID := *ba.Txn.ID
		// Associate the txnID with the trace. We need to do this after the
		// maybeBeginTxn call. We set both a baggage item and a tag because only
		// tags show up in the Lightstep UI.
		txnIDStr := txnID.String()
		sp.SetTag("txnID", txnIDStr)
		sp.SetBaggageItem("txnID", txnIDStr)

		// Extract the EndTransaction request, if any, and validate it: the
		// client must not set its Key or pass intents; both are owned here.
		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		// The closure scopes tc's mutex to just the client-rejection check and
		// the intent-span bookkeeping below.
		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[txnID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			// We can't pass in a batch response here to better limit the key
			// spans as we don't know what is going to be affected. This will
			// affect queries such as `DELETE FROM my.table LIMIT 10` when
			// executed as a 1PC transaction. e.g.: a (BeginTransaction,
			// DeleteRange, EndTransaction) batch.
			ba.IntentSpanIterate(nil, func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			var distinct bool
			// The request might already be used by an outgoing goroutine, so
			// we can't safely mutate anything in-place (as MergeSpans does).
			et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...)
			et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans)
			ba.Header.DistinctSpans = distinct && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Eventf(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
			}
		}
	}

	// Embed the trace metadata into the header for use by RPC recipients. We need
	// to do this after the maybeBeginTxn call above.
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.TraceContext == nil {
		ba.TraceContext = &tracing.SpanContextCarrier{}
	} else {
		// We didn't make this object but are about to mutate it, so we
		// have to take a copy - the original might already have been
		// passed to the RPC layer.
		ba.TraceContext = protoutil.Clone(ba.TraceContext).(*tracing.SpanContextCarrier)
	}
	if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.TraceContext); err != nil {
		return nil, roachpb.NewError(err)
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, startNS, ba, br, pErr); pErr != nil {
			log.Eventf(ctx, "error: %s", pErr)
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}

	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() - time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		// The sleep is deferred so the response is only returned to the client
		// after the wait, without holding any locks in the meantime.
		defer func() {
			if log.V(1) {
				log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability",
					br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}