Example #1
0
// maybeSignalStatusChangeLocked checks whether gossip should transition its
// internal state from connected to stalled or vice versa.
func (g *Gossip) maybeSignalStatusChangeLocked() {
	ctx := g.AnnotateCtx(context.TODO())
	orphaned := g.outgoing.len()+g.mu.incoming.len() == 0
	stalled := orphaned || g.mu.is.getInfo(KeySentinel) == nil
	if stalled {
		// We employ the stalled boolean to avoid filling logs with warnings.
		if !g.stalled {
			log.Eventf(ctx, "now stalled")
			if orphaned {
				if len(g.resolvers) == 0 {
					log.Warningf(ctx, "no resolvers found; use --join to specify a connected node")
				} else {
					log.Warningf(ctx, "no incoming or outgoing connections")
				}
			} else if len(g.resolversTried) == len(g.resolvers) {
				log.Warningf(ctx, "first range unavailable; resolvers exhausted")
			} else {
				log.Warningf(ctx, "first range unavailable; trying remaining resolvers")
			}
		}
		if len(g.resolvers) > 0 {
			g.signalStalledLocked()
		}
	} else {
		if g.stalled {
			log.Eventf(ctx, "connected")
			log.Infof(ctx, "node has connected to cluster via gossip")
			g.signalConnectedLocked()
		}
		g.maybeCleanupBootstrapAddressesLocked()
	}
	g.stalled = stalled
}
Example #2
0
// addInternal adds the replica the queue with specified priority. If
// the replica is already queued, updates the existing
// priority. Expects the queue lock to be held by caller.
func (bq *baseQueue) addInternal(
	ctx context.Context, desc *roachpb.RangeDescriptor, should bool, priority float64,
) (bool, error) {
	if bq.mu.stopped {
		return false, errQueueStopped
	}

	if bq.mu.disabled {
		log.Event(ctx, "queue disabled")
		return false, errQueueDisabled
	}

	if !desc.IsInitialized() {
		// We checked this above in MaybeAdd(), but we need to check it
		// again for Add().
		return false, errors.New("replica not initialized")
	}

	// If the replica is currently in purgatory, don't re-add it.
	if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
		return false, nil
	}

	item, ok := bq.mu.replicas[desc.RangeID]
	if !should {
		if ok {
			log.Eventf(ctx, "%s: removing from queue", item.value)
			bq.remove(item)
		}
		return false, errReplicaNotAddable
	} else if ok {
		if item.priority != priority {
			log.Eventf(ctx, "%s: updating priority: %0.3f -> %0.3f",
				desc, item.priority, priority)
		}
		// Replica has already been added; update priority.
		bq.mu.priorityQ.update(item, priority)
		return false, nil
	}

	log.VEventf(ctx, 3, "%s: adding: priority=%0.3f", desc, priority)
	item = &replicaItem{value: desc.RangeID, priority: priority}
	bq.add(item)

	// If adding this replica has pushed the queue past its maximum size,
	// remove the lowest priority element.
	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
		bq.remove(bq.mu.priorityQ[pqLen-1])
	}
	// Signal the processLoop that a replica has been added.
	select {
	case bq.incoming <- struct{}{}:
	default:
		// No need to signal again.
	}
	return true, nil
}
Example #3
0
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := g.AnnotateCtx(context.Background())
		ctx = log.WithLogTag(ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.NodeID.Get())
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
Example #4
0
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up next range in the span. The retry loop
	// deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor misses parts of the
		// keys it's supposed to include after it's truncated to match the
		// descriptor. Example revscan [a,g), first desc lookup for "g"
		// returns descriptor [c,d) -> [d,g) is never scanned.
		// We evict and retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
Example #5
0
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
func (bq *baseQueue) processReplica(
	queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(queueCtx, 1, "no system config available, skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(queueCtx, 3, "split needed; skipping")
		return nil
	}

	// Putting a span in a context means that events will no longer go to the
	// event log. Use queueCtx for events that are intended for the event log.
	ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
	defer span.Finish()
	// Also add the Replica annotations to ctx.
	ctx = repl.AnnotateCtx(ctx)
	ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
	defer cancel()
	log.Eventf(ctx, "processing replica")

	if err := repl.IsDestroyed(); err != nil {
		log.VEventf(queueCtx, 3, "replica destroyed (%s); skipping", err)
		return nil
	}

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has range lease
	// and renew or acquire if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			switch v := err.GetDetail().(type) {
			case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
				log.VEventf(queueCtx, 3, "%s; skipping", v)
				return nil
			default:
				return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
			}
		}
		log.Event(ctx, "got range lease")
	}

	log.VEventf(queueCtx, 3, "processing")
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.Event(ctx, "done")
	bq.successes.Inc(1)
	return nil
}
Example #6
0
// GetSnapshot returns a snapshot of the replica appropriate for sending to a
// replica. If this method returns without error, callers must eventually call
// OutgoingSnapshot.Close.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	rangeID := r.RangeID

	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		err := errors.Errorf(
			"%s: not generating %s snapshot because replica is too large: %d > 2 * %d",
			r, snapType, size, maxBytes)
		return &OutgoingSnapshot{}, err
	}

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()
	snap := r.store.NewSnapshot()
	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	snapData, err := snapshot(ctx, snapType, snap, rangeID, r.store.raftEntryCache, startKey)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %s", err)
		return nil, err
	}
	return &snapData, nil
}
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporaryUnavailable. The caller is directly responsible for
// calling r.CloseOutSnap.
func (r *Replica) GetSnapshot(ctx context.Context, snapType string) (*OutgoingSnapshot, error) {
	// Use shorter-than-usual backoffs because this rarely succeeds on
	// the first attempt and this method is used a lot in tests.
	// Unsuccessful attempts are cheap, so we can have a low MaxBackoff.
	retryOpts := retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     100 * time.Millisecond,
		Multiplier:     2,
	}
	for retryObj := retry.StartWithCtx(ctx, retryOpts); retryObj.Next(); {
		log.Eventf(ctx, "snapshot retry loop pass %d", retryObj.CurrentAttempt())

		r.mu.Lock()
		doneChan := r.mu.outSnapDone
		r.mu.Unlock()

		<-doneChan

		r.mu.Lock()
		snap, err := r.snapshotWithContext(ctx, snapType)
		if err == nil {
			r.mu.outSnap.claimed = true
		}
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			continue
		} else {
			return snap, err
		}
	}
	return nil, ctx.Err() // the only loop exit condition
}
Example #8
0
// maybeAddBootstrapAddress adds the specified address to the list of
// bootstrap addresses if not already present. Returns whether a new
// bootstrap address was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddBootstrapAddress(addr util.UnresolvedAddr) bool {
	if _, ok := g.bootstrapAddrs[addr]; ok {
		return false
	}
	g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr)
	g.bootstrapAddrs[addr] = struct{}{}
	ctx := g.AnnotateCtx(context.TODO())
	log.Eventf(ctx, "add bootstrap %s", addr)
	return true
}
Example #9
0
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding the MaxHops
// threshold. If the number of outgoing clients doesn't exceed
// maxPeers(), a new gossip client is connected to a randomly selected
// peer beyond MaxHops threshold. Otherwise, the least useful peer
// node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled conditional variable.
func (g *Gossip) manage() {
	g.server.stopper.RunWorker(func() {
		ctx := g.AnnotateCtx(context.Background())
		cullTicker := time.NewTicker(g.jitteredInterval(g.cullInterval))
		stallTicker := time.NewTicker(g.jitteredInterval(g.stallInterval))
		defer cullTicker.Stop()
		defer stallTicker.Stop()
		for {
			select {
			case <-g.server.stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(c)
			case nodeID := <-g.tighten:
				g.tightenNetwork(nodeID)
			case <-cullTicker.C:
				func() {
					g.mu.Lock()
					if !g.outgoing.hasSpace() {
						leastUsefulID := g.mu.is.leastUseful(g.outgoing)

						if c := g.findClient(func(c *client) bool {
							return c.peerID == leastUsefulID
						}); c != nil {
							if log.V(1) {
								log.Infof(ctx, "closing least useful client %+v to tighten network graph", c)
							}
							log.Eventf(ctx, "culling %s", c.addr)
							c.close()

							// After releasing the lock, block until the client disconnects.
							defer func() {
								g.doDisconnected(<-g.disconnected)
							}()
						} else {
							if log.V(1) {
								g.clientsMu.Lock()
								log.Infof(ctx, "couldn't find least useful client among %+v", g.clientsMu.clients)
								g.clientsMu.Unlock()
							}
						}
					}
					g.mu.Unlock()
				}()
			case <-stallTicker.C:
				g.mu.Lock()
				g.maybeSignalStatusChangeLocked()
				g.mu.Unlock()
			}
		}
	})
}
Example #10
0
// maybeAddBootstrapAddress adds the specified address to the list of
// bootstrap addresses if not already present. Returns whether a new
// bootstrap address was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddBootstrapAddress(addr util.UnresolvedAddr, nodeID roachpb.NodeID) bool {
	if existingNodeID, ok := g.bootstrapAddrs[addr]; ok {
		if existingNodeID == unknownNodeID || existingNodeID != nodeID {
			g.bootstrapAddrs[addr] = nodeID
		}
		return false
	}
	g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr)
	g.bootstrapAddrs[addr] = nodeID
	ctx := g.AnnotateCtx(context.TODO())
	log.Eventf(ctx, "add bootstrap %s", addr)
	return true
}
Example #11
0
// removeClient removes the specified client. Called when a client
// disconnects.
func (g *Gossip) removeClient(target *client) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	for i, candidate := range g.clientsMu.clients {
		if candidate == target {
			ctx := g.AnnotateCtx(context.TODO())
			log.Eventf(ctx, "client %s disconnected", candidate.addr)
			g.clientsMu.clients = append(g.clientsMu.clients[:i], g.clientsMu.clients[i+1:]...)
			delete(g.bootstrapping, candidate.addr.String())
			g.outgoing.removeNode(candidate.peerID)
			break
		}
	}
}
Example #12
0
// startClient launches a new client connected to remote address.
// The client is added to the outgoing address set and launched in
// a goroutine.
func (g *Gossip) startClient(addr net.Addr, nodeID roachpb.NodeID) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	breaker, ok := g.clientsMu.breakers[addr.String()]
	if !ok {
		breaker = g.rpcContext.NewBreaker()
		g.clientsMu.breakers[addr.String()] = breaker
	}
	ctx := g.AnnotateCtx(context.TODO())
	log.Eventf(ctx, "starting new client to %s", addr)
	c := newClient(g.server.AmbientContext, addr, g.serverMetrics)
	g.clientsMu.clients = append(g.clientsMu.clients, c)
	c.start(g, g.disconnected, g.rpcContext, g.server.stopper, nodeID, breaker)
}
Example #13
0
// tightenNetwork "tightens" the network by starting a new gossip
// client to the most distant node as measured in required gossip hops
// to propagate info from the distant node to this node.
func (g *Gossip) tightenNetwork(distantNodeID roachpb.NodeID) {
	g.mu.Lock()
	defer g.mu.Unlock()
	if g.outgoing.hasSpace() {
		ctx := g.AnnotateCtx(context.TODO())
		if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil {
			log.Errorf(ctx, "unable to get address for node %d: %s", distantNodeID, err)
		} else {
			log.Infof(ctx, "starting client to distant node %d to tighten network graph", distantNodeID)
			log.Eventf(ctx, "tightening network with new client to %s", nodeAddr)
			g.startClient(nodeAddr, g.NodeID.Get())
		}
	}
}
Example #14
0
// maybeAddResolver creates and adds a resolver for the specified
// address if one does not already exist. Returns whether a new
// resolver was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddResolver(addr util.UnresolvedAddr) bool {
	if _, ok := g.resolverAddrs[addr]; ok {
		return false
	}
	ctx := g.AnnotateCtx(context.TODO())
	r, err := resolver.NewResolverFromUnresolvedAddr(addr)
	if err != nil {
		log.Warningf(ctx, "bad address %s: %s", addr, err)
		return false
	}
	g.resolvers = append(g.resolvers, r)
	g.resolverAddrs[addr] = r
	log.Eventf(ctx, "add resolver %s", r)
	return true
}
// snapshotWithContext is the main implementation for Snapshot() but it takes
// a context to allow tracing. If this method returns without error, callers
// must eventually call CloseOutSnap to ready this replica for more snapshots.
// r.mu must be held.
func (r *Replica) snapshotWithContext(
	ctx context.Context, snapType string,
) (*OutgoingSnapshot, error) {
	r.mu.AssertHeld()
	rangeID := r.RangeID

	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx,
			"not generating %s snapshot because replica is too large: %d > 2 * %d",
			snapType, size, maxBytes)
		return &OutgoingSnapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	select {
	case <-r.mu.outSnapDone:
	default:
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}
	if !r.store.AcquireRaftSnapshot() {
		log.Event(ctx, "snapshot already running")
		return nil, raft.ErrSnapshotTemporarilyUnavailable
	}

	startKey := r.mu.state.Desc.StartKey
	ctx, sp := r.AnnotateCtxWithSpan(ctx, "snapshot")
	defer sp.Finish()
	snap := r.store.NewSnapshot()
	log.Eventf(ctx, "new engine snapshot for replica %s", r)

	// Delegate to a static function to make sure that we do not depend
	// on any indirect calls to r.store.Engine() (or other in-memory
	// state of the Replica). Everything must come from the snapshot.
	snapData, err := snapshot(ctx, snapType, snap, rangeID, r.store.raftEntryCache, startKey)
	if err != nil {
		log.Errorf(ctx, "error generating snapshot: %s", err)
		return nil, err
	}
	log.Event(ctx, "snapshot generated")
	r.store.metrics.RangeSnapshotsGenerated.Inc(1)
	r.mu.outSnap = snapData
	r.mu.outSnapDone = make(chan struct{})
	return &r.mu.outSnap, nil
}
Example #16
0
// tightenNetwork "tightens" the network by starting a new gossip client to the
// client to the most distant node to which we don't already have an outgoing
// connection. Does nothing if we don't have room for any more outgoing
// connections.
func (g *Gossip) tightenNetwork(ctx context.Context) {
	g.mu.Lock()
	defer g.mu.Unlock()
	if g.outgoing.hasSpace() {
		distantNodeID, distantHops := g.mu.is.mostDistant(g.hasOutgoingLocked)
		log.VEventf(ctx, 2, "distantHops: %d from %d", distantHops, distantNodeID)
		if distantHops <= maxHops {
			return
		}
		if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil {
			log.Errorf(ctx, "unable to get address for distant node %d: %s", distantNodeID, err)
		} else {
			log.Infof(ctx, "starting client to distant node %d (%d > %d) to tighten network graph",
				distantNodeID, distantHops, maxHops)
			log.Eventf(ctx, "tightening network with new client to %s", nodeAddr)
			g.startClientLocked(nodeAddr)
		}
	}
}
Example #17
0
// EvictAndReplace instructs the EvictionToken to evict the RangeDescriptor it was
// created with from the rangeDescriptorCache. It also allows the user to provide
// new RangeDescriptors to insert into the cache, all atomically. When called without
// arguments, EvictAndReplace will behave the same as Evict.
func (et *EvictionToken) EvictAndReplace(
	ctx context.Context, newDescs ...roachpb.RangeDescriptor,
) error {
	var err error
	et.doOnce.Do(func() {
		et.doLocker.Lock()
		defer et.doLocker.Unlock()
		err = et.do()
		if err == nil {
			if len(newDescs) > 0 {
				err = et.doReplace(newDescs...)
				log.Eventf(ctx, "evicting cached range descriptor with %d replacements", len(newDescs))
			} else {
				log.Event(ctx, "evicting cached range descriptor")
			}
		}
	})
	return err
}
Example #18
0
// GetSnapshot wraps Snapshot() but does not require the replica lock
// to be held and it will block instead of returning
// ErrSnapshotTemporaryUnavailable. The caller is directly responsible for
// calling r.CloseOutSnap.
func (r *Replica) GetSnapshot(ctx context.Context) (*OutgoingSnapshot, error) {
	for i := 0; ; i++ {
		log.Eventf(ctx, "snapshot retry loop pass %d", i)

		r.mu.Lock()
		doneChan := r.mu.outSnapDone
		r.mu.Unlock()

		<-doneChan

		r.mu.Lock()
		snap, err := r.SnapshotWithContext(ctx)
		if err == nil {
			r.mu.outSnap.claimed = true
		}
		r.mu.Unlock()
		if err == raft.ErrSnapshotTemporarilyUnavailable {
			continue
		} else {
			return snap, err
		}
	}
}
Example #19
0
// Exec executes fn in the context of a distributed transaction.
// Execution is controlled by opt (see comments in TxnExecOptions).
//
// opt is passed to fn, and it's valid for fn to modify opt as it sees
// fit during each execution attempt.
//
// It's valid for txn to be nil (meaning the txn has already aborted) if fn
// can handle that. This is useful for continuing transactions that have been
// aborted because of an error in a previous batch of statements in the hope
// that a ROLLBACK will reset the state. Neither opt.AutoRetry not opt.AutoCommit
// can be set in this case.
//
// When this method returns, txn might be in any state; Exec does not attempt
// to clean up the transaction before returning an error. In case of
// TransactionAbortedError, txn is reset to a fresh transaction, ready to be
// used.
func (txn *Txn) Exec(opt TxnExecOptions, fn func(txn *Txn, opt *TxnExecOptions) error) (err error) {
	// Run fn in a retry loop until we encounter a success or
	// error condition this loop isn't capable of handling.
	var retryOptions retry.Options
	if txn == nil && (opt.AutoRetry || opt.AutoCommit) {
		panic("asked to retry or commit a txn that is already aborted")
	}

	// Ensure that a RetryableTxnError escaping this function is not used by
	// another (higher-level) Exec() invocation to restart its unrelated
	// transaction. Technically, setting TxnID to nil here is best-effort and
	// doesn't ensure that (the error will be wrongly used if the outer txn also
	// has a nil TxnID).
	// TODO(andrei): set TxnID to a bogus non-nil value once we get rid of the
	// retErr.Transaction field.
	defer func() {
		if retErr, ok := err.(*roachpb.RetryableTxnError); ok {
			retErr.TxnID = nil
			retErr.Transaction = nil
		}
	}()

	if opt.AutoRetry {
		retryOptions = txn.db.ctx.TxnRetryOptions
	}

	for r := retry.Start(retryOptions); r.Next(); {
		if txn != nil {
			// If we're looking at a brand new transaction, then communicate
			// what should be used as initial timestamp for the KV txn created
			// by TxnCoordSender.
			if opt.Clock != nil && !txn.Proto.IsInitialized() {
				// Control the KV timestamp, such that the value returned by
				// `cluster_logical_timestamp()` is consistent with the commit
				// (serializable) ordering.
				txn.Proto.OrigTimestamp = opt.Clock.Now()
			}
		}

		err = fn(txn, &opt)

		// TODO(andrei): Until 7881 is fixed.
		if err == nil && opt.AutoCommit && txn.Proto.Status == roachpb.ABORTED {
			log.Errorf(txn.Context, "#7881: no err but aborted txn proto. opt: %+v, txn: %+v",
				opt, txn)
		}

		if err == nil && opt.AutoCommit && txn.Proto.Status == roachpb.PENDING {
			// fn succeeded, but didn't commit.
			err = txn.Commit()
			log.Eventf(txn.Context, "client.Txn did AutoCommit. err: %v\ntxn: %+v", err, txn.Proto)
			if err != nil {
				if _, retryable := err.(*roachpb.RetryableTxnError); !retryable {
					// We can't retry, so let the caller know we tried to
					// autocommit.
					err = &AutoCommitError{cause: err}
				}
			}
		}

		if !opt.AutoRetry {
			break
		}

		if retErr, retryable := err.(*roachpb.RetryableTxnError); !retryable {
			break
		} else {
			// Make sure the txn record that err carries is for this txn.
			// If it's not, we terminate the "retryable" character of the error.
			if txn.Proto.ID != nil && (retErr.TxnID == nil || *retErr.TxnID != *txn.Proto.ID) {
				return errors.New(retErr.Error())
			}

			if !retErr.Backoff {
				r.Reset()
			}
		}
		txn.commitTriggers = nil

		log.VEventf(txn.Context, 2, "automatically retrying transaction: %s because of error: %s",
			txn.DebugName(), err)
	}

	return err
}
Example #20
0
// execStmtInOpenTxn executes one statement in the context
// of the planner's transaction (which is assumed to exist).
// It handles statements that affect the transaction state (BEGIN, COMMIT)
// and delegates everything else to `execStmt`.
// It binds placeholders.
//
// The current transaction might be committed/rolled back when this returns.
// It might also have transitioned to the aborted or RestartWait state.
//
// Args:
// implicitTxn: set if the current transaction was implicitly
//  created by the system (i.e. the client sent the statement outside of
//  a transaction).
//  COMMIT/ROLLBACK statements are rejected if set. Also, the transaction
//  might be auto-committed in this function.
// firstInTxn: set for the first statement in a transaction. Used
//  so that nested BEGIN statements are caught.
// stmtTimestamp: Used as the statement_timestamp().
//
// Returns:
// - a Result
// - an error, if any. In case of error, the result returned also reflects this error.
func (e *Executor) execStmtInOpenTxn(
	stmt parser.Statement, planMaker *planner, implicitTxn bool, firstInTxn bool, txnState *txnState,
) (Result, error) {
	if txnState.State != Open {
		panic("execStmtInOpenTxn called outside of an open txn")
	}
	if planMaker.txn == nil {
		panic("execStmtInOpenTxn called with a txn not set on the planner")
	}

	planMaker.evalCtx.SetTxnTimestamp(txnState.sqlTimestamp)
	planMaker.evalCtx.SetStmtTimestamp(e.cfg.Clock.PhysicalTime())

	session := planMaker.session
	log.Eventf(session.context, "%s", stmt)

	// TODO(cdo): Figure out how to not double count on retries.
	e.updateStmtCounts(stmt)
	switch s := stmt.(type) {
	case *parser.BeginTransaction:
		if !firstInTxn {
			txnState.updateStateAndCleanupOnErr(errTransactionInProgress, e)
			return Result{Err: errTransactionInProgress}, errTransactionInProgress
		}
	case *parser.CommitTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// CommitTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, commit, e)
		return res, err
	case *parser.ReleaseSavepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Savepoint); err != nil {
			return Result{Err: err}, err
		}
		// ReleaseSavepoint is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		res, err := commitSQLTransaction(txnState, planMaker, release, e)
		return res, err
	case *parser.RollbackTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		// RollbackTransaction is executed fully here; there's no planNode for it
		// and the planner is not involved at all.
		// Notice that we don't return any errors on rollback.
		return rollbackSQLTransaction(txnState, planMaker), nil
	case *parser.SetTransaction:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
	case *parser.Savepoint:
		if implicitTxn {
			return e.noTransactionHelper(txnState)
		}
		if err := parser.ValidateRestartCheckpoint(s.Name); err != nil {
			return Result{Err: err}, err
		}
		// We want to disallow SAVEPOINTs to be issued after a transaction has
		// started running, but such enforcement is problematic in the
		// presence of transaction retries (since the transaction proto is
		// necessarily reused). To work around this, we keep track of the
		// transaction's retrying state and special-case SAVEPOINT when it is
		// set.
		//
		// TODO(andrei): the check for retrying is a hack - we erroneously
		// allow SAVEPOINT to be issued at any time during a retry, not just
		// in the beginning. We should figure out how to track whether we
		// started using the transaction during a retry.
		if txnState.txn.Proto.IsInitialized() && !txnState.retrying {
			err := fmt.Errorf("SAVEPOINT %s needs to be the first statement in a transaction",
				parser.RestartSavepointName)
			txnState.updateStateAndCleanupOnErr(err, e)
			return Result{Err: err}, err
		}
		// Note that Savepoint doesn't have a corresponding plan node.
		// This here is all the execution there is.
		txnState.retryIntent = true
		return Result{}, nil
	case *parser.RollbackToSavepoint:
		err := parser.ValidateRestartCheckpoint(s.Savepoint)
		if err == nil {
			// Can't restart if we didn't get an error first, which would've put the
			// txn in a different state.
			err = errNotRetriable
		}
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Prepare:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Prepared statements are supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Execute:
		err := util.UnimplementedWithIssueErrorf(7568,
			"Executing prepared statements is supported only via the Postgres wire protocol")
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	case *parser.Deallocate:
		if s.Name == "" {
			planMaker.session.PreparedStatements.DeleteAll()
		} else {
			if found := planMaker.session.PreparedStatements.Delete(string(s.Name)); !found {
				err := fmt.Errorf("prepared statement %s does not exist", s.Name)
				txnState.updateStateAndCleanupOnErr(err, e)
				return Result{Err: err}, err
			}
		}
		return Result{PGTag: s.StatementTag()}, nil
	}

	autoCommit := implicitTxn && !e.cfg.TestingKnobs.DisableAutoCommit
	result, err := e.execStmt(stmt, planMaker, autoCommit)
	if err != nil {
		if result.Rows != nil {
			result.Rows.Close()
			result.Rows = nil
		}
		if traceSQL {
			log.ErrEventf(txnState.txn.Context, "ERROR: %v", err)
		}
		log.ErrEventf(session.context, "ERROR: %v", err)
		txnState.updateStateAndCleanupOnErr(err, e)
		return Result{Err: err}, err
	}

	tResult := &traceResult{tag: result.PGTag, count: -1}
	switch result.Type {
	case parser.RowsAffected:
		tResult.count = result.RowsAffected
	case parser.Rows:
		tResult.count = result.Rows.Len()
	}
	if traceSQL {
		log.Eventf(txnState.txn.Context, "%s done", tResult)
	}
	log.Eventf(session.context, "%s done", tResult)
	return result, nil
}
Example #21
0
// execRequest executes the request using the provided planner.
// It parses the sql into statements, iterates through the statements, creates
// KV transactions and automatically retries them when possible, and executes
// the (synchronous attempt of) schema changes.
// It will accumulate a result in Response for each statement.
// It will resume a SQL transaction, if one was previously open for this client.
//
// execRequest handles the mismatch between the SQL interface that the Executor
// provides, based on statements being streamed from the client in the context
// of a session, and the KV client.Txn interface, based on (possibly-retriable)
// callbacks passed to be executed in the context of a transaction. Actual
// execution of statements in the context of a KV txn is delegated to
// runTxnAttempt().
//
// Args:
//  txnState: State about about ongoing transaction (if any). The state will be
//   updated.
func (e *Executor) execRequest(session *Session, sql string, copymsg copyMsg) StatementResults {
	var res StatementResults
	txnState := &session.TxnState
	planMaker := &session.planner
	var stmts parser.StatementList
	var err error

	log.VEventf(session.Ctx(), 2, "execRequest: %s", sql)

	if session.planner.copyFrom != nil {
		stmts, err = session.planner.ProcessCopyData(sql, copymsg)
	} else if copymsg != copyMsgNone {
		err = fmt.Errorf("unexpected copy command")
	} else {
		stmts, err = planMaker.parser.Parse(sql, parser.Syntax(session.Syntax))
	}
	if err != nil {
		// A parse error occurred: we can't determine if there were multiple
		// statements or only one, so just pretend there was one.
		if txnState.txn != nil {
			// Rollback the txn.
			txnState.updateStateAndCleanupOnErr(err, e)
		}
		res.ResultList = append(res.ResultList, Result{Err: err})
		return res
	}
	if len(stmts) == 0 {
		res.Empty = true
		return res
	}

	// If the planMaker wants config updates to be blocked, then block them.
	defer planMaker.blockConfigUpdatesMaybe(e)()

	for len(stmts) > 0 {
		// Each iteration consumes a transaction's worth of statements.

		inTxn := txnState.State != NoTxn
		execOpt := client.TxnExecOptions{
			Clock: e.cfg.Clock,
		}
		// Figure out the statements out of which we're going to try to consume
		// this iteration. If we need to create an implicit txn, only one statement
		// can be consumed.
		stmtsToExec := stmts
		// If protoTS is set, the transaction proto sets its Orig and Max timestamps
		// to it each retry.
		var protoTS *hlc.Timestamp
		// We can AutoRetry the next batch of statements if we're in a clean state
		// (i.e. the next statements we're going to see are the first statements in
		// a transaction).
		if !inTxn {
			// Detect implicit transactions.
			if _, isBegin := stmts[0].(*parser.BeginTransaction); !isBegin {
				execOpt.AutoCommit = true
				stmtsToExec = stmtsToExec[:1]
				// Check for AS OF SYSTEM TIME. If it is present but not detected here,
				// it will raise an error later on.
				protoTS, err = isAsOf(planMaker, stmtsToExec[0], e.cfg.Clock.Now())
				if err != nil {
					res.ResultList = append(res.ResultList, Result{Err: err})
					return res
				}
				if protoTS != nil {
					planMaker.avoidCachedDescriptors = true
					defer func() {
						planMaker.avoidCachedDescriptors = false
					}()
				}
			}
			txnState.resetForNewSQLTxn(e, session)
			txnState.autoRetry = true
			txnState.sqlTimestamp = e.cfg.Clock.PhysicalTime()
			if execOpt.AutoCommit {
				txnState.txn.SetDebugName(sqlImplicitTxnName, 0)
			} else {
				txnState.txn.SetDebugName(sqlTxnName, 0)
			}
		} else {
			txnState.autoRetry = false
		}
		execOpt.AutoRetry = txnState.autoRetry
		if txnState.State == NoTxn {
			panic("we failed to initialize a txn")
		}
		// Now actually run some statements.
		var remainingStmts parser.StatementList
		var results []Result
		origState := txnState.State

		txnClosure := func(txn *client.Txn, opt *client.TxnExecOptions) error {
			if txnState.State == Open && txnState.txn != txn {
				panic(fmt.Sprintf("closure wasn't called in the txn we set up for it."+
					"\ntxnState.txn:%+v\ntxn:%+v\ntxnState:%+v", txnState.txn, txn, txnState))
			}
			txnState.txn = txn

			if protoTS != nil {
				setTxnTimestamps(txnState.txn, *protoTS)
			}

			var err error
			if results != nil {
				// Some results were produced by a previous attempt. Discard them.
				ResultList(results).Close()
			}
			results, remainingStmts, err = runTxnAttempt(e, planMaker, origState, txnState, opt, stmtsToExec)

			// TODO(andrei): Until #7881 fixed.
			if err == nil && txnState.State == Aborted {
				log.Errorf(session.Ctx(),
					"7881: txnState is Aborted without an error propagating. stmtsToExec: %s, "+
						"results: %+v, remainingStmts: %s, txnState: %+v", stmtsToExec, results,
					remainingStmts, txnState)
			}

			return err
		}
		// This is where the magic happens - we ask db to run a KV txn and possibly retry it.
		txn := txnState.txn // this might be nil if the txn was already aborted.
		err := txn.Exec(execOpt, txnClosure)

		// Update the Err field of the last result if the error was coming from
		// auto commit. The error was generated outside of the txn closure, so it was not
		// set in any result.
		if err != nil {
			lastResult := &results[len(results)-1]
			if aErr, ok := err.(*client.AutoCommitError); ok {
				// TODO(andrei): Until #7881 fixed.
				{
					log.Eventf(session.Ctx(), "executor got AutoCommitError: %s\n"+
						"txn: %+v\nexecOpt.AutoRetry %t, execOpt.AutoCommit:%t, stmts %+v, remaining %+v",
						aErr, txnState.txn.Proto, execOpt.AutoRetry, execOpt.AutoCommit, stmts,
						remainingStmts)
					if txnState.txn == nil {
						log.Errorf(session.Ctx(), "7881: AutoCommitError on nil txn: %s, "+
							"txnState %+v, execOpt %+v, stmts %+v, remaining %+v",
							aErr, txnState, execOpt, stmts, remainingStmts)
						txnState.sp.SetBaggageItem(keyFor7881Sample, "sample me please")
					}
				}
				lastResult.Err = aErr
				e.TxnAbortCount.Inc(1)
				txn.CleanupOnError(err)
			}
			if lastResult.Err == nil {
				log.Fatalf(session.Ctx(),
					"error (%s) was returned, but it was not set in the last result (%v)",
					err, lastResult)
			}
		}

		res.ResultList = append(res.ResultList, results...)
		// Now make sense of the state we got into and update txnState.
		if (txnState.State == RestartWait || txnState.State == Aborted) &&
			txnState.commitSeen {
			// A COMMIT got an error (retryable or not). Too bad, this txn is toast.
			// After we return a result for COMMIT (with the COMMIT pgwire tag), the
			// user can't send any more commands.
			e.TxnAbortCount.Inc(1)
			txn.CleanupOnError(err)
			txnState.resetStateAndTxn(NoTxn)
		}

		if execOpt.AutoCommit {
			// If execOpt.AutoCommit was set, then the txn no longer exists at this point.
			txnState.resetStateAndTxn(NoTxn)
		}

		// If we're no longer in a transaction, finish the trace.
		if txnState.State == NoTxn {
			txnState.finishSQLTxn(session.context)
		}

		// If the txn is in any state but Open, exec the schema changes. They'll
		// short-circuit themselves if the mutation that queued them has been
		// rolled back from the table descriptor.
		stmtsExecuted := stmts[:len(stmtsToExec)-len(remainingStmts)]
		if txnState.State != Open {
			planMaker.checkTestingVerifyMetadataInitialOrDie(e, stmts)
			planMaker.checkTestingVerifyMetadataOrDie(e, stmtsExecuted)
			// Exec the schema changers (if the txn rolled back, the schema changers
			// will short-circuit because the corresponding descriptor mutation is not
			// found).
			planMaker.releaseLeases()
			txnState.schemaChangers.execSchemaChanges(e, planMaker, res.ResultList)
		} else {
			// We're still in a txn, so we only check that the verifyMetadata callback
			// fails the first time it's run. The gossip update that will make the
			// callback succeed only happens when the txn is done.
			planMaker.checkTestingVerifyMetadataInitialOrDie(e, stmtsExecuted)
		}

		// Figure out what statements to run on the next iteration.
		if err != nil {
			// Don't execute anything further.
			stmts = nil
		} else if execOpt.AutoCommit {
			stmts = stmts[1:]
		} else {
			stmts = remainingStmts
		}
	}

	return res
}
Example #22
0
// resolveIntents resolves the given intents. `wait` is currently a
// no-op; all intents are resolved synchronously.
//
// TODO(bdarnell): Restore the wait=false optimization when/if #8360
// is fixed. `wait=false` requests a semi-synchronous operation,
// returning when all local commands have been *proposed* but not yet
// committed or executed. This ensures that if a waiting client
// retries immediately after calling this function, it will not hit
// the same intents again (in the absence of #8360, we provide this
// guarantee by resolving the intents synchronously regardless of the
// `wait` argument).
func (ir *intentResolver) resolveIntents(
	ctx context.Context, intents []roachpb.Intent, wait bool, poison bool,
) error {
	// Force synchronous operation; see above TODO.
	wait = true
	if len(intents) == 0 {
		return nil
	}
	// We're doing async stuff below; those need new traces.
	ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer())
	defer cleanup()
	log.Eventf(ctx, "resolving intents [wait=%t]", wait)

	var reqs []roachpb.Request
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		{
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
			}
		}

		reqs = append(reqs, resolveArgs)
	}

	// Resolve all of the intents.
	if len(reqs) > 0 {
		b := &client.Batch{}
		b.AddRawRequest(reqs...)
		action := func() error {
			// TODO(tschottdorf): no tracing here yet.
			return ir.store.DB().Run(ctx, b)
		}
		if wait || ir.store.Stopper().RunLimitedAsyncTask(
			ctx, ir.sem, true /* wait */, func(ctx context.Context) {
				if err := action(); err != nil {
					log.Warningf(ctx, "unable to resolve external intents: %s", err)
				}
			}) != nil {
			// Try async to not keep the caller waiting, but when draining
			// just go ahead and do it synchronously. See #1684.
			// TODO(tschottdorf): This is ripe for removal.
			if err := action(); err != nil {
				return err
			}
		}
	}

	return nil
}
Example #23
0
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
func (s *Server) Start(ctx context.Context) error {
	ctx = s.AnnotateCtx(ctx)

	startTime := timeutil.Now()

	tlsConfig, err := s.cfg.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.

	ln, err := net.Listen("tcp", s.cfg.Addr)
	if err != nil {
		return err
	}
	log.Eventf(ctx, "listening on port %s", s.cfg.Addr)
	unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.Addr = unresolvedListenAddr.String()
	unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

	workersCtx := s.AnnotateCtx(context.Background())

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(workersCtx, err)
		}
	})

	if tlsConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	s.stopper.RunWorker(func() {
		pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
			if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
				// Report the error on this connection's context, so that we
				// know which remote client caused the error when looking at
				// the logs.
				log.Error(connCtx, err)
			}
		}))
	})

	if len(s.cfg.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.cfg.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(workersCtx, err)
			}
		})

		s.stopper.RunWorker(func() {
			pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
					// Report the error on this connection's context, so that we
					// know which remote client caused the error when looking at
					// the logs.
					log.Error(connCtx, err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAdvertAddr)
	log.Event(ctx, "started gossip")

	s.engines, err = s.cfg.CreateEngines()
	if err != nil {
		return errors.Wrap(err, "failed to create engines")
	}
	s.stopper.AddCloser(&s.engines)

	// We might have to sleep a bit to protect against this node producing non-
	// monotonic timestamps. Before restarting, its clock might have been driven
	// by other nodes' fast clocks, but when we restarted, we lost all this
	// information. For example, a client might have written a value at a
	// timestamp that's in the future of the restarted node's clock, and if we
	// don't do something, the same client's read would not return the written
	// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
	// than MaxOffset in the future (assuming that MaxOffset was not changed, see
	// #9733).
	//
	// As an optimization for tests, we don't sleep if all the stores are brand
	// new. In this case, the node will not serve anything anyway until it
	// synchronizes with other nodes.
	{
		anyStoreBootstrapped := false
		for _, e := range s.engines {
			if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
				// NotBootstrappedError is expected.
				if _, ok := err.(*storage.NotBootstrappedError); !ok {
					return err
				}
			} else {
				anyStoreBootstrapped = true
				break
			}
		}
		if anyStoreBootstrapped {
			sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
			if sleepDuration > 0 {
				log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
				time.Sleep(sleepDuration)
			}
		}
	}

	// Now that we have a monotonic HLC wrt previous incarnations of the process,
	// init all the replicas.
	err = s.node.start(
		ctx,
		unresolvedAdvertAddr,
		s.engines,
		s.cfg.NodeAttributes,
		s.cfg.Locality,
	)
	if err != nil {
		return err
	}
	log.Event(ctx, "started node")

	s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(
		s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
	)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := &sql.SchemaChangerTestingKnobs{}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	s.distSQLServer.Start()

	log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
	log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)
	if len(s.cfg.SocketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})

	log.Event(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &protoutil.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(protoutil.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("pkg/ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminPrefix, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, gwMux)
	s.mux.Handle("/health", gwMux)
	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
	log.Event(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Event(ctx, "server ready")

	return nil
}
Example #24
0
// initStores initializes the Stores map from ID to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(
	ctx context.Context, engines []engine.Engine, stopper *stop.Stopper, bootstrapped bool,
) error {
	var bootstraps []*storage.Store

	if len(engines) == 0 {
		return errors.Errorf("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.storeCfg, e, &n.Descriptor)
		log.Eventf(ctx, "created store for engine: %s", e)
		if bootstrapped {
			s.NotifyBootstrapped()
		}
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(ctx, stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof(ctx, "store %s not bootstrapped", s)
				bootstraps = append(bootstraps, s)
				continue
			}
			return errors.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 {
			return errors.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
		n.addStore(s)
	}

	// If there are no initialized stores and no gossip resolvers,
	// bootstrap this node as the seed of a new cluster.
	if n.stores.GetStoreCount() == 0 {
		resolvers := n.storeCfg.Gossip.GetResolvers()
		// Check for the case of uninitialized node having only itself specified as join host.
		switch len(resolvers) {
		case 0:
			return errNeedsBootstrap
		case 1:
			if resolvers[0].Addr() == n.Descriptor.Address.String() {
				return errCannotJoinSelf
			}
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}
	log.Event(ctx, "validated stores")

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.storeCfg.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip(ctx)
	log.Event(ctx, "connected to gossip")

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
		n.initialBoot = true
		log.Eventf(ctx, "allocated node ID %d", n.Descriptor.NodeID)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if len(bootstraps) > 0 {
		if err := stopper.RunAsyncTask(ctx, func(ctx context.Context) {
			n.bootstrapStores(ctx, bootstraps, stopper)
		}); err != nil {
			return err
		}
	}

	return nil
}
Example #25
0
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(
	ctx context.Context, ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, *roachpb.Error) {
	// Start new or pick up active trace. From here on, there's always an active
	// Trace, though its overhead is small unless it's sampled.
	sp := opentracing.SpanFromContext(ctx)
	var tracer opentracing.Tracer
	if sp == nil {
		tracer = tc.AmbientContext.Tracer
		sp = tracer.StartSpan(opTxnCoordSender)
		defer sp.Finish()
		ctx = opentracing.ContextWithSpan(ctx, sp)
	} else {
		tracer = sp.Tracer()
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}

		txnID := *ba.Txn.ID

		// Associate the txnID with the trace. We need to do this after the
		// maybeBeginTxn call. We set both a baggage item and a tag because only
		// tags show up in the LIghtstep UI.
		txnIDStr := txnID.String()
		sp.SetTag("txnID", txnIDStr)
		sp.SetBaggageItem("txnID", txnIDStr)

		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[txnID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			// We can't pass in a batch response here to better limit the key
			// spans as we don't know what is going to be affected. This will
			// affect queries such as `DELETE FROM my.table LIMIT 10` when
			// executed as a 1PC transaction. e.g.: a (BeginTransaction,
			// DeleteRange, EndTransaction) batch.
			ba.IntentSpanIterate(nil, func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			var distinct bool
			// The request might already be used by an outgoing goroutine, so
			// we can't safely mutate anything in-place (as MergeSpans does).
			et.IntentSpans = append([]roachpb.Span(nil), et.IntentSpans...)
			et.IntentSpans, distinct = roachpb.MergeSpans(et.IntentSpans)
			ba.Header.DistinctSpans = distinct && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Eventf(ctx, "intent: [%s,%s)", intent.Key, intent.EndKey)
			}
		}
	}

	// Embed the trace metadata into the header for use by RPC recipients. We need
	// to do this after the maybeBeginTxn call above.
	// TODO(tschottdorf): To get rid of the spurious alloc below we need to
	// implement the carrier interface on ba.Header or make Span non-nullable,
	// both of which force all of ba on the Heap. It's already there, so may
	// not be a big deal, but ba should live on the stack. Also not easy to use
	// a buffer pool here since anything that goes into the RPC layer could be
	// used by goroutines we didn't wait for.
	if ba.TraceContext == nil {
		ba.TraceContext = &tracing.SpanContextCarrier{}
	} else {
		// We didn't make this object but are about to mutate it, so we
		// have to take a copy - the original might already have been
		// passed to the RPC layer.
		ba.TraceContext = protoutil.Clone(ba.TraceContext).(*tracing.SpanContextCarrier)
	}
	if err := tracer.Inject(sp.Context(), basictracer.Delegator, ba.TraceContext); err != nil {
		return nil, roachpb.NewError(err)
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(ctx, startNS, ba, br, pErr); pErr != nil {
			log.Eventf(ctx, "error: %s", pErr)
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof(ctx, "%v: waiting %s on EndTransaction for linearizability", br.Txn.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}