Example #1
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock must not be held
// while calling this method.
func (bq *baseQueue) processReplica(
	queueCtx context.Context, repl *Replica, clock *hlc.Clock,
) error {
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(queueCtx, 1, "no system config available, skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(queueCtx, 3, "split needed; skipping")
		return nil
	}

	// Putting a span in a context means that events will no longer go to the
	// event log. Use queueCtx for events that are intended for the event log.
	ctx, span := bq.AnnotateCtxWithSpan(queueCtx, bq.name)
	defer span.Finish()
	// Also add the Replica annotations to ctx.
	ctx = repl.AnnotateCtx(ctx)
	ctx, cancel := context.WithTimeout(ctx, bq.processTimeout)
	defer cancel()
	log.Eventf(ctx, "processing replica")

	if err := repl.IsDestroyed(); err != nil {
		log.VEventf(queueCtx, 3, "replica destroyed (%s); skipping", err)
		return nil
	}

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has the range
	// lease, renewing or acquiring it if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			switch v := err.GetDetail().(type) {
			case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
				log.VEventf(queueCtx, 3, "%s; skipping", v)
				return nil
			default:
				return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
			}
		}
		log.Event(ctx, "got range lease")
	}

	log.VEventf(queueCtx, 3, "processing")
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.Event(ctx, "done")
	bq.successes.Inc(1)
	return nil
}
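
The skeleton of processReplica (a dedicated process mutex, then a timeout-bounded context around the actual work) is worth seeing in isolation. A minimal runnable sketch of that pattern follows; queue, processOne, and the one-second timeout are hypothetical stand-ins, not CockroachDB types:

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

type queue struct {
	processMu      sync.Mutex
	processTimeout time.Duration
}

// processOne serializes processing with a dedicated mutex and bounds
// each run with a context timeout, mirroring processReplica above.
func (q *queue) processOne(parent context.Context, work func(context.Context) error) error {
	q.processMu.Lock()
	defer q.processMu.Unlock()

	ctx, cancel := context.WithTimeout(parent, q.processTimeout)
	defer cancel()
	return work(ctx)
}

func main() {
	q := &queue{processTimeout: time.Second}
	err := q.processOne(context.Background(), func(ctx context.Context) error {
		select {
		case <-time.After(10 * time.Millisecond): // simulated work
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	})
	fmt.Println("processed, err =", err)
}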
Example #2
// maybeRejectClientLocked checks whether the (transactional) request is in a
// state that prevents it from continuing, such as the coordinator having
// considered the client abandoned, or a heartbeat having reported an error.
func (tc *TxnCoordSender) maybeRejectClientLocked(
	ctx context.Context, txn roachpb.Transaction,
) *roachpb.Error {

	if !txn.Writing {
		return nil
	}
	txnMeta, ok := tc.txns[*txn.ID]
	// Check whether the transaction is still tracked and has a chance of
	// completing. It's possible that the coordinator learns about the
	// transaction having terminated from a heartbeat, and GC queue correctness
	// (along with common sense) mandates that we don't let the client
	// continue.
	switch {
	case !ok:
		log.VEventf(ctx, 2, "rejecting unknown txn: %s", txn.ID)
		// TODO(spencerkimball): Could add coordinator node ID to the
		// transaction session so that we can definitively return the right
		// error between these possible errors. Or update the code to make an
		// educated guess based on the incoming transaction timestamp.
		return roachpb.NewError(errNoState)
	case txnMeta.txn.Status == roachpb.ABORTED:
		txn := txnMeta.txn.Clone()
		tc.cleanupTxnLocked(ctx, txn)
		return roachpb.NewErrorWithTxn(roachpb.NewTransactionAbortedError(),
			&txn)
	case txnMeta.txn.Status == roachpb.COMMITTED:
		txn := txnMeta.txn.Clone()
		tc.cleanupTxnLocked(ctx, txn)
		return roachpb.NewErrorWithTxn(roachpb.NewTransactionStatusError(
			"transaction is already committed"), &txn)
	default:
		return nil
	}
}
Example #3
// process truncates the raft log of the range if the replica is the raft
// leader and if the number of stale entries in the range's raft log
// exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) process(ctx context.Context, r *Replica, _ config.SystemConfig) error {
	truncatableIndexes, oldestIndex, err := getTruncatableIndexes(ctx, r)
	if err != nil {
		return err
	}

	// Can and should the raft logs be truncated?
	if truncatableIndexes >= RaftLogQueueStaleThreshold {
		r.mu.Lock()
		raftLogSize := r.mu.raftLogSize
		r.mu.Unlock()

		log.VEventf(ctx, 1, "truncating raft log %d-%d: size=%d",
			oldestIndex-truncatableIndexes, oldestIndex, raftLogSize)
		b := &client.Batch{}
		b.AddRawRequest(&roachpb.TruncateLogRequest{
			Span:    roachpb.Span{Key: r.Desc().StartKey.AsRawKey()},
			Index:   oldestIndex,
			RangeID: r.RangeID,
		})
		if err := rlq.db.Run(ctx, b); err != nil {
			return err
		}
		r.store.metrics.RaftLogTruncated.Inc(int64(truncatableIndexes))
	}
	return nil
}
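
Note how the function copies r.mu.raftLogSize under a short-lived lock and then logs and issues the batch with the lock released. A tiny sketch of that copy-under-lock pattern, with replica and logSize as hypothetical names:

package main

import (
	"fmt"
	"sync"
)

type replica struct {
	mu          sync.Mutex
	raftLogSize int64
}

// logSize copies the shared field under a short-lived lock; the slow
// work (logging, issuing the batch) then runs with the lock released,
// as process does with r.mu.raftLogSize above.
func (r *replica) logSize() int64 {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.raftLogSize
}

func main() {
	r := &replica{raftLogSize: 4096}
	fmt.Printf("truncating raft log: size=%d\n", r.logSize())
}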
Example #4
func (ds *ServerImpl) flowStreamInt(ctx context.Context, stream DistSQL_FlowStreamServer) error {
	// Receive the first message.
	msg, err := stream.Recv()
	if err != nil {
		if err == io.EOF {
			return errors.Errorf("empty stream")
		}
		return err
	}
	if msg.Header == nil {
		return errors.Errorf("no header in first message")
	}
	flowID := msg.Header.FlowID
	streamID := msg.Header.StreamID
	if log.V(1) {
		log.Infof(ctx, "connecting inbound stream %s/%d", flowID.Short(), streamID)
	}
	f, streamInfo, err := ds.flowRegistry.ConnectInboundStream(flowID, streamID)
	if err != nil {
		return err
	}
	log.VEventf(ctx, 1, "connected inbound stream %s/%d", flowID.Short(), streamID)
	defer ds.flowRegistry.FinishInboundStream(streamInfo)
	return ProcessInboundStream(&f.FlowCtx, stream, msg, streamInfo.receiver)
}
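
The first-message protocol here (io.EOF before any message means an empty stream; the first message must carry the routing header) can be sketched without gRPC. msg, stream, and fakeStream below are hypothetical stand-ins for the generated types:

package main

import (
	"errors"
	"fmt"
	"io"
)

type msg struct{ header *string }

// stream abstracts the Recv side of a gRPC stream.
type stream interface {
	Recv() (*msg, error)
}

// connect validates the stream's first message, as flowStreamInt does:
// io.EOF before any message means the stream is unusably empty, and
// the first message must carry the header that routes the stream.
func connect(s stream) (string, error) {
	m, err := s.Recv()
	if err != nil {
		if err == io.EOF {
			return "", errors.New("empty stream")
		}
		return "", err
	}
	if m.header == nil {
		return "", errors.New("no header in first message")
	}
	return *m.header, nil
}

// fakeStream replays canned messages for the example.
type fakeStream struct{ msgs []*msg }

func (f *fakeStream) Recv() (*msg, error) {
	if len(f.msgs) == 0 {
		return nil, io.EOF
	}
	m := f.msgs[0]
	f.msgs = f.msgs[1:]
	return m, nil
}

func main() {
	h := "flow-1/stream-3"
	fmt.Println(connect(&fakeStream{msgs: []*msg{{header: &h}}}))
	fmt.Println(connect(&fakeStream{}))
}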
Example #5
// Heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness *Liveness) error {
	defer func(start time.Time) {
		if dur := timeutil.Now().Sub(start); dur > time.Second {
			log.Warningf(ctx, "slow heartbeat took %0.1fs", dur.Seconds())
		}
	}(timeutil.Now())

	// Allow only one heartbeat at a time.
	select {
	case nl.heartbeatSem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() {
		<-nl.heartbeatSem
	}()

	nodeID := nl.gossip.NodeID.Get()
	var newLiveness Liveness
	if liveness == nil {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	} else {
		newLiveness = *liveness
	}
	// We need to add the maximum clock offset to the expiration because it's
	// used when determining liveness for a node.
	newLiveness.Expiration = nl.clock.Now().Add(
		(nl.livenessThreshold + nl.clock.MaxOffset()).Nanoseconds(), 0)
	if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
		// Update liveness to actual value on mismatch.
		nl.mu.Lock()
		nl.mu.self = actual
		nl.mu.Unlock()
		// If the actual liveness is different than expected, but is
		// considered live, treat the heartbeat as a success. This can
		// happen when the periodic heartbeater races with a concurrent
		// lease acquisition.
		if actual.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
			return errNodeAlreadyLive
		}
		// Otherwise, return error.
		return errSkippedHeartbeat
	}); err != nil {
		if err == errNodeAlreadyLive {
			return nil
		}
		nl.metrics.HeartbeatFailures.Inc(1)
		return err
	}

	log.VEventf(ctx, 1, "heartbeat %+v", newLiveness.Expiration)
	nl.mu.Lock()
	nl.mu.self = newLiveness
	nl.mu.Unlock()
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}
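
The buffered-channel semaphore at the top of Heartbeat is a reusable idiom: a one-slot channel admits one heartbeat at a time, and a waiter whose context is done gives up. A self-contained sketch (heartbeatSem and heartbeat are illustrative names):

package main

import (
	"context"
	"fmt"
	"time"
)

// heartbeatSem is a one-slot semaphore: a send admits a single
// heartbeat at a time, and a caller whose context expires while
// waiting gives up, exactly as in Heartbeat above.
var heartbeatSem = make(chan struct{}, 1)

func heartbeat(ctx context.Context) error {
	select {
	case heartbeatSem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() { <-heartbeatSem }()

	// ... the conditional put on the liveness record would go here ...
	time.Sleep(time.Millisecond)
	return nil
}

func main() {
	fmt.Println("first:", heartbeat(context.Background()))

	// Fill the slot, then show that a canceled waiter gives up.
	heartbeatSem <- struct{}{}
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	fmt.Println("canceled while waiting:", heartbeat(ctx))
}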
Example #6
// ReplicaInfo returns information about the replica that has been picked for
// the current range.
// A RangeUnavailableError is returned if there's no information in gossip
// about any of the replicas.
func (it *SpanResolverIterator) ReplicaInfo(ctx context.Context) (kv.ReplicaInfo, error) {
	if !it.Valid() {
		panic(it.Error())
	}

	resolvedLH := false
	var repl kv.ReplicaInfo
	if lh, ok := it.it.LeaseHolder(ctx); ok {
		repl.ReplicaDescriptor = lh
		// Fill in the node descriptor.
		nd, err := it.gossip.GetNodeDescriptor(repl.NodeID)
		if err != nil {
			// Ignore the error; ask the oracle to pick another replica below.
			log.VEventf(ctx, 2, "failed to resolve node %d: %s", repl.NodeID, err)
		}
		repl.NodeDesc = nd
		resolvedLH = true
	}
	if !resolvedLH {
		leaseHolder, err := it.oracle.ChoosePreferredLeaseHolder(
			*it.it.Desc(), it.queryState)
		if err != nil {
			return kv.ReplicaInfo{}, err
		}
		repl = leaseHolder
	}
	it.queryState.rangesPerNode[repl.NodeID]++
	return repl, nil
}
Example #7
func (rq *replicateQueue) transferLease(
	ctx context.Context,
	repl *Replica,
	desc *roachpb.RangeDescriptor,
	zone config.ZoneConfig,
	checkTransferLeaseSource bool,
) (bool, error) {
	candidates := filterBehindReplicas(repl.RaftStatus(), desc.Replicas)
	if target := rq.allocator.TransferLeaseTarget(
		zone.Constraints,
		candidates,
		repl.store.StoreID(),
		desc.RangeID,
		checkTransferLeaseSource,
	); target != (roachpb.ReplicaDescriptor{}) {
		rq.metrics.TransferLeaseCount.Inc(1)
		log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
		if err := repl.AdminTransferLease(ctx, target.StoreID); err != nil {
			return false, errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
		}
		rq.lastLeaseTransfer.Store(timeutil.Now())
		return true, nil
	}
	return false, nil
}
Example #8
// IncrementEpoch is called to increment the current liveness epoch,
// thereby invalidating anything relying on the liveness of the
// previous epoch. This method does a conditional put on the node
// liveness record, and if successful, stores the updated liveness
// record in the nodes map. If this method is called on a node ID
// which is considered live according to the most recent information
// gathered through gossip, an error is returned.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *Liveness) error {
	if liveness.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
		return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
	}
	newLiveness := *liveness
	newLiveness.Epoch++
	if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
		if actual.Epoch > liveness.Epoch {
			newLiveness = actual
			return nil
		} else if actual.Epoch < liveness.Epoch {
			return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
		}
		nl.mu.Lock()
		defer nl.mu.Unlock()
		nl.mu.nodes[actual.NodeID] = actual
		return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
	}); err != nil {
		return err
	}

	log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d",
		newLiveness.NodeID, newLiveness.Epoch)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	if nodeID := nl.gossip.NodeID.Get(); nodeID == liveness.NodeID {
		nl.mu.self = newLiveness
	} else {
		nl.mu.nodes[newLiveness.NodeID] = newLiveness
	}
	nl.metrics.EpochIncrements.Inc(1)
	return nil
}
Example #9
// Run is part of the processor interface.
func (tr *tableReader) Run(wg *sync.WaitGroup) {
	if wg != nil {
		defer wg.Done()
	}

	ctx, span := tracing.ChildSpan(tr.ctx, "table reader")
	defer tracing.FinishSpan(span)

	txn := tr.flowCtx.setupTxn(ctx)

	log.VEventf(ctx, 1, "starting (filter: %s)", &tr.filter)
	if log.V(1) {
		defer log.Infof(ctx, "exiting")
	}

	if err := tr.fetcher.StartScan(
		txn, tr.spans, true /* limit batches */, tr.getLimitHint(),
	); err != nil {
		log.Errorf(ctx, "scan error: %s", err)
		tr.output.Close(err)
		return
	}
	var rowIdx int64
	for {
		outRow, err := tr.nextRow()
		if err != nil || outRow == nil {
			tr.output.Close(err)
			return
		}
		if log.V(3) {
			log.Infof(ctx, "pushing row %s", outRow)
		}
		// Push the row to the output RowReceiver; stop if they don't need more
		// rows.
		if !tr.output.PushRow(outRow) {
			log.VEventf(ctx, 1, "no more rows required")
			tr.output.Close(nil)
			return
		}
		rowIdx++
		if tr.hardLimit != 0 && rowIdx == tr.hardLimit {
			// We sent tr.hardLimit rows.
			tr.output.Close(nil)
			return
		}
	}
}
Example #10
// MaybeAdd adds the specified replica if bq.shouldQueue specifies it
// should be queued. Replicas are added to the queue using the priority
// returned by bq.shouldQueue. If the queue is too full, the replica may
// not be added, as the replica with the lowest priority will be
// dropped.
func (bq *baseQueue) MaybeAdd(repl *Replica, now hlc.Timestamp) {
	// Load the system config.
	cfg, cfgOk := bq.gossip.GetSystemConfig()
	requiresSplit := cfgOk && bq.requiresSplit(cfg, repl)

	bq.mu.Lock()
	defer bq.mu.Unlock()

	if bq.mu.stopped {
		return
	}

	if !repl.IsInitialized() {
		return
	}

	ctx := repl.AnnotateCtx(bq.AnnotateCtx(context.TODO()))

	if !cfgOk {
		log.VEvent(ctx, 1, "no system config available. skipping")
		return
	}

	if requiresSplit {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(ctx, 1, "split needed; not adding")
		return
	}

	if bq.needsLease {
		// Check to see if either we own the lease or do not know who the lease
		// holder is.
		if lease, _ := repl.getLease(); repl.IsLeaseValid(lease, now) &&
			!lease.OwnedBy(repl.store.StoreID()) {
			log.VEventf(ctx, 1, "needs lease; not adding: %+v", lease)
			return
		}
	}

	should, priority := bq.impl.shouldQueue(ctx, now, repl, cfg)
	if _, err := bq.addInternal(ctx, repl.Desc(), should, priority); !isExpectedQueueError(err) {
		log.Errorf(ctx, "unable to add: %s", err)
	}
}
Example #11
// addInternal adds the replica to the queue with the specified priority. If
// the replica is already queued, updates the existing
// priority. Expects the queue lock to be held by caller.
func (bq *baseQueue) addInternal(
	ctx context.Context, desc *roachpb.RangeDescriptor, should bool, priority float64,
) (bool, error) {
	if bq.mu.stopped {
		return false, errQueueStopped
	}

	if bq.mu.disabled {
		log.Event(ctx, "queue disabled")
		return false, errQueueDisabled
	}

	if !desc.IsInitialized() {
		// We checked this above in MaybeAdd(), but we need to check it
		// again for Add().
		return false, errors.New("replica not initialized")
	}

	// If the replica is currently in purgatory, don't re-add it.
	if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
		return false, nil
	}

	item, ok := bq.mu.replicas[desc.RangeID]
	if !should {
		if ok {
			log.Eventf(ctx, "%s: removing from queue", item.value)
			bq.remove(item)
		}
		return false, errReplicaNotAddable
	} else if ok {
		if item.priority != priority {
			log.Eventf(ctx, "%s: updating priority: %0.3f -> %0.3f",
				desc, item.priority, priority)
		}
		// Replica has already been added; update priority.
		bq.mu.priorityQ.update(item, priority)
		return false, nil
	}

	log.VEventf(ctx, 3, "%s: adding: priority=%0.3f", desc, priority)
	item = &replicaItem{value: desc.RangeID, priority: priority}
	bq.add(item)

	// If adding this replica has pushed the queue past its maximum size,
	// remove the lowest priority element.
	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
		bq.remove(bq.mu.priorityQ[pqLen-1])
	}
	// Signal the processLoop that a replica has been added.
	select {
	case bq.incoming <- struct{}{}:
	default:
		// No need to signal again.
	}
	return true, nil
}
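
The size cap at the end relies on a priority queue that evicts from the tail. A runnable sketch with container/heap, using hypothetical replicaItem/priorityQ types rather than the real ones:

package main

import (
	"container/heap"
	"fmt"
)

// replicaItem mirrors the queue entries above: a max-heap on priority,
// capped at maxSize by evicting from the tail.
type replicaItem struct {
	id       int
	priority float64
}

type priorityQ []*replicaItem

func (pq priorityQ) Len() int           { return len(pq) }
func (pq priorityQ) Less(i, j int) bool { return pq[i].priority > pq[j].priority }
func (pq priorityQ) Swap(i, j int)      { pq[i], pq[j] = pq[j], pq[i] }
func (pq *priorityQ) Push(x interface{}) {
	*pq = append(*pq, x.(*replicaItem))
}
func (pq *priorityQ) Pop() interface{} {
	old := *pq
	it := old[len(old)-1]
	*pq = old[:len(old)-1]
	return it
}

func main() {
	const maxSize = 2
	pq := &priorityQ{}
	heap.Init(pq)
	for _, it := range []*replicaItem{
		{id: 1, priority: 0.5}, {id: 2, priority: 0.9}, {id: 3, priority: 0.1},
	} {
		heap.Push(pq, it)
		if pq.Len() > maxSize {
			// Drop the heap's last element, as addInternal does with
			// priorityQ[pqLen-1]; it is a leaf, so it is among the
			// lower priorities, though not necessarily the minimum.
			heap.Remove(pq, pq.Len()-1)
		}
	}
	for pq.Len() > 0 {
		fmt.Printf("popped id=%d\n", heap.Pop(pq).(*replicaItem).id)
	}
}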
Example #12
// Seek positions the iterator at the specified key.
func (ri *RangeIterator) Seek(ctx context.Context, key roachpb.RKey, scanDir ScanDirection) {
	log.Eventf(ctx, "querying next range at %s", key)
	ri.scanDir = scanDir
	ri.init = true // the iterator is now initialized
	ri.pErr = nil  // clear any prior error
	ri.key = key   // set the key

	// Retry loop for looking up the next range in the span. The retry
	// loop deals with retryable range descriptor lookups.
	for r := retry.StartWithCtx(ctx, ri.ds.rpcRetryOptions); r.Next(); {
		log.Event(ctx, "meta descriptor lookup")
		var err error
		ri.desc, ri.token, err = ri.ds.getDescriptor(
			ctx, ri.key, ri.token, ri.scanDir == Descending)

		// getDescriptor may fail retryably if, for example, the first
		// range isn't available via Gossip. Assume that all errors at
		// this level are retryable. Non-retryable errors would be for
		// things like malformed requests which we should have checked
		// for before reaching this point.
		if err != nil {
			log.VEventf(ctx, 1, "range descriptor lookup failed: %s", err)
			continue
		}

		// It's possible that the returned descriptor does not contain the
		// key we're looking up. Example: for a reverse scan over [a,g), the
		// first descriptor lookup for "g" may return [c,d), in which case
		// [d,g) would never be scanned. We evict the cached descriptor and
		// retry in such a case.
		// TODO: this code is subject to removal. See
		// https://groups.google.com/d/msg/cockroach-db/DebjQEgU9r4/_OhMe7atFQAJ
		reverse := ri.scanDir == Descending
		if (reverse && !ri.desc.ContainsExclusiveEndKey(ri.key)) ||
			(!reverse && !ri.desc.ContainsKey(ri.key)) {
			log.Eventf(ctx, "addressing error: %s does not include key %s", ri.desc, ri.key)
			if err := ri.token.Evict(ctx); err != nil {
				ri.pErr = roachpb.NewError(err)
				return
			}
			// On addressing errors, don't backoff; retry immediately.
			r.Reset()
			continue
		}
		return
	}

	// Check for an early exit from the retry loop.
	if pErr := ri.ds.deduceRetryEarlyExitError(ctx); pErr != nil {
		ri.pErr = pErr
	} else {
		ri.pErr = roachpb.NewErrorf("RangeIterator failed to seek to %s", key)
	}
}
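
The retry loop's use of r.Reset() (retry immediately on addressing errors, back off otherwise) generalizes; here is a sketch with a hand-rolled backoff in place of the retry package:

package main

import (
	"errors"
	"fmt"
	"time"
)

var errAddressing = errors.New("addressing error")

// seek retries a descriptor lookup with exponential backoff; an
// addressing error resets the backoff and retries immediately,
// mirroring the r.Reset() call in Seek above.
func seek(lookup func() error, maxAttempts int) error {
	backoff := time.Millisecond
	for i := 0; i < maxAttempts; i++ {
		err := lookup()
		if err == nil {
			return nil
		}
		if errors.Is(err, errAddressing) {
			backoff = time.Millisecond // reset; no sleep before retrying
			continue
		}
		time.Sleep(backoff) // transient failure: back off and retry
		backoff *= 2
	}
	return errors.New("seek: retries exhausted")
}

func main() {
	attempts := 0
	err := seek(func() error {
		attempts++
		if attempts < 3 {
			return errAddressing
		}
		return nil
	}, 5)
	fmt.Println("attempts:", attempts, "err:", err)
}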
Example #13
// increaseBudget requests more memory from the pool.
func (mm *MemoryMonitor) increaseBudget(ctx context.Context, minExtra int64) error {
	// NB: mm.mu is already locked by reserveMemory().
	if mm.pool == nil {
		return errors.Errorf("%s: memory budget exceeded: %d bytes requested, %d bytes in budget",
			mm.name, minExtra, mm.reserved.curAllocated)
	}
	minExtra = mm.roundSize(minExtra)
	log.VEventf(ctx, 2, "%s: requesting %d bytes from the pool",
		mm.name, minExtra)

	return mm.pool.GrowAccount(ctx, &mm.mu.curBudget, minExtra)
}
Example #14
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for an abort cache entry requires a Push
//
// Steps are therefore taken in the following order:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(ctx, gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})

	if err != nil {
		return err
	}

	log.VEventf(ctx, 1, "completed with stats %+v", info)

	info.updateMetrics(gcq.store.metrics)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold
	gcArgs.TxnSpanGCThreshold = info.TxnSpanGCThreshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		log.ErrEvent(ctx, pErr.String())
		return pErr.GoError()
	}
	return nil
}
Example #15
// MaybeRemove removes the specified replica from the queue if enqueued.
func (bq *baseQueue) MaybeRemove(rangeID roachpb.RangeID) {
	bq.mu.Lock()
	defer bq.mu.Unlock()

	if bq.mu.stopped {
		return
	}

	if item, ok := bq.mu.replicas[rangeID]; ok {
		ctx := bq.AnnotateCtx(context.TODO())
		log.VEventf(ctx, 3, "%s: removing", item.value)
		bq.remove(item)
	}
}
Example #16
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start() {
	log.VEventf(
		f.Context, 1, "starting (%d processors, %d outboxes)", len(f.outboxes), len(f.processors),
	)
	f.status = FlowRunning
	f.flowRegistry.RegisterFlow(f.id, f)
	for _, o := range f.outboxes {
		o.start(&f.waitGroup)
	}
	f.waitGroup.Add(len(f.processors))
	for _, p := range f.processors {
		go p.Run(&f.waitGroup)
	}
}
Example #17
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start(doneFn func()) {
	f.doneFn = doneFn
	log.VEventf(
		f.Context, 1, "starting (%d processors, %d outboxes)", len(f.outboxes), len(f.processors),
	)
	f.status = FlowRunning
	f.flowRegistry.RegisterFlow(f.id, f, f.inboundStreams)
	f.waitGroup.Add(len(f.inboundStreams))
	f.waitGroup.Add(len(f.outboxes))
	f.waitGroup.Add(len(f.processors))
	for _, o := range f.outboxes {
		o.start(&f.waitGroup)
	}
	for _, p := range f.processors {
		go p.Run(&f.waitGroup)
	}
}
Example #18
// IncrementEpoch is called to increment the current liveness epoch,
// thereby invalidating anything relying on the liveness of the
// previous epoch. This method does a conditional put on the node
// liveness record, and if successful, stores the updated liveness
// record in the nodes map. If this method is called on a node ID
// which is considered live according to the most recent information
// gathered through gossip, an error is returned.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *Liveness) error {
	// Allow only one increment at a time.
	select {
	case nl.incrementSem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() {
		<-nl.incrementSem
	}()

	if liveness.isLive(nl.clock.Now(), nl.clock.MaxOffset()) {
		return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
	}
	newLiveness := *liveness
	newLiveness.Epoch++
	update := func(l Liveness) {
		nl.mu.Lock()
		defer nl.mu.Unlock()
		if nodeID := nl.gossip.NodeID.Get(); nodeID == l.NodeID {
			nl.mu.self = l
		} else {
			nl.mu.nodes[l.NodeID] = l
		}
	}
	if err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
		defer update(actual)
		if actual.Epoch > liveness.Epoch {
			return errEpochAlreadyIncremented
		} else if actual.Epoch < liveness.Epoch {
			return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
		}
		return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
	}); err != nil {
		if err == errEpochAlreadyIncremented {
			return nil
		}
		return err
	}

	log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d",
		newLiveness.NodeID, newLiveness.Epoch)
	update(newLiveness)
	nl.metrics.EpochIncrements.Inc(1)
	return nil
}
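
updateLiveness above follows a compare-and-swap shape: attempt a conditional put and, on mismatch, hand the actual record to a callback that decides whether the race was benign. A minimal sketch with an in-memory kv standing in for the liveness table:

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errAlreadyAdvanced = errors.New("epoch already incremented")

// kv is a stand-in for the liveness table: update applies newVal only
// if the stored value still equals expected; on a mismatch it hands
// the actual value to the callback, as updateLiveness does above.
type kv struct {
	mu  sync.Mutex
	val int
}

func (s *kv) update(expected, newVal int, onMismatch func(actual int) error) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.val != expected {
		return onMismatch(s.val)
	}
	s.val = newVal
	return nil
}

func main() {
	s := &kv{val: 4} // someone already bumped the epoch from 3 to 4
	err := s.update(3, 4, func(actual int) error {
		if actual >= 4 {
			return errAlreadyAdvanced // benign race, treated as success
		}
		return fmt.Errorf("unexpected epoch %d", actual)
	})
	if err == errAlreadyAdvanced {
		err = nil
	}
	fmt.Println("err:", err)
}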
Example #19
// SendNext invokes the specified RPC on the supplied client when the
// client is ready. On success, the reply is sent on the channel;
// otherwise an error is sent.
func (gt *grpcTransport) SendNext(ctx context.Context, done chan<- BatchCall) {
	client := gt.orderedClients[gt.clientIndex]
	gt.clientIndex++
	gt.setPending(client.args.Replica, true)

	// Fork the original context as this async send may outlast the
	// caller's context.
	ctx, sp := tracing.ForkCtxSpan(ctx, "grpcTransport SendNext")
	go func() {
		defer tracing.FinishSpan(sp)
		gt.opts.metrics.SentCount.Inc(1)
		reply, err := func() (*roachpb.BatchResponse, error) {
			if enableLocalCalls {
				if localServer := gt.rpcContext.GetLocalInternalServerForAddr(client.remoteAddr); localServer != nil {
					// Clone the request. At the time of writing, Replica may mutate it
					// during command execution which can lead to data races.
					//
					// TODO(tamird): we should clone all of client.args.Header, but the
					// assertions in protoutil.Clone fire and there seems to be no
					// reasonable workaround.
					origTxn := client.args.Txn
					if origTxn != nil {
						clonedTxn := origTxn.Clone()
						client.args.Txn = &clonedTxn
					}
					gt.opts.metrics.LocalSentCount.Inc(1)
					log.VEvent(ctx, 2, "sending request to local server")
					return localServer.Batch(ctx, &client.args)
				}
			}

			log.VEventf(ctx, 2, "sending request to %s", client.remoteAddr)
			reply, err := client.client.Batch(ctx, &client.args)
			if reply != nil {
				for i := range reply.Responses {
					if err := reply.Responses[i].GetInner().Verify(client.args.Requests[i].GetInner()); err != nil {
						log.Error(ctx, err)
					}
				}
			}
			return reply, err
		}()
		gt.setPending(client.args.Replica, false)
		done <- BatchCall{Reply: reply, Err: err}
	}()
}
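
The fork-the-context-and-answer-on-a-channel shape can be sketched on its own; this version uses context.WithoutCancel (Go 1.21+) in place of tracing.ForkCtxSpan, and batchCall/sendNext are illustrative names:

package main

import (
	"context"
	"fmt"
)

type batchCall struct {
	reply string
	err   error
}

// sendNext runs the RPC asynchronously and delivers the result on
// done; detaching from the caller's cancellation plays the role of
// tracing.ForkCtxSpan above, since the send may outlast the caller.
func sendNext(ctx context.Context, done chan<- batchCall, rpc func(context.Context) (string, error)) {
	ctx = context.WithoutCancel(ctx) // Go 1.21+
	go func() {
		reply, err := rpc(ctx)
		done <- batchCall{reply: reply, err: err}
	}()
}

func main() {
	done := make(chan batchCall, 1)
	sendNext(context.Background(), done, func(context.Context) (string, error) {
		return "batch response", nil
	})
	fmt.Println(<-done)
}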
Example #20
// tightenNetwork "tightens" the network by starting a new gossip client
// to the most distant node to which we don't already have an outgoing
// connection. Does nothing if we don't have room for any more outgoing
// connections.
func (g *Gossip) tightenNetwork(ctx context.Context) {
	g.mu.Lock()
	defer g.mu.Unlock()
	if g.outgoing.hasSpace() {
		distantNodeID, distantHops := g.mu.is.mostDistant(g.hasOutgoingLocked)
		log.VEventf(ctx, 2, "distantHops: %d from %d", distantHops, distantNodeID)
		if distantHops <= maxHops {
			return
		}
		if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil {
			log.Errorf(ctx, "unable to get address for distant node %d: %s", distantNodeID, err)
		} else {
			log.Infof(ctx, "starting client to distant node %d (%d > %d) to tighten network graph",
				distantNodeID, distantHops, maxHops)
			log.Eventf(ctx, "tightening network with new client to %s", nodeAddr)
			g.startClientLocked(nodeAddr)
		}
	}
}
Example #21
// Heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness *Liveness) error {
	nodeID := nl.gossip.NodeID.Get()
	var newLiveness Liveness
	if liveness == nil {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	} else {
		newLiveness = *liveness
	}
	// We need to add the maximum clock offset to the expiration because it's
	// used when determining liveness for a node.
	newLiveness.Expiration = nl.clock.Now().Add(
		(nl.livenessThreshold + nl.clock.MaxOffset()).Nanoseconds(), 0)
	err := nl.updateLiveness(ctx, &newLiveness, liveness, func(actual Liveness) error {
		// Update liveness to actual value on mismatch.
		defer func() {
			newLiveness = actual
		}()
		// If the actual liveness is further ahead than expected, we're
		// done. This can happen when the periodic heartbeater races with
		// a concurrent lease acquisition.
		if !actual.Expiration.Less(newLiveness.Expiration) && actual.Epoch == newLiveness.Epoch {
			return nil
		}
		// Otherwise, return error.
		return errSkippedHeartbeat
	})

	log.VEventf(ctx, 1, "heartbeat %+v", newLiveness.Expiration)
	nl.mu.Lock()
	nl.mu.self = newLiveness
	nl.mu.Unlock()
	if err != nil {
		nl.metrics.HeartbeatFailures.Inc(1)
	} else {
		nl.metrics.HeartbeatSuccesses.Inc(1)
	}
	return err
}
Example #22
// IncrementEpoch is called by nodes on other nodes in order to
// increment the current liveness epoch, thereby invalidating anything
// relying on the liveness of the previous epoch. This method does a
// conditional put on the node liveness record, and if successful,
// stores the updated liveness record in the nodes map.
//
// TODO(spencer): make sure calls to this method are captured in
// range lease metrics.
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, nodeID roachpb.NodeID) error {
	liveness, err := nl.GetLiveness(nodeID)
	if err != nil {
		return err
	}
	if liveness.isLive(nl.clock) {
		return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
	}
	newLiveness := liveness
	newLiveness.Epoch++
	if err := nl.updateLiveness(ctx, nodeID, &newLiveness, &liveness, nil); err != nil {
		return err
	}

	log.VEventf(ctx, 1, "incremented node %d liveness epoch to %d", nodeID, newLiveness.Epoch)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.nodes[nodeID] = newLiveness
	nl.metrics.EpochIncrements.Inc(1)
	return nil
}
Example #23
// heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
func (nl *NodeLiveness) heartbeat(ctx context.Context) error {
	nodeID := nl.gossip.NodeID.Get()

	var newLiveness Liveness
	var oldLiveness *Liveness
	liveness, err := nl.GetLiveness(nodeID)
	if err == nil {
		oldLiveness = &liveness
		newLiveness = liveness
	} else {
		newLiveness = Liveness{
			NodeID: nodeID,
			Epoch:  1,
		}
	}

	// Retry heartbeat in the event the conditional put fails.
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		newLiveness.Expiration = nl.clock.Now().Add(nl.livenessThreshold.Nanoseconds(), 0)
		tryAgain := false
		if err := nl.updateLiveness(ctx, nodeID, &newLiveness, oldLiveness, func(actual Liveness) {
			oldLiveness = &actual
			newLiveness = actual
			tryAgain = true
		}); err != nil {
			nl.metrics.HeartbeatFailures.Inc(1)
			return err
		}
		if !tryAgain {
			break
		}
	}

	log.VEventf(ctx, 1, "heartbeat node %d liveness with expiration %s", nodeID, newLiveness.Expiration)
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.self = newLiveness
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}
Example #24
// process truncates the raft log of the range if the replica is the raft
// leader and if the number of stale entries in the range's raft log
// exceeds RaftLogQueueStaleThreshold.
func (rlq *raftLogQueue) process(
	ctx context.Context, now hlc.Timestamp, r *Replica, _ config.SystemConfig,
) error {
	truncatableIndexes, oldestIndex, err := getTruncatableIndexes(ctx, r)
	if err != nil {
		return err
	}

	// Can and should the raft logs be truncated?
	if truncatableIndexes >= RaftLogQueueStaleThreshold {
		log.VEventf(ctx, 1, "truncating raft log %d-%d",
			oldestIndex-truncatableIndexes, oldestIndex)
		b := &client.Batch{}
		b.AddRawRequest(&roachpb.TruncateLogRequest{
			Span:    roachpb.Span{Key: r.Desc().StartKey.AsRawKey()},
			Index:   oldestIndex,
			RangeID: r.RangeID,
		})
		return rlq.db.Run(ctx, b)
	}
	return nil
}
Example #25
// Start starts the flow (each processor runs in its own goroutine).
func (f *Flow) Start(doneFn func()) {
	f.doneFn = doneFn
	log.VEventf(
		f.Context, 1, "starting (%d processors, %d outboxes)", len(f.outboxes), len(f.processors),
	)
	f.status = FlowRunning

	// Once we call RegisterFlow, the inbound streams become accessible; we must
	// set up the WaitGroup counter before.
	f.waitGroup.Add(len(f.inboundStreams) + len(f.outboxes) + len(f.processors))

	f.flowRegistry.RegisterFlow(f.id, f, f.inboundStreams)
	if log.V(1) {
		log.Infof(f.Context, "registered flow %s", f.id.Short())
	}
	for _, o := range f.outboxes {
		o.start(&f.waitGroup)
	}
	for _, p := range f.processors {
		go p.Run(&f.waitGroup)
	}
}
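
The comment about ordering is the heart of this version: the WaitGroup counter must be set before RegisterFlow makes the inbound streams reachable. A minimal sketch of that Add-before-publish constraint, with start and workers as hypothetical names:

package main

import (
	"fmt"
	"sync"
)

// start adds to the WaitGroup before the workers become reachable, so
// a worker that finishes instantly cannot call Done before the Add:
// the same ordering constraint the comment in Start explains above.
func start(workers []func()) *sync.WaitGroup {
	var wg sync.WaitGroup
	wg.Add(len(workers)) // must happen before any worker can run
	for _, w := range workers {
		go func(w func()) {
			defer wg.Done()
			w()
		}(w)
	}
	return &wg
}

func main() {
	start([]func(){
		func() { fmt.Println("processor 1") },
		func() { fmt.Println("processor 2") },
	}).Wait()
}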
Example #26
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			if !nl.pauseHeartbeat.Load().(bool) {
				ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
				ctx, cancel := context.WithTimeout(ctx, nl.heartbeatInterval)
				// Retry heartbeat in the event the conditional put fails.
				for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
					liveness, err := nl.Self()
					if err != nil && err != ErrNoLivenessRecord {
						log.Errorf(ctx, "unexpected error getting liveness: %v", err)
					}
					if err := nl.Heartbeat(ctx, liveness); err != nil {
						if err == errSkippedHeartbeat {
							continue
						}
						log.Errorf(ctx, "failed liveness heartbeat: %v", err)
					}
					break
				}
				cancel()
				sp.Finish()
			}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
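
The loop's shape (one timeout-bounded heartbeat per iteration, then wait for the ticker or the stopper) is easy to extract. A self-contained sketch, with runHeartbeatLoop as an illustrative name and the inner retry loop omitted:

package main

import (
	"context"
	"fmt"
	"time"
)

// runHeartbeatLoop sketches the loop above: each iteration runs one
// heartbeat bounded by the interval, then waits for the next tick or
// for the stop signal.
func runHeartbeatLoop(interval time.Duration, stop <-chan struct{}, beat func(context.Context) error) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		ctx, cancel := context.WithTimeout(context.Background(), interval)
		if err := beat(ctx); err != nil {
			fmt.Println("failed liveness heartbeat:", err)
		}
		cancel()
		select {
		case <-ticker.C:
		case <-stop:
			return
		}
	}
}

func main() {
	stop := make(chan struct{})
	time.AfterFunc(35*time.Millisecond, func() { close(stop) })
	runHeartbeatLoop(10*time.Millisecond, stop, func(context.Context) error {
		fmt.Println("beat")
		return nil
	})
}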
Example #27
func (ta *TxnAborter) statementFilter(ctx context.Context, stmt string, res *sql.Result) {
	ta.mu.Lock()
	ri, ok := ta.mu.stmtsToAbort[stmt]
	shouldAbort := false
	if ok {
		ri.execCount++
		if ri.abortCount == 0 {
			log.VEventf(ctx, 1, "TxnAborter sees satisfied statement %q", stmt)
			ri.satisfied = true
		}
		if ri.abortCount > 0 && res.Err == nil {
			log.Infof(ctx, "TxnAborter aborting txn for statement %q", stmt)
			ri.abortCount--
			shouldAbort = true
		}
	}
	ta.mu.Unlock()
	if shouldAbort {
		if err := ta.abortTxn(ri.key); err != nil {
			res.Err = errors.Wrap(err, "TxnAborter failed to abort")
		}
	}
}
Example #28
// maybeTransferRaftLeadership attempts to transfer the leadership
// away from this node to target, if this node is the current raft
// leader. We don't attempt to transfer leadership if the transferee
// is behind on applying the log.
func (r *Replica) maybeTransferRaftLeadership(ctx context.Context, target roachpb.ReplicaID) {
	err := r.withRaftGroup(func(raftGroup *raft.RawNode) (bool, error) {
		// Only the raft leader can attempt a leadership transfer.
		if status := raftGroup.Status(); status.RaftState == raft.StateLeader {
			// Only attempt this if the target has all the log entries.
			if pr, ok := status.Progress[uint64(target)]; ok && pr.Match == r.mu.lastIndex {
				log.VEventf(ctx, 1, "transferring raft leadership to replica ID %v", target)
				r.store.metrics.RangeRaftLeaderTransfers.Inc(1)
				raftGroup.TransferLeader(uint64(target))
			}
		}
		return true, nil
	})
	if err != nil {
		// An error here indicates that this Replica has been destroyed
		// while lacking the necessary synchronization (or even worse, it
		// fails spuriously - could be a storage error), and so we avoid
		// sweeping that under the rug.
		//
		// TODO(tschottdorf): this error is not handled any more
		// at this level.
		log.Fatal(ctx, NewReplicaCorruptionError(err))
	}
}
Example #29
// StartHeartbeat starts a periodic heartbeat to refresh this node's
// last heartbeat in the node liveness table.
func (nl *NodeLiveness) StartHeartbeat(ctx context.Context, stopper *stop.Stopper) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")

	stopper.RunWorker(func() {
		ambient := nl.ambientCtx
		ambient.AddLogTag("hb", nil)
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			ctx, sp := ambient.AnnotateCtxWithSpan(context.Background(), "heartbeat")
			if err := nl.heartbeat(ctx); err != nil {
				log.Errorf(ctx, "failed liveness heartbeat: %s", err)
			}
			sp.Finish()
			select {
			case <-ticker.C:
			case <-nl.stopHeartbeat:
				return
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Example #30
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ctx := bq.AnnotateCtx(context.Background())
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			bq.AmbientContext.FinishEventLog()
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer(0)
				}
			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				var duration time.Duration
				if repl != nil {
					if stopper.RunTask(func() {
						annotatedCtx := repl.AnnotateCtx(ctx)
						start := timeutil.Now()
						if err := bq.processReplica(annotatedCtx, repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(annotatedCtx, repl, err, clock, stopper)
						}
						duration = timeutil.Since(start)
						log.VEventf(annotatedCtx, 2, "done %s", duration)
						bq.processingNanos.Inc(duration.Nanoseconds())
					}) != nil {
						return
					}
				}
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer(duration))
				}
			}
		}
	})
}
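
The immediately channel is the notable trick: receiving from a closed channel never blocks, so a closed chan time.Time acts as a timer that fires at once when the first replica arrives. A runnable sketch:

package main

import (
	"fmt"
	"time"
)

// A receive from a closed channel succeeds at once, so a closed
// chan time.Time doubles as a timer that fires immediately; this is
// how processLoop wakes up on the first enqueue without sleeping.
func main() {
	immediately := make(chan time.Time)
	close(immediately)

	var next <-chan time.Time // nil: blocks forever while the queue is empty
	incoming := make(chan struct{}, 1)
	incoming <- struct{}{} // simulate a replica being added

	for i := 0; i < 2; i++ {
		select {
		case <-incoming:
			if next == nil {
				next = immediately // wake up immediately
			}
		case <-next:
			fmt.Println("processing")
			next = nil // queue drained; block until the next signal
		}
	}
}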