Exemple #1
0
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock should not be held
// while calling this method.
func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error {
	bq.processMu.Lock()
	defer bq.processMu.Unlock()

	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		log.VEventf(1, bq.ctx, "no system config available. skipping")
		return nil
	}

	if bq.requiresSplit(cfg, repl) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		log.VEventf(3, bq.ctx, "%s: split needed; skipping", repl)
		return nil
	}

	sp := repl.store.Tracer().StartSpan(bq.name)
	ctx := opentracing.ContextWithSpan(context.Background(), sp)
	defer sp.Finish()
	log.Tracef(ctx, "processing replica %s", repl)

	// If the queue requires a replica to have the range lease in
	// order to be processed, check whether this replica has range lease
	// and renew or acquire if necessary.
	if bq.needsLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLease(ctx); err != nil {
			if _, harmless := err.GetDetail().(*roachpb.NotLeaseHolderError); harmless {
				log.VEventf(3, bq.ctx, "%s: not holding lease; skipping", repl)
				return nil
			}
			return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
		}
		log.Trace(ctx, "got range lease")
	}

	log.VEventf(3, bq.ctx, "%s: processing", repl)
	start := timeutil.Now()
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	log.VEventf(2, bq.ctx, "%s: done: %s", repl, timeutil.Since(start))
	log.Trace(ctx, "done")
	return nil
}
// InitSenderForLocalTestCluster initializes a TxnCoordSender that can be used
// with LocalTestCluster.
func InitSenderForLocalTestCluster(
	nodeDesc *roachpb.NodeDescriptor,
	tracer opentracing.Tracer,
	clock *hlc.Clock,
	latency time.Duration,
	stores client.Sender,
	stopper *stop.Stopper,
	gossip *gossip.Gossip,
) client.Sender {
	var rpcSend rpcSendFn = func(_ SendOptions, _ ReplicaSlice,
		args roachpb.BatchRequest, _ *rpc.Context) (*roachpb.BatchResponse, error) {
		if latency > 0 {
			time.Sleep(latency)
		}
		sp := tracer.StartSpan("node")
		defer sp.Finish()
		ctx := opentracing.ContextWithSpan(context.Background(), sp)
		log.Trace(ctx, args.String())
		br, pErr := stores.Send(ctx, args)
		if br == nil {
			br = &roachpb.BatchResponse{}
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(stores, br))
		}
		br.Error = pErr
		if pErr != nil {
			log.Trace(ctx, "error: "+pErr.String())
		}
		return br, nil
	}
	retryOpts := GetDefaultDistSenderRetryOptions()
	retryOpts.Closer = stopper.ShouldDrain()
	distSender := NewDistSender(&DistSenderContext{
		Clock: clock,
		RangeDescriptorCacheSize: defaultRangeDescriptorCacheSize,
		RangeLookupMaxRanges:     defaultRangeLookupMaxRanges,
		LeaderCacheSize:          defaultLeaderCacheSize,
		RPCRetryOptions:          &retryOpts,
		nodeDescriptor:           nodeDesc,
		RPCSend:                  rpcSend,                    // defined above
		RangeDescriptorDB:        stores.(RangeDescriptorDB), // for descriptor lookup
	}, gossip)

	return NewTxnCoordSender(distSender, clock, false /* !linearizable */, tracer,
		stopper, NewTxnMetrics(metric.NewRegistry()))
}
Exemple #3
0
// processReplica processes a single replica. This should not be
// called externally to the queue. bq.mu.Lock should not be held
// while calling this method.
func (bq *baseQueue) processReplica(repl *Replica, clock *hlc.Clock) error {
	// Load the system config.
	cfg, ok := bq.gossip.GetSystemConfig()
	if !ok {
		bq.eventLog.VInfof(log.V(1), "no system config available. skipping")
		return nil
	}

	desc := repl.Desc()
	if !bq.acceptsUnsplitRanges && cfg.NeedsSplit(desc.StartKey, desc.EndKey) {
		// Range needs to be split due to zone configs, but queue does
		// not accept unsplit ranges.
		bq.eventLog.VInfof(log.V(3), "%s: split needed; skipping", repl)
		return nil
	}

	sp := repl.store.Tracer().StartSpan(bq.name)
	ctx := opentracing.ContextWithSpan(repl.context(context.Background()), sp)
	log.Trace(ctx, fmt.Sprintf("queue start for range %d", repl.RangeID))
	defer sp.Finish()

	// If the queue requires a replica to have the range leader lease in
	// order to be processed, check whether this replica has leader lease
	// and renew or acquire if necessary.
	if bq.needsLeaderLease {
		// Create a "fake" get request in order to invoke redirectOnOrAcquireLease.
		if err := repl.redirectOnOrAcquireLeaderLease(ctx); err != nil {
			if _, harmless := err.GetDetail().(*roachpb.NotLeaderError); harmless {
				bq.eventLog.VInfof(log.V(3), "%s: not holding lease; skipping", repl)
				return nil
			}
			return errors.Wrapf(err.GoError(), "%s: could not obtain lease", repl)
		}
		log.Trace(ctx, "got range lease")
	}

	bq.eventLog.VInfof(log.V(3), "%s: processing", repl)
	start := timeutil.Now()
	if err := bq.impl.process(ctx, clock.Now(), repl, cfg); err != nil {
		return err
	}
	bq.eventLog.VInfof(log.V(2), "%s: done: %s", repl, timeutil.Since(start))
	log.Trace(ctx, "done")
	return nil
}
Exemple #4
0
// EvictAndReplace instructs the evictionToken to evict the RangeDescriptor it was
// created with from the rangeDescriptorCache. It also allows the user to provide
// new RangeDescriptors to insert into the cache, all atomically. When called without
// arguments, EvictAndReplace will behave the same as Evict.
func (et *evictionToken) EvictAndReplace(ctx context.Context, newDescs ...roachpb.RangeDescriptor) error {
	var err error
	et.doOnce.Do(func() {
		et.doLocker.Lock()
		defer et.doLocker.Unlock()
		err = et.do()
		if err == nil {
			if len(newDescs) > 0 {
				err = et.doReplace(newDescs...)
				log.Trace(ctx, fmt.Sprintf("evicting cached range descriptor with %d replacements",
					len(newDescs)))
			} else {
				log.Trace(ctx, "evicting cached range descriptor")
			}
		}
	})
	return err
}
Exemple #5
0
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	rng *Replica,
	sysCfg config.SystemConfig,
) error {
	// First handle case of splitting due to zone config maps.
	desc := rng.Desc()
	splitKeys := sysCfg.ComputeSplitKeys(desc.StartKey, desc.EndKey)
	if len(splitKeys) > 0 {
		log.Infof("splitting %s at keys %v", rng, splitKeys)
		log.Trace(ctx, fmt.Sprintf("splitting at keys %v", splitKeys))
		for _, splitKey := range splitKeys {
			if err := sq.db.AdminSplit(splitKey.AsRawKey()); err != nil {
				return errors.Errorf("unable to split %s at key %q: %s", rng, splitKey, err)
			}
		}
		return nil
	}

	// Next handle case of splitting due to size.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	size := rng.GetMVCCStats().Total()
	// FIXME: why is this implementation not the same as the one above?
	if float64(size)/float64(zone.RangeMaxBytes) > 1 {
		log.Infof("splitting %s size=%d max=%d", rng, size, zone.RangeMaxBytes)
		log.Trace(ctx, fmt.Sprintf("splitting size=%d max=%d", size, zone.RangeMaxBytes))
		if _, pErr := client.SendWrappedWith(rng, ctx, roachpb.Header{
			Timestamp: now,
		}, &roachpb.AdminSplitRequest{
			Span: roachpb.Span{Key: desc.StartKey.AsRawKey()},
		}); pErr != nil {
			return pErr.GoError()
		}
	}
	return nil
}
Exemple #6
0
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	// If we aren't given a Replica, then a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		rs, err := keys.Range(ba)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		rangeID, repl, err := ls.lookupReplica(rs.Key, rs.EndKey)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		ba.RangeID = rangeID
		ba.Replica = *repl
	}

	ctx = log.Add(ctx,
		log.RangeID, ba.RangeID)

	store, err := ls.GetStore(ba.Replica.StoreID)
	if err != nil {
		return nil, roachpb.NewError(err)
	}

	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				log.Trace(ctx, "read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
Exemple #7
0
func (s *senderTransport) SendNext(done chan BatchCall) {
	if s.called {
		panic("called an exhausted transport")
	}
	s.called = true
	sp := s.tracer.StartSpan("node")
	defer sp.Finish()
	ctx := opentracing.ContextWithSpan(context.Background(), sp)
	log.Trace(ctx, s.args.String())
	br, pErr := s.sender.Send(ctx, s.args)
	if br == nil {
		br = &roachpb.BatchResponse{}
	}
	if br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(s.sender, br))
	}
	br.Error = pErr
	if pErr != nil {
		log.Trace(ctx, "error: "+pErr.String())
	}
	done <- BatchCall{Reply: br}
}
// cleanupTxnLocked is called when a transaction ends. The transaction record
// is updated and the heartbeat goroutine signaled to clean up the transaction
// gracefully.
func (tc *TxnCoordSender) cleanupTxnLocked(ctx context.Context, txn roachpb.Transaction) {
	log.Trace(ctx, "coordinator stops")
	txnMeta, ok := tc.txns[*txn.ID]
	// The heartbeat might've already removed the record. Or we may have already
	// closed txnEnd but we are racing with the heartbeat cleanup.
	if !ok || txnMeta.txnEnd == nil {
		return
	}

	// The supplied txn may be newer than the one in txnMeta, which is relevant
	// for stats.
	txnMeta.txn = txn
	// Trigger heartbeat shutdown.
	close(txnMeta.txnEnd)
	txnMeta.txnEnd = nil
}
// cleanupTxn is called when a transaction ends. The transaction record is
// updated and the heartbeat goroutine signaled to clean up the transaction
// gracefully.
func (tc *TxnCoordSender) cleanupTxn(ctx context.Context, txn roachpb.Transaction) {
	log.Trace(ctx, "coordinator stops")
	tc.Lock()
	defer tc.Unlock()
	txnMeta, ok := tc.txns[*txn.ID]
	// The heartbeat might've already removed the record.
	if !ok {
		return
	}

	// The supplied txn may be newer than the one in txnMeta, which is relevant
	// for stats.
	txnMeta.txn = txn
	// Trigger heartbeat shutdown.
	close(txnMeta.txnEnd)
	txnMeta.txnEnd = nil
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool {
	tc.Lock()
	txnMeta := tc.txns[txnID]
	txn := txnMeta.txn.Clone()
	tc.Unlock()

	// Before we send a heartbeat, determine whether this transaction should be
	// considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then
	// it is a cancellable Context and we skip this check and use the ctx lifetime
	// instead of a timeout.
	if ctx.Done() == nil && txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) {
		tc.clientHasAbandoned(txnID)
		return false
	}

	ba := roachpb.BatchRequest{}
	ba.Txn = &txn

	hb := &roachpb.HeartbeatTxnRequest{
		Now: tc.clock.Now(),
	}
	hb.Key = txn.Key
	ba.Add(hb)

	log.Trace(ctx, "heartbeat")
	_, err := tc.wrapped.Send(ctx, ba)
	// If the transaction is not in pending state, then we can stop
	// the heartbeat. It's either aborted or committed, and we resolve
	// write intents accordingly.
	if err != nil {
		log.Warningf("heartbeat to %s failed: %s", txn, err)
	}
	// TODO(bdarnell): once we have gotten a heartbeat response with
	// Status != PENDING, future heartbeats are useless. However, we
	// need to continue the heartbeatLoop until the client either
	// commits or abandons the transaction. We could save a little
	// pointless work by restructuring this loop to stop sending
	// heartbeats between the time that the transaction is aborted and
	// the client finds out. Furthermore, we could use this information
	// to send TransactionAbortedErrors to the client so it can restart
	// immediately instead of running until its EndTransaction.
	return true
}
Exemple #11
0
// sendSingleRange gathers and rearranges the replicas, and makes an RPC call.
func (ds *DistSender) sendSingleRange(
	ctx context.Context, ba roachpb.BatchRequest, desc *roachpb.RangeDescriptor,
) (*roachpb.BatchResponse, *roachpb.Error) {
	log.Trace(ctx, fmt.Sprintf("sending RPC to [%s, %s)", desc.StartKey, desc.EndKey))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(ba.IsReadOnly() && ba.ReadConsistency == roachpb.INCONSISTENT) {
		if leader := ds.leaderCache.Lookup(roachpb.RangeID(desc.RangeID)); leader.StoreID > 0 {
			if i := replicas.FindReplica(leader.StoreID); i >= 0 {
				replicas.MoveToFront(i)
				order = orderStable
			}
		}
	}

	// TODO(tschottdorf): should serialize the trace here, not higher up.
	br, pErr := ds.sendRPC(ctx, desc.RangeID, replicas, order, ba)
	if pErr != nil {
		return nil, pErr
	}

	// If the reply contains a timestamp, update the local HLC with it.
	if br.Error != nil && br.Error.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Error.Now)
	} else if br.Now != roachpb.ZeroTimestamp {
		ds.clock.Update(br.Now)
	}

	// Untangle the error from the received response.
	pErr = br.Error
	br.Error = nil // scrub the response error
	return br, pErr
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		if sp == nil {
			sp = tc.tracer.StartSpan(opTxnCoordSender)
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		}
		if err := tc.tracer.Inject(sp, basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		txnID := *ba.Txn.ID
		// Verify that if this Transaction is not read-only, we have it on file.
		// If not, refuse further operations - the transaction was aborted due
		// to a timeout or the client must have issued a write on another
		// coordinator previously.
		if ba.Txn.Writing {
			tc.Lock()
			_, ok := tc.txns[txnID]
			tc.Unlock()
			if !ok {
				pErr := roachpb.NewErrorf("writing transaction timed out, was aborted, " +
					"or ran on multiple coordinators")
				return nil, pErr
			}
		}

		if rArgs, ok := ba.GetArg(roachpb.EndTransaction); ok {
			et := rArgs.(*roachpb.EndTransactionRequest)
			if len(et.Key) != 0 {
				return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
			}
			et.Key = ba.Txn.Key
			if len(et.IntentSpans) > 0 {
				// TODO(tschottdorf): it may be useful to allow this later.
				// That would be part of a possible plan to allow txns which
				// write on multiple coordinators.
				return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
			}
			tc.Lock()
			txnMeta, metaOK := tc.txns[txnID]
			{
				// Populate et.IntentSpans, taking into account both existing
				// writes (if any) and new writes in this batch, and taking
				// care to perform proper deduplication.
				var keys interval.RangeGroup
				if metaOK {
					keys = txnMeta.keys
				} else {
					keys = interval.NewRangeTree()
				}
				ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
					addKeyRange(keys, key, endKey)
				})
				et.IntentSpans = collectIntentSpans(keys)
			}
			tc.Unlock()

			if len(et.IntentSpans) > 0 {
				// All good, proceed.
			} else if !metaOK {
				// If we don't have the transaction, then this must be a retry
				// by the client. We can no longer reconstruct a correct
				// request so we must fail.
				//
				// TODO(bdarnell): if we had a GetTransactionStatus API then
				// we could lookup the transaction and return either nil or
				// TransactionAbortedError instead of this ambivalent error.
				return nil, roachpb.NewErrorf("transaction is already committed or aborted")
			}
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state in
				// the client.
				return nil, roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if log.V(1) {
				for _, intent := range et.IntentSpans {
					log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)", intent.Key, intent.EndKey))
				}
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Trace(ctx, fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.cleanupTxn(ctx, *br.Txn)
	}
	return br, nil
}
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool {
	tc.Lock()
	txnMeta := tc.txns[txnID]
	txn := txnMeta.txn.Clone()
	hasAbandoned := txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow())
	tc.Unlock()

	if txn.Status != roachpb.PENDING {
		// A previous iteration has already determined that the transaction is
		// already finalized, so we wait for the client to realize that and
		// want to keep our state for the time being (to dish out the right
		// error once it returns).
		return true
	}

	// Before we send a heartbeat, determine whether this transaction should be
	// considered abandoned. If so, exit heartbeat. If ctx.Done() is not nil, then
	// it is a cancellable Context and we skip this check and use the ctx lifetime
	// instead of a timeout.
	if ctx.Done() == nil && hasAbandoned {
		if log.V(1) {
			log.Infof(ctx, "transaction %s abandoned; stopping heartbeat", txnMeta.txn)
		}
		tc.tryAsyncAbort(txnID)
		return false
	}

	ba := roachpb.BatchRequest{}
	ba.Txn = &txn

	hb := &roachpb.HeartbeatTxnRequest{
		Now: tc.clock.Now(),
	}
	hb.Key = txn.Key
	ba.Add(hb)

	log.Trace(ctx, "heartbeat")
	br, pErr := tc.wrapped.Send(ctx, ba)

	// Correctness mandates that when we can't heartbeat the transaction, we
	// make sure the client doesn't keep going. This is particularly relevant
	// in the case of an ABORTED transaction, but if we can't reach the
	// transaction record at all, we're going to have to assume we're aborted
	// as well.
	if pErr != nil {
		log.Warningf(ctx, "heartbeat to %s failed: %s", txn, pErr)
		// We're not going to let the client carry out additional requests, so
		// try to clean up.
		tc.tryAsyncAbort(*txn.ID)
		txn.Status = roachpb.ABORTED
	} else {
		txn.Update(br.Responses[0].GetInner().(*roachpb.HeartbeatTxnResponse).Txn)
	}

	// Give the news to the txn in the txns map. This will update long-running
	// transactions (which may find out that they have to restart in that way),
	// but in particular makes sure that they notice when they've been aborted
	// (in which case we'll give them an error on their next request).
	tc.Lock()
	tc.txns[txnID].txn.Update(&txn)
	tc.Unlock()

	return true
}
// maybePushTransactions tries to push the conflicting transaction(s)
// responsible for the given intents: either move its
// timestamp forward on a read/write conflict, abort it on a
// write/write conflict, or do nothing if the transaction is no longer
// pending.
//
// Returns a slice of intents which can now be resolved, and an error.
// The returned intents should be resolved via intentResolver.resolveIntents.
//
// If skipIfInFlight is true, then no PushTxns will be sent and no
// intents will be returned for any transaction for which there is
// another push in progress. This should only be used by callers who
// are not relying on the side effect of a push (i.e. only
// pushType==PUSH_TOUCH), and who also don't need to synchronize with
// the resolution of those intents (e.g. asynchronous resolutions of
// intents skipped on inconsistent reads).
//
// Callers are involved with
// a) conflict resolution for commands being executed at the Store with the
//    client waiting,
// b) resolving intents encountered during inconsistent operations, and
// c) resolving intents upon EndTransaction which are not local to the given
//    range. This is the only path in which the transaction is going to be
//    in non-pending state and doesn't require a push.
func (ir *intentResolver) maybePushTransactions(ctx context.Context, intents []roachpb.Intent,
	h roachpb.Header, pushType roachpb.PushTxnType, skipIfInFlight bool) (
	[]roachpb.Intent, *roachpb.Error) {

	now := ir.store.Clock().Now()

	partialPusherTxn := h.Txn
	// If there's no pusher, we communicate a priority by sending an empty
	// txn with only the priority set. This is official usage of PushTxn.
	if partialPusherTxn == nil {
		partialPusherTxn = &roachpb.Transaction{
			TxnMeta: roachpb.TxnMeta{
				Priority: roachpb.MakePriority(h.UserPriority),
			},
		}
	}

	log.Trace(ctx, "intent resolution")

	// Split intents into those we need to push and those which are good to
	// resolve.
	ir.mu.Lock()
	// TODO(tschottdorf): can optimize this and use same underlying slice.
	var pushIntents, nonPendingIntents []roachpb.Intent
	var pErr *roachpb.Error
	for _, intent := range intents {
		if intent.Status != roachpb.PENDING {
			// The current intent does not need conflict resolution
			// because the transaction is already finalized.
			// This shouldn't happen as all intents created are in
			// the PENDING status.
			nonPendingIntents = append(nonPendingIntents, intent)
		} else if _, ok := ir.mu.inFlight[*intent.Txn.ID]; ok && skipIfInFlight {
			// Another goroutine is working on this transaction so we can
			// skip it.
			if log.V(1) {
				log.Infof("skipping PushTxn for %s; attempt already in flight", intent.Txn.ID)
			}
			continue
		} else {
			pushIntents = append(pushIntents, intent)
			ir.mu.inFlight[*intent.Txn.ID]++
		}
	}
	ir.mu.Unlock()
	if len(nonPendingIntents) > 0 {
		return nil, roachpb.NewErrorf("unexpected aborted/resolved intents: %s", nonPendingIntents)
	}

	// Attempt to push the transaction(s) which created the conflicting intent(s).
	var pushReqs []roachpb.Request
	for _, intent := range pushIntents {
		pushReqs = append(pushReqs, &roachpb.PushTxnRequest{
			Span: roachpb.Span{
				Key: intent.Txn.Key,
			},
			PusherTxn: *partialPusherTxn,
			PusheeTxn: intent.Txn,
			PushTo:    h.Timestamp,
			// The timestamp is used by PushTxn for figuring out whether the
			// transaction is abandoned. If we used the argument's timestamp
			// here, we would run into busy loops because that timestamp
			// usually stays fixed among retries, so it will never realize
			// that a transaction has timed out. See #877.
			Now:      now,
			PushType: pushType,
		})
	}
	// TODO(kaneda): Set the transaction in the header so that the
	// txn is correctly propagated in an error response.
	b := &client.Batch{}
	b.InternalAddRequest(pushReqs...)
	br, pErr := ir.store.db.RunWithResponse(b)
	ir.mu.Lock()
	for _, intent := range pushIntents {
		ir.mu.inFlight[*intent.Txn.ID]--
		if ir.mu.inFlight[*intent.Txn.ID] == 0 {
			delete(ir.mu.inFlight, *intent.Txn.ID)
		}
	}
	ir.mu.Unlock()
	if pErr != nil {
		return nil, pErr
	}

	var resolveIntents []roachpb.Intent
	for i, intent := range pushIntents {
		pushee := br.Responses[i].GetInner().(*roachpb.PushTxnResponse).PusheeTxn
		intent.Txn = pushee.TxnMeta
		intent.Status = pushee.Status
		resolveIntents = append(resolveIntents, intent)
	}
	return resolveIntents, nil
}
// resolveIntents resolves the given intents. For those which are
// local to the range, we submit directly to the local Raft instance;
// all non-local intents are resolved asynchronously in a batch. If
// `wait` is true, all operations are carried out synchronously and an
// error is returned. Otherwise, the call returns without error as
// soon as all local resolve commands have been **proposed** (not
// executed). This ensures that if a waiting client retries
// immediately after calling this function, it will not hit the same
// intents again.
func (ir *intentResolver) resolveIntents(ctx context.Context, r *Replica,
	intents []roachpb.Intent, wait bool, poison bool) error {
	// We're doing async stuff below; those need new traces.
	ctx, cleanup := tracing.EnsureContext(ctx, ir.store.Tracer())
	defer cleanup()
	log.Trace(ctx, fmt.Sprintf("resolving intents [wait=%t]", wait))

	var reqsRemote []roachpb.Request
	baLocal := roachpb.BatchRequest{}
	baLocal.Timestamp = ir.store.Clock().Now()
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs roachpb.Request
		var local bool // whether this intent lives on this Range
		{
			if len(intent.EndKey) == 0 {
				resolveArgs = &roachpb.ResolveIntentRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &roachpb.ResolveIntentRangeRequest{
					Span:      intent.Span,
					IntentTxn: intent.Txn,
					Status:    intent.Status,
					Poison:    poison,
				}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if local {
			baLocal.Add(resolveArgs)
		} else {
			reqsRemote = append(reqsRemote, resolveArgs)
		}
	}

	// The local batch goes directly to Raft.
	var wg sync.WaitGroup
	if len(baLocal.Requests) > 0 {
		action := func() error {
			// Trace this under the ID of the intent owner.
			// Create a new span though, since we do not want to pass a span
			// between goroutines or we risk use-after-finish.
			sp := r.store.Tracer().StartSpan("resolve intents")
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
			// Always operate with a timeout when resolving intents: this
			// prevents rare shutdown timeouts in tests.
			ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
			defer cancel()
			_, pErr := r.addWriteCmd(ctxWithTimeout, baLocal, &wg)
			return pErr.GoError()
		}
		wg.Add(1)
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve local intents; %s", err)
			}
		}) {
			// Still run the task when draining. Our caller already has a task and
			// going async here again is merely for performance, but some intents
			// need to be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes back.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Resolve all of the intents which aren't local to the Range.
	if len(reqsRemote) > 0 {
		b := &client.Batch{}
		b.InternalAddRequest(reqsRemote...)
		action := func() error {
			// TODO(tschottdorf): no tracing here yet.
			return r.store.DB().Run(b).GoError()
		}
		if wait || !r.store.Stopper().RunLimitedAsyncTask(ir.sem, func() {
			if err := action(); err != nil {
				log.Warningf("unable to resolve external intents: %s", err)
			}
		}) {
			// As with local intents, try async to not keep the caller waiting, but
			// when draining just go ahead and do it synchronously. See #1684.
			if err := action(); err != nil {
				return err
			}
		}
	}

	// Wait until the local ResolveIntents batch has been submitted to
	// raft. No-op if all were non-local.
	wg.Wait()
	return nil
}
// SnapshotWithContext is main implementation for Snapshot() but it takes a
// context to allow tracing.
func (r *Replica) SnapshotWithContext(ctx context.Context) (raftpb.Snapshot, error) {
	rangeID := r.RangeID

	// If a snapshot is in progress, see if it's ready.
	if r.mu.snapshotChan != nil {
		select {
		case snapData, ok := <-r.mu.snapshotChan:
			if ok {
				return snapData, nil
			}
			// If the old channel was closed, fall through to start a new task.

		default:
			// If the result is not ready, return immediately.
			log.Trace(ctx, "snapshot not yet ready")
			return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
		}
	}

	if r.exceedsDoubleSplitSizeLocked() {
		maxBytes := r.mu.maxBytes
		size := r.mu.state.Stats.Total()
		log.Infof(ctx,
			"%s: not generating snapshot because replica is too large: %d > 2 * %d",
			r, size, maxBytes)
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	// See if there is already a snapshot running for this store.
	if !r.store.AcquireRaftSnapshot() {
		log.Trace(ctx, "snapshot already running")
		return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
	}

	startKey := r.mu.state.Desc.StartKey

	// Use an unbuffered channel so the worker stays alive until someone
	// reads from the channel, and can abandon the snapshot if it gets stale.
	ch := make(chan (raftpb.Snapshot))

	if r.store.Stopper().RunAsyncTask(func() {
		defer close(ch)
		sp := r.store.Tracer().StartSpan("snapshot async")
		ctxInner := opentracing.ContextWithSpan(context.Background(), sp)
		defer sp.Finish()
		snap := r.store.NewSnapshot()
		log.Tracef(ctxInner, "new engine snapshot for replica %s", r)
		defer snap.Close()
		defer r.store.ReleaseRaftSnapshot()
		// Delegate to a static function to make sure that we do not depend
		// on any indirect calls to r.store.Engine() (or other in-memory
		// state of the Replica). Everything must come from the snapshot.
		snapData, err := snapshot(context.Background(), snap, rangeID, r.store.raftEntryCache, startKey)
		if err != nil {
			log.Errorf(ctxInner, "%s: error generating snapshot: %s", r, err)
		} else {
			log.Trace(ctxInner, "snapshot generated")
			r.store.metrics.RangeSnapshotsGenerated.Inc(1)
			select {
			case ch <- snapData:
				log.Trace(ctxInner, "snapshot accepted")
			case <-time.After(r.store.ctx.AsyncSnapshotMaxAge):
				// If raft decides it doesn't need this snapshot any more (or
				// just takes too long to use it), abandon it to save memory.
				log.Infof(ctxInner, "%s: abandoning snapshot after %s", r, r.store.ctx.AsyncSnapshotMaxAge)
			case <-r.store.Stopper().ShouldQuiesce():
			}
		}
	}) == nil {
		r.mu.snapshotChan = ch
	} else {
		r.store.ReleaseRaftSnapshot()
	}

	if r.store.ctx.BlockingSnapshotDuration > 0 {
		select {
		case snap, ok := <-r.mu.snapshotChan:
			if ok {
				return snap, nil
			}
		case <-time.After(r.store.ctx.BlockingSnapshotDuration):
			log.Trace(ctx, "snapshot blocking duration exceeded")
		}
	}
	return raftpb.Snapshot{}, raft.ErrSnapshotTemporarilyUnavailable
}
Exemple #17
0
// Batch implements the roachpb.KVServer interface.
func (n *Node) Batch(ctx context.Context, args *roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
	// TODO(marc): this code is duplicated in kv/db.go, which should be fixed.
	// Also, grpc's authentication model (which gives credential access in the
	// request handler) doesn't really fit with the current design of the
	// security package (which assumes that TLS state is only given at connection
	// time) - that should be fixed.
	if peer, ok := peer.FromContext(ctx); ok {
		if tlsInfo, ok := peer.AuthInfo.(credentials.TLSInfo); ok {
			certUser, err := security.GetCertificateUser(&tlsInfo.State)
			if err != nil {
				return nil, err
			}
			if certUser != security.NodeUser {
				return nil, util.Errorf("user %s is not allowed", certUser)
			}
		}
	}

	var br *roachpb.BatchResponse
	opName := "node " + strconv.Itoa(int(n.Descriptor.NodeID)) // could save allocs here

	fail := func(err error) {
		br = &roachpb.BatchResponse{}
		br.Error = roachpb.NewError(err)
	}

	f := func() {
		sp, err := tracing.JoinOrNew(n.ctx.Tracer, args.Trace, opName)
		if err != nil {
			fail(err)
			return
		}
		// If this is a snowball span, it gets special treatment: It skips the
		// regular tracing machinery, and we instead send the collected spans
		// back with the response. This is more expensive, but then again,
		// those are individual requests traced by users, so they can be.
		if sp.BaggageItem(tracing.Snowball) != "" {
			sp.LogEvent("delegating to snowball tracing")
			sp.Finish()
			if sp, err = tracing.JoinOrNewSnowball(opName, args.Trace, func(rawSpan basictracer.RawSpan) {
				encSp, err := tracing.EncodeRawSpan(&rawSpan, nil)
				if err != nil {
					log.Warning(err)
				}
				br.CollectedSpans = append(br.CollectedSpans, encSp)
			}); err != nil {
				fail(err)
				return
			}
		}
		defer sp.Finish()
		traceCtx := opentracing.ContextWithSpan(n.context(ctx), sp)

		tStart := timeutil.Now()
		var pErr *roachpb.Error
		br, pErr = n.stores.Send(traceCtx, *args)
		if pErr != nil {
			br = &roachpb.BatchResponse{}
			log.Trace(traceCtx, fmt.Sprintf("error: %T", pErr.GetDetail()))
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(n.stores, br))
		}
		n.metrics.callComplete(timeutil.Since(tStart), pErr)
		br.Error = pErr
	}

	if !n.stopper.RunTask(f) {
		return nil, util.Errorf("node %d stopped", n.Descriptor.NodeID)
	}
	return br, nil
}
// Send implements the batch.Sender interface. If the request is part of a
// transaction, the TxnCoordSender adds the transaction to a map of active
// transactions and begins heartbeating it. Every subsequent request for the
// same transaction updates the lastUpdate timestamp to prevent live
// transactions from being considered abandoned and garbage collected.
// Read/write mutating requests have their key or key range added to the
// transaction's interval tree of key ranges for eventual cleanup via resolved
// write intents; they're tagged to an outgoing EndTransaction request, with
// the receiving replica in charge of resolving them.
func (tc *TxnCoordSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	{
		// Start new or pick up active trace and embed its trace metadata into
		// header for use by RPC recipients. From here on, there's always an active
		// Trace, though its overhead is small unless it's sampled.
		sp := opentracing.SpanFromContext(ctx)
		if sp == nil {
			sp = tc.tracer.StartSpan(opTxnCoordSender)
			defer sp.Finish()
			ctx = opentracing.ContextWithSpan(ctx, sp)
		}
		// TODO(tschottdorf): To get rid of the spurious alloc below we need to
		// implement the carrier interface on ba.Header or make Span non-nullable,
		// both of which force all of ba on the Heap. It's already there, so may
		// not be a big deal, but ba should live on the stack. Also not easy to use
		// a buffer pool here since anything that goes into the RPC layer could be
		// used by goroutines we didn't wait for.
		if ba.Header.Trace == nil {
			ba.Header.Trace = &tracing.Span{}
		}
		if err := tc.tracer.Inject(sp.Context(), basictracer.Delegator, ba.Trace); err != nil {
			return nil, roachpb.NewError(err)
		}
	}

	startNS := tc.clock.PhysicalNow()

	if ba.Txn != nil {
		// If this request is part of a transaction...
		if err := tc.maybeBeginTxn(&ba); err != nil {
			return nil, roachpb.NewError(err)
		}
		var et *roachpb.EndTransactionRequest
		var hasET bool
		{
			var rArgs roachpb.Request
			rArgs, hasET = ba.GetArg(roachpb.EndTransaction)
			if hasET {
				et = rArgs.(*roachpb.EndTransactionRequest)
				if len(et.Key) != 0 {
					return nil, roachpb.NewErrorf("EndTransaction must not have a Key set")
				}
				et.Key = ba.Txn.Key
				if len(et.IntentSpans) > 0 {
					// TODO(tschottdorf): it may be useful to allow this later.
					// That would be part of a possible plan to allow txns which
					// write on multiple coordinators.
					return nil, roachpb.NewErrorf("client must not pass intents to EndTransaction")
				}
			}
		}

		if pErr := func() *roachpb.Error {
			tc.Lock()
			defer tc.Unlock()
			if pErr := tc.maybeRejectClientLocked(ctx, *ba.Txn); pErr != nil {
				return pErr
			}

			if !hasET {
				return nil
			}
			// Everything below is carried out only when trying to commit.

			// Populate et.IntentSpans, taking into account both any existing
			// and new writes, and taking care to perform proper deduplication.
			txnMeta := tc.txns[*ba.Txn.ID]
			distinctSpans := true
			if txnMeta != nil {
				et.IntentSpans = txnMeta.keys
				// Defensively set distinctSpans to false if we had any previous
				// requests in this transaction. This effectively limits the distinct
				// spans optimization to 1pc transactions.
				distinctSpans = len(txnMeta.keys) == 0
			}
			ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
				et.IntentSpans = append(et.IntentSpans, roachpb.Span{
					Key:    key,
					EndKey: endKey,
				})
			})
			// TODO(peter): Populate DistinctSpans on all batches, not just batches
			// which contain an EndTransactionRequest.
			ba.Header.DistinctSpans = roachpb.MergeSpans(&et.IntentSpans) && distinctSpans
			if len(et.IntentSpans) == 0 {
				// If there aren't any intents, then there's factually no
				// transaction to end. Read-only txns have all of their state
				// in the client.
				return roachpb.NewErrorf("cannot commit a read-only transaction")
			}
			if txnMeta != nil {
				txnMeta.keys = et.IntentSpans
			}
			return nil
		}(); pErr != nil {
			return nil, pErr
		}

		if hasET && log.V(1) {
			for _, intent := range et.IntentSpans {
				log.Trace(ctx, fmt.Sprintf("intent: [%s,%s)",
					intent.Key, intent.EndKey))
			}
		}
	}

	// Send the command through wrapped sender, taking appropriate measures
	// on error.
	var br *roachpb.BatchResponse
	{
		var pErr *roachpb.Error
		br, pErr = tc.wrapped.Send(ctx, ba)

		if _, ok := pErr.GetDetail().(*roachpb.OpRequiresTxnError); ok {
			// TODO(tschottdorf): needs to keep the trace.
			br, pErr = tc.resendWithTxn(ba)
		}

		if pErr = tc.updateState(startNS, ctx, ba, br, pErr); pErr != nil {
			log.Trace(ctx, fmt.Sprintf("error: %s", pErr))
			return nil, pErr
		}
	}

	if br.Txn == nil {
		return br, nil
	}

	if _, ok := ba.GetArg(roachpb.EndTransaction); !ok {
		return br, nil
	}
	// If the --linearizable flag is set, we want to make sure that
	// all the clocks in the system are past the commit timestamp
	// of the transaction. This is guaranteed if either
	// - the commit timestamp is MaxOffset behind startNS
	// - MaxOffset ns were spent in this function
	// when returning to the client. Below we choose the option
	// that involves less waiting, which is likely the first one
	// unless a transaction commits with an odd timestamp.
	if tsNS := br.Txn.Timestamp.WallTime; startNS > tsNS {
		startNS = tsNS
	}
	sleepNS := tc.clock.MaxOffset() -
		time.Duration(tc.clock.PhysicalNow()-startNS)
	if tc.linearizable && sleepNS > 0 {
		defer func() {
			if log.V(1) {
				log.Infof("%v: waiting %s on EndTransaction for linearizability", br.Txn.ID.Short(), util.TruncateDuration(sleepNS, time.Millisecond))
			}
			time.Sleep(sleepNS)
		}()
	}
	if br.Txn.Status != roachpb.PENDING {
		tc.Lock()
		tc.cleanupTxnLocked(ctx, *br.Txn)
		tc.Unlock()
	}
	return br, nil
}
// Start starts the test cluster by bootstrapping an in-memory store
// (defaults to maximum of 50M). The server is started, launching the
// node RPC server and all HTTP endpoints. Use the value of
// TestServer.Addr after Start() for client connections. Use Stop()
// to shutdown the server after the test completes.
func (ltc *LocalTestCluster) Start(t util.Tester) {
	nodeID := roachpb.NodeID(1)
	nodeDesc := &roachpb.NodeDescriptor{NodeID: nodeID}
	ltc.tester = t
	ltc.Manual = hlc.NewManualClock(0)
	ltc.Clock = hlc.NewClock(ltc.Manual.UnixNano)
	ltc.Stopper = stop.NewStopper()
	rpcContext := rpc.NewContext(testutils.NewNodeTestBaseContext(), ltc.Clock, ltc.Stopper)
	ltc.Gossip = gossip.New(rpcContext, nil, ltc.Stopper)
	ltc.Eng = engine.NewInMem(roachpb.Attributes{}, 50<<20, ltc.Stopper)

	ltc.stores = storage.NewStores(ltc.Clock)
	tracer := tracing.NewTracer()
	var rpcSend rpcSendFn = func(_ SendOptions, _ ReplicaSlice,
		args roachpb.BatchRequest, _ *rpc.Context) (*roachpb.BatchResponse, error) {
		if ltc.Latency > 0 {
			time.Sleep(ltc.Latency)
		}
		sp := tracer.StartSpan("node")
		defer sp.Finish()
		ctx := opentracing.ContextWithSpan(context.Background(), sp)
		log.Trace(ctx, args.String())
		br, pErr := ltc.stores.Send(ctx, args)
		if br == nil {
			br = &roachpb.BatchResponse{}
		}
		if br.Error != nil {
			panic(roachpb.ErrorUnexpectedlySet(ltc.stores, br))
		}
		br.Error = pErr
		if pErr != nil {
			log.Trace(ctx, "error: "+pErr.String())
		}
		return br, nil
	}
	retryOpts := GetDefaultDistSenderRetryOptions()
	retryOpts.Closer = ltc.Stopper.ShouldDrain()
	ltc.distSender = NewDistSender(&DistSenderContext{
		Clock: ltc.Clock,
		RangeDescriptorCacheSize: defaultRangeDescriptorCacheSize,
		RangeLookupMaxRanges:     defaultRangeLookupMaxRanges,
		LeaderCacheSize:          defaultLeaderCacheSize,
		RPCRetryOptions:          &retryOpts,
		nodeDescriptor:           nodeDesc,
		RPCSend:                  rpcSend,    // defined above
		RangeDescriptorDB:        ltc.stores, // for descriptor lookup
	}, ltc.Gossip)

	ltc.Sender = NewTxnCoordSender(ltc.distSender, ltc.Clock, false /* !linearizable */, tracer,
		ltc.Stopper, NewTxnMetrics(metric.NewRegistry()))
	ltc.DB = client.NewDB(ltc.Sender)
	transport := storage.NewDummyRaftTransport()
	ctx := storage.TestStoreContext()
	ctx.Clock = ltc.Clock
	ctx.DB = ltc.DB
	ctx.Gossip = ltc.Gossip
	ctx.Transport = transport
	ctx.Tracer = tracer
	ltc.Store = storage.NewStore(ctx, ltc.Eng, nodeDesc)
	if err := ltc.Store.Bootstrap(roachpb.StoreIdent{NodeID: nodeID, StoreID: 1}, ltc.Stopper); err != nil {
		t.Fatalf("unable to start local test cluster: %s", err)
	}
	ltc.stores.AddStore(ltc.Store)
	if err := ltc.Store.BootstrapRange(nil); err != nil {
		t.Fatalf("unable to start local test cluster: %s", err)
	}
	if err := ltc.Store.Start(ltc.Stopper); err != nil {
		t.Fatalf("unable to start local test cluster: %s", err)
	}
	ltc.Gossip.SetNodeID(nodeDesc.NodeID)
	if err := ltc.Gossip.SetNodeDescriptor(nodeDesc); err != nil {
		t.Fatalf("unable to set node descriptor: %s", err)
	}
}
Exemple #20
0
// Send sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func send(opts SendOptions, replicas ReplicaSlice,
	args roachpb.BatchRequest, rpcContext *rpc.Context) (*roachpb.BatchResponse, error) {

	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1), false)
	}

	done := make(chan batchCall, len(replicas))

	clients := make([]batchClient, 0, len(replicas))
	for _, replica := range replicas {
		conn, err := rpcContext.GRPCDial(replica.NodeDesc.Address.String())
		if err != nil {
			return nil, err
		}
		argsCopy := args
		argsCopy.Replica = replica.ReplicaDescriptor
		clients = append(clients, batchClient{
			remoteAddr: replica.NodeDesc.Address.String(),
			conn:       conn,
			client:     roachpb.NewInternalClient(conn),
			args:       argsCopy,
		})
	}

	// Put known-unhealthy clients last.
	nHealthy, err := splitHealthy(clients)
	if err != nil {
		return nil, err
	}

	var orderedClients []batchClient
	switch opts.Ordering {
	case orderStable:
		orderedClients = clients
	case orderRandom:
		// Randomly permute order, but keep known-unhealthy clients last.
		shuffleClients(clients[:nHealthy])
		shuffleClients(clients[nHealthy:])

		orderedClients = clients
	}
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Send the first request.
	sendOneFn(opts, rpcContext, orderedClients[0], done)
	orderedClients = orderedClients[1:]

	var errors, retryableErrors int

	// Wait for completions.
	var sendNextTimer util.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "timeout, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}

		case call := <-done:
			err := call.err
			if err == nil {
				if log.V(2) {
					log.Infof("successful reply: %+v", call.reply)
				}

				return call.reply, nil
			}

			// Error handling.
			if log.V(1) {
				log.Warningf("error reply: %s", err)
			}

			errors++

			// Since we have a reconnecting client here, disconnect errors are retryable.
			disconnected := err == io.ErrUnexpectedEOF
			if retryErr, ok := err.(retry.Retryable); disconnected || (ok && retryErr.CanRetry()) {
				retryableErrors++
			}

			if remainingNonErrorRPCs := len(replicas) - errors; remainingNonErrorRPCs < 1 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("too many errors encountered (%d of %d total): %v",
						errors, len(clients), err), remainingNonErrorRPCs+retryableErrors >= 1)
			}
			// Send to additional replicas if available.
			if len(orderedClients) > 0 {
				log.Trace(opts.Context, "error, trying next peer")
				sendOneFn(opts, rpcContext, orderedClients[0], done)
				orderedClients = orderedClients[1:]
			}
		}
	}
}
// updateState updates the transaction state in both the success and
// error cases, applying those updates to the corresponding txnMeta
// object when adequate. It also updates certain errors with the
// updated transaction for use by client restarts.
func (tc *TxnCoordSender) updateState(
	startNS int64, ctx context.Context, ba roachpb.BatchRequest,
	br *roachpb.BatchResponse, pErr *roachpb.Error) *roachpb.Error {
	newTxn := &roachpb.Transaction{}
	newTxn.Update(ba.Txn)
	if pErr == nil {
		newTxn.Update(br.Txn)
	} else {
		newTxn.Update(pErr.GetTxn())
	}

	switch t := pErr.GetDetail().(type) {
	case *roachpb.TransactionStatusError:
		// Likely already committed or more obscure errors such as epoch or
		// timestamp regressions; consider txn dead.
		defer tc.cleanupTxn(ctx, *pErr.GetTxn())
	case *roachpb.OpRequiresTxnError:
		panic("OpRequiresTxnError must not happen at this level")
	case *roachpb.ReadWithinUncertaintyIntervalError:
		// If the reader encountered a newer write within the uncertainty
		// interval, we advance the txn's timestamp just past the last observed
		// timestamp from the node.
		restartTS, ok := newTxn.GetObservedTimestamp(pErr.OriginNode)
		if !ok {
			pErr = roachpb.NewError(util.Errorf("no observed timestamp for node %d found on uncertainty restart", pErr.OriginNode))
		} else {
			newTxn.Timestamp.Forward(restartTS)
			newTxn.Restart(ba.UserPriority, newTxn.Priority, newTxn.Timestamp)
		}
	case *roachpb.TransactionAbortedError:
		// Increase timestamp if applicable.
		newTxn.Timestamp.Forward(pErr.GetTxn().Timestamp)
		newTxn.Priority = pErr.GetTxn().Priority
		// Clean up the freshly aborted transaction in defer(), avoiding a
		// race with the state update below.
		defer tc.cleanupTxn(ctx, *newTxn)
	case *roachpb.TransactionPushError:
		// Increase timestamp if applicable, ensuring that we're
		// just ahead of the pushee.
		newTxn.Timestamp.Forward(t.PusheeTxn.Timestamp)
		newTxn.Restart(ba.UserPriority, t.PusheeTxn.Priority-1, newTxn.Timestamp)
	case *roachpb.TransactionRetryError:
		// Increase timestamp so on restart, we're ahead of any timestamp
		// cache entries or newer versions which caused the restart.
		newTxn.Restart(ba.UserPriority, pErr.GetTxn().Priority, newTxn.Timestamp)
	case *roachpb.WriteTooOldError:
		newTxn.Restart(ba.UserPriority, newTxn.Priority, t.ActualTimestamp)
	case nil:
		// Nothing to do here, avoid the default case.
	default:
		if pErr.GetTxn() != nil {
			if pErr.CanRetry() {
				panic("Retryable internal error must not happen at this level")
			} else {
				// Do not clean up the transaction here since the client might still
				// want to continue the transaction. For example, a client might
				// continue its transaction after receiving ConditionFailedError, which
				// can come from a unique index violation.
			}
		}
	}

	if pErr != nil && pErr.GetTxn() != nil {
		// Avoid changing existing errors because sometimes they escape into
		// goroutines and then there are races. Fairly sure there isn't one
		// here, but better safe than sorry.
		pErrShallow := *pErr
		pErrShallow.SetTxn(newTxn)
		pErr = &pErrShallow
	}

	if newTxn.ID == nil {
		return pErr
	}
	txnID := *newTxn.ID
	tc.Lock()
	defer tc.Unlock()
	txnMeta := tc.txns[txnID]
	// For successful transactional requests, keep the written intents and
	// the updated transaction record to be sent along with the reply.
	// The transaction metadata is created with the first writing operation.
	// A tricky edge case is that of a transaction which "fails" on the
	// first writing request, but actually manages to write some intents
	// (for example, due to being multi-range). In this case, there will
	// be an error, but the transaction will be marked as Writing and the
	// coordinator must track the state, for the client's retry will be
	// performed with a Writing transaction which the coordinator rejects
	// unless it is tracking it (on top of it making sense to track it;
	// after all, it **has** laid down intents and only the coordinator
	// can augment a potential EndTransaction call). See #3303.
	var intentGroup interval.RangeGroup
	if txnMeta != nil {
		intentGroup = txnMeta.keys
	} else if pErr == nil || newTxn.Writing {
		intentGroup = interval.NewRangeTree()
	}
	if intentGroup != nil {
		// Adding the intents even on error reduces the likelihood of dangling
		// intents blocking concurrent writers for extended periods of time.
		// See #3346.
		ba.IntentSpanIterate(func(key, endKey roachpb.Key) {
			addKeyRange(intentGroup, key, endKey)
		})

		if txnMeta == nil && intentGroup.Len() > 0 {
			if !newTxn.Writing {
				panic("txn with intents marked as non-writing")
			}
			// If the transaction is already over, there's no point in
			// launching a one-off coordinator which will shut down right
			// away. If we ended up here with an error, we'll always start
			// the coordinator - the transaction has laid down intents, so
			// we expect it to be committed/aborted at some point in the
			// future.
			if _, isEnding := ba.GetArg(roachpb.EndTransaction); pErr != nil || !isEnding {
				log.Trace(ctx, "coordinator spawns")
				txnMeta = &txnMetadata{
					txn:              *newTxn,
					keys:             intentGroup,
					firstUpdateNanos: startNS,
					lastUpdateNanos:  tc.clock.PhysicalNow(),
					timeoutDuration:  tc.clientTimeout,
					txnEnd:           make(chan struct{}),
				}
				tc.txns[txnID] = txnMeta

				if !tc.stopper.RunAsyncTask(func() {
					tc.heartbeatLoop(ctx, txnID)
				}) {
					// The system is already draining and we can't start the
					// heartbeat. We refuse new transactions for now because
					// they're likely not going to have all intents committed.
					// In principle, we can relax this as needed though.
					tc.unregisterTxnLocked(txnID)
					return roachpb.NewError(&roachpb.NodeUnavailableError{})
				}
			} else {
				// If this was a successful one phase commit, update stats
				// directly as they won't otherwise be updated on heartbeat
				// loop shutdown.
				etArgs, ok := br.Responses[len(br.Responses)-1].GetInner().(*roachpb.EndTransactionResponse)
				tc.updateStats(tc.clock.PhysicalNow()-startNS, 0, newTxn.Status, ok && etArgs.OnePhaseCommit)
			}
		}
	}

	// Update our record of this transaction, even on error.
	if txnMeta != nil {
		txnMeta.txn = *newTxn
		if !txnMeta.txn.Writing {
			panic("tracking a non-writing txn")
		}
		txnMeta.setLastUpdate(tc.clock.PhysicalNow())
	}
	if pErr == nil {
		// For successful transactional requests, always send the updated txn
		// record back.
		br.Txn = newTxn
	}
	return pErr
}
Exemple #22
0
// initStores initializes the Stores map from ID to Store. Stores are
// added to the local sender if already bootstrapped. A bootstrapped
// Store has a valid ident with cluster, node and Store IDs set. If
// the Store doesn't yet have a valid ident, it's added to the
// bootstraps list for initialization once the cluster and node IDs
// have been determined.
func (n *Node) initStores(
	ctx context.Context, engines []engine.Engine, stopper *stop.Stopper,
) error {
	var bootstraps []*storage.Store

	if len(engines) == 0 {
		return errors.Errorf("no engines")
	}
	for _, e := range engines {
		s := storage.NewStore(n.ctx, e, &n.Descriptor)
		log.Tracef(ctx, "created store for engine: %s", e)
		// Initialize each store in turn, handling un-bootstrapped errors by
		// adding the store to the bootstraps list.
		if err := s.Start(ctx, stopper); err != nil {
			if _, ok := err.(*storage.NotBootstrappedError); ok {
				log.Infof(ctx, "store %s not bootstrapped", s)
				bootstraps = append(bootstraps, s)
				continue
			}
			return errors.Errorf("failed to start store: %s", err)
		}
		if s.Ident.ClusterID == *uuid.EmptyUUID || s.Ident.NodeID == 0 {
			return errors.Errorf("unidentified store: %s", s)
		}
		capacity, err := s.Capacity()
		if err != nil {
			return errors.Errorf("could not query store capacity: %s", err)
		}
		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
		n.addStore(s)
	}

	// If there are no initialized stores and no gossip resolvers,
	// bootstrap this node as the seed of a new cluster.
	if n.stores.GetStoreCount() == 0 {
		resolvers := n.ctx.Gossip.GetResolvers()
		// Check for the case of uninitialized node having only itself specified as join host.
		switch len(resolvers) {
		case 0:
			return errNeedsBootstrap
		case 1:
			if resolvers[0].Addr() == n.Descriptor.Address.String() {
				return errCannotJoinSelf
			}
		}
	}

	// Verify all initialized stores agree on cluster and node IDs.
	if err := n.validateStores(); err != nil {
		return err
	}
	log.Trace(ctx, "validated stores")

	// Set the stores map as the gossip persistent storage, so that
	// gossip can bootstrap using the most recently persisted set of
	// node addresses.
	if err := n.ctx.Gossip.SetStorage(n.stores); err != nil {
		return fmt.Errorf("failed to initialize the gossip interface: %s", err)
	}

	// Connect gossip before starting bootstrap. For new nodes, connecting
	// to the gossip network is necessary to get the cluster ID.
	n.connectGossip()
	log.Trace(ctx, "connected to gossip")

	// If no NodeID has been assigned yet, allocate a new node ID by
	// supplying 0 to initNodeID.
	if n.Descriptor.NodeID == 0 {
		n.initNodeID(0)
		n.initialBoot = true
		log.Tracef(ctx, "allocated node ID %d", n.Descriptor.NodeID)
	}

	// Bootstrap any uninitialized stores asynchronously.
	if len(bootstraps) > 0 {
		if err := stopper.RunAsyncTask(func() {
			n.bootstrapStores(n.Ctx(), bootstraps, stopper)
		}); err != nil {
			return err
		}
	}

	return nil
}
Exemple #23
0
// sendToReplicas sends one or more RPCs to clients specified by the slice of
// replicas. On success, Send returns the first successful reply. Otherwise,
// Send returns an error if and as soon as the number of failed RPCs exceeds
// the available endpoints less the number of required replies.
func (ds *DistSender) sendToReplicas(
	opts SendOptions,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	args roachpb.BatchRequest,
	rpcContext *rpc.Context,
) (*roachpb.BatchResponse, error) {
	if len(replicas) < 1 {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("insufficient replicas (%d) to satisfy send request of %d",
				len(replicas), 1))
	}

	done := make(chan BatchCall, len(replicas))

	transportFactory := opts.transportFactory
	if transportFactory == nil {
		transportFactory = grpcTransportFactory
	}
	transport, err := transportFactory(opts, rpcContext, replicas, args)
	if err != nil {
		return nil, err
	}
	defer transport.Close()
	if transport.IsExhausted() {
		return nil, roachpb.NewSendError(
			fmt.Sprintf("sending to all %d replicas failed", len(replicas)))
	}

	// Send the first request.
	pending := 1
	transport.SendNext(done)

	// Wait for completions. This loop will retry operations that fail
	// with errors that reflect per-replica state and may succeed on
	// other replicas.
	var sendNextTimer timeutil.Timer
	defer sendNextTimer.Stop()
	for {
		sendNextTimer.Reset(opts.SendNextTimeout)
		select {
		case <-sendNextTimer.C:
			sendNextTimer.Read = true
			// On successive RPC timeouts, send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Trace(opts.Context, "timeout, trying next peer")
				pending++
				transport.SendNext(done)
			}

		case call := <-done:
			pending--
			err := call.Err
			if err == nil {
				if log.V(2) {
					log.Infof(opts.Context, "RPC reply: %+v", call.Reply)
				} else if log.V(1) && call.Reply.Error != nil {
					log.Infof(opts.Context, "application error: %s", call.Reply.Error)
				}

				if !ds.handlePerReplicaError(rangeID, call.Reply.Error) {
					return call.Reply, nil
				}

				// Extract the detail so it can be included in the error
				// message if this is our last replica.
				//
				// TODO(bdarnell): The last error is not necessarily the best
				// one to return; we may want to remember the "best" error
				// we've seen (for example, a NotLeaseHolderError conveys more
				// information than a RangeNotFound).
				err = call.Reply.Error.GoError()
			} else if log.V(1) {
				log.Warningf(opts.Context, "RPC error: %s", err)
			}

			// Send to additional replicas if available.
			if !transport.IsExhausted() {
				log.Tracef(opts.Context, "error, trying next peer: %s", err)
				pending++
				transport.SendNext(done)
			}
			if pending == 0 {
				return nil, roachpb.NewSendError(
					fmt.Sprintf("sending to all %d replicas failed; last error: %v",
						len(replicas), err))
			}
		}
	}
}
Exemple #24
0
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation, and is different from
// contexts used at runtime for server's background work (like `s.Ctx()`).
func (s *Server) Start(ctx context.Context) error {
	// Copy log tags from s.Ctx()
	ctx = log.WithLogTagsFromCtx(ctx, s.Ctx())

	tlsConfig, err := s.ctx.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.

	ln, err := net.Listen("tcp", s.ctx.Addr)
	if err != nil {
		return err
	}
	log.Tracef(ctx, "listening on port %s", s.ctx.Addr)
	unresolvedAddr, err := officialAddr(s.ctx.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.ctx.Addr = unresolvedAddr.String()
	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.ctx.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.ctx.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.ctx.HTTPAddr = unresolvedHTTPAddr.String()

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(s.Ctx(), err)
		}
	})

	if tlsConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			if err := s.pgServer.ServeConn(conn); err != nil && !netutil.IsClosedConnection(err) {
				log.Error(s.Ctx(), err)
			}
		}))
	})

	if len(s.ctx.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.ctx.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(s.Ctx(), err)
			}
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				if err := s.pgServer.ServeConn(conn); err != nil &&
					!netutil.IsClosedConnection(err) {
					log.Error(s.Ctx(), err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAddr)
	log.Trace(ctx, "started gossip")

	if err := s.node.start(ctx, unresolvedAddr, s.ctx.Engines, s.ctx.NodeAttributes); err != nil {
		return err
	}
	log.Trace(ctx, "started node")

	// Set the NodeID in the base context (which was inherited by the
	// various components of the server).
	s.nodeLogTagVal.Set(int64(s.node.Descriptor.NodeID))

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.ctx.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(s.recorder, s.ctx.MetricsSampleInterval, ts.Resolution10s, s.stopper)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.ctx.MetricsSampleInterval)

	s.sqlExecutor.SetNodeID(s.node.Descriptor.NodeID)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := new(sql.SchemaChangeManagerTestingKnobs)
	if s.ctx.TestingKnobs.SQLSchemaChangeManager != nil {
		testingKnobs = s.ctx.TestingKnobs.SQLSchemaChangeManager.(*sql.SchemaChangeManagerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	log.Infof(s.Ctx(), "starting %s server at %s", s.ctx.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(s.Ctx(), "starting grpc/postgres server at %s", unresolvedAddr)
	if len(s.ctx.SocketFile) != 0 {
		log.Infof(s.Ctx(), "starting postgres server at unix:%s", s.ctx.SocketFile)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})
	log.Trace(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &util.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(util.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(util.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(util.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(util.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.Ctx())
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	conn, err := s.rpcContext.GRPCDial(s.ctx.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{&s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminEndpoint, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, s.status)
	s.mux.Handle(healthEndpoint, s.status)
	log.Trace(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(s.Ctx(), "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Trace(ctx, "server ready")

	return nil
}
Exemple #25
0
func (tc *TxnCoordSender) heartbeat(ctx context.Context, txnID uuid.UUID) bool {
	tc.Lock()
	proceed := true
	txnMeta := tc.txns[txnID]
	var intentSpans []roachpb.Span
	// Before we send a heartbeat, determine whether this transaction
	// should be considered abandoned. If so, exit heartbeat.
	if txnMeta.hasClientAbandonedCoord(tc.clock.PhysicalNow()) {
		// TODO(tschottdorf): should we be more proactive here?
		// The client might be continuing the transaction
		// through another coordinator, but in the most likely
		// case it's just gone and the open transaction record
		// could block concurrent operations.
		if log.V(1) {
			log.Infof("transaction %s abandoned; stopping heartbeat",
				txnMeta.txn)
		}
		proceed = false
		// Grab the intents here to avoid potential race.
		intentSpans = collectIntentSpans(txnMeta.keys)
		txnMeta.keys.Clear()
	}
	// txnMeta.txn is possibly replaced concurrently,
	// so grab a copy before unlocking.
	txn := txnMeta.txn.Clone()
	tc.Unlock()

	ba := roachpb.BatchRequest{}
	ba.Txn = &txn

	if !proceed {
		// Actively abort the transaction and its intents since we assume it's abandoned.
		et := &roachpb.EndTransactionRequest{
			Span: roachpb.Span{
				Key: txn.Key,
			},
			Commit:      false,
			IntentSpans: intentSpans,
		}
		ba.Add(et)
		tc.stopper.RunAsyncTask(func() {
			// Use the wrapped sender since the normal Sender
			// does not allow clients to specify intents.
			// TODO(tschottdorf): not using the existing context here since that
			// leads to use-after-finish of the contained trace. Should fork off
			// before the goroutine.
			if _, pErr := tc.wrapped.Send(context.Background(), ba); pErr != nil {
				if log.V(1) {
					log.Warningf("abort due to inactivity failed for %s: %s ", txn, pErr)
				}
			}
		})
		return false
	}

	hb := &roachpb.HeartbeatTxnRequest{
		Now: tc.clock.Now(),
	}
	hb.Key = txn.Key
	ba.Add(hb)

	log.Trace(ctx, "heartbeat")
	_, err := tc.wrapped.Send(ctx, ba)
	// If the transaction is not in pending state, then we can stop
	// the heartbeat. It's either aborted or committed, and we resolve
	// write intents accordingly.
	if err != nil {
		log.Warningf("heartbeat to %s failed: %s", txn, err)
	}
	// TODO(bdarnell): once we have gotten a heartbeat response with
	// Status != PENDING, future heartbeats are useless. However, we
	// need to continue the heartbeatLoop until the client either
	// commits or abandons the transaction. We could save a little
	// pointless work by restructuring this loop to stop sending
	// heartbeats between the time that the transaction is aborted and
	// the client finds out. Furthermore, we could use this information
	// to send TransactionAbortedErrors to the client so it can restart
	// immediately instead of running until its EndTransaction.
	return true
}
Exemple #26
0
// lookupRangeDescriptorInternal is called from LookupRangeDescriptor or from tests.
//
// If a WaitGroup is supplied, it is signaled when the request is
// added to the inflight request map (with or without merging) or the
// function finishes. Used for testing.
func (rdc *rangeDescriptorCache) lookupRangeDescriptorInternal(
	ctx context.Context,
	key roachpb.RKey,
	evictToken *evictionToken,
	considerIntents bool,
	useReverseScan bool,
	wg *sync.WaitGroup,
) (*roachpb.RangeDescriptor, *evictionToken, error) {
	rdc.rangeCache.RLock()
	doneWg := func() {
		if wg != nil {
			wg.Done()
		}
		wg = nil
	}
	defer doneWg()

	if _, desc, err := rdc.getCachedRangeDescriptorLocked(key, useReverseScan); err != nil {
		rdc.rangeCache.RUnlock()
		return nil, nil, err
	} else if desc != nil {
		rdc.rangeCache.RUnlock()
		returnToken := rdc.makeEvictionToken(desc, func() error {
			return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan)
		})
		log.Trace(ctx, "looked up range descriptor from cache")
		return desc, returnToken, nil
	}

	if log.V(3) {
		log.Infof(ctx, "lookup range descriptor: key=%s\n%s", key, rdc.stringLocked())
	} else if log.V(2) {
		log.Infof(ctx, "lookup range descriptor: key=%s", key)
	}

	var res lookupResult
	requestKey := makeLookupRequestKey(key, evictToken, considerIntents, useReverseScan)
	rdc.lookupRequests.Lock()
	if req, inflight := rdc.lookupRequests.inflight[requestKey]; inflight {
		resC := make(chan lookupResult, 1)
		req.observers = append(req.observers, resC)
		rdc.lookupRequests.inflight[requestKey] = req
		rdc.lookupRequests.Unlock()
		rdc.rangeCache.RUnlock()
		doneWg()

		res = <-resC
		log.Trace(ctx, "looked up range descriptor with shared request")
	} else {
		rdc.lookupRequests.inflight[requestKey] = req
		rdc.lookupRequests.Unlock()
		rdc.rangeCache.RUnlock()
		doneWg()

		rs, preRs, err := rdc.performRangeLookup(ctx, key, considerIntents, useReverseScan)
		if err != nil {
			res = lookupResult{err: err}
		} else {
			switch len(rs) {
			case 0:
				res = lookupResult{err: fmt.Errorf("no range descriptors returned for %s", key)}
			case 1:
				desc := &rs[0]
				res = lookupResult{
					desc: desc,
					evictToken: rdc.makeEvictionToken(desc, func() error {
						return rdc.evictCachedRangeDescriptorLocked(key, desc, useReverseScan)
					}),
				}
			case 2:
				if !considerIntents {
					panic(fmt.Sprintf("more than 1 matching range descriptor returned for %s when not considering intents: %v", key, rs))
				}
				desc := &rs[0]
				nextDesc := rs[1]
				res = lookupResult{
					desc: desc,
					evictToken: rdc.makeEvictionToken(desc, func() error {
						return rdc.insertRangeDescriptorsLocked(nextDesc)
					}),
				}
			default:
				panic(fmt.Sprintf("more than 2 matching range descriptors returned for %s: %v", key, rs))
			}
		}

		// We want to be assured that all goroutines which experienced a cache miss
		// have joined our in-flight request, and all others will experience a
		// cache hit. This requires atomicity across cache population and
		// notification, hence this exclusive lock.
		rdc.rangeCache.Lock()
		if res.err == nil {
			// These need to be separate because we need to preserve the pointer to rs[0]
			// so that the seenDesc logic works correctly in EvictCachedRangeDescriptor. An
			// append could cause a copy, which would change the address of rs[0]. We insert
			// the prefetched descriptors first to avoid any unintended overwriting.
			if err := rdc.insertRangeDescriptorsLocked(preRs...); err != nil {
				log.Warningf(ctx, "range cache inserting prefetched descriptors failed: %v", err)
			}
			if err := rdc.insertRangeDescriptorsLocked(rs...); err != nil {
				res = lookupResult{err: err}
			}
		}

		// rdc.lookupRequests does not need to be locked here because we hold an exclusive
		// write lock on rdc.rangeCache. However, we do anyway for clarity and future proofing.
		rdc.lookupRequests.Lock()
		for _, observer := range rdc.lookupRequests.inflight[requestKey].observers {
			observer <- res
		}
		delete(rdc.lookupRequests.inflight, requestKey)
		rdc.lookupRequests.Unlock()
		rdc.rangeCache.Unlock()
		log.Trace(ctx, "looked up range descriptor")
	}

	// It rarely may be possible that we somehow got grouped in with the
	// wrong RangeLookup (eg. from a double split), so if we did, return
	// a retryable lookupMismatchError with an unmodified eviction token.
	if res.desc != nil {
		if (!useReverseScan && !res.desc.ContainsKey(key)) || (useReverseScan && !res.desc.ContainsExclusiveEndKey(key)) {
			return nil, evictToken, lookupMismatchError{
				desiredKey:     key,
				mismatchedDesc: res.desc,
			}
		}
	}
	return res.desc, res.evictToken, res.err
}
Exemple #27
0
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc). The parameters and return values
// correspond to client.Sender with the exception of the returned boolean,
// which is true when indicating that the caller should retry but needs to send
// EndTransaction in a separate request.
func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) {
	isReverse := ba.IsReverse()

	ctx, cleanup := tracing.EnsureContext(ctx, ds.Tracer)
	defer cleanup()

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	rs, err := keys.Range(ba)
	if err != nil {
		return nil, roachpb.NewError(err), false
	}
	var br *roachpb.BatchResponse

	// Send the request to one range per iteration.
	for {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to for
		// each RPC individually). On RPC errors, there's no guarantee that
		// the request hasn't made its way to the target regardless of the
		// error; we'd like the second execution to be caught by the sequence
		// cache if that happens. There is a small chance that that we address
		// a range twice in this chunk (stale/suboptimal descriptors due to
		// splits/merges) which leads to a transaction retry.
		// TODO(tschottdorf): it's possible that if we don't evict from the
		//   cache we could be in for a busy loop.
		ba.SetNewRequest()

		var curReply *roachpb.BatchResponse
		var desc *roachpb.RangeDescriptor
		var evictToken evictionToken
		var needAnother bool
		var pErr *roachpb.Error
		var finished bool
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			log.Trace(ctx, "meta descriptor lookup")
			desc, needAnother, evictToken, pErr = ds.getDescriptors(rs, evictToken, isReverse)

			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if pErr != nil {
				log.Trace(ctx, "range descriptor lookup failed: "+pErr.String())
				if pErr.Retryable {
					if log.V(1) {
						log.Warning(pErr)
					}
					continue
				}
				break
			} else {
				log.Trace(ctx, "looked up range descriptor")
			}

			if needAnother && br == nil {
				// TODO(tschottdorf): we should have a mechanism for discovering
				// range merges (descriptor staleness will mostly go unnoticed),
				// or we'll be turning single-range queries into multi-range
				// queries for no good reason.

				// If there's no transaction and op spans ranges, possibly
				// re-run as part of a transaction for consistency. The
				// case where we don't need to re-run is if the read
				// consistency is not required.
				if ba.Txn == nil && ba.IsPossibleTransaction() &&
					ba.ReadConsistency != roachpb.INCONSISTENT {
					return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false
				}
				// If the request is more than but ends with EndTransaction, we
				// want the caller to come again with the EndTransaction in an
				// extra call.
				if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
					return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */
				}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example revscan [a,g), first desc lookup for "g"
			// returns descriptor [c,d) -> [d,g) is never scanned.
			// We evict and retry in such a case.
			includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool {
				if isReverse {
					// This approach is needed because rs.EndKey is exclusive.
					return desc.ContainsKeyRange(desc.StartKey, rs.EndKey)
				}
				return desc.ContainsKey(rs.Key)
			}
			if !includesFrontOfCurSpan(desc) {
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				continue
			}

			curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) {
				// Truncate the request to our current key range.
				intersected, iErr := rs.Intersect(desc)
				if iErr != nil {
					return nil, roachpb.NewError(iErr)
				}
				truncBA, numActive, trErr := truncate(ba, intersected)
				if numActive == 0 && trErr == nil {
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s",
						rs.Key, rs.EndKey, ba)
				}
				if trErr != nil {
					return nil, roachpb.NewError(trErr)
				}
				return ds.sendSingleRange(ctx, truncBA, desc)
			}()
			// If sending succeeded, break this loop.
			if pErr == nil {
				finished = true
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", ba, pErr)
			}
			log.Trace(ctx, fmt.Sprintf("reply error: %T", pErr.GetDetail()))

			// Error handling below.
			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := pErr.GetDetail().(type) {
			case *roachpb.SendError:
				// For an RPC error to occur, we must've been unable to contact
				// any replicas. In this case, likely all nodes are down (or
				// not getting back to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date
				// replicas, so clearing the descriptor here should be a good
				// idea.
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				if tErr.CanRetry() {
					continue
				}
			case *roachpb.RangeNotFoundError:
				// Range descriptor might be out of date - evict it. This is
				// likely the result of a rebalance.
				if err := evictToken.Evict(); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				continue
			case *roachpb.RangeKeyMismatchError:
				// Range descriptor might be out of date - evict it. This is
				// likely the result of a range split. If we have new range
				// descriptors, insert them instead as long as they are different
				// from the last descriptor to avoid endless loops.
				var replacements []roachpb.RangeDescriptor
				different := func(rd *roachpb.RangeDescriptor) bool {
					return !desc.RSpan().Equal(rd.RSpan())
				}
				if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
					replacements = append(replacements, *tErr.MismatchedRange)
				}
				if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
					if includesFrontOfCurSpan(tErr.SuggestedRange) {
						replacements = append(replacements, *tErr.SuggestedRange)

					}
				}
				// Same as Evict() if replacements is empty.
				if err := evictToken.EvictAndReplace(replacements...); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(tErr)
				}
				continue
			case *roachpb.NotLeaderError:
				newLeader := tErr.Leader
				if newLeader != nil {
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale range descriptor;
					// evict cache.
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						if err := evictToken.Evict(); err != nil {
							return nil, roachpb.NewError(err), false
						}
					}
				} else {
					// If the new leader is unknown, we were talking to a
					// replica that is partitioned away from the majority. Our
					// range descriptor may be stale, so clear the cache.
					//
					// TODO(bdarnell): An unknown-leader error doesn't
					// necessarily mean our descriptor is stale. Ideally we
					// would treat these errors more like SendError: retry on
					// another node (at a lower level), and then if it reaches
					// this level then we know we've exhausted our options and
					// must clear the cache.
					if err := evictToken.Evict(); err != nil {
						return nil, roachpb.NewError(err), false
					}
					newLeader = &roachpb.ReplicaDescriptor{}
				}
				// Next, cache the new leader.
				ds.updateLeaderCache(roachpb.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(tErr)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(tErr)
					}
					continue
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if pErr != nil {
			return nil, pErr, false
		} else if !finished {
			select {
			case <-ds.rpcRetryOptions.Closer:
				return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false
			default:
				log.Fatal("exited retry loop with nil error but finished=false")
			}
		}

		ba.Txn.Update(curReply.Txn)

		if br == nil {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				return nil, roachpb.NewError(err), false
			}
		}

		if ba.MaxScanResults > 0 {
			// Count how many results we received.
			var numResults int64
			for _, resp := range curReply.Responses {
				if cResp, ok := resp.GetInner().(roachpb.Countable); ok {
					numResults += cResp.Count()
				}
			}
			if numResults > ba.MaxScanResults {
				panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxScanResults))
			}
			ba.MaxScanResults -= numResults
			if ba.MaxScanResults == 0 {
				// We are done with this batch. Some requests might have NoopResponses; we must
				// replace them with empty responses of the proper type.
				for i, req := range ba.Requests {
					if _, ok := br.Responses[i].GetInner().(*roachpb.NoopResponse); !ok {
						continue
					}
					union := roachpb.ResponseUnion{}
					var reply roachpb.Response
					if _, ok := req.GetInner().(*roachpb.ScanRequest); ok {
						reply = &roachpb.ScanResponse{}
					} else {
						_ = req.GetInner().(*roachpb.ReverseScanRequest)
						reply = &roachpb.ReverseScanResponse{}
					}
					union.MustSetInner(reply)
					br.Responses[i] = union
				}
				return br, nil, false
			}
		}

		// If this request has a bound (such as MaxResults in
		// ScanRequest) and we are going to query at least one more range,
		// check whether enough rows have been retrieved.
		// TODO(tschottdorf): need tests for executing a multi-range batch
		// with various bounded requests which saturate at different times.
		if needAnother {
			// Start with the assumption that all requests are saturated.
			// Below, we look at each and decide whether that's true.
			// Everything that is indeed saturated is "masked out" from the
			// batch request; only if that's all requests does needAnother
			// remain false.
			needAnother = false
			if br == nil {
				// Clone ba.Requests. This is because we're multi-range, and
				// some requests may be bounded, which could lead to them being
				// masked out once they're saturated. We don't want to risk
				// removing requests that way in the "master copy" since that
				// could lead to omitting requests in certain retry scenarios.
				ba.Requests = append([]roachpb.RequestUnion(nil), ba.Requests...)
			}
			for i, union := range ba.Requests {
				args := union.GetInner()
				if _, ok := args.(*roachpb.NoopRequest); ok {
					// NoopRequests are skipped.
					continue
				}
				boundedArg, ok := args.(roachpb.Bounded)
				if !ok {
					// Non-bounded request. We will have to query all ranges.
					needAnother = true
					continue
				}
				prevBound := boundedArg.GetBound()
				cReply, ok := curReply.Responses[i].GetInner().(roachpb.Countable)
				if !ok || prevBound <= 0 {
					// Request bounded, but without max results. Again, will
					// need to query everything we can. The case in which the reply
					// isn't countable occurs when the request wasn't active for
					// that range (since it didn't apply to it), so the response
					// is a NoopResponse.
					needAnother = true
					continue
				}
				nextBound := prevBound - cReply.Count()
				if nextBound <= 0 {
					// We've hit max results for this piece of the batch. Mask
					// it out (we've copied the requests slice above, so this
					// is kosher).
					union := &ba.Requests[i] // avoid working on copy
					union.MustSetInner(&noopRequest)
					continue
				}
				// The request isn't saturated yet.
				needAnother = true
				boundedArg.SetBound(nextBound)
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil, false
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			rs.EndKey, err = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			rs.Key, err = next(ba, desc.EndKey)
		}
		if err != nil {
			return nil, roachpb.NewError(err), false
		}
		log.Trace(ctx, "querying next range")
	}
}
func (rq *replicateQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	repl *Replica,
	sysCfg config.SystemConfig,
) error {
	desc := repl.Desc()
	// Find the zone config for this range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return err
	}
	action, _ := rq.allocator.ComputeAction(zone, desc)

	// Avoid taking action if the range has too many dead replicas to make
	// quorum.
	deadReplicas := rq.allocator.storePool.deadReplicas(repl.RangeID, desc.Replicas)
	quorum := computeQuorum(len(desc.Replicas))
	liveReplicaCount := len(desc.Replicas) - len(deadReplicas)
	if liveReplicaCount < quorum {
		return errors.Errorf("range requires a replication change, but lacks a quorum of live nodes.")
	}

	switch action {
	case AllocatorAdd:
		log.Trace(ctx, "adding a new replica")
		newStore, err := rq.allocator.AllocateTarget(zone.ReplicaAttrs[0], desc.Replicas, true)
		if err != nil {
			return err
		}
		newReplica := roachpb.ReplicaDescriptor{
			NodeID:  newStore.Node.NodeID,
			StoreID: newStore.StoreID,
		}

		log.VTracef(1, ctx, "%s: adding replica to %+v due to under-replication", repl, newReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, newReplica, desc); err != nil {
			return err
		}
	case AllocatorRemove:
		log.Trace(ctx, "removing a replica")
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		removeReplica, err := rq.allocator.RemoveTarget(desc.Replicas, repl.store.StoreID())
		if err != nil {
			return err
		}
		log.VTracef(1, ctx, "%s: removing replica %+v due to over-replication", repl, removeReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, removeReplica, desc); err != nil {
			return err
		}
		// Do not requeue if we removed ourselves.
		if removeReplica.StoreID == repl.store.StoreID() {
			return nil
		}
	case AllocatorRemoveDead:
		log.Trace(ctx, "removing a dead replica")
		if len(deadReplicas) == 0 {
			if log.V(1) {
				log.Warningf(ctx, "Range of replica %s was identified as having dead replicas, but no dead replicas were found.", repl)
			}
			break
		}
		deadReplica := deadReplicas[0]
		log.VTracef(1, ctx, "%s: removing dead replica %+v from store", repl, deadReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.REMOVE_REPLICA, deadReplica, desc); err != nil {
			return err
		}
	case AllocatorNoop:
		log.Trace(ctx, "considering a rebalance")
		// The Noop case will result if this replica was queued in order to
		// rebalance. Attempt to find a rebalancing target.
		//
		// We require the lease in order to process replicas, so
		// repl.store.StoreID() corresponds to the lease-holder's store ID.
		rebalanceStore := rq.allocator.RebalanceTarget(
			zone.ReplicaAttrs[0], desc.Replicas, repl.store.StoreID())
		if rebalanceStore == nil {
			log.VTracef(1, ctx, "%s: no suitable rebalance target", repl)
			// No action was necessary and no rebalance target was found. Return
			// without re-queuing this replica.
			return nil
		}
		rebalanceReplica := roachpb.ReplicaDescriptor{
			NodeID:  rebalanceStore.Node.NodeID,
			StoreID: rebalanceStore.StoreID,
		}
		log.VTracef(1, ctx, "%s: rebalancing to %+v", repl, rebalanceReplica)
		if err = repl.ChangeReplicas(ctx, roachpb.ADD_REPLICA, rebalanceReplica, desc); err != nil {
			return err
		}
	}

	// Enqueue this replica again to see if there are more changes to be made.
	rq.MaybeAdd(repl, rq.clock.Now())
	return nil
}
Exemple #29
0
// sendChunk is in charge of sending an "admissible" piece of batch, i.e. one
// which doesn't need to be subdivided further before going to a range (so no
// mixing of forward and reverse scans, etc). The parameters and return values
// correspond to client.Sender with the exception of the returned boolean,
// which is true when indicating that the caller should retry but needs to send
// EndTransaction in a separate request.
func (ds *DistSender) sendChunk(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error, bool) {
	isReverse := ba.IsReverse()

	// TODO(radu): when contexts are properly plumbed, we should be able to get
	// the tracer from ctx, not from the DistSender.
	ctx, cleanup := tracing.EnsureContext(ctx, tracing.TracerFromCtx(ds.Ctx))
	defer cleanup()

	// The minimal key range encompassing all requests contained within.
	// Local addressing has already been resolved.
	// TODO(tschottdorf): consider rudimentary validation of the batch here
	// (for example, non-range requests with EndKey, or empty key ranges).
	rs, err := keys.Range(ba)
	if err != nil {
		return nil, roachpb.NewError(err), false
	}
	var br *roachpb.BatchResponse

	// Send the request to one range per iteration.
	for {
		// Increase the sequence counter only once before sending RPCs to
		// the ranges involved in this chunk of the batch (as opposed to for
		// each RPC individually). On RPC errors, there's no guarantee that
		// the request hasn't made its way to the target regardless of the
		// error; we'd like the second execution to be caught by the sequence
		// cache if that happens. There is a small chance that that we address
		// a range twice in this chunk (stale/suboptimal descriptors due to
		// splits/merges) which leads to a transaction retry.
		// TODO(tschottdorf): it's possible that if we don't evict from the
		//   cache we could be in for a busy loop.
		ba.SetNewRequest()

		var curReply *roachpb.BatchResponse
		var desc *roachpb.RangeDescriptor
		var evictToken *evictionToken
		var needAnother bool
		var pErr *roachpb.Error
		var finished bool
		var numAttempts int
		for r := retry.StartWithCtx(ctx, ds.rpcRetryOptions); r.Next(); {
			numAttempts++
			{
				const magicLogCurAttempt = 20

				var seq int32
				if ba.Txn != nil {
					seq = ba.Txn.Sequence
				}

				if numAttempts%magicLogCurAttempt == 0 || seq%magicLogCurAttempt == 0 {
					// Log a message if a request appears to get stuck for a long
					// time or, potentially, forever. See #8975.
					// The local counter captures this loop here; the Sequence number
					// should capture anything higher up (as it needs to be
					// incremented every time this method is called).
					log.Warningf(
						ctx,
						"%d retries for an RPC at sequence %d, last error was: %s, remaining key ranges %s: %s",
						numAttempts, seq, pErr, rs, ba,
					)
				}
			}
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			log.Trace(ctx, "meta descriptor lookup")
			var err error
			desc, needAnother, evictToken, err = ds.getDescriptors(ctx, rs, evictToken, isReverse)

			// getDescriptors may fail retryably if, for example, the first
			// range isn't available via Gossip. Assume that all errors at
			// this level are retryable. Non-retryable errors would be for
			// things like malformed requests which we should have checked
			// for before reaching this point.
			if err != nil {
				log.Trace(ctx, "range descriptor lookup failed: "+err.Error())
				if log.V(1) {
					log.Warning(ctx, err)
				}
				pErr = roachpb.NewError(err)
				continue
			}

			if needAnother && br == nil {
				// TODO(tschottdorf): we should have a mechanism for discovering
				// range merges (descriptor staleness will mostly go unnoticed),
				// or we'll be turning single-range queries into multi-range
				// queries for no good reason.

				// If there's no transaction and op spans ranges, possibly
				// re-run as part of a transaction for consistency. The
				// case where we don't need to re-run is if the read
				// consistency is not required.
				if ba.Txn == nil && ba.IsPossibleTransaction() &&
					ba.ReadConsistency != roachpb.INCONSISTENT {
					return nil, roachpb.NewError(&roachpb.OpRequiresTxnError{}), false
				}
				// If the request is more than but ends with EndTransaction, we
				// want the caller to come again with the EndTransaction in an
				// extra call.
				if l := len(ba.Requests) - 1; l > 0 && ba.Requests[l].GetInner().Method() == roachpb.EndTransaction {
					return nil, roachpb.NewError(errors.New("cannot send 1PC txn to multiple ranges")), true /* shouldSplitET */
				}
			}

			// It's possible that the returned descriptor misses parts of the
			// keys it's supposed to scan after it's truncated to match the
			// descriptor. Example revscan [a,g), first desc lookup for "g"
			// returns descriptor [c,d) -> [d,g) is never scanned.
			// We evict and retry in such a case.
			includesFrontOfCurSpan := func(rd *roachpb.RangeDescriptor) bool {
				if isReverse {
					return desc.ContainsExclusiveEndKey(rs.EndKey)
				}
				return desc.ContainsKey(rs.Key)
			}
			if !includesFrontOfCurSpan(desc) {
				if err := evictToken.Evict(ctx); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				continue
			}

			curReply, pErr = func() (*roachpb.BatchResponse, *roachpb.Error) {
				// Truncate the request to our current key range.
				intersected, iErr := rs.Intersect(desc)
				if iErr != nil {
					return nil, roachpb.NewError(iErr)
				}
				truncBA, numActive, trErr := truncate(ba, intersected)
				if numActive == 0 && trErr == nil {
					// This shouldn't happen in the wild, but some tests
					// exercise it.
					return nil, roachpb.NewErrorf("truncation resulted in empty batch on [%s,%s): %s",
						rs.Key, rs.EndKey, ba)
				}
				if trErr != nil {
					return nil, roachpb.NewError(trErr)
				}
				return ds.sendSingleRange(ctx, truncBA, desc)
			}()
			// If sending succeeded, break this loop.
			if pErr == nil {
				finished = true
				break
			}

			log.VTracef(1, ctx, "reply error %s: %s", ba, pErr)

			// Error handling: If the error indicates that our range
			// descriptor is out of date, evict it from the cache and try
			// again. Errors that apply only to a single replica were
			// handled in send().
			//
			// TODO(bdarnell): Don't retry endlessly. If we fail twice in a
			// row and the range descriptor hasn't changed, return the error
			// to our caller.
			switch tErr := pErr.GetDetail().(type) {
			case *roachpb.SendError:
				// We've tried all the replicas without success. Either
				// they're all down, or we're using an out-of-date range
				// descriptor. Invalidate the cache and try again with the new
				// metadata.
				if err := evictToken.Evict(ctx); err != nil {
					return nil, roachpb.NewError(err), false
				}
				continue
			case *roachpb.RangeKeyMismatchError:
				// Range descriptor might be out of date - evict it. This is
				// likely the result of a range split. If we have new range
				// descriptors, insert them instead as long as they are different
				// from the last descriptor to avoid endless loops.
				var replacements []roachpb.RangeDescriptor
				different := func(rd *roachpb.RangeDescriptor) bool {
					return !desc.RSpan().Equal(rd.RSpan())
				}
				if tErr.MismatchedRange != nil && different(tErr.MismatchedRange) {
					replacements = append(replacements, *tErr.MismatchedRange)
				}
				if tErr.SuggestedRange != nil && different(tErr.SuggestedRange) {
					if includesFrontOfCurSpan(tErr.SuggestedRange) {
						replacements = append(replacements, *tErr.SuggestedRange)

					}
				}
				// Same as Evict() if replacements is empty.
				if err := evictToken.EvictAndReplace(ctx, replacements...); err != nil {
					return nil, roachpb.NewError(err), false
				}
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(ctx, tErr)
				}
				continue
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		if pErr != nil {
			return nil, pErr, false
		} else if !finished {
			select {
			case <-ds.rpcRetryOptions.Closer:
				return nil, roachpb.NewError(&roachpb.NodeUnavailableError{}), false
			case <-ctx.Done():
				return nil, roachpb.NewError(ctx.Err()), false
			default:
				log.Fatal(ctx, "exited retry loop with nil error but finished=false")
			}
		}

		ba.UpdateTxn(curReply.Txn)

		if br == nil {
			// First response from a Range.
			br = curReply
		} else {
			// This was the second or later call in a cross-Range request.
			// Combine the new response with the existing one.
			if err := br.Combine(curReply); err != nil {
				return nil, roachpb.NewError(err), false
			}
		}

		if isReverse {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one since that doesn't have bugs when
			// stale descriptors come into play.
			rs.EndKey, err = prev(ba, desc.StartKey)
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			rs.Key, err = next(ba, desc.EndKey)
		}
		if err != nil {
			return nil, roachpb.NewError(err), false
		}

		if ba.MaxSpanRequestKeys > 0 {
			// Count how many results we received.
			var numResults int64
			for _, resp := range curReply.Responses {
				numResults += resp.GetInner().Header().NumKeys
			}
			if numResults > ba.MaxSpanRequestKeys {
				panic(fmt.Sprintf("received %d results, limit was %d", numResults, ba.MaxSpanRequestKeys))
			}
			ba.MaxSpanRequestKeys -= numResults
			if ba.MaxSpanRequestKeys == 0 {
				// prepare the batch response after meeting the max key limit.
				fillSkippedResponses(ba, br, rs)
				// done, exit loop.
				return br, nil, false
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if !needAnother {
			return br, nil, false
		}

		// key cannot be less that the end key.
		if !rs.Key.Less(rs.EndKey) {
			panic(fmt.Sprintf("start key %s is less than %s", rs.Key, rs.EndKey))
		}

		log.Trace(ctx, "querying next range")
	}
}
// process performs a consistent lookup on the range descriptor to see if we are
// still a member of the range.
func (q *replicaGCQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	rng *Replica,
	_ config.SystemConfig,
) error {
	// Note that the Replicas field of desc is probably out of date, so
	// we should only use `desc` for its static fields like RangeID and
	// StartKey (and avoid rng.GetReplica() for the same reason).
	desc := rng.Desc()

	// Calls to RangeLookup typically use inconsistent reads, but we
	// want to do a consistent read here. This is important when we are
	// considering one of the metadata ranges: we must not do an
	// inconsistent lookup in our own copy of the range.
	b := &client.Batch{}
	b.AddRawRequest(&roachpb.RangeLookupRequest{
		Span: roachpb.Span{
			Key: keys.RangeMetaKey(desc.StartKey),
		},
		MaxRanges: 1,
	})
	if err := q.db.Run(b); err != nil {
		return err
	}
	br := b.RawResponse()
	reply := br.Responses[0].GetInner().(*roachpb.RangeLookupResponse)

	if len(reply.Ranges) != 1 {
		return errors.Errorf("expected 1 range descriptor, got %d", len(reply.Ranges))
	}

	replyDesc := reply.Ranges[0]
	if _, currentMember := replyDesc.GetReplicaDescriptor(rng.store.StoreID()); !currentMember {
		// We are no longer a member of this range; clean up our local data.
		if log.V(1) {
			log.Infof("destroying local data from range %d", desc.RangeID)
		}
		log.Trace(ctx, "destroying local data")
		if err := rng.store.RemoveReplica(rng, replyDesc, true); err != nil {
			return err
		}
	} else if desc.RangeID != replyDesc.RangeID {
		// If we get a different range ID back, then the range has been merged
		// away. But currentMember is true, so we are still a member of the
		// subsuming range. Shut down raft processing for the former range
		// and delete any remaining metadata, but do not delete the data.
		if log.V(1) {
			log.Infof("removing merged range %d", desc.RangeID)
		}
		log.Trace(ctx, "removing merged range")
		if err := rng.store.RemoveReplica(rng, replyDesc, false); err != nil {
			return err
		}

		// TODO(bdarnell): remove raft logs and other metadata (while leaving a
		// tombstone). Add tests for GC of merged ranges.
	} else {
		// This replica is a current member of the raft group. Set the last replica
		// GC check time to avoid re-processing for another check interval.
		if err := rng.setLastReplicaGCTimestamp(now); err != nil {
			return err
		}
	}

	return nil
}