Beispiel #1
0
// Send forwards the call to the single store. This is a poor man's
// version of kv.TxnCoordSender, but it serves the purposes of
// supporting tests in this package. Transactions are not supported.
// Since kv/ depends on storage/, we can't get access to a
// TxnCoordSender from here.
// TODO(tschottdorf): {kv->storage}.LocalSender
func (db *testSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if et, ok := ba.GetArg(roachpb.EndTransaction); ok {
		return nil, roachpb.NewError(util.Errorf("%s method not supported", et.Method()))
	}
	// Lookup range and direct request.
	key, endKey := keys.Range(ba)
	rng := db.store.LookupReplica(key, endKey)
	if rng == nil {
		return nil, roachpb.NewError(roachpb.NewRangeKeyMismatchError(key, endKey, nil))
	}
	ba.RangeID = rng.Desc().RangeID
	replica := rng.GetReplica()
	if replica == nil {
		return nil, roachpb.NewError(util.Errorf("own replica missing in range"))
	}
	ba.Replica = *replica
	br, pErr := db.store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(db.store, br))
	}
	if pErr != nil {
		return nil, pErr
	}
	return br, nil
}
Beispiel #2
0
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. Returns an RPC error if the request could not be sent. Note
// that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
// The replicas are assume to have been ordered by preference, closer ones (if
// any) at the front.
func (ds *DistSender) sendRPC(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicas ReplicaSlice,
	ba roachpb.BatchRequest,
) (*roachpb.BatchResponse, error) {
	if len(replicas) == 0 {
		return nil, noNodeAddrsAvailError{}
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		SendNextTimeout:  ds.sendNextTimeout,
		Timeout:          base.NetworkTimeout,
		Context:          ctx,
		transportFactory: ds.transportFactory,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.sendToReplicas(rpcOpts, rangeID, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, err
	}
	return reply, nil
}
Beispiel #3
0
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via Send, with requirement that one RPC to a server
// must succeed. Returns an RPC error if the request could not be sent. Note
// that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(trace opentracing.Span, rangeID roachpb.RangeID, replicas ReplicaSlice,
	order orderingPolicy, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewError(noNodeAddrsAvailError{})
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := SendOptions{
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         rpc.DefaultRPCTimeout,
		Trace:           trace,
	}
	tracing.AnnotateTrace()
	defer tracing.AnnotateTrace()

	reply, err := ds.rpcSend(rpcOpts, replicas, ba, ds.rpcContext)
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	return reply.(*roachpb.BatchResponse), nil
}
Beispiel #4
0
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	var store *Store
	var err error

	// If we aren't given a Replica, then a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, err = ls.lookupReplica(rs.Key, rs.EndKey)
		if err == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx,
		log.RangeID, ba.RangeID)

	if err == nil {
		store, err = ls.GetStore(ba.Replica.StoreID)
	}

	if err != nil {
		return nil, roachpb.NewError(err)
	}

	sp, cleanupSp := tracing.SpanFromContext(opStores, store.Tracer(), ctx)
	defer cleanupSp()
	if ba.Txn != nil {
		// For calls that read data within a txn, we keep track of timestamps
		// observed from the various participating nodes' HLC clocks. If we have
		// a timestamp on file for this Node which is smaller than MaxTimestamp,
		// we can lower MaxTimestamp accordingly. If MaxTimestamp drops below
		// OrigTimestamp, we effectively can't see uncertainty restarts any
		// more.
		// Note that it's not an issue if MaxTimestamp propagates back out to
		// the client via a returned Transaction update - when updating a Txn
		// from another, the larger MaxTimestamp wins.
		if maxTS, ok := ba.Txn.GetObservedTimestamp(ba.Replica.NodeID); ok && maxTS.Less(ba.Txn.MaxTimestamp) {
			// Copy-on-write to protect others we might be sharing the Txn with.
			shallowTxn := *ba.Txn
			// The uncertainty window is [OrigTimestamp, maxTS), so if that window
			// is empty, there won't be any uncertainty restarts.
			if !ba.Txn.OrigTimestamp.Less(maxTS) {
				sp.LogEvent("read has no clock uncertainty")
			}
			shallowTxn.MaxTimestamp.Backward(maxTS)
			ba.Txn = &shallowTxn
		}
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
Beispiel #5
0
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, then a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx,
		log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}
	// For calls that read data within a txn, we can avoid uncertainty
	// related retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=OrigTimestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		// We set to OrigTimestamp because that works for both SNAPSHOT and
		// SERIALIZABLE: If we used Timestamp instead, we could run into
		// unnecessary retries at SNAPSHOT. For example, a SNAPSHOT txn at
		// OrigTimestamp = 1000.0, Timestamp = 2000.0, MaxTimestamp = 3000.0
		// will always read at 1000, so a MaxTimestamp of 2000 will still let
		// it restart with uncertainty when it finds a value in (1000, 2000).
		shallowTxn.MaxTimestamp = ba.Txn.OrigTimestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
Beispiel #6
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and abort cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new abort cache entry
// * obtaining the transaction for a abort cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    abort cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the abort cache table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(
	ctx context.Context,
	now hlc.Timestamp,
	repl *Replica,
	sysCfg config.SystemConfig,
) error {
	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return errors.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gcKeys, info, err := RunGC(ctx, desc, snap, now, zone.GC,
		func(now hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			pushTxn(gcq.store.DB(), now, txn, typ)
		},
		func(intents []roachpb.Intent, poison bool, wait bool) error {
			return repl.store.intentResolver.resolveIntents(ctx, intents, poison, wait)
		})

	if err != nil {
		return err
	}

	gcq.eventLog.VInfof(true, "completed with stats %+v", info)

	var ba roachpb.BatchRequest
	var gcArgs roachpb.GCRequest
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()
	gcArgs.Keys = gcKeys
	gcArgs.Threshold = info.Threshold

	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(&gcArgs)
	if _, pErr := repl.Send(ctx, ba); pErr != nil {
		return pErr.GoError()
	}
	return nil
}
Beispiel #7
0
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *Stores) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	sp := tracing.SpanFromContext(ctx)
	var store *Store
	var pErr *roachpb.Error

	// If we aren't given a Replica, then a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		rs := keys.Range(ba)
		rangeID, repl, pErr = ls.lookupReplica(rs.Key, rs.EndKey)
		if pErr == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx,
		log.RangeID, ba.RangeID)

	if pErr == nil {
		store, pErr = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if pErr != nil {
		return nil, pErr
	}
	// For calls that read data within a txn, we can avoid uncertainty
	// related retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=Timestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		sp.LogEvent("read has no clock uncertainty")
		// Copy-on-write to protect others we might be sharing the Txn with.
		shallowTxn := *ba.Txn
		shallowTxn.MaxTimestamp = ba.Txn.Timestamp
		ba.Txn = &shallowTxn
	}
	br, pErr = store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
Beispiel #8
0
// Send implements the client.Sender interface. The store is looked up from the
// store map if specified by the request; otherwise, the command is being
// executed locally, and the replica is determined via lookup through each
// store's LookupRange method. The latter path is taken only by unit tests.
func (ls *LocalSender) Send(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	trace := tracer.FromCtx(ctx)
	var store *storage.Store
	var err error

	// If we aren't given a Replica, then a little bending over
	// backwards here. This case applies exclusively to unittests.
	if ba.RangeID == 0 || ba.Replica.StoreID == 0 {
		var repl *roachpb.ReplicaDescriptor
		var rangeID roachpb.RangeID
		key, endKey := keys.Range(ba)
		rangeID, repl, err = ls.lookupReplica(key, endKey)
		if err == nil {
			ba.RangeID = rangeID
			ba.Replica = *repl
		}
	}

	ctx = log.Add(ctx,
		log.RangeID, ba.RangeID)

	if err == nil {
		store, err = ls.GetStore(ba.Replica.StoreID)
	}

	var br *roachpb.BatchResponse
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	// For calls that read data within a txn, we can avoid uncertainty
	// related retries in certain situations. If the node is in
	// "CertainNodes", we need not worry about uncertain reads any
	// more. Setting MaxTimestamp=Timestamp for the operation
	// accomplishes that. See roachpb.Transaction.CertainNodes for details.
	if ba.Txn != nil && ba.Txn.CertainNodes.Contains(ba.Replica.NodeID) {
		// MaxTimestamp = Timestamp corresponds to no clock uncertainty.
		trace.Event("read has no clock uncertainty")
		ba.Txn.MaxTimestamp = ba.Txn.Timestamp
	}
	br, pErr := store.Send(ctx, ba)
	if br != nil && br.Error != nil {
		panic(roachpb.ErrorUnexpectedlySet(store, br))
	}
	return br, pErr
}
Beispiel #9
0
func TestStoreExecuteNoop(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()
	ba := roachpb.BatchRequest{}
	ba.Key = nil // intentional
	ba.RangeID = 1
	ba.Replica = roachpb.ReplicaDescriptor{StoreID: store.StoreID()}
	ba.Add(&roachpb.GetRequest{RequestHeader: roachpb.RequestHeader{Key: roachpb.Key("a")}})
	ba.Add(&roachpb.NoopRequest{})

	br, pErr := store.Send(context.Background(), ba)
	if pErr != nil {
		t.Error(pErr)
	}
	reply := br.Responses[1].GetInner()
	if _, ok := reply.(*roachpb.NoopResponse); !ok {
		t.Errorf("expected *roachpb.NoopResponse, got %T", reply)
	}
}
Beispiel #10
0
// sendRPC sends one or more RPCs to replicas from the supplied roachpb.Replica
// slice. First, replicas which have gossiped addresses are corralled (and
// rearranged depending on proximity and whether the request needs to go to a
// leader) and then sent via rpc.Send, with requirement that one RPC to a
// server must succeed. Returns an RPC error if the request could not be sent.
// Note that the reply may contain a higher level error and must be checked in
// addition to the RPC error.
func (ds *DistSender) sendRPC(trace *tracer.Trace, rangeID roachpb.RangeID, replicas replicaSlice, order rpc.OrderingPolicy,
	ba roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) {
	if len(replicas) == 0 {
		return nil, roachpb.NewError(noNodeAddrsAvailError{})
	}

	// Build a slice of replica addresses.
	addrs := make([]net.Addr, 0, len(replicas))
	replicaMap := make(map[string]*roachpb.ReplicaDescriptor, len(replicas))
	for i := range replicas {
		addr := replicas[i].NodeDesc.Address
		addrs = append(addrs, addr)
		replicaMap[addr.String()] = &replicas[i].ReplicaDescriptor
	}

	// TODO(pmattis): This needs to be tested. If it isn't set we'll
	// still route the request appropriately by key, but won't receive
	// RangeNotFoundErrors.
	ba.RangeID = rangeID

	// Set RPC opts with stipulation that one of N RPCs must succeed.
	rpcOpts := rpc.Options{
		N:               1,
		Ordering:        order,
		SendNextTimeout: defaultSendNextTimeout,
		Timeout:         defaultRPCTimeout,
		Trace:           trace,
	}
	// getArgs clones the arguments on demand for all but the first replica.
	firstArgs := true
	getArgs := func(addr net.Addr) proto.Message {
		var a *roachpb.BatchRequest
		// Use the supplied args proto if this is our first address.
		if firstArgs {
			firstArgs = false
			a = &ba
		} else {
			// Otherwise, copy the args value and set the replica in the header.
			a = proto.Clone(&ba).(*roachpb.BatchRequest)
		}
		if addr != nil {
			// TODO(tschottdorf): see len(replicas) above.
			a.Replica = *replicaMap[addr.String()]
		}
		return a
	}
	// RPCs are sent asynchronously and there is no synchronized access to
	// the reply object, so we don't pass itself to RPCSend.
	// Otherwise there maybe a race case:
	// If the RPC call times out using our original reply object,
	// we must not use it any more; the rpc call might still return
	// and just write to it at any time.
	// args.CreateReply() should be cheaper than proto.Clone which use reflect.
	getReply := func() proto.Message {
		return &roachpb.BatchResponse{}
	}

	const method = "Node.Batch"
	replies, err := ds.rpcSend(rpcOpts, method, addrs, getArgs, getReply, ds.rpcContext)
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	return replies[0].(*roachpb.BatchResponse), nil
}
Beispiel #11
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold. The transaction and sequence cache records are also
// scanned and old entries evicted. During normal operation, both of these
// records are cleaned up when their respective transaction finishes, so the
// amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new sequence cache entry
// * obtaining the transaction for a sequence cache entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is still PENDING), which will recreate
//    sequence cache entries (but with the txn timestamp; i.e. likely gc'able)
// 6) scan the sequence table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return util.Errorf("could not find zone config for range %s: %s", repl, err)
	}

	gc := engine.NewGarbageCollector(now, zone.GC)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	var intentCount int
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						intentCount++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()
	gcq.eventLog.Infof(true, "assembled %d transactions from %d old intents; found %d gc'able keys", len(txnMap), intentCount, len(gcArgs.Keys))

	txnKeys, err := gcq.processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	gcq.eventLog.Infof(true, "pushing %d txns", len(txnMap))
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, roachpb.PUSH_ABORT, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}
	gcq.eventLog.Infof(true, "resolving %d intents", len(intents))

	if pErr := repl.store.intentResolver.resolveIntents(repl.context(), repl, intents,
		true /* wait */, false /* !poison */); pErr != nil {
		return pErr.GoError()
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	leftoverSeqCacheKeys := gcq.processSequenceCache(repl, now, txnExp, txnMap)
	gcq.eventLog.Infof(true, "collected %d leftover sequence cache keys", len(leftoverSeqCacheKeys))
	gcArgs.Keys = append(gcArgs.Keys, leftoverSeqCacheKeys...)
	gcq.eventLog.Infof(true, "sending gc request for %d keys", len(gcArgs.Keys))

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	return nil
}
Beispiel #12
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()

	gcArgs := &roachpb.GCRequest{}
	// TODO(tschottdorf): This is one of these instances in which we want
	// to be more careful that the request ends up on the correct Replica,
	// and we might have to worry about mixing range-local and global keys
	// in a batch which might end up spanning Ranges by the time it executes.
	gcArgs.Key = desc.StartKey.AsRawKey()
	gcArgs.EndKey = desc.EndKey.AsRawKey()

	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	txnKeys, err := processTransactionTable(repl, txnMap, txnExp)
	if err != nil {
		return err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcArgs.Keys = append(gcArgs.Keys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		go pushTxn(repl, now, txn, roachpb.ABORT_TXN, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	if err := repl.resolveIntents(repl.context(), intents, true /* wait */, false /* !poison */); err != nil {
		return err
	}

	// Deal with any leftover sequence cache keys. There shouldn't be many of
	// them.
	gcArgs.Keys = append(gcArgs.Keys, processSequenceCache(repl, now, txnExp, txnMap)...)

	// Send GC request through range.
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Timestamp = now
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease := p.RequestPending(); nextLease != nil {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.ctx.rangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(func() {
		// Propose a RequestLease command and wait for it to apply.
		var execPErr *roachpb.Error
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		// Send lease request directly to raft in order to skip unnecessary
		// checks from normal request machinery, (e.g. the command queue).
		// Note that the command itself isn't traced, but usually the caller
		// waiting for the result has an active Trace.
		ch, _, err := replica.proposeRaftCommand(
			replica.context(context.Background()), ba)
		if err != nil {
			execPErr = roachpb.NewError(err)
		} else {
			// If the command was committed, wait for the range to apply it.
			select {
			case c := <-ch:
				if c.Err != nil {
					if log.V(1) {
						log.Infof("failed to acquire lease for replica %s: %s", replica.store, c.Err)
					}
					execPErr = c.Err
				}
			case <-replica.store.Stopper().ShouldQuiesce():
				execPErr = roachpb.NewError(
					replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.Desc()))
			}
		}

		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- execPErr
			} else {
				llChan <- protoutil.Clone(execPErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			replica.newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
Beispiel #14
0
// process iterates through all keys in a replica's range, calling the garbage
// collector for each key and associated set of values. GC'd keys are batched
// into GC calls. Extant intents are resolved if intents are older than
// intentAgeThreshold.
func (gcq *gcQueue) process(now roachpb.Timestamp, repl *Replica,
	sysCfg *config.SystemConfig) error {

	snap := repl.store.Engine().NewSnapshot()
	desc := repl.Desc()
	iter := newReplicaDataIterator(desc, snap)
	defer iter.Close()
	defer snap.Close()

	// Lookup the GC policy for the zone containing this key range.
	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
	if err != nil {
		return fmt.Errorf("could not find GC policy for range %s: %s", repl, err)
	}
	policy := zone.GC

	gcMeta := roachpb.NewGCMetadata(now.WallTime)
	gc := engine.NewGarbageCollector(now, *policy)

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()

	// TODO(tschottdorf): execution will use a leader-assigned local
	// timestamp to compute intent age. While this should be fine, could
	// consider adding a Now timestamp to GCRequest which would be used
	// instead.
	gcArgs := &roachpb.GCRequest{}
	var mu sync.Mutex
	var oldestIntentNanos int64 = math.MaxInt64
	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[string]*roachpb.Transaction{}
	intentSpanMap := map[string][]roachpb.Span{}

	// updateOldestIntent atomically updates the oldest intent.
	updateOldestIntent := func(intentNanos int64) {
		mu.Lock()
		defer mu.Unlock()
		if intentNanos < oldestIntentNanos {
			oldestIntentNanos = intentNanos
		}
	}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &engine.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf("unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						id := string(meta.Txn.ID)
						txnMap[id] = meta.Txn
						intentSpanMap[id] = append(intentSpanMap[id], roachpb.Span{Key: expBaseKey})
					} else {
						updateOldestIntent(meta.Txn.OrigTimestamp.WallTime)
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(roachpb.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		baseKey, ts, isValue, err := engine.MVCCDecodeKey(iter.Key())
		if err != nil {
			log.Errorf("unable to decode MVCC key: %q: %v", iter.Key(), err)
			continue
		}
		if !isValue {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = baseKey
			keys = []engine.MVCCKey{iter.Key()}
			vals = [][]byte{iter.Value()}
		} else {
			if !baseKey.Equal(expBaseKey) {
				log.Errorf("unexpectedly found a value for %q with ts=%s; expected key %q", baseKey, ts, expBaseKey)
				continue
			}
			keys = append(keys, iter.Key())
			vals = append(vals, iter.Value())
		}
	}
	if iter.Error() != nil {
		return iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	for _, txn := range txnMap {
		wg.Add(1)
		go gcq.pushTxn(repl, now, txn, updateOldestIntent, &wg)
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for id, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[id] {
				intents = append(intents, roachpb.Intent{Span: intent, Txn: *txn})
			}
		}
	}

	done := true
	if len(intents) > 0 {
		done = false
		repl.resolveIntents(repl.context(), intents)
	}

	// Set start and end keys.
	if len(gcArgs.Keys) > 0 {
		done = false
		gcArgs.Key = gcArgs.Keys[0].Key
		gcArgs.EndKey = gcArgs.Keys[len(gcArgs.Keys)-1].Key.Next()
	}

	if done {
		return nil
	}

	// Send GC request through range.
	gcMeta.OldestIntentNanos = proto.Int64(oldestIntentNanos)
	gcArgs.GCMeta = *gcMeta

	var ba roachpb.BatchRequest
	// Technically not needed since we're talking directly to the Range.
	ba.RangeID = desc.RangeID
	ba.Add(gcArgs)
	if _, pErr := repl.Send(repl.context(), ba); pErr != nil {
		return pErr.GoError()
	}

	// Store current timestamp as last verification for this replica, as
	// we've just successfully scanned.
	if err := repl.SetLastVerificationTimestamp(now); err != nil {
		log.Errorf("failed to set last verification timestamp for replica %s: %s", repl, err)
	}

	return nil
}