Пример #1
0
// UpdateDeadlineMaybe sets the transactions deadline to the lower of the
// current one (if any) and the passed value.
func (txn *Txn) UpdateDeadlineMaybe(deadline hlc.Timestamp) bool {
	if txn.deadline == nil || deadline.Less(*txn.deadline) {
		txn.deadline = &deadline
		return true
	}
	return false
}
Пример #2
0
// UpdateObservedTimestamp stores a timestamp off a node's clock for future
// operations in the transaction. When multiple calls are made for a single
// nodeID, the lowest timestamp prevails.
func (t *Transaction) UpdateObservedTimestamp(nodeID NodeID, maxTS hlc.Timestamp) {
	if t.ObservedTimestamps == nil {
		t.ObservedTimestamps = make(map[NodeID]hlc.Timestamp)
	}
	if ts, ok := t.ObservedTimestamps[nodeID]; !ok || maxTS.Less(ts) {
		t.ObservedTimestamps[nodeID] = maxTS
	}
}
Пример #3
0
// shouldQueueAgain is a helper function to determine whether the
// replica should be queued according to the current time, the last
// time the replica was processed, and the minimum interval between
// successive processing. Specifying minInterval=0 queues all replicas.
// Returns a bool for whether to queue as well as a priority based
// on how long it's been since last processed.
func shouldQueueAgain(now, last hlc.Timestamp, minInterval time.Duration) (bool, float64) {
	if minInterval == 0 || last == hlc.ZeroTimestamp {
		return true, 0
	}
	if diff := now.GoTime().Sub(last.GoTime()); diff >= minInterval {
		priority := float64(1)
		// If there's a non-zero last processed timestamp, adjust the
		// priority by a multiple of how long it's been since the last
		// time this replica was processed.
		if last != hlc.ZeroTimestamp {
			priority = float64(diff.Nanoseconds()) / float64(minInterval.Nanoseconds())
		}
		return true, priority
	}
	return false, 0
}
Пример #4
0
func replicaGCShouldQueueImpl(
	now, lastCheck, lastActivity hlc.Timestamp, isCandidate bool,
) (bool, float64) {
	timeout := ReplicaGCQueueInactivityThreshold
	priority := replicaGCPriorityDefault

	if isCandidate {
		// If the range is a candidate (which happens if its former replica set
		// ignores it), let it expire much earlier.
		timeout = ReplicaGCQueueCandidateTimeout
		priority = replicaGCPriorityCandidate
	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
		// Return false immediately if the previous check was less than the
		// check interval in the past. Note that we don't do this if the
		// replica is in candidate state, in which case we want to be more
		// aggressive - a failed rebalance attempt could have checked this
		// range, and candidate state suggests that a retry succeeded. See
		// #7489.
		return false, 0
	}

	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)

	if !shouldQ {
		return false, 0
	}

	return shouldQ, priority
}
Пример #5
0
// NewTransaction creates a new transaction. The transaction key is
// composed using the specified baseKey (for locality with data
// affected by the transaction) and a random ID to guarantee
// uniqueness. The specified user-level priority is combined with a
// randomly chosen value to yield a final priority, used to settle
// write conflicts in a way that avoids starvation of long-running
// transactions (see Replica.PushTxn).
func NewTransaction(
	name string,
	baseKey Key,
	userPriority UserPriority,
	isolation enginepb.IsolationType,
	now hlc.Timestamp,
	maxOffset int64,
) *Transaction {
	u := uuid.MakeV4()

	return &Transaction{
		TxnMeta: enginepb.TxnMeta{
			Key:       baseKey,
			ID:        &u,
			Isolation: isolation,
			Timestamp: now,
			Priority:  MakePriority(userPriority),
			Sequence:  1,
		},
		Name:          name,
		OrigTimestamp: now,
		MaxTimestamp:  now.Add(maxOffset, 0),
	}
}
Пример #6
0
// leaseStatus returns lease status. If the lease is epoch-based,
// the liveness field will be set to the liveness used to compute
// its state, unless state == leaseError.
//
// - The lease is considered valid if the timestamp is covered by the
//   supplied lease. This is determined differently depending on the
//   lease properties. For expiration-based leases, the timestamp is
//   covered if it's less than the expiration (minus the maximum
//   clock offset). For epoch-based "node liveness" leases, the lease
//   epoch must match the owner node's liveness epoch -AND- the
//   timestamp must be within the node's liveness expiration (also
//   minus the maximum clock offset).
//
//   To be valid, a lease which contains a valid ProposedTS must have
//   a proposed timestamp greater than the minimum proposed timestamp,
//   which prevents a restarted process from serving commands, since
//   the command queue has been wiped through the restart.
//
// - The lease is considered in stasis if the timestamp is within the
//   maximum clock offset window of the lease expiration.
//
// - The lease is considered expired in all other cases.
//
// The maximum clock offset must always be taken into consideration to
// avoid a failure of linearizability on a single register during
// lease changes. Without that stasis period, the following could
// occur:
//
// * a range lease gets committed on the new lease holder (but not the old).
// * client proposes and commits a write on new lease holder (with a
//   timestamp just greater than the expiration of the old lease).
// * client tries to read what it wrote, but hits a slow coordinator
//   (which assigns a timestamp covered by the old lease).
// * the read is served by the old lease holder (which has not
//   processed the change in lease holdership).
// * the client fails to read their own write.
func (r *Replica) leaseStatus(
	lease *roachpb.Lease, timestamp, minProposedTS hlc.Timestamp,
) LeaseStatus {
	status := LeaseStatus{timestamp: timestamp, lease: lease}
	if lease == nil {
		status.state = leaseExpired
		return status
	}
	var expiration hlc.Timestamp
	if lease.Type() == roachpb.LeaseExpiration {
		expiration = lease.Expiration
	} else {
		var err error
		status.liveness, err = r.store.cfg.NodeLiveness.GetLiveness(lease.Replica.NodeID)
		if err != nil || status.liveness.Epoch < *lease.Epoch {
			// If lease validity can't be determined (e.g. gossip is down
			// and liveness info isn't available for owner), we can neither
			// use the lease nor do we want to attempt to acquire it.
			status.state = leaseError
			return status
		}
		if status.liveness.Epoch > *lease.Epoch {
			status.state = leaseExpired
			return status
		}
		expiration = status.liveness.Expiration
	}
	stasis := expiration.Add(-int64(r.store.Clock().MaxOffset()), 0)
	if timestamp.Less(stasis) {
		status.state = leaseValid
		// If the replica owns the lease, additional verify that the lease's
		// proposed timestamp is not earlier than the min proposed timestamp.
		if lease.Replica.StoreID == r.store.StoreID() &&
			lease.ProposedTS != nil && lease.ProposedTS.Less(minProposedTS) {
			status.state = leaseProscribed
		}
	} else if timestamp.Less(expiration) {
		status.state = leaseStasis
	} else {
		status.state = leaseExpired
	}
	return status
}
Пример #7
0
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	now := replica.store.Clock().Now()
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
		ProposedTS:  &now,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)
		// We reset our state below regardless of whether we've gotten an error or
		// not, but note that an error is ambiguous - there's no guarantee that the
		// transfer will not still apply. That's OK, however, as the "in transfer"
		// state maintained by the pendingLeaseRequest is not relied on for
		// correctness (see replica.mu.minLeaseProposedTS), and resetting the state
		// is beneficial as it'll allow the replica to attempt to transfer again or
		// extend the existing lease in the future.

		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for _, llChan := range p.llChans {
			// Don't send the same transaction object twice; this can lead to races.
			if pErr != nil {
				pErrClone := *pErr
				pErrClone.SetTxn(pErr.GetTxn())
				llChan <- &pErrClone
			} else {
				llChan <- nil
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
Пример #8
0
// InitOrJoinRequest executes a RequestLease command asynchronously and returns a
// channel on which the result will be posted. If there's already a request in
// progress, we join in waiting for the results of that request.
// It is an error to call InitOrJoinRequest() while a request is in progress
// naming another replica as lease holder.
//
// replica is used to schedule and execute async work (proposing a RequestLease
// command). replica.mu is locked when delivering results, so calls from the
// replica happen either before or after a result for a pending request has
// happened.
//
// transfer needs to be set if the request represents a lease transfer (as
// opposed to an extension, or acquiring the lease when none is held).
//
// Note: Once this function gets a context to be used for cancellation, instead
// of replica.store.Stopper().ShouldQuiesce(), care will be needed for cancelling
// the Raft command, similar to replica.addWriteCmd.
func (p *pendingLeaseRequest) InitOrJoinRequest(
	replica *Replica,
	nextLeaseHolder roachpb.ReplicaDescriptor,
	timestamp hlc.Timestamp,
	startKey roachpb.Key,
	transfer bool,
) <-chan *roachpb.Error {
	if nextLease, ok := p.RequestPending(); ok {
		if nextLease.Replica.ReplicaID == nextLeaseHolder.ReplicaID {
			// Join a pending request asking for the same replica to become lease
			// holder.
			return p.JoinRequest()
		}
		llChan := make(chan *roachpb.Error, 1)
		// We can't join the request in progress.
		llChan <- roachpb.NewErrorf("request for different replica in progress "+
			"(requesting: %+v, in progress: %+v)",
			nextLeaseHolder.ReplicaID, nextLease.Replica.ReplicaID)
		return llChan
	}
	llChan := make(chan *roachpb.Error, 1)
	// No request in progress. Let's propose a Lease command asynchronously.
	// TODO(tschottdorf): get duration from configuration, either as a
	// config flag or, later, dynamically adjusted.
	startStasis := timestamp.Add(int64(replica.store.cfg.RangeLeaseActiveDuration), 0)
	expiration := startStasis.Add(int64(replica.store.Clock().MaxOffset()), 0)
	reqSpan := roachpb.Span{
		Key: startKey,
	}
	var leaseReq roachpb.Request
	reqLease := roachpb.Lease{
		Start:       timestamp,
		StartStasis: startStasis,
		Expiration:  expiration,
		Replica:     nextLeaseHolder,
	}
	if transfer {
		leaseReq = &roachpb.TransferLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	} else {
		leaseReq = &roachpb.RequestLeaseRequest{
			Span:  reqSpan,
			Lease: reqLease,
		}
	}
	if replica.store.Stopper().RunAsyncTask(context.TODO(), func(ctx context.Context) {
		ctx = replica.AnnotateCtx(ctx)
		// Propose a RequestLease command and wait for it to apply.
		ba := roachpb.BatchRequest{}
		ba.Timestamp = replica.store.Clock().Now()
		ba.RangeID = replica.RangeID
		ba.Add(leaseReq)
		if log.V(2) {
			log.Infof(ctx, "sending lease request %v", leaseReq)
		}
		_, pErr := replica.Send(ctx, ba)

		// Send result of lease to all waiter channels.
		replica.mu.Lock()
		defer replica.mu.Unlock()
		for i, llChan := range p.llChans {
			// Don't send the same pErr object twice; this can lead to races. We could
			// clone every time but it's more efficient to send pErr itself to one of
			// the channels (the last one; if we send it earlier the race can still
			// happen).
			if i == len(p.llChans)-1 {
				llChan <- pErr
			} else {
				llChan <- protoutil.Clone(pErr).(*roachpb.Error) // works with `nil`
			}
		}
		p.llChans = p.llChans[:0]
		p.nextLease = roachpb.Lease{}
	}) != nil {
		// We failed to start the asynchronous task. Send a blank NotLeaseHolderError
		// back to indicate that we have no idea who the range lease holder might
		// be; we've withdrawn from active duty.
		llChan <- roachpb.NewError(
			newNotLeaseHolderError(nil, replica.store.StoreID(), replica.mu.state.Desc))
		return llChan
	}
	p.llChans = append(p.llChans, llChan)
	p.nextLease = reqLease
	return llChan
}
Пример #9
0
// Covers returns true if the given timestamp can be served by the Lease.
// This is the case if the timestamp precedes the Lease's stasis period.
// Note that the fact that a lease covers a timestamp is not enough for the
// holder of the lease to be able to serve a read with that timestamp;
// pendingLeaderLeaseRequest.TransferInProgress() should also be consulted to
// account for possible lease transfers.
func (l Lease) Covers(timestamp hlc.Timestamp) bool {
	return timestamp.Less(l.StartStasis)
}
Пример #10
0
// RunGC runs garbage collection for the specified descriptor on the provided
// Engine (which is not mutated). It uses the provided functions pushTxnFn and
// resolveIntentsFn to clarify the true status of and clean up after encountered
// transactions. It returns a slice of gc'able keys from the data, transaction,
// and abort spans.
func RunGC(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	snap engine.Reader,
	now hlc.Timestamp,
	policy config.GCPolicy,
	pushTxnFn pushFunc,
	resolveIntentsFn resolveFunc,
) ([]roachpb.GCRequest_GCKey, GCInfo, error) {

	iter := NewReplicaDataIterator(desc, snap, true /* replicatedOnly */)
	defer iter.Close()

	var infoMu = lockableGCInfo{}
	infoMu.Policy = policy
	infoMu.Now = now

	{
		realResolveIntentsFn := resolveIntentsFn
		resolveIntentsFn = func(intents []roachpb.Intent, poison bool, wait bool) (err error) {
			defer func() {
				infoMu.Lock()
				infoMu.ResolveTotal += len(intents)
				if err == nil {
					infoMu.ResolveSuccess += len(intents)
				}
				infoMu.Unlock()
			}()
			return realResolveIntentsFn(intents, poison, wait)
		}
		realPushTxnFn := pushTxnFn
		pushTxnFn = func(ts hlc.Timestamp, txn *roachpb.Transaction, typ roachpb.PushTxnType) {
			infoMu.Lock()
			infoMu.PushTxn++
			infoMu.Unlock()
			realPushTxnFn(ts, txn, typ)
		}
	}

	// Compute intent expiration (intent age at which we attempt to resolve).
	intentExp := now
	intentExp.WallTime -= intentAgeThreshold.Nanoseconds()
	txnExp := now
	txnExp.WallTime -= txnCleanupThreshold.Nanoseconds()
	abortSpanGCThreshold := now.Add(-int64(abortCacheAgeThreshold), 0)

	gc := engine.MakeGarbageCollector(now, policy)
	infoMu.Threshold = gc.Threshold
	infoMu.TxnSpanGCThreshold = txnExp

	var gcKeys []roachpb.GCRequest_GCKey
	var expBaseKey roachpb.Key
	var keys []engine.MVCCKey
	var vals [][]byte

	// Maps from txn ID to txn and intent key slice.
	txnMap := map[uuid.UUID]*roachpb.Transaction{}
	intentSpanMap := map[uuid.UUID][]roachpb.Span{}

	// processKeysAndValues is invoked with each key and its set of
	// values. Intents older than the intent age threshold are sent for
	// resolution and values after the MVCC metadata, and possible
	// intent, are sent for garbage collection.
	processKeysAndValues := func() {
		// If there's more than a single value for the key, possibly send for GC.
		if len(keys) > 1 {
			meta := &enginepb.MVCCMetadata{}
			if err := proto.Unmarshal(vals[0], meta); err != nil {
				log.Errorf(ctx, "unable to unmarshal MVCC metadata for key %q: %s", keys[0], err)
			} else {
				// In the event that there's an active intent, send for
				// intent resolution if older than the threshold.
				startIdx := 1
				if meta.Txn != nil {
					// Keep track of intent to resolve if older than the intent
					// expiration threshold.
					if meta.Timestamp.Less(intentExp) {
						txnID := *meta.Txn.ID
						txn := &roachpb.Transaction{
							TxnMeta: *meta.Txn,
						}
						txnMap[txnID] = txn
						infoMu.IntentsConsidered++
						intentSpanMap[txnID] = append(intentSpanMap[txnID], roachpb.Span{Key: expBaseKey})
					}
					// With an active intent, GC ignores MVCC metadata & intent value.
					startIdx = 2
				}
				// See if any values may be GC'd.
				if gcTS := gc.Filter(keys[startIdx:], vals[startIdx:]); !gcTS.Equal(hlc.ZeroTimestamp) {
					// TODO(spencer): need to split the requests up into
					// multiple requests in the event that more than X keys
					// are added to the request.
					gcKeys = append(gcKeys, roachpb.GCRequest_GCKey{Key: expBaseKey, Timestamp: gcTS})
				}
			}
		}
	}

	// Iterate through the keys and values of this replica's range.
	for ; iter.Valid(); iter.Next() {
		iterKey := iter.Key()
		if !iterKey.IsValue() || !iterKey.Key.Equal(expBaseKey) {
			// Moving to the next key (& values).
			processKeysAndValues()
			expBaseKey = iterKey.Key
			if !iterKey.IsValue() {
				keys = []engine.MVCCKey{iter.Key()}
				vals = [][]byte{iter.Value()}
				continue
			}
			// An implicit metadata.
			keys = []engine.MVCCKey{engine.MakeMVCCMetadataKey(iterKey.Key)}
			// A nil value for the encoded MVCCMetadata. This will unmarshal to an
			// empty MVCCMetadata which is sufficient for processKeysAndValues to
			// determine that there is no intent.
			vals = [][]byte{nil}
		}
		keys = append(keys, iter.Key())
		vals = append(vals, iter.Value())
	}
	if iter.Error() != nil {
		return nil, GCInfo{}, iter.Error()
	}
	// Handle last collected set of keys/vals.
	processKeysAndValues()

	infoMu.IntentTxns = len(txnMap)
	infoMu.NumKeysAffected = len(gcKeys)

	txnKeys, err := processTransactionTable(ctx, snap, desc, txnMap, txnExp, &infoMu, resolveIntentsFn)
	if err != nil {
		return nil, GCInfo{}, err
	}

	// From now on, all newly added keys are range-local.
	// TODO(tschottdorf): Might need to use two requests at some point since we
	// hard-coded the full non-local key range in the header, but that does
	// not take into account the range-local keys. It will be OK as long as
	// we send directly to the Replica, though.
	gcKeys = append(gcKeys, txnKeys...)

	// Process push transactions in parallel.
	var wg sync.WaitGroup
	sem := make(chan struct{}, gcTaskLimit)
	for _, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			continue
		}
		wg.Add(1)
		sem <- struct{}{}
		// Avoid passing loop variable into closure.
		txnCopy := txn
		go func() {
			defer func() {
				<-sem
				wg.Done()
			}()
			pushTxnFn(now, txnCopy, roachpb.PUSH_ABORT)
		}()
	}
	wg.Wait()

	// Resolve all intents.
	var intents []roachpb.Intent
	for txnID, txn := range txnMap {
		if txn.Status != roachpb.PENDING {
			for _, intent := range intentSpanMap[txnID] {
				intents = append(intents, roachpb.Intent{Span: intent, Status: txn.Status, Txn: txn.TxnMeta})
			}
		}
	}

	if err := resolveIntentsFn(intents, true /* wait */, false /* !poison */); err != nil {
		return nil, GCInfo{}, err
	}

	// Clean up the abort cache.
	gcKeys = append(gcKeys, processAbortCache(
		ctx, snap, desc.RangeID, abortSpanGCThreshold, &infoMu, pushTxnFn)...)
	return gcKeys, infoMu.GCInfo, nil
}
Пример #11
0
// selectEventTimestamp selects a timestamp for this log message. If the
// transaction this event is being written in has a non-zero timestamp, then that
// timestamp should be used; otherwise, the store's physical clock is used.
// This helps with testing; in normal usage, the logging of an event will never
// be the first action in the transaction, and thus the transaction will have an
// assigned database timestamp. However, in the case of our tests log events
// *are* the first action in a transaction, and we must elect to use the store's
// physical time instead.
func (ev EventLogger) selectEventTimestamp(input hlc.Timestamp) time.Time {
	if input == hlc.ZeroTimestamp {
		return ev.LeaseManager.clock.PhysicalTime()
	}
	return input.GoTime()
}
Пример #12
0
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(sender)

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := client.NewTxn(context.Background(), *s.DB)
	if err := initialTxn.Put(roachpb.Key("a"), []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Verify 3 heartbeats.
	var heartbeatTS hlc.Timestamp
	for i := 0; i < 3; i++ {
		util.SucceedsSoon(t, func() error {
			txn, pErr := getTxn(sender, &initialTxn.Proto)
			if pErr != nil {
				t.Fatal(pErr)
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			sender.Lock()
			s.Manual.Increment(1)
			sender.Unlock()
			if txn.LastHeartbeat != nil && heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return nil
			}
			return errors.Errorf("expected heartbeat")
		})
	}

	// Sneakily send an ABORT right to DistSender (bypassing TxnCoordSender).
	{
		var ba roachpb.BatchRequest
		ba.Add(&roachpb.EndTransactionRequest{
			Commit: false,
			Span:   roachpb.Span{Key: initialTxn.Proto.Key},
		})
		ba.Txn = &initialTxn.Proto
		if _, pErr := sender.wrapped.Send(context.Background(), ba); pErr != nil {
			t.Fatal(pErr)
		}
	}

	util.SucceedsSoon(t, func() error {
		sender.Lock()
		defer sender.Unlock()
		if txnMeta, ok := sender.txns[*initialTxn.Proto.ID]; !ok {
			t.Fatal("transaction unregistered prematurely")
		} else if txnMeta.txn.Status != roachpb.ABORTED {
			return fmt.Errorf("transaction is not aborted")
		}
		return nil
	})

	// Trying to do something else should give us a TransactionAbortedError.
	_, err := initialTxn.Get("a")
	assertTransactionAbortedError(t, err)
}
Пример #13
0
// isAsOf analyzes a select statement to bypass the logic in newPlan(),
// since that requires the transaction to be started already. If the returned
// timestamp is not nil, it is the timestamp to which a transaction should
// be set.
//
// max is a lower bound on what the transaction's timestamp will be. Used to
// check that the user didn't specify a timestamp in the future.
func isAsOf(planMaker *planner, stmt parser.Statement, max hlc.Timestamp) (*hlc.Timestamp, error) {
	s, ok := stmt.(*parser.Select)
	if !ok {
		return nil, nil
	}
	sc, ok := s.Select.(*parser.SelectClause)
	if !ok {
		return nil, nil
	}
	if sc.From == nil || sc.From.AsOf.Expr == nil {
		return nil, nil
	}
	te, err := sc.From.AsOf.Expr.TypeCheck(nil, parser.TypeString)
	if err != nil {
		return nil, err
	}
	d, err := te.Eval(&planMaker.evalCtx)
	if err != nil {
		return nil, err
	}
	var ts hlc.Timestamp
	switch d := d.(type) {
	case *parser.DString:
		// Allow nanosecond precision because the timestamp is only used by the
		// system and won't be returned to the user over pgwire.
		dt, err := parser.ParseDTimestamp(string(*d), time.Nanosecond)
		if err != nil {
			return nil, err
		}
		ts.WallTime = dt.Time.UnixNano()
	case *parser.DInt:
		ts.WallTime = int64(*d)
	case *parser.DDecimal:
		// Format the decimal into a string and split on `.` to extract the nanosecond
		// walltime and logical tick parts.
		s := d.String()
		parts := strings.SplitN(s, ".", 2)
		nanos, err := strconv.ParseInt(parts[0], 10, 64)
		if err != nil {
			return nil, errors.Wrap(err, "parse AS OF SYSTEM TIME argument")
		}
		var logical int64
		if len(parts) > 1 {
			// logicalLength is the number of decimal digits expected in the
			// logical part to the right of the decimal. See the implementation of
			// cluster_logical_timestamp().
			const logicalLength = 10
			p := parts[1]
			if lp := len(p); lp > logicalLength {
				return nil, errors.Errorf("bad AS OF SYSTEM TIME argument: logical part has too many digits")
			} else if lp < logicalLength {
				p += strings.Repeat("0", logicalLength-lp)
			}
			logical, err = strconv.ParseInt(p, 10, 32)
			if err != nil {
				return nil, errors.Wrap(err, "parse AS OF SYSTEM TIME argument")
			}
		}
		ts.WallTime = nanos
		ts.Logical = int32(logical)
	default:
		return nil, fmt.Errorf("unexpected AS OF SYSTEM TIME argument: %s (%T)", d.ResolvedType(), d)
	}
	if max.Less(ts) {
		return nil, fmt.Errorf("cannot specify timestamp in the future")
	}
	return &ts, nil
}
Пример #14
0
func TestBatchBuilderStress(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()
	e := NewInMem(roachpb.Attributes{}, 1<<20)
	stopper.AddCloser(e)

	rng, _ := randutil.NewPseudoRand()

	for i := 0; i < 1000; i++ {
		count := 1 + rng.Intn(1000)

		func() {
			batch := e.NewBatch().(*rocksDBBatch)
			// Ensure that, even though we reach into the batch's internals with
			// dbPut etc, asking for the batch's Repr will get data from C++ and
			// not its unused builder.
			batch.flushes++
			defer batch.Close()

			builder := &RocksDBBatchBuilder{}

			for j := 0; j < count; j++ {
				var ts hlc.Timestamp
				if rng.Float32() <= 0.9 {
					// Give 90% of keys timestamps.
					ts.WallTime = rng.Int63()
					if rng.Float32() <= 0.1 {
						// Give 10% of timestamps a non-zero logical component.
						ts.Logical = rng.Int31()
					}
				}
				key := MVCCKey{
					Key:       []byte(fmt.Sprintf("%d", rng.Intn(10000))),
					Timestamp: ts,
				}
				// Generate a random mixture of puts, deletes and merges.
				switch rng.Intn(3) {
				case 0:
					if err := dbPut(batch.batch, key, []byte("value")); err != nil {
						t.Fatal(err)
					}
					builder.Put(key, []byte("value"))
				case 1:
					if err := dbClear(batch.batch, key); err != nil {
						t.Fatal(err)
					}
					builder.Clear(key)
				case 2:
					if err := dbMerge(batch.batch, key, appender("bar")); err != nil {
						t.Fatal(err)
					}
					builder.Merge(key, appender("bar"))
				}
			}

			batchRepr := batch.Repr()
			builderRepr := builder.Finish()
			if !bytes.Equal(batchRepr, builderRepr) {
				t.Fatalf("expected [% x], but got [% x]", batchRepr, builderRepr)
			}
		}()
	}
}
Пример #15
0
func (l *Liveness) isLive(now hlc.Timestamp, maxOffset time.Duration) bool {
	expiration := l.Expiration.Add(-maxOffset.Nanoseconds(), 0)
	return now.Less(expiration)
}
Пример #16
0
// add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readTSCache specifies
// whether the command adding this timestamp should update the read
// timestamp; false to update the write timestamp cache.
func (tc *timestampCache) add(
	start, end roachpb.Key, timestamp hlc.Timestamp, txnID *uuid.UUID, readTSCache bool,
) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	tc.latest.Forward(timestamp)
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		tcache := tc.wCache
		if readTSCache {
			tcache = tc.rCache
		}

		addRange := func(r interval.Range) {
			value := cacheValue{timestamp: timestamp, txnID: txnID}
			key := tcache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			tcache.AddEntry(entry)
		}
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to cache.
		for _, entry := range tcache.GetOverlaps(r.Start, r.End) {
			cv := entry.Value.(*cacheValue)
			key := entry.Key.(*cache.IntervalKey)
			sCmp := r.Start.Compare(key.Start)
			eCmp := r.End.Compare(key.End)
			if cv.timestamp.Less(timestamp) {
				// The existing interval has a timestamp less than the new
				// interval. Compare interval ranges to determine how to
				// modify existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the need to insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					// Old:
					*cv = cacheValue{timestamp: timestamp, txnID: txnID}
					tcache.MoveToEnd(entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tcache.DelEntry(entry)
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// New:     ----
					// Old: ----    ----
					oldEnd := key.End
					key.End = r.Start

					newKey := tcache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					tcache.AddEntryAfter(newEntry, entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if timestamp.Less(cv.timestamp) {
				// The existing interval has a timestamp greater than the new interval.
				// Compare interval ranges to determine how to modify new interval before
				// adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// Old: -----------      -----------      -----------      -----------
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up old into two. We can add the left piece
					// immediately because it is guaranteed to be before the rest of the
					// overlaps.
					//
					// Old:    ------
					// New: ------------
					//
					// Old:    ------
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: key.Start}
					addRange(lr)

					r.Start = key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// Old: --------          --------
					// New:         ----              ----
					r.Start = key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// Old:     --------          --------
					// New: ----              ----
					r.End = key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if (cv.txnID == nil && txnID == nil) ||
				(cv.txnID != nil && txnID != nil && *cv.txnID == *txnID) {
				// The existing interval has a timestamp equal to the new
				// interval, and the same transaction ID.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// New:    -----     or  -----------  or  --------     or     --------
					// Old: -----------      -----------      -----------      -----------
					//
					// New:
					// Old: -----------      -----------      -----------      -----------
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tcache.DelEntry(entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else {
				// The existing interval has a timestamp equal to the new
				// interval and a different transaction ID.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal. Segment is no longer owned by any
					// transaction.
					//
					// New: ------------
					// Old: ------------
					//
					// New:
					// Nil: ============
					// Old:
					cv.txnID = nil
					return
				case sCmp == 0 && eCmp > 0:
					// New contains old, left-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old: ----------
					//
					// New:           --
					// Nil: ==========
					// Old:
					cv.txnID = nil
					r.Start = key.End
				case sCmp < 0 && eCmp == 0:
					// New contains old, right-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old:   ----------
					//
					// New: --
					// Nil:   ==========
					// Old:
					cv.txnID = nil
					r.End = key.Start
				case sCmp < 0 && eCmp > 0:
					// New contains old; split into three segments with the
					// overlap owned by no txn.
					//
					// New: ------------
					// Old:   --------
					//
					// New: --        --
					// Nil:   ========
					// Old:
					cv.txnID = nil
					newKey := tcache.MakeKey(r.Start, key.Start)
					newEntry := makeCacheEntry(newKey, cacheValue{timestamp: timestamp, txnID: txnID})
					tcache.AddEntryAfter(newEntry, entry)
					r.Start = key.End
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two. New segment is
					// owned by no txn.
					//
					// New:     ----
					// Old: ------------
					//
					// New:
					// Nil:     ====
					// Old: ----    ----
					txnID = nil
					oldEnd := key.End
					key.End = r.Start

					newKey := tcache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					tcache.AddEntryAfter(newEntry, entry)
				case eCmp == 0:
					// Old contains new, right-aligned; truncate old end and clear
					// ownership of new segment.
					//
					// New:     --------
					// Old: ------------
					//
					// New:
					// Nil:     ========
					// Old: ----
					txnID = nil
					key.End = r.Start
				case sCmp == 0:
					// Old contains new, left-aligned; truncate old start and
					// clear ownership of new segment.
					// New: --------
					// Old: ------------
					//
					// New:
					// Nil: ========
					// Old:         ----
					txnID = nil
					key.Start = r.End
				case eCmp > 0:
					// Left partial overlap; truncate old end and split new into
					// segments owned by no txn (the overlap) and the new txn.
					//
					// New:     --------
					// Old: --------
					//
					// New:         ----
					// Nil:     ====
					// Old: ----
					key.End, r.Start = r.Start, key.End
					newKey := tcache.MakeKey(key.End, r.Start)
					newCV := cacheValue{timestamp: cv.timestamp, txnID: nil}
					newEntry := makeCacheEntry(newKey, newCV)
					tcache.AddEntryAfter(newEntry, entry)
				case sCmp < 0:
					// Right partial overlap; truncate old start and split new into
					// segments owned by no txn (the overlap) and the new txn.
					//
					// New: --------
					// Old:     --------
					//
					// New: ----
					// Nil:     ====
					// Old:         ----
					key.Start, r.End = r.End, key.Start
					newKey := tcache.MakeKey(r.End, key.Start)
					newCV := cacheValue{timestamp: cv.timestamp, txnID: nil}
					newEntry := makeCacheEntry(newKey, newCV)
					tcache.AddEntryAfter(newEntry, entry)
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			}
		}
		addRange(r)
	}
}
Пример #17
0
// selectEventTimestamp selects a timestamp for this log message. If the
// transaction this event is being written in has a non-zero timestamp, then that
// timestamp should be used; otherwise, the store's physical clock is used.
// This helps with testing; in normal usage, the logging of an event will never
// be the first action in the transaction, and thus the transaction will have an
// assigned database timestamp. However, in the case of our tests log events
// *are* the first action in a transaction, and we must elect to use the store's
// physical time instead.
func selectEventTimestamp(s *Store, input hlc.Timestamp) time.Time {
	if input == hlc.ZeroTimestamp {
		return s.Clock().PhysicalTime()
	}
	return input.GoTime()
}