Example #1
// getLeaseForGossip tries to obtain a leader lease. Only one of the replicas
// should gossip; the bool returned indicates whether it's us.
func (r *Range) getLeaseForGossip(ctx context.Context) (bool, error) {
	// If no Gossip available (some tests) or range too fresh, noop.
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return false, util.Errorf("no gossip or range not initialized")
	}
	var hasLease bool
	var err error
	if !r.rm.Stopper().RunTask(func() {
		timestamp := r.rm.Clock().Now()
		// Check for or obtain the lease, if none active.
		err = r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), timestamp)
		hasLease = err == nil
		if err != nil {
			switch e := err.(type) {
			// NotLeaderError means there is an active lease, leaseRejectedError
			// means we tried to get one but someone beat us to it.
			case *proto.NotLeaderError, *proto.LeaseRejectedError:
				err = nil
			default:
				// Any other error is worth being logged visibly.
				log.Warningc(ctx, "could not acquire lease for range gossip: %s", e)
			}
		}
	}) {
		err = util.Errorf("node is stopping")
	}
	return hasLease, err
}
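Example #1 gates its lease acquisition through the store's Stopper so that no new work begins once the node is shutting down. Below is a minimal, self-contained sketch of that task-gating pattern in plain Go; the Stopper type, its fields, and main are hypothetical illustrations of the idea, not CockroachDB's actual util/stop implementation.

package main

import (
	"fmt"
	"sync"
)

// Stopper is a hypothetical, simplified task gate in the spirit of the
// stopper used above: tasks register before running and are refused once
// shutdown has begun, so Stop can wait for all in-flight work.
type Stopper struct {
	mu       sync.Mutex
	stopping bool
	tasks    sync.WaitGroup
	stopped  chan struct{}
}

func NewStopper() *Stopper {
	return &Stopper{stopped: make(chan struct{})}
}

// RunTask runs f synchronously and returns false if shutdown is underway.
func (s *Stopper) RunTask(f func()) bool {
	s.mu.Lock()
	if s.stopping {
		s.mu.Unlock()
		return false
	}
	s.tasks.Add(1)
	s.mu.Unlock()
	defer s.tasks.Done()
	f()
	return true
}

// ShouldStop returns a channel that is closed when shutdown begins.
func (s *Stopper) ShouldStop() <-chan struct{} { return s.stopped }

// Stop refuses new tasks, signals workers, and waits for in-flight tasks.
func (s *Stopper) Stop() {
	s.mu.Lock()
	s.stopping = true
	s.mu.Unlock()
	close(s.stopped)
	s.tasks.Wait()
}

func main() {
	s := NewStopper()
	if !s.RunTask(func() { fmt.Println("acquiring lease...") }) {
		fmt.Println("node is stopping")
	}
	s.Stop()
	if !s.RunTask(func() {}) {
		fmt.Println("task refused after Stop") // mirrors the "node is stopping" error path in Example #1
	}
}

The point of the pattern is the same as in the example: a caller that cannot register a task treats shutdown as an expected, non-fatal condition rather than blocking or racing the drain.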
Example #2
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		for r := retry.Start(retry.Options{Closer: n.stopper.ShouldStop()}); r.Next(); {
			if err := n.ctx.DB.Txn(func(txn *client.Txn) error {
				return n.eventLogger.InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningc(n.context(context.TODO()), "unable to log %s event for node %d: %s", logEventType, n.Descriptor.NodeID, err)
			} else {
				return
			}
		}
	})
}
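Example #2 retries the event-log transaction until it either succeeds or the stopper signals shutdown through the retry loop's Closer channel. The following is a minimal, self-contained sketch of that retry-until-success-or-shutdown shape; retryUntilShutdown and its backoff constants are hypothetical, not the actual util/retry package.

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryUntilShutdown is a hypothetical helper mirroring the loop in Example #2:
// keep calling work with growing backoff until it succeeds or quit is closed.
func retryUntilShutdown(quit <-chan struct{}, work func() error) bool {
	backoff := 50 * time.Millisecond
	for {
		err := work()
		if err == nil {
			return true
		}
		fmt.Println("attempt failed:", err)
		select {
		case <-quit:
			// Shutdown requested; give up rather than outliving the server.
			return false
		case <-time.After(backoff):
		}
		if backoff < 2*time.Second {
			backoff *= 2
		}
	}
}

func main() {
	quit := make(chan struct{})
	attempts := 0
	ok := retryUntilShutdown(quit, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("transient failure")
		}
		return nil
	})
	fmt.Println("succeeded:", ok, "after", attempts, "attempts")
}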
Example #3
// getLeaseForGossip tries to obtain a leader lease. Only one of the replicas
// should gossip; the bool returned indicates whether it's us.
func (r *Range) getLeaseForGossip(ctx context.Context) (bool, error) {
	// If no Gossip available (some tests) or range too fresh, noop.
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return false, util.Errorf("no gossip or range not initialized")
	}
	if !r.rm.Stopper().StartTask() {
		return false, util.Errorf("system is shutting down")
	}
	defer r.rm.Stopper().FinishTask()

	timestamp := r.rm.Clock().Now()

	// Check for or obtain the lease, if none active.
	err := r.redirectOnOrAcquireLeaderLease(timestamp)
	if err != nil {
		switch e := err.(type) {
		// NotLeaderError means there is an active lease, leaseRejectedError
		// means we tried to get one but someone beat us to it.
		case *proto.NotLeaderError, *proto.LeaseRejectedError:
		default:
			// Any other error is worth being logged visibly.
			log.Warningc(ctx, "could not acquire lease for range gossip: %s", e)
			return false, err
		}
	}
	return err == nil, nil
}
Example #4
func (r *Range) handleSkippedIntents(args proto.Request, intents []proto.Intent) {
	if len(intents) == 0 {
		return
	}

	ctx := r.context()
	stopper := r.rm.Stopper()
	stopper.RunAsyncTask(func() {
		err := r.rm.resolveWriteIntentError(ctx, &proto.WriteIntentError{
			Intents: intents,
		}, r, args, proto.CLEANUP_TXN)
		if wiErr, ok := err.(*proto.WriteIntentError); !ok || wiErr == nil || !wiErr.Resolved {
			log.Warningc(ctx, "failed to resolve on inconsistent read: %s", err)
		}
	})
}
Example #5
func (r *Replica) handleSkippedIntents(args proto.Request, intents []proto.Intent) {
	if len(intents) == 0 {
		return
	}

	ctx := r.context()
	stopper := r.rm.Stopper()
	// TODO(tschottdorf): There's a chance that #1684 will make a comeback
	// since intent resolution on commit has since moved to EndTransaction,
	// which returns (some of) them as skipped intents. If so, need to resolve
	// synchronously if we're not allowed to do async (or just launch
	// goroutines).
	stopper.RunAsyncTask(func() {
		err := r.rm.resolveWriteIntentError(ctx, &proto.WriteIntentError{
			Intents: intents,
		}, r, args, proto.CLEANUP_TXN)
		if wiErr, ok := err.(*proto.WriteIntentError); !ok || wiErr == nil || !wiErr.Resolved {
			log.Warningc(ctx, "failed to resolve on inconsistent read: %s", err)
		}
	})
}
Example #6
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
	if !n.ctx.LogRangeEvents {
		return
	}

	logEventType := sql.EventLogNodeRestart
	if n.initialBoot {
		logEventType = sql.EventLogNodeJoin
	}

	n.stopper.RunWorker(func() {
		for {
			if err := n.ctx.DB.Txn(func(txn *client.Txn) *roachpb.Error {
				return sql.MakeEventLogger(n.ctx.SQLExecutor.LeaseManager).InsertEventRecord(txn,
					logEventType,
					int32(n.Descriptor.NodeID),
					int32(n.Descriptor.NodeID),
					struct {
						Descriptor roachpb.NodeDescriptor
						ClusterID  uuid.UUID
						StartedAt  int64
					}{n.Descriptor, n.ClusterID, n.startedAt},
				)
			}); err != nil {
				log.Warningc(n.context(), "unable to log %s event for node %d: %s", logEventType, n.Descriptor.NodeID, err)
			} else {
				return
			}

			select {
			case <-n.stopper.ShouldStop():
				return
			default:
				// Break.
			}
		}
	})
}
Example #7
// addWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
// If wait is true, will block until the command is complete.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
	// Check the response cache in case this is a replay. This call
	// may block if the same command is already underway.
	header := args.Header()

	// Add the write to the command queue to gate subsequent overlapping
	// Commands until this command completes. Note that this must be
	// done before getting the max timestamp for the key(s), as
	// timestamp cache is only updated after preceding commands have
	// been run to successful completion.
	cmdKey := r.beginCmd(header, false)

	// This replica must have leader lease to process a write.
	if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Two important invariants of Cockroach: 1) encountering a more
	// recently written value means transaction restart. 2) values must
	// be written with a greater timestamp than the most recent read to
	// the same key. Check the timestamp cache for reads/writes which
	// are at least as recent as the timestamp of this write. For
	// writes, send WriteTooOldError; for reads, update the write's
	// timestamp. When the write returns, the updated timestamp will
	// inform the final commit timestamp.
	if usesTimestampCache(args) {
		r.Lock()
		rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
		r.Unlock()

		// Always push the timestamp forward if there's been a read which
		// occurred after our txn timestamp.
		if !rTS.Less(header.Timestamp) {
			header.Timestamp = rTS.Next()
		}
		// If there's a newer write timestamp...
		if !wTS.Less(header.Timestamp) {
			// If we're in a txn, set a write too old error in reply. We
			// still go ahead and try the write because we want to avoid
			// restarting the transaction in the event that there isn't an
			// intent or the intent can be pushed by us.
			if header.Txn != nil {
				err := &proto.WriteTooOldError{Timestamp: header.Timestamp, ExistingTimestamp: wTS}
				reply.Header().SetGoError(err)
			} else {
				// Otherwise, make sure we advance the request's timestamp.
				header.Timestamp = wTS.Next()
			}
		}
	}

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

	// Create a completion func for mandatory cleanups which we either
	// run synchronously if we're waiting or in a goroutine otherwise.
	completionFunc := func() error {
		// First wait for raft to commit or abort the command.
		var err error
		if err = <-errChan; err == nil {
			// Next if the command was committed, wait for the range to apply it.
			err = <-pendingCmd.done
		} else if err == multiraft.ErrGroupDeleted {
			// This error needs to be converted appropriately so that
			// clients will retry.
			err = proto.NewRangeNotFoundError(r.Desc().RaftID)
		}
		// As for reads, update timestamp cache with the timestamp
		// of this write on success. This ensures a strictly higher
		// timestamp for successive writes to the same key or key range.
		r.endCmd(cmdKey, args, err, false /* !readOnly */)
		return err
	}

	if wait {
		return completionFunc()
	}
	go func() {
		// If the original client didn't wait (e.g. resolve write intent),
		// log execution errors so they're surfaced somewhere.
		if err := completionFunc(); err != nil {
			// TODO(tschottdorf): possible security risk to log args.
			log.Warningc(ctx, "async execution of %v failed: %s", args, err)
		}
	}()
	return nil
}
Example #8
// processIntentsAsync asynchronously processes intents which were
// encountered during another command but did not interfere with the
// execution of that command. This occurs in two cases: inconsistent
// reads and EndTransaction (which queues its own external intents for
// processing via this method). The two cases are handled somewhat
// differently and would be better served by different entry points,
// but combining them simplifies the plumbing necessary in Replica.
func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) {
	if len(intents) == 0 {
		return
	}
	now := r.store.Clock().Now()
	ctx := r.context(context.TODO())
	stopper := r.store.Stopper()

	for _, item := range intents {
		if item.args.Method() != roachpb.EndTransaction {
			stopper.RunLimitedAsyncTask(ir.sem, func() {
				// Everything here is best effort; give up rather than waiting
				// too long (helps avoid deadlocks during test shutdown,
				// although this is imperfect due to the use of an
				// uninterruptible WaitGroup.Wait in beginCmds).
				ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
				defer cancel()
				h := roachpb.Header{Timestamp: now}
				resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout,
					item.intents, h, roachpb.PUSH_TOUCH, true /* skipInFlight */)

				// resolveIntents with poison=true because we're resolving
				// intents outside of the context of an EndTransaction.
				//
				// Naively, it doesn't seem like we need to poison the abort
				// cache since we're pushing with PUSH_TOUCH - meaning that
				// the primary way our Push leads to aborting intents is that
				// of the transaction having timed out (and thus presumably no
				// client being around any more, though at the time of writing
				// we don't guarantee that). But there's another path in which
				// the Push comes back successful, namely that of the
				// transaction already having been aborted by someone else, in
				// which case the client may still be running. Thus, we must
				// poison.
				if err := ir.resolveIntents(ctxWithTimeout, r, resolveIntents,
					true /* wait */, true /* poison */); err != nil {
					log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err)
					return
				}
				if pushErr != nil {
					log.Warningc(ctxWithTimeout, "failed to push during intent resolution: %s", pushErr)
					return
				}
			})
		} else { // EndTransaction
			stopper.RunLimitedAsyncTask(ir.sem, func() {
				ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
				defer cancel()

				// For EndTransaction, we know the transaction is finalized so
				// we can skip the push and go straight to the resolve.
				//
				// This mechanism assumes that when an EndTransaction fails,
				// the client makes no assumptions about the result. For
				// example, an attempt to explicitly rollback the transaction
				// may succeed (triggering this code path), but the result may
				// not make it back to the client.
				if err := ir.resolveIntents(ctxWithTimeout, r, item.intents,
					true /* wait */, false /* !poison */); err != nil {
					log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err)
					return
				}

				// We successfully resolved the intents, so we're able to GC from
				// the txn span directly.
				var ba roachpb.BatchRequest
				ba.Timestamp = now

				txn := item.intents[0].Txn
				gcArgs := roachpb.GCRequest{
					Span: roachpb.Span{
						Key:    r.Desc().StartKey.AsRawKey(),
						EndKey: r.Desc().EndKey.AsRawKey(),
					},
				}
				gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{
					Key: keys.TransactionKey(txn.Key, txn.ID),
				})
				ba.Add(&gcArgs)
				if _, pErr := r.addWriteCmd(ctxWithTimeout, ba, nil /* nil */); pErr != nil {
					log.Warningf("could not GC completed transaction: %s", pErr)
				}
			})
		}
	}
}
Example #9
// resolveIntents resolves the given intents. For those which are local to the
// range, we submit directly to the range-local Raft instance; the call returns
// as soon as all resolve commands have been **proposed** (not executed). This
// ensures that if a waiting client retries immediately after conflict
// resolution, it will not hit the same intents again. All non-local intents
// are resolved asynchronously in a batch.
// TODO(tschottdorf): once Txn records have a list of possibly open intents,
// resolveIntents should send an RPC to update the transaction(s) as well (for
// those intents with non-pending Txns).
func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) {
	trace := tracer.FromCtx(ctx)
	tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces
	trace.Event("resolving intents [async]")
	var wg sync.WaitGroup

	bArgs := &proto.BatchRequest{}
	bArgs.User = security.RootUser
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs proto.Request
		var local bool // whether this intent lives on this Range
		{
			header := proto.RequestHeader{
				// Use the pushee's timestamp, which might be lower than the
				// pusher's request timestamp. No need to push the intent higher
				// than the pushee's txn!
				Timestamp: intent.Txn.Timestamp,
				Key:       intent.Key,
				EndKey:    intent.EndKey,
				User:      security.RootUser,
				Txn:       &intent.Txn,
			}

			if len(intent.EndKey) == 0 {
				resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if !local {
			bArgs.Add(resolveArgs)
			continue
		}

		// If it is local, it goes directly into Raft.
		// TODO(tschottdorf): this may be premature optimization. Consider just
		// treating everything as an external request. This means having to
		// wait for complete execution of the command (whereas now we just wait
		// for proposition) and some more overhead sending things around.
		wg.Add(1)
		action := func() {
			// Trace this under the ID of the intent owner.
			ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn))
			if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) {
				log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err)
			}
		}
		if !r.rm.Stopper().RunAsyncTask(action) {
			// Still run the task. Our caller already has a task and going async
			// here again is merely for performance, but some intents need to
			// be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes
			// back.
			action()
		}
	}
	// Resolve all of the intents which aren't local to the Range. This is a
	// no-op if all are local.
	b := &client.Batch{}
	b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}})
	action := func() {
		// TODO(tschottdorf): no tracing here yet. Probably useful at some point,
		// but needs a) the corresponding interface and b) facilities for tracing
		// multiple tracees at the same time (batch full of possibly individual
		// txns).
		if err := r.rm.DB().Run(b); err != nil {
			if log.V(1) {
				log.Infoc(ctx, "%s", err)
			}
		}
	}
	if !r.rm.Stopper().RunAsyncTask(action) {
		// As with local intents, try async to not keep the caller waiting, but
		// when draining just go ahead and do it synchronously. See #1684.
		action()
	}

	// Wait until all the local `ResolveIntent`s have been submitted to raft.
	// No-op if all were external.
	wg.Wait()
}
Example #10
// processIntentsAsync asynchronously processes intents which were
// encountered during another command but did not interfere with the
// execution of that command. This occurs in two cases: inconsistent
// reads and EndTransaction (which queues its own external intents for
// processing via this method). The two cases are handled somewhat
// differently and would be better served by different entry points,
// but combining them simplifies the plumbing necessary in Replica.
func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) {
	if len(intents) == 0 {
		return
	}
	now := r.store.Clock().Now()
	ctx := r.context()
	stopper := r.store.Stopper()

	for _, item := range intents {
		if item.args.Method() != roachpb.EndTransaction {
			stopper.RunLimitedAsyncTask(ir.sem, func() {
				// Everything here is best effort; give up rather than waiting
				// too long (helps avoid deadlocks during test shutdown,
				// although this is imperfect due to the use of an
				// uninterruptible WaitGroup.Wait in beginCmds).
				ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
				defer cancel()
				h := roachpb.Header{Timestamp: now}
				resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout,
					item.intents, h, roachpb.PUSH_TOUCH, true /* skipInFlight */)
				if pErr := ir.resolveIntents(ctxWithTimeout, r, resolveIntents,
					true /* wait */, false /* TODO(tschottdorf): #5088 */); pErr != nil {
					log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", pErr)
					return
				}
				if pushErr != nil {
					log.Warningc(ctxWithTimeout, "failed to push during intent resolution: %s", pushErr)
					return
				}
			})
		} else { // EndTransaction
			stopper.RunLimitedAsyncTask(ir.sem, func() {
				ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
				defer cancel()

				// For EndTransaction, we know the transaction is finalized so
				// we can skip the push and go straight to the resolve.
				if pErr := ir.resolveIntents(ctxWithTimeout, r, item.intents,
					true /* wait */, false /* TODO(tschottdorf): #5088 */); pErr != nil {
					log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", pErr)
					return
				}

				// We successfully resolved the intents, so we're able to GC from
				// the txn span directly. Note that the sequence cache was cleared
				// out synchronously with EndTransaction (see comments within for
				// an explanation of why that is kosher).
				//
				// Note that we poisoned the sequence caches on the external ranges
				// above. This may seem counter-intuitive, but it's actually
				// necessary: Assume a transaction has committed here, with two
				// external intents, and assume that we did not poison. Normally,
				// these two intents would be resolved in the same batch, but that
				// is not guaranteed (for example, if DistSender has a stale
				// descriptor after a Merge). When resolved separately, the first
				// ResolveIntent would clear out the sequence cache; an individual
				// write on the second (still present) intent could then be
				// replayed and would resolve to a real value (at least for a
				// window of time unless we delete the local txn entry). That's not
				// OK for non-idempotent commands such as Increment.
				// TODO(tschottdorf): We should have another side effect on
				// MVCCResolveIntent (on commit/abort): If it were able to remove
				// the txn from its corresponding entries in the timestamp cache,
				// no more replays at the same timestamp would be possible. This
				// appears to be a useful performance optimization; we could then
				// not poison on EndTransaction. In fact, the above mechanism
				// could be an effective alternative to sequence-cache based
				// poisoning (or the whole sequence cache?) itself.
				//
				// TODO(tschottdorf): down the road, can probably unclog the system
				// here by batching up a bunch of those GCRequests before proposing.
				var ba roachpb.BatchRequest
				txn := item.intents[0].Txn
				gcArgs := roachpb.GCRequest{
					Span: roachpb.Span{
						Key:    r.Desc().StartKey.AsRawKey(),
						EndKey: r.Desc().EndKey.AsRawKey(),
					},
				}
				gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{
					Key: keys.TransactionKey(txn.Key, txn.ID),
				})
				ba.Add(&gcArgs)
				if _, pErr := r.addWriteCmd(ctxWithTimeout, ba, nil /* nil */); pErr != nil {
					log.Warningf("could not GC completed transaction: %s", pErr)
				}
			})
		}
	}
}