// getLeaseForGossip tries to obtain a leader lease. Only one of the replicas
// should gossip; the bool returned indicates whether it's us.
func (r *Range) getLeaseForGossip(ctx context.Context) (bool, error) {
    // If no Gossip available (some tests) or range too fresh, noop.
    if r.rm.Gossip() == nil || !r.isInitialized() {
        return false, util.Errorf("no gossip or range not initialized")
    }
    var hasLease bool
    var err error
    if !r.rm.Stopper().RunTask(func() {
        timestamp := r.rm.Clock().Now()
        // Check for or obtain the lease, if none active.
        err = r.redirectOnOrAcquireLeaderLease(tracer.FromCtx(ctx), timestamp)
        hasLease = err == nil
        if err != nil {
            switch e := err.(type) {
            // NotLeaderError means there is an active lease, leaseRejectedError
            // means we tried to get one but someone beat us to it.
            case *proto.NotLeaderError, *proto.LeaseRejectedError:
                err = nil
            default:
                // Any other error is worth being logged visibly.
                log.Warningc(ctx, "could not acquire lease for range gossip: %s", e)
            }
        }
    }) {
        err = util.Errorf("node is stopping")
    }
    return hasLease, err
}
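To make the shape of Stopper.RunTask concrete: the call runs the closure only if the node isn't shutting down, and its return value reports whether the closure ran at all. Below is a minimal, self-contained sketch of such a gate; it is illustrative only and not CockroachDB's util/stop package.

package main

import (
    "fmt"
    "sync"
)

// Stopper is a minimal sketch of a task gate: RunTask refuses new work once
// Stop has been called, and Stop waits for in-flight tasks to finish.
type Stopper struct {
    mu       sync.Mutex
    wg       sync.WaitGroup
    stopping bool
}

// RunTask runs f synchronously and returns true if the stopper still accepts
// work; it returns false (without running f) if we are shutting down.
func (s *Stopper) RunTask(f func()) bool {
    s.mu.Lock()
    if s.stopping {
        s.mu.Unlock()
        return false
    }
    s.wg.Add(1)
    s.mu.Unlock()

    defer s.wg.Done()
    f()
    return true
}

// Stop refuses new tasks and blocks until all in-flight tasks complete.
func (s *Stopper) Stop() {
    s.mu.Lock()
    s.stopping = true
    s.mu.Unlock()
    s.wg.Wait()
}

func main() {
    s := &Stopper{}
    if !s.RunTask(func() { fmt.Println("task ran") }) {
        fmt.Println("node is stopping")
    }
    s.Stop()
    if !s.RunTask(func() { fmt.Println("never runs") }) {
        fmt.Println("node is stopping")
    }
}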
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
    if !n.ctx.LogRangeEvents {
        return
    }

    logEventType := sql.EventLogNodeRestart
    if n.initialBoot {
        logEventType = sql.EventLogNodeJoin
    }

    n.stopper.RunWorker(func() {
        for r := retry.Start(retry.Options{Closer: n.stopper.ShouldStop()}); r.Next(); {
            if err := n.ctx.DB.Txn(func(txn *client.Txn) error {
                return n.eventLogger.InsertEventRecord(txn,
                    logEventType,
                    int32(n.Descriptor.NodeID),
                    int32(n.Descriptor.NodeID),
                    struct {
                        Descriptor roachpb.NodeDescriptor
                        ClusterID  uuid.UUID
                        StartedAt  int64
                    }{n.Descriptor, n.ClusterID, n.startedAt},
                )
            }); err != nil {
                log.Warningc(n.context(context.TODO()), "unable to log %s event for node %d: %s", logEventType, n.Descriptor.NodeID, err)
            } else {
                return
            }
        }
    })
}
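The retry.Start loop above keeps retrying the event-log transaction until it either succeeds or the stopper's ShouldStop channel closes (via the Closer option). A standalone sketch of that retry-until-success-or-closed shape, with a hypothetical op function and a plain channel standing in for ShouldStop():

package main

import (
    "errors"
    "fmt"
    "time"
)

// retryUntilSuccessOrClosed retries op with a fixed backoff until it succeeds
// or the closer channel is closed, mirroring the retry loop above. The op
// function and the backoff value are illustrative stand-ins.
func retryUntilSuccessOrClosed(op func() error, backoff time.Duration, closer <-chan struct{}) {
    for {
        if err := op(); err == nil {
            return
        }
        select {
        case <-closer:
            return // server is stopping; give up
        case <-time.After(backoff):
            // retry
        }
    }
}

func main() {
    closer := make(chan struct{})
    attempts := 0
    op := func() error {
        attempts++
        if attempts < 3 {
            return errors.New("transient failure")
        }
        return nil
    }
    retryUntilSuccessOrClosed(op, 10*time.Millisecond, closer)
    fmt.Println("succeeded after", attempts, "attempts")
}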
// getLeaseForGossip tries to obtain a leader lease. Only one of the replicas
// should gossip; the bool returned indicates whether it's us.
func (r *Range) getLeaseForGossip(ctx context.Context) (bool, error) {
    // If no Gossip available (some tests) or range too fresh, noop.
    if r.rm.Gossip() == nil || !r.isInitialized() {
        return false, util.Errorf("no gossip or range not initialized")
    }
    if !r.rm.Stopper().StartTask() {
        return false, util.Errorf("system is shutting down")
    }
    defer r.rm.Stopper().FinishTask()
    timestamp := r.rm.Clock().Now()
    // Check for or obtain the lease, if none active.
    err := r.redirectOnOrAcquireLeaderLease(timestamp)
    if err != nil {
        switch e := err.(type) {
        // NotLeaderError means there is an active lease, leaseRejectedError
        // means we tried to get one but someone beat us to it.
        case *proto.NotLeaderError, *proto.LeaseRejectedError:
        default:
            // Any other error is worth being logged visibly.
            log.Warningc(ctx, "could not acquire lease for range gossip: %s", e)
            return false, err
        }
    }
    return err == nil, nil
}
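This is the older Start/Finish shape of the same gate: reserve a slot, defer the release, then do the work inline. A toy version showing how the closure-based RunTask reduces to a StartTask/FinishTask pair (illustrative names and implementation, not the real util/stop API):

package main

import (
    "fmt"
    "sync"
)

// taskGate sketches the StartTask/FinishTask style: callers reserve a slot
// before doing work inline and release it when done.
type taskGate struct {
    mu       sync.Mutex
    wg       sync.WaitGroup
    draining bool
}

// StartTask returns false if the gate is draining; otherwise it registers one
// in-flight task and returns true.
func (g *taskGate) StartTask() bool {
    g.mu.Lock()
    defer g.mu.Unlock()
    if g.draining {
        return false
    }
    g.wg.Add(1)
    return true
}

// FinishTask marks one in-flight task as done.
func (g *taskGate) FinishTask() { g.wg.Done() }

// RunTask shows how the closure-based form reduces to the Start/Finish pair.
func (g *taskGate) RunTask(f func()) bool {
    if !g.StartTask() {
        return false
    }
    defer g.FinishTask()
    f()
    return true
}

func main() {
    g := &taskGate{}
    if g.StartTask() {
        defer g.FinishTask()
        fmt.Println("doing work inline, gate held")
    }
    _ = g.RunTask(func() { fmt.Println("same thing, closure form") })
}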
func (r *Range) handleSkippedIntents(args proto.Request, intents []proto.Intent) {
    if len(intents) == 0 {
        return
    }

    ctx := r.context()
    stopper := r.rm.Stopper()
    stopper.RunAsyncTask(func() {
        err := r.rm.resolveWriteIntentError(ctx, &proto.WriteIntentError{
            Intents: intents,
        }, r, args, proto.CLEANUP_TXN)
        if wiErr, ok := err.(*proto.WriteIntentError); !ok || wiErr == nil || !wiErr.Resolved {
            log.Warningc(ctx, "failed to resolve on inconsistent read: %s", err)
        }
    })
}
func (r *Replica) handleSkippedIntents(args proto.Request, intents []proto.Intent) {
    if len(intents) == 0 {
        return
    }

    ctx := r.context()
    stopper := r.rm.Stopper()
    // TODO(tschottdorf): There's a chance that #1684 will make a comeback
    // since intent resolution on commit has since moved to EndTransaction,
    // which returns (some of) them as skipped intents. If so, need to resolve
    // synchronously if we're not allowed to do async (or just launch
    // goroutines).
    stopper.RunAsyncTask(func() {
        err := r.rm.resolveWriteIntentError(ctx, &proto.WriteIntentError{
            Intents: intents,
        }, r, args, proto.CLEANUP_TXN)
        if wiErr, ok := err.(*proto.WriteIntentError); !ok || wiErr == nil || !wiErr.Resolved {
            log.Warningc(ctx, "failed to resolve on inconsistent read: %s", err)
        }
    })
}
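In both versions the cleanup is handed to RunAsyncTask and the read path returns immediately. A minimal sketch of such a fire-and-forget runner, which refuses new tasks while draining but waits for those already launched (again an illustration, not the actual stopper):

package main

import (
    "fmt"
    "sync"
    "time"
)

// asyncRunner sketches RunAsyncTask-style execution: the task runs in its own
// goroutine, but only if we are not shutting down, and shutdown waits for
// tasks already launched.
type asyncRunner struct {
    mu       sync.Mutex
    wg       sync.WaitGroup
    draining bool
}

// RunAsyncTask launches f in a goroutine and returns true, or returns false
// without running f if the runner is draining.
func (a *asyncRunner) RunAsyncTask(f func()) bool {
    a.mu.Lock()
    if a.draining {
        a.mu.Unlock()
        return false
    }
    a.wg.Add(1)
    a.mu.Unlock()

    go func() {
        defer a.wg.Done()
        f()
    }()
    return true
}

// Drain refuses new tasks and waits for in-flight ones.
func (a *asyncRunner) Drain() {
    a.mu.Lock()
    a.draining = true
    a.mu.Unlock()
    a.wg.Wait()
}

func main() {
    a := &asyncRunner{}
    a.RunAsyncTask(func() {
        time.Sleep(10 * time.Millisecond)
        fmt.Println("intent cleanup finished in the background")
    })
    a.Drain() // waits for the task above before shutting down
}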
// recordJoinEvent begins an asynchronous task which attempts to log a "node
// join" or "node restart" event. This query will retry until it succeeds or the
// server stops.
func (n *Node) recordJoinEvent() {
    if !n.ctx.LogRangeEvents {
        return
    }

    logEventType := sql.EventLogNodeRestart
    if n.initialBoot {
        logEventType = sql.EventLogNodeJoin
    }

    n.stopper.RunWorker(func() {
        for {
            if err := n.ctx.DB.Txn(func(txn *client.Txn) *roachpb.Error {
                return sql.MakeEventLogger(n.ctx.SQLExecutor.LeaseManager).InsertEventRecord(txn,
                    logEventType,
                    int32(n.Descriptor.NodeID),
                    int32(n.Descriptor.NodeID),
                    struct {
                        Descriptor roachpb.NodeDescriptor
                        ClusterID  uuid.UUID
                        StartedAt  int64
                    }{n.Descriptor, n.ClusterID, n.startedAt},
                )
            }); err != nil {
                log.Warningc(n.context(), "unable to log %s event for node %d: %s", logEventType, n.Descriptor.NodeID, err)
            } else {
                return
            }
            select {
            case <-n.stopper.ShouldStop():
                return
            default:
                // Break.
            }
        }
    })
}
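Instead of a retry helper, this version checks for shutdown between attempts with a non-blocking select: if ShouldStop hasn't fired, the default case falls through to the next iteration. The check in isolation, with a plain channel standing in for stopper.ShouldStop():

package main

import (
    "fmt"
    "time"
)

func main() {
    stop := make(chan struct{}) // stand-in for stopper.ShouldStop()
    go func() {
        time.Sleep(25 * time.Millisecond)
        close(stop)
    }()

    for attempt := 1; ; attempt++ {
        fmt.Println("attempt", attempt, "failed, will retry")
        time.Sleep(10 * time.Millisecond)

        // Non-blocking shutdown check: if stop has been closed, exit the
        // retry loop; otherwise fall through to the next attempt.
        select {
        case <-stop:
            fmt.Println("server stopping; giving up")
            return
        default:
            // Keep retrying.
        }
    }
}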
// addWriteCmd first consults the response cache to determine whether
// this command has already been sent to the range. If a response is
// found, it's returned immediately and not submitted to raft. Next,
// the timestamp cache is checked to determine if any newer accesses to
// this command's affected keys have been made. If so, this command's
// timestamp is moved forward. Finally the keys affected by this
// command are added as pending writes to the read queue and the
// command is submitted to Raft. Upon completion, the write is removed
// from the read queue and the reply is added to the response cache.
// If wait is true, will block until the command is complete.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
    // Check the response cache in case this is a replay. This call
    // may block if the same command is already underway.
    header := args.Header()

    // Add the write to the command queue to gate subsequent overlapping
    // Commands until this command completes. Note that this must be
    // done before getting the max timestamp for the key(s), as
    // timestamp cache is only updated after preceding commands have
    // been run to successful completion.
    cmdKey := r.beginCmd(header, false)

    // This replica must have leader lease to process a write.
    if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
        r.endCmd(cmdKey, args, err, false /* !readOnly */)
        reply.Header().SetGoError(err)
        return err
    }

    // Two important invariants of Cockroach: 1) encountering a more
    // recently written value means transaction restart. 2) values must
    // be written with a greater timestamp than the most recent read to
    // the same key. Check the timestamp cache for reads/writes which
    // are at least as recent as the timestamp of this write. For
    // writes, send WriteTooOldError; for reads, update the write's
    // timestamp. When the write returns, the updated timestamp will
    // inform the final commit timestamp.
    if usesTimestampCache(args) {
        r.Lock()
        rTS, wTS := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
        r.Unlock()

        // Always push the timestamp forward if there's been a read which
        // occurred after our txn timestamp.
        if !rTS.Less(header.Timestamp) {
            header.Timestamp = rTS.Next()
        }
        // If there's a newer write timestamp...
        if !wTS.Less(header.Timestamp) {
            // If we're in a txn, set a write too old error in reply. We
            // still go ahead and try the write because we want to avoid
            // restarting the transaction in the event that there isn't an
            // intent or the intent can be pushed by us.
            if header.Txn != nil {
                err := &proto.WriteTooOldError{Timestamp: header.Timestamp, ExistingTimestamp: wTS}
                reply.Header().SetGoError(err)
            } else {
                // Otherwise, make sure we advance the request's timestamp.
                header.Timestamp = wTS.Next()
            }
        }
    }

    errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

    // Create a completion func for mandatory cleanups which we either
    // run synchronously if we're waiting or in a goroutine otherwise.
    completionFunc := func() error {
        // First wait for raft to commit or abort the command.
        var err error
        if err = <-errChan; err == nil {
            // Next if the command was committed, wait for the range to apply it.
            err = <-pendingCmd.done
        } else if err == multiraft.ErrGroupDeleted {
            // This error needs to be converted appropriately so that
            // clients will retry.
            err = proto.NewRangeNotFoundError(r.Desc().RaftID)
        }
        // As for reads, update timestamp cache with the timestamp
        // of this write on success. This ensures a strictly higher
        // timestamp for successive writes to the same key or key range.
        r.endCmd(cmdKey, args, err, false /* !readOnly */)
        return err
    }

    if wait {
        return completionFunc()
    }
    go func() {
        // If the original client didn't wait (e.g. resolve write intent),
        // log execution errors so they're surfaced somewhere.
        if err := completionFunc(); err != nil {
            // TODO(tschottdorf): possible security risk to log args.
            log.Warningc(ctx, "async execution of %v failed: %s", args, err)
        }
    }()
    return nil
}
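The tail of addWriteCmd wraps the mandatory cleanup in a completion closure that is either awaited inline (wait == true) or run in a goroutine that only logs failures. A reduced skeleton of that sync-or-async pattern, with a placeholder result channel standing in for the Raft plumbing:

package main

import (
    "errors"
    "fmt"
)

// submit mimics the shape of addWriteCmd's tail: the command has been proposed
// and will report its outcome on errChan; completion performs the mandatory
// cleanup in both the waiting and the fire-and-forget case. errChan and the
// cleanup step are placeholders for the real Raft plumbing.
func submit(errChan <-chan error, wait bool) error {
    completion := func() error {
        err := <-errChan // wait for the proposal to commit or abort
        fmt.Println("cleanup runs regardless of outcome")
        return err
    }

    if wait {
        return completion()
    }
    go func() {
        // The caller isn't waiting, so surface errors in the log instead.
        if err := completion(); err != nil {
            fmt.Println("async execution failed:", err)
        }
    }()
    return nil
}

func main() {
    ch := make(chan error, 1)
    ch <- errors.New("raft group deleted")
    if err := submit(ch, true /* wait */); err != nil {
        fmt.Println("synchronous caller sees:", err)
    }
}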
// processIntentsAsync asynchronously processes intents which were
// encountered during another command but did not interfere with the
// execution of that command. This occurs in two cases: inconsistent
// reads and EndTransaction (which queues its own external intents for
// processing via this method). The two cases are handled somewhat
// differently and would be better served by different entry points,
// but combining them simplifies the plumbing necessary in Replica.
func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) {
    if len(intents) == 0 {
        return
    }
    now := r.store.Clock().Now()
    ctx := r.context(context.TODO())
    stopper := r.store.Stopper()

    for _, item := range intents {
        if item.args.Method() != roachpb.EndTransaction {
            stopper.RunLimitedAsyncTask(ir.sem, func() {
                // Everything here is best effort; give up rather than waiting
                // too long (helps avoid deadlocks during test shutdown,
                // although this is imperfect due to the use of an
                // uninterruptible WaitGroup.Wait in beginCmds).
                ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
                defer cancel()
                h := roachpb.Header{Timestamp: now}
                resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout, item.intents, h,
                    roachpb.PUSH_TOUCH, true /* skipInFlight */)

                // resolveIntents with poison=true because we're resolving
                // intents outside of the context of an EndTransaction.
                //
                // Naively, it doesn't seem like we need to poison the abort
                // cache since we're pushing with PUSH_TOUCH - meaning that
                // the primary way our Push leads to aborting intents is that
                // of the transaction having timed out (and thus presumably no
                // client being around any more, though at the time of writing
                // we don't guarantee that). But there's another path in which
                // the Push comes back successful, namely that of the
                // transaction already having been aborted by someone else, in
                // which case the client may still be running. Thus, we must
                // poison.
                if err := ir.resolveIntents(ctxWithTimeout, r, resolveIntents,
                    true /* wait */, true /* poison */); err != nil {
                    log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err)
                    return
                }
                if pushErr != nil {
                    log.Warningc(ctxWithTimeout, "failed to push during intent resolution: %s", pushErr)
                    return
                }
            })
        } else { // EndTransaction
            stopper.RunLimitedAsyncTask(ir.sem, func() {
                ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
                defer cancel()

                // For EndTransaction, we know the transaction is finalized so
                // we can skip the push and go straight to the resolve.
                //
                // This mechanism assumes that when an EndTransaction fails,
                // the client makes no assumptions about the result. For
                // example, an attempt to explicitly rollback the transaction
                // may succeed (triggering this code path), but the result may
                // not make it back to the client.
                if err := ir.resolveIntents(ctxWithTimeout, r, item.intents,
                    true /* wait */, false /* !poison */); err != nil {
                    log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", err)
                    return
                }

                // We successfully resolved the intents, so we're able to GC from
                // the txn span directly.
                var ba roachpb.BatchRequest
                ba.Timestamp = now
                txn := item.intents[0].Txn
                gcArgs := roachpb.GCRequest{
                    Span: roachpb.Span{
                        Key:    r.Desc().StartKey.AsRawKey(),
                        EndKey: r.Desc().EndKey.AsRawKey(),
                    },
                }
                gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{
                    Key: keys.TransactionKey(txn.Key, txn.ID),
                })
                ba.Add(&gcArgs)
                if _, pErr := r.addWriteCmd(ctxWithTimeout, ba, nil /* nil */); pErr != nil {
                    log.Warningf("could not GC completed transaction: %s", pErr)
                }
            })
        }
    }
}
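RunLimitedAsyncTask bounds how many of these cleanup goroutines run at once using the ir.sem semaphore. The conventional Go sketch of such a bounded runner uses a buffered channel as a counting semaphore; the helper below is illustrative, not the real util/stop implementation:

package main

import (
    "fmt"
    "sync"
    "time"
)

// runLimitedAsyncTask launches f in a goroutine, but only after acquiring a
// slot from sem (a buffered channel used as a counting semaphore), so at most
// cap(sem) tasks run concurrently. wg is only here so main can wait.
func runLimitedAsyncTask(sem chan struct{}, wg *sync.WaitGroup, f func()) {
    sem <- struct{}{} // blocks while cap(sem) tasks are already running
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer func() { <-sem }() // release the slot
        f()
    }()
}

func main() {
    sem := make(chan struct{}, 2) // at most two intent-resolution tasks at once
    var wg sync.WaitGroup
    for i := 0; i < 5; i++ {
        i := i
        runLimitedAsyncTask(sem, &wg, func() {
            fmt.Println("resolving intent batch", i)
            time.Sleep(10 * time.Millisecond)
        })
    }
    wg.Wait()
}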
// resolveIntents resolves the given intents. For those which are local to the
// range, we submit directly to the range-local Raft instance; the call returns
// as soon as all resolve commands have been **proposed** (not executed). This
// ensures that if a waiting client retries immediately after conflict
// resolution, it will not hit the same intents again. All non-local intents
// are resolved asynchronously in a batch.
// TODO(tschottdorf): once Txn records have a list of possibly open intents,
// resolveIntents should send an RPC to update the transaction(s) as well (for
// those intents with non-pending Txns).
func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) {
    trace := tracer.FromCtx(ctx)
    tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces
    trace.Event("resolving intents [async]")
    var wg sync.WaitGroup

    bArgs := &proto.BatchRequest{}
    bArgs.User = security.RootUser
    for i := range intents {
        intent := intents[i] // avoids a race in `i, intent := range ...`
        var resolveArgs proto.Request
        var local bool // whether this intent lives on this Range
        {
            header := proto.RequestHeader{
                // Use the pushee's timestamp, which might be lower than the
                // pusher's request timestamp. No need to push the intent higher
                // than the pushee's txn!
                Timestamp: intent.Txn.Timestamp,
                Key:       intent.Key,
                EndKey:    intent.EndKey,
                User:      security.RootUser,
                Txn:       &intent.Txn,
            }

            if len(intent.EndKey) == 0 {
                resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header}
                local = r.ContainsKey(intent.Key)
            } else {
                resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header}
                local = r.ContainsKeyRange(intent.Key, intent.EndKey)
            }
        }

        // If the intent isn't (completely) local, we'll need to send an external request.
        // We'll batch them all up and send at the end.
        if !local {
            bArgs.Add(resolveArgs)
            continue
        }

        // If it is local, it goes directly into Raft.
        // TODO(tschottdorf): this may be premature optimization. Consider just
        // treating everything as an external request. This means having to
        // wait for complete execution of the command (whereas now we just wait
        // for proposition) and some more overhead sending things around.
        wg.Add(1)
        action := func() {
            // Trace this under the ID of the intent owner.
            ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn))
            if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) {
                log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err)
            }
        }
        if !r.rm.Stopper().RunAsyncTask(action) {
            // Still run the task. Our caller already has a task and going async
            // here again is merely for performance, but some intents need to
            // be resolved because they might block other tasks. See #1684.
            // Note that handleSkippedIntents has a TODO in case #1684 comes
            // back.
            action()
        }
    }

    // Resolve all of the intents which aren't local to the Range. This is a
    // no-op if all are local.
    b := &client.Batch{}
    b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}})
    action := func() {
        // TODO(tschottdorf): no tracing here yet. Probably useful at some point,
        // but needs a) the corresponding interface and b) facilities for tracing
        // multiple tracees at the same time (batch full of possibly individual
        // txns).
        if err := r.rm.DB().Run(b); err != nil {
            if log.V(1) {
                log.Infoc(ctx, "%s", err)
            }
        }
    }
    if !r.rm.Stopper().RunAsyncTask(action) {
        // As with local intents, try async to not keep the caller waiting, but
        // when draining just go ahead and do it synchronously. See #1684.
        action()
    }

    // Wait until all the local `ResolveIntent`s have been submitted to raft.
    // No-op if all were external.
    wg.Wait()
}
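resolveIntents splits the work in two: local intents are proposed directly and tracked by a WaitGroup that is released at proposal time, while non-local intents are accumulated into one batch that is sent asynchronously. A skeleton of that control flow, with placeholder classification, proposal, and batch-send steps:

package main

import (
    "fmt"
    "sync"
)

type intent struct {
    key   string
    local bool // whether the intent lives on this range (placeholder test)
}

// resolve mirrors the shape of resolveIntents: local intents are handed to
// propose (tracked by wg, released once "proposed"), remote intents are
// collected into one batch and sent in the background. propose and the batch
// goroutine stand in for the Raft proposal and the external RPC batch.
func resolve(intents []intent) {
    var wg sync.WaitGroup
    var remote []intent

    propose := func(it intent, wg *sync.WaitGroup) {
        defer wg.Done() // released at "proposal", not at execution
        fmt.Println("proposed local resolve for", it.key)
    }

    for _, it := range intents {
        if !it.local {
            remote = append(remote, it) // batch up external work
            continue
        }
        wg.Add(1)
        go propose(it, &wg)
    }

    go func() {
        // The external batch is fully asynchronous; just as in the original,
        // the caller never waits for it, so it may outlive this function.
        if len(remote) > 0 {
            fmt.Println("sending batch of", len(remote), "remote resolves")
        }
    }()

    wg.Wait() // only the local proposals gate the caller
}

func main() {
    resolve([]intent{
        {key: "a", local: true},
        {key: "b", local: false},
        {key: "c", local: true},
    })
}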
// processIntentsAsync asynchronously processes intents which were
// encountered during another command but did not interfere with the
// execution of that command. This occurs in two cases: inconsistent
// reads and EndTransaction (which queues its own external intents for
// processing via this method). The two cases are handled somewhat
// differently and would be better served by different entry points,
// but combining them simplifies the plumbing necessary in Replica.
func (ir *intentResolver) processIntentsAsync(r *Replica, intents []intentsWithArg) {
    if len(intents) == 0 {
        return
    }
    now := r.store.Clock().Now()
    ctx := r.context()
    stopper := r.store.Stopper()

    for _, item := range intents {
        if item.args.Method() != roachpb.EndTransaction {
            stopper.RunLimitedAsyncTask(ir.sem, func() {
                // Everything here is best effort; give up rather than waiting
                // too long (helps avoid deadlocks during test shutdown,
                // although this is imperfect due to the use of an
                // uninterruptible WaitGroup.Wait in beginCmds).
                ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
                defer cancel()
                h := roachpb.Header{Timestamp: now}
                resolveIntents, pushErr := ir.maybePushTransactions(ctxWithTimeout, item.intents, h,
                    roachpb.PUSH_TOUCH, true /* skipInFlight */)
                if pErr := ir.resolveIntents(ctxWithTimeout, r, resolveIntents,
                    true /* wait */, false /* TODO(tschottdorf): #5088 */); pErr != nil {
                    log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", pErr)
                    return
                }
                if pushErr != nil {
                    log.Warningc(ctxWithTimeout, "failed to push during intent resolution: %s", pushErr)
                    return
                }
            })
        } else { // EndTransaction
            stopper.RunLimitedAsyncTask(ir.sem, func() {
                ctxWithTimeout, cancel := context.WithTimeout(ctx, base.NetworkTimeout)
                defer cancel()

                // For EndTransaction, we know the transaction is finalized so
                // we can skip the push and go straight to the resolve.
                if pErr := ir.resolveIntents(ctxWithTimeout, r, item.intents,
                    true /* wait */, false /* TODO(tschottdorf): #5088 */); pErr != nil {
                    log.Warningc(ctxWithTimeout, "failed to resolve intents: %s", pErr)
                    return
                }

                // We successfully resolved the intents, so we're able to GC from
                // the txn span directly. Note that the sequence cache was cleared
                // out synchronously with EndTransaction (see comments within for
                // an explanation of why that is kosher).
                //
                // Note that we poisoned the sequence caches on the external ranges
                // above. This may seem counter-intuitive, but it's actually
                // necessary: Assume a transaction has committed here, with two
                // external intents, and assume that we did not poison. Normally,
                // these two intents would be resolved in the same batch, but that
                // is not guaranteed (for example, if DistSender has a stale
                // descriptor after a Merge). When resolved separately, the first
                // ResolveIntent would clear out the sequence cache; an individual
                // write on the second (still present) intent could then be
                // replayed and would resolve to a real value (at least for a
                // window of time unless we delete the local txn entry). That's not
                // OK for non-idempotent commands such as Increment.
                // TODO(tschottdorf): We should have another side effect on
                // MVCCResolveIntent (on commit/abort): If it were able to remove
                // the txn from its corresponding entries in the timestamp cache,
                // no more replays at the same timestamp would be possible. This
                // appears to be a useful performance optimization; we could then
                // not poison on EndTransaction. In fact, the above mechanism
                // could be an effective alternative to sequence-cache based
                // poisoning (or the whole sequence cache?) itself.
                //
                // TODO(tschottdorf): down the road, can probably unclog the system
                // here by batching up a bunch of those GCRequests before proposing.
                var ba roachpb.BatchRequest
                txn := item.intents[0].Txn
                gcArgs := roachpb.GCRequest{
                    Span: roachpb.Span{
                        Key:    r.Desc().StartKey.AsRawKey(),
                        EndKey: r.Desc().EndKey.AsRawKey(),
                    },
                }
                gcArgs.Keys = append(gcArgs.Keys, roachpb.GCRequest_GCKey{
                    Key: keys.TransactionKey(txn.Key, txn.ID),
                })
                ba.Add(&gcArgs)
                if _, pErr := r.addWriteCmd(ctxWithTimeout, ba, nil /* nil */); pErr != nil {
                    log.Warningf("could not GC completed transaction: %s", pErr)
                }
            })
        }
    }
}
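The trailing TODO suggests amortizing these per-transaction GC proposals by batching keys before proposing. A generic sketch of that batching idea, independent of the roachpb types (the flush callback stands in for building and proposing an actual GCRequest):

package main

import "fmt"

// gcBatcher sketches the batching idea from the TODO: instead of proposing one
// GC command per finished transaction, collect the transaction keys and flush
// them in a single proposal once enough have accumulated.
type gcBatcher struct {
    keys    []string
    maxSize int
    flush   func(keys []string)
}

func (b *gcBatcher) add(key string) {
    b.keys = append(b.keys, key)
    if len(b.keys) >= b.maxSize {
        b.Flush()
    }
}

// Flush proposes whatever has accumulated and resets the batch.
func (b *gcBatcher) Flush() {
    if len(b.keys) == 0 {
        return
    }
    b.flush(b.keys)
    b.keys = nil
}

func main() {
    b := &gcBatcher{
        maxSize: 3,
        flush: func(keys []string) {
            fmt.Println("proposing one GC command for", len(keys), "txn records:", keys)
        },
    }
    for _, k := range []string{"txn/a", "txn/b", "txn/c", "txn/d"} {
        b.add(k)
    }
    b.Flush() // flush the remainder
}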