// maybeGossipFirstRange adds the sentinel and first range metadata to gossip
// if this is the first range and a leader lease can be obtained. The Store
// calls this periodically on first range replicas.
func (r *Range) maybeGossipFirstRange() error {
	if !r.IsFirstRange() {
		return nil
	}
	ctx := r.context()

	// Gossip the cluster ID from all replicas of the first range.
	log.Infoc(ctx, "gossiping cluster id %s from store %d, range %d", r.rm.ClusterID(), r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeyClusterID, r.rm.ClusterID(), clusterIDGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip cluster ID: %s", err)
	}

	if ok, err := r.getLeaseForGossip(ctx); !ok || err != nil {
		return err
	}

	log.Infoc(ctx, "gossiping sentinel from store %d, range %d", r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeySentinel, r.rm.ClusterID(), clusterIDGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip sentinel: %s", err)
	}

	log.Infoc(ctx, "gossiping first range from store %d, range %d", r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeyFirstRangeDescriptor, *r.Desc(), configGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip first range metadata: %s", err)
	}
	return nil
}
// processWriteIntentError tries to push the conflicting
// transaction(s) responsible for the given WriteIntentError, and to
// resolve those intents if possible. Returns a new error to be used
// in place of the original.
//
// The returned error may be a copy of the original WriteIntentError,
// with or without the Resolved flag set, which governs the client's
// retry behavior (if the transaction is pushed, the Resolved flag is
// set to tell the client to retry immediately; otherwise it is false
// to cause the client to back off).
func (ir *intentResolver) processWriteIntentError(ctx context.Context, wiPErr *roachpb.Error, r *Replica, args roachpb.Request, h roachpb.Header, pushType roachpb.PushTxnType) *roachpb.Error {
	wiErr, ok := wiPErr.GetDetail().(*roachpb.WriteIntentError)
	if !ok {
		return roachpb.NewErrorf("not a WriteIntentError: %v", wiPErr)
	}

	if log.V(6) {
		log.Infoc(ctx, "resolving write intent %s", wiErr)
	}

	method := args.Method()
	readOnly := roachpb.IsReadOnly(args) // TODO(tschottdorf): pass as param

	resolveIntents, pushErr := ir.maybePushTransactions(ctx, wiErr.Intents, h, pushType, false)
	if resErr := ir.resolveIntents(ctx, r, resolveIntents, false /* !wait */, pushType == roachpb.PUSH_ABORT /* poison */); resErr != nil {
		// When resolving without waiting, errors should not
		// usually be returned here, although there are some cases
		// when they may be (especially when a test cluster is in
		// the process of shutting down).
		log.Warningf("asynchronous resolveIntents failed: %s", resErr)
	}

	if pushErr != nil {
		if log.V(1) {
			log.Infoc(ctx, "on %s: %s", method, pushErr)
		}

		// For write/write conflicts within a transaction, propagate the
		// push failure, not the original write intent error. The push
		// failure will instruct the client to restart the transaction
		// with a backoff.
		if h.Txn != nil && h.Txn.ID != nil && !readOnly {
			return pushErr
		}

		// For read/write conflicts, and non-transactional write/write
		// conflicts, return the write intent error which engages
		// backoff/retry (with !Resolved). We don't need to restart the
		// txn, only resend the read with a backoff.
		return wiPErr
	}

	// We pushed all transactions, so tell the client everything's
	// resolved and it can retry immediately.
	wiErr.Resolved = true
	return wiPErr // references wiErr
}
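// Illustrative sketch, not taken from the source above: the client-side
// retry policy that the Resolved flag drives. When Resolved is set, the
// conflicting transaction was pushed and the client may retry immediately;
// otherwise it should back off before resending. retryDelay and its
// parameters are hypothetical names chosen for this example.
package main

import (
	"fmt"
	"time"
)

// retryDelay returns how long a client should wait before retrying a
// request that hit a WriteIntentError.
func retryDelay(resolved bool, attempt int, base time.Duration) time.Duration {
	if resolved {
		// Intents were resolved on our behalf: retry right away.
		return 0
	}
	// Intents are still pending: exponential backoff before resending.
	return base * time.Duration(1<<uint(attempt))
}

func main() {
	fmt.Println(retryDelay(true, 2, 50*time.Millisecond))  // 0s
	fmt.Println(retryDelay(false, 2, 50*time.Millisecond)) // 200ms
}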
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, addr net.Addr, engines []engine.Engine, attrs roachpb.Attributes, stopper *stop.Stopper) error {
	n.initDescriptor(addr, attrs)
	const method = "Node.Batch"
	if err := rpcServer.Register(method, n.executeCmd, &roachpb.BatchRequest{}); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
func (r *Range) maybeGossipConfigsLocked(match func(configPrefix proto.Key) bool) {
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return
	}
	ctx := r.context()
	for i, cd := range configDescriptors {
		if match(cd.keyPrefix) {
			// Check for a bad range split. This should never happen as ranges
			// cannot be split mid-config.
			if !r.ContainsKey(cd.keyPrefix.PrefixEnd()) {
				// If we ever implement configs that span multiple ranges,
				// we must update store.startGossip accordingly. For the
				// time being, it will only fire the first range.
				log.Fatalc(ctx, "range splits configuration values for %s", cd.keyPrefix)
			}
			configMap, hash, err := loadConfigMap(r.rm.Engine(), cd.keyPrefix, cd.configI)
			if err != nil {
				log.Errorc(ctx, "failed loading %s config map: %s", cd.gossipKey, err)
				continue
			}
			if r.configHashes == nil {
				r.configHashes = map[int][]byte{}
			}
			if prevHash, ok := r.configHashes[i]; !ok || !bytes.Equal(prevHash, hash) {
				r.configHashes[i] = hash
				log.Infoc(ctx, "gossiping %s config from store %d, range %d", cd.gossipKey, r.rm.StoreID(), r.Desc().RaftID)
				if err := r.rm.Gossip().AddInfo(cd.gossipKey, configMap, 0*time.Second); err != nil {
					log.Errorc(ctx, "failed to gossip %s configMap: %s", cd.gossipKey, err)
					continue
				}
			}
		}
	}
}
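// Illustrative, self-contained sketch (not from the source above) of the
// "hash the loaded config and only re-gossip when it changed" pattern that
// maybeGossipConfigsLocked implements via r.configHashes. The gossiper
// interface, configCache type, and all names below are hypothetical stand-ins.
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
)

type gossiper interface {
	AddInfo(key string, value []byte) error
}

// configCache remembers the hash of the last value gossiped per key so that
// unchanged configs are not re-gossiped.
type configCache struct {
	hashes map[string][]byte
}

func (c *configCache) maybeGossip(g gossiper, key string, value []byte) error {
	if c.hashes == nil {
		c.hashes = map[string][]byte{}
	}
	sum := sha256.Sum256(value)
	if prev, ok := c.hashes[key]; ok && bytes.Equal(prev, sum[:]) {
		return nil // unchanged since the last gossip; skip
	}
	c.hashes[key] = sum[:]
	return g.AddInfo(key, value)
}

type printGossiper struct{}

func (printGossiper) AddInfo(key string, value []byte) error {
	fmt.Printf("gossiping %s (%d bytes)\n", key, len(value))
	return nil
}

func main() {
	var c configCache
	g := printGossiper{}
	_ = c.maybeGossip(g, "zones", []byte("cfg-v1")) // gossips
	_ = c.maybeGossip(g, "zones", []byte("cfg-v1")) // skipped, unchanged
	_ = c.maybeGossip(g, "zones", []byte("cfg-v2")) // gossips again
}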
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, engines []engine.Engine, attrs proto.Attributes, stopper *util.Stopper) error {
	n.initDescriptor(rpcServer.Addr(), attrs)
	if err := rpcServer.RegisterName("Node", (*nodeServer)(n)); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)
	stopper.AddCloser(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	// Pass NodeID to status monitor - this value is initialized in initStores,
	// but the StatusMonitor must be active before initStores.
	n.status.SetNodeID(n.Descriptor.NodeID)

	// Initialize publisher for Node Events.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)

	n.startedAt = n.ctx.Clock.Now().WallTime

	n.startStoresScanner(stopper)
	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
func (s *Store) insertRangeLogEvent(txn *client.Txn, event rangeLogEvent) error {
	// Record range log event to console log.
	var info string
	if event.info != nil {
		info = *event.info
	}
	log.Infoc(txn.Context, "Range Event: %q, range: %d, info: %s", event.eventType, event.rangeID, info)

	const insertEventTableStmt = `
INSERT INTO system.rangelog (
  timestamp, rangeID, storeID, eventType, otherRangeID, info
)
VALUES(
  $1, $2, $3, $4, $5, $6
)
`
	args := []interface{}{
		event.timestamp,
		event.rangeID,
		event.storeID,
		event.eventType,
		nil, // otherRangeID
		nil, // info
	}
	if event.otherRangeID != nil {
		args[4] = *event.otherRangeID
	}
	if event.info != nil {
		args[5] = *event.info
	}

	// Update range event metrics. We do this close to the insertion of the
	// corresponding range log entry to reduce potential skew between metrics and
	// range log.
	switch event.eventType {
	case RangeEventLogSplit:
		s.metrics.rangeSplits.Inc(1)
	case RangeEventLogAdd:
		s.metrics.rangeAdds.Inc(1)
	case RangeEventLogRemove:
		s.metrics.rangeRemoves.Inc(1)
	}

	rows, err := s.ctx.SQLExecutor.ExecuteStatementInTransaction(txn, insertEventTableStmt, args...)
	if err != nil {
		return err
	}
	if rows != 1 {
		return errors.Errorf("%d rows affected by log insertion; expected exactly one row affected.", rows)
	}
	return nil
}
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, engines []engine.Engine, attrs proto.Attributes, stopper *stop.Stopper) error {
	n.initDescriptor(rpcServer.Addr(), attrs)
	requests := []proto.Request{
		&proto.BatchRequest{},
		&proto.GetRequest{},
		&proto.PutRequest{},
		&proto.ConditionalPutRequest{},
		&proto.IncrementRequest{},
		&proto.DeleteRequest{},
		&proto.DeleteRangeRequest{},
		&proto.ScanRequest{},
		&proto.ReverseScanRequest{},
		&proto.EndTransactionRequest{},
		&proto.AdminSplitRequest{},
		&proto.AdminMergeRequest{},
		&proto.HeartbeatTxnRequest{},
		&proto.GCRequest{},
		&proto.PushTxnRequest{},
		&proto.RangeLookupRequest{},
		&proto.ResolveIntentRequest{},
		&proto.ResolveIntentRangeRequest{},
		&proto.MergeRequest{},
		&proto.TruncateLogRequest{},
		&proto.LeaderLeaseRequest{},
	}
	for _, r := range requests {
		if err := rpcServer.Register("Node."+r.Method().String(), n.executeCmd, r); err != nil {
			log.Fatalf("unable to register node service with RPC server: %s", err)
		}
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, addr net.Addr, engines []engine.Engine, attrs roachpb.Attributes) error {
	n.initDescriptor(addr, attrs)

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, n.stopper); err != nil {
		if err == errNeedsBootstrap {
			// This node has no initialized stores and no way to connect to
			// an existing cluster, so we bootstrap it.
			clusterID, err := bootstrapCluster(engines)
			if err != nil {
				return err
			}
			log.Infof("**** cluster %s has been created", clusterID)
			log.Infof("**** add additional nodes by specifying --join=%s", addr)
			// Make sure we add the node as a resolver.
			selfResolver, err := resolver.NewResolverFromAddress(addr)
			if err != nil {
				return err
			}
			n.ctx.Gossip.SetResolvers([]resolver.Resolver{selfResolver})
			// After bootstrapping, try again to initialize the stores.
			if err := n.initStores(engines, n.stopper); err != nil {
				return err
			}
		} else {
			return err
		}
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(n.stopper)
	n.startGossip(n.stopper)

	// Register the RPC methods we support last as doing so allows RPCs to be
	// received which may access state initialized above without locks.
	const method = "Node.Batch"
	if err := rpcServer.Register(method, n.executeCmd, &roachpb.BatchRequest{}); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(addr net.Addr, engines []engine.Engine, attrs roachpb.Attributes) error {
	n.initDescriptor(addr, attrs)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, n.stopper); err != nil {
		if err == errNeedsBootstrap {
			n.initialBoot = true
			// This node has no initialized stores and no way to connect to
			// an existing cluster, so we bootstrap it.
			clusterID, err := bootstrapCluster(engines, n.txnMetrics)
			if err != nil {
				return err
			}
			log.Infof("**** cluster %s has been created", clusterID)
			log.Infof("**** add additional nodes by specifying --join=%s", addr)
			// Make sure we add the node as a resolver.
			selfResolver, err := resolver.NewResolverFromAddress(addr)
			if err != nil {
				return err
			}
			n.ctx.Gossip.SetResolvers([]resolver.Resolver{selfResolver})
			// After bootstrapping, try again to initialize the stores.
			if err := n.initStores(engines, n.stopper); err != nil {
				return err
			}
		} else {
			return err
		}
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize the recorder with the NodeID, which is initialized by initStores().
	n.recorder.NodeStarted(n.Descriptor, n.startedAt)

	n.startComputePeriodicMetrics(n.stopper)
	n.startGossip(n.stopper)

	// Record node started event.
	n.recordJoinEvent()

	log.Infoc(n.context(context.TODO()), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
// InsertEventRecord inserts a single event into the event log as part of the
// provided transaction.
func (ev EventLogger) InsertEventRecord(txn *client.Txn, eventType EventLogType, targetID, reportingID int32, info interface{}) error {
	// Record event record insertion in local log output.
	log.Infoc(txn.Context, "Event: %q, target: %d, info: %+v", eventType, targetID, info)

	const insertEventTableStmt = `
INSERT INTO system.eventlog (
  timestamp, eventType, targetID, reportingID, info
)
VALUES(
  $1, $2, $3, $4, $5
)
`
	args := []interface{}{
		ev.selectEventTimestamp(txn.Proto.Timestamp),
		eventType,
		targetID,
		reportingID,
		nil, // info
	}
	if info != nil {
		infoBytes, err := json.Marshal(info)
		if err != nil {
			return err
		}
		args[4] = string(infoBytes)
	}

	rows, err := ev.ExecuteStatementInTransaction(txn, insertEventTableStmt, args...)
	if err != nil {
		return err
	}
	if rows != 1 {
		return errors.Errorf("%d rows affected by log insertion; expected exactly one row affected.", rows)
	}
	return nil
}
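// Illustrative, self-contained sketch (not from the source above) of the
// argument-building pattern InsertEventRecord uses: positional SQL args
// default to nil, which is bound as SQL NULL, and the optional info payload
// is JSON-encoded only when present. buildEventArgs and its parameters are
// hypothetical names for this example.
package main

import (
	"encoding/json"
	"fmt"
)

func buildEventArgs(timestampNanos int64, eventType string, targetID, reportingID int32, info interface{}) ([]interface{}, error) {
	args := []interface{}{timestampNanos, eventType, targetID, reportingID, nil /* info: NULL unless set below */}
	if info != nil {
		b, err := json.Marshal(info)
		if err != nil {
			return nil, err
		}
		args[4] = string(b) // stored as a JSON string in the info column
	}
	return args, nil
}

func main() {
	args, err := buildEventArgs(1449187200000000000, "node_join", 1, 1, map[string]interface{}{"nodeID": 1})
	if err != nil {
		panic(err)
	}
	fmt.Println(args)
}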
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)

	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true

		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)

		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)

		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()

		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
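// Illustrative, self-contained sketch (not from the source above) of the
// idempotency mechanism applyRaftCommand describes: the response to every
// write command is recorded under its client command ID, so a replayed Raft
// command returns the original cached response instead of re-executing.
// responseCache and cmdID are hypothetical stand-ins for the real types.
package main

import "fmt"

type cmdID string

type responseCache struct {
	responses map[cmdID]string
}

// apply executes fn for the first occurrence of id and caches its result;
// replays of the same id return the cached result without re-executing fn.
func (c *responseCache) apply(id cmdID, fn func() string) string {
	if c.responses == nil {
		c.responses = map[cmdID]string{}
	}
	if cached, ok := c.responses[id]; ok {
		return cached // replay: serve the original response
	}
	result := fn()
	c.responses[id] = result
	return result
}

func main() {
	var c responseCache
	calls := 0
	put := func() string { calls++; return "put ok" }

	fmt.Println(c.apply("cmd-1", put)) // executes: "put ok"
	fmt.Println(c.apply("cmd-1", put)) // replay:   "put ok", put is not called again
	fmt.Println(calls)                 // 1
}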
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// TODO(tamird): move this into the response cache itself
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)

	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// TODO(tamird): move this into the response cache itself
		if reply == nil {
			reply = args.CreateReply()
		}
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return batch, reply, nil
}
// resolveIntents resolves the given intents. For those which are local to the
// range, we submit directly to the range-local Raft instance; the call returns
// as soon as all resolve commands have been **proposed** (not executed). This
// ensures that if a waiting client retries immediately after conflict
// resolution, it will not hit the same intents again. All non-local intents
// are resolved asynchronously in a batch.
// TODO(tschottdorf): once Txn records have a list of possibly open intents,
// resolveIntents should send an RPC to update the transaction(s) as well (for
// those intents with non-pending Txns).
func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) {
	trace := tracer.FromCtx(ctx)
	tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces
	trace.Event("resolving intents [async]")

	var wg sync.WaitGroup

	bArgs := &proto.BatchRequest{}
	bArgs.User = security.RootUser
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs proto.Request
		var local bool // whether this intent lives on this Range
		{
			header := proto.RequestHeader{
				// Use the pushee's timestamp, which might be lower than the
				// pusher's request timestamp. No need to push the intent higher
				// than the pushee's txn!
				Timestamp: intent.Txn.Timestamp,
				Key:       intent.Key,
				EndKey:    intent.EndKey,
				User:      security.RootUser,
				Txn:       &intent.Txn,
			}

			if len(intent.EndKey) == 0 {
				resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if !local {
			bArgs.Add(resolveArgs)
			continue
		}

		// If it is local, it goes directly into Raft.
		// TODO(tschottdorf): this may be premature optimization. Consider just
		// treating everything as an external request. This means having to
		// wait for complete execution of the command (whereas now we just wait
		// for proposition) and some more overhead sending things around.
		wg.Add(1)
		action := func() {
			// Trace this under the ID of the intent owner.
			ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn))
			if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) {
				log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err)
			}
		}
		if !r.rm.Stopper().RunAsyncTask(action) {
			// Still run the task. Our caller already has a task and going async
			// here again is merely for performance, but some intents need to
			// be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes
			// back.
			action()
		}
	}

	// Resolve all of the intents which aren't local to the Range. This is a
	// no-op if all are local.
	b := &client.Batch{}
	b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}})
	action := func() {
		// TODO(tschottdorf): no tracing here yet. Probably useful at some point,
		// but needs a) the corresponding interface and b) facilities for tracing
		// multiple tracees at the same time (batch full of possibly individual
		// txns).
		if err := r.rm.DB().Run(b); err != nil {
			if log.V(1) {
				log.Infoc(ctx, "%s", err)
			}
		}
	}
	if !r.rm.Stopper().RunAsyncTask(action) {
		// As with local intents, try async to not keep the caller waiting, but
		// when draining just go ahead and do it synchronously. See #1684.
		action()
	}

	// Wait until all the local `ResolveIntent`s have been submitted to raft.
	// No-op if all were external.
	wg.Wait()
}
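// Illustrative, self-contained sketch (not from the source above) of the two
// patterns resolveIntents relies on: try to run work asynchronously via the
// stopper but fall back to running it inline when the stopper refuses (e.g.
// while draining), and use a WaitGroup so the caller waits only for the local
// work it proposed. taskRunner, drainingRunner, and runAsyncOrInline are
// hypothetical names; the real code uses util/stop.Stopper.RunAsyncTask.
package main

import (
	"fmt"
	"sync"
)

// taskRunner mimics the relevant part of a stopper: RunAsyncTask reports
// whether it accepted the task for asynchronous execution.
type taskRunner interface {
	RunAsyncTask(f func()) bool
}

type drainingRunner struct{ draining bool }

func (r drainingRunner) RunAsyncTask(f func()) bool {
	if r.draining {
		return false // refuse new async work while draining
	}
	go f()
	return true
}

// runAsyncOrInline still performs the work when the runner refuses it, so
// that intents blocking other tasks are resolved even during shutdown.
func runAsyncOrInline(r taskRunner, f func()) {
	if !r.RunAsyncTask(f) {
		f()
	}
}

func main() {
	var wg sync.WaitGroup
	r := drainingRunner{draining: true}

	for i := 0; i < 3; i++ {
		i := i
		wg.Add(1)
		runAsyncOrInline(r, func() {
			defer wg.Done()
			fmt.Println("resolved local intent", i)
		})
	}
	wg.Wait() // wait for the local work, as resolveIntents does
}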