Ejemplo n.º 1
0
// maybeGossipFirstRange adds the sentinel and first range metadata to gossip
// if this is the first range and a leader lease can be obtained. The Store
// calls this periodically on first range replicas.
func (r *Range) maybeGossipFirstRange() error {
	if !r.IsFirstRange() {
		return nil
	}

	ctx := r.context()

	// Gossip the cluster ID from all replicas of the first range.
	log.Infoc(ctx, "gossiping cluster id %s from store %d, range %d", r.rm.ClusterID(),
		r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeyClusterID, r.rm.ClusterID(), clusterIDGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip cluster ID: %s", err)
	}

	if ok, err := r.getLeaseForGossip(ctx); !ok || err != nil {
		return err
	}
	log.Infoc(ctx, "gossiping sentinel from store %d, range %d", r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeySentinel, r.rm.ClusterID(), clusterIDGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip cluster ID: %s", err)
	}
	log.Infoc(ctx, "gossiping first range from store %d, range %d", r.rm.StoreID(), r.Desc().RaftID)
	if err := r.rm.Gossip().AddInfo(gossip.KeyFirstRangeDescriptor, *r.Desc(), configGossipTTL); err != nil {
		log.Errorc(ctx, "failed to gossip first range metadata: %s", err)
	}
	return nil
}
Ejemplo n.º 2
0
// processWriteIntentError tries to push the conflicting
// transaction(s) responsible for the given WriteIntentError, and to
// resolve those intents if possible. Returns a new error to be used
// in place of the original.
//
// The returned error may be a copy of the original WriteIntentError,
// with or without the Resolved flag set, which governs the client's
// retry behavior (if the transaction is pushed, the Resolved flag is
// set to tell the client to retry immediately; otherwise it is false
// to cause the client to back off).
func (ir *intentResolver) processWriteIntentError(ctx context.Context,
	wiPErr *roachpb.Error, r *Replica, args roachpb.Request, h roachpb.Header,
	pushType roachpb.PushTxnType) *roachpb.Error {
	wiErr, ok := wiPErr.GetDetail().(*roachpb.WriteIntentError)
	if !ok {
		return roachpb.NewErrorf("not a WriteIntentError: %v", wiPErr)
	}

	if log.V(6) {
		log.Infoc(ctx, "resolving write intent %s", wiErr)
	}

	method := args.Method()
	readOnly := roachpb.IsReadOnly(args) // TODO(tschottdorf): pass as param

	resolveIntents, pushErr := ir.maybePushTransactions(ctx, wiErr.Intents, h, pushType, false)

	if resErr := ir.resolveIntents(ctx, r, resolveIntents,
		false /* !wait */, pushType == roachpb.PUSH_ABORT /* poison */); resErr != nil {
		// When resolving without waiting, errors should not
		// usually be returned here, although there are some cases
		// when they may be (especially when a test cluster is in
		// the process of shutting down).
		log.Warningf("asynchronous resolveIntents failed: %s", resErr)
	}

	if pushErr != nil {
		if log.V(1) {
			log.Infoc(ctx, "on %s: %s", method, pushErr)
		}

		// For write/write conflicts within a transaction, propagate the
		// push failure, not the original write intent error. The push
		// failure will instruct the client to restart the transaction
		// with a backoff.
		if h.Txn != nil && h.Txn.ID != nil && !readOnly {
			return pushErr
		}

		// For read/write conflicts, and non-transactional write/write
		// conflicts, return the write intent error which engages
		// backoff/retry (with !Resolved). We don't need to restart the
		// txn, only resend the read with a backoff.
		return wiPErr
	}

	// We pushed all transactions, so tell the client everything's
	// resolved and it can retry immediately.
	wiErr.Resolved = true
	return wiPErr // references wiErr
}
Ejemplo n.º 3
0
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, addr net.Addr, engines []engine.Engine,
	attrs roachpb.Attributes, stopper *stop.Stopper) error {
	n.initDescriptor(addr, attrs)
	const method = "Node.Batch"
	if err := rpcServer.Register(method, n.executeCmd, &roachpb.BatchRequest{}); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
Ejemplo n.º 4
0
func (r *Range) maybeGossipConfigsLocked(match func(configPrefix proto.Key) bool) {
	if r.rm.Gossip() == nil || !r.isInitialized() {
		return
	}
	ctx := r.context()
	for i, cd := range configDescriptors {
		if match(cd.keyPrefix) {
			// Check for a bad range split. This should never happen as ranges
			// cannot be split mid-config.
			if !r.ContainsKey(cd.keyPrefix.PrefixEnd()) {
				// If we ever implement configs that span multiple ranges,
				// we must update store.startGossip accordingly. For the
				// time being, it will only fire the first range.
				log.Fatalc(ctx, "range splits configuration values for %s", cd.keyPrefix)
			}
			configMap, hash, err := loadConfigMap(r.rm.Engine(), cd.keyPrefix, cd.configI)
			if err != nil {
				log.Errorc(ctx, "failed loading %s config map: %s", cd.gossipKey, err)
				continue
			}
			if r.configHashes == nil {
				r.configHashes = map[int][]byte{}
			}
			if prevHash, ok := r.configHashes[i]; !ok || !bytes.Equal(prevHash, hash) {
				r.configHashes[i] = hash
				log.Infoc(ctx, "gossiping %s config from store %d, range %d", cd.gossipKey, r.rm.StoreID(), r.Desc().RaftID)
				if err := r.rm.Gossip().AddInfo(cd.gossipKey, configMap, 0*time.Second); err != nil {
					log.Errorc(ctx, "failed to gossip %s configMap: %s", cd.gossipKey, err)
					continue
				}
			}
		}
	}
}
Ejemplo n.º 5
0
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, engines []engine.Engine,
	attrs proto.Attributes, stopper *util.Stopper) error {
	n.initDescriptor(rpcServer.Addr(), attrs)
	if err := rpcServer.RegisterName("Node", (*nodeServer)(n)); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)
	stopper.AddCloser(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	// Pass NodeID to status monitor - this value is initialized in initStores,
	// but the StatusMonitor must be active before initStores.
	n.status.SetNodeID(n.Descriptor.NodeID)

	// Initialize publisher for Node Events.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)

	n.startedAt = n.ctx.Clock.Now().WallTime
	n.startStoresScanner(stopper)
	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
Ejemplo n.º 6
0
func (s *Store) insertRangeLogEvent(txn *client.Txn, event rangeLogEvent) error {
	// Record range log event to console log.
	var info string
	if event.info != nil {
		info = *event.info
	}
	log.Infoc(txn.Context, "Range Event: %q, range: %d, info: %s",
		event.eventType,
		event.rangeID,
		info)

	const insertEventTableStmt = `
INSERT INTO system.rangelog (
  timestamp, rangeID, storeID, eventType, otherRangeID, info
)
VALUES(
  $1, $2, $3, $4, $5, $6
)
`
	args := []interface{}{
		event.timestamp,
		event.rangeID,
		event.storeID,
		event.eventType,
		nil, // otherRangeID
		nil, // info
	}
	if event.otherRangeID != nil {
		args[4] = *event.otherRangeID
	}
	if event.info != nil {
		args[5] = *event.info
	}

	// Update range event metrics. We do this close to the insertion of the
	// corresponding range log entry to reduce potential skew between metrics and
	// range log.
	switch event.eventType {
	case RangeEventLogSplit:
		s.metrics.rangeSplits.Inc(1)
	case RangeEventLogAdd:
		s.metrics.rangeAdds.Inc(1)
	case RangeEventLogRemove:
		s.metrics.rangeRemoves.Inc(1)
	}

	rows, err := s.ctx.SQLExecutor.ExecuteStatementInTransaction(txn, insertEventTableStmt, args...)
	if err != nil {
		return err
	}
	if rows != 1 {
		return errors.Errorf("%d rows affected by log insertion; expected exactly one row affected.", rows)
	}
	return nil
}
Ejemplo n.º 7
0
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, engines []engine.Engine,
	attrs proto.Attributes, stopper *stop.Stopper) error {
	n.initDescriptor(rpcServer.Addr(), attrs)
	requests := []proto.Request{
		&proto.BatchRequest{},
		&proto.GetRequest{},
		&proto.PutRequest{},
		&proto.ConditionalPutRequest{},
		&proto.IncrementRequest{},
		&proto.DeleteRequest{},
		&proto.DeleteRangeRequest{},
		&proto.ScanRequest{},
		&proto.ReverseScanRequest{},
		&proto.EndTransactionRequest{},
		&proto.AdminSplitRequest{},
		&proto.AdminMergeRequest{},
		&proto.HeartbeatTxnRequest{},
		&proto.GCRequest{},
		&proto.PushTxnRequest{},
		&proto.RangeLookupRequest{},
		&proto.ResolveIntentRequest{},
		&proto.ResolveIntentRangeRequest{},
		&proto.MergeRequest{},
		&proto.TruncateLogRequest{},
		&proto.LeaderLeaseRequest{},
	}
	for _, r := range requests {
		if err := rpcServer.Register("Node."+r.Method().String(), n.executeCmd, r); err != nil {
			log.Fatalf("unable to register node service with RPC server: %s", err)
		}
	}

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, stopper); err != nil {
		return err
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(stopper)
	n.startGossip(stopper)
	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
Ejemplo n.º 8
0
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(rpcServer *rpc.Server, addr net.Addr, engines []engine.Engine, attrs roachpb.Attributes) error {
	n.initDescriptor(addr, attrs)

	// Start status monitor.
	n.status.StartMonitorFeed(n.ctx.EventFeed)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, n.stopper); err != nil {
		if err == errNeedsBootstrap {
			// This node has no initialized stores and no way to connect to
			// an existing cluster, so we bootstrap it.
			clusterID, err := bootstrapCluster(engines)
			if err != nil {
				return err
			}
			log.Infof("**** cluster %s has been created", clusterID)
			log.Infof("**** add additional nodes by specifying --join=%s", addr)
			// Make sure we add the node as a resolver.
			selfResolver, err := resolver.NewResolverFromAddress(addr)
			if err != nil {
				return err
			}
			n.ctx.Gossip.SetResolvers([]resolver.Resolver{selfResolver})
			// After bootstrapping, try again to initialize the stores.
			if err := n.initStores(engines, n.stopper); err != nil {
				return err
			}
		} else {
			return err
		}
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize publisher for Node Events. This requires the NodeID, which is
	// initialized by initStores(); because of this, some Store initialization
	// events will precede the StartNodeEvent on the feed.
	n.feed = status.NewNodeEventFeed(n.Descriptor.NodeID, n.ctx.EventFeed)
	n.feed.StartNode(n.Descriptor, n.startedAt)

	n.startPublishStatuses(n.stopper)
	n.startGossip(n.stopper)

	// Register the RPC methods we support last as doing so allows RPCs to be
	// received which may access state initialized above without locks.
	const method = "Node.Batch"
	if err := rpcServer.Register(method, n.executeCmd, &roachpb.BatchRequest{}); err != nil {
		log.Fatalf("unable to register node service with RPC server: %s", err)
	}

	log.Infoc(n.context(), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
Ejemplo n.º 9
0
// start starts the node by registering the storage instance for the
// RPC service "Node" and initializing stores for each specified
// engine. Launches periodic store gossiping in a goroutine.
func (n *Node) start(addr net.Addr, engines []engine.Engine, attrs roachpb.Attributes) error {
	n.initDescriptor(addr, attrs)

	// Initialize stores, including bootstrapping new ones.
	if err := n.initStores(engines, n.stopper); err != nil {
		if err == errNeedsBootstrap {
			n.initialBoot = true
			// This node has no initialized stores and no way to connect to
			// an existing cluster, so we bootstrap it.
			clusterID, err := bootstrapCluster(engines, n.txnMetrics)
			if err != nil {
				return err
			}
			log.Infof("**** cluster %s has been created", clusterID)
			log.Infof("**** add additional nodes by specifying --join=%s", addr)
			// Make sure we add the node as a resolver.
			selfResolver, err := resolver.NewResolverFromAddress(addr)
			if err != nil {
				return err
			}
			n.ctx.Gossip.SetResolvers([]resolver.Resolver{selfResolver})
			// After bootstrapping, try again to initialize the stores.
			if err := n.initStores(engines, n.stopper); err != nil {
				return err
			}
		} else {
			return err
		}
	}

	n.startedAt = n.ctx.Clock.Now().WallTime

	// Initialize the recorder with the NodeID, which is initialized by initStores().
	n.recorder.NodeStarted(n.Descriptor, n.startedAt)

	n.startComputePeriodicMetrics(n.stopper)
	n.startGossip(n.stopper)

	// Record node started event.
	n.recordJoinEvent()

	log.Infoc(n.context(context.TODO()), "Started node with %v engine(s) and attributes %v", engines, attrs.Attrs)
	return nil
}
Ejemplo n.º 10
0
// InsertEventRecord inserts a single event into the event log as part of the
// provided transaction.
func (ev EventLogger) InsertEventRecord(txn *client.Txn, eventType EventLogType, targetID, reportingID int32, info interface{}) error {
	// Record event record insertion in local log output.
	log.Infoc(txn.Context, "Event: %q, target: %d, info: %+v",
		eventType,
		targetID,
		info)

	const insertEventTableStmt = `
INSERT INTO system.eventlog (
  timestamp, eventType, targetID, reportingID, info
)
VALUES(
  $1, $2, $3, $4, $5
)
`
	args := []interface{}{
		ev.selectEventTimestamp(txn.Proto.Timestamp),
		eventType,
		targetID,
		reportingID,
		nil, // info
	}
	if info != nil {
		infoBytes, err := json.Marshal(info)
		if err != nil {
			return err
		}
		args[4] = string(infoBytes)
	}

	rows, err := ev.ExecuteStatementInTransaction(txn, insertEventTableStmt, args...)
	if err != nil {
		return err
	}
	if rows != 1 {
		return errors.Errorf("%d rows affected by log insertion; expected exactly one row affected.", rows)
	}
	return nil
}
Ejemplo n.º 11
0
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no more be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create a engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}
	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
Ejemplo n.º 12
0
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID,
	args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, nil, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if reply != nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// TODO(tamird): move this into the response cache itself
			defer func() { reply.Header().Error = nil }()
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply, reply.Header().GoError()
		}
	}

	// Execute the command.
	reply, intents, rErr := r.executeCmd(batch, ms, args)
	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// TODO(tamird): move this into the response cache itself
		if reply == nil {
			reply = args.CreateReply()
		}
		if reply.Header().Error != nil {
			panic("the world is on fire")
		}
		reply.Header().SetGoError(rErr)
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
		reply.Header().Error = nil
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, reply, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return batch, reply, nil
}
Ejemplo n.º 13
0
// resolveIntents resolves the given intents. For those which are local to the
// range, we submit directly to the range-local Raft instance; the call returns
// as soon as all resolve commands have been **proposed** (not executed). This
// ensures that if a waiting client retries immediately after conflict
// resolution, it will not hit the same intents again. All non-local intents
// are resolved asynchronously in a batch.
// TODO(tschottdorf): once Txn records have a list of possibly open intents,
// resolveIntents should send an RPC to update the transaction(s) as well (for
// those intents with non-pending Txns).
func (r *Replica) resolveIntents(ctx context.Context, intents []proto.Intent) {
	trace := tracer.FromCtx(ctx)
	tracer.ToCtx(ctx, nil) // we're doing async stuff below; those need new traces
	trace.Event("resolving intents [async]")
	var wg sync.WaitGroup

	bArgs := &proto.BatchRequest{}
	bArgs.User = security.RootUser
	for i := range intents {
		intent := intents[i] // avoids a race in `i, intent := range ...`
		var resolveArgs proto.Request
		var local bool // whether this intent lives on this Range
		{
			header := proto.RequestHeader{
				// Use the pushee's timestamp, which might be lower than the
				// pusher's request timestamp. No need to push the intent higher
				// than the pushee's txn!
				Timestamp: intent.Txn.Timestamp,
				Key:       intent.Key,
				EndKey:    intent.EndKey,
				User:      security.RootUser,
				Txn:       &intent.Txn,
			}

			if len(intent.EndKey) == 0 {
				resolveArgs = &proto.ResolveIntentRequest{RequestHeader: header}
				local = r.ContainsKey(intent.Key)
			} else {
				resolveArgs = &proto.ResolveIntentRangeRequest{RequestHeader: header}
				local = r.ContainsKeyRange(intent.Key, intent.EndKey)
			}
		}

		// If the intent isn't (completely) local, we'll need to send an external request.
		// We'll batch them all up and send at the end.
		if !local {
			bArgs.Add(resolveArgs)
			continue
		}

		// If it is local, it goes directly into Raft.
		// TODO(tschottdorf): this may be premature optimization. Consider just
		// treating everything as an external request. This means having to
		// wait for complete execution of the command (whereas now we just wait
		// for proposition) and some more overhead sending things around.
		wg.Add(1)
		action := func() {
			// Trace this under the ID of the intent owner.
			ctx := tracer.ToCtx(ctx, r.rm.Tracer().NewTrace(resolveArgs.Header().Txn))
			if _, err := r.addWriteCmd(ctx, resolveArgs, &wg); err != nil && log.V(1) {
				log.Warningc(ctx, "resolve for key %s failed: %s", intent.Key, err)
			}
		}
		if !r.rm.Stopper().RunAsyncTask(action) {
			// Still run the task. Our caller already has a task and going async
			// here again is merely for performance, but some intents need to
			// be resolved because they might block other tasks. See #1684.
			// Note that handleSkippedIntents has a TODO in case #1684 comes
			// back.
			action()
		}
	}
	// Resolve all of the intents which aren't local to the Range. This is a
	// no-op if all are local.
	b := &client.Batch{}
	b.InternalAddCall(proto.Call{Args: bArgs, Reply: &proto.BatchResponse{}})
	action := func() {
		// TODO(tschottdorf): no tracing here yet. Probably useful at some point,
		// but needs a) the corresponding interface and b) facilities for tracing
		// multiple tracees at the same time (batch full of possibly individual
		// txns).
		if err := r.rm.DB().Run(b); err != nil {
			if log.V(1) {
				log.Infoc(ctx, "%s", err)
			}
		}
	}
	if !r.rm.Stopper().RunAsyncTask(action) {
		// As with local intents, try async to not keep the caller waiting, but
		// when draining just go ahead and do it synchronously. See #1684.
		action()
	}

	// Wait until all the local `ResolveIntent`s have been submitted to raft.
	// No-op if all were external.
	wg.Wait()
}