Example #1
0
// ReadWriteCmd runs a mutating command against the range. The response
// cache is consulted first: a hit means the command is a replay, so the
// cached outcome is returned right away and nothing is submitted to
// Raft. Next the timestamp cache is queried for the command's key span;
// if it holds a timestamp later than the command's, the command's
// timestamp is moved past it. The (possibly updated) timestamp is then
// recorded in the timestamp cache, the key span is registered as a
// pending write in the read queue, and the command is passed to
// EnqueueCmd. Once EnqueueCmd returns, the pending write is removed from
// the read queue and EnqueueCmd's error is returned.
func (r *Range) ReadWriteCmd(method string, args proto.Request, reply proto.Response) error {
	header := args.Header()

	// Look for a cached response in case this is a replay. This call
	// may block if the same command is already underway.
	cached, cacheErr := r.respCache.GetResponse(header.CmdID, reply)
	if cached {
		// Replay: surface whatever error the cached reply carries.
		return reply.Header().GoError()
	}
	if cacheErr != nil {
		// The cache could not be read. Rather than failing the request
		// because a cached reply cannot be decoded, carry on as though
		// idempotence has expired.
		log.Errorf("unable to read result for %+v from the response cache: %v", args, cacheErr)
	}

	// A mutating command must not write a key at a timestamp earlier
	// than the most recent access of that key. The timestamp cache
	// tells us about more recent reads/writes; when there is one, the
	// write's timestamp is bumped before it is enqueued, and the bumped
	// value informs the final commit timestamp.
	r.Lock() // Guards the timestamp cache and the read queue.
	if maxTS := r.tsCache.GetMax(header.Key, header.EndKey); header.Timestamp.Less(maxTS) {
		if glog.V(1) {
			glog.Infof("Overriding existing timestamp %s with %s", header.Timestamp, maxTS)
		}
		// Step the logical component so the write is strictly later
		// than the cached access, then adopt it as the request timestamp.
		maxTS.Logical++
		header.Timestamp = maxTS
	}
	// Record this write's timestamp too, so that successive writes to
	// the same key or key range get strictly higher timestamps.
	r.tsCache.Add(header.Key, header.EndKey, header.Timestamp)

	// Announce the pending write so that overlapping reads wait for it.
	wKey := r.readQ.AddWrite(header.Key, header.EndKey)
	r.Unlock()

	// Build the command and hand it over; EnqueueCmd waits for the
	// command to complete.
	err := r.EnqueueCmd(&Cmd{
		Method: method,
		Args:   args,
		Reply:  reply,
		done:   make(chan error, 1),
	})

	// The command is finished; drop the pending write.
	r.Lock()
	r.readQ.RemoveWrite(wKey)
	r.Unlock()

	return err
}
Example #2
0
// executeCmd dispatches on the method name to the matching storage API
// call on the range, asserting args and reply to the concrete request
// and response types of that method. An unknown method yields an error
// immediately. After the call, the request timestamp is copied into the
// reply; for methods that are not read-only the reply is also stored in
// the response cache. The error set in the reply header (if any) is
// returned.
func (r *Range) executeCmd(method string, args proto.Request, reply proto.Response) error {
	switch method {
	case Contains:
		r.Contains(args.(*proto.ContainsRequest), reply.(*proto.ContainsResponse))
	case Get:
		r.Get(args.(*proto.GetRequest), reply.(*proto.GetResponse))
	case Put:
		r.Put(args.(*proto.PutRequest), reply.(*proto.PutResponse))
	case ConditionalPut:
		r.ConditionalPut(args.(*proto.ConditionalPutRequest), reply.(*proto.ConditionalPutResponse))
	case Increment:
		r.Increment(args.(*proto.IncrementRequest), reply.(*proto.IncrementResponse))
	case Delete:
		r.Delete(args.(*proto.DeleteRequest), reply.(*proto.DeleteResponse))
	case DeleteRange:
		r.DeleteRange(args.(*proto.DeleteRangeRequest), reply.(*proto.DeleteRangeResponse))
	case Scan:
		r.Scan(args.(*proto.ScanRequest), reply.(*proto.ScanResponse))
	case EndTransaction:
		r.EndTransaction(args.(*proto.EndTransactionRequest), reply.(*proto.EndTransactionResponse))
	case AccumulateTS:
		r.AccumulateTS(args.(*proto.AccumulateTSRequest), reply.(*proto.AccumulateTSResponse))
	case ReapQueue:
		r.ReapQueue(args.(*proto.ReapQueueRequest), reply.(*proto.ReapQueueResponse))
	case EnqueueUpdate:
		r.EnqueueUpdate(args.(*proto.EnqueueUpdateRequest), reply.(*proto.EnqueueUpdateResponse))
	case EnqueueMessage:
		r.EnqueueMessage(args.(*proto.EnqueueMessageRequest), reply.(*proto.EnqueueMessageResponse))
	case InternalRangeLookup:
		r.InternalRangeLookup(args.(*proto.InternalRangeLookupRequest), reply.(*proto.InternalRangeLookupResponse))
	case InternalHeartbeatTxn:
		r.InternalHeartbeatTxn(args.(*proto.InternalHeartbeatTxnRequest), reply.(*proto.InternalHeartbeatTxnResponse))
	case InternalPushTxn:
		r.InternalPushTxn(args.(*proto.InternalPushTxnRequest), reply.(*proto.InternalPushTxnResponse))
	case InternalResolveIntent:
		r.InternalResolveIntent(args.(*proto.InternalResolveIntentRequest), reply.(*proto.InternalResolveIntentResponse))
	case InternalSnapshotCopy:
		r.InternalSnapshotCopy(args.(*proto.InternalSnapshotCopyRequest), reply.(*proto.InternalSnapshotCopyResponse))
	default:
		return util.Errorf("unrecognized command type: %s", method)
	}

	// The request timestamp may have changed along the way; mirror it
	// into the reply.
	reply.Header().Timestamp = args.Header().Timestamp

	// Read-only methods are never cached, so they are done here.
	if IsReadOnly(method) {
		return reply.Header().GoError()
	}

	// Read/write methods store their result in the response cache. This
	// happens as part of executing the raft command so that every replica
	// keeps the same responses and request idempotence survives a change
	// of leadership. A failed cache write is logged, not returned.
	if putErr := r.respCache.PutResponse(args.Header().CmdID, reply); putErr != nil {
		log.Errorf("unable to write result of %+v: %+v to the response cache: %v",
			args, reply, putErr)
	}

	// Hand back the error (if any) carried by the reply.
	return reply.Header().GoError()
}
Example #3
0
// TestMultiRangeScanWithMaxResults tests that commands which access multiple
// ranges with MaxResults parameter are carried out properly.
//
// Every test case gets its own test server. The case body runs inside a
// closure whose first action after starting the server is to defer its
// shutdown, so the server is stopped at the end of the case and also when
// the case aborts through t.Fatal.
func TestMultiRangeScanWithMaxResults(t *testing.T) {
	defer leaktest.AfterTest(t)
	testCases := []struct {
		splitKeys []proto.Key // keys at which the keyspace is split into ranges
		keys      []proto.Key // keys written before scanning, in ascending order
	}{
		{[]proto.Key{proto.Key("m")},
			[]proto.Key{proto.Key("a"), proto.Key("z")}},
		{[]proto.Key{proto.Key("h"), proto.Key("q")},
			[]proto.Key{proto.Key("b"), proto.Key("f"), proto.Key("k"),
				proto.Key("r"), proto.Key("w"), proto.Key("y")}},
	}

	for i, tc := range testCases {
		func() {
			s := StartTestServer(t)
			// Register the shutdown before anything can call t.Fatal;
			// a defer placed at the end of the body would be skipped on
			// early exit and would leave the server running.
			defer s.Stop()

			ds := kv.NewDistSender(&kv.DistSenderContext{Clock: s.Clock()}, s.Gossip())
			tds := kv.NewTxnCoordSender(ds, s.Clock(), testContext.Linearizable, nil, s.stopper)

			for _, sk := range tc.splitKeys {
				if err := s.node.ctx.DB.AdminSplit(sk); err != nil {
					t.Fatal(err)
				}
			}

			// putReply keeps the reply of the most recent Put; its
			// timestamp is reused for every scan below.
			var putReply proto.Response
			for _, k := range tc.keys {
				put := proto.NewPut(k, proto.Value{Bytes: k})
				var err error
				putReply, err = batchutil.SendWrapped(tds, put)
				if err != nil {
					t.Fatal(err)
				}
			}

			// Try every possible ScanRequest startKey.
			for start := 0; start < len(tc.keys); start++ {
				// Try every possible maxResults, from 1 to beyond the size of key array.
				for maxResults := 1; maxResults <= len(tc.keys)-start+1; maxResults++ {
					scan := proto.NewScan(tc.keys[start], tc.keys[len(tc.keys)-1].Next(),
						int64(maxResults))
					scan.Header().Timestamp = putReply.Header().Timestamp
					scanReply, err := batchutil.SendWrapped(tds, scan)
					if err != nil {
						t.Fatal(err)
					}
					rows := scanReply.(*proto.ScanResponse).Rows
					if start+maxResults <= len(tc.keys) && len(rows) != maxResults {
						// Enough keys remain: the scan must be cut off at maxResults.
						t.Errorf("%d: start=%s: expected %d rows, but got %d", i, tc.keys[start], maxResults, len(rows))
					} else if start+maxResults == len(tc.keys)+1 && len(rows) != maxResults-1 {
						// maxResults exceeds the remaining keys by one: all of them come back.
						t.Errorf("%d: expected %d rows, but got %d", i, maxResults-1, len(rows))
					}
				}
			}
		}()
	}
}
Example #4
0
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine). The command is executed in
// a batch by applyRaftCommandInBatch; the applied index is then written
// into that batch and the batch is committed. After a successful write
// command the event feed is updated, the range is considered for the
// split queue, and config gossip may be triggered.
//
// When certain critical operations fail (an index that does not advance,
// a batch that cannot be committed), a replicaCorruptionError is
// returned and must be handled by the caller.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error {
	// index is unsigned, so zero is the only value that is "<= 0".
	if index == 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// An index that fails to advance past the last applied one indicates
	// corruption. Nothing is updated or executed in that case.
	if applied := atomic.LoadUint64(&r.appliedIndex); applied >= index {
		return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", applied, index))
	}

	// The helper hands back the batch holding everything written while
	// executing the command, along with the execution error.
	var ms engine.MVCCStats
	batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms)
	// ALWAYS store the helper's error in the reply header; it is the
	// definitive result of the execution.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	reply.Header().SetGoError(rErr)
	defer batch.Close()

	// Record the new applied index in the batch, then commit.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err)
	}
	commitErr := batch.Commit()
	if commitErr == nil {
		// The index reached disk, so the cached copy may follow.
		atomic.StoreUint64(&r.appliedIndex, index)
	} else {
		rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), commitErr, rErr)
	}

	// The remaining triggers only apply to successful write commands.
	if rErr != nil || !proto.IsWrite(args) {
		return rErr
	}

	// Publish update to event feed.
	r.rm.EventFeed().updateRange(r, args.Method(), &ms)
	// The commit succeeded; the range may now qualify for a split.
	r.maybeAddToSplitQueue()
	// Puts and deletes below keys.SystemMax may touch a config, in which
	// case the configs are re-gossiped.
	switch args.(type) {
	case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
		if key := args.Header().Key; key.Less(keys.SystemMax) {
			// The original author notes the lock is already held here,
			// hence the *Locked variant.
			r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
				return bytes.HasPrefix(key, configPrefix)
			})
		}
	}

	return rErr
}
Example #5
0
// CallComplete is called by a node whenever it completes a request. It
// publishes exactly one event to the feed: a CallErrorEvent when the
// reply carries an error whose CanRestartTransaction result is
// TransactionRestart_ABORT, and a CallSuccessEvent in every other case
// (including replies with other kinds of errors).
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	aborted := false
	if err := reply.Header().Error; err != nil {
		aborted = err.CanRestartTransaction() == proto.TransactionRestart_ABORT
	}

	if !aborted {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: args.Method(),
		})
		return
	}
	nef.f.Publish(&CallErrorEvent{
		NodeID: nef.id,
		Method: args.Method(),
	})
}
Example #6
0
// shouldCacheResponse reports whether the response belongs in the
// response cache. Replies whose error is a write-too-old, write-intent
// or not-leader error are retried on the server, so they are kept out
// of the cache in the hope that a retry reaches a successful outcome.
// Every other reply, with or without an error, is cached.
func (rc *ResponseCache) shouldCacheResponse(reply proto.Response) bool {
	err := reply.Header().GoError()
	if _, ok := err.(*proto.WriteTooOldError); ok {
		return false
	}
	if _, ok := err.(*proto.WriteIntentError); ok {
		return false
	}
	if _, ok := err.(*proto.NotLeaderError); ok {
		return false
	}
	return true
}
Example #7
0
// executeCmd wraps args and reply in a proto.Call and sends it through
// the node's local sender. Before sending it makes sure the request has
// a command ID and opens a trace for it; afterwards it reports the call
// to the node's event feed and notes any reply error in the trace. The
// outcome travels in reply; the returned error is always nil.
func (n *nodeServer) executeCmd(args proto.Request, reply proto.Response) error {
	// TODO(tschottdorf) get a hold of the client's ID, add it to the
	// context before dispatching, and create an ID for tracing the request.
	hdr := args.Header()
	hdr.CmdID = hdr.GetOrCreateCmdID(n.ctx.Clock.PhysicalNow())

	trace := n.ctx.Tracer.NewTrace(hdr)
	defer trace.Finalize()
	// Open the "node" epoch now; it is closed when this function returns,
	// just before the trace is finalized.
	endEpoch := trace.Epoch("node")
	defer endEpoch()

	ctx := tracer.ToCtx((*Node)(n).context(), trace)
	call := proto.Call{Args: args, Reply: reply}
	n.lSender.Send(ctx, call)
	n.feed.CallComplete(args, reply)

	if err := reply.Header().GoError(); err != nil {
		trace.Event(fmt.Sprintf("error: %T", err))
	}
	return nil
}
Example #8
0
// addAdminCmd runs an admin command directly, without involving the
// command queue or the timestamp cache, as admin commands are not meant
// to consistently access or modify the underlying data. The replica
// must hold (or be able to acquire) the leader lease; otherwise the
// lease error is stored in the reply and returned. Split and merge
// requests are supported; any other request type yields an error.
func (r *Range) addAdminCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	// Admin commands always require the leader lease.
	leaseErr := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp)
	if leaseErr != nil {
		reply.Header().SetGoError(leaseErr)
		return leaseErr
	}

	switch tArgs := args.(type) {
	case *proto.AdminSplitRequest:
		r.AdminSplit(tArgs, reply.(*proto.AdminSplitResponse))
	case *proto.AdminMergeRequest:
		r.AdminMerge(tArgs, reply.(*proto.AdminMergeResponse))
	default:
		return util.Error("unrecognized admin command")
	}

	// The handlers report their outcome through the reply header.
	return reply.Header().GoError()
}
Example #9
0
// addReadOnlyCmd executes a read-only command according to its read
// consistency:
//
//   - INCONSISTENT reads run directly against the engine. They are
//     rejected inside a transaction, and a zero timestamp is replaced
//     with the current clock reading.
//   - CONSENSUS reads are not implemented and fail with an error.
//   - All other reads are added to the command queue, require the
//     leader lease, and are then executed against the engine.
//
// Intents skipped by a successful execution are passed on to
// handleSkippedIntents.
func (r *Range) addReadOnlyCmd(ctx context.Context, args proto.Request, reply proto.Response) error {
	header := args.Header()

	switch header.ReadConsistency {
	case proto.INCONSISTENT:
		// Inconsistent reads are never allowed within a transaction.
		if header.Txn != nil {
			reply.Header().SetGoError(util.Error("cannot allow inconsistent reads within a transaction"))
			return reply.Header().GoError()
		}
		if header.Timestamp.Equal(proto.ZeroTimestamp) {
			header.Timestamp = r.rm.Clock().Now()
		}
		// Run directly, bypassing command queue and lease check.
		skipped, execErr := r.executeCmd(r.rm.Engine(), nil, args, reply)
		if execErr == nil {
			r.handleSkippedIntents(args, skipped)
		}
		return execErr
	case proto.CONSENSUS:
		reply.Header().SetGoError(util.Error("consensus reads not implemented"))
		return reply.Header().GoError()
	}

	// Enter the command queue so that subsequent overlapping commands
	// are gated until this read completes.
	cmdKey := r.beginCmd(header, true)

	// A consistent read can only be served by the leader lease holder.
	if leaseErr := r.redirectOnOrAcquireLeaderLease(args.Header().Timestamp); leaseErr != nil {
		r.endCmd(cmdKey, args, leaseErr, true /* readOnly */)
		reply.Header().SetGoError(leaseErr)
		return leaseErr
	}

	// Execute read-only command.
	skipped, execErr := r.executeCmd(r.rm.Engine(), nil, args, reply)

	// Leave the command queue. The execution error is passed along; per
	// the original note, the timestamp cache is only updated if the
	// command succeeded.
	r.endCmd(cmdKey, args, execErr, true /* readOnly */)

	if execErr == nil {
		r.handleSkippedIntents(args, skipped)
	}
	return execErr
}
Example #10
0
// maybeWrap wraps the given argument in a batch, unless it is already
// one. Alongside the batch it returns a function that turns the batch
// response back into the kind of response the caller expects: the batch
// response itself when args was a batch, otherwise the first inner
// response (or a fresh reply from args.CreateReply when the batch
// response holds none).
func maybeWrap(args proto.Request) (*proto.BatchRequest, func(*proto.BatchResponse) proto.Response) {
	if batchArgs, isBatch := args.(*proto.BatchRequest); isBatch {
		identity := func(br *proto.BatchResponse) proto.Response { return br }
		return batchArgs, identity
	}

	// Build a one-element batch whose header is a clone of the request's.
	wrapped := &proto.BatchRequest{}
	wrapped.RequestHeader = *(gogoproto.Clone(args.Header()).(*proto.RequestHeader))
	wrapped.Add(args)

	unwrap := func(br *proto.BatchResponse) proto.Response {
		var inner proto.Response
		if len(br.Responses) > 0 {
			inner = br.Responses[0].GetInner()
		} else {
			inner = args.CreateReply()
		}
		// The ReplyTxn is propagated from one response to the next request,
		// and we adopt the mechanism that whenever the Txn changes, it needs
		// to be set in the reply, for example to ratchet up the transaction
		// timestamp on writes when necessary.
		// This is internally necessary to sequentially execute the batch,
		// so it makes some sense to take the burden of updating the Txn
		// from TxnCoordSender - it will only need to act on retries/aborts
		// in the future.
		inner.Header().Txn = br.Txn
		// An error already present on the inner reply wins over the
		// batch-level error.
		if inner.Header().Error == nil {
			inner.Header().Error = br.Error
		}
		return inner
	}
	return wrapped, unwrap
}
Example #11
0
// CallComplete is called by a node whenever it completes a request. It
// publishes exactly one event to the feed: a CallErrorEvent when the
// reply carries an error whose TransactionRestart field is
// TransactionRestart_ABORT, and a CallSuccessEvent otherwise. For a
// non-empty batch the event is attributed to the method of the first
// request in the batch.
// TODO(tschottdorf): move to batch, account for multiple methods per batch.
// In particular, on error want an error position to identify the failed
// request.
func (nef NodeEventFeed) CallComplete(args proto.Request, reply proto.Response) {
	method := args.Method()
	if batch, isBatch := args.(*proto.BatchRequest); isBatch && len(batch.Requests) > 0 {
		method = batch.Requests[0].GetInner().Method()
	}

	err := reply.Header().Error
	if err == nil || err.TransactionRestart != proto.TransactionRestart_ABORT {
		nef.f.Publish(&CallSuccessEvent{
			NodeID: nef.id,
			Method: method,
		})
		return
	}
	nef.f.Publish(&CallErrorEvent{
		NodeID: nef.id,
		Method: method,
	})
}
Example #12
0
// ExecuteCmd looks up the range named by the request header's replica
// and executes the command on it. A request with a zero timestamp is
// stamped with this node's clock (as is the reply); any other timestamp
// is fed into the clock instead. The command is rejected if the range
// cannot be found, does not contain the request's key span, or is not
// the leader. Read-only methods go to ReadOnlyCmd, everything else to
// ReadWriteCmd.
func (s *Store) ExecuteCmd(method string, args proto.Request, reply proto.Response) error {
	header := args.Header()
	if ts := header.Timestamp; ts.WallTime != 0 || ts.Logical != 0 {
		// Update our clock with the incoming request. This advances the
		// local node's clock to a high water mark from amongst all nodes
		// with which it has interacted. The update is bounded by the max
		// clock drift.
		if _, err := s.clock.Update(ts); err != nil {
			return err
		}
	} else {
		// Zero timestamp: initialize both the incoming and the outgoing
		// timestamp from this node's clock.
		now := s.clock.Now()
		args.Header().Timestamp = now
		reply.Header().Timestamp = now
	}

	rng, err := s.GetRange(header.Replica.RangeID)
	if err != nil {
		return err
	}

	switch {
	case !rng.ContainsKeyRange(header.Key, header.EndKey):
		// The specified range does not contain the command's keys.
		return proto.NewRangeKeyMismatchError(header.Key, header.EndKey, rng.Meta)
	case !rng.IsLeader():
		// TODO(spencer): when we happen to know the leader, fill it in here via replica.
		return &proto.NotLeaderError{}
	case IsReadOnly(method):
		return rng.ReadOnlyCmd(method, args, reply)
	default:
		return rng.ReadWriteCmd(method, args, reply)
	}
}
Example #13
0
// applyRaftCommandInBatch executes the command in a batch engine and
// returns the batch containing the results. The caller is responsible
// for committing the batch, even on error.
//
// The steps, in order:
//  1. Unless the command is an InternalLeaderLease request, verify that
//     the current lease is owned by originNode and covers the request
//     timestamp; otherwise return a not-leader error.
//  2. For writes, look for a response cache entry; if one exists, return
//     the error stored in it without executing the command again.
//  3. Execute the command via executeCmd.
//  4. For writes, either merge the MVCC stats into the batch (success) or
//     replace the batch with a fresh one (failure), then store the reply
//     in the response cache through the batch.
//  5. On success, and only when originNode is this replica's Raft node,
//     hand the skipped intents to handleSkippedIntents.
//
// ms receives the MVCC stats accumulated by the command. A batch is
// returned on every path, including the error paths.
func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID,
	args proto.Request, reply proto.Response, ms *engine.MVCCStats) (engine.Engine, error) {
	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		return batch, r.newNotLeaderError(lease, originNode)
	}

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(batch, args.Header().CmdID, reply); err != nil {
			// Any error encountered while fetching the response cache entry means corruption.
			return batch, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err)
		} else if ok {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// We successfully read from the response cache, so return whatever error
			// was present in the cached entry (if any).
			return batch, reply.Header().GoError()
		}
	}

	// Execute the command.
	intents, rErr := r.executeCmd(batch, ms, args, reply)
	// Regardless of error, add result to the response cache if this is
	// a write method. This must be done as part of the execution of
	// raft commands so that every replica maintains the same responses
	// to continue request idempotence, even if leadership changes.
	if proto.IsWrite(args) {
		if rErr == nil {
			// If command was successful, flush the MVCC stats to the batch.
			if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil {
				log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err)
			}
		} else {
			// Otherwise, reset the batch to clear out partial execution and
			// prepare for the failed response cache entry.
			// The batch returned to the caller is the new one from here on.
			batch.Close()
			batch = r.rm.Engine().NewBatch()
		}
		// The cache entry is written into the batch, so it only takes
		// effect once the caller commits that batch.
		if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil {
			log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err)
		}
	}

	// If the execution of the command wasn't successful, stop here.
	if rErr != nil {
		return batch, rErr
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	// NOTE(review): "asynchronously" is the original author's wording; the
	// behavior of handleSkippedIntents is not visible from here.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return batch, nil
}
Example #14
0
// executeCmd switches over the method and multiplexes to execute the
// appropriate storage API command. It returns an error and, for some calls
// such as inconsistent reads, the intents they skipped.
//
// batch is the engine the command reads from and writes to, and ms is
// passed to the mutating handlers for MVCC stats. The handlers report
// their outcome through reply; the returned error is the one found in
// the reply header after the handler ran. Only Get, Scan and
// InternalRangeLookup produce a non-nil intents slice here.
func (r *Range) executeCmd(batch engine.Engine, ms *engine.MVCCStats, args proto.Request, reply proto.Response) ([]proto.Intent, error) {
	// Verify key is contained within range here to catch any range split
	// or merge activity.
	header := args.Header()

	if err := r.checkCmdHeader(header); err != nil {
		reply.Header().SetGoError(err)
		return nil, err
	}

	// If a unittest filter was installed, check for an injected error; otherwise, continue.
	// A filter returning true short-circuits execution: whatever error it
	// left in the reply header is returned and no handler runs.
	if TestingCommandFilter != nil && TestingCommandFilter(args, reply) {
		return nil, reply.Header().GoError()
	}

	var intents []proto.Intent
	// Dispatch on the concrete request type; the reply is asserted to the
	// matching response type and will panic if the caller paired them
	// incorrectly.
	switch tArgs := args.(type) {
	case *proto.GetRequest:
		intents = r.Get(batch, tArgs, reply.(*proto.GetResponse))
	case *proto.PutRequest:
		r.Put(batch, ms, tArgs, reply.(*proto.PutResponse))
	case *proto.ConditionalPutRequest:
		r.ConditionalPut(batch, ms, tArgs, reply.(*proto.ConditionalPutResponse))
	case *proto.IncrementRequest:
		r.Increment(batch, ms, tArgs, reply.(*proto.IncrementResponse))
	case *proto.DeleteRequest:
		r.Delete(batch, ms, tArgs, reply.(*proto.DeleteResponse))
	case *proto.DeleteRangeRequest:
		r.DeleteRange(batch, ms, tArgs, reply.(*proto.DeleteRangeResponse))
	case *proto.ScanRequest:
		intents = r.Scan(batch, tArgs, reply.(*proto.ScanResponse))
	case *proto.EndTransactionRequest:
		r.EndTransaction(batch, ms, tArgs, reply.(*proto.EndTransactionResponse))
	case *proto.InternalRangeLookupRequest:
		intents = r.InternalRangeLookup(batch, tArgs, reply.(*proto.InternalRangeLookupResponse))
	case *proto.InternalHeartbeatTxnRequest:
		r.InternalHeartbeatTxn(batch, ms, tArgs, reply.(*proto.InternalHeartbeatTxnResponse))
	case *proto.InternalGCRequest:
		r.InternalGC(batch, ms, tArgs, reply.(*proto.InternalGCResponse))
	case *proto.InternalPushTxnRequest:
		r.InternalPushTxn(batch, ms, tArgs, reply.(*proto.InternalPushTxnResponse))
	case *proto.InternalResolveIntentRequest:
		r.InternalResolveIntent(batch, ms, tArgs, reply.(*proto.InternalResolveIntentResponse))
	case *proto.InternalResolveIntentRangeRequest:
		r.InternalResolveIntentRange(batch, ms, tArgs, reply.(*proto.InternalResolveIntentRangeResponse))
	case *proto.InternalMergeRequest:
		r.InternalMerge(batch, ms, tArgs, reply.(*proto.InternalMergeResponse))
	case *proto.InternalTruncateLogRequest:
		r.InternalTruncateLog(batch, ms, tArgs, reply.(*proto.InternalTruncateLogResponse))
	case *proto.InternalLeaderLeaseRequest:
		r.InternalLeaderLease(batch, ms, tArgs, reply.(*proto.InternalLeaderLeaseResponse))
	default:
		// NOTE(review): unlike the checkCmdHeader failure above, this
		// path does not store the error in the reply header - confirm
		// that callers do not rely on the header here.
		return nil, util.Errorf("unrecognized command %s", args.Method())
	}

	if log.V(2) {
		log.Infof("executed %s command %+v: %+v", args.Method(), args, reply)
	}

	// Update the node clock with the serviced request. This maintains a
	// high water mark for all ops serviced, so that received ops
	// without a timestamp specified are guaranteed one higher than any
	// op already executed for overlapping keys.
	// NOTE(review): the values returned by Update are discarded here.
	r.rm.Clock().Update(header.Timestamp)

	// Propagate the request timestamp (which may have changed).
	reply.Header().Timestamp = header.Timestamp

	// Capture the error now; it is what gets returned below.
	err := reply.Header().GoError()

	// A ReadWithinUncertaintyIntervalError contains the timestamp of the value
	// that provoked the conflict. However, we forward the timestamp to the
	// node's time here. The reason is that the caller (which is always
	// transactional when this error occurs) in our implementation wants to
	// use this information to extract a timestamp after which reads from
	// the nodes are causally consistent with the transaction. This allows
	// the node to be classified as without further uncertain reads for the
	// remainder of the transaction.
	// See the comment on proto.Transaction.CertainNodes.
	if tErr, ok := reply.Header().GoError().(*proto.ReadWithinUncertaintyIntervalError); ok && tErr != nil {
		// Note that we can use this node's clock (which may be different from
		// other replicas') because this error attaches the existing timestamp
		// to the node itself when retrying.
		// NOTE(review): this mutates the error obtained from a second
		// GoError() call; presumably it is the same object as err above -
		// confirm.
		tErr.ExistingTimestamp.Forward(r.rm.Clock().Now())
	}

	// Return the error (if any) set in the reply.
	return intents, err
}
Example #15
0
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
//
// Send has no return value: the outcome, including any error, is
// reported through call.Reply. The header fields it adjusts along the
// way (timestamp, bound, and Key or EndKey) are restored by deferred
// functions before it returns.
func (ds *DistSender) Send(ctx context.Context, call proto.Call) {
	args := call.Args

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	trace := tracer.FromCtx(ctx)

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		// The original timestamp is captured as the deferred call's
		// argument, i.e. before it is overwritten on the next line.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}
	_, isReverseScan := call.Args.(*proto.ReverseScanRequest)
	// Restore to the original range if the scan/reverse_scan crosses range boundaries.
	// A reverse scan walks ranges by moving EndKey, everything else by
	// moving Key, so only the field that gets moved is restored.
	if isReverseScan {
		defer func(key proto.Key) {
			args.Header().EndKey = key
		}(args.Header().EndKey)
	} else {
		defer func(key proto.Key) {
			args.Header().Key = key
		}(args.Header().Key)
	}

	// first is true until the first per-range reply has been copied into
	// call.Reply; later replies are combined into it.
	first := true

	// Retry logic for lookup of range by key and RPCs to range replicas.
	// Each pass of the outer loop handles one range; the inner loop
	// retries the lookup and the send for that range.
	for {
		var curReply proto.Response
		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors). Our
			// error handling below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			descDone := trace.Epoch("meta descriptor lookup")
			// It is safe to pass call here (with its embedded reply) because
			// the reply is only used to check that it implements
			// proto.Combinable if the request spans multiple ranges.
			desc, descNext, err = ds.getDescriptors(call)
			descDone()
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(retry.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}
			// At this point reply.Header().Error may be non-nil!
			curReply, err = ds.sendAttempt(trace, args, desc)

			// descKey is the key under which the descriptor is evicted
			// from the range cache should that become necessary below.
			descKey := args.Header().Key
			if isReverseScan {
				descKey = args.Header().EndKey
			}

			if err != nil {
				trace.Event(fmt.Sprintf("send error: %T", err))
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
			} else {
				// The RPC itself went through; pick up the error (if any)
				// carried by the reply.
				err = curReply.Header().GoError()
			}

			if err == nil {
				break
			}

			if log.V(1) {
				log.Warningf("failed to invoke %s: %s", call.Method(), err)
			}

			// If retryable, allow retry. For range not found or range
			// key mismatch errors, we don't backoff on the retry,
			// but reset the backoff loop so we can retry immediately.
			switch tErr := err.(type) {
			case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				// Range descriptor might be out of date - evict it.
				ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
				// On addressing errors, don't backoff; retry immediately.
				r.Reset()
				if log.V(1) {
					log.Warning(err)
				}
				continue
			case *proto.NotLeaderError:
				trace.Event(fmt.Sprintf("reply error: %T", err))
				newLeader := tErr.GetLeader()
				// Verify that leader is a known replica according to the
				// descriptor. If not, we've got a stale replica; evict cache.
				// Next, cache the new leader.
				if newLeader != nil {
					if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
						if log.V(1) {
							log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
						}
						ds.rangeCache.EvictCachedRangeDescriptor(descKey, desc, isReverseScan)
					}
				} else {
					// No leader was reported; an empty replica is
					// stored in the leader cache instead.
					newLeader = &proto.Replica{}
				}
				ds.updateLeaderCache(proto.RangeID(desc.RangeID), *newLeader)
				if log.V(1) {
					log.Warning(err)
				}
				r.Reset()
				continue
			case retry.Retryable:
				if tErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					trace.Event(fmt.Sprintf("reply error: %T", err))
					continue
				}
			}
			// Reached for errors that are not retryable: leave the retry
			// loop with err still set.
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if first {
			// Equivalent of `*call.Reply = curReply`. Generics!
			dst := reflect.ValueOf(call.Reply).Elem()
			dst.Set(reflect.ValueOf(curReply).Elem())
		} else {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cReply, ok := call.Reply.(proto.Combinable); ok {
				cReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		first = false

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		// A bound that is not positive is left alone and does not end
		// the loop early.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		if isReverseScan {
			// In next iteration, query previous range.
			// We use the StartKey of the current descriptor as opposed to the
			// EndKey of the previous one.
			args.Header().EndKey = desc.StartKey
		} else {
			// In next iteration, query next range.
			// It's important that we use the EndKey of the current descriptor
			// as opposed to the StartKey of the next one: if the former is stale,
			// it's possible that the next range has since merged the subsequent
			// one, and unless both descriptors are stale, the next descriptor's
			// StartKey would move us to the beginning of the current range,
			// resulting in a duplicate scan.
			args.Header().Key = desc.EndKey
		}
		trace.Event("querying next range")
	}
}
Example #16
0
// addWriteCmd runs a write command through Raft. The command is first
// registered with the command queue so that subsequent overlapping
// commands are gated behind it, then the leader lease is checked (and
// acquired if possible). For commands that use the timestamp cache,
// the command's timestamp is pushed past the most recent overlapping
// read; a more recent overlapping write either pushes the timestamp as
// well (no transaction) or records a WriteTooOldError in the reply
// (within a transaction). Finally the command is proposed to Raft and,
// once it has finished, handed to endCmd.
//
// If wait is true, the call blocks until the command is complete and
// returns its error. Otherwise completion is awaited in a separate
// goroutine, which logs any error, and nil is returned immediately
// after the command has been proposed.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wait bool) error {
	header := args.Header()

	// Enter the command queue before consulting the timestamp cache:
	// the cache is only updated after preceding commands have run to
	// successful completion, so overlapping commands must be gated
	// first.
	queueKey := r.beginCmd(header, false)

	// Only the holder of the leader lease may process a write.
	if err := r.redirectOnOrAcquireLeaderLease(header.Timestamp); err != nil {
		r.endCmd(queueKey, args, err, false /* !readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Two invariants are enforced here: 1) running into a more recently
	// written value means the transaction must restart; 2) a value must
	// be written at a timestamp greater than the most recent read of
	// the same key. The updated timestamp informs the final commit
	// timestamp once the write returns.
	if usesTimestampCache(args) {
		r.Lock()
		lastRead, lastWrite := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
		r.Unlock()

		// A read at or after our timestamp always pushes us forward.
		if !lastRead.Less(header.Timestamp) {
			header.Timestamp = lastRead.Next()
		}
		// A write at or after our (possibly pushed) timestamp...
		if !lastWrite.Less(header.Timestamp) {
			if header.Txn == nil {
				// ...simply advances a non-transactional request.
				header.Timestamp = lastWrite.Next()
			} else {
				// ...is flagged in the reply for a transactional request.
				// The write is still attempted, to avoid restarting the
				// transaction when there is no intent or the intent can
				// be pushed by us.
				reply.Header().SetGoError(&proto.WriteTooOldError{
					Timestamp:         header.Timestamp,
					ExistingTimestamp: lastWrite,
				})
			}
		}
	}

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

	// finish performs the mandatory cleanup. It runs synchronously when
	// the caller waits and in a goroutine otherwise.
	finish := func() error {
		var err error
		switch err = <-errChan; err {
		case nil:
			// Raft committed the command; wait for the range to apply it.
			err = <-pendingCmd.done
		case multiraft.ErrGroupDeleted:
			// Convert so that clients will retry.
			err = proto.NewRangeNotFoundError(r.Desc().RaftID)
		}
		// As for reads, the timestamp cache is updated with this write's
		// timestamp on success, ensuring a strictly higher timestamp for
		// successive writes to the same key or key range.
		r.endCmd(queueKey, args, err, false /* !readOnly */)
		return err
	}

	if !wait {
		go func() {
			// The original client is not waiting (e.g. resolve write
			// intent), so log execution errors to surface them somewhere.
			if err := finish(); err != nil {
				// TODO(tschottdorf): possible security risk to log args.
				log.Warningc(ctx, "async execution of %v failed: %s", args, err)
			}
		}()
		return nil
	}
	return finish()
}
Example #17
0
// addWriteCmd runs a write command through Raft. After validating the
// command header, the command is added to the command queue as a
// pending write, the leader lease is checked (and acquired if
// possible), and the timestamp cache is consulted: newer accesses to
// the affected keys move the command's timestamp forward. The command
// is then proposed to Raft; once it has finished it is handed to endCmd
// and any error is returned.
//
// If wg is non-nil, it is signaled exactly once: when the command has
// been proposed to Raft, or when the function returns early with a
// preprocessing error, whichever happens first.
func (r *Range) addWriteCmd(ctx context.Context, args proto.Request, reply proto.Response, wg *sync.WaitGroup) error {
	// notify signals wg at most once; clearing wg afterwards turns any
	// later invocation into a no-op.
	notify := func() {
		if wg == nil {
			return
		}
		wg.Done()
		wg = nil
	}

	// notify is invoked eagerly after proposing below; deferring it as
	// well guarantees that early returns do not skip the signal.
	defer notify()

	header := args.Header()

	if err := r.checkCmdHeader(args.Header()); err != nil {
		reply.Header().SetGoError(err)
		return err
	}

	trace := tracer.FromCtx(ctx)

	// Enter the command queue before consulting the timestamp cache:
	// the cache is only updated after preceding commands have run to
	// successful completion, so overlapping commands must be gated
	// first.
	endQueue := trace.Epoch("command queue")
	queueKey := r.beginCmd(header, false)
	endQueue()

	// Only the holder of the leader lease may process a write.
	if err := r.redirectOnOrAcquireLeaderLease(trace, header.Timestamp); err != nil {
		r.endCmd(queueKey, args, err, false /* !readOnly */)
		reply.Header().SetGoError(err)
		return err
	}

	// Two invariants are enforced here: 1) running into a more recently
	// written value means the transaction must restart; 2) a value must
	// be written at a timestamp greater than the most recent read of
	// the same key. The updated timestamp informs the final commit
	// timestamp once the write returns.
	if usesTimestampCache(args) {
		r.Lock()
		lastRead, lastWrite := r.tsCache.GetMax(header.Key, header.EndKey, header.Txn.GetID())
		r.Unlock()

		// A read at or after our timestamp always pushes us forward.
		if !lastRead.Less(header.Timestamp) {
			header.Timestamp = lastRead.Next()
		}
		// A write at or after our (possibly pushed) timestamp only
		// advances non-transactional requests. Within a txn the write
		// is attempted as-is, to avoid restarting the transaction when
		// there is no intent or the intent can be pushed by us.
		if !lastWrite.Less(header.Timestamp) && header.Txn == nil {
			header.Timestamp = lastWrite.Next()
		}
	}

	defer trace.Epoch("raft")()

	errChan, pendingCmd := r.proposeRaftCommand(ctx, args, reply)

	// The command has entered Raft; release the waiter, if any.
	notify()

	var err error
	switch err = <-errChan; err {
	case nil:
		// Raft committed the command; wait for the range to apply it.
		err = <-pendingCmd.done
	case multiraft.ErrGroupDeleted:
		// Convert so that clients will retry.
		err = proto.NewRangeNotFoundError(r.Desc().RaftID)
	}
	// As for reads, the timestamp cache is updated with this write's
	// timestamp on success, ensuring a strictly higher timestamp for
	// successive writes to the same key or key range.
	r.endCmd(queueKey, args, err, false /* !readOnly */)
	return err
}
Example #18
0
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// index is the raft command index; a zero index is fatal. originNode is
// checked against the current leader lease and compared with
// r.rm.RaftNodeID() to decide whether skipped intents are handed to
// handleSkippedIntents. args is the command to execute and reply
// receives its result, including the final error.
//
// The result is named (rErr) because the deferred functions below read
// it and may replace it with a replicaCorruptionError. Deferred
// functions run in reverse order of registration: the batch is closed
// first, then the response cache is updated, and the applied index is
// advanced last.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	// NOTE(review): index is unsigned, so this condition only holds for
	// index == 0.
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// committed is set once the batch (which carries the applied index
	// update) has been committed successfully; see the write path below.
	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			// This writes directly to the engine rather than to the batch.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no more be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		//
		// Returning here, before the next defer is registered, is what
		// keeps this error out of the response cache.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)

		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				// The previous result error (rErr) is passed along to the
				// corruption error rather than dropped.
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// err is nil in this branch, so nil is returned. Presumably
			// GetResponse has filled reply with the cached response;
			// confirm against its implementation.
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
		// NOTE(review): an error returned together with ok == false is
		// ignored here and execution proceeds; confirm this is intended.
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance to collect the stats changes
	// made by this command.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	// The in-memory applied index must be strictly below the index being
	// applied; anything else is treated as replica corruption.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	// Only write commands commit the batch. For any other command,
	// committed stays false and the first deferred function advances the
	// applied index instead.
	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update cached stats and appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			// Only keys below keys.SystemMax are considered.
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}
	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
Example #19
0
// safeSetGoError stores err in the reply's header via SetGoError. It
// panics with proto.ErrorUnexpectedlySet if the header's Error field is
// already non-nil, so an existing error is never overwritten.
func safeSetGoError(reply proto.Response, err error) {
	if reply.Header().Error == nil {
		reply.Header().SetGoError(err)
		return
	}
	panic(proto.ErrorUnexpectedlySet)
}