// AddCmd adds a command for execution on this range. The command's // affected keys are verified to be contained within the range and the // range's leadership is confirmed. The command is then dispatched // either along the read-only execution path or the read-write Raft // command queue. func (r *Range) AddCmd(ctx context.Context, call proto.Call) error { args := call.Args // TODO(tschottdorf) Some (internal) requests go here directly, so they // won't be traced. trace := tracer.FromCtx(ctx) // Differentiate between admin, read-only and read-write. var reply proto.Response var err error if proto.IsAdmin(args) { defer trace.Epoch("admin path")() reply, err = r.addAdminCmd(ctx, args) } else if proto.IsReadOnly(args) { defer trace.Epoch("read-only path")() reply, err = r.addReadOnlyCmd(ctx, args) } else if proto.IsWrite(args) { defer trace.Epoch("read-write path")() reply, err = r.addWriteCmd(ctx, args, nil) } else { panic(fmt.Sprintf("don't know how to handle command %T", args)) } if reply != nil { gogoproto.Merge(call.Reply, reply) } if err != nil { replyHeader := call.Reply.Header() if replyHeader.Error != nil { panic("the world is on fire") } replyHeader.SetGoError(err) } return err }
// applyRaftCommand applies a raft command from the replicated log to the // underlying state machine (i.e. the engine). // When certain critical operations fail, a replicaCorruptionError may be // returned and must be handled by the caller. func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) error { if index <= 0 { log.Fatalc(ctx, "raft command index is <= 0") } // If we have an out of order index, there's corruption. No sense in trying // to update anything or run the command. Simply return a corruption error. if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index { return newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index)) } // Call the helper, which returns a batch containing data written // during command execution and any associated error. ms := engine.MVCCStats{} batch, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, reply, &ms) // ALWAYS set the reply header error to the error returned by the // helper. This is the definitive result of the execution. The // error must be set before saving to the response cache. // TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not // touch the reply header's error field. reply.Header().SetGoError(rErr) defer batch.Close() // Advance the last applied index and commit the batch. if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil { log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err) } if err := batch.Commit(); err != nil { rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr) } else { // Update cached appliedIndex if we were able to set the applied index on disk. atomic.StoreUint64(&r.appliedIndex, index) } // On successful write commands, flush to event feed, and handle other // write-related triggers including splitting and config gossip updates. 
if rErr == nil && proto.IsWrite(args) { // Publish update to event feed. r.rm.EventFeed().updateRange(r, args.Method(), &ms) // If the commit succeeded, potentially add range to split queue. r.maybeAddToSplitQueue() // Maybe update gossip configs on a put. switch args.(type) { case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest: if key := args.Header().Key; key.Less(keys.SystemMax) { // We hold the lock already. r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool { return bytes.HasPrefix(key, configPrefix) }) } } } return rErr }
// applyRaftCommand applies a raft command from the replicated log to the // underlying state machine (i.e. the engine). // When certain critical operations fail, a replicaCorruptionError may be // returned and must be handled by the caller. func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request) (proto.Response, error) { if index <= 0 { log.Fatalc(ctx, "raft command index is <= 0") } // If we have an out of order index, there's corruption. No sense in trying // to update anything or run the command. Simply return a corruption error. if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index { return nil, newReplicaCorruptionError(util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index)) } // Call the helper, which returns a batch containing data written // during command execution and any associated error. ms := engine.MVCCStats{} batch, reply, rErr := r.applyRaftCommandInBatch(ctx, index, originNode, args, &ms) defer batch.Close() // Advance the last applied index and commit the batch. if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil { log.Fatalc(ctx, "setting applied index in a batch should never fail: %s", err) } if err := batch.Commit(); err != nil { rErr = newReplicaCorruptionError(util.Errorf("could not commit batch"), err, rErr) } else { // Update cached appliedIndex if we were able to set the applied index on disk. atomic.StoreUint64(&r.appliedIndex, index) } // On successful write commands, flush to event feed, and handle other // write-related triggers including splitting and config gossip updates. if rErr == nil && proto.IsWrite(args) { // Publish update to event feed. r.rm.EventFeed().updateRange(r, args.Method(), &ms) // If the commit succeeded, potentially add range to split queue. r.maybeAddToSplitQueue() // Maybe update gossip configs if the command is not part of a transaction. 
// If the command is part of an uncommitted transaction, we rely on the // periodic configGossipInterval loop since we will not see the update // until the transaction is committed. if key := args.Header().Key; key.Less(keys.SystemMax) && args.Header().Txn == nil { r.maybeGossipConfigs(func(configPrefix proto.Key) bool { return bytes.HasPrefix(key, configPrefix) }) } } return reply, rErr }
// AddCmd adds a command for execution on this range. The command's // affected keys are verified to be contained within the range and the // range's leadership is confirmed. The command is then dispatched // either along the read-only execution path or the read-write Raft // command queue. func (r *Replica) AddCmd(ctx context.Context, args proto.Request) (proto.Response, error) { // TODO(tschottdorf) Some (internal) requests go here directly, so they // won't be traced. trace := tracer.FromCtx(ctx) // Differentiate between admin, read-only and read-write. var reply proto.Response var err error if proto.IsAdmin(args) { defer trace.Epoch("admin path")() reply, err = r.addAdminCmd(ctx, args) } else if proto.IsReadOnly(args) { defer trace.Epoch("read-only path")() reply, err = r.addReadOnlyCmd(ctx, args) } else if proto.IsWrite(args) { defer trace.Epoch("read-write path")() reply, err = r.addWriteCmd(ctx, args, nil) } else { panic(fmt.Sprintf("don't know how to handle command %T", args)) } return reply, err }
// verifyPermissions verifies that the requesting user (header.User) // has permission to read/write (capabilities depend on method // name). In the event that multiple permission configs apply to the // key range implicated by the command, the lowest common denominator // for permission. For example, if a scan crosses two permission // configs, both configs must allow read permissions or the entire // scan will fail. func (ds *DistSender) verifyPermissions(args proto.Request) error { // The root user can always proceed. header := args.Header() if header.User == storage.UserRoot { return nil } // Check for admin methods. if proto.IsAdmin(args) { if header.User != storage.UserRoot { return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method()) } return nil } // Get permissions map from gossip. configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission) if err != nil { return util.Errorf("permissions not available via gossip") } if configMap == nil { return util.Errorf("perm configs not available; cannot execute %s", args.Method()) } permMap := configMap.(storage.PrefixConfigMap) headerEnd := header.EndKey if len(headerEnd) == 0 { headerEnd = header.Key } // Visit PermConfig(s) which apply to the method's key range. // - For each perm config which the range covers, verify read or writes // are allowed as method requires. // - Verify the permissions hierarchically; that is, if permissions aren't // granted at the longest prefix, try next longest, then next, etc., up // to and including the default prefix. // // TODO(spencer): it might make sense to visit prefixes from the // shortest to longest instead for performance. Keep an eye on profiling // for this code path as permission sets grow large. 
return permMap.VisitPrefixes(header.Key, headerEnd, func(start, end proto.Key, config interface{}) (bool, error) { hasPerm := false if err := permMap.VisitPrefixesHierarchically(start, func(start, end proto.Key, config interface{}) (bool, error) { perm := config.(*proto.PermConfig) if proto.IsRead(args) && !perm.CanRead(header.User) { return false, nil } if proto.IsWrite(args) && !perm.CanWrite(header.User) { return false, nil } // Return done = true, as permissions have been granted by this config. hasPerm = true return true, nil }); err != nil { return false, err } if !hasPerm { if len(header.EndKey) == 0 { return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start) } return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end) } return false, nil }) }
// applyRaftCommand applies a raft command from the replicated log to the
// underlying state machine (i.e. the engine).
// When certain critical operations fail, a replicaCorruptionError may be
// returned and must be handled by the caller.
//
// The result is returned through the named result rErr, which the two
// deferred functions below may replace with a replicaCorruptionError.
// Deferred functions run in reverse order of registration: the response
// cache update (registered second) runs first, and the applied index
// update (registered first) runs last.
func (r *Range) applyRaftCommand(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, reply proto.Response) (rErr error) {
	// index is unsigned, so this check can only trigger for index == 0.
	if index <= 0 {
		log.Fatalc(ctx, "raft command index is <= 0")
	}

	// committed is set once the batch below has been committed; that batch
	// already carries the new applied index.
	committed := false
	// The very last thing we do before returning is move the applied index
	// forward, unless that has already happened as part of a successfully
	// committed batch.
	defer func() {
		if !committed {
			// We didn't commit the batch, but advance the last applied index nonetheless.
			// This write goes directly to the engine rather than to a batch.
			if err := setAppliedIndex(r.rm.Engine(), r.Desc().RaftID, index); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not advance applied index"), err, rErr)
				return
			}
			atomic.StoreUint64(&r.appliedIndex, index)
		}
	}()

	if lease := r.getLease(); args.Method() != proto.InternalLeaderLease &&
		(!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) {
		// Verify the leader lease is held, unless this command is trying to
		// obtain it. Any other Raft command has had the leader lease held
		// by the replica at proposal time, but this may no longer be the case.
		// Corruption aside, the most likely reason is a leadership change (the
		// most recent leader assumes responsibility for all past timestamps as
		// well). In that case, it's not valid to go ahead with the execution:
		// Writes must be aware of the last time the mutated key was read, and
		// since reads are served locally by the lease holder without going
		// through Raft, a read which was not taken into account may have been
		// served. Hence, we must retry at the current leader.
		//
		// It's crucial that we don't update the response cache for the error
		// returned below since the request is going to be retried with the
		// same ClientCmdID and would get the distributed sender stuck in an
		// infinite loop, retrieving a stale NotLeaderError over and over
		// again, even when proposing at the correct replica.
		//
		// This return happens before the response cache defer below is
		// registered, so only the applied index defer runs for it.
		return r.newNotLeaderError(lease)
	}

	// Anything happening from now on needs to enter the response cache.
	defer func() {
		// TODO(tamird,tschottdorf): according to #1400 we intend to set the reply
		// header's error as late as possible and in a central location. Range
		// commands still write to the header directly, but once they don't this
		// could be the authoritative location that sets the reply error for any-
		// thing that makes it into Raft. Note that we must set this prior to
		// signaling cmd.done below, or the waiting RPC handler might proceed
		// before we've updated its reply.
		//
		// It is important that the error is set before the reply is saved into
		// the response cache.
		reply.Header().SetGoError(rErr)
		if proto.IsWrite(args) {
			// No matter the result, add result to the response cache if this
			// is a write method. This must be done as part of the execution of
			// raft commands so that every replica maintains the same responses
			// to continue request idempotence, even if leadership changes.
			if err := r.respCache.PutResponse(args.Header().CmdID, reply); err != nil {
				rErr = newReplicaCorruptionError(
					util.Errorf("could not put to response cache"), err, rErr)
				return
			}
		}
	}()

	header := args.Header()

	// Check the response cache to ensure idempotency.
	if proto.IsWrite(args) {
		if ok, err := r.respCache.GetResponse(header.CmdID, reply); ok && err == nil {
			if log.V(1) {
				log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID)
			}
			// err is nil in this branch, so nil is returned.
			// NOTE(review): the deferred function above then calls
			// SetGoError(nil) on the reply that was just read from the
			// cache and stores it again; confirm this cannot discard an
			// error recorded in the cached entry.
			return err
		} else if ok && err != nil {
			return newReplicaCorruptionError(
				util.Errorf("could not read from response cache"), err)
		}
		// NOTE(review): an error reported together with ok == false is not
		// handled here and execution continues; confirm this is intended.
	}

	// Create a new batch for the command to ensure all or nothing semantics.
	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Create an engine.MVCCStats instance.
	ms := engine.MVCCStats{}

	// Execute the command; the error will also be set in the reply header.
	// TODO(tschottdorf,tamird) For #1400, want to refactor executeCmd to not
	// touch the reply header's error field.
	intents, err := r.executeCmd(batch, &ms, args, reply)
	// If the execution of the command wasn't successful, stop here.
	if err != nil {
		return err
	}

	// An applied index at or beyond this command's index is reported as
	// corruption.
	if oldIndex := atomic.LoadUint64(&r.appliedIndex); oldIndex >= index {
		return newReplicaCorruptionError(
			util.Errorf("applied index moved backwards: %d >= %d", oldIndex, index))
	}

	// Advance the applied index atomically within the batch.
	if err := setAppliedIndex(batch, r.Desc().RaftID, index); err != nil {
		return newReplicaCorruptionError(
			util.Errorf("could not update applied index"), err)
	}

	// The batch is only committed for write commands. For any other
	// command committed stays false and the first deferred function
	// advances the applied index instead.
	if proto.IsWrite(args) {
		// On success, flush the MVCC stats to the batch and commit.
		if err := r.stats.MergeMVCCStats(batch, &ms, header.Timestamp.WallTime); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not merge MVCC stats"), err)
		}
		if err := batch.Commit(); err != nil {
			return newReplicaCorruptionError(util.Errorf("could not commit batch"), err)
		}
		committed = true
		// Publish update to event feed.
		r.rm.EventFeed().updateRange(r, args.Method(), &ms)
		// After successful commit, update the cached appliedIndex value.
		atomic.StoreUint64(&r.appliedIndex, index)
		// If the commit succeeded, potentially add range to split queue.
		r.maybeAddToSplitQueue()
		// Maybe update gossip configs on a put, delete or delete range.
		switch args.(type) {
		case *proto.PutRequest, *proto.DeleteRequest, *proto.DeleteRangeRequest:
			if header.Key.Less(keys.SystemMax) {
				// We hold the lock already.
				r.maybeGossipConfigsLocked(func(configPrefix proto.Key) bool {
					return bytes.HasPrefix(header.Key, configPrefix)
				})
			}
		}
	}

	// On success and only on the replica on which this command originated,
	// resolve skipped intents asynchronously.
	if originNode == r.rm.RaftNodeID() {
		r.handleSkippedIntents(args, intents)
	}

	return nil
}
// applyRaftCommandInBatch executes the command in a batch engine and // returns the batch containing the results. The caller is responsible // for committing the batch, even on error. func (r *Range) applyRaftCommandInBatch(ctx context.Context, index uint64, originNode proto.RaftNodeID, args proto.Request, ms *engine.MVCCStats) (engine.Engine, proto.Response, error) { // Create a new batch for the command to ensure all or nothing semantics. batch := r.rm.Engine().NewBatch() if lease := r.getLease(); args.Method() != proto.InternalLeaderLease && (!lease.OwnedBy(originNode) || !lease.Covers(args.Header().Timestamp)) { // Verify the leader lease is held, unless this command is trying to // obtain it. Any other Raft command has had the leader lease held // by the replica at proposal time, but this may no longer be the case. // Corruption aside, the most likely reason is a leadership change (the // most recent leader assumes responsibility for all past timestamps as // well). In that case, it's not valid to go ahead with the execution: // Writes must be aware of the last time the mutated key was read, and // since reads are served locally by the lease holder without going // through Raft, a read which was not taken into account may have been // served. Hence, we must retry at the current leader. // // It's crucial that we don't update the response cache for the error // returned below since the request is going to be retried with the // same ClientCmdID and would get the distributed sender stuck in an // infinite loop, retrieving a stale NotLeaderError over and over // again, even when proposing at the correct replica. return batch, nil, r.newNotLeaderError(lease, originNode) } // Check the response cache to ensure idempotency. if proto.IsWrite(args) { if reply, err := r.respCache.GetResponse(batch, args.Header().CmdID); err != nil { // Any error encountered while fetching the response cache entry means corruption. 
return batch, reply, newReplicaCorruptionError(util.Errorf("could not read from response cache"), err) } else if reply != nil { if log.V(1) { log.Infoc(ctx, "found response cache entry for %+v", args.Header().CmdID) } // TODO(tamird): move this into the response cache itself defer func() { reply.Header().Error = nil }() // We successfully read from the response cache, so return whatever error // was present in the cached entry (if any). return batch, reply, reply.Header().GoError() } } // Execute the command. reply, intents, rErr := r.executeCmd(batch, ms, args) // Regardless of error, add result to the response cache if this is // a write method. This must be done as part of the execution of // raft commands so that every replica maintains the same responses // to continue request idempotence, even if leadership changes. if proto.IsWrite(args) { if rErr == nil { // If command was successful, flush the MVCC stats to the batch. if err := r.stats.MergeMVCCStats(batch, ms, args.Header().Timestamp.WallTime); err != nil { log.Fatalc(ctx, "setting mvcc stats in a batch should never fail: %s", err) } } else { // Otherwise, reset the batch to clear out partial execution and // prepare for the failed response cache entry. batch.Close() batch = r.rm.Engine().NewBatch() } // TODO(tamird): move this into the response cache itself if reply == nil { reply = args.CreateReply() } if reply.Header().Error != nil { panic("the world is on fire") } reply.Header().SetGoError(rErr) if err := r.respCache.PutResponse(batch, args.Header().CmdID, reply); err != nil { log.Fatalc(ctx, "putting a response cache entry in a batch should never fail: %s", err) } reply.Header().Error = nil } // If the execution of the command wasn't successful, stop here. if rErr != nil { return batch, reply, rErr } // On success and only on the replica on which this command originated, // resolve skipped intents asynchronously. 
if originNode == r.rm.RaftNodeID() { r.handleSkippedIntents(args, intents) } return batch, reply, nil }
// TestVerifyPermissions verifies permissions are checked for single // zones and across multiple zones. It also verifies that permissions // are checked hierarchically. func TestVerifyPermissions(t *testing.T) { defer leaktest.AfterTest(t) n := simulation.NewNetwork(1, "tcp", gossip.TestInterval) ds := NewDistSender(nil, n.Nodes[0].Gossip) config1 := &proto.PermConfig{ Read: []string{"read1", "readAll", "rw1", "rwAll"}, Write: []string{"write1", "writeAll", "rw1", "rwAll"}} config2 := &proto.PermConfig{ Read: []string{"read2", "readAll", "rw2", "rwAll"}, Write: []string{"write2", "writeAll", "rw2", "rwAll"}} configs := []*storage.PrefixConfig{ {proto.KeyMin, nil, config1}, {proto.Key("a"), nil, config2}, } configMap, err := storage.NewPrefixConfigMap(configs) if err != nil { t.Fatalf("failed to make prefix config map, err: %s", err.Error()) } if err := ds.gossip.AddInfo(gossip.KeyConfigPermission, configMap, time.Hour); err != nil { t.Fatal(err) } allRequestTypes := []proto.Request{ &proto.GetRequest{}, &proto.PutRequest{}, &proto.ConditionalPutRequest{}, &proto.IncrementRequest{}, &proto.DeleteRequest{}, &proto.DeleteRangeRequest{}, &proto.ScanRequest{}, &proto.EndTransactionRequest{}, &proto.BatchRequest{}, &proto.AdminSplitRequest{}, &proto.AdminMergeRequest{}, &proto.InternalHeartbeatTxnRequest{}, &proto.InternalGCRequest{}, &proto.InternalPushTxnRequest{}, &proto.InternalRangeLookupRequest{}, &proto.InternalResolveIntentRequest{}, &proto.InternalResolveIntentRangeRequest{}, &proto.InternalMergeRequest{}, &proto.InternalTruncateLogRequest{}, &proto.InternalLeaderLeaseRequest{}, &proto.InternalBatchRequest{}, } var readOnlyRequests []proto.Request var writeOnlyRequests []proto.Request var readWriteRequests []proto.Request for _, r := range allRequestTypes { if proto.IsRead(r) && !proto.IsWrite(r) { readOnlyRequests = append(readOnlyRequests, r) } if proto.IsWrite(r) && !proto.IsRead(r) { writeOnlyRequests = append(writeOnlyRequests, r) } if proto.IsRead(r) && 
proto.IsWrite(r) { readWriteRequests = append(readWriteRequests, r) } } testData := []struct { // Permission-based db methods from the storage package. requests []proto.Request user string startKey, endKey proto.Key hasPermission bool }{ // Test permissions within a single range {readOnlyRequests, "read1", proto.KeyMin, proto.KeyMin, true}, {readOnlyRequests, "rw1", proto.KeyMin, proto.KeyMin, true}, {readOnlyRequests, "write1", proto.KeyMin, proto.KeyMin, false}, {readOnlyRequests, "random", proto.KeyMin, proto.KeyMin, false}, {readWriteRequests, "rw1", proto.KeyMin, proto.KeyMin, true}, {readWriteRequests, "read1", proto.KeyMin, proto.KeyMin, false}, {readWriteRequests, "write1", proto.KeyMin, proto.KeyMin, false}, {writeOnlyRequests, "write1", proto.KeyMin, proto.KeyMin, true}, {writeOnlyRequests, "rw1", proto.KeyMin, proto.KeyMin, true}, {writeOnlyRequests, "read1", proto.KeyMin, proto.KeyMin, false}, {writeOnlyRequests, "random", proto.KeyMin, proto.KeyMin, false}, // Test permissions hierarchically. {readOnlyRequests, "read1", proto.Key("a"), proto.Key("a1"), true}, {readWriteRequests, "rw1", proto.Key("a"), proto.Key("a1"), true}, {writeOnlyRequests, "write1", proto.Key("a"), proto.Key("a1"), true}, // Test permissions across both ranges. 
{readOnlyRequests, "readAll", proto.KeyMin, proto.Key("b"), true}, {readOnlyRequests, "read1", proto.KeyMin, proto.Key("b"), true}, {readOnlyRequests, "read2", proto.KeyMin, proto.Key("b"), false}, {readOnlyRequests, "random", proto.KeyMin, proto.Key("b"), false}, {readWriteRequests, "rwAll", proto.KeyMin, proto.Key("b"), true}, {readWriteRequests, "rw1", proto.KeyMin, proto.Key("b"), true}, {readWriteRequests, "random", proto.KeyMin, proto.Key("b"), false}, {writeOnlyRequests, "writeAll", proto.KeyMin, proto.Key("b"), true}, {writeOnlyRequests, "write1", proto.KeyMin, proto.Key("b"), true}, {writeOnlyRequests, "write2", proto.KeyMin, proto.Key("b"), false}, {writeOnlyRequests, "random", proto.KeyMin, proto.Key("b"), false}, // Test permissions within and around the boundaries of a range, // representatively using rw methods. {readWriteRequests, "rw2", proto.Key("a"), proto.Key("b"), true}, {readWriteRequests, "rwAll", proto.Key("a"), proto.Key("b"), true}, {readWriteRequests, "rw2", proto.Key("a"), proto.Key("a"), true}, {readWriteRequests, "rw2", proto.Key("a"), proto.Key("a1"), true}, {readWriteRequests, "rw2", proto.Key("a"), proto.Key("b1"), false}, {readWriteRequests, "rw2", proto.Key("a3"), proto.Key("a4"), true}, {readWriteRequests, "rw2", proto.Key("a3"), proto.Key("b1"), false}, } for i, test := range testData { for _, r := range test.requests { *r.Header() = proto.RequestHeader{ User: test.user, Key: test.startKey, EndKey: test.endKey, } err := ds.verifyPermissions(r) if err != nil && test.hasPermission { t.Errorf("test %d: user %s should have had permission to %s, err: %s", i, test.user, r.Method(), err.Error()) break } else if err == nil && !test.hasPermission { t.Errorf("test %d: user %s should not have had permission to %s", i, test.user, r.Method()) break } } } n.Stop() }