// sendAttempt is invoked by Send. It temporarily truncates the arguments to
// match the descriptor's EndKey (if necessary) and gathers and rearranges the
// replicas before making a single attempt at sending the request. It returns
// the result of sending the RPC; a potential error contained in the reply has
// to be handled separately by the caller.
func (ds *DistSender) sendAttempt(trace *tracer.Trace, args proto.Request, reply proto.Response, desc *proto.RangeDescriptor) error {
	defer trace.Epoch("sending RPC")()

	// Truncate the request to our current range, making sure not to
	// touch it unless we have to (it is illegal to send EndKey on
	// commands which do not operate on ranges).
	if endKey := args.Header().EndKey; endKey != nil && !endKey.Less(desc.EndKey) {
		defer func(k proto.Key) { args.Header().EndKey = k }(endKey)
		args.Header().EndKey = desc.EndKey
	}
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}
	return ds.sendRPC(trace, desc.RaftID, replicas, order, args, reply)
}
// Send implements the client.Sender interface. It verifies
// permissions and looks up the appropriate range based on the
// supplied key and sends the RPC according to the specified options.
//
// If the request spans multiple ranges (which is possible for Scan or
// DeleteRange requests), Send sends requests to the individual ranges
// sequentially and combines the results transparently.
//
// This may temporarily adjust the request headers, so the proto.Call
// must not be used concurrently until Send has returned.
func (ds *DistSender) Send(_ context.Context, call proto.Call) {
	args := call.Args
	finalReply := call.Reply

	// Verify permissions.
	if err := ds.verifyPermissions(call.Args); err != nil {
		call.Reply.Header().SetGoError(err)
		return
	}

	// In the event that timestamp isn't set and read consistency isn't
	// required, set the timestamp using the local clock.
	if args.Header().ReadConsistency == proto.INCONSISTENT && args.Header().Timestamp.Equal(proto.ZeroTimestamp) {
		// Make sure that after the call, args hasn't changed.
		defer func(timestamp proto.Timestamp) {
			args.Header().Timestamp = timestamp
		}(args.Header().Timestamp)
		args.Header().Timestamp = ds.clock.Now()
	}

	// If this is a bounded request, we will change its bound as we receive
	// replies. This undoes that when we return.
	boundedArgs, argsBounded := args.(proto.Bounded)

	if argsBounded {
		defer func(bound int64) {
			boundedArgs.SetBound(bound)
		}(boundedArgs.GetBound())
	}

	defer func(key proto.Key) {
		args.Header().Key = key
	}(args.Header().Key)

	// Retry logic for lookup of range by key and RPCs to range replicas.
	curReply := finalReply
	for {
		call.Reply = curReply
		curReply.Header().Reset()

		var desc, descNext *proto.RangeDescriptor
		var err error
		for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
			// Get range descriptor (or, when spanning range, descriptors).
			// sendAttempt below may clear them on certain errors, so we
			// refresh (likely from the cache) on every retry.
			desc, descNext, err = ds.getDescriptors(call)
			// getDescriptors may fail retryably if the first range isn't
			// available via Gossip.
			if err != nil {
				if rErr, ok := err.(util.Retryable); ok && rErr.CanRetry() {
					if log.V(1) {
						log.Warning(err)
					}
					continue
				}
				break
			}

			err = func() error {
				// Truncate the request to our current range, making sure not to
				// touch it unless we have to (it is illegal to send EndKey on
				// commands which do not operate on ranges).
				if descNext != nil {
					defer func(endKey proto.Key) {
						args.Header().EndKey = endKey
					}(args.Header().EndKey)
					args.Header().EndKey = desc.EndKey
				}
				leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

				// Try to send the call.
				replicas := newReplicaSlice(ds.gossip, desc)

				// Rearrange the replicas so that those replicas with long common
				// prefix of attributes end up first. If there's no prefix, this is a
				// no-op.
				order := ds.optimizeReplicaOrder(replicas)

				// If this request needs to go to a leader and we know who that is, move
				// it to the front.
				if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
					leader.StoreID > 0 {
					if i := replicas.FindReplica(leader.StoreID); i >= 0 {
						replicas.MoveToFront(i)
						order = rpc.OrderStable
					}
				}
				return ds.sendRPC(desc.RaftID, replicas, order, args, curReply)
			}()
			if err != nil {
				// For an RPC error to occur, we must've been unable to contact any
				// replicas. In this case, likely all nodes are down (or not getting back
				// to us within a reasonable amount of time).
				// We may simply not be trying to talk to the up-to-date replicas, so
				// clearing the descriptor here should be a good idea.
				// TODO(tschottdorf): If a replica group goes dead, this will cause clients
				// to put high read pressure on the first range, so there should be some
				// rate limiting here.
				ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			} else {
				err = curReply.Header().GoError()
			}

			if err != nil {
				if log.V(1) {
					log.Warningf("failed to invoke %s: %s", call.Method(), err)
				}

				// If retryable, allow retry. For range not found or range
				// key mismatch errors, we don't backoff on the retry,
				// but reset the backoff loop so we can retry immediately.
				switch tErr := err.(type) {
				case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
					// Range descriptor might be out of date - evict it.
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
					// On addressing errors, don't backoff; retry immediately.
					r.Reset()
					if log.V(1) {
						log.Warning(err)
					}
					continue
				case *proto.NotLeaderError:
					newLeader := tErr.GetLeader()
					// Verify that leader is a known replica according to the
					// descriptor. If not, we've got a stale replica; evict cache.
					// Next, cache the new leader.
					if newLeader != nil {
						if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
							if log.V(1) {
								log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
							}
							ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
						}
					} else {
						newLeader = &proto.Replica{}
					}
					ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
					if log.V(1) {
						log.Warning(err)
					}
					r.Reset()
					continue
				case util.Retryable:
					if tErr.CanRetry() {
						if log.V(1) {
							log.Warning(err)
						}
						continue
					}
				}
			}
			break
		}

		// Immediately return if querying a range failed non-retryably.
		// For multi-range requests, we return the failing range's reply.
		if err != nil {
			call.Reply.Header().SetGoError(err)
			return
		}

		if finalReply != curReply {
			// This was the second or later call in a multi-range request.
			// Combine the new response with the existing one.
			if cFinalReply, ok := finalReply.(proto.Combinable); ok {
				cFinalReply.Combine(curReply)
			} else {
				// This should never apply in practice, as we'll only end up here
				// for range-spanning requests.
				call.Reply.Header().SetGoError(util.Errorf("multi-range request with non-combinable response type"))
				return
			}
		}

		// If this request has a bound, such as MaxResults in
		// ScanRequest, check whether enough rows have been retrieved.
		if argsBounded {
			if prevBound := boundedArgs.GetBound(); prevBound > 0 {
				if cReply, ok := curReply.(proto.Countable); ok {
					if nextBound := prevBound - cReply.Count(); nextBound > 0 {
						// Update bound for the next round.
						// We've deferred restoring the original bound earlier.
						boundedArgs.SetBound(nextBound)
					} else {
						// Set flag to break the loop.
						descNext = nil
					}
				}
			}
		}

		// If this was the last range accessed by this call, exit loop.
		if descNext == nil {
			break
		}

		// In next iteration, query next range.
		// It's important that we use the EndKey of the current descriptor
		// as opposed to the StartKey of the next one: if the former is stale,
		// it's possible that the next range has since merged the subsequent
		// one, and unless both descriptors are stale, the next descriptor's
		// StartKey would move us to the beginning of the current range,
		// resulting in a duplicate scan.
		args.Header().Key = desc.EndKey

		// This is a multi-range request, make a new reply object for
		// subsequent iterations of the loop.
		curReply = args.CreateReply()
	}
	call.Reply = finalReply
}
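For orientation, a minimal usage sketch follows. It shows how a caller might drive a (possibly multi-range) Scan through Send; the helper name scanViaDistSender and the proto.ScanResponse reply type are assumptions made for illustration, while Send's signature, proto.Call, and the request header fields come from the code above.

// scanViaDistSender is an illustrative sketch and not part of DistSender.
func scanViaDistSender(ds *DistSender, user string, start, end proto.Key) error {
	args := &proto.ScanRequest{}
	args.Header().User = user
	args.Header().Key = start
	// Scan operates on a key range, so setting EndKey is legal here.
	args.Header().EndKey = end
	reply := &proto.ScanResponse{} // assumed reply type for ScanRequest
	call := proto.Call{Args: args, Reply: reply}
	// Send verifies permissions, resolves the range(s) covering [start, end),
	// retries transient failures, and combines per-range results into reply.
	ds.Send(context.Background(), call)
	return reply.Header().GoError()
}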
// verifyPermissions verifies that the requesting user (header.User)
// has permission to read/write (capabilities depend on method
// name). In the event that multiple permission configs apply to the
// key range implicated by the command, the lowest common denominator
// of permissions applies. For example, if a scan crosses two permission
// configs, both configs must allow read permissions or the entire
// scan will fail.
func (ds *DistSender) verifyPermissions(args proto.Request) error {
	// The root user can always proceed.
	header := args.Header()
	if header.User == storage.UserRoot {
		return nil
	}
	// Check for admin methods.
	if proto.IsAdmin(args) {
		if header.User != storage.UserRoot {
			return util.Errorf("user %q cannot invoke admin command %s", header.User, args.Method())
		}
		return nil
	}
	// Get permissions map from gossip.
	configMap, err := ds.gossip.GetInfo(gossip.KeyConfigPermission)
	if err != nil {
		return util.Errorf("permissions not available via gossip")
	}
	if configMap == nil {
		return util.Errorf("perm configs not available; cannot execute %s", args.Method())
	}
	permMap := configMap.(storage.PrefixConfigMap)
	headerEnd := header.EndKey
	if len(headerEnd) == 0 {
		headerEnd = header.Key
	}
	// Visit PermConfig(s) which apply to the method's key range.
	//   - For each perm config which the range covers, verify read or writes
	//     are allowed as method requires.
	//   - Verify the permissions hierarchically; that is, if permissions aren't
	//     granted at the longest prefix, try next longest, then next, etc., up
	//     to and including the default prefix.
	//
	// TODO(spencer): it might make sense to visit prefixes from the
	// shortest to longest instead for performance. Keep an eye on profiling
	// for this code path as permission sets grow large.
	return permMap.VisitPrefixes(header.Key, headerEnd,
		func(start, end proto.Key, config interface{}) (bool, error) {
			hasPerm := false
			if err := permMap.VisitPrefixesHierarchically(start, func(start, end proto.Key, config interface{}) (bool, error) {
				perm := config.(*proto.PermConfig)
				if proto.IsRead(args) && !perm.CanRead(header.User) {
					return false, nil
				}
				if proto.IsWrite(args) && !perm.CanWrite(header.User) {
					return false, nil
				}
				// Return done = true, as permissions have been granted by this config.
				hasPerm = true
				return true, nil
			}); err != nil {
				return false, err
			}
			if !hasPerm {
				if len(header.EndKey) == 0 {
					return false, util.Errorf("user %q cannot invoke %s at %q", header.User, args.Method(), start)
				}
				return false, util.Errorf("user %q cannot invoke %s at %q-%q", header.User, args.Method(), start, end)
			}
			return false, nil
		})
}
// sendAttempt is invoked by Send and handles retry logic and cache eviction
// for a call sent to a single range. It returns a retry status, which is
// Break on success and Break, Continue or Reset depending on the error
// condition. This method is expected to be invoked from within a backoff /
// retry loop to retry the send repeatedly (e.g. to continue processing after
// a critical node becomes available after downtime or the range descriptor
// is refreshed via lookup).
func (ds *DistSender) sendAttempt(desc *proto.RangeDescriptor, call proto.Call) (retry.Status, error) {
	leader := ds.leaderCache.Lookup(proto.RaftID(desc.RaftID))

	// Try to send the call.
	replicas := newReplicaSlice(ds.gossip, desc)

	// Rearrange the replicas so that those replicas with long common
	// prefix of attributes end up first. If there's no prefix, this is a
	// no-op.
	order := ds.optimizeReplicaOrder(replicas)

	args := call.Args
	reply := call.Reply

	// If this request needs to go to a leader and we know who that is, move
	// it to the front.
	if !(proto.IsRead(args) && args.Header().ReadConsistency == proto.INCONSISTENT) &&
		leader.StoreID > 0 {
		if i := replicas.FindReplica(leader.StoreID); i >= 0 {
			replicas.MoveToFront(i)
			order = rpc.OrderStable
		}
	}
	err := ds.sendRPC(desc.RaftID, replicas, order, args, reply)
	if err != nil {
		// For an RPC error to occur, we must've been unable to contact any
		// replicas. In this case, likely all nodes are down (or not getting back
		// to us within a reasonable amount of time).
		// We may simply not be trying to talk to the up-to-date replicas, so
		// clearing the descriptor here should be a good idea.
		// TODO(tschottdorf): If a replica group goes dead, this will cause clients
		// to put high read pressure on the first range, so there should be some
		// rate limiting here.
		ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
	} else {
		err = reply.Header().GoError()
	}

	if err != nil {
		if log.V(1) {
			log.Warningf("failed to invoke %s: %s", call.Method(), err)
		}

		// If retryable, allow retry. For range not found or range
		// key mismatch errors, we don't backoff on the retry,
		// but reset the backoff loop so we can retry immediately.
		switch tErr := err.(type) {
		case *proto.RangeNotFoundError, *proto.RangeKeyMismatchError:
			// Range descriptor might be out of date - evict it.
			ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
			// On addressing errors, don't backoff; retry immediately.
			return retry.Reset, err
		case *proto.NotLeaderError:
			newLeader := tErr.GetLeader()
			// Verify that leader is a known replica according to the
			// descriptor. If not, we've got a stale replica; evict cache.
			// Next, cache the new leader.
			if newLeader != nil {
				if i, _ := desc.FindReplica(newLeader.StoreID); i == -1 {
					if log.V(1) {
						log.Infof("error indicates unknown leader %s, expunging descriptor %s", newLeader, desc)
					}
					ds.rangeCache.EvictCachedRangeDescriptor(args.Header().Key, desc)
				}
			} else {
				newLeader = &proto.Replica{}
			}
			ds.updateLeaderCache(proto.RaftID(desc.RaftID), *newLeader)
			return retry.Reset, err
		case util.Retryable:
			if tErr.CanRetry() {
				return retry.Continue, err
			}
		}
		return retry.Break, err
	}
	return retry.Break, nil
}
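Since sendAttempt only reports a retry status, the decision to back off, reset, or stop rests with the caller. Below is a minimal sketch of how a retry.Start loop (as used in Send above) might act on each status; the wrapper name sendWithRetries is a hypothetical assumption, while retry.Start, Next, Reset and the Status values all appear in the surrounding code.

// sendWithRetries is an illustrative sketch, not part of DistSender: it shows
// one way a backoff loop could interpret sendAttempt's retry.Status.
func (ds *DistSender) sendWithRetries(desc *proto.RangeDescriptor, call proto.Call) error {
	var err error
	for r := retry.Start(ds.rpcRetryOptions); r.Next(); {
		var status retry.Status
		status, err = ds.sendAttempt(desc, call)
		switch status {
		case retry.Break:
			// Done: success (err == nil) or a non-retryable error.
			return err
		case retry.Reset:
			// Addressing error (stale descriptor or leader): restart the
			// backoff schedule and retry immediately.
			r.Reset()
		case retry.Continue:
			// Transient failure: fall through and back off before the next attempt.
		}
	}
	// Retry budget exhausted; surface the last error seen.
	return err
}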
// TestVerifyPermissions verifies permissions are checked for single
// zones and across multiple zones. It also verifies that permissions
// are checked hierarchically.
func TestVerifyPermissions(t *testing.T) {
	defer leaktest.AfterTest(t)
	n := simulation.NewNetwork(1, "tcp", gossip.TestInterval)
	ds := NewDistSender(nil, n.Nodes[0].Gossip)
	config1 := &proto.PermConfig{
		Read:  []string{"read1", "readAll", "rw1", "rwAll"},
		Write: []string{"write1", "writeAll", "rw1", "rwAll"}}
	config2 := &proto.PermConfig{
		Read:  []string{"read2", "readAll", "rw2", "rwAll"},
		Write: []string{"write2", "writeAll", "rw2", "rwAll"}}
	configs := []*storage.PrefixConfig{
		{proto.KeyMin, nil, config1},
		{proto.Key("a"), nil, config2},
	}
	configMap, err := storage.NewPrefixConfigMap(configs)
	if err != nil {
		t.Fatalf("failed to make prefix config map, err: %s", err.Error())
	}
	if err := ds.gossip.AddInfo(gossip.KeyConfigPermission, configMap, time.Hour); err != nil {
		t.Fatal(err)
	}

	allRequestTypes := []proto.Request{
		&proto.GetRequest{},
		&proto.PutRequest{},
		&proto.ConditionalPutRequest{},
		&proto.IncrementRequest{},
		&proto.DeleteRequest{},
		&proto.DeleteRangeRequest{},
		&proto.ScanRequest{},
		&proto.EndTransactionRequest{},
		&proto.BatchRequest{},
		&proto.AdminSplitRequest{},
		&proto.AdminMergeRequest{},
		&proto.InternalHeartbeatTxnRequest{},
		&proto.InternalGCRequest{},
		&proto.InternalPushTxnRequest{},
		&proto.InternalRangeLookupRequest{},
		&proto.InternalResolveIntentRequest{},
		&proto.InternalResolveIntentRangeRequest{},
		&proto.InternalMergeRequest{},
		&proto.InternalTruncateLogRequest{},
		&proto.InternalLeaderLeaseRequest{},
		&proto.InternalBatchRequest{},
	}

	var readOnlyRequests []proto.Request
	var writeOnlyRequests []proto.Request
	var readWriteRequests []proto.Request

	for _, r := range allRequestTypes {
		if proto.IsRead(r) && !proto.IsWrite(r) {
			readOnlyRequests = append(readOnlyRequests, r)
		}
		if proto.IsWrite(r) && !proto.IsRead(r) {
			writeOnlyRequests = append(writeOnlyRequests, r)
		}
		if proto.IsRead(r) && proto.IsWrite(r) {
			readWriteRequests = append(readWriteRequests, r)
		}
	}

	testData := []struct {
		// Permission-based db methods from the storage package.
		requests         []proto.Request
		user             string
		startKey, endKey proto.Key
		hasPermission    bool
	}{
		// Test permissions within a single range.
		{readOnlyRequests, "read1", proto.KeyMin, proto.KeyMin, true},
		{readOnlyRequests, "rw1", proto.KeyMin, proto.KeyMin, true},
		{readOnlyRequests, "write1", proto.KeyMin, proto.KeyMin, false},
		{readOnlyRequests, "random", proto.KeyMin, proto.KeyMin, false},
		{readWriteRequests, "rw1", proto.KeyMin, proto.KeyMin, true},
		{readWriteRequests, "read1", proto.KeyMin, proto.KeyMin, false},
		{readWriteRequests, "write1", proto.KeyMin, proto.KeyMin, false},
		{writeOnlyRequests, "write1", proto.KeyMin, proto.KeyMin, true},
		{writeOnlyRequests, "rw1", proto.KeyMin, proto.KeyMin, true},
		{writeOnlyRequests, "read1", proto.KeyMin, proto.KeyMin, false},
		{writeOnlyRequests, "random", proto.KeyMin, proto.KeyMin, false},
		// Test permissions hierarchically.
		{readOnlyRequests, "read1", proto.Key("a"), proto.Key("a1"), true},
		{readWriteRequests, "rw1", proto.Key("a"), proto.Key("a1"), true},
		{writeOnlyRequests, "write1", proto.Key("a"), proto.Key("a1"), true},
		// Test permissions across both ranges.
		{readOnlyRequests, "readAll", proto.KeyMin, proto.Key("b"), true},
		{readOnlyRequests, "read1", proto.KeyMin, proto.Key("b"), true},
		{readOnlyRequests, "read2", proto.KeyMin, proto.Key("b"), false},
		{readOnlyRequests, "random", proto.KeyMin, proto.Key("b"), false},
		{readWriteRequests, "rwAll", proto.KeyMin, proto.Key("b"), true},
		{readWriteRequests, "rw1", proto.KeyMin, proto.Key("b"), true},
		{readWriteRequests, "random", proto.KeyMin, proto.Key("b"), false},
		{writeOnlyRequests, "writeAll", proto.KeyMin, proto.Key("b"), true},
		{writeOnlyRequests, "write1", proto.KeyMin, proto.Key("b"), true},
		{writeOnlyRequests, "write2", proto.KeyMin, proto.Key("b"), false},
		{writeOnlyRequests, "random", proto.KeyMin, proto.Key("b"), false},
		// Test permissions within and around the boundaries of a range,
		// representatively using rw methods.
		{readWriteRequests, "rw2", proto.Key("a"), proto.Key("b"), true},
		{readWriteRequests, "rwAll", proto.Key("a"), proto.Key("b"), true},
		{readWriteRequests, "rw2", proto.Key("a"), proto.Key("a"), true},
		{readWriteRequests, "rw2", proto.Key("a"), proto.Key("a1"), true},
		{readWriteRequests, "rw2", proto.Key("a"), proto.Key("b1"), false},
		{readWriteRequests, "rw2", proto.Key("a3"), proto.Key("a4"), true},
		{readWriteRequests, "rw2", proto.Key("a3"), proto.Key("b1"), false},
	}

	for i, test := range testData {
		for _, r := range test.requests {
			*r.Header() = proto.RequestHeader{
				User:   test.user,
				Key:    test.startKey,
				EndKey: test.endKey,
			}
			err := ds.verifyPermissions(r)
			if err != nil && test.hasPermission {
				t.Errorf("test %d: user %s should have had permission to %s, err: %s",
					i, test.user, r.Method(), err.Error())
				break
			} else if err == nil && !test.hasPermission {
				t.Errorf("test %d: user %s should not have had permission to %s",
					i, test.user, r.Method())
				break
			}
		}
	}
	n.Stop()
}