// RangeLookup implements the RangeDescriptorDB interface.
// RangeLookup dispatches a RangeLookup request for the given metadata
// key to the replicas of the given range. Note that we allow
// inconsistent reads when doing range lookups for efficiency. Getting
// stale data is not a correctness problem but instead may
// infrequently result in additional latency as additional range
// lookups may be required. Note also that rangeLookup bypasses the
// DistSender's Send() method, so there is no error inspection and
// retry logic here; this is not an issue since the lookup performs a
// single inconsistent read only.
func (ds *DistSender) RangeLookup(
	ctx context.Context, key roachpb.RKey, desc *roachpb.RangeDescriptor, useReverseScan bool,
) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, *roachpb.Error) {
	ba := roachpb.BatchRequest{}
	ba.ReadConsistency = roachpb.INCONSISTENT
	ba.Add(&roachpb.RangeLookupRequest{
		Span: roachpb.Span{
			// We can interpret the RKey as a Key here since it's a metadata
			// lookup; those are never local.
			Key: key.AsRawKey(),
		},
		MaxRanges: ds.rangeLookupMaxRanges,
		Reverse:   useReverseScan,
	})
	replicas := NewReplicaSlice(ds.gossip, desc)
	shuffle.Shuffle(replicas)
	br, err := ds.sendRPC(ctx, desc.RangeID, replicas, ba)
	if err != nil {
		return nil, nil, roachpb.NewError(err)
	}
	if br.Error != nil {
		return nil, nil, br.Error
	}
	resp := br.Responses[0].GetInner().(*roachpb.RangeLookupResponse)
	return resp.Ranges, resp.PrefetchedRanges, nil
}
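// Both the lookup above and the snippets below randomize order with
// shuffle.Shuffle. As a hedged sketch only (the repo's util/shuffle package
// may differ in detail), the underlying pattern is a Fisher-Yates shuffle over
// anything that exposes Len and Swap, which is why a ReplicaSlice or a
// StoreIDSlice can be shuffled in place without copying. Assumes "math/rand";
// the interface and function names here are illustrative.
type shuffleInterface interface {
	Len() int
	Swap(i, j int)
}

// shuffleSketch performs an in-place Fisher-Yates shuffle: it walks the
// collection from the end and swaps each element with a uniformly random
// element at or before it.
func shuffleSketch(data shuffleInterface, rng *rand.Rand) {
	for i := data.Len() - 1; i > 0; i-- {
		data.Swap(i, rng.Intn(i+1))
	}
}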
// SortByCommonAttributePrefix rearranges the ReplicaSlice by comparing the
// attributes to the given reference attributes. The basis for the comparison
// is that of the common prefix of replica attributes (i.e. the number of equal
// attributes, starting at the first), with a longer prefix sorting first. The
// number of attributes successfully matched to at least one replica is
// returned (hence, if the return value equals the length of attrs, at least
// one replica matched all attributes).
func (rs ReplicaSlice) SortByCommonAttributePrefix(attrs []string) int {
	if len(rs) < 2 {
		return 0
	}
	topIndex := len(rs) - 1
	for bucket := 0; bucket < len(attrs); bucket++ {
		firstNotOrdered := 0
		for i := 0; i <= topIndex; i++ {
			if bucket < len(rs[i].attrs()) && rs[i].attrs()[bucket] == attrs[bucket] {
				// Move replica which matches this attribute to an earlier
				// place in the array, just behind the last matching replica.
				// This packs all matching replicas together.
				rs.Swap(firstNotOrdered, i)
				firstNotOrdered++
			}
		}
		if topIndex < len(rs)-1 {
			shuffle.Shuffle(rs[firstNotOrdered : topIndex+1])
		}
		if firstNotOrdered == 0 {
			return bucket
		}
		topIndex = firstNotOrdered - 1
	}
	return len(attrs)
}
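// A hedged, self-contained sketch of the ordering contract above on plain
// string slices (hypothetical helpers, not repo code; assumes "sort"):
// attribute lists sharing a longer prefix with the reference attributes sort
// earlier. Unlike SortByCommonAttributePrefix, ties are left in their original
// order here rather than shuffled, so this shows the contract, not the exact
// packing algorithm.

// commonPrefixLen reports how many leading attributes of a equal ref's.
func commonPrefixLen(a, ref []string) int {
	n := 0
	for n < len(a) && n < len(ref) && a[n] == ref[n] {
		n++
	}
	return n
}

// sortByAttributePrefixSketch orders attribute lists by descending common
// prefix length with ref.
func sortByAttributePrefixSketch(replicaAttrs [][]string, ref []string) {
	sort.SliceStable(replicaAttrs, func(i, j int) bool {
		return commonPrefixLen(replicaAttrs[i], ref) > commonPrefixLen(replicaAttrs[j], ref)
	})
}

// For example, with ref = {"us-east", "rack-12", "ssd"}, the inputs
// {"us-east", "rack-3"}, {"eu-west"}, {"us-east", "rack-12", "ssd"} sort to
// {"us-east", "rack-12", "ssd"}, {"us-east", "rack-3"}, {"eu-west"}.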
// OptimizeReplicaOrder sorts the replicas in the order in which they're to be
// used for sending RPCs (meaning in the order in which they'll be probed for
// the lease). "Closer" (matching in more attributes) replicas are ordered
// first. If the current node is a replica, then it'll be the first one.
//
// nodeDesc is the descriptor of the current node. It can be nil, in which case
// information about the current node is not used in optimizing the order.
//
// Note that this method is not concerned with any information the node might
// have about who the lease holder might be. If there is such info (e.g. in a
// LeaseHolderCache), the caller will probably want to further tweak the head of
// the ReplicaSlice.
func (rs ReplicaSlice) OptimizeReplicaOrder(nodeDesc *roachpb.NodeDescriptor) {
	// If we don't know which node we're on, send the RPCs randomly.
	if nodeDesc == nil {
		shuffle.Shuffle(rs)
		return
	}
	// Sort replicas by attribute affinity, which we treat as a stand-in for
	// proximity (for now).
	rs.SortByCommonAttributePrefix(nodeDesc.Attrs.Attrs)
	// If there is a replica on the local node, move it to the front.
	if i := rs.FindReplicaByNodeID(nodeDesc.NodeID); i > 0 {
		rs.MoveToFront(i)
	}
}
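// OptimizeReplicaOrder finishes by pulling the local replica to index 0 with
// MoveToFront. A hedged sketch of such a helper on a plain slice (hypothetical
// code, not the repo's ReplicaSlice method): the element at i moves to the
// head and everything before it shifts right by one, so the relative order
// produced by the attribute sort is preserved.
func moveToFrontSketch(s []string, i int) {
	front := s[i]
	// Shift s[0:i] one position to the right, then place the chosen element
	// at the head.
	copy(s[1:i+1], s[:i])
	s[0] = front
}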
// getStoreList returns a storeList that contains all active stores that
// contain the required attributes and their associated stats. It also returns
// the total number of alive and throttled stores.
func (sp *StorePool) getStoreList(rangeID roachpb.RangeID) (StoreList, int, int) {
	sp.mu.RLock()
	defer sp.mu.RUnlock()

	var storeIDs roachpb.StoreIDSlice
	for storeID := range sp.mu.storeDetails {
		storeIDs = append(storeIDs, storeID)
	}

	if sp.deterministic {
		sort.Sort(storeIDs)
	} else {
		shuffle.Shuffle(storeIDs)
	}

	var aliveStoreCount int
	var throttledStoreCount int
	var storeDescriptors []roachpb.StoreDescriptor

	now := sp.clock.PhysicalTime()
	for _, storeID := range storeIDs {
		detail := sp.mu.storeDetails[storeID]
		switch s := detail.status(now, sp.timeUntilStoreDead, rangeID, sp.nodeLivenessFn); s {
		case storeStatusThrottled:
			aliveStoreCount++
			throttledStoreCount++
		case storeStatusReplicaCorrupted:
			aliveStoreCount++
		case storeStatusAvailable:
			aliveStoreCount++
			storeDescriptors = append(storeDescriptors, *detail.desc)
		case storeStatusDead, storeStatusUnavailable:
			// Do nothing; this node cannot be used.
		default:
			panic(fmt.Sprintf("unknown store status: %d", s))
		}
	}

	return makeStoreList(storeDescriptors), aliveStoreCount, throttledStoreCount
}
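// The deterministic branch above gives a reproducible store order (useful for
// tests and deterministic simulations), while the shuffle avoids always
// considering the same store first. A hedged, self-contained sketch of that
// split on plain ints (hypothetical helper; assumes "sort" and "math/rand"):
func orderStoreIDsSketch(ids []int, deterministic bool) {
	if deterministic {
		// Stable, reproducible order.
		sort.Ints(ids)
		return
	}
	// Random order so no single ID is systematically favored.
	rand.Shuffle(len(ids), func(i, j int) {
		ids[i], ids[j] = ids[j], ids[i]
	})
}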
// optimizeReplicaOrder sorts the replicas in the order in which they are to be
// used for sending RPCs (meaning in the order in which they'll be probed for
// the lease). "Closer" replicas (matching in more attributes) are ordered
// first. Replicas matching in the same number of attributes are shuffled
// randomly.
// If the current node is a replica, then it'll be the first one.
func (ds *DistSender) optimizeReplicaOrder(replicas ReplicaSlice) {
	// TODO(spencer): going to need to also sort by affinity; closest
	// ping time should win. Makes sense to have the rpc client/server
	// heartbeat measure ping times. With a bit of seasoning, each
	// node will be able to order the healthy replicas based on latency.

	// Unless we know better, send the RPCs randomly.
	nodeDesc := ds.getNodeDescriptor()
	// If we don't know which node we're on, don't optimize anything.
	if nodeDesc == nil {
		shuffle.Shuffle(replicas)
		return
	}
	// Sort replicas by attribute affinity (if any), which we treat as a
	// stand-in for proximity (for now).
	replicas.SortByCommonAttributePrefix(nodeDesc.Attrs.Attrs)
	// If there is a replica on the local node, move it to the front.
	if i := replicas.FindReplicaByNodeID(nodeDesc.NodeID); i > 0 {
		replicas.MoveToFront(i)
	}
}
// getStoreList returns a storeList that contains all active stores that
// contain the required attributes and their associated stats. It also returns
// the total number of alive and throttled stores.
func (sp *StorePool) getStoreList(rangeID roachpb.RangeID) (StoreList, int, int) {
	sp.mu.RLock()
	defer sp.mu.RUnlock()

	var storeIDs roachpb.StoreIDSlice
	for storeID := range sp.mu.storeDetails {
		storeIDs = append(storeIDs, storeID)
	}

	if sp.deterministic {
		sort.Sort(storeIDs)
	} else {
		shuffle.Shuffle(storeIDs)
	}

	var aliveStoreCount int
	var throttledStoreCount int
	var storeDescriptors []roachpb.StoreDescriptor

	now := sp.clock.PhysicalTime()
	for _, storeID := range storeIDs {
		detail := sp.mu.storeDetails[storeID]
		switch detail.status(now, rangeID) {
		case storeStatusThrottled:
			aliveStoreCount++
			throttledStoreCount++
		case storeStatusReplicaCorrupted:
			aliveStoreCount++
		case storeStatusAvailable:
			aliveStoreCount++
			storeDescriptors = append(storeDescriptors, *detail.desc)
		}
	}

	return makeStoreList(storeDescriptors), aliveStoreCount, throttledStoreCount
}