// GetOverlaps returns a slice of values which overlap the specified
// interval. The slice is only valid until the next call to GetOverlaps.
func (ic *IntervalCache) GetOverlaps(start, end []byte) []*Entry {
	ic.overlapKey.Range = interval.Range{
		Start: interval.Comparable(start),
		End:   interval.Comparable(end),
	}
	ic.tree.DoMatching(ic.doOverlaps, ic.overlapKey.Range)
	overlaps := ic.overlaps
	ic.overlaps = ic.overlaps[:0]
	return overlaps
}
// getOverlaps returns a slice of values which overlap the specified
// interval. The slice is only valid until the next call to getOverlaps.
func (cq *CommandQueue) getOverlaps(start, end []byte) []*cmd {
	rng := interval.Range{
		Start: interval.Comparable(start),
		End:   interval.Comparable(end),
	}
	cq.tree.DoMatching(cq.doOverlaps, rng)
	overlaps := cq.overlaps
	cq.overlaps = cq.overlaps[:0]
	return overlaps
}
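// NOTE: An illustrative aside, not part of the original files. The two
// functions above reuse a single overlap buffer across calls, which is why
// their results are only valid until the next call. The following
// self-contained sketch (hypothetical collector type, standard library only;
// needs "fmt") demonstrates the same slice-reuse idiom and its aliasing
// hazard.
func exampleOverlapBufferReuse() {
	type collector struct{ buf []int }
	// collect appends matches into the reused buffer and returns it;
	// the result is invalidated by the next call.
	collect := func(c *collector, matches ...int) []int {
		c.buf = append(c.buf, matches...)
		out := c.buf
		c.buf = c.buf[:0] // reset length, keep capacity for reuse
		return out
	}
	var c collector
	first := collect(&c, 1, 2, 3)
	_ = collect(&c, 4, 5, 6) // overwrites the array backing first
	fmt.Println(first)       // prints [4 5 6]: first was aliased, not copied
}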
// MakeKey creates a new interval key defined by start and end values.
func (ic *IntervalCache) MakeKey(start, end []byte) IntervalKey {
	if bytes.Compare(start, end) >= 0 {
		panic(fmt.Sprintf("start key greater than or equal to end key %q >= %q", start, end))
	}
	return IntervalKey{
		Range: interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		},
		id: uintptr(atomic.AddInt64(&intervalAlloc, 1)),
	}
}
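// NOTE: An illustrative aside, not part of the original file. The id field
// above makes otherwise-identical ranges distinct, so the interval tree can
// hold multiple entries covering the same span. A sketch of such a tiebreak
// comparison (hypothetical helper, standard library only; needs "bytes"):
func compareIntervalKeys(aStart, aEnd []byte, aID uintptr, bStart, bEnd []byte, bID uintptr) int {
	if c := bytes.Compare(aStart, bStart); c != 0 {
		return c
	}
	if c := bytes.Compare(aEnd, bEnd); c != 0 {
		return c
	}
	// Identical ranges: fall back to the allocation id so that distinct
	// entries never compare as equal.
	switch {
	case aID < bID:
		return -1
	case aID > bID:
		return 1
	}
	return 0
}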
// add adds commands to the queue which affect the specified key ranges.
// Ranges without an end key affect only the start key. The returned command
// is the key for the command queue and must be re-supplied on a subsequent
// invocation of remove().
//
// add should be invoked after waiting on already-executing, overlapping
// commands via the WaitGroup initialized through getWait().
func (cq *CommandQueue) add(readOnly bool, spans ...roachpb.Span) *cmd {
	prepareSpans(spans...)

	// Compute the min and max key that cover all of the spans.
	minKey, maxKey := spans[0].Key, spans[0].EndKey
	for i := 1; i < len(spans); i++ {
		start, end := spans[i].Key, spans[i].EndKey
		if minKey.Compare(start) > 0 {
			minKey = start
		}
		if maxKey.Compare(end) < 0 {
			maxKey = end
		}
	}

	numCmds := 1
	if len(spans) > 1 {
		numCmds += len(spans)
	}
	cmds := make([]cmd, numCmds)

	// Create the covering entry. Note that this may have an "illegal" key
	// range spanning from range-local to range-global, but that's acceptable
	// here as long as we're careful in the future.
	cmd := &cmds[0]
	cmd.id = cq.nextID()
	cmd.key = interval.Range{
		Start: interval.Comparable(minKey),
		End:   interval.Comparable(maxKey),
	}
	cmd.readOnly = readOnly
	cmd.expanded = false

	if len(spans) > 1 {
		// Populate the covering entry's children.
		cmd.children = cmds[1:]
		for i, span := range spans {
			child := &cmd.children[i]
			child.id = cq.nextID()
			child.key = interval.Range{
				Start: interval.Comparable(span.Key),
				End:   interval.Comparable(span.EndKey),
			}
			child.readOnly = readOnly
			child.expanded = true
		}
	}

	if err := cq.tree.Insert(cmd, false /* !fast */); err != nil {
		panic(err)
	}
	return cmd
}
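// NOTE: An illustrative aside, not part of the original file. The covering
// entry above spans the minimal range containing every span, which keeps the
// initial tree insertion to a single node. A standard-library-only sketch of
// the same min/max computation over hypothetical [start, end) byte-slice
// pairs (needs "bytes"):
func coveringRange(spans [][2][]byte) (minKey, maxKey []byte) {
	minKey, maxKey = spans[0][0], spans[0][1]
	for _, s := range spans[1:] {
		if bytes.Compare(s[0], minKey) < 0 {
			minKey = s[0] // extend covering range to the left
		}
		if bytes.Compare(s[1], maxKey) > 0 {
			maxKey = s[1] // extend covering range to the right
		}
	}
	return minKey, maxKey
}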
// addKeyRange adds the specified key range to the range group,
// taking care not to add this range if existing entries already
// completely cover the range.
func addKeyRange(keys interval.RangeGroup, start, end roachpb.Key) {
	// This gives us a memory-efficient end key if end is empty.
	// The most common case for keys in the intents interval map
	// is for single keys. However, the range group requires a
	// non-empty interval, so we create two key slices which share
	// the same underlying byte array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	keyR := interval.Range{
		Start: interval.Comparable(start),
		End:   interval.Comparable(end),
	}
	keys.Add(keyR)
}
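// NOTE: An illustrative aside, not part of the original file. The re-slicing
// trick above builds the half-open interval [k, k.Next()) for a point key
// with a single allocation: both keys share one backing array. A
// standard-library-only sketch (roachpb.Key.Next is approximated here by
// appending a zero byte; needs "fmt"):
func examplePointKeyInterval() {
	start := []byte("a")
	end := append(append([]byte(nil), start...), 0) // like start.Next(): "a\x00"
	start = end[:len(start)]                        // re-slice: shares end's array
	fmt.Printf("[%q, %q)\n", start, end)            // ["a", "a\x00"): covers only "a"
}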
// Add adds the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start key
// only. txnID is nil for no transaction. readTSCache specifies whether the
// command adding this timestamp should update the read timestamp cache;
// false to update the write timestamp cache.
func (tc *TimestampCache) Add(start, end roachpb.Key, timestamp roachpb.Timestamp, txnID *uuid.UUID, readTSCache bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		cache := tc.wCache
		if readTSCache {
			cache = tc.rCache
		}

		addRange := func(r interval.Range) {
			value := cacheValue{timestamp: timestamp, txnID: txnID}
			key := cache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			cache.AddEntry(entry)
		}
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to the cache.
		for _, o := range cache.GetOverlaps(r.Start, r.End) {
			cv := o.Value.(*cacheValue)
			sCmp := r.Start.Compare(o.Key.Start)
			eCmp := r.End.Compare(o.Key.End)
			if !timestamp.Less(cv.timestamp) {
				// The existing interval has a timestamp less than or equal to
				// the new interval. Compare interval ranges to determine how to
				// modify the existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the
					// need to insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					*cv = cacheValue{timestamp: timestamp, txnID: txnID}
					cache.MoveToEnd(o.Entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or  ----------    or    ----------
					//
					// Old:
					cache.DelEntry(o.Entry)
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// Old: ----    ----
					oldEnd := o.Key.End
					o.Key.End = r.Start

					key := cache.MakeKey(r.End, oldEnd)
					entry := makeCacheEntry(key, *cv)
					cache.AddEntryAfter(entry, o.Entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// Old: ----              ----
					o.Key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------              --------
					// Old:     --------  or  ------------
					//
					// Old:         ----              ----
					o.Key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			} else {
				// The existing interval has a timestamp greater than the new
				// interval. Compare interval ranges to determine how to modify
				// the new interval before adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------      or     --------
					//
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split new into two. We can add the left
					// piece immediately because it is guaranteed to be before
					// the rest of the overlaps.
					//
					// Old:     ------
					// New: ------------
					//
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: o.Key.Start}
					addRange(lr)

					r.Start = o.Key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------              --------
					// New:     --------  or  ------------
					//
					// New:         ----              ----
					r.Start = o.Key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// New: ----              ----
					r.End = o.Key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", o.Key.Range, r))
				}
			}
		}
		addRange(r)
	}
}
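// NOTE: An illustrative aside, not part of the original file. The case
// analysis in Add above reduces to two comparisons of the new range against
// an existing overlap. A standard-library-only sketch classifying how a new
// half-open range relates to an old one (hypothetical helper mirroring the
// sCmp/eCmp switch; needs "bytes"):
func classifyOverlap(newStart, newEnd, oldStart, oldEnd []byte) string {
	sCmp := bytes.Compare(newStart, oldStart)
	eCmp := bytes.Compare(newEnd, oldEnd)
	switch {
	case sCmp == 0 && eCmp == 0:
		return "equal"
	case sCmp <= 0 && eCmp >= 0:
		return "new contains old"
	case sCmp > 0 && eCmp < 0:
		return "old contains new"
	case eCmp >= 0:
		return "overlap on old's right" // new begins inside old, ends at or after it
	case sCmp <= 0:
		return "overlap on old's left" // new begins at or before old, ends inside it
	default:
		return "no overlap" // unreachable for overlapping inputs; mirrors the panic branch
	}
}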
// GetWait initializes the supplied wait group with the number of executing
// commands which overlap the specified key ranges. If an end key is empty, it
// only affects the start key. The caller should call wg.Wait() to wait for
// confirmation that all gating commands have completed or failed, and then
// call add() to add the keys to the command queue. readOnly is true if the
// requester is a read-only command; false for read-write.
func (cq *CommandQueue) GetWait(readOnly bool, wg *sync.WaitGroup, spans ...roachpb.Span) {
	for _, span := range spans {
		// This gives us a memory-efficient end key if end is empty.
		start, end := span.Key, span.EndKey
		if len(end) == 0 {
			end = start.Next()
			start = end[:len(start)]
		}
		newCmdRange := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}
		overlaps := cq.cache.GetOverlaps(newCmdRange.Start, newCmdRange.End)
		if readOnly {
			// If both commands are read-only, there are no dependencies between
			// them, so reads can be filtered out of the overlapping commands.
			overlaps = filterReadWrite(overlaps)
		}

		// Sort overlapping commands by command ID and iterate from latest to
		// earliest, adding the commands' ranges to the RangeGroup to determine
		// gating keyspace command dependencies. Because all commands are given
		// WaitGroup dependencies to the most recent commands that they depend
		// on, and because of the causality provided by the strictly increasing
		// command ID allocation, this approach constructs a DAG-like dependency
		// graph between WaitGroups with overlapping keys. This comes as an
		// alternative to creating explicit WaitGroup dependencies to all gating
		// commands for each new command, which could result in an exponential
		// dependency explosion.
		//
		// For example, consider the following 5 write commands, each with key
		// ranges represented on the x axis and WaitGroup dependencies
		// represented by vertical lines:
		//
		// cmd 1:   --------------
		//            |      |
		// cmd 2:     |  -------------
		//            |  |     |
		// cmd 3:   -------    |
		//               |     |
		// cmd 4:        -------
		//                  |
		// cmd 5:        -------
		//
		// Instead of having each command establish explicit dependencies on all
		// previous overlapping commands, each command only needs to establish
		// explicit dependencies on the set of overlapping commands closest to
		// the new command that together span the new command's overlapped range.
		// Following this strategy, the other dependencies will be implicitly
		// enforced, which reduces memory utilization and synchronization costs.
		//
		// The exception is existing reads: since reads don't wait for each
		// other, an incoming write must wait for reads even when they are
		// covered by a "later" read (since that "later" read won't wait for the
		// earlier read to complete). However, if that read is covered by a
		// "later" write, we don't need to wait because writes can't be
		// reordered.
		//
		// Two examples of how this logic works are shown below. Notice in the
		// first example how the overlapping reads do not establish dependencies
		// on each other, and can therefore be reordered. Also notice in the
		// second example that once read command 4 overlaps a "later" write
		// (command 5), it no longer needs to be a dependency for the new write
		// command 6. However, because read command 3 does not overlap a "later"
		// write, it is still a dependency for the new write, but can be safely
		// reordered before or after command 4.
		//
		// cmd 1 [R]:            -----                        ----------
		//                         |                             |
		// cmd 2 [W]:           ========                     ========
		//                       |    |                      |   |
		// cmd 3 [R]:           --+------                   --+------
		//                       |  |                        |   |
		// cmd 4 [R]:         -------+-----            -----------+-----
		//                          |    |             |         |
		// cmd 5 [W]:  =====        |    |         =======       |
		//               |          |    |            |          |
		// cmd 6 [W]:  ====================         ====================
		cq.oHeap.Init(overlaps)
		for enclosed := false; cq.oHeap.Len() > 0 && !enclosed; {
			o := cq.oHeap.PopOverlap()
			keyRange, cmd := o.Key.Range, o.Value.(*cmd)
			if cmd.readOnly {
				// If the current overlap is a read (meaning we're a write,
				// because other reads will have been filtered out if we're a
				// read as well), we only need to wait if the write RangeGroup
				// doesn't already overlap the read. Otherwise, we know that
				// this read is itself dependent on a command already accounted
				// for in our write RangeGroup. Either way, we need to add this
				// command to the combined RangeGroup.
				cq.rwRg.Add(keyRange)
				if !cq.wRg.Overlaps(keyRange) {
					cmd.pending = append(cmd.pending, wg)
					wg.Add(1)
				}
			} else {
				// If the current overlap is a write, pick which RangeGroup will
				// be used to determine necessary dependencies based on whether
				// we are a read or a write.
				overlapRg := cq.wRg
				if !readOnly {
					// We only use the combined read-write RangeGroup when we
					// are a new write command, because otherwise all read
					// commands will have been filtered out and we can avoid
					// using a second RangeGroup. Reads, however, rely on the
					// distinction between the write-only RangeGroup and the
					// combined RangeGroup: it lets them avoid establishing a
					// dependency if they are already dependent on previous
					// writes, while remaining independent from other reads.
					overlapRg = cq.rwRg
				}

				// We only need to establish a dependency when this write
				// command's key range is not overlapping any other reads or
				// writes in its future. If it is overlapping, we know there was
				// already a dependency established with a dependent of the
				// current overlap, meaning we already established an implicit
				// transitive dependency to the current overlap.
				if !overlapRg.Overlaps(keyRange) {
					cmd.pending = append(cmd.pending, wg)
					wg.Add(1)
				}

				// The current command is a write, so add it to the write
				// RangeGroup and observe whether the group grows.
				if cq.wRg.Add(keyRange) {
					// We can stop dependency creation early in the case that
					// the write RangeGroup fully encloses our new range, which
					// means that no new dependencies are needed. This looks
					// only at the write RangeGroup because even if the combined
					// RangeGroup encloses us, there can always be more reads
					// that are necessary dependencies if they themselves don't
					// overlap any writes. We only need to perform this check
					// when the write RangeGroup grows.
					//
					// We check the write RangeGroup's length before checking if
					// it encloses the new command's range because we know
					// (based on the fact that these are all overlapping
					// commands) that the RangeGroup can enclose us only if its
					// length is 1 (meaning all inserted ranges have coalesced).
					// This guarantees that the enclosure check always runs in
					// constant time.
					if cq.wRg.Len() == 1 && cq.wRg.Encloses(newCmdRange) {
						enclosed = true
					}
				}

				// Make sure the current command's range gets added to the
				// combined RangeGroup if we are using it.
				if overlapRg == cq.rwRg {
					cq.rwRg.Add(keyRange)
				}
			}
		}

		// Clear the heap to avoid leaking anything it is currently storing.
		cq.oHeap.Clear()

		// Clear the RangeGroups so that they can be used again. This is an
		// alternative to using local variables that must be allocated in every
		// iteration.
		cq.wRg.Clear()
		cq.rwRg.Clear()
	}
}
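// NOTE: An illustrative aside, not part of the original file. The early-exit
// check above fires once the write RangeGroup coalesces into a single range
// enclosing the new command's range. A standard-library-only toy model over
// integer [start, end) intervals (hypothetical; interval.RangeGroup is the
// real implementation; needs "sort"):
func enclosedByWrites(writes [][2]int, newRange [2]int) bool {
	// Coalesce overlapping or touching intervals in place.
	sort.Slice(writes, func(i, j int) bool { return writes[i][0] < writes[j][0] })
	merged := writes[:0]
	for _, iv := range writes {
		if n := len(merged); n > 0 && iv[0] <= merged[n-1][1] {
			if iv[1] > merged[n-1][1] {
				merged[n-1][1] = iv[1] // extend the previous interval
			}
			continue
		}
		merged = append(merged, iv)
	}
	// As in GetWait, enclosure is only possible once everything has coalesced
	// into a single range, which is what keeps the check constant time there.
	return len(merged) == 1 && merged[0][0] <= newRange[0] && merged[0][1] >= newRange[1]
}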