// loadAppliedIndex returns the Raft applied index and the lease applied index.
func loadAppliedIndex(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (uint64, uint64, error) {
	var appliedIndex uint64
	v, _, err := engine.MVCCGet(ctx, reader, keys.RaftAppliedIndexKey(rangeID),
		hlc.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, 0, err
	}
	if v != nil {
		int64AppliedIndex, err := v.GetInt()
		if err != nil {
			return 0, 0, err
		}
		appliedIndex = uint64(int64AppliedIndex)
	}
	// TODO(tschottdorf): code duplication.
	var leaseAppliedIndex uint64
	v, _, err = engine.MVCCGet(ctx, reader, keys.LeaseAppliedIndexKey(rangeID),
		hlc.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, 0, err
	}
	if v != nil {
		int64LeaseAppliedIndex, err := v.GetInt()
		if err != nil {
			return 0, 0, err
		}
		leaseAppliedIndex = uint64(int64LeaseAppliedIndex)
	}

	return appliedIndex, leaseAppliedIndex, nil
}
// loadLastIndex retrieves the last index from storage.
func loadLastIndex(eng engine.Reader, rangeID roachpb.RangeID, isInitialized bool) (uint64, error) {
	lastIndex := uint64(0)
	v, _, err := engine.MVCCGet(context.Background(), eng, keys.RaftLastIndexKey(rangeID),
		roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		int64LastIndex, err := v.GetInt()
		if err != nil {
			return 0, err
		}
		lastIndex = uint64(int64LastIndex)
	} else {
		// The log is empty, which means we are either starting from scratch
		// or the entire log has been truncated away. raftTruncatedState
		// handles both cases.
		lastEnt, err := raftTruncatedState(eng, rangeID, isInitialized)
		if err != nil {
			return 0, err
		}
		lastIndex = lastEnt.Index
	}
	return lastIndex, nil
}
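// NOTE: The snippets in this collection only show the read path for the last
// index. As a rough illustration (not a helper taken from the repository), a
// matching write path could look like the sketch below. It assumes that
// roachpb.Value.SetInt is the counterpart of the Value.GetInt call used above
// and that engine.MVCCPut takes a leading context.Context like the MVCCGet
// call in loadLastIndex; both signatures are assumptions.
func setLastIndexSketch(eng engine.Engine, rangeID roachpb.RangeID, lastIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(lastIndex)) // mirrors the Value.GetInt read above
	return engine.MVCCPut(context.Background(), eng, nil /* ms */,
		keys.RaftLastIndexKey(rangeID), roachpb.ZeroTimestamp, value, nil /* txn */)
}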
// TestRejectFutureCommand verifies that lease holders reject commands that
// would cause a large time jump.
func TestRejectFutureCommand(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const maxOffset = 100 * time.Millisecond
	manual := hlc.NewManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	clock.SetMaxOffset(maxOffset)
	mtc := multiTestContext{
		clock: clock,
	}
	mtc.Start(t, 1)
	defer mtc.Stop()

	// First do a write. The first write will advance the clock by MaxOffset
	// because of the read cache's low water mark.
	pArgs := putArgs([]byte("b"), []byte("b"))
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &pArgs); err != nil {
		t.Fatal(err)
	}
	if now := clock.Now(); now.WallTime != int64(maxOffset) {
		t.Fatalf("expected clock to advance to 100ms; got %s", now)
	}
	// The logical clock has advanced past the physical clock; increment
	// the "physical" clock to catch up.
	manual.Increment(int64(maxOffset))

	startTime := manual.UnixNano()

	// Commands with a future timestamp that is within the MaxOffset
	// bound will be accepted and will cause the clock to advance.
	for i := int64(0); i < 3; i++ {
		incArgs := incrementArgs([]byte("a"), 5)
		ts := hlc.ZeroTimestamp.Add(startTime+((i+1)*30)*int64(time.Millisecond), 0)
		if _, err := client.SendWrappedWith(rg1(mtc.stores[0]), nil, roachpb.Header{Timestamp: ts}, &incArgs); err != nil {
			t.Fatal(err)
		}
	}
	if now := clock.Now(); now.WallTime != int64(190*time.Millisecond) {
		t.Fatalf("expected clock to advance to 190ms; got %s", now)
	}

	// Once the accumulated offset reaches MaxOffset, commands will be rejected.
	incArgs := incrementArgs([]byte("a"), 11)
	ts := hlc.ZeroTimestamp.Add(startTime+int64(maxOffset)+1, 0)
	if _, err := client.SendWrappedWith(rg1(mtc.stores[0]), nil, roachpb.Header{Timestamp: ts}, &incArgs); err == nil {
		t.Fatalf("expected clock offset error but got nil")
	}

	// The clock remained at 190ms and the final command was not executed.
	if now := clock.Now(); now.WallTime != int64(190*time.Millisecond) {
		t.Errorf("expected clock to advance to 190ms; got %s", now)
	}
	val, _, err := engine.MVCCGet(context.Background(), mtc.engines[0], roachpb.Key("a"), clock.Now(), true, nil)
	if err != nil {
		t.Fatal(err)
	}
	if v := mustGetInt(val); v != 15 {
		t.Errorf("expected 15, got %v", v)
	}
}
// loadLastIndexLocked retrieves the last index from storage.
// loadLastIndexLocked requires that the replica lock is held.
func (r *Replica) loadLastIndexLocked() (uint64, error) {
	lastIndex := uint64(0)
	v, _, err := engine.MVCCGet(r.store.Engine(), keys.RaftLastIndexKey(r.RangeID),
		roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		int64LastIndex, err := v.GetInt()
		if err != nil {
			return 0, err
		}
		lastIndex = uint64(int64LastIndex)
	} else {
		// The log is empty, which means we are either starting from scratch
		// or the entire log has been truncated away. raftTruncatedState
		// handles both cases.
		lastEnt, err := r.raftTruncatedStateLocked()
		if err != nil {
			return 0, err
		}
		lastIndex = lastEnt.Index
	}
	return lastIndex, nil
}
func loadLastIndex(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (uint64, error) {
	lastIndex := uint64(0)
	v, _, err := engine.MVCCGet(ctx, reader, keys.RaftLastIndexKey(rangeID),
		hlc.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		int64LastIndex, err := v.GetInt()
		if err != nil {
			return 0, err
		}
		lastIndex = uint64(int64LastIndex)
	} else {
		// The log is empty, which means we are either starting from scratch
		// or the entire log has been truncated away.
		lastEnt, err := loadTruncatedState(ctx, reader, rangeID)
		if err != nil {
			return 0, err
		}
		lastIndex = lastEnt.Index
	}
	return lastIndex, nil
}
// Get returns the value for a specified key.
func (r *Range) Get(batch engine.Engine, args proto.GetRequest) (proto.GetResponse, []proto.Intent, error) {
	var reply proto.GetResponse

	val, intents, err := engine.MVCCGet(batch, args.Key, args.Timestamp,
		args.ReadConsistency == proto.CONSISTENT, args.Txn)
	reply.Value = val
	return reply, intents, err
}
// loadLastIndex retrieves the last index from storage.
func (r *Replica) loadLastIndex() (uint64, error) {
	lastIndex := uint64(0)
	v, _, err := engine.MVCCGet(r.rm.Engine(), keys.RaftLastIndexKey(r.Desc().RangeID),
		roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		var err error
		_, lastIndex, err = encoding.DecodeUint64(v.GetRawBytes())
		if err != nil {
			return 0, err
		}
	} else {
		// The log is empty, which means we are either starting from scratch
		// or the entire log has been truncated away. raftTruncatedState
		// handles both cases.
		lastEnt, err := r.raftTruncatedState()
		if err != nil {
			return 0, err
		}
		lastIndex = lastEnt.Index
	}
	return lastIndex, nil
}
func loadFrozenStatus(reader engine.Reader, rangeID roachpb.RangeID) (bool, error) {
	val, _, err := engine.MVCCGet(context.Background(), reader, keys.RangeFrozenStatusKey(rangeID),
		hlc.ZeroTimestamp, true, nil)
	if err != nil {
		return false, err
	}
	if val == nil {
		return false, nil
	}
	return val.GetBool()
}
// TestRangeCommandClockUpdate verifies that followers update their
// clocks when executing a command, even if the lease holder's clock is far
// in the future.
func TestRangeCommandClockUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const numNodes = 3
	var manuals []*hlc.ManualClock
	var clocks []*hlc.Clock
	for i := 0; i < numNodes; i++ {
		manuals = append(manuals, hlc.NewManualClock(1))
		clocks = append(clocks, hlc.NewClock(manuals[i].UnixNano))
		clocks[i].SetMaxOffset(100 * time.Millisecond)
	}
	mtc := multiTestContext{
		clocks: clocks,
	}
	mtc.Start(t, numNodes)
	defer mtc.Stop()
	mtc.replicateRange(1, 1, 2)

	// Advance the lease holder's clock ahead of the followers (by more than
	// MaxOffset but less than the range lease) and execute a command.
	manuals[0].Increment(int64(500 * time.Millisecond))
	incArgs := incrementArgs([]byte("a"), 5)
	ts := clocks[0].Now()
	if _, err := client.SendWrappedWith(rg1(mtc.stores[0]), nil, roachpb.Header{Timestamp: ts}, &incArgs); err != nil {
		t.Fatal(err)
	}

	// Wait for that command to execute on all the followers.
	util.SucceedsSoon(t, func() error {
		values := []int64{}
		for _, eng := range mtc.engines {
			val, _, err := engine.MVCCGet(context.Background(), eng, roachpb.Key("a"), clocks[0].Now(), true, nil)
			if err != nil {
				return err
			}
			values = append(values, mustGetInt(val))
		}
		if !reflect.DeepEqual(values, []int64{5, 5, 5}) {
			return errors.Errorf("expected (5, 5, 5), got %v", values)
		}
		return nil
	})

	// Verify that all the followers have accepted the clock update from
	// node 0 even though it comes from outside the usual max offset.
	now := clocks[0].Now()
	for i, clock := range clocks {
		// Only compare the WallTimes: it's normal for clock 0 to be a few logical ticks ahead.
		if clock.Now().WallTime < now.WallTime {
			t.Errorf("clock %d is behind clock 0: %s vs %s", i, clock.Now(), now)
		}
	}
}
// GetSequence looks up the latest sequence number recorded for this family. On a
// cache miss, zero is returned.
func (rc *ResponseCache) GetSequence(e engine.Engine, family []byte) (int64, error) {
	if len(family) == 0 {
		return 0, errEmptyID
	}

	// Pull response from the cache and read into reply if available.
	key := keys.ResponseCacheKey(rc.rangeID, family)
	v, _, err := engine.MVCCGet(e, key, roachpb.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, err
	}
	if v == nil {
		return 0, nil
	}
	return v.GetInt()
}
// TestProgressWithDownNode verifies that a surviving quorum can make progress
// with a downed node.
func TestProgressWithDownNode(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	rangeID := roachpb.RangeID(1)
	mtc.replicateRange(rangeID, 1, 2)

	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil {
		t.Fatal(err)
	}

	// Verify that the first increment propagates to all the engines.
	verify := func(expected []int64) {
		util.SucceedsWithin(t, time.Second, func() error {
			values := []int64{}
			for _, eng := range mtc.engines {
				val, _, err := engine.MVCCGet(eng, roachpb.Key("a"), mtc.clock.Now(), true, nil)
				if err != nil {
					return err
				}
				values = append(values, mustGetInt(val))
			}
			if !reflect.DeepEqual(expected, values) {
				return util.Errorf("expected %v, got %v", expected, values)
			}
			return nil
		})
	}
	verify([]int64{5, 5, 5})

	// Stop one of the replicas and issue a new increment.
	mtc.stopStore(1)
	incArgs = incrementArgs([]byte("a"), 11)
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil {
		t.Fatal(err)
	}

	// The new increment can be seen on both live replicas.
	verify([]int64{16, 5, 16})

	// Once the downed node is restarted, it will catch up.
	mtc.restartStore(1)
	verify([]int64{16, 16, 16})
}
// loadAppliedIndex retrieves the applied index from the supplied engine.
func (r *Range) loadAppliedIndex(eng engine.Engine) (uint64, error) {
	var appliedIndex uint64
	if r.isInitialized() {
		appliedIndex = raftInitialLogIndex
	} else {
		appliedIndex = 0
	}
	v, _, err := engine.MVCCGet(eng, keys.RaftAppliedIndexKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		_, appliedIndex = encoding.DecodeUint64(v.Bytes)
	}
	return appliedIndex, nil
}
// loadAppliedIndex retrieves the applied index from the supplied engine.
func (r *Replica) loadAppliedIndex(eng engine.Engine) (uint64, error) {
	var appliedIndex uint64
	if r.isInitialized() {
		appliedIndex = raftInitialLogIndex
	} else {
		appliedIndex = 0
	}
	v, _, err := engine.MVCCGet(eng, keys.RaftAppliedIndexKey(r.Desc().RangeID),
		roachpb.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		var err error
		_, appliedIndex, err = encoding.DecodeUint64(v.GetRawBytes())
		if err != nil {
			return 0, err
		}
	}
	return appliedIndex, nil
}
// loadAppliedIndex retrieves the applied index from the supplied engine.
func loadAppliedIndex(eng engine.Reader, rangeID roachpb.RangeID, isInitialized bool) (uint64, error) {
	var appliedIndex uint64
	if isInitialized {
		appliedIndex = raftInitialLogIndex
	} else {
		appliedIndex = 0
	}
	v, _, err := engine.MVCCGet(context.Background(), eng, keys.RaftAppliedIndexKey(rangeID),
		roachpb.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		int64AppliedIndex, err := v.GetInt()
		if err != nil {
			return 0, err
		}
		appliedIndex = uint64(int64AppliedIndex)
	}
	return appliedIndex, nil
}
// loadAppliedIndexLocked retrieves the applied index from the supplied engine.
// loadAppliedIndexLocked requires that the replica lock is held.
func (r *Replica) loadAppliedIndexLocked(eng engine.Engine) (uint64, error) {
	var appliedIndex uint64
	if r.isInitializedLocked() {
		appliedIndex = raftInitialLogIndex
	} else {
		appliedIndex = 0
	}
	v, _, err := engine.MVCCGet(eng, keys.RaftAppliedIndexKey(r.RangeID),
		roachpb.ZeroTimestamp, true, nil)
	if err != nil {
		return 0, err
	}
	if v != nil {
		int64AppliedIndex, err := v.GetInt()
		if err != nil {
			return 0, err
		}
		appliedIndex = uint64(int64AppliedIndex)
	}
	return appliedIndex, nil
}
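// NOTE: Only the read side of the applied index appears in these snippets. As
// an illustration (not the repository's actual helper), the write side could
// look like the sketch below. It assumes roachpb.Value.SetInt as the inverse
// of the Value.GetInt call used above and reuses the ctx-less engine.MVCCPut
// call shape that appears in the applySnapshot snippets further down.
func setAppliedIndexSketch(eng engine.Engine, rangeID roachpb.RangeID, appliedIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(appliedIndex)) // assumed counterpart of Value.GetInt
	return engine.MVCCPut(eng, nil /* ms */, keys.RaftAppliedIndexKey(rangeID),
		roachpb.ZeroTimestamp, value, nil /* txn */)
}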
// TestStoreRangeMergeWithData attempts to merge two collocated ranges,
// each containing data.
func TestStoreRangeMergeWithData(t *testing.T) {
	defer leaktest.AfterTest(t)
	content := roachpb.Key("testing!")

	store, stopper := createTestStore(t)
	defer stopper.Stop()

	aDesc, bDesc, err := createSplitRanges(store)
	if err != nil {
		t.Fatal(err)
	}

	// Write some values left and right of the proposed split key.
	pArgs := putArgs([]byte("aaa"), content)
	if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil {
		t.Fatal(err)
	}
	pArgs = putArgs([]byte("ccc"), content)
	if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{
		RangeID: bDesc.RangeID,
	}, &pArgs); err != nil {
		t.Fatal(err)
	}

	// Confirm the values are there.
	gArgs := getArgs([]byte("aaa"))
	if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}
	gArgs = getArgs([]byte("ccc"))
	if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{
		RangeID: bDesc.RangeID,
	}, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}

	// Merge the b range back into the a range.
	args := adminMergeArgs(roachpb.KeyMin)
	if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil {
		t.Fatal(err)
	}

	// Verify that no intents remain on the range descriptor keys.
	for _, key := range []roachpb.Key{keys.RangeDescriptorKey(aDesc.StartKey), keys.RangeDescriptorKey(bDesc.StartKey)} {
		if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil {
			t.Fatal(err)
		}
	}

	// Verify the merge by looking up keys from both ranges.
	rangeA := store.LookupReplica([]byte("a"), nil)
	rangeB := store.LookupReplica([]byte("c"), nil)
	rangeADesc := rangeA.Desc()
	rangeBDesc := rangeB.Desc()

	if !reflect.DeepEqual(rangeA, rangeB) {
		t.Fatalf("ranges were not merged %+v=%+v", rangeADesc, rangeBDesc)
	}
	if !bytes.Equal(rangeADesc.StartKey, roachpb.RKeyMin) {
		t.Fatalf("The start key is not equal to KeyMin %q=%q", rangeADesc.StartKey, roachpb.RKeyMin)
	}
	if !bytes.Equal(rangeADesc.EndKey, roachpb.RKeyMax) {
		t.Fatalf("The end key is not equal to KeyMax %q=%q", rangeADesc.EndKey, roachpb.RKeyMax)
	}

	// Try to get values from after the merge.
	gArgs = getArgs([]byte("aaa"))
	if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}
	gArgs = getArgs([]byte("ccc"))
	if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{
		RangeID: rangeB.RangeID,
	}, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}

	// Put new values after the merge on both sides.
	pArgs = putArgs([]byte("aaaa"), content)
	if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil {
		t.Fatal(err)
	}
	pArgs = putArgs([]byte("cccc"), content)
	if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{
		RangeID: rangeB.RangeID,
	}, &pArgs); err != nil {
		t.Fatal(err)
	}

	// Try to get the newly placed values.
	gArgs = getArgs([]byte("aaaa"))
	if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}
	gArgs = getArgs([]byte("cccc"))
	if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil {
		t.Fatal(err)
	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
		t.Fatal(err)
	} else if !bytes.Equal(replyBytes, content) {
		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
	}
}
// ApplySnapshot implements the multiraft.WriteableGroupStorage interface. func (r *Range) ApplySnapshot(snap raftpb.Snapshot) error { snapData := proto.RaftSnapshotData{} err := gogoproto.Unmarshal(snap.Data, &snapData) if err != nil { return err } // First, save the HardState. The HardState must not be changed // because it may record a previous vote cast by this node. hardStateKey := keys.RaftHardStateKey(r.Desc().RaftID) hardState, _, err := engine.MVCCGet(r.rm.Engine(), hardStateKey, proto.ZeroTimestamp, true /* consistent */, nil) if err != nil { return err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor batch := r.rm.Engine().NewBatch() defer batch.Close() // Delete everything in the range and recreate it from the snapshot. for iter := newRangeDataIterator(&desc, r.rm.Engine()); iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return err } } // Write the snapshot into the range. for _, kv := range snapData.KV { if err := batch.Put(kv.Key, kv.Value); err != nil { return err } } // Restore the saved HardState. if hardState == nil { err := engine.MVCCDelete(batch, nil, hardStateKey, proto.ZeroTimestamp, nil) if err != nil { return err } } else { err := engine.MVCCPut(batch, nil, hardStateKey, proto.ZeroTimestamp, *hardState, nil) if err != nil { return err } } // Read the leader lease. lease, err := loadLeaderLease(batch, desc.RaftID) if err != nil { return err } // Copy range stats to new range. oldStats := r.stats r.stats, err = newRangeStats(desc.RaftID, batch) if err != nil { r.stats = oldStats return err } // The next line sets the persisted last index to the last applied index. // This is not a correctness issue, but means that we may have just // transferred some entries we're about to re-request from the leader and // overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our feelings // about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. if err := setLastIndex(batch, r.Desc().RaftID, snap.Metadata.Index); err != nil { return err } if err := batch.Commit(); err != nil { return err } // As outlined above, last and applied index are the same after applying // the snapshot. atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index) atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index) // Atomically update the descriptor and lease. if err := r.setDesc(&desc); err != nil { return err } atomic.StorePointer(&r.lease, unsafe.Pointer(lease)) return nil }
// applySnapshot updates the replica based on the given snapshot.
func (r *Replica) applySnapshot(snap raftpb.Snapshot) error {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	rangeID := r.Desc().RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(r.store.Engine(), hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	iter := newReplicaDataIterator(&desc, r.store.Engine())
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// Update the range stats.
	r.stats.Replace(newStats)

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}
	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(); err != nil {
		return err
	}

	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
func TestReplicateAddAndRemove(t *testing.T) { defer leaktest.AfterTest(t) testFunc := func(addFirst bool) { mtc := startMultiTestContext(t, 4) defer mtc.Stop() // Replicate the initial range to three of the four nodes. rangeID := roachpb.RangeID(1) mtc.replicateRange(rangeID, 3, 1) incArgs := incrementArgs([]byte("a"), 5) if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil { t.Fatal(err) } verify := func(expected []int64) { util.SucceedsWithin(t, 3*time.Second, func() error { values := []int64{} for _, eng := range mtc.engines { val, _, err := engine.MVCCGet(eng, roachpb.Key("a"), mtc.clock.Now(), true, nil) if err != nil { return err } values = append(values, mustGetInt(val)) } if !reflect.DeepEqual(expected, values) { return util.Errorf("addFirst: %t, expected %v, got %v", addFirst, expected, values) } return nil }) } // The first increment is visible on all three replicas. verify([]int64{5, 5, 0, 5}) // Stop a store and replace it. mtc.stopStore(1) if addFirst { mtc.replicateRange(rangeID, 2) mtc.unreplicateRange(rangeID, 1) } else { mtc.unreplicateRange(rangeID, 1) mtc.replicateRange(rangeID, 2) } verify([]int64{5, 5, 5, 5}) // Ensure that the rest of the group can make progress. incArgs = incrementArgs([]byte("a"), 11) if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil { t.Fatal(err) } verify([]int64{16, 5, 16, 16}) // Bring the downed store back up (required for a clean shutdown). mtc.restartStore(1) // Node 1 never sees the increment that was added while it was // down. Perform another increment on the live nodes to verify. incArgs = incrementArgs([]byte("a"), 23) if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil { t.Fatal(err) } verify([]int64{39, 5, 39, 39}) // Wait out the leader lease and the unleased duration to make the replica GC'able. mtc.manualClock.Increment(int64(storage.ReplicaGCQueueInactivityThreshold+storage.DefaultLeaderLeaseDuration) + 1) mtc.stores[1].ForceReplicaGCScanAndProcess() // The removed store no longer has any of the data from the range. verify([]int64{39, 0, 39, 39}) desc := mtc.stores[0].LookupReplica(roachpb.RKeyMin, nil).Desc() replicaIDsByStore := map[roachpb.StoreID]roachpb.ReplicaID{} for _, rep := range desc.Replicas { replicaIDsByStore[rep.StoreID] = rep.ReplicaID } expected := map[roachpb.StoreID]roachpb.ReplicaID{1: 1, 4: 2, 3: 4} if !reflect.DeepEqual(expected, replicaIDsByStore) { t.Fatalf("expected replica IDs to be %v but got %v", expected, replicaIDsByStore) } } // Run the test twice, once adding the replacement before removing // the downed node, and once removing the downed node first. testFunc(true) testFunc(false) }
// TestStoreRangeMergeWithData attempts to merge two collocate ranges // each containing data. func TestStoreRangeMergeWithData(t *testing.T) { defer leaktest.AfterTest(t) content := proto.Key("testing!") store, stopper := createTestStore(t) defer stopper.Stop() aDesc, bDesc, err := createSplitRanges(store) if err != nil { t.Fatal(err) } // Write some values left and right of the proposed split key. pArgs := putArgs([]byte("aaa"), content, aDesc.RangeID, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("ccc"), content, bDesc.RangeID, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } // Confirm the values are there. gArgs := getArgs([]byte("aaa"), aDesc.RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("ccc"), bDesc.RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } // Merge the b range back into the a range. args := adminMergeArgs(proto.KeyMin, 1, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []proto.Key{keys.RangeDescriptorKey(aDesc.StartKey), keys.RangeDescriptorKey(bDesc.StartKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } // Verify the merge by looking up keys from both ranges. rangeA := store.LookupRange([]byte("a"), nil) rangeB := store.LookupRange([]byte("c"), nil) if !reflect.DeepEqual(rangeA, rangeB) { t.Fatalf("ranges were not merged %+v=%+v", rangeA.Desc(), rangeB.Desc()) } if !bytes.Equal(rangeA.Desc().StartKey, proto.KeyMin) { t.Fatalf("The start key is not equal to KeyMin %q=%q", rangeA.Desc().StartKey, proto.KeyMin) } if !bytes.Equal(rangeA.Desc().EndKey, proto.KeyMax) { t.Fatalf("The end key is not equal to KeyMax %q=%q", rangeA.Desc().EndKey, proto.KeyMax) } // Try to get values from after the merge. gArgs = getArgs([]byte("aaa"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("ccc"), rangeB.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } // Put new values after the merge on both sides. 
pArgs = putArgs([]byte("aaaa"), content, rangeA.Desc().RangeID, store.StoreID()) if _, err = store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("cccc"), content, rangeB.Desc().RangeID, store.StoreID()) if _, err = store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } // Try to get the newly placed values. gArgs = getArgs([]byte("aaaa"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("cccc"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } }
// applySnapshot updates the replica based on the given snapshot. // Returns the new last index. func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) { snapData := roachpb.RaftSnapshotData{} err := proto.Unmarshal(snap.Data, &snapData) if err != nil { return 0, err } rangeID := r.RangeID // First, save the HardState. The HardState must not be changed // because it may record a previous vote cast by this node. This is // usually unnecessary because a snapshot is nearly always // accompanied by a new HardState which incorporates both our former // state and new information from the leader, but in the event that // the HardState has not changed, we want to use our own previous // HardState and not one that was transmitted via the snapshot. hardStateKey := keys.RaftHardStateKey(rangeID) hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp, true /* consistent */, nil) if err != nil { return 0, err } // Extract the updated range descriptor. desc := snapData.RangeDescriptor // Delete everything in the range and recreate it from the snapshot. // We need to delete any old Raft log entries here because any log entries // that predate the snapshot will be orphaned and never truncated or GC'd. iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { if err := batch.Clear(iter.Key()); err != nil { return 0, err } } // Determine the unreplicated key prefix so we can drop any // unreplicated keys from the snapshot. unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID) // Write the snapshot into the range. for _, kv := range snapData.KV { if bytes.HasPrefix(kv.Key, unreplicatedPrefix) { continue } mvccKey := engine.MVCCKey{ Key: kv.Key, Timestamp: kv.Timestamp, } if err := batch.Put(mvccKey, kv.Value); err != nil { return 0, err } } // Write the snapshot's Raft log into the range. if _, err := r.append(batch, 0, snapData.LogEntries); err != nil { return 0, err } // Restore the saved HardState. if hardState == nil { err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil) if err != nil { return 0, err } } else { err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil) if err != nil { return 0, err } } // Read the leader lease. lease, err := loadLeaderLease(batch, desc.RangeID) if err != nil { return 0, err } // Load updated range stats. The local newStats variable will be assigned // to r.stats after the batch commits. newStats, err := newRangeStats(desc.RangeID, batch) if err != nil { return 0, err } // The next line sets the persisted last index to the last applied index. // This is not a correctness issue, but means that we may have just // transferred some entries we're about to re-request from the leader and // overwrite. // However, raft.MultiNode currently expects this behaviour, and the // performance implications are not likely to be drastic. If our feelings // about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil { return 0, err } batch.Defer(func() { // Update the range stats. r.stats.Replace(newStats) r.mu.Lock() // As outlined above, last and applied index are the same after applying // the snapshot. r.mu.appliedIndex = snap.Metadata.Index r.mu.leaderLease = lease r.mu.Unlock() // Update other fields which are uninitialized or need updating. 
// This may not happen if the system config has not yet been loaded. // While config update will correctly set the fields, there is no order // guarantee in ApplySnapshot. // TODO: should go through the standard store lock when adding a replica. if err := r.updateRangeInfo(&desc); err != nil { panic(err) } // Update the range descriptor. This is done last as this is the step that // makes the Replica visible in the Store. if err := r.setDesc(&desc); err != nil { panic(err) } }) return snap.Metadata.Index, nil }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and response caches have been properly accounted for. func TestStoreRangeSplit(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() raftID := proto.RaftID(1) splitKey := proto.Key("m") content := proto.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs, pReply := putArgs([]byte("c"), content, raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: pArgs, Reply: pReply}); err != nil { t.Fatal(err) } pArgs, pReply = putArgs([]byte("x"), content, raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: pArgs, Reply: pReply}); err != nil { t.Fatal(err) } // Increments are a good way of testing the response cache. Up here, we // address them to the original range, then later to the one that contains // the key. lIncArgs, lIncReply := incrementArgs([]byte("apoptosis"), 100, raftID, store.StoreID()) lIncArgs.CmdID = proto.ClientCmdID{WallTime: 123, Random: 423} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: lIncArgs, Reply: lIncReply}); err != nil { t.Fatal(err) } rIncArgs, rIncReply := incrementArgs([]byte("wobble"), 10, raftID, store.StoreID()) rIncArgs.CmdID = proto.ClientCmdID{WallTime: 12, Random: 42} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: rIncArgs, Reply: rIncReply}); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), raftID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args, reply := adminSplitArgs(proto.KeyMin, splitKey, 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: args, Reply: reply}); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []proto.Key{keys.RangeDescriptorKey(proto.KeyMin), keys.RangeDescriptorKey(splitKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupRange(proto.KeyMin, nil) newRng := store.LookupRange([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, proto.KeyMax) || !bytes.Equal(rng.Desc().StartKey, proto.KeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs, gReply := getArgs([]byte("c"), raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: gArgs, Reply: gReply}); err != nil || !bytes.Equal(gReply.Value.Bytes, content) { t.Fatal(err) } gArgs, gReply = getArgs([]byte("x"), newRng.Desc().RaftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: gArgs, Reply: gReply}); err != nil || !bytes.Equal(gReply.Value.Bytes, content) { t.Fatal(err) } // Send out an increment request copied from above (same ClientCmdID) which // remains in the old range. 
lIncReply = &proto.IncrementResponse{} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: lIncArgs, Reply: lIncReply}); err != nil { t.Fatal(err) } if lIncReply.NewValue != 100 { t.Errorf("response cache broken in old range, expected %d but got %d", lIncArgs.Increment, lIncReply.NewValue) } // Send out the same increment copied from above (same ClientCmdID), but // now to the newly created range (which should hold that key). rIncArgs.RequestHeader.RaftID = newRng.Desc().RaftID rIncReply = &proto.IncrementResponse{} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: rIncArgs, Reply: rIncReply}); err != nil { t.Fatal(err) } if rIncReply.NewValue != 10 { t.Errorf("response cache not copied correctly to new range, expected %d but got %d", rIncArgs.Increment, rIncReply.NewValue) } // Compare stats of split ranges to ensure they are non ero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), raftID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RaftID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
func TestReplicateAddAndRemove(t *testing.T) { defer leaktest.AfterTest(t) // Run the test twice, once adding the replacement before removing // the downed node, and once removing the downed node first. for _, addFirst := range []bool{true, false} { mtc := startMultiTestContext(t, 4) defer mtc.Stop() // Replicate the initial range to three of the four nodes. raftID := proto.RaftID(1) mtc.replicateRange(raftID, 0, 3, 1) incArgs, incResp := incrementArgs([]byte("a"), 5, raftID, mtc.stores[0].StoreID()) if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil { t.Fatal(err) } verify := func(expected []int64) { util.SucceedsWithin(t, time.Second, func() error { values := []int64{} for _, eng := range mtc.engines { val, _, err := engine.MVCCGet(eng, proto.Key("a"), mtc.clock.Now(), true, nil) if err != nil { return err } values = append(values, mustGetInteger(val)) } if !reflect.DeepEqual(expected, values) { return util.Errorf("expected %v, got %v", expected, values) } return nil }) } // The first increment is visible on all three replicas. verify([]int64{5, 5, 0, 5}) // Stop a store and replace it. mtc.stopStore(1) if addFirst { mtc.replicateRange(raftID, 0, 2) mtc.unreplicateRange(raftID, 0, 1) } else { mtc.unreplicateRange(raftID, 0, 1) mtc.replicateRange(raftID, 0, 2) } verify([]int64{5, 5, 5, 5}) // Ensure that the rest of the group can make progress. incArgs, incResp = incrementArgs([]byte("a"), 11, raftID, mtc.stores[0].StoreID()) if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil { t.Fatal(err) } verify([]int64{16, 5, 16, 16}) // Bring the downed store back up (required for a clean shutdown). mtc.restartStore(1) // Node 1 never sees the increment that was added while it was // down. Perform another increment on the live nodes to verify. incArgs, incResp = incrementArgs([]byte("a"), 23, raftID, mtc.stores[0].StoreID()) if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil { t.Fatal(err) } verify([]int64{39, 5, 39, 39}) // TODO(bdarnell): when we have GC of removed ranges, verify that // the downed node removes the data from this range after coming // back up. // Wait out the leader lease and the unleased duration to make the range GC'able. mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration) + int64(storage.RangeGCQueueUnleasedDuration) + 1) mtc.stores[1].ForceRangeGCScan(t) // The removed store no longer has any of the data from the range. verify([]int64{39, 0, 39, 39}) } }
// Get returns the value for a specified key.
func (r *Range) Get(batch engine.Engine, args *proto.GetRequest, reply *proto.GetResponse) []proto.Intent {
	val, intents, err := engine.MVCCGet(batch, args.Key, args.Timestamp,
		args.ReadConsistency == proto.CONSISTENT, args.Txn)
	reply.Value = val
	reply.SetGoError(err)
	return intents
}
// TestMultiStoreEventFeed verifies that events on multiple stores are properly // recieved by a single event reader. func TestMultiStoreEventFeed(t *testing.T) { defer leaktest.AfterTest(t) t.Skip("disabled until #1531 is fixed") // Create a multiTestContext which publishes all store events to the given // feed. feed := &util.Feed{} mtc := &multiTestContext{ feed: feed, } // Start reading events from the feed before starting the stores. ser := &storeEventReader{ recordUpdateDetail: false, } readStopper := stop.NewStopper() sub := feed.Subscribe() readStopper.RunWorker(func() { ser.readEvents(sub) }) mtc.Start(t, 3) defer mtc.Stop() // Replicate the default range. raftID := proto.RaftID(1) mtc.replicateRange(raftID, 0, 1, 2) // Add some data in a transaction err := mtc.db.Txn(func(txn *client.Txn) error { b := &client.Batch{} b.Put("a", "asdf") b.Put("c", "jkl;") return txn.Commit(b) }) if err != nil { t.Fatalf("error putting data to db: %s", err) } // AdminSplit in between the two ranges. if err := mtc.db.AdminSplit("b"); err != nil { t.Fatalf("error splitting initial: %s", err) } // AdminSplit an empty range at the end of the second range. if err := mtc.db.AdminSplit("z"); err != nil { t.Fatalf("error splitting second range: %s", err) } // AdminMerge the empty range back into the second range. if err := mtc.db.AdminMerge("c"); err != nil { t.Fatalf("error merging final range: %s", err) } // Add an additional put through the system and wait for all // replicas to receive it. if _, err := mtc.db.Inc("aa", 5); err != nil { t.Fatalf("error putting data to db: %s", err) } util.SucceedsWithin(t, time.Second, func() error { for _, eng := range mtc.engines { val, _, err := engine.MVCCGet(eng, proto.Key("aa"), mtc.clock.Now(), true, nil) if err != nil { return err } if a, e := mustGetInteger(val), int64(5); a != e { return util.Errorf("expected aa = %d, got %d", e, a) } } return nil }) // Close feed and wait for reader to receive all events. feed.Close() readStopper.Stop() // Compare events to expected values. expected := map[proto.StoreID][]string{ proto.StoreID(1): { "StartStore", "BeginScanRanges", "RegisterRange scan=true, rid=1, live=.*", "EndScanRanges", "SplitRange origId=1, newId=2, origKey=336, newKey=15", "SplitRange origId=2, newId=3, origKey=15, newKey=0", "MergeRange rid=2, subId=3, key=15, subKey=0", }, proto.StoreID(2): { "StartStore", "BeginScanRanges", "EndScanRanges", "RegisterRange scan=false, rid=1, live=.*", "SplitRange origId=1, newId=2, origKey=336, newKey=15", "SplitRange origId=2, newId=3, origKey=15, newKey=0", "MergeRange rid=2, subId=3, key=15, subKey=0", }, proto.StoreID(3): { "StartStore", "BeginScanRanges", "EndScanRanges", "RegisterRange scan=false, rid=1, live=.*", "SplitRange origId=1, newId=2, origKey=336, newKey=15", "SplitRange origId=2, newId=3, origKey=15, newKey=0", "MergeRange rid=2, subId=3, key=15, subKey=0", }, } if a, e := ser.perStoreFeeds, expected; !checkMatch(e, a) { t.Errorf("event feed did not match expected value. Actual values have been printed to compare with above expectation.\n") t.Logf("Event feed information:\n%s", ser.eventFeedString()) } // Expected count of update events on a per-method basis. 
expectedUpdateCount := map[proto.StoreID]map[proto.Method]int{ proto.StoreID(1): { proto.Put: 18, proto.ConditionalPut: 7, proto.Increment: 2, proto.Delete: 2, proto.EndTransaction: 6, proto.InternalLeaderLease: 3, }, proto.StoreID(2): { proto.Put: 16, proto.ConditionalPut: 6, proto.Increment: 2, proto.Delete: 2, proto.EndTransaction: 5, proto.InternalLeaderLease: 2, }, proto.StoreID(3): { proto.Put: 14, proto.ConditionalPut: 5, proto.Increment: 2, proto.Delete: 2, proto.EndTransaction: 4, proto.InternalLeaderLease: 2, }, } if a, e := ser.perStoreUpdateCount, expectedUpdateCount; !reflect.DeepEqual(a, e) { t.Errorf("update counts did not match expected value. Actual values have been printed to compare with above expectation.\n") t.Logf("Update count information:\n%s", ser.updateCountString()) } }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and sequence cache have been properly accounted for. func TestStoreRangeSplitIdempotency(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() rangeID := roachpb.RangeID(1) splitKey := roachpb.Key("m") content := roachpb.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs := putArgs([]byte("c"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("x"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } // Increments are a good way of testing the sequence cache. Up here, we // address them to the original range, then later to the one that contains // the key. txn := roachpb.NewTransaction("test", []byte("c"), 10, roachpb.SERIALIZABLE, store.Clock().Now(), 0) lIncArgs := incrementArgs([]byte("apoptosis"), 100) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &lIncArgs); err != nil { t.Fatal(err) } rIncArgs := incrementArgs([]byte("wobble"), 10) txn.Sequence++ if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &rIncArgs); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args := adminSplitArgs(roachpb.KeyMin, splitKey) if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(keys.Addr(splitKey))} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupReplica(roachpb.RKeyMin, nil) newRng := store.LookupReplica([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, roachpb.RKeyMax) || !bytes.Equal(rng.Desc().StartKey, roachpb.RKeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs := getArgs([]byte("c")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } gArgs = getArgs([]byte("x")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } // Send out an increment request copied from above (same txn/sequence) // which remains in the old range. 
_, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &lIncArgs) if _, ok := err.(*roachpb.TransactionRetryError); !ok { t.Fatalf("unexpected sequence cache miss: %v", err) } // Send out the same increment copied from above (same txn/sequence), but // now to the newly created range (which should hold that key). _, err = client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, Txn: txn, }, &rIncArgs) if _, ok := err.(*roachpb.TransactionRetryError); !ok { t.Fatalf("unexpected sequence cache miss: %v", err) } // Compare stats of split ranges to ensure they are non zero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RangeID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and response caches have been properly accounted for. func TestStoreRangeSplit(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() rangeID := roachpb.RangeID(1) splitKey := roachpb.RKey("m") content := roachpb.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs := putArgs([]byte("c"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("x"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } // Increments are a good way of testing the response cache. Up here, we // address them to the original range, then later to the one that contains // the key. lCmdID := roachpb.ClientCmdID{WallTime: 123, Random: 423} lIncArgs := incrementArgs([]byte("apoptosis"), 100) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: lCmdID, }, &lIncArgs); err != nil { t.Fatal(err) } rIncArgs := incrementArgs([]byte("wobble"), 10) rCmdID := roachpb.ClientCmdID{WallTime: 12, Random: 42} if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: rCmdID, }, &rIncArgs); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args := adminSplitArgs(roachpb.RKeyMin, splitKey) if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(splitKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupReplica(roachpb.RKeyMin, nil) newRng := store.LookupReplica([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, roachpb.RKeyMax) || !bytes.Equal(rng.Desc().StartKey, roachpb.RKeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs := getArgs([]byte("c")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*roachpb.GetResponse); !bytes.Equal(gReply.Value.GetRawBytes(), content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.GetRawBytes(), content) } gArgs = getArgs([]byte("x")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*roachpb.GetResponse); !bytes.Equal(gReply.Value.GetRawBytes(), content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.GetRawBytes(), content) } // Send out an increment request copied from above (same ClientCmdID) which // remains in the old range. 
if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: lCmdID, }, &lIncArgs); err != nil { t.Fatal(err) } else if lIncReply := reply.(*roachpb.IncrementResponse); lIncReply.NewValue != 100 { t.Errorf("response cache broken in old range, expected %d but got %d", lIncArgs.Increment, lIncReply.NewValue) } // Send out the same increment copied from above (same ClientCmdID), but // now to the newly created range (which should hold that key). if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, CmdID: rCmdID, }, &rIncArgs); err != nil { t.Fatal(err) } else if rIncReply := reply.(*roachpb.IncrementResponse); rIncReply.NewValue != 10 { t.Errorf("response cache not copied correctly to new range, expected %d but got %d", rIncArgs.Increment, rIncReply.NewValue) } // Compare stats of split ranges to ensure they are non zero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RangeID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor // is a metadata structure which describes the key range and replica locations // of a distinct range in the cluster. // // RangeDescriptors are stored as values in the cockroach cluster's key-value // store. However, they are always stored using special "Range Metadata keys", // which are "ordinary" keys with a special prefix prepended. The Range Metadata // Key for an ordinary key can be generated with the `keys.RangeMetaKey(key)` // function. The RangeDescriptor for the range which contains a given key can be // retrieved by generating its Range Metadata Key and dispatching it to // InternalRangeLookup. // // Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key // at which the desired RangeDescriptor is stored. Instead, this method returns // the RangeDescriptor stored at the _lowest_ existing key which is _greater_ // than the given key. The returned RangeDescriptor will thus contain the // ordinary key which was originally used to generate the Range Metadata Key // sent to InternalRangeLookup. // // The "Range Metadata Key" for a range is built by appending the end key of // the range to the meta[12] prefix because the RocksDB iterator only supports // a Seek() interface which acts as a Ceil(). Using the start key of the range // would cause Seek() to find the key after the meta indexing record we're // looking for, which would result in having to back the iterator up, an option // which is both less efficient and not available in all cases. // // This method has an important optimization: instead of just returning the // request RangeDescriptor, it also returns a slice of additional range // descriptors immediately consecutive to the desired RangeDescriptor. This is // intended to serve as a sort of caching pre-fetch, so that the requesting // nodes can aggressively cache RangeDescriptors which are likely to be desired // by their current workload. func (r *Range) InternalRangeLookup(batch engine.Engine, args *proto.InternalRangeLookupRequest, reply *proto.InternalRangeLookupResponse) []proto.Intent { if err := keys.ValidateRangeMetaKey(args.Key); err != nil { reply.SetGoError(err) return nil } rangeCount := int64(args.MaxRanges) if rangeCount < 1 { reply.SetGoError(util.Errorf( "Range lookup specified invalid maximum range count %d: must be > 0", rangeCount)) return nil } if args.IgnoreIntents { rangeCount = 1 // simplify lookup because we may have to retry to read new } // We want to search for the metadata key just greater than args.Key. Scan // for both the requested key and the keys immediately afterwards, up to // MaxRanges. startKey, endKey := keys.MetaScanBounds(args.Key) // Scan inconsistently. Any intents encountered are bundled up, but other- // wise ignored. kvs, intents, err := engine.MVCCScan(batch, startKey, endKey, rangeCount, args.Timestamp, false /* !consistent */, args.Txn) if err != nil { // An error here would likely amount to something seriously going // wrong. reply.SetGoError(err) return nil } if args.IgnoreIntents && len(intents) > 0 { // NOTE (subtle): in general, we want to try to clean up dangling // intents on meta records. However, if we're in the process of // cleaning up a dangling intent on a meta record by pushing the // transaction, we don't want to create an infinite loop: // // intent! -> push-txn -> range-lookup -> intent! -> etc... // // Instead we want: // // intent! 
-> push-txn -> range-lookup -> ignore intent, return old/new ranges // // On the range-lookup from a push transaction, we therefore // want to suppress WriteIntentErrors and return a value // anyway. But which value? We don't know whether the range // update succeeded or failed, but if we don't return the // correct range descriptor we may not be able to find the // transaction to push. Since we cannot know the correct answer, // we choose randomly between the pre- and post- transaction // values. If we guess wrong, the client will try again and get // the other value (within a few tries). if rand.Intn(2) == 0 { key, txn := intents[0].Key, &intents[0].Txn val, _, err := engine.MVCCGet(batch, key, txn.Timestamp, true, txn) if err != nil { reply.SetGoError(err) return nil } kvs = []proto.KeyValue{{Key: key, Value: *val}} } } if len(kvs) == 0 { // No matching results were returned from the scan. This could // indicate a very bad system error, but for now we will just // treat it as a retryable Key Mismatch error. err := proto.NewRangeKeyMismatchError(args.Key, args.EndKey, r.Desc()) reply.SetGoError(err) log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found. %s", err) return nil } // Decode all scanned range descriptors, stopping if a range is encountered // which does not have the same metadata prefix as the queried key. rds := make([]proto.RangeDescriptor, len(kvs)) for i := range kvs { // TODO(tschottdorf) Candidate for a ReplicaCorruptionError, once we // introduce that. if err = gogoproto.Unmarshal(kvs[i].Value.Bytes, &rds[i]); err != nil { reply.SetGoError(err) return nil } } reply.Ranges = rds return intents }