func countRangeReplicas(db *client.DB) (int, error) { desc := &roachpb.RangeDescriptor{} if err := db.GetProto(keys.RangeDescriptorKey(roachpb.KeyMin), desc); err != nil { return 0, err } return len(desc.Replicas), nil }
// Snapshot implements the raft.Storage interface. func (r *Replica) Snapshot() (raftpb.Snapshot, error) { // Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData. snap := r.rm.NewSnapshot() defer snap.Close() var snapData proto.RaftSnapshotData // Read the range metadata from the snapshot instead of the members // of the Range struct because they might be changed concurrently. appliedIndex, err := r.loadAppliedIndex(snap) if err != nil { return raftpb.Snapshot{}, err } var desc proto.RangeDescriptor // We ignore intents on the range descriptor (consistent=false) because we // know they cannot be committed yet; operations that modify range // descriptors resolve their own intents when they commit. ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.Desc().StartKey), r.rm.Clock().Now(), false /* !consistent */, nil, &desc) if err != nil { return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err) } if !ok { return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor") } // Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot() snapData.RangeDescriptor = desc // Iterate over all the data in the range, including local-only data like // the response cache. for iter := newRangeDataIterator(r.Desc(), snap); iter.Valid(); iter.Next() { snapData.KV = append(snapData.KV, &proto.RaftSnapshotData_KeyValue{Key: iter.Key(), Value: iter.Value()}) } data, err := gogoproto.Marshal(&snapData) if err != nil { return raftpb.Snapshot{}, err } // Synthesize our raftpb.ConfState from desc. var cs raftpb.ConfState for _, rep := range desc.Replicas { cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID))) } term, err := r.Term(appliedIndex) if err != nil { return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err) } return raftpb.Snapshot{ Data: data, Metadata: raftpb.SnapshotMetadata{ Index: appliedIndex, Term: term, ConfState: cs, }, }, nil }
// TestReplicateRange verifies basic replication functionality by creating two stores // and a range, replicating the range to the second store, and reading its data there. func TestReplicateRange(t *testing.T) { defer leaktest.AfterTest(t) mtc := multiTestContext{} mtc.Start(t, 2) defer mtc.Stop() // Issue a command on the first node before replicating. incArgs, incResp := incrementArgs([]byte("a"), 5, 1, mtc.stores[0].StoreID()) if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil { t.Fatal(err) } rng, err := mtc.stores[0].GetRange(1) if err != nil { t.Fatal(err) } if err := rng.ChangeReplicas(proto.ADD_REPLICA, proto.Replica{ NodeID: mtc.stores[1].Ident.NodeID, StoreID: mtc.stores[1].Ident.StoreID, }); err != nil { t.Fatal(err) } // Verify no intent remains on range descriptor key. key := keys.RangeDescriptorKey(rng.Desc().StartKey) desc := proto.RangeDescriptor{} if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &desc); !ok || err != nil { t.Fatalf("fetching range descriptor yielded %t, %s", ok, err) } // Verify that in time, no intents remain on meta addressing // keys, and that range descriptor on the meta records is correct. util.SucceedsWithin(t, 1*time.Second, func() error { meta2 := keys.RangeMetaKey(proto.KeyMax) meta1 := keys.RangeMetaKey(meta2) for _, key := range []proto.Key{meta2, meta1} { metaDesc := proto.RangeDescriptor{} if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &metaDesc); !ok || err != nil { return util.Errorf("failed to resolve %s", key) } if !reflect.DeepEqual(metaDesc, desc) { return util.Errorf("descs not equal: %+v != %+v", metaDesc, desc) } } return nil }) // Verify that the same data is available on the replica. util.SucceedsWithin(t, 1*time.Second, func() error { getArgs, getResp := getArgs([]byte("a"), 1, mtc.stores[1].StoreID()) getArgs.ReadConsistency = proto.INCONSISTENT if err := mtc.stores[1].ExecuteCmd(context.Background(), proto.Call{Args: getArgs, Reply: getResp}); err != nil { return util.Errorf("failed to read data") } if v := mustGetInteger(getResp.Value); v != 5 { return util.Errorf("failed to read correct data: %d", v) } return nil }) }
// TestRemoveRangeWithoutGC ensures that we do not panic when a // replica has been removed but not yet GC'd (and therefore // does not have an active raft group). func TestRemoveRangeWithoutGC(t *testing.T) { defer leaktest.AfterTest(t) mtc := startMultiTestContext(t, 2) defer mtc.Stop() // Disable the GC queue and move the range from store 0 to 1. mtc.stores[0].DisableReplicaGCQueue(true) const rangeID roachpb.RangeID = 1 mtc.replicateRange(rangeID, 1) mtc.unreplicateRange(rangeID, 0) // Wait for store 0 to process the removal. util.SucceedsWithin(t, time.Second, func() error { rep, err := mtc.stores[0].GetReplica(rangeID) if err != nil { return err } desc := rep.Desc() if len(desc.Replicas) != 1 { return util.Errorf("range has %d replicas", len(desc.Replicas)) } return nil }) // The replica's data is still on disk even though the Replica // object is removed. var desc roachpb.RangeDescriptor descKey := keys.RangeDescriptorKey(roachpb.RKeyMin) if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey, mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil { t.Fatal(err) } else if !ok { t.Fatal("expected range descriptor to be present") } // Stop and restart the store to reset the replica's raftGroup // pointer to nil. As long as the store has not been restarted it // can continue to use its last known replica ID. mtc.stopStore(0) mtc.restartStore(0) // Turn off the GC queue to ensure that the replica is deleted at // startup instead of by the scanner. This is not 100% guaranteed // since the scanner could have already run at this point, but it // should be enough to prevent us from accidentally relying on the // scanner. mtc.stores[0].DisableReplicaGCQueue(true) // The Replica object is not recreated. if _, err := mtc.stores[0].GetReplica(rangeID); err == nil { t.Fatalf("expected replica to be missing") } // And the data is no longer on disk. if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey, mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil { t.Fatal(err) } else if ok { t.Fatal("expected range descriptor to be absent") } }
func (tc *TestCluster) changeReplicas( action roachpb.ReplicaChangeType, startKey roachpb.RKey, targets ...ReplicationTarget, ) (*roachpb.RangeDescriptor, error) { rangeDesc := &roachpb.RangeDescriptor{} // TODO(andrei): the following code has been adapted from // multiTestContext.replicateRange(). Find a way to share. for _, target := range targets { // Perform a consistent read to get the updated range descriptor (as opposed // to just going to one of the stores), to make sure we have the effects of // the previous ChangeReplicas call. By the time ChangeReplicas returns the // raft leader is guaranteed to have the updated version, but followers are // not. if err := tc.Servers[0].DB().GetProto( keys.RangeDescriptorKey(startKey), rangeDesc); err != nil { return nil, err } // Ask an arbitrary replica of the range to perform the change. Note that // the target for addition/removal is specified, this is about the choice // of which replica receives the ChangeReplicas operation. store, err := tc.findMemberStore(rangeDesc.Replicas[0].StoreID) if err != nil { return nil, err } replica, err := store.GetReplica(rangeDesc.RangeID) if err != nil { return nil, err } err = replica.ChangeReplicas(context.Background(), action, roachpb.ReplicaDescriptor{ NodeID: target.NodeID, StoreID: target.StoreID, }, rangeDesc) if err != nil { return nil, err } } if err := tc.Servers[0].DB().GetProto( keys.RangeDescriptorKey(startKey), rangeDesc); err != nil { return nil, err } return rangeDesc, nil }
// TestReplicateRange verifies basic replication functionality by creating two stores // and a range, replicating the range to the second store, and reading its data there. func TestReplicateRange(t *testing.T) { defer leaktest.AfterTest(t) mtc := startMultiTestContext(t, 2) defer mtc.Stop() // Issue a command on the first node before replicating. incArgs := incrementArgs([]byte("a"), 5) if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil { t.Fatal(err) } rng, err := mtc.stores[0].GetReplica(1) if err != nil { t.Fatal(err) } if err := rng.ChangeReplicas(roachpb.ADD_REPLICA, roachpb.ReplicaDescriptor{ NodeID: mtc.stores[1].Ident.NodeID, StoreID: mtc.stores[1].Ident.StoreID, }, rng.Desc()); err != nil { t.Fatal(err) } // Verify no intent remains on range descriptor key. key := keys.RangeDescriptorKey(rng.Desc().StartKey) desc := roachpb.RangeDescriptor{} if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &desc); !ok || err != nil { t.Fatalf("fetching range descriptor yielded %t, %s", ok, err) } // Verify that in time, no intents remain on meta addressing // keys, and that range descriptor on the meta records is correct. util.SucceedsWithin(t, 1*time.Second, func() error { meta2 := keys.Addr(keys.RangeMetaKey(roachpb.RKeyMax)) meta1 := keys.Addr(keys.RangeMetaKey(meta2)) for _, key := range []roachpb.RKey{meta2, meta1} { metaDesc := roachpb.RangeDescriptor{} if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key.AsRawKey(), mtc.stores[0].Clock().Now(), true, nil, &metaDesc); !ok || err != nil { return util.Errorf("failed to resolve %s", key.AsRawKey()) } if !reflect.DeepEqual(metaDesc, desc) { return util.Errorf("descs not equal: %+v != %+v", metaDesc, desc) } } return nil }) // Verify that the same data is available on the replica. util.SucceedsWithin(t, replicaReadTimeout, func() error { getArgs := getArgs([]byte("a")) if reply, err := client.SendWrappedWith(rg1(mtc.stores[1]), nil, roachpb.Header{ ReadConsistency: roachpb.INCONSISTENT, }, &getArgs); err != nil { return util.Errorf("failed to read data: %s", err) } else if e, v := int64(5), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e { return util.Errorf("failed to read correct data: expected %d, got %d", e, v) } return nil }) }
// SplitRange splits the range containing splitKey. // The right range created by the split starts at the split key and extends to the // original range's end key. // Returns the new descriptors of the left and right ranges. // // splitKey must correspond to a SQL table key (it must end with a family ID / // col ID). func (tc *TestCluster) SplitRange( splitKey roachpb.Key, ) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor, error) { splitRKey, err := keys.Addr(splitKey) if err != nil { return nil, nil, err } origRangeDesc, err := tc.LookupRange(splitKey) if err != nil { return nil, nil, err } if origRangeDesc.StartKey.Equal(splitRKey) { return nil, nil, errors.Errorf( "cannot split range %+v at start key %q", origRangeDesc, splitKey) } splitReq := roachpb.AdminSplitRequest{ Span: roachpb.Span{ Key: splitKey, }, SplitKey: splitKey, } _, pErr := client.SendWrapped(tc.Servers[0].GetDistSender(), nil, &splitReq) if pErr != nil { return nil, nil, errors.Errorf( "%q: split unexpected error: %s", splitReq.SplitKey, pErr) } leftRangeDesc := new(roachpb.RangeDescriptor) rightRangeDesc := new(roachpb.RangeDescriptor) if err := tc.Servers[0].DB().GetProto( keys.RangeDescriptorKey(origRangeDesc.StartKey), leftRangeDesc); err != nil { return nil, nil, errors.Wrap(err, "could not look up left-hand side descriptor") } // The split point might not be exactly the one we requested (it can be // adjusted slightly so we don't split in the middle of SQL rows). Update it // to the real point. splitRKey = leftRangeDesc.EndKey if err := tc.Servers[0].DB().GetProto( keys.RangeDescriptorKey(splitRKey), rightRangeDesc); err != nil { return nil, nil, errors.Wrap(err, "could not look up right-hand side descriptor") } return leftRangeDesc, rightRangeDesc, nil }
// TestBatchPrevNext tests batch.{Prev,Next}. func TestBatchPrevNext(t *testing.T) { defer leaktest.AfterTest(t)() loc := func(s string) string { return string(keys.RangeDescriptorKey(roachpb.RKey(s))) } span := func(strs ...string) []roachpb.Span { var r []roachpb.Span for i, str := range strs { if i%2 == 0 { r = append(r, roachpb.Span{Key: roachpb.Key(str)}) } else { r[len(r)-1].EndKey = roachpb.Key(str) } } return r } max, min := string(roachpb.RKeyMax), string(roachpb.RKeyMin) abc := span("a", "", "b", "", "c", "") testCases := []struct { spans []roachpb.Span key, expFW, expBW string }{ {spans: span("a", "c", "b", ""), key: "b", expFW: "b", expBW: "b"}, {spans: span("a", "c", "b", ""), key: "a", expFW: "a", expBW: "a"}, {spans: span("a", "c", "d", ""), key: "c", expFW: "d", expBW: "c"}, {spans: span("a", "c\x00", "d", ""), key: "c", expFW: "c", expBW: "c"}, {spans: abc, key: "b", expFW: "b", expBW: "b"}, {spans: abc, key: "b\x00", expFW: "c", expBW: "b\x00"}, {spans: abc, key: "bb", expFW: "c", expBW: "b"}, {spans: span(), key: "whatevs", expFW: max, expBW: min}, {spans: span(loc("a"), loc("c")), key: "c", expFW: "c", expBW: "c"}, {spans: span(loc("a"), loc("c")), key: "c\x00", expFW: max, expBW: "c\x00"}, } for i, test := range testCases { var ba roachpb.BatchRequest for _, span := range test.spans { args := &roachpb.ScanRequest{} args.Key, args.EndKey = span.Key, span.EndKey ba.Add(args) } if next, err := next(ba, roachpb.RKey(test.key)); err != nil { t.Errorf("%d: %v", i, err) } else if !bytes.Equal(next, roachpb.Key(test.expFW)) { t.Errorf("%d: next: expected %q, got %q", i, test.expFW, next) } if prev, err := prev(ba, roachpb.RKey(test.key)); err != nil { t.Errorf("%d: %v", i, err) } else if !bytes.Equal(prev, roachpb.Key(test.expBW)) { t.Errorf("%d: prev: expected %q, got %q", i, test.expBW, prev) } } }
// createRangeData creates sample range data in all possible areas of // the key space. Returns a slice of the encoded keys of all created // data. func createRangeData(t *testing.T, r *Replica) []engine.MVCCKey { ts0 := hlc.ZeroTimestamp ts := hlc.Timestamp{WallTime: 1} desc := r.Desc() keyTSs := []struct { key roachpb.Key ts hlc.Timestamp }{ {keys.AbortCacheKey(r.RangeID, testTxnID), ts0}, {keys.AbortCacheKey(r.RangeID, testTxnID2), ts0}, {keys.RangeFrozenStatusKey(r.RangeID), ts0}, {keys.RangeLastGCKey(r.RangeID), ts0}, {keys.RaftAppliedIndexKey(r.RangeID), ts0}, {keys.RaftTruncatedStateKey(r.RangeID), ts0}, {keys.LeaseAppliedIndexKey(r.RangeID), ts0}, {keys.RangeStatsKey(r.RangeID), ts0}, {keys.RaftHardStateKey(r.RangeID), ts0}, {keys.RaftLastIndexKey(r.RangeID), ts0}, {keys.RaftLogKey(r.RangeID, 1), ts0}, {keys.RaftLogKey(r.RangeID, 2), ts0}, {keys.RangeLastReplicaGCTimestampKey(r.RangeID), ts0}, {keys.RangeLastVerificationTimestampKey(r.RangeID), ts0}, {keys.RangeDescriptorKey(desc.StartKey), ts}, {keys.TransactionKey(roachpb.Key(desc.StartKey), uuid.NewV4()), ts0}, {keys.TransactionKey(roachpb.Key(desc.StartKey.Next()), uuid.NewV4()), ts0}, {keys.TransactionKey(fakePrevKey(desc.EndKey), uuid.NewV4()), ts0}, // TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space. // Once we have resolved https://github.com/cockroachdb/cockroach/issues/437, // replace this with something that reliably generates the first valid key in the range. //{r.Desc().StartKey.Next(), ts}, // The following line is similar to StartKey.Next() but adds more to the key to // avoid falling into the system-local space. {append(append([]byte{}, desc.StartKey...), '\x02'), ts}, {fakePrevKey(r.Desc().EndKey), ts}, } keys := []engine.MVCCKey{} for _, keyTS := range keyTSs { if err := engine.MVCCPut(context.Background(), r.store.Engine(), nil, keyTS.key, keyTS.ts, roachpb.MakeValueFromString("value"), nil); err != nil { t.Fatal(err) } keys = append(keys, engine.MVCCKey{Key: keyTS.key, Timestamp: keyTS.ts}) } return keys }
// createRangeData creates sample range data in all possible areas of // the key space. Returns a slice of the encoded keys of all created // data. func createRangeData(r *Replica, t *testing.T) []roachpb.EncodedKey { ts0 := roachpb.ZeroTimestamp ts := roachpb.Timestamp{WallTime: 1} keyTSs := []struct { key roachpb.Key ts roachpb.Timestamp }{ {keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 1, Random: 1}), ts0}, {keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 2, Random: 2}), ts0}, {keys.RaftHardStateKey(r.Desc().RangeID), ts0}, {keys.RaftLogKey(r.Desc().RangeID, 1), ts0}, {keys.RaftLogKey(r.Desc().RangeID, 2), ts0}, {keys.RangeGCMetadataKey(r.Desc().RangeID), ts0}, {keys.RangeLastVerificationTimestampKey(r.Desc().RangeID), ts0}, {keys.RangeStatsKey(r.Desc().RangeID), ts0}, {keys.RangeDescriptorKey(r.Desc().StartKey), ts}, {keys.TransactionKey(roachpb.Key(r.Desc().StartKey), []byte("1234")), ts0}, {keys.TransactionKey(roachpb.Key(r.Desc().StartKey.Next()), []byte("5678")), ts0}, {keys.TransactionKey(fakePrevKey(r.Desc().EndKey), []byte("2468")), ts0}, // TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space. // Once we have resolved https://github.com/cockroachdb/cockroach/issues/437, // replace this with something that reliably generates the first valid key in the range. //{r.Desc().StartKey.Next(), ts}, // The following line is similar to StartKey.Next() but adds more to the key to // avoid falling into the system-local space. {append(append([]byte{}, r.Desc().StartKey...), '\x01'), ts}, {fakePrevKey(r.Desc().EndKey), ts}, } keys := []roachpb.EncodedKey{} for _, keyTS := range keyTSs { if err := engine.MVCCPut(r.store.Engine(), nil, keyTS.key, keyTS.ts, roachpb.MakeValueFromString("value"), nil); err != nil { t.Fatal(err) } keys = append(keys, engine.MVCCEncodeKey(keyTS.key)) if !keyTS.ts.Equal(ts0) { keys = append(keys, engine.MVCCEncodeVersionKey(keyTS.key, keyTS.ts)) } } return keys }
func TestTruncate(t *testing.T) { defer leaktest.AfterTest(t) loc := func(s string) string { return string(keys.RangeDescriptorKey(roachpb.RKey(s))) } testCases := []struct { keys [][2]string expKeys [][2]string from, to string desc [2]string // optional, defaults to {from,to} err string }{ { // Keys inside of active range. keys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, expKeys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, from: "a", to: "q\x00", }, { // Keys outside of active range. keys: [][2]string{{"a"}, {"a", "b"}, {"q"}, {"q", "z"}}, expKeys: [][2]string{{}, {}, {}, {}}, from: "b", to: "q", }, { // Range-local keys inside of active range. keys: [][2]string{{loc("b")}, {loc("c")}}, expKeys: [][2]string{{loc("b")}, {loc("c")}}, from: "b", to: "e", }, { // Range-local key outside of active range. keys: [][2]string{{loc("a")}}, expKeys: [][2]string{{}}, from: "b", to: "e", }, { // Range-local range contained in active range. keys: [][2]string{{loc("b"), loc("e") + "\x00"}}, expKeys: [][2]string{{loc("b"), loc("e") + "\x00"}}, from: "b", to: "e\x00", }, { // Range-local range not contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, from: "b", to: "e", err: "local key range must not span ranges", }, { // Mixed range-local vs global key range. keys: [][2]string{{loc("c"), "d\x00"}}, from: "b", to: "e", err: "local key mixed with global key", }, { // Key range touching and intersecting active range. keys: [][2]string{{"a", "b"}, {"a", "c"}, {"p", "q"}, {"p", "r"}, {"a", "z"}}, expKeys: [][2]string{{}, {"b", "c"}, {"p", "q"}, {"p", "q"}, {"b", "q"}}, from: "b", to: "q", }, // Active key range is intersection of descriptor and [from,to). { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "a", to: "z", desc: [2]string{"d", "p"}, }, { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "d", to: "p", desc: [2]string{"a", "z"}, }, } for i, test := range testCases { ba := &roachpb.BatchRequest{} for _, ks := range test.keys { if len(ks[1]) > 0 { ba.Add(&roachpb.ScanRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0]), EndKey: roachpb.Key(ks[1])}, }) } else { ba.Add(&roachpb.GetRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0])}, }) } } original := proto.Clone(ba).(*roachpb.BatchRequest) desc := &roachpb.RangeDescriptor{ StartKey: roachpb.RKey(test.desc[0]), EndKey: roachpb.RKey(test.desc[1]), } if len(desc.StartKey) == 0 { desc.StartKey = roachpb.RKey(test.from) } if len(desc.EndKey) == 0 { desc.EndKey = roachpb.RKey(test.to) } rs := rSpan{key: roachpb.RKey(test.from), endKey: roachpb.RKey(test.to)} undo, num, err := truncate(ba, desc, rs) if err != nil || test.err != "" { if test.err == "" || !testutils.IsError(err, test.err) { t.Errorf("%d: %v (expected: %s)", i, err, test.err) } continue } var reqs int for j, arg := range ba.Requests { req := arg.GetInner() if h := req.Header(); !bytes.Equal(h.Key, roachpb.Key(test.expKeys[j][0])) || !bytes.Equal(h.EndKey, roachpb.Key(test.expKeys[j][1])) { t.Errorf("%d.%d: range mismatch: actual [%q,%q), wanted [%q,%q)", i, j, h.Key, h.EndKey, test.expKeys[j][0], test.expKeys[j][1]) } else if _, ok := req.(*roachpb.NoopRequest); ok != (len(h.Key) == 0) { t.Errorf("%d.%d: expected NoopRequest, got %T", i, j, req) } else if len(h.Key) != 0 { reqs++ } } if reqs != num { t.Errorf("%d: counted %d requests, but truncation indicated %d", i, reqs, num) } undo() if !reflect.DeepEqual(ba, original) { t.Errorf("%d: undoing truncation failed:\nexpected: %s\nactual: %s", i, original, ba) } } }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and response caches have been properly accounted for. func TestStoreRangeSplit(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() rangeID := roachpb.RangeID(1) splitKey := roachpb.RKey("m") content := roachpb.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs := putArgs([]byte("c"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("x"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } // Increments are a good way of testing the response cache. Up here, we // address them to the original range, then later to the one that contains // the key. lCmdID := roachpb.ClientCmdID{WallTime: 123, Random: 423} lIncArgs := incrementArgs([]byte("apoptosis"), 100) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: lCmdID, }, &lIncArgs); err != nil { t.Fatal(err) } rIncArgs := incrementArgs([]byte("wobble"), 10) rCmdID := roachpb.ClientCmdID{WallTime: 12, Random: 42} if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: rCmdID, }, &rIncArgs); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args := adminSplitArgs(roachpb.RKeyMin, splitKey) if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(splitKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupReplica(roachpb.RKeyMin, nil) newRng := store.LookupReplica([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, roachpb.RKeyMax) || !bytes.Equal(rng.Desc().StartKey, roachpb.RKeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs := getArgs([]byte("c")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*roachpb.GetResponse); !bytes.Equal(gReply.Value.GetRawBytes(), content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.GetRawBytes(), content) } gArgs = getArgs([]byte("x")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*roachpb.GetResponse); !bytes.Equal(gReply.Value.GetRawBytes(), content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.GetRawBytes(), content) } // Send out an increment request copied from above (same ClientCmdID) which // remains in the old range. if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ CmdID: lCmdID, }, &lIncArgs); err != nil { t.Fatal(err) } else if lIncReply := reply.(*roachpb.IncrementResponse); lIncReply.NewValue != 100 { t.Errorf("response cache broken in old range, expected %d but got %d", lIncArgs.Increment, lIncReply.NewValue) } // Send out the same increment copied from above (same ClientCmdID), but // now to the newly created range (which should hold that key). if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, CmdID: rCmdID, }, &rIncArgs); err != nil { t.Fatal(err) } else if rIncReply := reply.(*roachpb.IncrementResponse); rIncReply.NewValue != 10 { t.Errorf("response cache not copied correctly to new range, expected %d but got %d", rIncArgs.Increment, rIncReply.NewValue) } // Compare stats of split ranges to ensure they are non zero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RangeID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
// TestStoreVerifyKeys checks that key length is enforced and // that end keys must sort >= start. func TestStoreVerifyKeys(t *testing.T) { defer leaktest.AfterTest(t) store, _, stopper := createTestStore(t) defer stopper.Stop() tooLongKey := proto.Key(strings.Repeat("x", proto.KeyMaxLength+1)) // Start with a too-long key on a get. gArgs := getArgs(tooLongKey, 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &gArgs, Reply: gArgs.CreateReply()}); err == nil { t.Fatal("expected error for key too long") } // Try a start key == KeyMax. gArgs.Key = proto.KeyMax if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &gArgs, Reply: gArgs.CreateReply()}); err == nil { t.Fatal("expected error for start key == KeyMax") } // Try a get with an end key specified (get requires only a start key and should fail). gArgs.EndKey = proto.KeyMax if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &gArgs, Reply: gArgs.CreateReply()}); err == nil { t.Fatal("expected error for end key specified on a non-range-based operation") } // Try a scan with too-long EndKey. sArgs := scanArgs(proto.KeyMin, tooLongKey, 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &sArgs, Reply: sArgs.CreateReply()}); err == nil { t.Fatal("expected error for end key too long") } // Try a scan with end key < start key. sArgs.Key = []byte("b") sArgs.EndKey = []byte("a") if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &sArgs, Reply: sArgs.CreateReply()}); err == nil { t.Fatal("expected error for end key < start") } // Try a scan with start key == end key. sArgs.Key = []byte("a") sArgs.EndKey = sArgs.Key if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &sArgs, Reply: sArgs.CreateReply()}); err == nil { t.Fatal("expected error for start == end key") } // Try a put to meta2 key which would otherwise exceed maximum key // length, but is accepted because of the meta prefix. meta2KeyMax := keys.MakeKey(keys.Meta2Prefix, proto.KeyMax) pArgs := putArgs(meta2KeyMax, []byte("value"), 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()}); err != nil { t.Fatalf("unexpected error on put to meta2 value: %s", err) } // Try to put a range descriptor record for a start key which is // maximum length. key := append([]byte{}, proto.KeyMax...) key[len(key)-1] = 0x01 pArgs = putArgs(keys.RangeDescriptorKey(key), []byte("value"), 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()}); err != nil { t.Fatalf("unexpected error on put to range descriptor for KeyMax value: %s", err) } // Try a put to txn record for a meta2 key (note that this doesn't // actually happen in practice, as txn records are not put directly, // but are instead manipulated only through txn methods). pArgs = putArgs(keys.TransactionKey(meta2KeyMax, []byte(uuid.NewUUID4())), []byte("value"), 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()}); err != nil { t.Fatalf("unexpected error on put to txn meta2 value: %s", err) } }
func TestLogRebalances(t *testing.T) { defer leaktest.AfterTest(t)() s := server.StartTestServer(t) defer s.Stop() // Use a client to get the RangeDescriptor for the first range. We will use // this range's information to log fake rebalance events. db := s.DB() desc := &roachpb.RangeDescriptor{} if pErr := db.GetProto(keys.RangeDescriptorKey(roachpb.RKeyMin), desc); pErr != nil { t.Fatal(pErr) } // This code assumes that there is only one TestServer, and thus that // StoreID 1 is present on the testserver. If this assumption changes in the // future, *any* store will work, but a new method will need to be added to // Stores (or a creative usage of VisitStores could suffice). store, pErr := s.Stores().GetStore(roachpb.StoreID(1)) if pErr != nil { t.Fatal(pErr) } // Log several fake events using the store. logEvent := func(changeType roachpb.ReplicaChangeType) { if pErr := db.Txn(func(txn *client.Txn) *roachpb.Error { return store.LogReplicaChangeTest(txn, changeType, desc.Replicas[0], *desc) }); pErr != nil { t.Fatal(pErr) } } reg := store.Registry() checkMetrics := func(expAdds, expRemoves int64) { if a, e := reg.GetCounter("range.adds").Count(), expAdds; a != e { t.Errorf("range adds %d != expected %d", a, e) } if a, e := reg.GetCounter("range.removes").Count(), expRemoves; a != e { t.Errorf("range removes %d != expected %d", a, e) } } logEvent(roachpb.ADD_REPLICA) checkMetrics(1 /*add*/, 0 /*remove*/) logEvent(roachpb.ADD_REPLICA) checkMetrics(2 /*adds*/, 0 /*remove*/) logEvent(roachpb.REMOVE_REPLICA) checkMetrics(2 /*adds*/, 1 /*remove*/) // Open a SQL connection to verify that the events have been logged. pgURL, cleanupFn := sqlutils.PGUrl(t, s, security.RootUser, "TestLogRebalances") defer cleanupFn() sqlDB, err := sql.Open("postgres", pgURL.String()) if err != nil { t.Fatal(err) } defer sqlDB.Close() // verify that two add replica events have been logged. // TODO(mrtracy): placeholders still appear to be broken, this query should // be using a string placeholder for the eventType value. rows, err := sqlDB.Query(`SELECT rangeID, info FROM system.rangelog WHERE eventType = 'add'`) if err != nil { t.Fatal(err) } var count int for rows.Next() { count++ var rangeID int64 var infoStr sql.NullString if err := rows.Scan(&rangeID, &infoStr); err != nil { t.Fatal(err) } if a, e := roachpb.RangeID(rangeID), desc.RangeID; a != e { t.Errorf("wrong rangeID %d recorded for add event, expected %d", a, e) } // Verify that info returns a json struct. if !infoStr.Valid { t.Errorf("info not recorded for add replica of range %d", rangeID) } var info struct { AddReplica roachpb.ReplicaDescriptor UpdatedDesc roachpb.RangeDescriptor } if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil { t.Errorf("error unmarshalling info string for add replica %d: %s", rangeID, err) continue } if int64(info.UpdatedDesc.RangeID) != rangeID { t.Errorf("recorded wrong updated descriptor %s for add replica of range %d", info.UpdatedDesc, rangeID) } if a, e := info.AddReplica, desc.Replicas[0]; a != e { t.Errorf("recorded wrong updated replica %s for add replica of range %d, expected %s", a, rangeID, e) } } if rows.Err() != nil { t.Fatal(rows.Err()) } if a, e := count, 2; a != e { t.Errorf("expected %d AddReplica events logged, found %d", e, a) } // verify that one remove replica event was logged. rows, err = sqlDB.Query(`SELECT rangeID, info FROM system.rangelog WHERE eventType = 'remove'`) if err != nil { t.Fatal(err) } count = 0 for rows.Next() { count++ var rangeID int64 var infoStr sql.NullString if err := rows.Scan(&rangeID, &infoStr); err != nil { t.Fatal(err) } if a, e := roachpb.RangeID(rangeID), desc.RangeID; a != e { t.Errorf("wrong rangeID %d recorded for remove event, expected %d", a, e) } // Verify that info returns a json struct. if !infoStr.Valid { t.Errorf("info not recorded for remove replica of range %d", rangeID) } var info struct { RemovedReplica roachpb.ReplicaDescriptor UpdatedDesc roachpb.RangeDescriptor } if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil { t.Errorf("error unmarshalling info string for remove replica %d: %s", rangeID, err) continue } if int64(info.UpdatedDesc.RangeID) != rangeID { t.Errorf("recorded wrong updated descriptor %s for remove replica of range %d", info.UpdatedDesc, rangeID) } if a, e := info.RemovedReplica, desc.Replicas[0]; a != e { t.Errorf("recorded wrong updated replica %s for remove replica of range %d, expected %s", a, rangeID, e) } } if rows.Err() != nil { t.Fatal(rows.Err()) } if a, e := count, 1; a != e { t.Errorf("expected %d RemoveReplica events logged, found %d", e, a) } }
func TestTruncate(t *testing.T) { defer leaktest.AfterTest(t)() loc := func(s string) string { return string(keys.RangeDescriptorKey(roachpb.RKey(s))) } locPrefix := func(s string) string { return string(keys.MakeRangeKeyPrefix(roachpb.RKey(s))) } testCases := []struct { keys [][2]string expKeys [][2]string from, to string desc [2]string // optional, defaults to {from,to} err string }{ { // Keys inside of active range. keys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, expKeys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, from: "a", to: "q\x00", }, { // Keys outside of active range. keys: [][2]string{{"a"}, {"a", "b"}, {"q"}, {"q", "z"}}, expKeys: [][2]string{{}, {}, {}, {}}, from: "b", to: "q", }, { // Range-local keys inside of active range. keys: [][2]string{{loc("b")}, {loc("c")}}, expKeys: [][2]string{{loc("b")}, {loc("c")}}, from: "b", to: "e", }, { // Range-local key outside of active range. keys: [][2]string{{loc("a")}}, expKeys: [][2]string{{}}, from: "b", to: "e", }, { // Range-local range contained in active range. keys: [][2]string{{loc("b"), loc("e") + "\x00"}}, expKeys: [][2]string{{loc("b"), loc("e") + "\x00"}}, from: "b", to: "e\x00", }, { // Range-local range not contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{}}, from: "c", to: "e", }, { // Range-local range not contained in active range. keys: [][2]string{{loc("a"), locPrefix("b")}, {loc("e"), loc("f")}}, expKeys: [][2]string{{}, {}}, from: "b", to: "e", }, { // Range-local range partially contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{loc("a"), locPrefix("b")}}, from: "a", to: "b", }, { // Range-local range partially contained in active range. keys: [][2]string{{loc("a"), loc("b")}}, expKeys: [][2]string{{locPrefix("b"), loc("b")}}, from: "b", to: "e", }, { // Range-local range contained in active range. keys: [][2]string{{locPrefix("b"), loc("b")}}, expKeys: [][2]string{{locPrefix("b"), loc("b")}}, from: "b", to: "c", }, { // Mixed range-local vs global key range. keys: [][2]string{{loc("c"), "d\x00"}}, from: "b", to: "e", err: "local key mixed with global key", }, { // Key range touching and intersecting active range. keys: [][2]string{{"a", "b"}, {"a", "c"}, {"p", "q"}, {"p", "r"}, {"a", "z"}}, expKeys: [][2]string{{}, {"b", "c"}, {"p", "q"}, {"p", "q"}, {"b", "q"}}, from: "b", to: "q", }, // Active key range is intersection of descriptor and [from,to). { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "a", to: "z", desc: [2]string{"d", "p"}, }, { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "d", to: "p", desc: [2]string{"a", "z"}, }, } for i, test := range testCases { goldenOriginal := roachpb.BatchRequest{} for _, ks := range test.keys { if len(ks[1]) > 0 { goldenOriginal.Add(&roachpb.ResolveIntentRangeRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0]), EndKey: roachpb.Key(ks[1])}, IntentTxn: roachpb.TxnMeta{ID: uuid.NewV4()}, }) } else { goldenOriginal.Add(&roachpb.GetRequest{ Span: roachpb.Span{Key: roachpb.Key(ks[0])}, }) } } original := roachpb.BatchRequest{Requests: make([]roachpb.RequestUnion, len(goldenOriginal.Requests))} for i, request := range goldenOriginal.Requests { original.Requests[i].SetValue(request.GetInner().ShallowCopy()) } desc := &roachpb.RangeDescriptor{ StartKey: roachpb.RKey(test.desc[0]), EndKey: roachpb.RKey(test.desc[1]), } if len(desc.StartKey) == 0 { desc.StartKey = roachpb.RKey(test.from) } if len(desc.EndKey) == 0 { desc.EndKey = roachpb.RKey(test.to) } rs := roachpb.RSpan{Key: roachpb.RKey(test.from), EndKey: roachpb.RKey(test.to)} rs, err := rs.Intersect(desc) if err != nil { t.Errorf("%d: intersection failure: %v", i, err) continue } ba, num, err := truncate(original, rs) if err != nil || test.err != "" { if test.err == "" || !testutils.IsError(err, test.err) { t.Errorf("%d: %v (expected: %s)", i, err, test.err) } continue } var reqs int for j, arg := range ba.Requests { req := arg.GetInner() if h := req.Header(); !bytes.Equal(h.Key, roachpb.Key(test.expKeys[j][0])) || !bytes.Equal(h.EndKey, roachpb.Key(test.expKeys[j][1])) { t.Errorf("%d.%d: range mismatch: actual [%q,%q), wanted [%q,%q)", i, j, h.Key, h.EndKey, test.expKeys[j][0], test.expKeys[j][1]) } else if _, ok := req.(*roachpb.NoopRequest); ok != (len(h.Key) == 0) { t.Errorf("%d.%d: expected NoopRequest, got %T", i, j, req) } else if len(h.Key) != 0 { reqs++ } } if reqs != num { t.Errorf("%d: counted %d requests, but truncation indicated %d", i, reqs, num) } if !reflect.DeepEqual(original, goldenOriginal) { t.Errorf("%d: truncation mutated original:\nexpected: %s\nactual: %s", i, goldenOriginal, original) } } }
// TestStoreVerifyKeys checks that key length is enforced and // that end keys must sort >= start. func TestStoreVerifyKeys(t *testing.T) { defer leaktest.AfterTest(t) store, _, stopper := createTestStore(t) defer stopper.Stop() tooLongKey := roachpb.Key(strings.Repeat("x", roachpb.KeyMaxLength+1)) // Start with a too-long key on a get. gArgs := getArgs(tooLongKey, 1, store.StoreID()) if _, err := client.SendWrapped(store, nil, &gArgs); !testutils.IsError(err, "exceeded") { t.Fatalf("unexpected error for key too long: %v", err) } // Try a start key == KeyMax. gArgs.Key = roachpb.KeyMax if _, err := client.SendWrapped(store, nil, &gArgs); !testutils.IsError(err, "must be less than KeyMax") { t.Fatalf("expected error for start key == KeyMax: %v", err) } // Try a get with an end key specified (get requires only a start key and should fail). gArgs.EndKey = roachpb.KeyMax if _, err := client.SendWrapped(store, nil, &gArgs); !testutils.IsError(err, "must be less than KeyMax") { t.Fatalf("unexpected error for end key specified on a non-range-based operation: %v", err) } // Try a scan with too-long EndKey. sArgs := scanArgs(roachpb.KeyMin, tooLongKey, 1, store.StoreID()) if _, err := client.SendWrapped(store, nil, &sArgs); !testutils.IsError(err, "length exceeded") { t.Fatalf("unexpected error for end key too long: %v", err) } // Try a scan with end key < start key. sArgs.Key = []byte("b") sArgs.EndKey = []byte("a") if _, err := client.SendWrapped(store, nil, &sArgs); !testutils.IsError(err, "must be greater than") { t.Fatalf("unexpected error for end key < start: %v", err) } // Try a scan with start key == end key. sArgs.Key = []byte("a") sArgs.EndKey = sArgs.Key if _, err := client.SendWrapped(store, nil, &sArgs); !testutils.IsError(err, "must be greater than") { t.Fatalf("unexpected error for start == end key: %v", err) } // Try a scan with range-local start key, but "regular" end key. sArgs.Key = keys.MakeRangeKey([]byte("test"), []byte("sffx"), nil) sArgs.EndKey = []byte("z") if _, err := client.SendWrapped(store, nil, &sArgs); !testutils.IsError(err, "range-local") { t.Fatalf("unexpected error for local start, non-local end key: %v", err) } // Try a put to meta2 key which would otherwise exceed maximum key // length, but is accepted because of the meta prefix. meta2KeyMax := keys.MakeKey(keys.Meta2Prefix, roachpb.KeyMax) pArgs := putArgs(meta2KeyMax, []byte("value"), 1, store.StoreID()) if _, err := client.SendWrapped(store, nil, &pArgs); err != nil { t.Fatalf("unexpected error on put to meta2 value: %s", err) } // Try to put a range descriptor record for a start key which is // maximum length. key := append([]byte{}, roachpb.KeyMax...) key[len(key)-1] = 0x01 pArgs = putArgs(keys.RangeDescriptorKey(key), []byte("value"), 1, store.StoreID()) if _, err := client.SendWrapped(store, nil, &pArgs); err != nil { t.Fatalf("unexpected error on put to range descriptor for KeyMax value: %s", err) } // Try a put to txn record for a meta2 key (note that this doesn't // actually happen in practice, as txn records are not put directly, // but are instead manipulated only through txn methods). pArgs = putArgs(keys.TransactionKey(meta2KeyMax, []byte(uuid.NewUUID4())), []byte("value"), 1, store.StoreID()) if _, err := client.SendWrapped(store, nil, &pArgs); err != nil { t.Fatalf("unexpected error on put to txn meta2 value: %s", err) } }
// AdminMerge extends the range to subsume the range that comes next in // the key space. The range being subsumed is provided in args.SubsumedRange. // The EndKey of the subsuming range must equal the start key of the // range being subsumed. The merge is performed inside of a distributed // transaction which writes the updated range descriptor for the subsuming range // and deletes the range descriptor for the subsumed one. It also updates the // range addressing metadata. The handover of responsibility for // the reassigned key range is carried out seamlessly through a merge trigger // carried out as part of the commit of that transaction. // A merge requires that the two ranges are collocate on the same set of replicas. func (r *Range) AdminMerge(args *proto.AdminMergeRequest, reply *proto.AdminMergeResponse) { // Only allow a single split/merge per range at a time. r.metaLock.Lock() defer r.metaLock.Unlock() // Lookup subsumed range. desc := r.Desc() if desc.EndKey.Equal(proto.KeyMax) { // Noop. return } subsumedRng := r.rm.LookupRange(desc.EndKey, nil) if subsumedRng == nil { reply.SetGoError(util.Errorf("ranges not collocated; migration of ranges in anticipation of merge not yet implemented")) return } subsumedDesc := subsumedRng.Desc() // Make sure the range being subsumed follows this one. if !bytes.Equal(desc.EndKey, subsumedDesc.StartKey) { reply.SetGoError(util.Errorf("Ranges that are not adjacent cannot be merged, %s != %s", desc.EndKey, subsumedDesc.StartKey)) return } // Ensure that both ranges are collocate by intersecting the store ids from // their replicas. if !replicaSetsEqual(subsumedDesc.GetReplicas(), desc.GetReplicas()) { reply.SetGoError(util.Error("The two ranges replicas are not collocate")) return } // Init updated version of existing range descriptor. updatedDesc := *desc updatedDesc.EndKey = subsumedDesc.EndKey log.Infof("initiating a merge of %s into %s", subsumedRng, r) if err := r.rm.DB().Txn(func(txn *client.Txn) error { // Update the range descriptor for the receiving range. b := &client.Batch{} desc1Key := keys.RangeDescriptorKey(updatedDesc.StartKey) if err := updateRangeDescriptor(b, desc1Key, desc, &updatedDesc); err != nil { return err } // Remove the range descriptor for the deleted range. // TODO(bdarnell): need a conditional delete? desc2Key := keys.RangeDescriptorKey(subsumedDesc.StartKey) b.Del(desc2Key) if err := mergeRangeAddressing(b, desc, &updatedDesc); err != nil { return err } // End the transaction manually instead of letting RunTransaction // loop do it, in order to provide a merge trigger. b.InternalAddCall(proto.Call{ Args: &proto.EndTransactionRequest{ RequestHeader: proto.RequestHeader{Key: args.Key}, Commit: true, InternalCommitTrigger: &proto.InternalCommitTrigger{ MergeTrigger: &proto.MergeTrigger{ UpdatedDesc: updatedDesc, SubsumedRaftID: subsumedDesc.RaftID, }, Intents: []proto.Key{desc1Key, desc2Key}, }, }, Reply: &proto.EndTransactionResponse{}, }) return txn.Run(b) }); err != nil { reply.SetGoError(util.Errorf("merge of range %d into %d failed: %s", subsumedDesc.RaftID, desc.RaftID, err)) } }
// AdminSplit divides the range into into two ranges, using either // args.SplitKey (if provided) or an internally computed key that aims to // roughly equipartition the range by size. The split is done inside of // a distributed txn which writes updated and new range descriptors, and // updates the range addressing metadata. The handover of responsibility for // the reassigned key range is carried out seamlessly through a split trigger // carried out as part of the commit of that transaction. func (r *Range) AdminSplit(args *proto.AdminSplitRequest, reply *proto.AdminSplitResponse) { // Only allow a single split per range at a time. r.metaLock.Lock() defer r.metaLock.Unlock() // Determine split key if not provided with args. This scan is // allowed to be relatively slow because admin commands don't block // other commands. desc := r.Desc() splitKey := proto.Key(args.SplitKey) if len(splitKey) == 0 { snap := r.rm.NewSnapshot() defer snap.Close() var err error if splitKey, err = engine.MVCCFindSplitKey(snap, desc.RaftID, desc.StartKey, desc.EndKey); err != nil { reply.SetGoError(util.Errorf("unable to determine split key: %s", err)) return } } // First verify this condition so that it will not return // proto.NewRangeKeyMismatchError if splitKey equals to desc.EndKey, // otherwise it will cause infinite retry loop. if splitKey.Equal(desc.StartKey) || splitKey.Equal(desc.EndKey) { reply.SetGoError(util.Errorf("range is already split at key %s", splitKey)) return } // Verify some properties of split key. if !r.ContainsKey(splitKey) { reply.SetGoError(proto.NewRangeKeyMismatchError(splitKey, splitKey, desc)) return } if !engine.IsValidSplitKey(splitKey) { reply.SetGoError(util.Errorf("cannot split range at key %s", splitKey)) return } // Create new range descriptor with newly-allocated replica IDs and Raft IDs. newDesc, err := r.rm.NewRangeDescriptor(splitKey, desc.EndKey, desc.Replicas) if err != nil { reply.SetGoError(util.Errorf("unable to allocate new range descriptor: %s", err)) return } // Init updated version of existing range descriptor. updatedDesc := *desc updatedDesc.EndKey = splitKey log.Infof("initiating a split of %s at key %s", r, splitKey) if err = r.rm.DB().Txn(func(txn *client.Txn) error { // Create range descriptor for second half of split. // Note that this put must go first in order to locate the // transaction record on the correct range. b := &client.Batch{} desc1Key := keys.RangeDescriptorKey(newDesc.StartKey) if err := updateRangeDescriptor(b, desc1Key, nil, newDesc); err != nil { return err } // Update existing range descriptor for first half of split. desc2Key := keys.RangeDescriptorKey(updatedDesc.StartKey) if err := updateRangeDescriptor(b, desc2Key, desc, &updatedDesc); err != nil { return err } // Update range descriptor addressing record(s). if err := splitRangeAddressing(b, newDesc, &updatedDesc); err != nil { return err } if err := txn.Run(b); err != nil { return err } // Update the RangeTree. b = &client.Batch{} if err := InsertRange(txn, b, newDesc.StartKey); err != nil { return err } // End the transaction manually, instead of letting RunTransaction // loop do it, in order to provide a split trigger. b.InternalAddCall(proto.Call{ Args: &proto.EndTransactionRequest{ RequestHeader: proto.RequestHeader{Key: args.Key}, Commit: true, InternalCommitTrigger: &proto.InternalCommitTrigger{ SplitTrigger: &proto.SplitTrigger{ UpdatedDesc: updatedDesc, NewDesc: *newDesc, }, Intents: []proto.Key{desc1Key, desc2Key}, }, }, Reply: &proto.EndTransactionResponse{}, }) return txn.Run(b) }); err != nil { reply.SetGoError(util.Errorf("split at key %s failed: %s", splitKey, err)) } }
// ChangeReplicas adds or removes a replica of a range. The change is performed // in a distributed transaction and takes effect when that transaction is committed. // When removing a replica, only the NodeID and StoreID fields of the Replica are used. func (r *Range) ChangeReplicas(changeType proto.ReplicaChangeType, replica proto.Replica) error { // Only allow a single change per range at a time. r.metaLock.Lock() defer r.metaLock.Unlock() // Validate the request and prepare the new descriptor. desc := r.Desc() updatedDesc := *desc updatedDesc.Replicas = append([]proto.Replica{}, desc.Replicas...) found := -1 // tracks NodeID && StoreID nodeUsed := false // tracks NodeID only for i, existingRep := range desc.Replicas { nodeUsed = nodeUsed || existingRep.NodeID == replica.NodeID if existingRep.NodeID == replica.NodeID && existingRep.StoreID == replica.StoreID { found = i break } } if changeType == proto.ADD_REPLICA { // If the replica exists on the remote node, no matter in which store, // abort the replica add. if nodeUsed { return util.Errorf("adding replica %v which is already present in range %d", replica, desc.RaftID) } updatedDesc.Replicas = append(updatedDesc.Replicas, replica) } else if changeType == proto.REMOVE_REPLICA { // If that exact node-store combination does not have the replica, // abort the removal. if found == -1 { return util.Errorf("removing replica %v which is not present in range %d", replica, desc.RaftID) } updatedDesc.Replicas[found] = updatedDesc.Replicas[len(updatedDesc.Replicas)-1] updatedDesc.Replicas = updatedDesc.Replicas[:len(updatedDesc.Replicas)-1] } err := r.rm.DB().Txn(func(txn *client.Txn) error { // Important: the range descriptor must be the first thing touched in the transaction // so the transaction record is co-located with the range being modified. b := &client.Batch{} descKey := keys.RangeDescriptorKey(updatedDesc.StartKey) if err := updateRangeDescriptor(b, descKey, desc, &updatedDesc); err != nil { return err } // Update range descriptor addressing record(s). if err := updateRangeAddressing(b, &updatedDesc); err != nil { return err } // End the transaction manually instead of letting RunTransaction // loop do it, in order to provide a commit trigger. b.InternalAddCall(proto.Call{ Args: &proto.EndTransactionRequest{ RequestHeader: proto.RequestHeader{Key: updatedDesc.StartKey}, Commit: true, InternalCommitTrigger: &proto.InternalCommitTrigger{ ChangeReplicasTrigger: &proto.ChangeReplicasTrigger{ NodeID: replica.NodeID, StoreID: replica.StoreID, ChangeType: changeType, UpdatedReplicas: updatedDesc.Replicas, }, Intents: []proto.Key{descKey}, }, }, Reply: &proto.EndTransactionResponse{}, }) return txn.Run(b) }) if err != nil { return util.Errorf("change replicas of %d failed: %s", desc.RaftID, err) } return nil }
// TestStoreRangeMergeWithData attempts to merge two collocate ranges // each containing data. func TestStoreRangeMergeWithData(t *testing.T) { defer leaktest.AfterTest(t) content := proto.Key("testing!") store, stopper := createTestStore(t) defer stopper.Stop() aDesc, bDesc, err := createSplitRanges(store) if err != nil { t.Fatal(err) } // Write some values left and right of the proposed split key. pArgs := putArgs([]byte("aaa"), content, aDesc.RangeID, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("ccc"), content, bDesc.RangeID, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } // Confirm the values are there. gArgs := getArgs([]byte("aaa"), aDesc.RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("ccc"), bDesc.RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } // Merge the b range back into the a range. args := adminMergeArgs(proto.KeyMin, 1, store.StoreID()) if _, err := store.ExecuteCmd(context.Background(), &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []proto.Key{keys.RangeDescriptorKey(aDesc.StartKey), keys.RangeDescriptorKey(bDesc.StartKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } // Verify the merge by looking up keys from both ranges. rangeA := store.LookupRange([]byte("a"), nil) rangeB := store.LookupRange([]byte("c"), nil) if !reflect.DeepEqual(rangeA, rangeB) { t.Fatalf("ranges were not merged %+v=%+v", rangeA.Desc(), rangeB.Desc()) } if !bytes.Equal(rangeA.Desc().StartKey, proto.KeyMin) { t.Fatalf("The start key is not equal to KeyMin %q=%q", rangeA.Desc().StartKey, proto.KeyMin) } if !bytes.Equal(rangeA.Desc().EndKey, proto.KeyMax) { t.Fatalf("The end key is not equal to KeyMax %q=%q", rangeA.Desc().EndKey, proto.KeyMax) } // Try to get values from after the merge. gArgs = getArgs([]byte("aaa"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("ccc"), rangeB.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } // Put new values after the merge on both sides. pArgs = putArgs([]byte("aaaa"), content, rangeA.Desc().RangeID, store.StoreID()) if _, err = store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("cccc"), content, rangeB.Desc().RangeID, store.StoreID()) if _, err = store.ExecuteCmd(context.Background(), &pArgs); err != nil { t.Fatal(err) } // Try to get the newly placed values. gArgs = getArgs([]byte("aaaa"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } gArgs = getArgs([]byte("cccc"), rangeA.Desc().RangeID, store.StoreID()) if reply, err := store.ExecuteCmd(context.Background(), &gArgs); err != nil { t.Fatal(err) } else if gReply := reply.(*proto.GetResponse); !bytes.Equal(gReply.Value.Bytes, content) { t.Fatalf("actual value %q did not match expected value %q", gReply.Value.Bytes, content) } }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and sequence cache have been properly accounted for. func TestStoreRangeSplitIdempotency(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() rangeID := roachpb.RangeID(1) splitKey := roachpb.Key("m") content := roachpb.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs := putArgs([]byte("c"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("x"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } // Increments are a good way of testing the sequence cache. Up here, we // address them to the original range, then later to the one that contains // the key. txn := roachpb.NewTransaction("test", []byte("c"), 10, roachpb.SERIALIZABLE, store.Clock().Now(), 0) lIncArgs := incrementArgs([]byte("apoptosis"), 100) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &lIncArgs); err != nil { t.Fatal(err) } rIncArgs := incrementArgs([]byte("wobble"), 10) txn.Sequence++ if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &rIncArgs); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args := adminSplitArgs(roachpb.KeyMin, splitKey) if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(keys.Addr(splitKey))} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupReplica(roachpb.RKeyMin, nil) newRng := store.LookupReplica([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, roachpb.RKeyMax) || !bytes.Equal(rng.Desc().StartKey, roachpb.RKeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs := getArgs([]byte("c")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } gArgs = getArgs([]byte("x")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } // Send out an increment request copied from above (same txn/sequence) // which remains in the old range. _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ Txn: txn, }, &lIncArgs) if _, ok := err.(*roachpb.TransactionRetryError); !ok { t.Fatalf("unexpected sequence cache miss: %v", err) } // Send out the same increment copied from above (same txn/sequence), but // now to the newly created range (which should hold that key). _, err = client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: newRng.Desc().RangeID, Txn: txn, }, &rIncArgs) if _, ok := err.(*roachpb.TransactionRetryError); !ok { t.Fatalf("unexpected sequence cache miss: %v", err) } // Compare stats of split ranges to ensure they are non zero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), rangeID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RangeID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
// TestTruncateWithLocalSpanAndDescriptor verifies that a batch request with local keys // is truncated with a range span and the range of a descriptor found in cache. func TestTruncateWithLocalSpanAndDescriptor(t *testing.T) { defer leaktest.AfterTest(t)() g, s := makeTestGossip(t) defer s() if err := g.SetNodeDescriptor(&roachpb.NodeDescriptor{NodeID: 1}); err != nil { t.Fatal(err) } nd := &roachpb.NodeDescriptor{ NodeID: roachpb.NodeID(1), Address: util.MakeUnresolvedAddr(testAddress.Network(), testAddress.String()), } if err := g.AddInfoProto(gossip.MakeNodeIDKey(roachpb.NodeID(1)), nd, time.Hour); err != nil { t.Fatal(err) } // Fill mockRangeDescriptorDB with two descriptors. var descriptor1 = roachpb.RangeDescriptor{ RangeID: 1, StartKey: roachpb.RKeyMin, EndKey: roachpb.RKey("b"), Replicas: []roachpb.ReplicaDescriptor{ { NodeID: 1, StoreID: 1, }, }, } var descriptor2 = roachpb.RangeDescriptor{ RangeID: 2, StartKey: roachpb.RKey("b"), EndKey: roachpb.RKey("c"), Replicas: []roachpb.ReplicaDescriptor{ { NodeID: 1, StoreID: 1, }, }, } var descriptor3 = roachpb.RangeDescriptor{ RangeID: 3, StartKey: roachpb.RKey("c"), EndKey: roachpb.RKeyMax, Replicas: []roachpb.ReplicaDescriptor{ { NodeID: 1, StoreID: 1, }, }, } descDB := mockRangeDescriptorDB(func(key roachpb.RKey, _, _ bool) ([]roachpb.RangeDescriptor, *roachpb.Error) { switch { case !key.Less(roachpb.RKey("c")): return []roachpb.RangeDescriptor{descriptor3}, nil case !key.Less(roachpb.RKey("b")): return []roachpb.RangeDescriptor{descriptor2}, nil default: return []roachpb.RangeDescriptor{descriptor1}, nil } }) // Define our rpcSend stub which checks the span of the batch // requests. requests := 0 sendStub := func(_ SendOptions, _ ReplicaSlice, ba roachpb.BatchRequest, _ *rpc.Context) (*roachpb.BatchResponse, error) { h := ba.Requests[0].GetInner().Header() switch requests { case 0: wantStart := keys.RangeDescriptorKey(roachpb.RKey("a")) wantEnd := keys.MakeRangeKeyPrefix(roachpb.RKey("b")) if !(h.Key.Equal(wantStart) && h.EndKey.Equal(wantEnd)) { t.Errorf("Unexpected span [%s,%s), want [%s,%s)", h.Key, h.EndKey, wantStart, wantEnd) } case 1: wantStart := keys.MakeRangeKeyPrefix(roachpb.RKey("b")) wantEnd := keys.MakeRangeKeyPrefix(roachpb.RKey("c")) if !(h.Key.Equal(wantStart) && h.EndKey.Equal(wantEnd)) { t.Errorf("Unexpected span [%s,%s), want [%s,%s)", h.Key, h.EndKey, wantStart, wantEnd) } case 2: wantStart := keys.MakeRangeKeyPrefix(roachpb.RKey("c")) wantEnd := keys.RangeDescriptorKey(roachpb.RKey("c")) if !(h.Key.Equal(wantStart) && h.EndKey.Equal(wantEnd)) { t.Errorf("Unexpected span [%s,%s), want [%s,%s)", h.Key, h.EndKey, wantStart, wantEnd) } } requests++ batchReply := &roachpb.BatchResponse{} reply := &roachpb.ScanResponse{} batchReply.Add(reply) return batchReply, nil } ctx := &DistSenderContext{ RPCSend: sendStub, RangeDescriptorDB: descDB, } ds := NewDistSender(ctx, g) // Send a batch request contains two scans. In the first // attempt, the range of the descriptor found in the cache is // ["", "b"). The request is truncated to contain only the scan // on local keys that address up to "b". // // In the second attempt, The range of the descriptor found in // the cache is ["b", "d"), The request is truncated to contain // only the scan on local keys that address from "b" to "d". ba := roachpb.BatchRequest{} ba.Txn = &roachpb.Transaction{Name: "test"} ba.Add(roachpb.NewScan(keys.RangeDescriptorKey(roachpb.RKey("a")), keys.RangeDescriptorKey(roachpb.RKey("c")), 0)) if _, pErr := ds.Send(context.Background(), ba); pErr != nil { t.Fatal(pErr) } if want := 3; requests != want { t.Errorf("expected request to be split into %d parts, found %d", want, requests) } }
func TestTruncate(t *testing.T) { defer leaktest.AfterTest(t) loc := func(s string) string { return string(keys.RangeDescriptorKey(proto.Key(s))) } testCases := []struct { keys [][2]string expKeys [][2]string from, to string desc [2]string // optional, defaults to {from,to} }{ { // Keys inside of active range. keys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, expKeys: [][2]string{{"a", "q"}, {"c"}, {"b, e"}, {"q"}}, from: "a", to: "q\x00", }, { // Keys outside of active range. keys: [][2]string{{"a"}, {"a", "b"}, {"q"}, {"q", "z"}}, expKeys: [][2]string{{}, {}, {}, {}}, from: "b", to: "q", }, { // Range-local Keys outside of active range. keys: [][2]string{{loc("e")}, {loc("a"), loc("b")}, {loc("e"), loc("z")}}, expKeys: [][2]string{{}, {}, {}}, from: "b", to: "e", }, { // Range-local Keys overlapping active range in various ways. // TODO(tschottdorf): those aren't handled nicely but I'll address // it in #2198. Right now local ranges can wind up going all over // the place. keys: [][2]string{{loc("b")}, {loc("a"), loc("b\x00")}, {loc("c"), loc("f")}, {loc("a"), loc("z")}}, expKeys: [][2]string{{loc("b")}, {"b", loc("b\x00")}, {loc("c"), "e"}, {"b", "e"}}, from: "b", to: "e", }, { // Key range touching and intersecting active range. keys: [][2]string{{"a", "b"}, {"a", "c"}, {"p", "q"}, {"p", "r"}, {"a", "z"}}, expKeys: [][2]string{{}, {"b", "c"}, {"p", "q"}, {"p", "q"}, {"b", "q"}}, from: "b", to: "q", }, // Active key range is intersection of descriptor and [from,to). { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "a", to: "z", desc: [2]string{"d", "p"}, }, { keys: [][2]string{{"c", "q"}}, expKeys: [][2]string{{"d", "p"}}, from: "d", to: "p", desc: [2]string{"a", "z"}, }, } for i, test := range testCases { ba := &proto.BatchRequest{} for _, ks := range test.keys { if len(ks[1]) > 0 { ba.Add(&proto.ScanRequest{ RequestHeader: proto.RequestHeader{Key: proto.Key(ks[0]), EndKey: proto.Key(ks[1])}, }) } else { ba.Add(&proto.GetRequest{ RequestHeader: proto.RequestHeader{Key: proto.Key(ks[0])}, }) } } original := gogoproto.Clone(ba).(*proto.BatchRequest) desc := &proto.RangeDescriptor{ StartKey: proto.Key(test.desc[0]), EndKey: proto.Key(test.desc[1]), } if len(desc.StartKey) == 0 { desc.StartKey = proto.Key(test.from) } if len(desc.EndKey) == 0 { desc.EndKey = proto.Key(test.to) } undo, num, err := truncate(ba, desc, proto.Key(test.from), proto.Key(test.to)) if err != nil { t.Errorf("%d: %s", i, err) } var reqs int for j, arg := range ba.Requests { req := arg.GetInner() if h := req.Header(); !bytes.Equal(h.Key, proto.Key(test.expKeys[j][0])) || !bytes.Equal(h.EndKey, proto.Key(test.expKeys[j][1])) { t.Errorf("%d.%d: range mismatch: actual [%q,%q), wanted [%q,%q)", i, j, h.Key, h.EndKey, test.expKeys[j][0], test.expKeys[j][1]) } else if _, ok := req.(*proto.NoopRequest); ok != (len(h.Key) == 0) { t.Errorf("%d.%d: expected NoopRequest, got %T", i, j, req) } else if len(h.Key) != 0 { reqs++ } } if reqs != num { t.Errorf("%d: counted %d requests, but truncation indicated %d", i, reqs, num) } undo() if !reflect.DeepEqual(ba, original) { t.Errorf("%d: undoing truncation failed:\nexpected: %s\nactual: %s", i, original, ba) } } }
func snapshot( ctx context.Context, snap engine.Reader, rangeID roachpb.RangeID, eCache *raftEntryCache, startKey roachpb.RKey, ) (raftpb.Snapshot, error) { start := timeutil.Now() var snapData roachpb.RaftSnapshotData truncState, err := loadTruncatedState(ctx, snap, rangeID) if err != nil { return raftpb.Snapshot{}, err } firstIndex := truncState.Index + 1 // Read the range metadata from the snapshot instead of the members // of the Range struct because they might be changed concurrently. appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID) if err != nil { return raftpb.Snapshot{}, err } var desc roachpb.RangeDescriptor // We ignore intents on the range descriptor (consistent=false) because we // know they cannot be committed yet; operations that modify range // descriptors resolve their own intents when they commit. ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey), hlc.MaxTimestamp, false /* !consistent */, nil, &desc) if err != nil { return raftpb.Snapshot{}, errors.Errorf("failed to get desc: %s", err) } if !ok { return raftpb.Snapshot{}, errors.Errorf("couldn't find range descriptor") } // Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot() snapData.RangeDescriptor = desc // Iterate over all the data in the range, including local-only data like // the sequence cache. iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */) defer iter.Close() var alloc bufalloc.ByteAllocator for ; iter.Valid(); iter.Next() { var key engine.MVCCKey var value []byte alloc, key, value = iter.allocIterKeyValue(alloc) snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{ Key: key.Key, Value: value, Timestamp: key.Timestamp, }) } endIndex := appliedIndex + 1 snapData.LogEntries = make([][]byte, 0, endIndex-firstIndex) scanFunc := func(kv roachpb.KeyValue) (bool, error) { bytes, err := kv.Value.GetBytes() if err == nil { snapData.LogEntries = append(snapData.LogEntries, bytes) } return false, err } if err := iterateEntries(ctx, snap, rangeID, firstIndex, endIndex, scanFunc); err != nil { return raftpb.Snapshot{}, err } data, err := protoutil.Marshal(&snapData) if err != nil { return raftpb.Snapshot{}, err } // Synthesize our raftpb.ConfState from desc. var cs raftpb.ConfState for _, rep := range desc.Replicas { cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID)) } term, err := term(ctx, snap, rangeID, eCache, appliedIndex) if err != nil { return raftpb.Snapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err) } log.Infof(ctx, "generated snapshot for range %s at index %d in %s. encoded size=%d, %d KV pairs, %d log entries", rangeID, appliedIndex, timeutil.Since(start), len(data), len(snapData.KV), len(snapData.LogEntries)) return raftpb.Snapshot{ Data: data, Metadata: raftpb.SnapshotMetadata{ Index: appliedIndex, Term: term, ConfState: cs, }, }, nil }
// Snapshot implements the raft.Storage interface. // Snapshot requires that the replica lock is held. func (r *Replica) Snapshot() (raftpb.Snapshot, error) { // Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData. snap := r.store.NewSnapshot() defer snap.Close() var snapData roachpb.RaftSnapshotData firstIndex, err := r.FirstIndex() if err != nil { return raftpb.Snapshot{}, err } // Read the range metadata from the snapshot instead of the members // of the Range struct because they might be changed concurrently. appliedIndex, err := r.loadAppliedIndexLocked(snap) if err != nil { return raftpb.Snapshot{}, err } var desc roachpb.RangeDescriptor // We ignore intents on the range descriptor (consistent=false) because we // know they cannot be committed yet; operations that modify range // descriptors resolve their own intents when they commit. ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.mu.desc.StartKey), r.store.Clock().Now(), false /* !consistent */, nil, &desc) if err != nil { return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err) } if !ok { return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor") } // Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot() snapData.RangeDescriptor = desc // Iterate over all the data in the range, including local-only data like // the sequence cache. iter := newReplicaDataIterator(&desc, snap, true /* !replicatedOnly */) defer iter.Close() for ; iter.Valid(); iter.Next() { key := iter.Key() snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{ Key: key.Key, Value: iter.Value(), Timestamp: key.Timestamp, }) } entries, err := r.entries(snap, firstIndex, appliedIndex+1, 0) if err != nil { return raftpb.Snapshot{}, err } snapData.LogEntries = entries data, err := proto.Marshal(&snapData) if err != nil { return raftpb.Snapshot{}, err } // Synthesize our raftpb.ConfState from desc. var cs raftpb.ConfState for _, rep := range desc.Replicas { cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID)) } term, err := r.Term(appliedIndex) if err != nil { return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err) } return raftpb.Snapshot{ Data: data, Metadata: raftpb.SnapshotMetadata{ Index: appliedIndex, Term: term, ConfState: cs, }, }, nil }
// TestStoreRangeSplit executes a split of a range and verifies that the // resulting ranges respond to the right key ranges and that their stats // and response caches have been properly accounted for. func TestStoreRangeSplit(t *testing.T) { defer leaktest.AfterTest(t) store, stopper := createTestStore(t) defer stopper.Stop() raftID := proto.RaftID(1) splitKey := proto.Key("m") content := proto.Key("asdvb") // First, write some values left and right of the proposed split key. pArgs, pReply := putArgs([]byte("c"), content, raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: pArgs, Reply: pReply}); err != nil { t.Fatal(err) } pArgs, pReply = putArgs([]byte("x"), content, raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: pArgs, Reply: pReply}); err != nil { t.Fatal(err) } // Increments are a good way of testing the response cache. Up here, we // address them to the original range, then later to the one that contains // the key. lIncArgs, lIncReply := incrementArgs([]byte("apoptosis"), 100, raftID, store.StoreID()) lIncArgs.CmdID = proto.ClientCmdID{WallTime: 123, Random: 423} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: lIncArgs, Reply: lIncReply}); err != nil { t.Fatal(err) } rIncArgs, rIncReply := incrementArgs([]byte("wobble"), 10, raftID, store.StoreID()) rIncArgs.CmdID = proto.ClientCmdID{WallTime: 12, Random: 42} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: rIncArgs, Reply: rIncReply}); err != nil { t.Fatal(err) } // Get the original stats for key and value bytes. var ms engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), raftID, &ms); err != nil { t.Fatal(err) } keyBytes, valBytes := ms.KeyBytes, ms.ValBytes // Split the range. args, reply := adminSplitArgs(proto.KeyMin, splitKey, 1, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: args, Reply: reply}); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []proto.Key{keys.RangeDescriptorKey(proto.KeyMin), keys.RangeDescriptorKey(splitKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } rng := store.LookupRange(proto.KeyMin, nil) newRng := store.LookupRange([]byte("m"), nil) if !bytes.Equal(newRng.Desc().StartKey, splitKey) || !bytes.Equal(splitKey, rng.Desc().EndKey) { t.Errorf("ranges mismatched, wanted %q=%q=%q", newRng.Desc().StartKey, splitKey, rng.Desc().EndKey) } if !bytes.Equal(newRng.Desc().EndKey, proto.KeyMax) || !bytes.Equal(rng.Desc().StartKey, proto.KeyMin) { t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rng.Desc().StartKey, newRng.Desc().EndKey) } // Try to get values from both left and right of where the split happened. gArgs, gReply := getArgs([]byte("c"), raftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: gArgs, Reply: gReply}); err != nil || !bytes.Equal(gReply.Value.Bytes, content) { t.Fatal(err) } gArgs, gReply = getArgs([]byte("x"), newRng.Desc().RaftID, store.StoreID()) if err := store.ExecuteCmd(context.Background(), proto.Call{Args: gArgs, Reply: gReply}); err != nil || !bytes.Equal(gReply.Value.Bytes, content) { t.Fatal(err) } // Send out an increment request copied from above (same ClientCmdID) which // remains in the old range. lIncReply = &proto.IncrementResponse{} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: lIncArgs, Reply: lIncReply}); err != nil { t.Fatal(err) } if lIncReply.NewValue != 100 { t.Errorf("response cache broken in old range, expected %d but got %d", lIncArgs.Increment, lIncReply.NewValue) } // Send out the same increment copied from above (same ClientCmdID), but // now to the newly created range (which should hold that key). rIncArgs.RequestHeader.RaftID = newRng.Desc().RaftID rIncReply = &proto.IncrementResponse{} if err := store.ExecuteCmd(context.Background(), proto.Call{Args: rIncArgs, Reply: rIncReply}); err != nil { t.Fatal(err) } if rIncReply.NewValue != 10 { t.Errorf("response cache not copied correctly to new range, expected %d but got %d", rIncArgs.Increment, rIncReply.NewValue) } // Compare stats of split ranges to ensure they are non ero and // exceed the original range when summed. var left, right engine.MVCCStats if err := engine.MVCCGetRangeStats(store.Engine(), raftID, &left); err != nil { t.Fatal(err) } lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes if err := engine.MVCCGetRangeStats(store.Engine(), newRng.Desc().RaftID, &right); err != nil { t.Fatal(err) } rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes if lKeyBytes == 0 || rKeyBytes == 0 { t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes) } if lValBytes == 0 || rValBytes == 0 { t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes) } if lKeyBytes+rKeyBytes <= keyBytes { t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes) } if lValBytes+rValBytes <= valBytes { t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes) } }
// TestStoreRangeMergeWithData attempts to merge two collocate ranges // each containing data. func TestStoreRangeMergeWithData(t *testing.T) { defer leaktest.AfterTest(t) content := roachpb.Key("testing!") store, stopper := createTestStore(t) defer stopper.Stop() aDesc, bDesc, err := createSplitRanges(store) if err != nil { t.Fatal(err) } // Write some values left and right of the proposed split key. pArgs := putArgs([]byte("aaa"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("ccc"), content) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: bDesc.RangeID, }, &pArgs); err != nil { t.Fatal(err) } // Confirm the values are there. gArgs := getArgs([]byte("aaa")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } gArgs = getArgs([]byte("ccc")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: bDesc.RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } // Merge the b range back into the a range. args := adminMergeArgs(roachpb.KeyMin) if _, err := client.SendWrapped(rg1(store), nil, &args); err != nil { t.Fatal(err) } // Verify no intents remains on range descriptor keys. for _, key := range []roachpb.Key{keys.RangeDescriptorKey(aDesc.StartKey), keys.RangeDescriptorKey(bDesc.StartKey)} { if _, _, err := engine.MVCCGet(store.Engine(), key, store.Clock().Now(), true, nil); err != nil { t.Fatal(err) } } // Verify the merge by looking up keys from both ranges. rangeA := store.LookupReplica([]byte("a"), nil) rangeB := store.LookupReplica([]byte("c"), nil) rangeADesc := rangeA.Desc() rangeBDesc := rangeB.Desc() if !reflect.DeepEqual(rangeA, rangeB) { t.Fatalf("ranges were not merged %+v=%+v", rangeADesc, rangeBDesc) } if !bytes.Equal(rangeADesc.StartKey, roachpb.RKeyMin) { t.Fatalf("The start key is not equal to KeyMin %q=%q", rangeADesc.StartKey, roachpb.RKeyMin) } if !bytes.Equal(rangeADesc.EndKey, roachpb.RKeyMax) { t.Fatalf("The end key is not equal to KeyMax %q=%q", rangeADesc.EndKey, roachpb.RKeyMax) } // Try to get values from after the merge. gArgs = getArgs([]byte("aaa")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } gArgs = getArgs([]byte("ccc")) if reply, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: rangeB.RangeID, }, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } // Put new values after the merge on both sides. pArgs = putArgs([]byte("aaaa"), content) if _, err := client.SendWrapped(rg1(store), nil, &pArgs); err != nil { t.Fatal(err) } pArgs = putArgs([]byte("cccc"), content) if _, err := client.SendWrappedWith(rg1(store), nil, roachpb.Header{ RangeID: rangeB.RangeID, }, &pArgs); err != nil { t.Fatal(err) } // Try to get the newly placed values. gArgs = getArgs([]byte("aaaa")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } gArgs = getArgs([]byte("cccc")) if reply, err := client.SendWrapped(rg1(store), nil, &gArgs); err != nil { t.Fatal(err) } else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil { t.Fatal(err) } else if !bytes.Equal(replyBytes, content) { t.Fatalf("actual value %q did not match expected value %q", replyBytes, content) } }
func runDebugGCCmd(cmd *cobra.Command, args []string) error { stopper := stop.NewStopper() defer stopper.Stop() if len(args) != 1 { return errors.New("required arguments: dir") } var rangeID roachpb.RangeID if len(args) == 2 { var err error if rangeID, err = parseRangeID(args[1]); err != nil { return err } } db, err := openStore(cmd, args[0], stopper) if err != nil { return err } start := keys.RangeDescriptorKey(roachpb.RKeyMin) end := keys.RangeDescriptorKey(roachpb.RKeyMax) var descs []roachpb.RangeDescriptor if _, err := engine.MVCCIterate(context.Background(), db, start, end, hlc.MaxTimestamp, false /* !consistent */, nil, /* txn */ false /* !reverse */, func(kv roachpb.KeyValue) (bool, error) { var desc roachpb.RangeDescriptor _, suffix, _, err := keys.DecodeRangeKey(kv.Key) if err != nil { return false, err } if !bytes.Equal(suffix, keys.LocalRangeDescriptorSuffix) { return false, nil } if err := kv.Value.GetProto(&desc); err != nil { return false, err } if desc.RangeID == rangeID || rangeID == 0 { descs = append(descs, desc) } return desc.RangeID == rangeID, nil }); err != nil { return err } if len(descs) == 0 { return fmt.Errorf("no range matching the criteria found") } for _, desc := range descs { snap := db.NewSnapshot() defer snap.Close() _, info, err := storage.RunGC(context.Background(), &desc, snap, hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}, config.GCPolicy{TTLSeconds: 24 * 60 * 60 /* 1 day */}, func(_ hlc.Timestamp, _ *roachpb.Transaction, _ roachpb.PushTxnType) { }, func(_ []roachpb.Intent, _, _ bool) error { return nil }) if err != nil { return err } fmt.Printf("RangeID: %d [%s, %s):\n", desc.RangeID, desc.StartKey, desc.EndKey) _, _ = pretty.Println(info) } return nil }