// setAppliedIndex persists the Raft applied index and the lease applied index.
func setAppliedIndex(
	ctx context.Context,
	eng engine.ReadWriter,
	ms *enginepb.MVCCStats,
	rangeID roachpb.RangeID,
	appliedIndex, leaseAppliedIndex uint64,
) error {
	var value roachpb.Value
	value.SetInt(int64(appliedIndex))
	if err := engine.MVCCPut(ctx, eng, ms,
		keys.RaftAppliedIndexKey(rangeID),
		hlc.ZeroTimestamp,
		value,
		nil /* txn */); err != nil {
		return err
	}
	value.SetInt(int64(leaseAppliedIndex))
	return engine.MVCCPut(ctx, eng, ms,
		keys.LeaseAppliedIndexKey(rangeID),
		hlc.ZeroTimestamp,
		value,
		nil /* txn */)
}
// setAppliedIndex persists a new applied index.
func setAppliedIndex(eng engine.Engine, raftID proto.RaftID, appliedIndex uint64) error {
	return engine.MVCCPut(eng, nil /* stats */, keys.RaftAppliedIndexKey(raftID),
		proto.ZeroTimestamp,
		proto.Value{Bytes: encoding.EncodeUint64(nil, appliedIndex)},
		nil /* txn */)
}
// setAppliedIndex persists a new applied index.
func setAppliedIndex(eng engine.Engine, rangeID roachpb.RangeID, appliedIndex uint64) error {
	return engine.MVCCPut(eng, nil /* stats */, keys.RaftAppliedIndexKey(rangeID),
		roachpb.ZeroTimestamp,
		roachpb.MakeValueFromBytes(encoding.EncodeUint64(nil, appliedIndex)),
		nil /* txn */)
}
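// Illustrative sketch (not CockroachDB code): the two variants above encode
// the applied index with util/encoding.EncodeUint64 rather than Value.SetInt.
// Assuming EncodeUint64 appends the value as 8 big-endian bytes (so that
// byte-wise comparison of encoded values matches numeric order), a
// standard-library equivalent looks like this:

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeUint64 appends v to b as 8 big-endian bytes, mirroring the assumed
// behavior of util/encoding.EncodeUint64.
func encodeUint64(b []byte, v uint64) []byte {
	var buf [8]byte
	binary.BigEndian.PutUint64(buf[:], v)
	return append(b, buf[:]...)
}

func main() {
	fmt.Printf("%x\n", encodeUint64(nil, 42)) // 000000000000002a
}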
// setFrozenStatus persists the frozen status for the given range.
func setFrozenStatus(
	eng engine.ReadWriter, ms *enginepb.MVCCStats, rangeID roachpb.RangeID, frozen bool,
) error {
	var val roachpb.Value
	val.SetBool(frozen)
	return engine.MVCCPut(context.Background(), eng, ms,
		keys.RangeFrozenStatusKey(rangeID), hlc.ZeroTimestamp, val, nil)
}
// setLastIndex persists a new last index.
func setLastIndex(eng engine.ReadWriter, rangeID roachpb.RangeID, lastIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(lastIndex))
	return engine.MVCCPut(context.Background(), eng, nil, keys.RaftLastIndexKey(rangeID),
		hlc.ZeroTimestamp, value, nil /* txn */)
}
// setLastIndex persists a new last index.
func setLastIndex(eng engine.Engine, rangeID roachpb.RangeID, lastIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(lastIndex))
	return engine.MVCCPut(eng, nil, keys.RaftLastIndexKey(rangeID),
		roachpb.ZeroTimestamp, value, nil /* txn */)
}
// setAppliedIndex persists a new applied index.
func setAppliedIndex(eng engine.Engine, ms *engine.MVCCStats, rangeID roachpb.RangeID, appliedIndex uint64) error {
	var value roachpb.Value
	value.SetInt(int64(appliedIndex))
	return engine.MVCCPut(eng, ms, keys.RaftAppliedIndexKey(rangeID),
		roachpb.ZeroTimestamp, value, nil /* txn */)
}
// Indirectly this tests that the transaction remembers the NodeID of the node
// being read from correctly, at least in this simple case. Not remembering the
// node would lead to thousands of transaction restarts and almost certainly a
// test timeout.
func TestUncertaintyRestarts(t *testing.T) {
	db, eng, clock, mClock, _, transport, err := createTestDB()
	if err != nil {
		t.Fatal(err)
	}
	defer transport.Close()
	// Set a large offset so that a busy restart-loop
	// really shows. Also makes sure that the values
	// we write in the future below don't actually
	// wind up in the past.
	offset := 4000 * time.Millisecond
	clock.SetMaxOffset(offset)
	key := proto.Key("key")
	value := proto.Value{
		Bytes: nil, // Set for each Put
	}
	// With the correct restart behaviour, we see only one restart
	// and the value read is the very first one (as nothing else
	// has been written).
	wantedBytes := []byte("value-0")
	txnOpts := &client.TransactionOptions{
		Name: "uncertainty",
	}
	gr := &proto.GetResponse{}
	i := -1
	tErr := db.RunTransaction(txnOpts, func(txn *client.KV) error {
		i++
		mClock.Increment(1)
		futureTS := clock.Now()
		futureTS.WallTime++
		value.Bytes = []byte(fmt.Sprintf("value-%d", i))
		err = engine.MVCCPut(eng, nil, key, futureTS, value, nil)
		if err != nil {
			t.Fatal(err)
		}
		gr.Reset()
		if err := txn.Call(proto.Get, proto.GetArgs(key), gr); err != nil {
			return err
		}
		if gr.Value == nil || !bytes.Equal(gr.Value.Bytes, wantedBytes) {
			t.Fatalf("%d: read wrong value: %v, wanted %q", i, gr.Value, wantedBytes)
		}
		return nil
	})
	if i != 1 {
		t.Errorf("txn restarted %d times, expected only one restart", i)
	}
	if tErr != nil {
		t.Fatal(tErr)
	}
}
// PutSequence writes a sequence number for the specified family.
func (rc *ResponseCache) PutSequence(e engine.Engine, family []byte, sequence int64, err error) error {
	if sequence <= 0 || len(family) == 0 {
		return errEmptyID
	}
	if !rc.shouldCacheError(err) {
		return nil
	}

	// Write the response value to the engine.
	key := keys.ResponseCacheKey(rc.rangeID, family)
	var v roachpb.Value
	v.SetInt(sequence)
	return engine.MVCCPut(e, nil /* ms */, key, roachpb.ZeroTimestamp, v, nil /* txn */)
}
// append the given entries to the raft log. Takes the previous values of
// r.mu.lastIndex and r.mu.raftLogSize, and returns new values. We do this
// rather than modifying them directly because these modifications need to be
// atomic with the commit of the batch.
func (r *Replica) append(
	ctx context.Context,
	batch engine.ReadWriter,
	prevLastIndex uint64,
	prevRaftLogSize int64,
	entries []raftpb.Entry,
) (uint64, int64, error) {
	if len(entries) == 0 {
		return prevLastIndex, prevRaftLogSize, nil
	}
	var diff enginepb.MVCCStats
	var value roachpb.Value
	for i := range entries {
		ent := &entries[i]
		key := keys.RaftLogKey(r.RangeID, ent.Index)
		if err := value.SetProto(ent); err != nil {
			return 0, 0, err
		}
		value.InitChecksum(key)
		var err error
		if ent.Index > prevLastIndex {
			err = engine.MVCCBlindPut(ctx, batch, &diff, key, hlc.ZeroTimestamp,
				value, nil /* txn */)
		} else {
			err = engine.MVCCPut(ctx, batch, &diff, key, hlc.ZeroTimestamp,
				value, nil /* txn */)
		}
		if err != nil {
			return 0, 0, err
		}
	}

	// Delete any previously appended log entries which never committed.
	lastIndex := entries[len(entries)-1].Index
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		err := engine.MVCCDelete(ctx, batch, &diff, keys.RaftLogKey(r.RangeID, i),
			hlc.ZeroTimestamp, nil /* txn */)
		if err != nil {
			return 0, 0, err
		}
	}

	if err := setLastIndex(ctx, batch, r.RangeID, lastIndex); err != nil {
		return 0, 0, err
	}

	raftLogSize := prevRaftLogSize + diff.SysBytes
	return lastIndex, raftLogSize, nil
}
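// Illustrative sketch (hypothetical stand-in, not the real engine API):
// stripped of MVCC plumbing, append is index arithmetic. Entries with an
// index above prevLastIndex are fresh writes (MVCCBlindPut), overlapping
// indexes overwrite (MVCCPut), and any old tail past the new last index is
// deleted because those entries never committed.

package main

import "fmt"

type entry struct {
	index uint64
	data  string
}

// appendEntries mirrors Replica.append's control flow over a map-based log
// and returns the new last index.
func appendEntries(log map[uint64]string, prevLastIndex uint64, entries []entry) uint64 {
	if len(entries) == 0 {
		return prevLastIndex
	}
	for _, e := range entries {
		log[e.index] = e.data // blind put vs. put is purely an engine-level optimization
	}
	lastIndex := entries[len(entries)-1].index
	// Delete any previously appended log entries which never committed.
	for i := lastIndex + 1; i <= prevLastIndex; i++ {
		delete(log, i)
	}
	return lastIndex
}

func main() {
	log := map[uint64]string{1: "a", 2: "b", 3: "c", 4: "d"}
	last := appendEntries(log, 4, []entry{{2, "b'"}, {3, "c'"}})
	fmt.Println(last, log) // 3 map[1:a 2:b' 3:c']
}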
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(t *testing.T, r *Replica) []engine.MVCCKey {
	ts0 := hlc.ZeroTimestamp
	ts := hlc.Timestamp{WallTime: 1}
	desc := r.Desc()
	keyTSs := []struct {
		key roachpb.Key
		ts  hlc.Timestamp
	}{
		{keys.AbortCacheKey(r.RangeID, testTxnID), ts0},
		{keys.AbortCacheKey(r.RangeID, testTxnID2), ts0},
		{keys.RangeFrozenStatusKey(r.RangeID), ts0},
		{keys.RangeLastGCKey(r.RangeID), ts0},
		{keys.RaftAppliedIndexKey(r.RangeID), ts0},
		{keys.RaftTruncatedStateKey(r.RangeID), ts0},
		{keys.LeaseAppliedIndexKey(r.RangeID), ts0},
		{keys.RangeStatsKey(r.RangeID), ts0},
		{keys.RaftHardStateKey(r.RangeID), ts0},
		{keys.RaftLastIndexKey(r.RangeID), ts0},
		{keys.RaftLogKey(r.RangeID, 1), ts0},
		{keys.RaftLogKey(r.RangeID, 2), ts0},
		{keys.RangeLastReplicaGCTimestampKey(r.RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.RangeID), ts0},
		{keys.RangeDescriptorKey(desc.StartKey), ts},
		{keys.TransactionKey(roachpb.Key(desc.StartKey), uuid.NewV4()), ts0},
		{keys.TransactionKey(roachpb.Key(desc.StartKey.Next()), uuid.NewV4()), ts0},
		{keys.TransactionKey(fakePrevKey(desc.EndKey), uuid.NewV4()), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, desc.StartKey...), '\x02'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []engine.MVCCKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(context.Background(), r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCKey{Key: keyTS.key, Timestamp: keyTS.ts})
	}
	return keys
}
// Indirectly this tests that the transaction remembers the NodeID of the node
// being read from correctly, at least in this simple case. Not remembering the
// node would lead to thousands of transaction restarts and almost certainly a
// test timeout.
func TestUncertaintyRestarts(t *testing.T) {
	defer leaktest.AfterTest(t)
	s := createTestDB(t)
	defer s.Stop()
	// Set a large offset so that a busy restart-loop
	// really shows. Also makes sure that the values
	// we write in the future below don't actually
	// wind up in the past.
	offset := 4000 * time.Millisecond
	s.Clock.SetMaxOffset(offset)
	key := proto.Key("key")
	value := proto.Value{
		Bytes: nil, // Set for each Put
	}
	// With the correct restart behaviour, we see only one restart
	// and the value read is the very first one (as nothing else
	// has been written).
	wantedBytes := []byte("value-0")
	i := -1
	tErr := s.DB.Txn(func(txn *client.Txn) error {
		i++
		s.Manual.Increment(1)
		futureTS := s.Clock.Now()
		futureTS.WallTime++
		value.Bytes = []byte(fmt.Sprintf("value-%d", i))
		if err := engine.MVCCPut(s.Eng, nil, key, futureTS, value, nil); err != nil {
			t.Fatal(err)
		}
		gr, err := txn.Get(key)
		if err != nil {
			return err
		}
		if !gr.Exists() || !bytes.Equal(gr.ValueBytes(), wantedBytes) {
			t.Fatalf("%d: read wrong value: %v, wanted %q", i, gr.Value, wantedBytes)
		}
		return nil
	})
	if i != 1 {
		t.Errorf("txn restarted %d times, expected only one restart", i)
	}
	if tErr != nil {
		t.Fatal(tErr)
	}
}
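// Illustrative sketch (names are not CockroachDB's): both versions of the
// test above exercise the same invariant. A value is an uncertain read when
// its timestamp falls in the half-open interval between the transaction's
// read timestamp and that timestamp plus the maximum clock offset; the 4s
// offset makes every futureTS write land in that window.

package main

import "fmt"

// isUncertain reports whether a value written at valWallTime must trigger an
// uncertainty restart for a transaction reading at txnWallTime: the value is
// in the txn's past by its own clock, but may be in its future by another
// node's clock.
func isUncertain(txnWallTime, valWallTime, maxOffset int64) bool {
	return valWallTime > txnWallTime && valWallTime <= txnWallTime+maxOffset
}

func main() {
	const maxOffset = 4000000000 // 4s in nanoseconds, as in the tests above
	fmt.Println(isUncertain(100, 101, maxOffset)) // true: restart, then read above 101
	fmt.Println(isUncertain(100, 99, maxOffset))  // false: plainly visible
}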
// createRangeData creates sample range data in all possible areas of
// the key space. Returns a slice of the encoded keys of all created
// data.
func createRangeData(r *Replica, t *testing.T) []roachpb.EncodedKey {
	ts0 := roachpb.ZeroTimestamp
	ts := roachpb.Timestamp{WallTime: 1}
	keyTSs := []struct {
		key roachpb.Key
		ts  roachpb.Timestamp
	}{
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 1, Random: 1}), ts0},
		{keys.ResponseCacheKey(r.Desc().RangeID, &roachpb.ClientCmdID{WallTime: 2, Random: 2}), ts0},
		{keys.RaftHardStateKey(r.Desc().RangeID), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 1), ts0},
		{keys.RaftLogKey(r.Desc().RangeID, 2), ts0},
		{keys.RangeGCMetadataKey(r.Desc().RangeID), ts0},
		{keys.RangeLastVerificationTimestampKey(r.Desc().RangeID), ts0},
		{keys.RangeStatsKey(r.Desc().RangeID), ts0},
		{keys.RangeDescriptorKey(r.Desc().StartKey), ts},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey), []byte("1234")), ts0},
		{keys.TransactionKey(roachpb.Key(r.Desc().StartKey.Next()), []byte("5678")), ts0},
		{keys.TransactionKey(fakePrevKey(r.Desc().EndKey), []byte("2468")), ts0},
		// TODO(bdarnell): KeyMin.Next() results in a key in the reserved system-local space.
		// Once we have resolved https://github.com/cockroachdb/cockroach/issues/437,
		// replace this with something that reliably generates the first valid key in the range.
		//{r.Desc().StartKey.Next(), ts},
		// The following line is similar to StartKey.Next() but adds more to the key to
		// avoid falling into the system-local space.
		{append(append([]byte{}, r.Desc().StartKey...), '\x01'), ts},
		{fakePrevKey(r.Desc().EndKey), ts},
	}

	keys := []roachpb.EncodedKey{}
	for _, keyTS := range keyTSs {
		if err := engine.MVCCPut(r.store.Engine(), nil, keyTS.key, keyTS.ts,
			roachpb.MakeValueFromString("value"), nil); err != nil {
			t.Fatal(err)
		}
		keys = append(keys, engine.MVCCEncodeKey(keyTS.key))
		if !keyTS.ts.Equal(ts0) {
			keys = append(keys, engine.MVCCEncodeVersionKey(keyTS.key, keyTS.ts))
		}
	}
	return keys
}
// ApplySnapshot implements the multiraft.WriteableGroupStorage interface.
func (r *Range) ApplySnapshot(snap raftpb.Snapshot) error {
	snapData := proto.RaftSnapshotData{}
	err := gogoproto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(r.Desc().RaftID)
	hardState, _, err := engine.MVCCGet(r.rm.Engine(), hardStateKey, proto.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.rm.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	for iter := newRangeDataIterator(&desc, r.rm.Engine()); iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if err := batch.Put(kv.Key, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, proto.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, proto.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RaftID)
	if err != nil {
		return err
	}

	// Copy range stats to new range.
	oldStats := r.stats
	r.stats, err = newRangeStats(desc.RaftID, batch)
	if err != nil {
		r.stats = oldStats
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, r.Desc().RaftID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}
	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
// setLastIndex persists a new last index.
func setLastIndex(eng engine.Engine, raftID proto.RaftID, lastIndex uint64) error {
	return engine.MVCCPut(eng, nil, keys.RaftLastIndexKey(raftID),
		proto.ZeroTimestamp, proto.Value{
			Bytes: encoding.EncodeUint64(nil, lastIndex),
		}, nil)
}
// applySnapshot updates the replica based on the given snapshot.
func (r *Replica) applySnapshot(snap raftpb.Snapshot) error {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return err
	}

	rangeID := r.Desc().RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(r.store.Engine(), hardStateKey, roachpb.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	batch := r.store.Engine().NewBatch()
	defer batch.Close()

	// Delete everything in the range and recreate it from the snapshot.
	iter := newReplicaDataIterator(&desc, r.store.Engine())
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return err
		}
	}

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return err
		}
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return err
	}

	if err := batch.Commit(); err != nil {
		return err
	}

	// Update the range stats.
	r.stats.Replace(newStats)

	// As outlined above, last and applied index are the same after applying
	// the snapshot.
	atomic.StoreUint64(&r.lastIndex, snap.Metadata.Index)
	atomic.StoreUint64(&r.appliedIndex, snap.Metadata.Index)

	// Atomically update the descriptor and lease.
	if err := r.setDesc(&desc); err != nil {
		return err
	}

	// Update other fields which are uninitialized or need updating.
	// This may not happen if the system config has not yet been loaded.
	// While config update will correctly set the fields, there is no order
	// guarantee in ApplySnapshot.
	// TODO: should go through the standard store lock when adding a replica.
	if err := r.updateRangeInfo(); err != nil {
		return err
	}

	atomic.StorePointer(&r.lease, unsafe.Pointer(lease))
	return nil
}
// applySnapshot updates the replica based on the given snapshot.
// Returns the new last index.
func (r *Replica) applySnapshot(batch engine.Engine, snap raftpb.Snapshot) (uint64, error) {
	snapData := roachpb.RaftSnapshotData{}
	err := proto.Unmarshal(snap.Data, &snapData)
	if err != nil {
		return 0, err
	}

	rangeID := r.RangeID

	// First, save the HardState. The HardState must not be changed
	// because it may record a previous vote cast by this node. This is
	// usually unnecessary because a snapshot is nearly always
	// accompanied by a new HardState which incorporates both our former
	// state and new information from the leader, but in the event that
	// the HardState has not changed, we want to use our own previous
	// HardState and not one that was transmitted via the snapshot.
	hardStateKey := keys.RaftHardStateKey(rangeID)
	hardState, _, err := engine.MVCCGet(batch, hardStateKey, roachpb.ZeroTimestamp,
		true /* consistent */, nil)
	if err != nil {
		return 0, err
	}

	// Extract the updated range descriptor.
	desc := snapData.RangeDescriptor

	// Delete everything in the range and recreate it from the snapshot.
	// We need to delete any old Raft log entries here because any log entries
	// that predate the snapshot will be orphaned and never truncated or GC'd.
	iter := newReplicaDataIterator(&desc, batch, false /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		if err := batch.Clear(iter.Key()); err != nil {
			return 0, err
		}
	}

	// Determine the unreplicated key prefix so we can drop any
	// unreplicated keys from the snapshot.
	unreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(desc.RangeID)

	// Write the snapshot into the range.
	for _, kv := range snapData.KV {
		if bytes.HasPrefix(kv.Key, unreplicatedPrefix) {
			continue
		}
		mvccKey := engine.MVCCKey{
			Key:       kv.Key,
			Timestamp: kv.Timestamp,
		}
		if err := batch.Put(mvccKey, kv.Value); err != nil {
			return 0, err
		}
	}

	// Write the snapshot's Raft log into the range.
	if _, err := r.append(batch, 0, snapData.LogEntries); err != nil {
		return 0, err
	}

	// Restore the saved HardState.
	if hardState == nil {
		err := engine.MVCCDelete(batch, nil, hardStateKey, roachpb.ZeroTimestamp, nil)
		if err != nil {
			return 0, err
		}
	} else {
		err := engine.MVCCPut(batch, nil, hardStateKey, roachpb.ZeroTimestamp, *hardState, nil)
		if err != nil {
			return 0, err
		}
	}

	// Read the leader lease.
	lease, err := loadLeaderLease(batch, desc.RangeID)
	if err != nil {
		return 0, err
	}

	// Load updated range stats. The local newStats variable will be assigned
	// to r.stats after the batch commits.
	newStats, err := newRangeStats(desc.RangeID, batch)
	if err != nil {
		return 0, err
	}

	// The next line sets the persisted last index to the last applied index.
	// This is not a correctness issue, but means that we may have just
	// transferred some entries we're about to re-request from the leader and
	// overwrite.
	// However, raft.MultiNode currently expects this behaviour, and the
	// performance implications are not likely to be drastic. If our feelings
	// about this ever change, we can add a LastIndex field to
	// raftpb.SnapshotMetadata.
	if err := setLastIndex(batch, rangeID, snap.Metadata.Index); err != nil {
		return 0, err
	}

	batch.Defer(func() {
		// Update the range stats.
		r.stats.Replace(newStats)

		r.mu.Lock()
		// As outlined above, last and applied index are the same after applying
		// the snapshot.
		r.mu.appliedIndex = snap.Metadata.Index
		r.mu.leaderLease = lease
		r.mu.Unlock()

		// Update other fields which are uninitialized or need updating.
		// This may not happen if the system config has not yet been loaded.
		// While config update will correctly set the fields, there is no order
		// guarantee in ApplySnapshot.
		// TODO: should go through the standard store lock when adding a replica.
		if err := r.updateRangeInfo(&desc); err != nil {
			panic(err)
		}

		// Update the range descriptor. This is done last as this is the step that
		// makes the Replica visible in the Store.
		if err := r.setDesc(&desc); err != nil {
			panic(err)
		}
	})
	return snap.Metadata.Index, nil
}
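// Illustrative sketch (hypothetical map-based store, not the real engine
// API): all three snapshot appliers above follow the same fixed ordering.
// Read and preserve the local HardState, clear all existing range data,
// write the snapshot's KVs, restore the HardState, and persist the snapshot
// index as the last index. The HardState round-trip is what guarantees a
// vote this node already cast is never lost to the snapshot.

package main

import (
	"fmt"
	"strings"
)

func applySnapshotSketch(store map[string]string, prefix string, snapKV map[string]string, snapIndex uint64) {
	hardState, ok := store[prefix+"/hardstate"]
	// Delete everything in the range...
	for k := range store {
		if strings.HasPrefix(k, prefix) {
			delete(store, k)
		}
	}
	// ...recreate it from the snapshot...
	for k, v := range snapKV {
		store[k] = v
	}
	// ...and restore the saved HardState (deleting the snapshot's copy if we
	// had none locally, as the MVCCDelete branch above does).
	if ok {
		store[prefix+"/hardstate"] = hardState
	} else {
		delete(store, prefix+"/hardstate")
	}
	// Last index == applied index after applying a snapshot.
	store[prefix+"/lastindex"] = fmt.Sprint(snapIndex)
}

func main() {
	store := map[string]string{"r1/hardstate": "vote=n2", "r1/k": "old"}
	applySnapshotSketch(store, "r1", map[string]string{"r1/k": "new"}, 7)
	fmt.Println(store) // map[r1/hardstate:vote=n2 r1/k:new r1/lastindex:7]
}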
// Put sets the value for a specified key.
func (r *Range) Put(batch engine.Engine, ms *engine.MVCCStats, args proto.PutRequest) (proto.PutResponse, error) {
	var reply proto.PutResponse
	return reply, engine.MVCCPut(batch, ms, args.Key, args.Timestamp, args.Value, args.Txn)
}
// TestUncertaintyMaxTimestampForwarding checks that when receiving an
// uncertainty restart on a node, the next attempt to read (at the increased
// timestamp) is free from uncertainty. See roachpb.Transaction for details.
func TestUncertaintyMaxTimestampForwarding(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s := createTestDB(t)
	disableOwnNodeCertain(s)
	defer s.Stop()
	// Large offset so that any value in the future is an uncertain read.
	// Also makes sure that the values we write in the future below don't
	// actually wind up in the past.
	s.Clock.SetMaxOffset(50 * time.Second)

	offsetNS := int64(100)
	keySlow := roachpb.Key("slow")
	keyFast := roachpb.Key("fast")
	valSlow := []byte("wols")
	valFast := []byte("tsaf")

	// Write keySlow at now+offset, keyFast at now+2*offset.
	futureTS := s.Clock.Now()
	futureTS.WallTime += offsetNS
	val := roachpb.MakeValueFromBytes(valSlow)
	if err := engine.MVCCPut(s.Eng, nil, keySlow, futureTS, val, nil); err != nil {
		t.Fatal(err)
	}
	futureTS.WallTime += offsetNS
	val.SetBytes(valFast)
	if err := engine.MVCCPut(s.Eng, nil, keyFast, futureTS, val, nil); err != nil {
		t.Fatal(err)
	}

	i := 0
	if tErr := s.DB.Txn(func(txn *client.Txn) *roachpb.Error {
		i++
		// The first command serves to start a Txn, fixing the timestamps.
		// There will be a restart, but this is idempotent.
		if _, pErr := txn.Scan("t", roachpb.Key("t").Next(), 0); pErr != nil {
			t.Fatal(pErr)
		}
		// This is a bit of a hack for the sake of this test: By visiting the
		// node above, we've made a note of its clock, which allows us to
		// prevent the restart. But we want to catch the restart, so reset the
		// observed timestamps.
		txn.Proto.ResetObservedTimestamps()

		// The server's clock suddenly jumps ahead of keyFast's timestamp.
		s.Manual.Set(2*offsetNS + 1)

		// Now read slowKey first. It should read at 0, catch an uncertainty error,
		// and get keySlow's timestamp in that error, but upgrade it to the larger
		// node clock (which is ahead of keyFast as well). If the last part does
		// not happen, the read of keyFast should fail (i.e. read nothing).
		// There will be exactly one restart here.
		if gr, pErr := txn.Get(keySlow); pErr != nil {
			if i != 1 {
				t.Fatalf("unexpected transaction error: %s", pErr)
			}
			return pErr
		} else if !gr.Exists() || !bytes.Equal(gr.ValueBytes(), valSlow) {
			t.Fatalf("read of %q returned %v, wanted value %q", keySlow, gr.Value, valSlow)
		}

		// The node should already be certain, so we expect no restart here
		// and to read the correct key.
		if gr, pErr := txn.Get(keyFast); pErr != nil {
			t.Fatalf("second Get failed with %s", pErr)
		} else if !gr.Exists() || !bytes.Equal(gr.ValueBytes(), valFast) {
			t.Fatalf("read of %q returned %v, wanted value %q", keyFast, gr.Value, valFast)
		}
		return nil
	}); tErr != nil {
		t.Fatal(tErr)
	}
}
// TestUncertaintyMaxTimestampForwarding checks that we correctly read from
// hosts for which we control the uncertainty by checking that when a
// transaction restarts after an uncertain read, it will also take into account
// the target node's clock at the time of the failed read when forwarding the
// read timestamp.
// This is a prerequisite for being able to prevent further uncertainty
// restarts for that node and transaction without sacrificing correctness.
// See proto.Transaction.CertainNodes for details.
func TestUncertaintyMaxTimestampForwarding(t *testing.T) {
	db, eng, clock, mClock, _, transport, err := createTestDB()
	if err != nil {
		t.Fatal(err)
	}
	defer transport.Close()
	// Large offset so that any value in the future is an uncertain read.
	// Also makes sure that the values we write in the future below don't
	// actually wind up in the past.
	clock.SetMaxOffset(50000 * time.Millisecond)

	txnOpts := &client.TransactionOptions{
		Name: "uncertainty",
	}

	offsetNS := int64(100)
	keySlow := proto.Key("slow")
	keyFast := proto.Key("fast")
	valSlow := []byte("wols")
	valFast := []byte("tsaf")

	// Write keySlow at now+offset, keyFast at now+2*offset.
	futureTS := clock.Now()
	futureTS.WallTime += offsetNS
	err = engine.MVCCPut(eng, nil, keySlow, futureTS, proto.Value{Bytes: valSlow}, nil)
	if err != nil {
		t.Fatal(err)
	}
	futureTS.WallTime += offsetNS
	err = engine.MVCCPut(eng, nil, keyFast, futureTS, proto.Value{Bytes: valFast}, nil)
	if err != nil {
		t.Fatal(err)
	}

	i := 0
	if tErr := db.RunTransaction(txnOpts, func(txn *client.KV) error {
		i++
		// The first command serves to start a Txn, fixing the timestamps.
		// There will be a restart, but this is idempotent.
		sr := &proto.ScanResponse{}
		if err = txn.Call(proto.Scan, proto.ScanArgs(proto.Key("t"), proto.Key("t"), 0), sr); err != nil {
			t.Fatal(err)
		}

		// The server's clock suddenly jumps ahead of keyFast's timestamp.
		// There will be a restart, but this is idempotent.
		mClock.Set(2*offsetNS + 1)

		// Now read slowKey first. It should read at 0, catch an uncertainty error,
		// and get keySlow's timestamp in that error, but upgrade it to the larger
		// node clock (which is ahead of keyFast as well). If the last part does
		// not happen, the read of keyFast should fail (i.e. read nothing).
		// There will be exactly one restart here.
		gr := &proto.GetResponse{}
		if err = txn.Call(proto.Get, proto.GetArgs(keySlow), gr); err != nil {
			if i != 1 {
				t.Errorf("unexpected transaction error: %v", err)
			}
			return err
		}
		if gr.Value == nil || !bytes.Equal(gr.Value.Bytes, valSlow) {
			t.Errorf("read of %q returned %v, wanted value %q", keySlow, gr.Value, valSlow)
		}
		gr.Reset()

		// The node should already be certain, so we expect no restart here
		// and to read the correct key.
		if err = txn.Call(proto.Get, proto.GetArgs(keyFast), gr); err != nil {
			t.Errorf("second Get failed with %v", err)
		}
		if gr.Value == nil || !bytes.Equal(gr.Value.Bytes, valFast) {
			t.Errorf("read of %q returned %v, wanted value %q", keyFast, gr.Value, valFast)
		}
		return nil
	}); tErr != nil {
		t.Fatal(tErr)
	}
}
// setLastIndex persists a new last index.
func setLastIndex(eng engine.Engine, rangeID roachpb.RangeID, lastIndex uint64) error {
	return engine.MVCCPut(eng, nil, keys.RaftLastIndexKey(rangeID),
		roachpb.ZeroTimestamp,
		roachpb.MakeValueFromBytes(encoding.EncodeUint64(nil, lastIndex)), nil)
}
// Put sets the value for a specified key.
func (r *Range) Put(batch engine.Engine, ms *engine.MVCCStats, args *proto.PutRequest, reply *proto.PutResponse) {
	err := engine.MVCCPut(batch, ms, args.Key, args.Timestamp, args.Value, args.Txn)
	reply.SetGoError(err)
}