// requestLeaderLease sends a request to obtain or extend a leader lease for
// this replica. Unless an error is returned, the obtained lease will be valid
// for a time interval containing the requested timestamp.
func (r *Range) requestLeaderLease(timestamp proto.Timestamp) error {
	// TODO(Tobias): get duration from configuration, either as a config flag
	// or, later, dynamically adjusted.
	duration := int64(DefaultLeaderLeaseDuration)
	// Prepare a Raft command to get a leader lease for this replica.
	expiration := timestamp.Add(duration, 0)
	args := &proto.InternalLeaderLeaseRequest{
		RequestHeader: proto.RequestHeader{
			Key:       r.Desc().StartKey,
			Timestamp: timestamp,
			CmdID: proto.ClientCmdID{
				WallTime: r.rm.Clock().Now().WallTime,
				Random:   rand.Int63(),
			},
		},
		Lease: proto.Lease{
			Start:      timestamp,
			Expiration: expiration,
			RaftNodeID: r.rm.RaftNodeID(),
		},
	}
	// Send the lease request directly to Raft in order to skip unnecessary
	// checks from the normal request machinery (e.g. the command queue).
	errChan, pendingCmd := r.proposeRaftCommand(r.context(), args, &proto.InternalLeaderLeaseResponse{})
	var err error
	if err = <-errChan; err == nil {
		// Next, if the command was committed, wait for the range to apply it.
		err = <-pendingCmd.done
	}
	return err
}
// getInternal implements the actual logic of the get function.
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : MVCCMetadata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : MVCCMetadata of keyB
// ...
func (mvcc *MVCC) getInternal(key Key, timestamp proto.Timestamp, txnID []byte) ([]byte, proto.Timestamp, []byte, error) {
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil || !ok {
		return nil, proto.Timestamp{}, nil, err
	}
	// If the read timestamp is at least as recent as the latest version's,
	// we can fetch the value directly without a scan.
	if !timestamp.Less(meta.Timestamp) {
		if len(meta.TxnID) > 0 && (len(txnID) == 0 || !bytes.Equal(meta.TxnID, txnID)) {
			return nil, proto.Timestamp{}, nil, &writeIntentError{TxnID: meta.TxnID}
		}
		latestKey := mvccEncodeKey(key, meta.Timestamp)
		val, err := mvcc.engine.Get(latestKey)
		return val, meta.Timestamp, meta.TxnID, err
	}
	nextKey := mvccEncodeKey(key, timestamp)
	// We use PrefixEndKey(key) as the upper bound for the scan.
	// If there is no other version after nextKey, the scan won't
	// return the value of the next key.
	kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(key), 1)
	if len(kvs) > 0 {
		_, ts, _ := mvccDecodeKey(kvs[0].Key)
		return kvs[0].Value, ts, nil, err
	}
	return nil, proto.Timestamp{}, nil, err
}
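Since both the fast path and the scan above depend on how versioned keys sort, here is a brief illustration (with made-up wall times) of the ordering mvccEncodeKey must produce for getInternal to work:

// Illustration (hypothetical wall times): for key "a" with versions
// written at wall times 3, 2 and 1, mvccEncodeKey must order newer
// versions before older ones:
//
//	a      -> MVCCMetadata (Timestamp: 3)
//	a_ts3  -> value written at 3
//	a_ts2  -> value written at 2
//	a_ts1  -> value written at 1
//
// A read at wall time 2 takes the scan path: it starts at the encoded
// key a_ts2 and ends at PrefixEndKey("a"), so the first result is the
// newest version at or below the read timestamp (here, value@2). A
// read at wall time 4 takes the fast path and fetches a_ts3 directly.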
// TestCoordinatorHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestCoordinatorHeartbeat(t *testing.T) {
	db, _, manual := createTestDB(t)
	defer db.Close()

	// Set heartbeat interval to 1ms for testing.
	db.coordinator.heartbeatInterval = 1 * time.Millisecond

	txnID := engine.Key("txn")
	<-db.Put(createPutRequest(engine.Key("a"), []byte("value"), txnID))

	// Verify 3 heartbeats.
	var heartbeatTS proto.Timestamp
	for i := 0; i < 3; i++ {
		if err := util.IsTrueWithin(func() bool {
			ok, txn, err := getTxn(db, engine.MakeKey(engine.KeyLocalTransactionPrefix, txnID))
			if !ok || err != nil {
				return false
			}
			// Advance clock by 1ns.
			// Locking the coordinator to prevent a data race.
			db.coordinator.Lock()
			*manual = hlc.ManualClock(*manual + 1)
			db.coordinator.Unlock()
			if heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return true
			}
			return false
		}, 50*time.Millisecond); err != nil {
			t.Error("expected initial heartbeat within 50ms")
		}
	}
}
// requestLeaderLease sends a request to obtain or extend a leader lease for
// this replica. Unless an error is returned, the obtained lease will be valid
// for a time interval containing the requested timestamp.
func (r *Replica) requestLeaderLease(timestamp proto.Timestamp) error {
	// TODO(Tobias): get duration from configuration, either as a config flag
	// or, later, dynamically adjusted.
	duration := int64(DefaultLeaderLeaseDuration)
	// Prepare a Raft command to get a leader lease for this replica.
	expiration := timestamp.Add(duration, 0)
	desc := r.Desc()
	args := &proto.LeaderLeaseRequest{
		RequestHeader: proto.RequestHeader{
			Key:       desc.StartKey,
			Timestamp: timestamp,
			CmdID: proto.ClientCmdID{
				WallTime: r.rm.Clock().Now().WallTime,
				Random:   rand.Int63(),
			},
			RangeID: desc.RangeID,
		},
		Lease: proto.Lease{
			Start:      timestamp,
			Expiration: expiration,
			RaftNodeID: r.rm.RaftNodeID(),
		},
	}
	// Send the lease request directly to Raft in order to skip unnecessary
	// checks from the normal request machinery (e.g. the command queue).
	// Note that the command itself isn't traced, but usually the caller
	// waiting for the result has an active Trace.
	errChan, pendingCmd := r.proposeRaftCommand(r.context(), args)
	if err := <-errChan; err != nil {
		return err
	}
	// Next, if the command was committed, wait for the range to apply it.
	return (<-pendingCmd.done).Err
}
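For orientation, a hedged sketch of how a caller might consult the current lease before falling back to requestLeaderLease; the getLease accessor is an assumption for illustration, not confirmed API (the Lease fields are taken from the request above):

// ensureLeaderLease is a hypothetical caller sketch: it returns nil
// if this replica already holds a lease covering the timestamp, and
// otherwise proposes a new lease via requestLeaderLease. getLease is
// an assumed accessor for the last applied lease.
func (r *Replica) ensureLeaderLease(timestamp proto.Timestamp) error {
	if lease := r.getLease(); lease != nil &&
		lease.RaftNodeID == r.rm.RaftNodeID() &&
		!timestamp.Less(lease.Start) && timestamp.Less(lease.Expiration) {
		// The existing lease is ours and covers the timestamp.
		return nil
	}
	return r.requestLeaderLease(timestamp)
}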
// ExampleNewClock shows how to create a new
// hybrid logical clock based on the local machine's
// physical clock. The sanity checks in this example
// will, of course, not fail and the output will be
// the age of the Unix epoch in nanoseconds.
func ExampleNewClock() {
	// Initialize a new clock, using the local
	// physical clock.
	c := NewClock(UnixNano)
	// Update the state of the hybrid clock.
	s := c.Now()
	time.Sleep(50 * time.Nanosecond)
	t := proto.Timestamp{WallTime: UnixNano()}
	// The sanity checks below will usually never be triggered.
	// Timestamp implements the util.Ordered interface.
	if t.Less(s) || !s.Less(t) {
		log.Fatalf("The later timestamp is smaller than the earlier one")
	}
	if s.WallTime-t.WallTime > 0 {
		log.Fatalf("HLC timestamp %d deviates from physical clock %d", s, t)
	}
	if s.Logical > 0 {
		log.Fatalf("Trivial timestamp has logical component")
	}
	fmt.Printf("The Unix Epoch is now approximately %dns old.\n", t.WallTime)
}
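Building on this, a sketch of how a timestamp carried on a message preserves causality between two hybrid clocks. It uses only the API exercised by TestClock below (NewManualClock, SetMaxOffset, Now, Update); the manual clock readings are arbitrary illustration values:

// exampleClockCausality: node B's physical clock lags node A's, but
// after folding A's timestamp in via Update, B issues timestamps that
// still order after A's event.
func exampleClockCausality() {
	mA, mB := NewManualClock(10), NewManualClock(5)
	a, b := NewClock(mA.UnixNano), NewClock(mB.UnixNano)
	a.SetMaxOffset(1000)
	b.SetMaxOffset(1000)

	// Node A timestamps an event; the timestamp travels to node B,
	// whose wall clock reads only 5.
	sent := a.Now() // {WallTime: 10, Logical: 0}
	recv, err := b.Update(sent)
	if err != nil {
		log.Fatalf("update failed: %v", err)
	}
	// Despite B's slow physical clock, B's timestamps now order after
	// A's event: prints "true true".
	fmt.Println(sent.Less(recv), recv.Less(b.Now()))
}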
// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readOnly specifies
// whether the command adding this timestamp was read-only or not.
func (tc *TimestampCache) Add(start, end proto.Key, timestamp proto.Timestamp, txnID []byte, readOnly bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		// Check existing, overlapping entries. Remove superseded
		// entries or return without adding this entry if necessary.
		key := tc.cache.NewKey(start, end)
		for _, o := range tc.cache.GetOverlaps(start, end) {
			ce := o.Value.(cacheEntry)
			if ce.readOnly != readOnly {
				continue
			}
			if o.Key.Contains(key) && !ce.timestamp.Less(timestamp) {
				// Don't add this key; there's already a cache entry
				// with an equal or newer timestamp.
				return
			} else if key.Contains(o.Key) && !timestamp.Less(ce.timestamp) {
				// Delete the existing key; this cache entry supersedes it.
				tc.cache.Del(o.Key)
			}
		}
		ce := cacheEntry{timestamp: timestamp, txnID: txnID, readOnly: readOnly}
		tc.cache.Add(key, ce)
	}
}
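To illustrate what the cache buys us, a hedged sketch of how a write arriving below a cached read timestamp would be pushed forward. GetMax is a hypothetical lookup assumed for illustration (the real read-side API may differ in name and signature):

// pushWriteTimestamp is an illustrative sketch: GetMax is an assumed
// helper returning the maximum read timestamp cached for any range
// overlapping key. If the proposed write timestamp falls below it,
// the write is pushed just past the recorded read.
func pushWriteTimestamp(tc *TimestampCache, key proto.Key, writeTS proto.Timestamp) proto.Timestamp {
	if readTS := tc.GetMax(key, nil); writeTS.Less(readTS) {
		// Writing below a recorded read would rewrite history that
		// the read already observed; bump the write timestamp.
		writeTS = readTS
		writeTS.Logical++
	}
	return writeTS
}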
// TestClock performs a complete test of all basic phenomena,
// including backward jumps in local physical time and clock offset.
func TestClock(t *testing.T) {
	m := NewManualClock(0)
	c := NewClock(m.UnixNano)
	c.SetMaxOffset(1000)
	expectedHistory := []struct {
		// The physical time that this event should take place at.
		wallClock int64
		event     Event
		// If this is a receive event, this holds the "input" timestamp.
		input *proto.Timestamp
		// The expected timestamp generated from the input.
		expected proto.Timestamp
	}{
		// A few valid steps to warm up.
		{5, SEND, nil, proto.Timestamp{WallTime: 5, Logical: 0}},
		{6, SEND, nil, proto.Timestamp{WallTime: 6, Logical: 0}},
		{10, RECV, &proto.Timestamp{WallTime: 10, Logical: 5}, proto.Timestamp{WallTime: 10, Logical: 6}},
		// Our clock mysteriously jumps back.
		{7, SEND, nil, proto.Timestamp{WallTime: 10, Logical: 7}},
		// Wall clocks coincide, but the local logical clock wins.
		{8, RECV, &proto.Timestamp{WallTime: 10, Logical: 4}, proto.Timestamp{WallTime: 10, Logical: 8}},
		// The next message comes from a faulty clock and should
		// be discarded.
		{9, RECV, &proto.Timestamp{WallTime: 1100, Logical: 888}, proto.Timestamp{WallTime: 10, Logical: 8}},
		// Wall clocks coincide, but the remote logical clock wins.
		{10, RECV, &proto.Timestamp{WallTime: 10, Logical: 99}, proto.Timestamp{WallTime: 10, Logical: 100}},
		// The physical clock has caught up and takes over.
		{11, RECV, &proto.Timestamp{WallTime: 10, Logical: 31}, proto.Timestamp{WallTime: 11, Logical: 0}},
		{11, SEND, nil, proto.Timestamp{WallTime: 11, Logical: 1}},
	}

	var current proto.Timestamp
	var err error
	for i, step := range expectedHistory {
		m.Set(step.wallClock)
		switch step.event {
		case SEND:
			current = c.Now()
		case RECV:
			fallthrough
		default:
			previous := c.Timestamp()
			current, err = c.Update(*step.input)
			if current.Equal(previous) && err == nil {
				t.Errorf("%d: clock not updated even though no error occurred", i)
			}
		}
		if !current.Equal(step.expected) {
			t.Fatalf("HLC error: %d expected %v, got %v", i, step.expected, current)
		}
	}
	c.Now()
}
// Get returns the value for the key specified in the request, while
// satisfying the given timestamp condition. The key may be
// arbitrarily encoded; it will be binary-encoded to remove any
// internal null characters. If no value exists for the key, or the
// value has been deleted, nil is returned for the value.
//
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : MVCCMetadata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : MVCCMetadata of keyB
// ...
func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txn *proto.Transaction) (*proto.Value, error) {
	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil || !ok {
		return nil, err
	}
	// If the read timestamp is at least as recent as the latest version's,
	// we can fetch the value directly without a scan.
	ts := proto.Timestamp{}
	var valBytes []byte
	if !timestamp.Less(meta.Timestamp) {
		if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) {
			return nil, &writeIntentError{Txn: meta.Txn}
		}
		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		valBytes, err = mvcc.engine.Get(latestKey)
		ts = meta.Timestamp
	} else {
		nextKey := mvccEncodeKey(binKey, timestamp)
		// We use PrefixEndKey(key) as the upper bound for the scan.
		// If there is no other version after nextKey, the scan won't
		// return the value of the next key.
		kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(binKey), 1)
		if err != nil || len(kvs) == 0 {
			return nil, err
		}
		_, ts, _ = mvccDecodeKey(kvs[0].Key)
		valBytes = kvs[0].Value
	}
	if valBytes == nil {
		return nil, nil
	}
	// Unmarshal the MVCC value.
	value := &proto.MVCCValue{}
	if err := gogoproto.Unmarshal(valBytes, value); err != nil {
		return nil, err
	}
	// Set the timestamp if the value is not nil (i.e. not a deletion tombstone).
	if value.Value != nil {
		value.Value.Timestamp = &ts
	} else if !value.Deleted {
		log.Warningf("encountered MVCC value at key %q with a nil proto.Value but with !Deleted: %+v", key, value)
	}
	return value.Value, nil
}
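A short worked sketch of the read path above: two versions are written and read back at intermediate timestamps. The Put signature is assumed to mirror Get's for illustration and may differ from the actual method:

// exampleVersionedReads writes versions of "a" at wall times 1 and 3,
// then reads at 2 (scan path, yields the version at 1) and at 4 (fast
// path, yields the latest version, written at 3). Put's signature is
// an assumption here.
func exampleVersionedReads(mvcc *MVCC) error {
	key := Key("a")
	makeTS := func(wall int64) proto.Timestamp { return proto.Timestamp{WallTime: wall} }
	for _, wall := range []int64{1, 3} {
		value := proto.Value{Bytes: []byte(fmt.Sprintf("v@%d", wall))}
		if err := mvcc.Put(key, makeTS(wall), value, nil); err != nil {
			return err
		}
	}
	for _, wall := range []int64{2, 4} {
		val, err := mvcc.Get(key, makeTS(wall), nil)
		if err != nil {
			return err
		}
		fmt.Printf("read@%d -> %s\n", wall, val.Bytes) // read@2 -> v@1, read@4 -> v@3
	}
	return nil
}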
// TODO(Tobias): Turn this into a write batch with account stats in a
// reusable way. This requires use of RocksDB's merge operator to
// implement increasable counters.
func (mvcc *MVCC) putInternal(key Key, timestamp proto.Timestamp, value []byte, txnID []byte) error {
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil {
		return err
	}
	if ok {
		// The key metadata exists. If there is an uncommitted write
		// intent and the current Put operation does not come from the
		// same transaction, return an error. This should not happen,
		// since the range checks for an existing write intent before
		// executing any Put action at the MVCC level.
		if len(meta.TxnID) > 0 && (len(txnID) == 0 || !bytes.Equal(meta.TxnID, txnID)) {
			return &writeIntentError{TxnID: meta.TxnID}
		}
		if meta.Timestamp.Less(timestamp) ||
			(timestamp.Equal(meta.Timestamp) && bytes.Equal(meta.TxnID, txnID)) {
			// Update the MVCC metadata.
			meta = &proto.MVCCMetadata{TxnID: txnID, Timestamp: timestamp}
			if err := PutProto(mvcc.engine, key, meta); err != nil {
				return err
			}
		} else {
			// A Put request to update an old version is an error, since
			// Raft should handle any client retry from timeout.
			return &writeTimestampTooOldError{Timestamp: meta.Timestamp}
		}
	} else {
		// The key metadata does not exist yet; create it.
		meta = &proto.MVCCMetadata{TxnID: txnID, Timestamp: timestamp}
		if err := PutProto(mvcc.engine, key, meta); err != nil {
			return err
		}
	}
	// Save the value with the given version (Key + Timestamp).
	return mvcc.engine.Put(mvccEncodeKey(key, timestamp), value)
}
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)
	s := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(s.Sender)

	// Set heartbeat interval to 1ms for testing.
	s.Sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := newTxn(s.Clock, proto.Key("a"))
	call := proto.Call{
		Args:  createPutRequest(proto.Key("a"), []byte("value"), initialTxn),
		Reply: &proto.PutResponse{},
	}
	if err := sendCall(s.Sender, call); err != nil {
		t.Fatal(err)
	}
	*initialTxn = *call.Reply.Header().Txn

	// Verify 3 heartbeats.
	var heartbeatTS proto.Timestamp
	for i := 0; i < 3; i++ {
		if err := util.IsTrueWithin(func() bool {
			ok, txn, err := getTxn(s.Sender, initialTxn)
			if !ok || err != nil {
				return false
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			s.Sender.Lock()
			s.Manual.Increment(1)
			s.Sender.Unlock()
			if heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return true
			}
			return false
		}, 50*time.Millisecond); err != nil {
			t.Error("expected initial heartbeat within 50ms")
		}
	}
}
// setTimestamp sets the KeyValue's timestamp to the Go time
// equivalent of the given proto.Timestamp.
func (kv *KeyValue) setTimestamp(t proto.Timestamp) {
	kv.Timestamp = t.GoTime()
}
// putInternal adds a new timestamped value to the specified key.
// If value is nil, creates a deletion tombstone value.
func (mvcc *MVCC) putInternal(key Key, timestamp proto.Timestamp, value proto.MVCCValue, txn *proto.Transaction) error {
	if value.Value != nil && value.Value.Bytes != nil && value.Value.Integer != nil {
		return util.Errorf("key %q value contains both a byte slice and an integer value: %+v", key, value)
	}

	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil {
		return err
	}
	// Use a batch because a put involves multiple writes.
	var batch []interface{}
	if ok {
		// The key metadata exists. If there is an uncommitted write
		// intent and the current Put operation does not come from the
		// same transaction, return an error. This should not happen,
		// since the range checks for an existing write intent before
		// executing any Put action at the MVCC level.
		if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) {
			return &writeIntentError{Txn: meta.Txn}
		}
		// We can update the current metadata only if both the timestamp
		// and epoch of the new intent are greater than or equal to
		// existing. If either of these conditions doesn't hold, it's
		// likely the case that an older RPC is arriving out of order.
		if !timestamp.Less(meta.Timestamp) && (meta.Txn == nil || txn.Epoch >= meta.Txn.Epoch) {
			// If this is an intent and timestamps have changed, we need
			// to remove the old version.
			if meta.Txn != nil && !timestamp.Equal(meta.Timestamp) {
				batch = append(batch, BatchDelete(mvccEncodeKey(key, meta.Timestamp)))
			}
			meta = &proto.MVCCMetadata{Txn: txn, Timestamp: timestamp}
			batchPut, err := MakeBatchPutProto(key, meta)
			if err != nil {
				return err
			}
			batch = append(batch, batchPut)
		} else {
			// A Put request to update an old version is an error, since
			// Raft should handle any client retry from timeout.
			return &writeTooOldError{Timestamp: meta.Timestamp, Txn: meta.Txn}
		}
	} else {
		// The key metadata does not exist yet; create it.
		meta = &proto.MVCCMetadata{Txn: txn, Timestamp: timestamp}
		batchPut, err := MakeBatchPutProto(key, meta)
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
	}
	// Make sure to zero the redundant timestamp (the timestamp is
	// encoded into the key, so we don't need it in both places).
	if value.Value != nil {
		value.Value.Timestamp = nil
	}
	batchPut, err := MakeBatchPutProto(mvccEncodeKey(key, timestamp), &value)
	if err != nil {
		return err
	}
	batch = append(batch, batchPut)
	return mvcc.engine.WriteBatch(batch)
}
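A hedged sketch of the write-intent conflict the first check above guards against; the transaction literals and the helper function are illustrative only:

// exampleIntentConflict: txn A writes an intent, and a subsequent put
// by txn B on the same key fails with a writeIntentError until A's
// intent is resolved, regardless of B's (later) timestamp.
func exampleIntentConflict(mvcc *MVCC) error {
	key := Key("k")
	ts := proto.Timestamp{WallTime: 1}
	txnA := &proto.Transaction{ID: []byte("A")}
	txnB := &proto.Transaction{ID: []byte("B")}
	val := proto.MVCCValue{Value: &proto.Value{Bytes: []byte("v")}}

	// Txn A writes an intent at ts; the metadata now records txn A.
	if err := mvcc.putInternal(key, ts, val, txnA); err != nil {
		return err
	}
	// Txn B's put on the same key observes A's intent and fails.
	err := mvcc.putInternal(key, ts.Add(1, 0), val, txnB)
	fmt.Printf("conflicting put: %v\n", err) // expected: a writeIntentError
	return nil
}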