Example #1
0
// requestLeaderLease sends a request to obtain or extend a leader lease for
// this replica. Unless an error is returned, the obtained lease will be valid
// for a time interval containing the requested timestamp.
func (r *Range) requestLeaderLease(timestamp proto.Timestamp) error {
	// TODO(Tobias): get duration from configuration, either as a config flag
	// or, later, dynamically adjusted.
	duration := int64(DefaultLeaderLeaseDuration)
	// Prepare a Raft command to get a leader lease for this replica.
	expiration := timestamp.Add(duration, 0)
	args := &proto.InternalLeaderLeaseRequest{
		RequestHeader: proto.RequestHeader{
			Key:       r.Desc().StartKey,
			Timestamp: timestamp,
			CmdID: proto.ClientCmdID{
				WallTime: r.rm.Clock().Now().WallTime,
				Random:   rand.Int63(),
			},
		},
		Lease: proto.Lease{
			Start:      timestamp,
			Expiration: expiration,
			RaftNodeID: r.rm.RaftNodeID(),
		},
	}
	// Send lease request directly to raft in order to skip unnecessary
	// checks from normal request machinery, (e.g. the command queue).
	errChan, pendingCmd := r.proposeRaftCommand(r.context(), args, &proto.InternalLeaderLeaseResponse{})
	var err error
	if err = <-errChan; err == nil {
		// Next if the command was committed, wait for the range to apply it.
		err = <-pendingCmd.done
	}
	return err
}
Example #2
0
// getInternal implements the actual logic of get function.
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : MVCCMetatata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : MVCCMetadata of keyB
// ...
func (mvcc *MVCC) getInternal(key Key, timestamp proto.Timestamp, txnID []byte) ([]byte, proto.Timestamp, []byte, error) {
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil || !ok {
		return nil, proto.Timestamp{}, nil, err
	}
	// If the read timestamp is greater than the latest one, we can just
	// fetch the value without a scan.
	if !timestamp.Less(meta.Timestamp) {
		if len(meta.TxnID) > 0 && (len(txnID) == 0 || !bytes.Equal(meta.TxnID, txnID)) {
			return nil, proto.Timestamp{}, nil, &writeIntentError{TxnID: meta.TxnID}
		}

		latestKey := mvccEncodeKey(key, meta.Timestamp)
		val, err := mvcc.engine.Get(latestKey)
		return val, meta.Timestamp, meta.TxnID, err
	}

	nextKey := mvccEncodeKey(key, timestamp)
	// We use the PrefixEndKey(key) as the upper bound for scan.
	// If there is no other version after nextKey, it won't return
	// the value of the next key.
	kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(key), 1)
	if len(kvs) > 0 {
		_, ts, _ := mvccDecodeKey(kvs[0].Key)
		return kvs[0].Value, ts, nil, err
	}
	return nil, proto.Timestamp{}, nil, err
}
Example #3
0
// TestCoordinatorHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestCoordinatorHeartbeat(t *testing.T) {
	db, _, manual := createTestDB(t)
	defer db.Close()

	// Set heartbeat interval to 1ms for testing.
	db.coordinator.heartbeatInterval = 1 * time.Millisecond

	txnID := engine.Key("txn")
	<-db.Put(createPutRequest(engine.Key("a"), []byte("value"), txnID))

	// Verify 3 heartbeats.
	var heartbeatTS proto.Timestamp
	for i := 0; i < 3; i++ {
		if err := util.IsTrueWithin(func() bool {
			ok, txn, err := getTxn(db, engine.MakeKey(engine.KeyLocalTransactionPrefix, txnID))
			if !ok || err != nil {
				return false
			}
			// Advance clock by 1ns.
			// Locking the coordinator to prevent a data race.
			db.coordinator.Lock()
			*manual = hlc.ManualClock(*manual + 1)
			db.coordinator.Unlock()
			if heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return true
			}
			return false
		}, 50*time.Millisecond); err != nil {
			t.Error("expected initial heartbeat within 50ms")
		}
	}
}
Example #4
0
// requestLeaderLease sends a request to obtain or extend a leader lease for
// this replica. Unless an error is returned, the obtained lease will be valid
// for a time interval containing the requested timestamp.
func (r *Replica) requestLeaderLease(timestamp proto.Timestamp) error {
	// TODO(Tobias): get duration from configuration, either as a config flag
	// or, later, dynamically adjusted.
	duration := int64(DefaultLeaderLeaseDuration)
	// Prepare a Raft command to get a leader lease for this replica.
	expiration := timestamp.Add(duration, 0)
	desc := r.Desc()
	args := &proto.LeaderLeaseRequest{
		RequestHeader: proto.RequestHeader{
			Key:       desc.StartKey,
			Timestamp: timestamp,
			CmdID: proto.ClientCmdID{
				WallTime: r.rm.Clock().Now().WallTime,
				Random:   rand.Int63(),
			},
			RangeID: desc.RangeID,
		},
		Lease: proto.Lease{
			Start:      timestamp,
			Expiration: expiration,
			RaftNodeID: r.rm.RaftNodeID(),
		},
	}
	// Send lease request directly to raft in order to skip unnecessary
	// checks from normal request machinery, (e.g. the command queue).
	// Note that the command itself isn't traced, but usually the caller
	// waiting for the result has an active Trace.
	errChan, pendingCmd := r.proposeRaftCommand(r.context(), args)
	if err := <-errChan; err != nil {
		return err
	}
	// Next if the command was committed, wait for the range to apply it.
	return (<-pendingCmd.done).Err
}
Example #5
0
// ExampleNewClock shows how to create a new
// hybrid logical clock based on the local machine's
// physical clock. The sanity checks in this example
// will, of course, not fail and the output will be
// the age of the Unix epoch in nanoseconds.
func ExampleNewClock() {
	// Initialize a new clock, using the local
	// physical clock.
	c := NewClock(UnixNano)
	// Update the state of the hybrid clock.
	s := c.Now()
	time.Sleep(50 * time.Nanosecond)
	t := proto.Timestamp{WallTime: UnixNano()}
	// The sanity checks below will usually never be triggered.

	// Timestamp implements the util.Ordered interface.
	if s.Less(t) || !t.Less(s) {
		log.Fatalf("The later timestamp is smaller than the earlier one")
	}

	if t.WallTime-s.WallTime > 0 {
		log.Fatalf("HLC timestamp %d deviates from physical clock %d", s, t)
	}

	if s.Logical > 0 {
		log.Fatalf("Trivial timestamp has logical component")
	}

	fmt.Printf("The Unix Epoch is now approximately %dns old.\n", t.WallTime)
}
Example #6
0
// Add the specified timestamp to the cache as covering the range of
// keys from start to end. If end is nil, the range covers the start
// key only. txnID is nil for no transaction. readOnly specifies
// whether the command adding this timestamp was read-only or not.
func (tc *TimestampCache) Add(start, end proto.Key, timestamp proto.Timestamp, txnID []byte, readOnly bool) {
	// This gives us a memory-efficient end key if end is empty.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}
	if tc.latest.Less(timestamp) {
		tc.latest = timestamp
	}
	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(timestamp) {
		// Check existing, overlapping entries. Remove superseded
		// entries or return without adding this entry if necessary.
		key := tc.cache.NewKey(start, end)
		for _, o := range tc.cache.GetOverlaps(start, end) {
			ce := o.Value.(cacheEntry)
			if ce.readOnly != readOnly {
				continue
			}
			if o.Key.Contains(key) && !ce.timestamp.Less(timestamp) {
				return // don't add this key; there's already a cache entry with >= timestamp.
			} else if key.Contains(o.Key) && !timestamp.Less(ce.timestamp) {
				tc.cache.Del(o.Key) // delete existing key; this cache entry supersedes.
			}
		}
		ce := cacheEntry{timestamp: timestamp, txnID: txnID, readOnly: readOnly}
		tc.cache.Add(key, ce)
	}
}
Example #7
0
// TestClock performs a complete test of all basic phenomena,
// including backward jumps in local physical time and clock offset.
func TestClock(t *testing.T) {
	m := NewManualClock(0)
	c := NewClock(m.UnixNano)
	c.SetMaxOffset(1000)
	expectedHistory := []struct {
		// The physical time that this event should take place at.
		wallClock int64
		event     Event
		// If this is a receive event, this holds the "input" timestamp.
		input *proto.Timestamp
		// The expected timestamp generated from the input.
		expected proto.Timestamp
	}{
		// A few valid steps to warm up.
		{5, SEND, nil, proto.Timestamp{WallTime: 5, Logical: 0}},
		{6, SEND, nil, proto.Timestamp{WallTime: 6, Logical: 0}},
		{10, RECV, &proto.Timestamp{WallTime: 10, Logical: 5}, proto.Timestamp{WallTime: 10, Logical: 6}},
		// Our clock mysteriously jumps back.
		{7, SEND, nil, proto.Timestamp{WallTime: 10, Logical: 7}},
		// Wall clocks coincide, but the local logical clock wins.
		{8, RECV, &proto.Timestamp{WallTime: 10, Logical: 4}, proto.Timestamp{WallTime: 10, Logical: 8}},
		// The next message comes from a faulty clock and should
		// be discarded.
		{9, RECV, &proto.Timestamp{WallTime: 1100, Logical: 888}, proto.Timestamp{WallTime: 10, Logical: 8}},
		// Wall clocks coincide, but the remote logical clock wins.
		{10, RECV, &proto.Timestamp{WallTime: 10, Logical: 99}, proto.Timestamp{WallTime: 10, Logical: 100}},
		// The physical clock has caught up and takes over.
		{11, RECV, &proto.Timestamp{WallTime: 10, Logical: 31}, proto.Timestamp{WallTime: 11, Logical: 0}},
		{11, SEND, nil, proto.Timestamp{WallTime: 11, Logical: 1}},
	}

	var current proto.Timestamp
	var err error
	for i, step := range expectedHistory {
		m.Set(step.wallClock)
		switch step.event {
		case SEND:
			current = c.Now()
		case RECV:
			fallthrough
		default:
			previous := c.Timestamp()
			current, err = c.Update(*step.input)
			if current.Equal(previous) && err == nil {
				t.Errorf("%d: clock not updated even though no error occurred", i)
			}
		}
		if !current.Equal(step.expected) {
			t.Fatalf("HLC error: %d expected %v, got %v", i, step.expected, current)
		}
	}
	c.Now()
}
Example #8
0
// Get returns the value for the key specified in the request, while
// satisfying the given timestamp condition. The key may be
// arbitrarily encoded; it will be binary-encoded to remove any
// internal null characters. If no value for the key exists, or has
// been deleted, returns nil for value.
//
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : MVCCMetatata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : MVCCMetadata of keyB
// ...
func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txn *proto.Transaction) (*proto.Value, error) {
	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil || !ok {
		return nil, err
	}
	// If the read timestamp is greater than the latest one, we can just
	// fetch the value without a scan.
	ts := proto.Timestamp{}
	var valBytes []byte
	if !timestamp.Less(meta.Timestamp) {
		if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) {
			return nil, &writeIntentError{Txn: meta.Txn}
		}

		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		valBytes, err = mvcc.engine.Get(latestKey)
		ts = meta.Timestamp
	} else {
		nextKey := mvccEncodeKey(binKey, timestamp)
		// We use the PrefixEndKey(key) as the upper bound for scan.
		// If there is no other version after nextKey, it won't return
		// the value of the next key.
		kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(binKey), 1)
		if len(kvs) == 0 {
			return nil, err
		}
		_, ts, _ = mvccDecodeKey(kvs[0].Key)
		valBytes = kvs[0].Value
	}
	if valBytes == nil {
		return nil, nil
	}
	// Unmarshal the mvcc value.
	value := &proto.MVCCValue{}
	if err := gogoproto.Unmarshal(valBytes, value); err != nil {
		return nil, err
	}
	// Set the timestamp if the value is not nil (i.e. not a deletion tombstone).
	if value.Value != nil {
		value.Value.Timestamp = &ts
	} else if !value.Deleted {
		log.Warningf("encountered MVCC value at key %q with a nil proto.Value but with !Deleted: %+v", key, value)
	}
	return value.Value, nil
}
Example #9
0
// TODO(Tobias): Turn this into a writebatch with account stats in a reusable way.
// This requires use of RocksDB's merge operator to implement increasable counters
func (mvcc *MVCC) putInternal(key Key, timestamp proto.Timestamp, value []byte, txnID []byte) error {
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil {
		return err
	}

	// In case the key metadata exists.
	if ok {
		// There is an uncommitted write intent and the current Put
		// operation does not come from the same transaction.
		// This should not happen since range should check the existing
		// write intent before executing any Put action at MVCC level.
		if len(meta.TxnID) > 0 && (len(txnID) == 0 || !bytes.Equal(meta.TxnID, txnID)) {
			return &writeIntentError{TxnID: meta.TxnID}
		}

		if meta.Timestamp.Less(timestamp) ||
			(timestamp.Equal(meta.Timestamp) && bytes.Equal(meta.TxnID, txnID)) {
			// Update MVCC metadata.
			meta = &proto.MVCCMetadata{TxnID: txnID, Timestamp: timestamp}
			if err := PutProto(mvcc.engine, key, meta); err != nil {
				return err
			}
		} else {
			// In case we receive a Put request to update an old version,
			// it must be an error since raft should handle any client
			// retry from timeout.
			return &writeTimestampTooOldError{Timestamp: meta.Timestamp}
		}
	} else { // In case the key metadata does not exist yet.
		// Create key metadata.
		meta = &proto.MVCCMetadata{TxnID: txnID, Timestamp: timestamp}
		if err := PutProto(mvcc.engine, key, meta); err != nil {
			return err
		}
	}

	// Save the value with the given version (Key + Timestamp).
	return mvcc.engine.Put(mvccEncodeKey(key, timestamp), value)
}
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)
	s := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(s.Sender)

	// Set heartbeat interval to 1ms for testing.
	s.Sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := newTxn(s.Clock, proto.Key("a"))
	call := proto.Call{
		Args:  createPutRequest(proto.Key("a"), []byte("value"), initialTxn),
		Reply: &proto.PutResponse{}}
	if err := sendCall(s.Sender, call); err != nil {
		t.Fatal(err)
	}
	*initialTxn = *call.Reply.Header().Txn

	// Verify 3 heartbeats.
	var heartbeatTS proto.Timestamp
	for i := 0; i < 3; i++ {
		if err := util.IsTrueWithin(func() bool {
			ok, txn, err := getTxn(s.Sender, initialTxn)
			if !ok || err != nil {
				return false
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			s.Sender.Lock()
			s.Manual.Increment(1)
			s.Sender.Unlock()
			if heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return true
			}
			return false
		}, 50*time.Millisecond); err != nil {
			t.Error("expected initial heartbeat within 50ms")
		}
	}
}
Example #11
0
func (kv *KeyValue) setTimestamp(t proto.Timestamp) {
	kv.Timestamp = t.GoTime()
}
Example #12
0
// putInternal adds a new timestamped value to the specified key.
// If value is nil, creates a deletion tombstone value.
func (mvcc *MVCC) putInternal(key Key, timestamp proto.Timestamp, value proto.MVCCValue, txn *proto.Transaction) error {
	if value.Value != nil && value.Value.Bytes != nil && value.Value.Integer != nil {
		return util.Errorf("key %q value contains both a byte slice and an integer value: %+v", key, value)
	}

	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, key, meta)
	if err != nil {
		return err
	}

	// Use a batch because a put involves multiple writes.
	var batch []interface{}

	// In case the key metadata exists.
	if ok {
		// There is an uncommitted write intent and the current Put
		// operation does not come from the same transaction.
		// This should not happen since range should check the existing
		// write intent before executing any Put action at MVCC level.
		if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) {
			return &writeIntentError{Txn: meta.Txn}
		}

		// We can update the current metadata only if both the timestamp
		// and epoch of the new intent are greater than or equal to
		// existing. If either of these conditions doesn't hold, it's
		// likely the case that an older RPC is arriving out of order.
		if !timestamp.Less(meta.Timestamp) && (meta.Txn == nil || txn.Epoch >= meta.Txn.Epoch) {
			// If this is an intent and timestamps have changed, need to remove old version.
			if meta.Txn != nil && !timestamp.Equal(meta.Timestamp) {
				batch = append(batch, BatchDelete(mvccEncodeKey(key, meta.Timestamp)))
			}
			meta = &proto.MVCCMetadata{Txn: txn, Timestamp: timestamp}
			batchPut, err := MakeBatchPutProto(key, meta)
			if err != nil {
				return err
			}
			batch = append(batch, batchPut)
		} else {
			// In case we receive a Put request to update an old version,
			// it must be an error since raft should handle any client
			// retry from timeout.
			return &writeTooOldError{Timestamp: meta.Timestamp, Txn: meta.Txn}
		}
	} else { // In case the key metadata does not exist yet.
		// Create key metadata.
		meta = &proto.MVCCMetadata{Txn: txn, Timestamp: timestamp}
		batchPut, err := MakeBatchPutProto(key, meta)
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
	}

	// Make sure to zero the redundant timestamp (timestamp is encoded
	// into the key, so don't need it in both places).
	if value.Value != nil {
		value.Value.Timestamp = nil
	}
	batchPut, err := MakeBatchPutProto(mvccEncodeKey(key, timestamp), &value)
	if err != nil {
		return err
	}
	batch = append(batch, batchPut)
	return mvcc.engine.WriteBatch(batch)
}