Example #1
0
// ResolveWriteIntentRange commits or aborts (rolls back) the range of
// write intents specified by start and end keys for a given txnID
// according to commit parameter.  ResolveWriteIntentRange will skip
// write intents of other txnIDs. Specify max=0 for unbounded
// resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID []byte, commit bool) (int64, error) {
	if len(txnID) == 0 {
		return 0, util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	nextKey := binKey

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exists in the given range.
		if len(kvs) == 0 {
			break
		}

		remainder, currentKey := encoding.DecodeBinary(kvs[0].Key)
		if len(remainder) != 0 {
			return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key)
		}
		_, _, existingTxnID, err := mvcc.getInternal(kvs[0].Key, proto.MaxTimestamp, txnID)
		// Return the error unless its a writeIntentError, which
		// will occur in the event we scan a key with a write
		// intent belonging to a different transaction.
		if _, ok := err.(*writeIntentError); err != nil && !ok {
			return num, err
		}
		// ResolveWriteIntent only needs to deal with the write
		// intents for the given txnID.
		if err == nil && bytes.Equal(existingTxnID, txnID) {
			// commits or aborts (rolls back) the write intent of
			// the given txnID.
			err = mvcc.ResolveWriteIntent(currentKey, txnID, commit)
			if err != nil {
				return num, err
			}
			num++
		}

		if max != 0 && max == num {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key; refer to Scan for details.
		nextKey = encoding.EncodeBinary(nil, NextKey(currentKey))
	}

	return num, nil
}
Example #2
0
// Scan scans the key range specified by start key through end key up
// to some maximum number of results. Specify max=0 for unbounded scans.
func (mvcc *MVCC) Scan(key Key, endKey Key, max int64, timestamp proto.Timestamp, txn *proto.Transaction) ([]proto.KeyValue, error) {
	binKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	nextKey := binKey

	res := []proto.KeyValue{}
	for {
		kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1)
		if err != nil {
			return nil, err
		}
		// No more keys exists in the given range.
		if len(kvs) == 0 {
			break
		}

		remainder, currentKey := encoding.DecodeBinary(kvs[0].Key)
		if len(remainder) != 0 {
			return nil, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key)
		}
		value, err := mvcc.Get(currentKey, timestamp, txn)
		if err != nil {
			return res, err
		}

		if value != nil {
			res = append(res, proto.KeyValue{Key: currentKey, Value: *value})
		}

		if max != 0 && max == int64(len(res)) {
			break
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key, we move instead to the next
		// highest key and the for loop continues by scanning again
		// with nextKey.
		// Let's say you have:
		// a
		// a<T=2>
		// a<T=1>
		// aa
		// aa<T=3>
		// aa<T=2>
		// b
		// b<T=5>
		// In this case, if we scan from "a"-"b", we wish to skip
		// a<T=2> and a<T=1> and find "aa'.
		nextKey = encoding.EncodeBinary(nil, NextKey(currentKey))
	}

	return res, nil
}
Example #3
0
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txnID according to commit parameter.
// ResolveWriteIntent will skip write intents of other txnIDs.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID []byte, commit bool) error {
	if len(txnID) == 0 {
		return util.Error("missing txnID in request")
	}

	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil {
		return err
	}
	if !ok {
		return util.Errorf("key %q does not exist", key)
	}

	if len(meta.TxnID) == 0 {
		return util.Errorf("write intent %q does not exist", key)
	}
	if !bytes.Equal(meta.TxnID, txnID) {
		return util.Errorf("cannot commit another TxnID %s from TxnID %s",
			meta.TxnID, txnID)
	}

	if !commit {
		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		err = mvcc.engine.Clear(latestKey)
		if err != nil {
			return err
		}

		// Compute the next possible mvcc value for this key.
		nextKey := NextKey(latestKey)
		// Compute the last possible mvcc value for this key.
		endScanKey := encoding.EncodeBinary(nil, NextKey(key))
		kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1)
		if err != nil {
			return err
		}
		// If there is no other version, we should just clean up the key entirely.
		if len(kvs) == 0 {
			return mvcc.engine.Clear(binKey)
		}
		_, ts, isValue := mvccDecodeKey(kvs[0].Key)
		if !isValue {
			return util.Errorf("expected an MVCC value key: %s", kvs[0].Key)
		}
		// Update the keyMetadata with the next version.
		return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: ts})
	}

	return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: meta.Timestamp})
}
Example #4
0
// FindSplitKey suggests a split key from the given user-space key range that
// aims to roughly cut into half the total number of bytes used (in raw key and
// value byte strings) in both subranges. It will operate on a snapshot of the
// underlying engine if a snapshotID is given, and in that case may safely be
// invoked in a goroutine.
// TODO(Tobias): leverage the work done here anyways to gather stats.
func (mvcc *MVCC) FindSplitKey(key Key, endKey Key, snapshotID string) (Key, error) {
	rs := util.NewWeightedReservoirSample(splitReservoirSize, nil)
	h := rs.Heap.(*util.WeightedValueHeap)

	// We expect most keys to contain anywhere between 2^4 to 2^14 bytes, so we
	// normalize to obtain typical weights that are numerically unproblematic.
	// The relevant expression is rand(0,1)**(1/weight).
	normalize := float64(1 << 6)
	binStartKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	totalSize := 0
	err := iterateRangeSnapshot(mvcc.engine, binStartKey, binEndKey,
		splitScanRowCount, snapshotID, func(kvs []proto.RawKeyValue) error {
			for _, kv := range kvs {
				byteCount := len(kv.Key) + len(kv.Value)
				rs.ConsiderWeighted(splitSampleItem{kv.Key, totalSize}, float64(byteCount)/normalize)
				totalSize += byteCount
			}
			return nil
		})
	if err != nil {
		return nil, err
	}

	if totalSize == 0 {
		return nil, util.Errorf("the range is empty")
	}

	// Inspect the sample to get the closest candidate that has sizeBefore >= totalSize/2.
	candidate := (*h)[0].Value.(splitSampleItem)
	cb := candidate.sizeBefore
	halfSize := totalSize / 2
	for i := 1; i < len(*h); i++ {
		if sb := (*h)[i].Value.(splitSampleItem).sizeBefore; (cb < halfSize && cb < sb) ||
			(cb > halfSize && cb > sb && sb > halfSize) {
			// The current candidate hasn't yet cracked 50% and the this value
			// is closer to doing so or we're already above but now we can
			// decrese the gap.
			candidate = (*h)[i].Value.(splitSampleItem)
			cb = candidate.sizeBefore
		}
	}
	// The key is an MVCC key, so to avoid corrupting MVCC we get the
	// associated sentinel metadata key, which is fine to split in front of.
	decodedKey, _, _ := mvccDecodeKey(candidate.Key)
	rest, humanKey := encoding.DecodeBinary(decodedKey)
	if len(rest) > 0 {
		return nil, util.Errorf("corrupt key encountered")
	}
	return humanKey, nil
}
Example #5
0
func TestMVCCAbortTxnWithPreviousVersion(t *testing.T) {
	mvcc := createTestMVCC(t)
	err := mvcc.Put(testKey1, makeTS(0, 0), value1, nil)
	err = mvcc.Put(testKey1, makeTS(1, 0), value2, nil)
	err = mvcc.Put(testKey1, makeTS(2, 0), value3, txn1)
	err = mvcc.ResolveWriteIntent(testKey1, txn1, false)

	meta, err := mvcc.engine.Get(encoding.EncodeBinary(nil, testKey1))
	if err != nil {
		t.Fatal(err)
	}
	if len(meta) == 0 {
		t.Fatalf("expected the MVCCMetadata")
	}

	value, err := mvcc.Get(testKey1, makeTS(3, 0), nil)
	if err != nil {
		t.Fatal(err)
	}
	if !value.Timestamp.Equal(makeTS(1, 0)) {
		t.Fatalf("expected timestamp %+v == %+v", value.Timestamp, makeTS(1, 0))
	}
	if !bytes.Equal(value2.Bytes, value.Bytes) {
		t.Fatalf("the value %s in get result does not match the value %s in request",
			value.Bytes, value2.Bytes)
	}
}
Example #6
0
// Put sets the value for a specified key. It will save the value with
// different versions according to its timestamp and update the key metadata.
// We assume the range will check for an existing write intent before
// executing any Put action at the MVCC level.
func (mvcc *MVCC) Put(key Key, timestamp proto.Timestamp, value proto.Value, txn *proto.Transaction) error {
	binKey := encoding.EncodeBinary(nil, key)
	if value.Timestamp != nil && !value.Timestamp.Equal(timestamp) {
		return util.Errorf(
			"the timestamp %+v provided in value does not match the timestamp %+v in request",
			value.Timestamp, timestamp)
	}
	return mvcc.putInternal(binKey, timestamp, proto.MVCCValue{Value: &value}, txn)
}
Example #7
0
// Get returns the value for the key specified in the request, while
// satisfying the given timestamp condition. The key may be
// arbitrarily encoded; it will be binary-encoded to remove any
// internal null characters. txnID in the response is used to indicate
// that the response value belongs to a write intent.
func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txnID []byte) (proto.Value, error) {
	binKey := encoding.EncodeBinary(nil, key)
	value, ts, _, err := mvcc.getInternal(binKey, timestamp, txnID)
	// In case of error, or the key doesn't exist, or the key was deleted.
	if err != nil || len(value) == 0 || value[0] == valueDeletedPrefix {
		return proto.Value{}, err
	}

	// TODO(Jiang-Ming): use unwrapChecksum here.
	return proto.Value{Bytes: value[1:], Timestamp: ts}, nil
}
Example #8
0
// Verify the sort ordering of successive keys with metadata and
// versioned values. In particular, the following sequence of keys /
// versions:
//
// a
// a<t=max>
// a<t=1>
// a<t=0>
// a\x00
// a\x00<t=max>
// a\x00<t=1>
// a\x00<t=0>
func TestMVCCKeys(t *testing.T) {
	aBinKey := encoding.EncodeBinary(nil, []byte("a"))
	a0BinKey := encoding.EncodeBinary(nil, []byte("a\x00"))
	keys := []string{
		string(aBinKey),
		string(mvccEncodeKey(aBinKey, makeTS(math.MaxInt64, 0))),
		string(mvccEncodeKey(aBinKey, makeTS(1, 0))),
		string(mvccEncodeKey(aBinKey, makeTS(0, 0))),
		string(a0BinKey),
		string(mvccEncodeKey(a0BinKey, makeTS(math.MaxInt64, 0))),
		string(mvccEncodeKey(a0BinKey, makeTS(1, 0))),
		string(mvccEncodeKey(a0BinKey, makeTS(0, 0))),
	}
	sortKeys := make([]string, len(keys))
	copy(sortKeys, keys)
	sort.Strings(sortKeys)
	if !reflect.DeepEqual(sortKeys, keys) {
		t.Error("expected keys to sort in order %s, but got %s", keys, sortKeys)
	}
}
Example #9
0
// ResolveWriteIntentRange commits or aborts (rolls back) the range of
// write intents specified by start and end keys for a given txn
// according to commit parameter. ResolveWriteIntentRange will skip
// write intents of other txns. Specify max=0 for unbounded resolves.
func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txn *proto.Transaction, commit bool) (int64, error) {
	if txn == nil {
		return 0, util.Error("no txn specified")
	}

	binKey := encoding.EncodeBinary(nil, key)
	binEndKey := encoding.EncodeBinary(nil, endKey)
	nextKey := binKey

	num := int64(0)
	for {
		kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1)
		if err != nil {
			return num, err
		}
		// No more keys exists in the given range.
		if len(kvs) == 0 {
			break
		}

		remainder, currentKey := encoding.DecodeBinary(kvs[0].Key)
		if len(remainder) != 0 {
			return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key)
		}
		err = mvcc.ResolveWriteIntent(currentKey, txn, commit)
		if err != nil {
			log.Warningf("failed to resolve intent for key %q: %v", currentKey, err)
		} else {
			num++
			if max != 0 && max == num {
				break
			}
		}

		// In order to efficiently skip the possibly long list of
		// old versions for this key; refer to Scan for details.
		nextKey = encoding.EncodeBinary(nil, NextKey(currentKey))
	}

	return num, nil
}
Example #10
0
// Put sets the value for a specified key. It will save the value with
// different versions according to its timestamp and update the key metadata.
// We assume the range will check for an existing write intent before
// executing any Put action at the MVCC level.
func (mvcc *MVCC) Put(key Key, timestamp proto.Timestamp, value proto.Value, txnID []byte) error {
	binKey := encoding.EncodeBinary(nil, key)
	if !value.Timestamp.Equal(proto.Timestamp{}) && !value.Timestamp.Equal(timestamp) {
		return util.Errorf(
			"the timestamp %+v provided in value does not match the timestamp %+v in request",
			value.Timestamp, timestamp)
	}

	// TODO(Jiang-Ming): use wrapChecksum here.
	// into val which need to be used in get response.
	val := bytes.Join([][]byte{[]byte{valueNormalPrefix}, value.Bytes}, []byte(""))
	return mvcc.putInternal(binKey, timestamp, val, txnID)
}
Example #11
0
// Get returns the value for the key specified in the request, while
// satisfying the given timestamp condition. The key may be
// arbitrarily encoded; it will be binary-encoded to remove any
// internal null characters. If no value for the key exists, or has
// been deleted, returns nil for value.
//
// The values of multiple versions for the given key should
// be organized as follows:
// ...
// keyA : MVCCMetatata of keyA
// keyA_Timestamp_n : value of version_n
// keyA_Timestamp_n-1 : value of version_n-1
// ...
// keyA_Timestamp_0 : value of version_0
// keyB : MVCCMetadata of keyB
// ...
func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txn *proto.Transaction) (*proto.Value, error) {
	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil || !ok {
		return nil, err
	}
	// If the read timestamp is greater than the latest one, we can just
	// fetch the value without a scan.
	ts := proto.Timestamp{}
	var valBytes []byte
	if !timestamp.Less(meta.Timestamp) {
		if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) {
			return nil, &writeIntentError{Txn: meta.Txn}
		}

		latestKey := mvccEncodeKey(binKey, meta.Timestamp)
		valBytes, err = mvcc.engine.Get(latestKey)
		ts = meta.Timestamp
	} else {
		nextKey := mvccEncodeKey(binKey, timestamp)
		// We use the PrefixEndKey(key) as the upper bound for scan.
		// If there is no other version after nextKey, it won't return
		// the value of the next key.
		kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(binKey), 1)
		if len(kvs) == 0 {
			return nil, err
		}
		_, ts, _ = mvccDecodeKey(kvs[0].Key)
		valBytes = kvs[0].Value
	}
	if valBytes == nil {
		return nil, nil
	}
	// Unmarshal the mvcc value.
	value := &proto.MVCCValue{}
	if err := gogoproto.Unmarshal(valBytes, value); err != nil {
		return nil, err
	}
	// Set the timestamp if the value is not nil (i.e. not a deletion tombstone).
	if value.Value != nil {
		value.Value.Timestamp = &ts
	} else if !value.Deleted {
		log.Warningf("encountered MVCC value at key %q with a nil proto.Value but with !Deleted: %+v", key, value)
	}
	return value.Value, nil
}
Example #12
0
func TestMVCCAbortTxn(t *testing.T) {
	mvcc := createTestMVCC(t)
	err := mvcc.Put(testKey1, makeTS(0, 0), value1, txn1)
	err = mvcc.ResolveWriteIntent(testKey1, txn1, false)
	if err != nil {
		t.Fatal(err)
	}

	value, err := mvcc.Get(testKey1, makeTS(1, 0), nil)
	if value != nil {
		t.Fatalf("the value should be empty")
	}
	meta, err := mvcc.engine.Get(encoding.EncodeBinary(nil, testKey1))
	if err != nil {
		t.Fatal(err)
	}
	if len(meta) != 0 {
		t.Fatalf("expected no more MVCCMetadata")
	}
}
Example #13
0
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txn according to commit parameter.
// ResolveWriteIntent will skip write intents of other txns.
//
// Transaction epochs deserve a bit of explanation. The epoch for a
// transaction is incremented on transaction retry. Transaction retry
// is different from abort. Retries occur in SSI transactions when the
// commit timestamp is not equal to the proposed transaction
// timestamp. This might be because writes to different keys had to
// use higher timestamps than expected because of existing, committed
// value, or because reads pushed the transaction's commit timestamp
// forward. Retries also occur in the event that the txn tries to push
// another txn in order to write an intent but fails (i.e. it has
// lower priority).
//
// Because successive retries of a transaction may end up writing to
// different keys, the epochs serve to classify which intents get
// committed in the event the transaction succeeds (all those with
// epoch matching the commit epoch), and which intents get aborted,
// even if the transaction succeeds.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txn *proto.Transaction, commit bool) error {
	if txn == nil {
		return util.Error("no txn specified")
	}

	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil {
		return err
	}
	// For cases where there's no write intent to resolve, or one exists
	// which we can't resolve, this is a noop.
	if !ok || meta.Txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID) {
		return nil
	}
	// If we're committing the intent and the txn epochs match, the
	// intent value is good to go and we just set meta.Txn to nil.
	// We may have to update the actual version value if timestamps
	// are different between meta and txn.
	if commit && meta.Txn.Epoch == txn.Epoch {
		// Use a write batch because we may have multiple puts.
		var batch []interface{}
		origTimestamp := meta.Timestamp
		batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: txn.Timestamp})
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
		// If timestamp of value changed, need to rewrite versioned value.
		// TODO(spencer,tobias): think about a new merge operator for
		// updating key of intent value to new timestamp instead of
		// read-then-write.
		if !origTimestamp.Equal(txn.Timestamp) {
			origKey := mvccEncodeKey(binKey, origTimestamp)
			newKey := mvccEncodeKey(binKey, txn.Timestamp)
			valBytes, err := mvcc.engine.Get(origKey)
			if err != nil {
				return err
			}
			batch = append(batch, BatchDelete(origKey))
			batch = append(batch, BatchPut(proto.RawKeyValue{Key: newKey, Value: valBytes}))
		}
		return mvcc.engine.WriteBatch(batch)
	}

	// If not committing (this can be the case if commit=true, but the
	// committed epoch is different from this intent's epoch), we must
	// find the next versioned value and reset the metadata's latest
	// timestamp. If there are no other versioned values, we delete the
	// metadata key. Because there are multiple steps here and we want
	// them all to commit, or none to commit, we schedule them using a
	// write batch.
	var batch []interface{}

	// First clear the intent value.
	latestKey := mvccEncodeKey(binKey, meta.Timestamp)
	batch = append(batch, BatchDelete(latestKey))

	// Compute the next possible mvcc value for this key.
	nextKey := NextKey(latestKey)
	// Compute the last possible mvcc value for this key.
	endScanKey := encoding.EncodeBinary(nil, NextKey(key))
	kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1)
	if err != nil {
		return err
	}
	// If there is no other version, we should just clean up the key entirely.
	if len(kvs) == 0 {
		batch = append(batch, BatchDelete(binKey))
	} else {
		_, ts, isValue := mvccDecodeKey(kvs[0].Key)
		if !isValue {
			return util.Errorf("expected an MVCC value key: %s", kvs[0].Key)
		}
		// Update the keyMetadata with the next version.
		batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: ts})
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
	}

	return mvcc.engine.WriteBatch(batch)
}
Example #14
0
// Author: Spencer Kimball ([email protected])

package engine

import (
	"bytes"
	"reflect"
	"testing"

	gogoproto "code.google.com/p/gogoprotobuf/proto"
	"github.com/cockroachdb/cockroach/proto"
	"github.com/cockroachdb/cockroach/util/encoding"
)

var (
	aKey  = encoding.EncodeBinary(nil, Key("a"))
	bKey  = encoding.EncodeBinary(nil, Key("b"))
	cKey  = encoding.EncodeBinary(nil, Key("c"))
	aKeys = []Key{
		aKey,
		mvccEncodeKey(aKey, makeTS(2E9, 0)),
		mvccEncodeKey(aKey, makeTS(1E9, 1)),
		mvccEncodeKey(aKey, makeTS(1E9, 0)),
	}
	bKeys = []Key{
		bKey,
		mvccEncodeKey(bKey, makeTS(2E9, 0)),
		mvccEncodeKey(bKey, makeTS(1E9, 0)),
	}
	cKeys = []Key{
		mvccEncodeKey(cKey, makeTS(1E9, 0)),
Example #15
0
// Delete marks the key deleted and will not return in the next get response.
func (mvcc *MVCC) Delete(key Key, timestamp proto.Timestamp, txn *proto.Transaction) error {
	binKey := encoding.EncodeBinary(nil, key)
	return mvcc.putInternal(binKey, timestamp, proto.MVCCValue{Deleted: true}, txn)
}
Example #16
0
// TestRangeSnapshot.
func TestRangeSnapshot(t *testing.T) {
	rng, _, clock, _ := createTestRangeWithClock(t)
	defer rng.Stop()

	key1 := []byte("a")
	key2 := []byte("b")
	val1 := []byte("1")
	val2 := []byte("2")
	val3 := []byte("3")

	pArgs, pReply := putArgs(key1, val1, 0)
	pArgs.Timestamp = clock.Now()
	err := rng.ReadWriteCmd("Put", pArgs, pReply)

	pArgs, pReply = putArgs(key2, val2, 0)
	pArgs.Timestamp = clock.Now()
	err = rng.ReadWriteCmd("Put", pArgs, pReply)

	gArgs, gReply := getArgs(key1, 0)
	gArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("Get", gArgs, gReply)

	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if !bytes.Equal(gReply.Value.Bytes, val1) {
		t.Fatalf("the value %s in get result does not match the value %s in request",
			gReply.Value.Bytes, val1)
	}

	iscArgs, iscReply := internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	iscArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	snapshotID := iscReply.SnapshotId
	expectedKey := encoding.EncodeBinary(nil, key1)
	expectedVal := getSerializedMVCCValue(&proto.Value{Bytes: val1})
	if len(iscReply.Rows) != 4 ||
		!bytes.Equal(iscReply.Rows[0].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[1].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			iscReply.Rows[1].Value, iscReply.Rows[0].Key, expectedVal, expectedKey)
	}

	pArgs, pReply = putArgs(key2, val3, 0)
	pArgs.Timestamp = clock.Now()
	err = rng.ReadWriteCmd("Put", pArgs, pReply)

	// Scan with the previous snapshot will get the old value val2 of key2.
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, snapshotID, 0)
	iscArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	expectedKey = encoding.EncodeBinary(nil, key2)
	expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val2})
	if len(iscReply.Rows) != 4 ||
		!bytes.Equal(iscReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshotLastKey := iscReply.Rows[3].Key

	// Create a new snapshot to cover the latest value.
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	iscArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	snapshotID2 := iscReply.SnapshotId
	expectedKey = encoding.EncodeBinary(nil, key2)
	expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val3})
	// Expect one more mvcc version.
	if len(iscReply.Rows) != 5 ||
		!bytes.Equal(iscReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshot2LastKey := iscReply.Rows[4].Key

	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshotLastKey), engine.KeyMax, 50, snapshotID, 0)
	iscArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if len(iscReply.Rows) != 0 {
		t.Fatalf("error : %d", len(iscReply.Rows))
	}
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshot2LastKey), engine.KeyMax, 50, snapshotID2, 0)
	iscArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if len(iscReply.Rows) != 0 {
		t.Fatalf("error : %d", len(iscReply.Rows))
	}
}
Example #17
0
// ResolveWriteIntent either commits or aborts (rolls back) an extant
// write intent for a given txn according to commit parameter.
// ResolveWriteIntent will skip write intents of other txns.
//
// Transaction epochs deserve a bit of explanation. The epoch for a
// transaction is incremented on transaction retry. Transaction retry
// is different from abort. Retries occur in SSI transactions when the
// commit timestamp is not equal to the proposed transaction
// timestamp. This might be because writes to different keys had to
// use higher timestamps than expected because of existing, committed
// value, or because reads pushed the transaction's commit timestamp
// forward. Retries also occur in the event that the txn tries to push
// another txn in order to write an intent but fails (i.e. it has
// lower priority).
//
// Because successive retries of a transaction may end up writing to
// different keys, the epochs serve to classify which intents get
// committed in the event the transaction succeeds (all those with
// epoch matching the commit epoch), and which intents get aborted,
// even if the transaction succeeds.
func (mvcc *MVCC) ResolveWriteIntent(key Key, txn *proto.Transaction) error {
	if txn == nil {
		return util.Error("no txn specified")
	}

	binKey := encoding.EncodeBinary(nil, key)
	meta := &proto.MVCCMetadata{}
	ok, err := GetProto(mvcc.engine, binKey, meta)
	if err != nil {
		return err
	}
	// For cases where there's no write intent to resolve, or one exists
	// which we can't resolve, this is a noop.
	if !ok || meta.Txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID) {
		return nil
	}
	// If we're committing, or if the commit timestamp of the intent has
	// been moved forward, and if the proposed epoch matches the existing
	// epoch: update the meta.Txn. For commit, it's set to nil;
	// otherwise, we update its value. We may have to update the actual
	// version value (remove old and create new with proper
	// timestamp-encoded key) if timestamp changed.
	commit := txn.Status == proto.COMMITTED
	pushed := txn.Status == proto.PENDING && meta.Txn.Timestamp.Less(txn.Timestamp)
	if (commit || pushed) && meta.Txn.Epoch == txn.Epoch {
		// Use a write batch because we may have multiple puts.
		var batch []interface{}
		origTimestamp := meta.Timestamp
		var metaTxn *proto.Transaction
		if pushed { // keep intent if we're pushing timestamp
			metaTxn = txn
		}
		batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: txn.Timestamp, Txn: metaTxn})
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
		// If timestamp of value changed, need to rewrite versioned value.
		// TODO(spencer,tobias): think about a new merge operator for
		// updating key of intent value to new timestamp instead of
		// read-then-write.
		if !origTimestamp.Equal(txn.Timestamp) {
			origKey := mvccEncodeKey(binKey, origTimestamp)
			newKey := mvccEncodeKey(binKey, txn.Timestamp)
			valBytes, err := mvcc.engine.Get(origKey)
			if err != nil {
				return err
			}
			batch = append(batch, BatchDelete(origKey))
			batch = append(batch, BatchPut(proto.RawKeyValue{Key: newKey, Value: valBytes}))
		}
		return mvcc.engine.WriteBatch(batch)
	}

	// This method shouldn't be called with this instance, but there's
	// nothing to do if the epochs match and the state is still PENDING.
	if txn.Status == proto.PENDING && meta.Txn.Epoch == txn.Epoch {
		return nil
	}

	// Otherwise, we're deleting the intent. We must find the next
	// versioned value and reset the metadata's latest timestamp. If
	// there are no other versioned values, we delete the metadata
	// key. Because there are multiple steps here and we want them all
	// to commit, or none to commit, we schedule them using a write
	// batch.
	var batch []interface{}

	// First clear the intent value.
	latestKey := mvccEncodeKey(binKey, meta.Timestamp)
	batch = append(batch, BatchDelete(latestKey))

	// Compute the next possible mvcc value for this key.
	nextKey := NextKey(latestKey)
	// Compute the last possible mvcc value for this key.
	endScanKey := encoding.EncodeBinary(nil, NextKey(key))
	kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1)
	if err != nil {
		return err
	}
	// If there is no other version, we should just clean up the key entirely.
	if len(kvs) == 0 {
		batch = append(batch, BatchDelete(binKey))
	} else {
		_, ts, isValue := mvccDecodeKey(kvs[0].Key)
		if !isValue {
			return util.Errorf("expected an MVCC value key: %s", kvs[0].Key)
		}
		// Update the keyMetadata with the next version.
		batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: ts})
		if err != nil {
			return err
		}
		batch = append(batch, batchPut)
	}

	return mvcc.engine.WriteBatch(batch)
}
Example #18
0
// Delete marks the key deleted and will not return in the next get response.
func (mvcc *MVCC) Delete(key Key, timestamp proto.Timestamp, txnID []byte) error {
	binKey := encoding.EncodeBinary(nil, key)
	return mvcc.putInternal(binKey, timestamp, []byte{valueDeletedPrefix}, txnID)
}
Example #19
0
// TestRangeSnapshot.
func TestRangeSnapshot(t *testing.T) {
	rng, _, clock, _ := createTestRangeWithClock(t)
	defer rng.Stop()

	key1 := "a"
	key2 := "b"
	val1 := "1"
	val2 := "2"
	val3 := "3"

	pArgs, pReply := putArgs(key1, val1, 0)
	pArgs.Timestamp = clock.Now()
	err := rng.ReadWriteCmd("Put", pArgs, pReply)

	pArgs, pReply = putArgs(key2, val2, 0)
	pArgs.Timestamp = clock.Now()
	err = rng.ReadWriteCmd("Put", pArgs, pReply)

	gArgs, gReply := getArgs(key1, 0)
	gArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("Get", gArgs, gReply)

	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if !bytes.Equal(gReply.Value.Bytes, []byte(val1)) {
		t.Fatalf("the value %s in get result does not match the value %s in request",
			gReply.Value.Bytes, []byte(val1))
	}

	irsArgs, irsReply := internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	irsArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	snapshotID := irsReply.SnapshotId
	var valueNormalPrefix = byte(0)
	expectedKey := encoding.EncodeBinary(nil, []byte(key1))
	expectedVal := bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val1)}, []byte(""))
	if len(irsReply.Rows) != 4 ||
		!bytes.Equal(irsReply.Rows[0].Key, expectedKey) ||
		!bytes.Equal(irsReply.Rows[1].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			irsReply.Rows[1].Value, irsReply.Rows[0].Key, expectedVal, expectedKey)
	}

	pArgs, pReply = putArgs(key2, val3, 0)
	pArgs.Timestamp = clock.Now()
	err = rng.ReadWriteCmd("Put", pArgs, pReply)

	// Scan with the previous snapshot will get the old value val2 of key2.
	irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, snapshotID, 0)
	irsArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	expectedKey = encoding.EncodeBinary(nil, []byte(key2))
	expectedVal = bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val2)}, []byte(""))
	if len(irsReply.Rows) != 4 ||
		!bytes.Equal(irsReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(irsReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			irsReply.Rows[3].Value, irsReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshotLastKey := irsReply.Rows[3].Key

	// Create a new snapshot to cover the latest value.
	irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	irsArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	snapshotID2 := irsReply.SnapshotId
	expectedKey = encoding.EncodeBinary(nil, []byte(key2))
	expectedVal = bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val3)}, []byte(""))
	// Expect one more mvcc version.
	if len(irsReply.Rows) != 5 ||
		!bytes.Equal(irsReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(irsReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request",
			irsReply.Rows[3].Value, irsReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshot2LastKey := irsReply.Rows[4].Key

	irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(snapshotLastKey), engine.KeyMax, 50, snapshotID, 0)
	irsArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if len(irsReply.Rows) != 0 {
		t.Fatalf("error : %d", len(irsReply.Rows))
	}
	irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(snapshot2LastKey), engine.KeyMax, 50, snapshotID2, 0)
	irsArgs.Timestamp = clock.Now()
	err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply)
	if err != nil {
		t.Fatalf("error : %s", err)
	}
	if len(irsReply.Rows) != 0 {
		t.Fatalf("error : %d", len(irsReply.Rows))
	}
}