// ResolveWriteIntentRange commits or aborts (rolls back) the range of // write intents specified by start and end keys for a given txnID // according to commit parameter. ResolveWriteIntentRange will skip // write intents of other txnIDs. Specify max=0 for unbounded // resolves. func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txnID []byte, commit bool) (int64, error) { if len(txnID) == 0 { return 0, util.Error("missing txnID in request") } binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey num := int64(0) for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return num, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } _, _, existingTxnID, err := mvcc.getInternal(kvs[0].Key, proto.MaxTimestamp, txnID) // Return the error unless its a writeIntentError, which // will occur in the event we scan a key with a write // intent belonging to a different transaction. if _, ok := err.(*writeIntentError); err != nil && !ok { return num, err } // ResolveWriteIntent only needs to deal with the write // intents for the given txnID. if err == nil && bytes.Equal(existingTxnID, txnID) { // commits or aborts (rolls back) the write intent of // the given txnID. err = mvcc.ResolveWriteIntent(currentKey, txnID, commit) if err != nil { return num, err } num++ } if max != 0 && max == num { break } // In order to efficiently skip the possibly long list of // old versions for this key; refer to Scan for details. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return num, nil }
// Scan scans the key range specified by start key through end key up // to some maximum number of results. Specify max=0 for unbounded scans. func (mvcc *MVCC) Scan(key Key, endKey Key, max int64, timestamp proto.Timestamp, txn *proto.Transaction) ([]proto.KeyValue, error) { binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey res := []proto.KeyValue{} for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return nil, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return nil, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } value, err := mvcc.Get(currentKey, timestamp, txn) if err != nil { return res, err } if value != nil { res = append(res, proto.KeyValue{Key: currentKey, Value: *value}) } if max != 0 && max == int64(len(res)) { break } // In order to efficiently skip the possibly long list of // old versions for this key, we move instead to the next // highest key and the for loop continues by scanning again // with nextKey. // Let's say you have: // a // a<T=2> // a<T=1> // aa // aa<T=3> // aa<T=2> // b // b<T=5> // In this case, if we scan from "a"-"b", we wish to skip // a<T=2> and a<T=1> and find "aa'. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return res, nil }
// ResolveWriteIntent either commits or aborts (rolls back) an extant // write intent for a given txnID according to commit parameter. // ResolveWriteIntent will skip write intents of other txnIDs. func (mvcc *MVCC) ResolveWriteIntent(key Key, txnID []byte, commit bool) error { if len(txnID) == 0 { return util.Error("missing txnID in request") } binKey := encoding.EncodeBinary(nil, key) meta := &proto.MVCCMetadata{} ok, err := GetProto(mvcc.engine, binKey, meta) if err != nil { return err } if !ok { return util.Errorf("key %q does not exist", key) } if len(meta.TxnID) == 0 { return util.Errorf("write intent %q does not exist", key) } if !bytes.Equal(meta.TxnID, txnID) { return util.Errorf("cannot commit another TxnID %s from TxnID %s", meta.TxnID, txnID) } if !commit { latestKey := mvccEncodeKey(binKey, meta.Timestamp) err = mvcc.engine.Clear(latestKey) if err != nil { return err } // Compute the next possible mvcc value for this key. nextKey := NextKey(latestKey) // Compute the last possible mvcc value for this key. endScanKey := encoding.EncodeBinary(nil, NextKey(key)) kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1) if err != nil { return err } // If there is no other version, we should just clean up the key entirely. if len(kvs) == 0 { return mvcc.engine.Clear(binKey) } _, ts, isValue := mvccDecodeKey(kvs[0].Key) if !isValue { return util.Errorf("expected an MVCC value key: %s", kvs[0].Key) } // Update the keyMetadata with the next version. return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: ts}) } return PutProto(mvcc.engine, binKey, &proto.MVCCMetadata{TxnID: nil, Timestamp: meta.Timestamp}) }
// FindSplitKey suggests a split key from the given user-space key range that // aims to roughly cut into half the total number of bytes used (in raw key and // value byte strings) in both subranges. It will operate on a snapshot of the // underlying engine if a snapshotID is given, and in that case may safely be // invoked in a goroutine. // TODO(Tobias): leverage the work done here anyways to gather stats. func (mvcc *MVCC) FindSplitKey(key Key, endKey Key, snapshotID string) (Key, error) { rs := util.NewWeightedReservoirSample(splitReservoirSize, nil) h := rs.Heap.(*util.WeightedValueHeap) // We expect most keys to contain anywhere between 2^4 to 2^14 bytes, so we // normalize to obtain typical weights that are numerically unproblematic. // The relevant expression is rand(0,1)**(1/weight). normalize := float64(1 << 6) binStartKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) totalSize := 0 err := iterateRangeSnapshot(mvcc.engine, binStartKey, binEndKey, splitScanRowCount, snapshotID, func(kvs []proto.RawKeyValue) error { for _, kv := range kvs { byteCount := len(kv.Key) + len(kv.Value) rs.ConsiderWeighted(splitSampleItem{kv.Key, totalSize}, float64(byteCount)/normalize) totalSize += byteCount } return nil }) if err != nil { return nil, err } if totalSize == 0 { return nil, util.Errorf("the range is empty") } // Inspect the sample to get the closest candidate that has sizeBefore >= totalSize/2. candidate := (*h)[0].Value.(splitSampleItem) cb := candidate.sizeBefore halfSize := totalSize / 2 for i := 1; i < len(*h); i++ { if sb := (*h)[i].Value.(splitSampleItem).sizeBefore; (cb < halfSize && cb < sb) || (cb > halfSize && cb > sb && sb > halfSize) { // The current candidate hasn't yet cracked 50% and the this value // is closer to doing so or we're already above but now we can // decrese the gap. candidate = (*h)[i].Value.(splitSampleItem) cb = candidate.sizeBefore } } // The key is an MVCC key, so to avoid corrupting MVCC we get the // associated sentinel metadata key, which is fine to split in front of. decodedKey, _, _ := mvccDecodeKey(candidate.Key) rest, humanKey := encoding.DecodeBinary(decodedKey) if len(rest) > 0 { return nil, util.Errorf("corrupt key encountered") } return humanKey, nil }
func TestMVCCAbortTxnWithPreviousVersion(t *testing.T) { mvcc := createTestMVCC(t) err := mvcc.Put(testKey1, makeTS(0, 0), value1, nil) err = mvcc.Put(testKey1, makeTS(1, 0), value2, nil) err = mvcc.Put(testKey1, makeTS(2, 0), value3, txn1) err = mvcc.ResolveWriteIntent(testKey1, txn1, false) meta, err := mvcc.engine.Get(encoding.EncodeBinary(nil, testKey1)) if err != nil { t.Fatal(err) } if len(meta) == 0 { t.Fatalf("expected the MVCCMetadata") } value, err := mvcc.Get(testKey1, makeTS(3, 0), nil) if err != nil { t.Fatal(err) } if !value.Timestamp.Equal(makeTS(1, 0)) { t.Fatalf("expected timestamp %+v == %+v", value.Timestamp, makeTS(1, 0)) } if !bytes.Equal(value2.Bytes, value.Bytes) { t.Fatalf("the value %s in get result does not match the value %s in request", value.Bytes, value2.Bytes) } }
// Put sets the value for a specified key. It will save the value with // different versions according to its timestamp and update the key metadata. // We assume the range will check for an existing write intent before // executing any Put action at the MVCC level. func (mvcc *MVCC) Put(key Key, timestamp proto.Timestamp, value proto.Value, txn *proto.Transaction) error { binKey := encoding.EncodeBinary(nil, key) if value.Timestamp != nil && !value.Timestamp.Equal(timestamp) { return util.Errorf( "the timestamp %+v provided in value does not match the timestamp %+v in request", value.Timestamp, timestamp) } return mvcc.putInternal(binKey, timestamp, proto.MVCCValue{Value: &value}, txn) }
// Get returns the value for the key specified in the request, while // satisfying the given timestamp condition. The key may be // arbitrarily encoded; it will be binary-encoded to remove any // internal null characters. txnID in the response is used to indicate // that the response value belongs to a write intent. func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txnID []byte) (proto.Value, error) { binKey := encoding.EncodeBinary(nil, key) value, ts, _, err := mvcc.getInternal(binKey, timestamp, txnID) // In case of error, or the key doesn't exist, or the key was deleted. if err != nil || len(value) == 0 || value[0] == valueDeletedPrefix { return proto.Value{}, err } // TODO(Jiang-Ming): use unwrapChecksum here. return proto.Value{Bytes: value[1:], Timestamp: ts}, nil }
// Verify the sort ordering of successive keys with metadata and // versioned values. In particular, the following sequence of keys / // versions: // // a // a<t=max> // a<t=1> // a<t=0> // a\x00 // a\x00<t=max> // a\x00<t=1> // a\x00<t=0> func TestMVCCKeys(t *testing.T) { aBinKey := encoding.EncodeBinary(nil, []byte("a")) a0BinKey := encoding.EncodeBinary(nil, []byte("a\x00")) keys := []string{ string(aBinKey), string(mvccEncodeKey(aBinKey, makeTS(math.MaxInt64, 0))), string(mvccEncodeKey(aBinKey, makeTS(1, 0))), string(mvccEncodeKey(aBinKey, makeTS(0, 0))), string(a0BinKey), string(mvccEncodeKey(a0BinKey, makeTS(math.MaxInt64, 0))), string(mvccEncodeKey(a0BinKey, makeTS(1, 0))), string(mvccEncodeKey(a0BinKey, makeTS(0, 0))), } sortKeys := make([]string, len(keys)) copy(sortKeys, keys) sort.Strings(sortKeys) if !reflect.DeepEqual(sortKeys, keys) { t.Error("expected keys to sort in order %s, but got %s", keys, sortKeys) } }
// ResolveWriteIntentRange commits or aborts (rolls back) the range of // write intents specified by start and end keys for a given txn // according to commit parameter. ResolveWriteIntentRange will skip // write intents of other txns. Specify max=0 for unbounded resolves. func (mvcc *MVCC) ResolveWriteIntentRange(key Key, endKey Key, max int64, txn *proto.Transaction, commit bool) (int64, error) { if txn == nil { return 0, util.Error("no txn specified") } binKey := encoding.EncodeBinary(nil, key) binEndKey := encoding.EncodeBinary(nil, endKey) nextKey := binKey num := int64(0) for { kvs, err := mvcc.engine.Scan(nextKey, binEndKey, 1) if err != nil { return num, err } // No more keys exists in the given range. if len(kvs) == 0 { break } remainder, currentKey := encoding.DecodeBinary(kvs[0].Key) if len(remainder) != 0 { return 0, util.Errorf("expected an MVCC metadata key: %s", kvs[0].Key) } err = mvcc.ResolveWriteIntent(currentKey, txn, commit) if err != nil { log.Warningf("failed to resolve intent for key %q: %v", currentKey, err) } else { num++ if max != 0 && max == num { break } } // In order to efficiently skip the possibly long list of // old versions for this key; refer to Scan for details. nextKey = encoding.EncodeBinary(nil, NextKey(currentKey)) } return num, nil }
// Put sets the value for a specified key. It will save the value with // different versions according to its timestamp and update the key metadata. // We assume the range will check for an existing write intent before // executing any Put action at the MVCC level. func (mvcc *MVCC) Put(key Key, timestamp proto.Timestamp, value proto.Value, txnID []byte) error { binKey := encoding.EncodeBinary(nil, key) if !value.Timestamp.Equal(proto.Timestamp{}) && !value.Timestamp.Equal(timestamp) { return util.Errorf( "the timestamp %+v provided in value does not match the timestamp %+v in request", value.Timestamp, timestamp) } // TODO(Jiang-Ming): use wrapChecksum here. // into val which need to be used in get response. val := bytes.Join([][]byte{[]byte{valueNormalPrefix}, value.Bytes}, []byte("")) return mvcc.putInternal(binKey, timestamp, val, txnID) }
// Get returns the value for the key specified in the request, while // satisfying the given timestamp condition. The key may be // arbitrarily encoded; it will be binary-encoded to remove any // internal null characters. If no value for the key exists, or has // been deleted, returns nil for value. // // The values of multiple versions for the given key should // be organized as follows: // ... // keyA : MVCCMetatata of keyA // keyA_Timestamp_n : value of version_n // keyA_Timestamp_n-1 : value of version_n-1 // ... // keyA_Timestamp_0 : value of version_0 // keyB : MVCCMetadata of keyB // ... func (mvcc *MVCC) Get(key Key, timestamp proto.Timestamp, txn *proto.Transaction) (*proto.Value, error) { binKey := encoding.EncodeBinary(nil, key) meta := &proto.MVCCMetadata{} ok, err := GetProto(mvcc.engine, binKey, meta) if err != nil || !ok { return nil, err } // If the read timestamp is greater than the latest one, we can just // fetch the value without a scan. ts := proto.Timestamp{} var valBytes []byte if !timestamp.Less(meta.Timestamp) { if meta.Txn != nil && (txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID)) { return nil, &writeIntentError{Txn: meta.Txn} } latestKey := mvccEncodeKey(binKey, meta.Timestamp) valBytes, err = mvcc.engine.Get(latestKey) ts = meta.Timestamp } else { nextKey := mvccEncodeKey(binKey, timestamp) // We use the PrefixEndKey(key) as the upper bound for scan. // If there is no other version after nextKey, it won't return // the value of the next key. kvs, err := mvcc.engine.Scan(nextKey, PrefixEndKey(binKey), 1) if len(kvs) == 0 { return nil, err } _, ts, _ = mvccDecodeKey(kvs[0].Key) valBytes = kvs[0].Value } if valBytes == nil { return nil, nil } // Unmarshal the mvcc value. value := &proto.MVCCValue{} if err := gogoproto.Unmarshal(valBytes, value); err != nil { return nil, err } // Set the timestamp if the value is not nil (i.e. not a deletion tombstone). if value.Value != nil { value.Value.Timestamp = &ts } else if !value.Deleted { log.Warningf("encountered MVCC value at key %q with a nil proto.Value but with !Deleted: %+v", key, value) } return value.Value, nil }
func TestMVCCAbortTxn(t *testing.T) { mvcc := createTestMVCC(t) err := mvcc.Put(testKey1, makeTS(0, 0), value1, txn1) err = mvcc.ResolveWriteIntent(testKey1, txn1, false) if err != nil { t.Fatal(err) } value, err := mvcc.Get(testKey1, makeTS(1, 0), nil) if value != nil { t.Fatalf("the value should be empty") } meta, err := mvcc.engine.Get(encoding.EncodeBinary(nil, testKey1)) if err != nil { t.Fatal(err) } if len(meta) != 0 { t.Fatalf("expected no more MVCCMetadata") } }
// ResolveWriteIntent either commits or aborts (rolls back) an extant // write intent for a given txn according to commit parameter. // ResolveWriteIntent will skip write intents of other txns. // // Transaction epochs deserve a bit of explanation. The epoch for a // transaction is incremented on transaction retry. Transaction retry // is different from abort. Retries occur in SSI transactions when the // commit timestamp is not equal to the proposed transaction // timestamp. This might be because writes to different keys had to // use higher timestamps than expected because of existing, committed // value, or because reads pushed the transaction's commit timestamp // forward. Retries also occur in the event that the txn tries to push // another txn in order to write an intent but fails (i.e. it has // lower priority). // // Because successive retries of a transaction may end up writing to // different keys, the epochs serve to classify which intents get // committed in the event the transaction succeeds (all those with // epoch matching the commit epoch), and which intents get aborted, // even if the transaction succeeds. func (mvcc *MVCC) ResolveWriteIntent(key Key, txn *proto.Transaction, commit bool) error { if txn == nil { return util.Error("no txn specified") } binKey := encoding.EncodeBinary(nil, key) meta := &proto.MVCCMetadata{} ok, err := GetProto(mvcc.engine, binKey, meta) if err != nil { return err } // For cases where there's no write intent to resolve, or one exists // which we can't resolve, this is a noop. if !ok || meta.Txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID) { return nil } // If we're committing the intent and the txn epochs match, the // intent value is good to go and we just set meta.Txn to nil. // We may have to update the actual version value if timestamps // are different between meta and txn. if commit && meta.Txn.Epoch == txn.Epoch { // Use a write batch because we may have multiple puts. var batch []interface{} origTimestamp := meta.Timestamp batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: txn.Timestamp}) if err != nil { return err } batch = append(batch, batchPut) // If timestamp of value changed, need to rewrite versioned value. // TODO(spencer,tobias): think about a new merge operator for // updating key of intent value to new timestamp instead of // read-then-write. if !origTimestamp.Equal(txn.Timestamp) { origKey := mvccEncodeKey(binKey, origTimestamp) newKey := mvccEncodeKey(binKey, txn.Timestamp) valBytes, err := mvcc.engine.Get(origKey) if err != nil { return err } batch = append(batch, BatchDelete(origKey)) batch = append(batch, BatchPut(proto.RawKeyValue{Key: newKey, Value: valBytes})) } return mvcc.engine.WriteBatch(batch) } // If not committing (this can be the case if commit=true, but the // committed epoch is different from this intent's epoch), we must // find the next versioned value and reset the metadata's latest // timestamp. If there are no other versioned values, we delete the // metadata key. Because there are multiple steps here and we want // them all to commit, or none to commit, we schedule them using a // write batch. var batch []interface{} // First clear the intent value. latestKey := mvccEncodeKey(binKey, meta.Timestamp) batch = append(batch, BatchDelete(latestKey)) // Compute the next possible mvcc value for this key. nextKey := NextKey(latestKey) // Compute the last possible mvcc value for this key. endScanKey := encoding.EncodeBinary(nil, NextKey(key)) kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1) if err != nil { return err } // If there is no other version, we should just clean up the key entirely. if len(kvs) == 0 { batch = append(batch, BatchDelete(binKey)) } else { _, ts, isValue := mvccDecodeKey(kvs[0].Key) if !isValue { return util.Errorf("expected an MVCC value key: %s", kvs[0].Key) } // Update the keyMetadata with the next version. batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: ts}) if err != nil { return err } batch = append(batch, batchPut) } return mvcc.engine.WriteBatch(batch) }
// Author: Spencer Kimball ([email protected]) package engine import ( "bytes" "reflect" "testing" gogoproto "code.google.com/p/gogoprotobuf/proto" "github.com/cockroachdb/cockroach/proto" "github.com/cockroachdb/cockroach/util/encoding" ) var ( aKey = encoding.EncodeBinary(nil, Key("a")) bKey = encoding.EncodeBinary(nil, Key("b")) cKey = encoding.EncodeBinary(nil, Key("c")) aKeys = []Key{ aKey, mvccEncodeKey(aKey, makeTS(2E9, 0)), mvccEncodeKey(aKey, makeTS(1E9, 1)), mvccEncodeKey(aKey, makeTS(1E9, 0)), } bKeys = []Key{ bKey, mvccEncodeKey(bKey, makeTS(2E9, 0)), mvccEncodeKey(bKey, makeTS(1E9, 0)), } cKeys = []Key{ mvccEncodeKey(cKey, makeTS(1E9, 0)),
// Delete marks the key deleted and will not return in the next get response. func (mvcc *MVCC) Delete(key Key, timestamp proto.Timestamp, txn *proto.Transaction) error { binKey := encoding.EncodeBinary(nil, key) return mvcc.putInternal(binKey, timestamp, proto.MVCCValue{Deleted: true}, txn) }
// TestRangeSnapshot. func TestRangeSnapshot(t *testing.T) { rng, _, clock, _ := createTestRangeWithClock(t) defer rng.Stop() key1 := []byte("a") key2 := []byte("b") val1 := []byte("1") val2 := []byte("2") val3 := []byte("3") pArgs, pReply := putArgs(key1, val1, 0) pArgs.Timestamp = clock.Now() err := rng.ReadWriteCmd("Put", pArgs, pReply) pArgs, pReply = putArgs(key2, val2, 0) pArgs.Timestamp = clock.Now() err = rng.ReadWriteCmd("Put", pArgs, pReply) gArgs, gReply := getArgs(key1, 0) gArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("Get", gArgs, gReply) if err != nil { t.Fatalf("error : %s", err) } if !bytes.Equal(gReply.Value.Bytes, val1) { t.Fatalf("the value %s in get result does not match the value %s in request", gReply.Value.Bytes, val1) } iscArgs, iscReply := internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0) iscArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply) if err != nil { t.Fatalf("error : %s", err) } snapshotID := iscReply.SnapshotId expectedKey := encoding.EncodeBinary(nil, key1) expectedVal := getSerializedMVCCValue(&proto.Value{Bytes: val1}) if len(iscReply.Rows) != 4 || !bytes.Equal(iscReply.Rows[0].Key, expectedKey) || !bytes.Equal(iscReply.Rows[1].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", iscReply.Rows[1].Value, iscReply.Rows[0].Key, expectedVal, expectedKey) } pArgs, pReply = putArgs(key2, val3, 0) pArgs.Timestamp = clock.Now() err = rng.ReadWriteCmd("Put", pArgs, pReply) // Scan with the previous snapshot will get the old value val2 of key2. iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, snapshotID, 0) iscArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply) if err != nil { t.Fatalf("error : %s", err) } expectedKey = encoding.EncodeBinary(nil, key2) expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val2}) if len(iscReply.Rows) != 4 || !bytes.Equal(iscReply.Rows[2].Key, expectedKey) || !bytes.Equal(iscReply.Rows[3].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey) } snapshotLastKey := iscReply.Rows[3].Key // Create a new snapshot to cover the latest value. iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0) iscArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply) if err != nil { t.Fatalf("error : %s", err) } snapshotID2 := iscReply.SnapshotId expectedKey = encoding.EncodeBinary(nil, key2) expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val3}) // Expect one more mvcc version. if len(iscReply.Rows) != 5 || !bytes.Equal(iscReply.Rows[2].Key, expectedKey) || !bytes.Equal(iscReply.Rows[3].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey) } snapshot2LastKey := iscReply.Rows[4].Key iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshotLastKey), engine.KeyMax, 50, snapshotID, 0) iscArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply) if err != nil { t.Fatalf("error : %s", err) } if len(iscReply.Rows) != 0 { t.Fatalf("error : %d", len(iscReply.Rows)) } iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshot2LastKey), engine.KeyMax, 50, snapshotID2, 0) iscArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply) if err != nil { t.Fatalf("error : %s", err) } if len(iscReply.Rows) != 0 { t.Fatalf("error : %d", len(iscReply.Rows)) } }
// ResolveWriteIntent either commits or aborts (rolls back) an extant // write intent for a given txn according to commit parameter. // ResolveWriteIntent will skip write intents of other txns. // // Transaction epochs deserve a bit of explanation. The epoch for a // transaction is incremented on transaction retry. Transaction retry // is different from abort. Retries occur in SSI transactions when the // commit timestamp is not equal to the proposed transaction // timestamp. This might be because writes to different keys had to // use higher timestamps than expected because of existing, committed // value, or because reads pushed the transaction's commit timestamp // forward. Retries also occur in the event that the txn tries to push // another txn in order to write an intent but fails (i.e. it has // lower priority). // // Because successive retries of a transaction may end up writing to // different keys, the epochs serve to classify which intents get // committed in the event the transaction succeeds (all those with // epoch matching the commit epoch), and which intents get aborted, // even if the transaction succeeds. func (mvcc *MVCC) ResolveWriteIntent(key Key, txn *proto.Transaction) error { if txn == nil { return util.Error("no txn specified") } binKey := encoding.EncodeBinary(nil, key) meta := &proto.MVCCMetadata{} ok, err := GetProto(mvcc.engine, binKey, meta) if err != nil { return err } // For cases where there's no write intent to resolve, or one exists // which we can't resolve, this is a noop. if !ok || meta.Txn == nil || !bytes.Equal(meta.Txn.ID, txn.ID) { return nil } // If we're committing, or if the commit timestamp of the intent has // been moved forward, and if the proposed epoch matches the existing // epoch: update the meta.Txn. For commit, it's set to nil; // otherwise, we update its value. We may have to update the actual // version value (remove old and create new with proper // timestamp-encoded key) if timestamp changed. commit := txn.Status == proto.COMMITTED pushed := txn.Status == proto.PENDING && meta.Txn.Timestamp.Less(txn.Timestamp) if (commit || pushed) && meta.Txn.Epoch == txn.Epoch { // Use a write batch because we may have multiple puts. var batch []interface{} origTimestamp := meta.Timestamp var metaTxn *proto.Transaction if pushed { // keep intent if we're pushing timestamp metaTxn = txn } batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: txn.Timestamp, Txn: metaTxn}) if err != nil { return err } batch = append(batch, batchPut) // If timestamp of value changed, need to rewrite versioned value. // TODO(spencer,tobias): think about a new merge operator for // updating key of intent value to new timestamp instead of // read-then-write. if !origTimestamp.Equal(txn.Timestamp) { origKey := mvccEncodeKey(binKey, origTimestamp) newKey := mvccEncodeKey(binKey, txn.Timestamp) valBytes, err := mvcc.engine.Get(origKey) if err != nil { return err } batch = append(batch, BatchDelete(origKey)) batch = append(batch, BatchPut(proto.RawKeyValue{Key: newKey, Value: valBytes})) } return mvcc.engine.WriteBatch(batch) } // This method shouldn't be called with this instance, but there's // nothing to do if the epochs match and the state is still PENDING. if txn.Status == proto.PENDING && meta.Txn.Epoch == txn.Epoch { return nil } // Otherwise, we're deleting the intent. We must find the next // versioned value and reset the metadata's latest timestamp. If // there are no other versioned values, we delete the metadata // key. Because there are multiple steps here and we want them all // to commit, or none to commit, we schedule them using a write // batch. var batch []interface{} // First clear the intent value. latestKey := mvccEncodeKey(binKey, meta.Timestamp) batch = append(batch, BatchDelete(latestKey)) // Compute the next possible mvcc value for this key. nextKey := NextKey(latestKey) // Compute the last possible mvcc value for this key. endScanKey := encoding.EncodeBinary(nil, NextKey(key)) kvs, err := mvcc.engine.Scan(nextKey, endScanKey, 1) if err != nil { return err } // If there is no other version, we should just clean up the key entirely. if len(kvs) == 0 { batch = append(batch, BatchDelete(binKey)) } else { _, ts, isValue := mvccDecodeKey(kvs[0].Key) if !isValue { return util.Errorf("expected an MVCC value key: %s", kvs[0].Key) } // Update the keyMetadata with the next version. batchPut, err := MakeBatchPutProto(binKey, &proto.MVCCMetadata{Timestamp: ts}) if err != nil { return err } batch = append(batch, batchPut) } return mvcc.engine.WriteBatch(batch) }
// Delete marks the key deleted and will not return in the next get response. func (mvcc *MVCC) Delete(key Key, timestamp proto.Timestamp, txnID []byte) error { binKey := encoding.EncodeBinary(nil, key) return mvcc.putInternal(binKey, timestamp, []byte{valueDeletedPrefix}, txnID) }
// TestRangeSnapshot. func TestRangeSnapshot(t *testing.T) { rng, _, clock, _ := createTestRangeWithClock(t) defer rng.Stop() key1 := "a" key2 := "b" val1 := "1" val2 := "2" val3 := "3" pArgs, pReply := putArgs(key1, val1, 0) pArgs.Timestamp = clock.Now() err := rng.ReadWriteCmd("Put", pArgs, pReply) pArgs, pReply = putArgs(key2, val2, 0) pArgs.Timestamp = clock.Now() err = rng.ReadWriteCmd("Put", pArgs, pReply) gArgs, gReply := getArgs(key1, 0) gArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("Get", gArgs, gReply) if err != nil { t.Fatalf("error : %s", err) } if !bytes.Equal(gReply.Value.Bytes, []byte(val1)) { t.Fatalf("the value %s in get result does not match the value %s in request", gReply.Value.Bytes, []byte(val1)) } irsArgs, irsReply := internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0) irsArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply) if err != nil { t.Fatalf("error : %s", err) } snapshotID := irsReply.SnapshotId var valueNormalPrefix = byte(0) expectedKey := encoding.EncodeBinary(nil, []byte(key1)) expectedVal := bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val1)}, []byte("")) if len(irsReply.Rows) != 4 || !bytes.Equal(irsReply.Rows[0].Key, expectedKey) || !bytes.Equal(irsReply.Rows[1].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", irsReply.Rows[1].Value, irsReply.Rows[0].Key, expectedVal, expectedKey) } pArgs, pReply = putArgs(key2, val3, 0) pArgs.Timestamp = clock.Now() err = rng.ReadWriteCmd("Put", pArgs, pReply) // Scan with the previous snapshot will get the old value val2 of key2. irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, snapshotID, 0) irsArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply) if err != nil { t.Fatalf("error : %s", err) } expectedKey = encoding.EncodeBinary(nil, []byte(key2)) expectedVal = bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val2)}, []byte("")) if len(irsReply.Rows) != 4 || !bytes.Equal(irsReply.Rows[2].Key, expectedKey) || !bytes.Equal(irsReply.Rows[3].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", irsReply.Rows[3].Value, irsReply.Rows[2].Key, expectedVal, expectedKey) } snapshotLastKey := irsReply.Rows[3].Key // Create a new snapshot to cover the latest value. irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0) irsArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply) if err != nil { t.Fatalf("error : %s", err) } snapshotID2 := irsReply.SnapshotId expectedKey = encoding.EncodeBinary(nil, []byte(key2)) expectedVal = bytes.Join([][]byte{[]byte{valueNormalPrefix}, []byte(val3)}, []byte("")) // Expect one more mvcc version. if len(irsReply.Rows) != 5 || !bytes.Equal(irsReply.Rows[2].Key, expectedKey) || !bytes.Equal(irsReply.Rows[3].Value, expectedVal) { t.Fatalf("the value %v of key %v in get result does not match the value %v of key %v in request", irsReply.Rows[3].Value, irsReply.Rows[2].Key, expectedVal, expectedKey) } snapshot2LastKey := irsReply.Rows[4].Key irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(snapshotLastKey), engine.KeyMax, 50, snapshotID, 0) irsArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply) if err != nil { t.Fatalf("error : %s", err) } if len(irsReply.Rows) != 0 { t.Fatalf("error : %d", len(irsReply.Rows)) } irsArgs, irsReply = internalRangeScanArgs(engine.PrefixEndKey(snapshot2LastKey), engine.KeyMax, 50, snapshotID2, 0) irsArgs.Timestamp = clock.Now() err = rng.ReadOnlyCmd("InternalSnapshotCopy", irsArgs, irsReply) if err != nil { t.Fatalf("error : %s", err) } if len(irsReply.Rows) != 0 { t.Fatalf("error : %d", len(irsReply.Rows)) } }