// createTestEngine creates an in-memory engine and initializes some
// default configuration settings.
func createTestEngine(t *testing.T) engine.Engine {
	e := engine.NewInMem(proto.Attributes{Attrs: []string{"dc1", "mem"}}, 1<<20)
	if err := engine.PutProto(e, engine.KeyConfigAccountingPrefix, &testDefaultAcctConfig); err != nil {
		t.Fatal(err)
	}
	if err := engine.PutProto(e, engine.KeyConfigPermissionPrefix, &testDefaultPermConfig); err != nil {
		t.Fatal(err)
	}
	if err := engine.PutProto(e, engine.KeyConfigZonePrefix, &testDefaultZoneConfig); err != nil {
		t.Fatal(err)
	}
	return e
}
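// Illustrative only (not part of the original source): a minimal sketch showing
// that the defaults seeded by createTestEngine can be read back with
// engine.GetProto. The proto.PermConfig type is inferred from the permission
// config usage in the gossip test further below.
func readSeededPermConfig(t *testing.T) {
	e := createTestEngine(t)
	perm := proto.PermConfig{}
	if ok, err := engine.GetProto(e, engine.KeyConfigPermissionPrefix, &perm); err != nil || !ok {
		t.Fatalf("expected seeded permission config: ok=%t, err=%v", ok, err)
	}
}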
// copySeqCache copies the sequence cache entries in the key span
// [keyMin, keyMax) belonging to range srcID into the corresponding
// sequence cache keys for range dstID.
func copySeqCache(e engine.Engine, srcID, dstID roachpb.RangeID, keyMin, keyMax engine.MVCCKey) error {
	var scratch [64]byte
	return e.Iterate(keyMin, keyMax, func(kv engine.MVCCKeyValue) (bool, error) {
		// Decode the key, skipping on error. Otherwise, write it to the
		// corresponding key in the new cache.
		id, epoch, seq, err := decodeSequenceCacheMVCCKey(kv.Key, scratch[:0])
		if err != nil {
			return false, util.Errorf("could not decode a sequence cache key %s: %s", kv.Key, err)
		}
		key := keys.SequenceCacheKey(dstID, id, epoch, seq)
		encKey := engine.MakeMVCCMetadataKey(key)
		// Decode the value, update the checksum and re-encode.
		meta := &engine.MVCCMetadata{}
		if err := proto.Unmarshal(kv.Value, meta); err != nil {
			return false, util.Errorf("could not decode sequence cache value %s [% x]: %s", kv.Key, kv.Value, err)
		}
		value := meta.Value()
		value.ClearChecksum()
		value.InitChecksum(key)
		meta.RawBytes = value.RawBytes
		_, _, err = engine.PutProto(e, encKey, meta)
		return false, err
	})
}
// CopyFrom copies all the cached results from the originRangeID
// response cache into this one. Note that the cache will not be
// locked while copying is in progress. Failures decoding individual
// cache entries return an error. The copy is done directly using the
// engine instead of interpreting values through MVCC for efficiency.
func (rc *ResponseCache) CopyFrom(e engine.Engine, originRangeID proto.RangeID) error {
	prefix := keys.ResponseCacheKey(originRangeID, nil) // response cache prefix
	start := engine.MVCCEncodeKey(prefix)
	end := engine.MVCCEncodeKey(prefix.PrefixEnd())
	return e.Iterate(start, end, func(kv proto.RawKeyValue) (bool, error) {
		// Decode the key into a cmd, skipping on error. Otherwise,
		// write it to the corresponding key in the new cache.
		cmdID, err := rc.decodeResponseCacheKey(kv.Key)
		if err != nil {
			return false, util.Errorf("could not decode a response cache key %s: %s", proto.Key(kv.Key), err)
		}
		key := keys.ResponseCacheKey(rc.rangeID, &cmdID)
		encKey := engine.MVCCEncodeKey(key)
		// Decode the value, update the checksum and re-encode.
		meta := &engine.MVCCMetadata{}
		if err := gogoproto.Unmarshal(kv.Value, meta); err != nil {
			return false, util.Errorf("could not decode response cache value %s [% x]: %s", proto.Key(kv.Key), kv.Value, err)
		}
		meta.Value.Checksum = nil
		meta.Value.InitChecksum(key)
		_, _, err = engine.PutProto(e, encKey, meta)
		return false, err
	})
}
// CopyInto copies all the cached results from this response cache
// into the destRangeID response cache. Failures decoding individual
// cache entries return an error.
func (rc *ResponseCache) CopyInto(e engine.Engine, destRangeID roachpb.RangeID) error {
	start := engine.MVCCEncodeKey(
		keys.ResponseCacheKey(rc.rangeID, roachpb.KeyMin))
	end := engine.MVCCEncodeKey(
		keys.ResponseCacheKey(rc.rangeID, roachpb.KeyMax))
	return e.Iterate(start, end, func(kv engine.MVCCKeyValue) (bool, error) {
		// Decode the key, skipping on error. Otherwise, write it to the
		// corresponding key in the new cache.
		family, err := rc.decodeResponseCacheKey(kv.Key)
		if err != nil {
			return false, util.Errorf("could not decode a response cache key %s: %s", roachpb.Key(kv.Key), err)
		}
		key := keys.ResponseCacheKey(destRangeID, family)
		encKey := engine.MVCCEncodeKey(key)
		// Decode the value, update the checksum and re-encode.
		meta := &engine.MVCCMetadata{}
		if err := proto.Unmarshal(kv.Value, meta); err != nil {
			return false, util.Errorf("could not decode response cache value %s [% x]: %s", roachpb.Key(kv.Key), kv.Value, err)
		}
		meta.Value.Checksum = nil
		meta.Value.InitChecksum(key)
		_, _, err = engine.PutProto(e, encKey, meta)
		return false, err
	})
}
// CreateRange allocates a new range ID and stores range metadata.
// On success, returns the new range.
func (s *Store) CreateRange(startKey, endKey engine.Key, replicas []proto.Replica) (*Range, error) {
	rangeID, err := engine.Increment(s.engine, engine.KeyLocalRangeIDGenerator, 1)
	if err != nil {
		return nil, err
	}
	if ok, _ := engine.GetProto(s.engine, makeRangeKey(rangeID), nil); ok {
		return nil, util.Error("newly allocated range ID already in use")
	}
	// RangeMetadata is stored local to this store only. It is neither
	// replicated via raft nor available via the global kv store.
	meta := &proto.RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeID:   rangeID,
		RangeDescriptor: proto.RangeDescriptor{
			StartKey: startKey,
			EndKey:   endKey,
			Replicas: replicas,
		},
	}
	err = engine.PutProto(s.engine, makeRangeKey(rangeID), meta)
	if err != nil {
		return nil, err
	}
	rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()
	s.mu.Lock()
	defer s.mu.Unlock()
	s.ranges[rangeID] = rng
	return rng, nil
}
// TestRangeGossipConfigWithMultipleKeyPrefixes verifies that multiple
// key prefixes for a config are gossiped.
func TestRangeGossipConfigWithMultipleKeyPrefixes(t *testing.T) {
	e := createTestEngine(t)
	// Add a permission for a new key prefix.
	db1Perm := proto.PermConfig{
		Read:  []string{"spencer", "foo", "bar", "baz"},
		Write: []string{"spencer"},
	}
	key := engine.MakeKey(engine.KeyConfigPermissionPrefix, engine.Key("/db1"))
	if err := engine.PutProto(e, key, &db1Perm); err != nil {
		t.Fatal(err)
	}
	r, g := createTestRange(e, t)
	defer r.Stop()

	info, err := g.GetInfo(gossip.KeyConfigPermission)
	if err != nil {
		t.Fatal(err)
	}
	configMap := info.(PrefixConfigMap)
	expConfigs := []*PrefixConfig{
		&PrefixConfig{engine.KeyMin, nil, &testDefaultPermConfig},
		&PrefixConfig{engine.Key("/db1"), nil, &db1Perm},
		&PrefixConfig{engine.Key("/db2"), engine.KeyMin, &testDefaultPermConfig},
	}
	if !reflect.DeepEqual([]*PrefixConfig(configMap), expConfigs) {
		t.Errorf("expected gossiped configs to be equal %s vs %s", configMap, expConfigs)
	}
}
// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from
// coordinator. Returns the updated transaction.
func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	var txn proto.Transaction
	ok, err := engine.GetProto(r.engine, key, &txn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	// If no existing transaction record was found, initialize
	// to the transaction in the request header.
	if !ok {
		gogoproto.Merge(&txn, args.Txn)
	}
	if txn.Status == proto.PENDING {
		if txn.LastHeartbeat == nil {
			txn.LastHeartbeat = &proto.Timestamp{}
		}
		if txn.LastHeartbeat.Less(args.Header().Timestamp) {
			*txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.PutProto(r.engine, key, &txn); err != nil {
			reply.SetGoError(err)
			return
		}
	}
	reply.Txn = &txn
}
// CreateRange creates a new Range using the provided RangeMetadata.
// It persists the metadata locally and adds the new range to the
// ranges map and sorted rangesByKey slice for doing range lookups
// by key.
func (s *Store) CreateRange(meta *proto.RangeMetadata) (*Range, error) {
	// Set the RangeID for meta based on the replica which matches this store.
	for _, repl := range meta.Replicas {
		if repl.StoreID == s.Ident.StoreID {
			meta.RangeID = repl.RangeID
			break
		}
	}
	if meta.RangeID == 0 {
		return nil, util.Errorf("unable to determine range ID for this range; no replicas match store %d: %s",
			s.Ident.StoreID, meta.Replicas)
	}
	err := engine.PutProto(s.engine, makeRangeKey(meta.RangeID), meta)
	if err != nil {
		return nil, err
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	rng := NewRange(meta, s.clock, s.engine, s.allocator, s.gossip, s)
	rng.Start()
	s.ranges[meta.RangeID] = rng
	s.rangesByKey = append(s.rangesByKey, rng)
	sort.Sort(s.rangesByKey)
	return rng, nil
}
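// Illustrative only (not part of the original source): a hedged sketch of how a
// caller might invoke this version of CreateRange. Field names follow the
// usages above; the range ID and key bounds are made up for the example.
func exampleCreateRange(s *Store) (*Range, error) {
	meta := &proto.RangeMetadata{
		ClusterID: s.Ident.ClusterID,
		RangeDescriptor: proto.RangeDescriptor{
			StartKey: engine.KeyMin,
			EndKey:   engine.KeyMax,
			Replicas: []proto.Replica{
				// CreateRange matches this replica by StoreID to set meta.RangeID.
				{StoreID: s.Ident.StoreID, RangeID: 1},
			},
		},
	}
	return s.CreateRange(meta)
}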
// Bootstrap writes a new store ident to the underlying engine. To
// ensure that no crufty data already exists in the engine, it scans
// the engine contents before writing the new store ident. The engine
// should be completely empty. It returns an error if called on a
// non-empty engine.
func (s *Store) Bootstrap(ident proto.StoreIdent) error {
	s.Ident = ident
	kvs, err := s.engine.Scan(engine.KeyMin, engine.KeyMax, 1 /* only need one entry to fail! */)
	if err != nil {
		return util.Errorf("unable to scan engine to verify empty: %v", err)
	} else if len(kvs) > 0 {
		return util.Errorf("bootstrap failed; non-empty map with first key %q", kvs[0].Key)
	}
	return engine.PutProto(s.engine, engine.KeyLocalIdent, &s.Ident)
}
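// Illustrative only (not part of the original source): a minimal sketch of the
// bootstrap contract described above, assuming (for this example only) that a
// Store can be constructed from just its engine. The first Bootstrap of an
// empty in-memory engine succeeds; a second attempt fails because the store
// ident written by the first call makes the engine non-empty.
func exampleBootstrap() error {
	e := engine.NewInMem(proto.Attributes{Attrs: []string{"mem"}}, 1<<20)
	s := &Store{engine: e} // hypothetical construction for this illustration
	if err := s.Bootstrap(proto.StoreIdent{StoreID: 1}); err != nil {
		return err
	}
	// Bootstrapping again must fail: the engine now contains the store ident.
	if err := s.Bootstrap(proto.StoreIdent{StoreID: 1}); err == nil {
		return util.Error("expected bootstrap of non-empty engine to fail")
	}
	return nil
}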
// TestEndTransactionWithErrors verifies various error conditions
// are checked such as transaction already being committed or
// aborted, or timestamp or epoch regression.
func TestEndTransactionWithErrors(t *testing.T) {
	rng, mc, clock, _ := createTestRangeWithClock(t)
	defer rng.Stop()

	regressTS := clock.Now()
	*mc = hlc.ManualClock(1)
	txn := NewTransaction(engine.Key(""), 1, proto.SERIALIZABLE, clock)

	testCases := []struct {
		key          engine.Key
		existStatus  proto.TransactionStatus
		existEpoch   int32
		existTS      proto.Timestamp
		expErrRegexp string
	}{
		{engine.Key("a"), proto.COMMITTED, txn.Epoch, txn.Timestamp, "txn {.*}: already committed"},
		{engine.Key("b"), proto.ABORTED, txn.Epoch, txn.Timestamp, "txn {.*}: already aborted"},
		{engine.Key("c"), proto.PENDING, txn.Epoch + 1, txn.Timestamp, "txn {.*}: epoch regression: 0"},
		{engine.Key("d"), proto.PENDING, txn.Epoch, regressTS, "txn {.*}: timestamp regression: {WallTime:1 Logical:0 .*}"},
	}
	for _, test := range testCases {
		// Establish existing txn state by writing directly to range engine.
		var existTxn proto.Transaction
		gogoproto.Merge(&existTxn, txn)
		existTxn.ID = test.key
		existTxn.Status = test.existStatus
		existTxn.Epoch = test.existEpoch
		existTxn.Timestamp = test.existTS
		txnKey := engine.MakeKey(engine.KeyLocalTransactionPrefix, test.key)
		if err := engine.PutProto(rng.engine, txnKey, &existTxn); err != nil {
			t.Fatal(err)
		}

		// End the transaction, verify expected error.
		txn.ID = test.key
		args, reply := endTxnArgs(txn, true, 0)
		args.Timestamp = txn.Timestamp
		err := rng.ReadWriteCmd("EndTransaction", args, reply)
		if err == nil {
			t.Errorf("expected error matching %q", test.expErrRegexp)
		} else {
			if matched, regexpErr := regexp.MatchString(test.expErrRegexp, err.Error()); !matched || regexpErr != nil {
				t.Errorf("expected error to match %q (%v): %v", test.expErrRegexp, regexpErr, err.Error())
			}
		}
	}
}
// copySeqCache copies the abort cache entries in the key span
// [keyMin, keyMax) belonging to range srcID into the corresponding
// keys for range dstID. It returns the number of entries copied and,
// if ms is non-nil, updates it with the system bytes and counts written.
func copySeqCache(
	e engine.Engine, ms *engine.MVCCStats,
	srcID, dstID roachpb.RangeID,
	keyMin, keyMax engine.MVCCKey,
) (int, error) {
	var scratch [64]byte
	var count int
	var meta engine.MVCCMetadata
	// TODO(spencer): look into making this an MVCCIteration and writing
	// the values using MVCC so we can avoid the ugliness of updating
	// the MVCCStats by hand below.
	err := e.Iterate(keyMin, keyMax, func(kv engine.MVCCKeyValue) (bool, error) {
		// Decode the key, skipping on error. Otherwise, write it to the
		// corresponding key in the new cache.
		txnID, err := decodeAbortCacheMVCCKey(kv.Key, scratch[:0])
		if err != nil {
			return false, util.Errorf("could not decode an abort cache key %s: %s", kv.Key, err)
		}
		key := keys.AbortCacheKey(dstID, txnID)
		encKey := engine.MakeMVCCMetadataKey(key)
		// Decode the MVCCMetadata value.
		if err := proto.Unmarshal(kv.Value, &meta); err != nil {
			return false, util.Errorf("could not decode mvcc metadata %s [% x]: %s", kv.Key, kv.Value, err)
		}
		value := meta.Value()
		value.ClearChecksum()
		value.InitChecksum(key)
		meta.RawBytes = value.RawBytes
		keyBytes, valBytes, err := engine.PutProto(e, encKey, &meta)
		if err != nil {
			return false, err
		}
		count++
		if ms != nil {
			ms.SysBytes += keyBytes + valBytes
			ms.SysCount++
		}
		return false, nil
	})
	return count, err
}
// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from
// coordinator. The range will return the current status for this
// transaction to the coordinator.
func (r *Range) InternalHeartbeatTxn(args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	var txn proto.Transaction
	if _, err := engine.GetProto(r.engine, key, &txn); err != nil {
		reply.SetGoError(err)
		return
	}
	if txn.Status == proto.PENDING {
		if !args.Header().Timestamp.Less(txn.LastHeartbeat) {
			txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.PutProto(r.engine, key, &txn); err != nil {
			reply.SetGoError(err)
			return
		}
	}
	reply.Status = txn.Status
}
// PutResponse writes a response to the cache for the specified cmdID.
// The inflight entry corresponding to cmdID is removed from the
// inflight map. Any requests waiting on the outcome of the inflight
// command will be signaled to wake up and read the command response
// from the cache.
func (rc *ResponseCache) PutResponse(cmdID proto.ClientCmdID, reply interface{}) error {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return nil
	}
	// Write the response value to the engine.
	key := rc.makeKey(cmdID)
	rwResp := &proto.ReadWriteCmdResponse{}
	rwResp.SetValue(reply)
	err := engine.PutProto(rc.engine, key, rwResp)

	// Take lock after writing response to cache!
	rc.Lock()
	defer rc.Unlock()

	// Even on error, we remove the entry from the inflight map.
	rc.removeInflightLocked(cmdID)

	return err
}
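// Illustrative only (not part of the original source): a hedged sketch of
// caching a reply once a command has been applied. The ClientCmdID field names
// (WallTime, Random) and the use of proto.PutResponse as the reply type are
// assumptions for this example, not confirmed by the snippet above.
func examplePutResponse(rc *ResponseCache) error {
	cmdID := proto.ClientCmdID{WallTime: 1, Random: 42} // assumed field names
	return rc.PutResponse(cmdID, &proto.PutResponse{})  // assumed reply type
}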
// EndTransaction either commits or aborts (rolls back) an extant
// transaction according to the args.Commit parameter.
func (r *Range) EndTransaction(args *proto.EndTransactionRequest, reply *proto.EndTransactionResponse) {
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)
	// Start with supplied transaction, then possibly load from txn record.
	reply.Txn = gogoproto.Clone(args.Txn).(*proto.Transaction)

	// Fetch existing transaction if possible.
	existTxn := &proto.Transaction{}
	ok, err := engine.GetProto(r.engine, key, existTxn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	// If the transaction record already exists, verify that we can either
	// commit it or abort it (according to args.Commit), and also that the
	// Timestamp and Epoch have not suffered regression.
	if ok {
		if existTxn.Status == proto.COMMITTED {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already committed"))
			return
		} else if existTxn.Status == proto.ABORTED {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, "already aborted"))
			return
		} else if args.Txn.Epoch < existTxn.Epoch {
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("epoch regression: %d", args.Txn.Epoch)))
			return
		} else if existTxn.Timestamp.Less(args.Txn.Timestamp) {
			// The transaction record can only ever be pushed forward, so it's an
			// error if somehow the transaction record has an earlier timestamp
			// than the transaction timestamp.
			reply.SetGoError(proto.NewTransactionStatusError(existTxn, fmt.Sprintf("timestamp regression: %+v", args.Txn.Timestamp)))
			return
		}
		// Use the persisted transaction record as final transaction.
		gogoproto.Merge(reply.Txn, existTxn)
	}

	// Take max of requested timestamp and possibly "pushed" txn
	// record timestamp as the final commit timestamp.
	if reply.Txn.Timestamp.Less(args.Timestamp) {
		reply.Txn.Timestamp = args.Timestamp
	}

	// Set transaction status to COMMITTED or ABORTED as per the
	// args.Commit parameter.
	if args.Commit {
		// If the isolation level is SERIALIZABLE, return a transaction
		// retry error if the commit timestamp isn't equal to the txn
		// timestamp.
		if args.Txn.Isolation == proto.SERIALIZABLE && !reply.Txn.Timestamp.Equal(args.Txn.Timestamp) {
			reply.SetGoError(proto.NewTransactionRetryError(reply.Txn))
			return
		}
		reply.Txn.Status = proto.COMMITTED
	} else {
		reply.Txn.Status = proto.ABORTED
	}

	// Persist the transaction record with updated status (& possibly timestamp).
	if err := engine.PutProto(r.engine, key, reply.Txn); err != nil {
		reply.SetGoError(err)
		return
	}
}
// InternalPushTxn resolves conflicts between concurrent txns (or
// between a non-transactional reader or writer and a txn) in several
// ways depending on the statuses and priorities of the conflicting
// transactions. The InternalPushTxn operation is invoked by a
// "pusher" (the writer trying to abort a conflicting txn or the
// reader trying to push a conflicting txn's commit timestamp
// forward), who attempts to resolve a conflict with a "pushee"
// (args.PusheeTxn -- the pushee txn whose intent(s) caused the
// conflict).
//
// Txn already committed/aborted: If pushee txn is committed or
// aborted return success.
//
// Txn Timeout: If pushee txn entry isn't present or its LastHeartbeat
// timestamp isn't set, use PusheeTxn.Timestamp as LastHeartbeat. If
// current time - LastHeartbeat > 2 * DefaultHeartbeatInterval, then
// the pushee txn should be either pushed forward or aborted,
// depending on the value of args.Abort.
//
// Old Txn Epoch: If persisted pushee txn entry has a newer Epoch than
// PusheeTxn.Epoch, return success, as older epoch may be removed.
//
// Lower Txn Priority: If pushee txn has a lower priority than pusher,
// adjust pushee's persisted txn depending on value of args.Abort. If
// args.Abort is true, set txn.Status to ABORTED, and priority to one
// less than the pusher's priority and return success. If args.Abort
// is false, set txn.Timestamp to pusher's txn.Timestamp + 1.
//
// Higher Txn Priority: If pushee txn has a higher priority than
// pusher, return TransactionRetryError. Transaction will be retried
// with priority one less than the pushee's higher priority.
func (r *Range) InternalPushTxn(args *proto.InternalPushTxnRequest, reply *proto.InternalPushTxnResponse) {
	if !bytes.Equal(args.Key, args.PusheeTxn.ID) {
		reply.SetGoError(util.Errorf("request key %q should match pushee's txn ID %q", args.Key, args.PusheeTxn.ID))
		return
	}
	// Create the actual key to the system-local transaction table.
	key := engine.MakeKey(engine.KeyLocalTransactionPrefix, args.Key)

	// Fetch existing transaction if possible.
	existTxn := &proto.Transaction{}
	ok, err := engine.GetProto(r.engine, key, existTxn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	if ok {
		// Start with the persisted transaction record as final transaction.
		reply.PusheeTxn = gogoproto.Clone(existTxn).(*proto.Transaction)
		// Upgrade the epoch and timestamp as necessary.
		if reply.PusheeTxn.Epoch < args.PusheeTxn.Epoch {
			reply.PusheeTxn.Epoch = args.PusheeTxn.Epoch
		}
		if reply.PusheeTxn.Timestamp.Less(args.PusheeTxn.Timestamp) {
			reply.PusheeTxn.Timestamp = args.PusheeTxn.Timestamp
		}
	} else {
		// Some sanity checks for case where we don't find a transaction record.
		if args.PusheeTxn.LastHeartbeat != nil {
			reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn, "no txn persisted, yet intent has heartbeat"))
			return
		} else if args.PusheeTxn.Status != proto.PENDING {
			reply.SetGoError(proto.NewTransactionStatusError(&args.PusheeTxn,
				fmt.Sprintf("no txn persisted, yet intent has status %s", args.PusheeTxn.Status)))
			return
		}
		// The transaction doesn't exist yet on disk; use the supplied version.
		reply.PusheeTxn = gogoproto.Clone(&args.PusheeTxn).(*proto.Transaction)
	}

	// If already committed or aborted, return success.
	if reply.PusheeTxn.Status != proto.PENDING {
		// Trivial noop.
		return
	}
	// If we're trying to move the timestamp forward, and it's already
	// far enough forward, return success.
	if !args.Abort && args.Timestamp.Less(reply.PusheeTxn.Timestamp) {
		// Trivial noop.
		return
	}

	// pusherWins is set to true if the pusher prevails.
	var pusherWins bool
	// Check for txn timeout.
	if reply.PusheeTxn.LastHeartbeat == nil {
		reply.PusheeTxn.LastHeartbeat = &reply.PusheeTxn.Timestamp
	}
	// Compute heartbeat expiration.
	expiry := r.clock.Now()
	expiry.WallTime -= 2 * DefaultHeartbeatInterval.Nanoseconds()
	if reply.PusheeTxn.LastHeartbeat.Less(expiry) {
		log.V(1).Infof("pushing expired txn %+v", reply.PusheeTxn)
		pusherWins = true
	} else if args.PusheeTxn.Epoch < reply.PusheeTxn.Epoch {
		// Check for an intent from a prior epoch.
		log.V(1).Infof("pushing intent from previous epoch for txn %+v", reply.PusheeTxn)
		pusherWins = true
	} else if reply.PusheeTxn.Priority < args.Txn.Priority ||
		(reply.PusheeTxn.Priority == args.Txn.Priority && args.Txn.Timestamp.Less(reply.PusheeTxn.Timestamp)) {
		// Finally, choose based on priority; if priorities are equal, order by lower txn timestamp.
		log.V(1).Infof("pushing intent from txn with lower priority %+v vs %+v", reply.PusheeTxn, args.Txn)
		pusherWins = true
	}

	if !pusherWins {
		log.V(1).Infof("failed to push intent %+v vs %+v", reply.PusheeTxn, args.Txn)
		reply.SetGoError(proto.NewTransactionRetryError(reply.PusheeTxn))
		return
	}

	// If aborting transaction, set new status and return success.
	if args.Abort {
		reply.PusheeTxn.Status = proto.ABORTED
	} else {
		// Otherwise, update timestamp to be one greater than the request's timestamp.
		reply.PusheeTxn.Timestamp = args.Timestamp
		reply.PusheeTxn.Timestamp.Logical++
	}
	// Persist the pushed transaction.
	if err := engine.PutProto(r.engine, key, reply.PusheeTxn); err != nil {
		reply.SetGoError(err)
		return
	}
}
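// Illustrative only (not part of the original source): the pusher-wins decision
// above restated as a standalone helper for clarity. It mirrors the three
// conditions checked by InternalPushTxn: an expired heartbeat, an intent from a
// prior epoch, and a priority comparison with timestamp as the tie-breaker.
// intentEpoch corresponds to args.PusheeTxn.Epoch, the epoch recorded in the
// conflicting intent.
func pusherWinsIllustration(now proto.Timestamp, pusher, pushee proto.Transaction, intentEpoch int32) bool {
	// Absent a heartbeat, treat the pushee's txn timestamp as its last heartbeat.
	lastHeartbeat := pushee.Timestamp
	if pushee.LastHeartbeat != nil {
		lastHeartbeat = *pushee.LastHeartbeat
	}
	// Heartbeats older than twice the heartbeat interval are considered expired.
	expiry := now
	expiry.WallTime -= 2 * DefaultHeartbeatInterval.Nanoseconds()
	switch {
	case lastHeartbeat.Less(expiry):
		return true // pushee's heartbeat has expired
	case intentEpoch < pushee.Epoch:
		return true // the conflicting intent is from a prior epoch
	case pushee.Priority < pusher.Priority,
		pushee.Priority == pusher.Priority && pusher.Timestamp.Less(pushee.Timestamp):
		return true // pusher has higher priority, or equal priority and an older timestamp
	}
	return false
}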