// NewPrefixConfigMap creates a new prefix config map and sorts
// the entries by key prefix and then adds additional entries to mark
// the ends of each key prefix range. For example, if the map
// contains entries for:
//
//	"/":          config1
//	"/db1":       config2
//	"/db1/table": config3
//	"/db3":       config4
//
// ...then entries will be added for:
//
//	"/db1/tablf": config2
//	"/db2":       config1
//	"/db4":       config1
//
// These additional entries allow for simple lookups by prefix and
// provide a way to split a range by prefixes which affect it. This
// last is necessary for zone configs; ranges must not span zone
// config boundaries.
func NewPrefixConfigMap(configs []*PrefixConfig) (PrefixConfigMap, error) {
	p := PrefixConfigMap(configs)
	sort.Sort(p)

	if len(p) == 0 || !bytes.Equal(p[0].Prefix, engine.KeyMin) {
		return nil, util.Errorf("no default prefix specified")
	}

	var newConfigs []*PrefixConfig
	stack := list.New()

	for _, entry := range p {
		// Pop entries from the stack which aren't prefixes.
		for stack.Len() > 0 && !bytes.HasPrefix(entry.Prefix, stack.Back().Value.(*PrefixConfig).Prefix) {
			stack.Remove(stack.Back())
		}
		if stack.Len() != 0 {
			newConfigs = append(newConfigs, &PrefixConfig{
				Prefix:    engine.PrefixEndKey(entry.Prefix),
				Canonical: stack.Back().Value.(*PrefixConfig).Prefix,
				Config:    stack.Back().Value.(*PrefixConfig).Config,
			})
		}
		stack.PushBack(entry)
	}

	// Add newly created configs and re-sort.
	p = append(p, newConfigs...)
	sort.Sort(p)

	return p, nil
}
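// The sketch below illustrates why the boundary entries matter: once
// they exist, the config governing any key is found with a single
// binary search for the last entry whose prefix sorts at or below the
// key. This is an illustrative helper, not part of the package API.
func lookupPrefixConfigSketch(p PrefixConfigMap, key engine.Key) *PrefixConfig {
	// Find the first entry whose prefix sorts strictly after key; the
	// entry just before it governs key. Because the map always contains
	// an entry for engine.KeyMin, n is never 0 for a valid map.
	n := sort.Search(len(p), func(i int) bool {
		return bytes.Compare(p[i].Prefix, key) > 0
	})
	if n == 0 {
		return nil // unreachable for a map built by NewPrefixConfigMap
	}
	return p[n-1]
}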
// createTestStore creates a test store using an in-memory
// engine. Returns the store and the store clock's manual unix nanos
// time. If createDefaultRange is true, creates a single range from
// key "a" to key "z" with a default replica descriptor (i.e. StoreID
// = 1). The caller is responsible for closing the store on exit.
func createTestStore(createDefaultRange bool, t *testing.T) (*Store, *hlc.ManualClock) {
	manual := hlc.ManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	eng := engine.NewInMem(proto.Attributes{}, 1<<20)
	store := NewStore(clock, eng, nil, nil)
	if err := store.Bootstrap(proto.StoreIdent{StoreID: 1}); err != nil {
		t.Fatal(err)
	}
	// Set store DB so new ranges can be allocated as needed for tests.
	db, _ := newTestDB(store)
	store.db = db
	// Create system key range for allocations.
	meta := store.BootstrapRangeMetadata()
	meta.StartKey = engine.KeySystemPrefix
	meta.EndKey = engine.PrefixEndKey(engine.KeySystemPrefix)
	if _, err := store.CreateRange(meta); err != nil {
		t.Fatal(err)
	}
	// Now that the system key range is available, initialize the store.
	if err := store.Init(); err != nil {
		t.Fatal(err)
	}
	// If requested, create a default range for tests from "a"-"z".
	if createDefaultRange {
		replica := proto.Replica{StoreID: 1}
		_, err := store.CreateRange(store.NewRangeMetadata(engine.Key("a"), engine.Key("z"), []proto.Replica{replica}))
		if err != nil {
			t.Fatal(err)
		}
	}
	return store, &manual
}
// Get retrieves the zone configuration for the specified key. If the
// key is empty, all zone configurations are returned. Otherwise, the
// leading "/" path delimiter is stripped and the zone configuration
// matching the remainder is retrieved. Note that this will retrieve
// the default zone config if "key" is equal to "/", and will list all
// configs if "key" is equal to "". The body result contains
// JSON-formatted output for a listing of keys and YAML-formatted
// output for retrieval of a zone config.
func (zh *zoneHandler) Get(path string, r *http.Request) (body []byte, contentType string, err error) {
	// Scan all zones if the key is empty.
	if len(path) == 0 {
		sr := <-zh.kvDB.Scan(&storage.ScanRequest{
			RequestHeader: storage.RequestHeader{
				Key:    engine.KeyConfigZonePrefix,
				EndKey: engine.PrefixEndKey(engine.KeyConfigZonePrefix),
				User:   storage.UserRoot,
			},
			MaxResults: maxGetResults,
		})
		if sr.Error != nil {
			err = sr.Error
			return
		}
		if len(sr.Rows) == maxGetResults {
			glog.Warningf("retrieved maximum number of results (%d); some may be missing", maxGetResults)
		}
		var prefixes []string
		for _, kv := range sr.Rows {
			trimmed := bytes.TrimPrefix(kv.Key, engine.KeyConfigZonePrefix)
			prefixes = append(prefixes, url.QueryEscape(string(trimmed)))
		}
		// JSON-encode the prefixes array.
		contentType = "application/json"
		if body, err = json.Marshal(prefixes); err != nil {
			err = util.Errorf("unable to format zone configurations: %v", err)
		}
	} else {
		zoneKey := engine.MakeKey(engine.KeyConfigZonePrefix, engine.Key(path[1:]))
		var ok bool
		config := &storage.ZoneConfig{}
		if ok, _, err = kv.GetI(zh.kvDB, zoneKey, config); err != nil {
			return
		}
		// On get, if there's no zone config for the requested prefix,
		// return a not found error.
		if !ok {
			err = util.Errorf("no config found for key prefix %q", path)
			return
		}
		var out []byte
		if out, err = yaml.Marshal(config); err != nil {
			err = util.Errorf("unable to marshal zone config %+v to yaml: %v", config, err)
			return
		}
		if !utf8.ValidString(string(out)) {
			err = util.Errorf("config contents not valid utf8: %q", out)
			return
		}
		contentType = "text/yaml"
		body = out
	}
	return
}
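// Illustrative calls against this handler, showing the two paths the
// doc comment describes (a sketch; the HTTP wiring and the zh value
// are assumed):
//
//	body, ct, _ := zh.Get("", nil)  // ct == "application/json": JSON array of zone key prefixes
//	body, ct, _ = zh.Get("/", nil)  // ct == "text/yaml": YAML for the default zone config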
func TestLocalKVLookupReplica(t *testing.T) {
	manual := hlc.ManualClock(0)
	clock := hlc.NewClock(manual.UnixNano)
	eng := engine.NewInMem(proto.Attributes{}, 1<<20)
	kv := NewLocalKV()
	db := NewDB(kv, clock)
	store := storage.NewStore(clock, eng, db, nil)
	if err := store.Bootstrap(proto.StoreIdent{StoreID: 1}); err != nil {
		t.Fatal(err)
	}
	kv.AddStore(store)
	meta := store.BootstrapRangeMetadata()
	meta.StartKey = engine.KeySystemPrefix
	meta.EndKey = engine.PrefixEndKey(engine.KeySystemPrefix)
	if _, err := store.CreateRange(meta); err != nil {
		t.Fatal(err)
	}
	if err := store.Init(); err != nil {
		t.Fatal(err)
	}

	// Create two new stores with ranges we care about.
	var s [2]*storage.Store
	ranges := []struct {
		storeID    int32
		start, end engine.Key
	}{
		{2, engine.Key("a"), engine.Key("c")},
		{3, engine.Key("x"), engine.Key("z")},
	}
	for i, rng := range ranges {
		s[i] = storage.NewStore(clock, eng, db, nil)
		s[i].Ident.StoreID = rng.storeID
		replica := proto.Replica{StoreID: rng.storeID}
		_, err := s[i].CreateRange(store.NewRangeMetadata(rng.start, rng.end, []proto.Replica{replica}))
		if err != nil {
			t.Fatal(err)
		}
		kv.AddStore(s[i])
	}

	if r, err := kv.lookupReplica(engine.Key("a"), engine.Key("c")); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	if r, err := kv.lookupReplica(engine.Key("b"), nil); r.StoreID != s[0].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[0].Ident.StoreID, r.StoreID, err)
	}
	// A span which straddles two stores' ranges should fail to look up.
	if r, err := kv.lookupReplica(engine.Key("b"), engine.Key("d")); r != nil || err == nil {
		t.Errorf("expected no replica and an error; got %+v: %v", r, err)
	}
	if r, err := kv.lookupReplica(engine.Key("x"), engine.Key("z")); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
	if r, err := kv.lookupReplica(engine.Key("y"), nil); r.StoreID != s[1].Ident.StoreID || err != nil {
		t.Errorf("expected store %d; got %d: %v", s[1].Ident.StoreID, r.StoreID, err)
	}
}
// TestPrefixEndKey verifies the end keys on prefixes.
func TestPrefixEndKey(t *testing.T) {
	testData := []struct {
		prefix, expEnd engine.Key
	}{
		{engine.KeyMin, engine.KeyMax},
		{engine.Key("0"), engine.Key("1")},
		{engine.Key("a"), engine.Key("b")},
		{engine.Key("db0"), engine.Key("db1")},
		{engine.Key("\xfe"), engine.Key("\xff")},
		{engine.KeyMax, engine.KeyMax},
		{engine.Key("\xff\xff"), engine.Key("\xff\xff")},
	}
	for i, test := range testData {
		if end := engine.PrefixEndKey(test.prefix); !bytes.Equal(end, test.expEnd) {
			t.Errorf("%d: %q end key %q != %q", i, test.prefix, end, test.expEnd)
		}
	}
}
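// A minimal sketch of the semantics the table above exercises, assuming
// engine.Key is a []byte alias, KeyMin is the empty key, and keyMax
// stands in for engine.KeyMax. This is not the engine's actual
// implementation; it simply satisfies the same test cases.
func prefixEndKeySketch(prefix, keyMax []byte) []byte {
	if len(prefix) == 0 {
		return keyMax // KeyMin's range spans everything up to KeyMax.
	}
	end := append([]byte(nil), prefix...)
	// Increment the last byte, carrying past (dropping) 0xff bytes.
	for i := len(end) - 1; i >= 0; i-- {
		if end[i] < 0xff {
			end[i]++
			return end[:i+1]
		}
	}
	// Every byte is 0xff: no larger key exists; return the prefix itself.
	return prefix
}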
// InternalRangeLookup is used to look up RangeDescriptors - a RangeDescriptor
// is a metadata structure which describes the key range and replica locations
// of a distinct range in the cluster.
//
// RangeDescriptors are stored as values in the cockroach cluster's key-value
// store. However, they are always stored using special "Range Metadata keys",
// which are "ordinary" keys with a special prefix prepended. The Range Metadata
// Key for an ordinary key can be generated with the `engine.RangeMetaKey(key)`
// function. The RangeDescriptor for the range which contains a given key can be
// retrieved by generating its Range Metadata Key and dispatching it to
// InternalRangeLookup.
//
// Note that the Range Metadata Key sent to InternalRangeLookup is NOT the key
// at which the desired RangeDescriptor is stored. Instead, this method returns
// the RangeDescriptor stored at the _lowest_ existing key which is _greater_
// than the given key. The returned RangeDescriptor will thus contain the
// ordinary key which was originally used to generate the Range Metadata Key
// sent to InternalRangeLookup.
//
// This method has an important optimization: instead of just returning the
// requested RangeDescriptor, it also returns a slice of additional range
// descriptors immediately consecutive to the desired RangeDescriptor. This is
// intended to serve as a sort of caching pre-fetch, so that the requesting
// nodes can aggressively cache RangeDescriptors which are likely to be desired
// by their current workload.
func (r *Range) InternalRangeLookup(args *InternalRangeLookupRequest, reply *InternalRangeLookupResponse) {
	if err := engine.ValidateRangeMetaKey(args.Key); err != nil {
		reply.Error = err
		return
	}

	rangeCount := int64(args.MaxRanges)
	if rangeCount < 1 {
		reply.Error = util.Errorf(
			"range lookup specified invalid maximum range count %d: must be > 0", rangeCount)
		return
	}

	// We want to search for the metadata key just greater than args.Key. Scan
	// for both the requested key and the keys immediately afterwards, up to
	// MaxRanges. The scan is bounded by the end of the queried metadata level.
	metaPrefix := args.Key[:len(engine.KeyMeta1Prefix)]
	nextKey := engine.NextKey(args.Key)
	kvs, err := r.engine.Scan(nextKey, engine.PrefixEndKey(metaPrefix), rangeCount)
	if err != nil {
		reply.Error = err
		return
	}

	if len(kvs) == 0 {
		// At this point the range has been verified to contain the requested
		// key, but no matching results were returned from the scan. This could
		// indicate a very bad system error, but for now we will just treat it
		// as a retryable Key Mismatch error.
		reply.Error = NewRangeKeyMismatchError(args.Key, args.Key, r.Meta)
		log.Errorf("InternalRangeLookup dispatched to correct range, but no matching RangeDescriptor was found: %s", reply.Error.Error())
		return
	}

	// Decode all scanned range descriptors.
	rds := make([]*RangeDescriptor, 0, len(kvs))
	for i := range kvs {
		rds = append(rds, &RangeDescriptor{})
		if err = gob.NewDecoder(bytes.NewBuffer(kvs[i].Value)).Decode(rds[i]); err != nil {
			reply.Error = err
			return
		}
	}

	reply.Ranges = rds
}
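// A minimal sketch of the "lowest existing key greater than the lookup
// key" rule described above, modeling the metadata index as a sorted
// in-memory slice instead of an engine scan. The names and the parallel
// slices here are hypothetical; they only demonstrate why indexing
// descriptors by a key derived from the range end makes this rule work.
func lookupDescriptorSketch(metaKeys []engine.Key, descs []*RangeDescriptor, lookup engine.Key) *RangeDescriptor {
	// Find the first meta key strictly greater than the lookup key. The
	// descriptor stored there covers the ordinary key that generated the
	// lookup key, since each descriptor is stored past all keys it contains.
	i := sort.Search(len(metaKeys), func(i int) bool {
		return bytes.Compare(metaKeys[i], lookup) > 0
	})
	if i == len(metaKeys) {
		return nil // lookup key sorts beyond the last range's metadata.
	}
	return descs[i]
}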
// loadConfigMap scans the config entries under keyPrefix and
// instantiates/returns a config map. Prefix configuration maps
// include accounting, permissions, and zones.
func (r *Range) loadConfigMap(keyPrefix engine.Key, configI interface{}) (PrefixConfigMap, error) {
	// TODO(spencer): need to make sure range splitting never
	// crosses a configuration map's key prefix.
	kvs, err := r.engine.Scan(keyPrefix, engine.PrefixEndKey(keyPrefix), 0)
	if err != nil {
		return nil, err
	}
	var configs []*PrefixConfig
	for _, kv := range kvs {
		// Instantiate an instance of the config type by unmarshalling
		// the protobuf-encoded value into a new instance of configI's type.
		config := reflect.New(reflect.TypeOf(configI)).Interface().(gogoproto.Message)
		if err := gogoproto.Unmarshal(kv.Value, config); err != nil {
			return nil, util.Errorf("unable to unmarshal config key %s: %v", string(kv.Key), err)
		}
		configs = append(configs, &PrefixConfig{Prefix: bytes.TrimPrefix(kv.Key, keyPrefix), Config: config})
	}
	return NewPrefixConfigMap(configs)
}
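// Example call site (a sketch; assumes ZoneConfig is the protobuf
// message stored under the zone config prefix, as elsewhere in this
// codebase). A zero value is passed so reflection can allocate a fresh
// instance per scanned entry:
//
//	zoneMap, err := r.loadConfigMap(engine.KeyConfigZonePrefix, ZoneConfig{})
//	if err != nil {
//		// handle scan/unmarshal failure
//	}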
// Init starts the engine, sets the GC and reads the StoreIdent.
func (s *Store) Init() error {
	// Close store for idempotency.
	s.Close()

	// Start engine and set garbage collector.
	if err := s.engine.Start(); err != nil {
		return err
	}

	// Create ID allocators.
	s.raftIDAlloc = NewIDAllocator(engine.KeyRaftIDGenerator, s.db, 2, raftIDAllocCount)
	s.rangeIDAlloc = NewIDAllocator(engine.KeyRangeIDGenerator, s.db, 2, rangeIDAllocCount)

	// GCTimeouts method is called each time an engine compaction is
	// underway. It sets minimum timeouts for transaction records and
	// response cache entries.
	s.engine.SetGCTimeouts(func() (minTxnTS, minRCacheTS int64) {
		now := s.clock.Now()
		minTxnTS = 0 // disable GC of transactions until we know minimum write intent age
		minRCacheTS = now.WallTime - GCResponseCacheExpiration.Nanoseconds()
		return
	})

	// Read store ident and return a not-bootstrapped error if necessary.
	ok, err := engine.GetProto(s.engine, engine.KeyLocalIdent, &s.Ident)
	if err != nil {
		return err
	} else if !ok {
		return &NotBootstrappedError{}
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// Scan the range metadata in batches of "rows" entries and
	// instantiate the corresponding ranges.
	start := engine.KeyLocalRangeMetadataPrefix
	end := engine.PrefixEndKey(start)
	const rows = 64
	for {
		kvs, err := s.engine.Scan(start, end, rows)
		if err != nil {
			return err
		}
		for _, kv := range kvs {
			var meta proto.RangeMetadata
			if err := gogoproto.Unmarshal(kv.Value, &meta); err != nil {
				return err
			}
			rng := NewRange(&meta, s.clock, s.engine, s.allocator, s.gossip, s)
			rng.Start()
			s.ranges[meta.RangeID] = rng
			s.rangesByKey = append(s.rangesByKey, rng)
		}
		if len(kvs) < rows {
			break
		}
		start = engine.NextKey(kvs[rows-1].Key)
	}
	// Ensure that ranges are sorted.
	sort.Sort(s.rangesByKey)

	return nil
}
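// The metadata loop above is a paginated scan: fetch up to "rows"
// entries, process them, then resume just past the last returned key
// until a short page signals exhaustion. A generic sketch of the
// pattern follows; the kvPair type and scan signature are hypothetical
// stand-ins for the engine API.
type kvPair struct {
	Key   engine.Key
	Value []byte
}

func scanAll(scan func(start, end engine.Key, max int64) ([]kvPair, error),
	start, end engine.Key, visit func(kvPair) error) error {
	const pageSize = 64
	for {
		kvs, err := scan(start, end, pageSize)
		if err != nil {
			return err
		}
		for _, kv := range kvs {
			if err := visit(kv); err != nil {
				return err
			}
		}
		if len(kvs) < pageSize {
			return nil // short page: nothing left to scan.
		}
		// Resume immediately after the last key returned.
		start = engine.NextKey(kvs[len(kvs)-1].Key)
	}
}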
// TestRangeSnapshot verifies that scans via InternalSnapshotCopy see a
// consistent view: a snapshot ID pins the values visible at creation
// time, even after subsequent writes.
func TestRangeSnapshot(t *testing.T) {
	rng, _, clock, _ := createTestRangeWithClock(t)
	defer rng.Stop()

	key1 := []byte("a")
	key2 := []byte("b")
	val1 := []byte("1")
	val2 := []byte("2")
	val3 := []byte("3")

	pArgs, pReply := putArgs(key1, val1, 0)
	pArgs.Timestamp = clock.Now()
	if err := rng.ReadWriteCmd("Put", pArgs, pReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	pArgs, pReply = putArgs(key2, val2, 0)
	pArgs.Timestamp = clock.Now()
	if err := rng.ReadWriteCmd("Put", pArgs, pReply); err != nil {
		t.Fatalf("error: %s", err)
	}

	gArgs, gReply := getArgs(key1, 0)
	gArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("Get", gArgs, gReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	if !bytes.Equal(gReply.Value.Bytes, val1) {
		t.Fatalf("the value %s in get result does not match the value %s in request",
			gReply.Value.Bytes, val1)
	}

	iscArgs, iscReply := internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	iscArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	snapshotID := iscReply.SnapshotId
	expectedKey := encoding.EncodeBinary(nil, key1)
	expectedVal := getSerializedMVCCValue(&proto.Value{Bytes: val1})
	if len(iscReply.Rows) != 4 ||
		!bytes.Equal(iscReply.Rows[0].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[1].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in scan result does not match the expected value %v of key %v",
			iscReply.Rows[1].Value, iscReply.Rows[0].Key, expectedVal, expectedKey)
	}

	pArgs, pReply = putArgs(key2, val3, 0)
	pArgs.Timestamp = clock.Now()
	if err := rng.ReadWriteCmd("Put", pArgs, pReply); err != nil {
		t.Fatalf("error: %s", err)
	}

	// Scan with the previous snapshot will get the old value val2 of key2.
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, snapshotID, 0)
	iscArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	expectedKey = encoding.EncodeBinary(nil, key2)
	expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val2})
	if len(iscReply.Rows) != 4 ||
		!bytes.Equal(iscReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in scan result does not match the expected value %v of key %v",
			iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshotLastKey := iscReply.Rows[3].Key

	// Create a new snapshot to cover the latest value.
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(engine.KeyLocalPrefix), engine.KeyMax, 50, "", 0)
	iscArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	snapshotID2 := iscReply.SnapshotId
	expectedKey = encoding.EncodeBinary(nil, key2)
	expectedVal = getSerializedMVCCValue(&proto.Value{Bytes: val3})
	// Expect one more mvcc version.
	if len(iscReply.Rows) != 5 ||
		!bytes.Equal(iscReply.Rows[2].Key, expectedKey) ||
		!bytes.Equal(iscReply.Rows[3].Value, expectedVal) {
		t.Fatalf("the value %v of key %v in scan result does not match the expected value %v of key %v",
			iscReply.Rows[3].Value, iscReply.Rows[2].Key, expectedVal, expectedKey)
	}
	snapshot2LastKey := iscReply.Rows[4].Key

	// Resume each snapshot scan past its last key; both should be exhausted.
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshotLastKey), engine.KeyMax, 50, snapshotID, 0)
	iscArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	if len(iscReply.Rows) != 0 {
		t.Fatalf("expected no rows; got %d", len(iscReply.Rows))
	}
	iscArgs, iscReply = internalSnapshotCopyArgs(engine.PrefixEndKey(snapshot2LastKey), engine.KeyMax, 50, snapshotID2, 0)
	iscArgs.Timestamp = clock.Now()
	if err := rng.ReadOnlyCmd("InternalSnapshotCopy", iscArgs, iscReply); err != nil {
		t.Fatalf("error: %s", err)
	}
	if len(iscReply.Rows) != 0 {
		t.Fatalf("expected no rows; got %d", len(iscReply.Rows))
	}
}