// TestRemoveRangeWithoutGC ensures that we do not panic when a
// replica has been removed but not yet GC'd (and therefore
// does not have an active raft group).
func TestRemoveRangeWithoutGC(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	// Disable the GC queue and move the range from store 0 to 1.
	mtc.stores[0].DisableReplicaGCQueue(true)
	const rangeID roachpb.RangeID = 1
	mtc.replicateRange(rangeID, 1)
	mtc.unreplicateRange(rangeID, 0)

	// Wait for store 0 to process the removal.
	util.SucceedsWithin(t, time.Second, func() error {
		rep, err := mtc.stores[0].GetReplica(rangeID)
		if err != nil {
			return err
		}
		desc := rep.Desc()
		if len(desc.Replicas) != 1 {
			return util.Errorf("range has %d replicas", len(desc.Replicas))
		}
		return nil
	})

	// The replica's data is still on disk even though the Replica
	// object is removed.
	var desc roachpb.RangeDescriptor
	descKey := keys.RangeDescriptorKey(roachpb.RKeyMin)
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey,
		mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil {
		t.Fatal(err)
	} else if !ok {
		t.Fatal("expected range descriptor to be present")
	}

	// Stop and restart the store to reset the replica's raftGroup
	// pointer to nil. As long as the store has not been restarted it
	// can continue to use its last known replica ID.
	mtc.stopStore(0)
	mtc.restartStore(0)
	// Turn off the GC queue to ensure that the replica is deleted at
	// startup instead of by the scanner. This is not 100% guaranteed
	// since the scanner could have already run at this point, but it
	// should be enough to prevent us from accidentally relying on the
	// scanner.
	mtc.stores[0].DisableReplicaGCQueue(true)

	// The Replica object is not recreated.
	if _, err := mtc.stores[0].GetReplica(rangeID); err == nil {
		t.Fatalf("expected replica to be missing")
	}

	// And the data is no longer on disk.
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey,
		mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil {
		t.Fatal(err)
	} else if ok {
		t.Fatal("expected range descriptor to be absent")
	}
}

// TestReplicateRange verifies basic replication functionality by creating two stores
// and a range, replicating the range to the second store, and reading its data there.
func TestReplicateRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := multiTestContext{}
	mtc.Start(t, 2)
	defer mtc.Stop()

	// Issue a command on the first node before replicating.
	incArgs, incResp := incrementArgs([]byte("a"), 5, 1, mtc.stores[0].StoreID())
	if err := mtc.stores[0].ExecuteCmd(context.Background(), proto.Call{Args: incArgs, Reply: incResp}); err != nil {
		t.Fatal(err)
	}

	rng, err := mtc.stores[0].GetRange(1)
	if err != nil {
		t.Fatal(err)
	}

	if err := rng.ChangeReplicas(proto.ADD_REPLICA, proto.Replica{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	}); err != nil {
		t.Fatal(err)
	}
	// Verify no intent remains on range descriptor key.
	key := keys.RangeDescriptorKey(rng.Desc().StartKey)
	desc := proto.RangeDescriptor{}
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &desc); !ok || err != nil {
		t.Fatalf("fetching range descriptor yielded %t, %s", ok, err)
	}
	// Verify that in time, no intents remain on meta addressing
	// keys, and that range descriptor on the meta records is correct.
	util.SucceedsWithin(t, 1*time.Second, func() error {
		meta2 := keys.RangeMetaKey(proto.KeyMax)
		meta1 := keys.RangeMetaKey(meta2)
		for _, key := range []proto.Key{meta2, meta1} {
			metaDesc := proto.RangeDescriptor{}
			if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &metaDesc); !ok || err != nil {
				return util.Errorf("failed to resolve %s", key)
			}
			if !reflect.DeepEqual(metaDesc, desc) {
				return util.Errorf("descs not equal: %+v != %+v", metaDesc, desc)
			}
		}
		return nil
	})

	// Verify that the same data is available on the replica.
	util.SucceedsWithin(t, 1*time.Second, func() error {
		getArgs, getResp := getArgs([]byte("a"), 1, mtc.stores[1].StoreID())
		getArgs.ReadConsistency = proto.INCONSISTENT
		if err := mtc.stores[1].ExecuteCmd(context.Background(), proto.Call{Args: getArgs, Reply: getResp}); err != nil {
			return util.Errorf("failed to read data")
		}
		if v := mustGetInteger(getResp.Value); v != 5 {
			return util.Errorf("failed to read correct data: %d", v)
		}
		return nil
	})
}

// TestReplicateRange verifies basic replication functionality by creating two stores
// and a range, replicating the range to the second store, and reading its data there.
func TestReplicateRange(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	// Issue a command on the first node before replicating.
	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil {
		t.Fatal(err)
	}

	rng, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	if err := rng.ChangeReplicas(roachpb.ADD_REPLICA, roachpb.ReplicaDescriptor{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	}, rng.Desc()); err != nil {
		t.Fatal(err)
	}
	// Verify no intent remains on range descriptor key.
	key := keys.RangeDescriptorKey(rng.Desc().StartKey)
	desc := roachpb.RangeDescriptor{}
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key, mtc.stores[0].Clock().Now(), true, nil, &desc); !ok || err != nil {
		t.Fatalf("fetching range descriptor yielded %t, %s", ok, err)
	}
	// Verify that in time, no intents remain on meta addressing
	// keys, and that range descriptor on the meta records is correct.
	util.SucceedsWithin(t, 1*time.Second, func() error {
		meta2 := keys.Addr(keys.RangeMetaKey(roachpb.RKeyMax))
		meta1 := keys.Addr(keys.RangeMetaKey(meta2))
		for _, key := range []roachpb.RKey{meta2, meta1} {
			metaDesc := roachpb.RangeDescriptor{}
			if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key.AsRawKey(), mtc.stores[0].Clock().Now(), true, nil, &metaDesc); !ok || err != nil {
				return util.Errorf("failed to resolve %s", key.AsRawKey())
			}
			if !reflect.DeepEqual(metaDesc, desc) {
				return util.Errorf("descs not equal: %+v != %+v", metaDesc, desc)
			}
		}
		return nil
	})

	// Verify that the same data is available on the replica.
	util.SucceedsWithin(t, replicaReadTimeout, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := client.SendWrappedWith(rg1(mtc.stores[1]), nil, roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, &getArgs); err != nil {
			return util.Errorf("failed to read data: %s", err)
		} else if e, v := int64(5), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return util.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})
}

// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from
// coordinator. Returns the updated transaction.
func (r *Range) InternalHeartbeatTxn(batch engine.Engine, ms *engine.MVCCStats, args proto.InternalHeartbeatTxnRequest) (proto.InternalHeartbeatTxnResponse, error) {
	var reply proto.InternalHeartbeatTxnResponse

	key := keys.TransactionKey(args.Txn.Key, args.Txn.ID)

	var txn proto.Transaction
	if ok, err := engine.MVCCGetProto(batch, key, proto.ZeroTimestamp, true, nil, &txn); err != nil {
		return reply, err
	} else if !ok {
		// If no existing transaction record was found, initialize to a
		// shallow copy of the transaction in the request header. We copy
		// to avoid mutating the original below.
		txn = *args.Txn
	}

	if txn.Status == proto.PENDING {
		if txn.LastHeartbeat == nil {
			txn.LastHeartbeat = &proto.Timestamp{}
		}
		if txn.LastHeartbeat.Less(args.Header().Timestamp) {
			*txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.MVCCPutProto(batch, ms, key, proto.ZeroTimestamp, nil, &txn); err != nil {
			return reply, err
		}
	}

	reply.Txn = &txn
	return reply, nil
}

// InternalHeartbeatTxn updates the transaction status and heartbeat
// timestamp after receiving transaction heartbeat messages from
// coordinator. Returns the updated transaction.
func (r *Range) InternalHeartbeatTxn(batch engine.Engine, ms *engine.MVCCStats, args *proto.InternalHeartbeatTxnRequest, reply *proto.InternalHeartbeatTxnResponse) {
	key := keys.TransactionKey(args.Txn.Key, args.Txn.ID)

	var txn proto.Transaction
	ok, err := engine.MVCCGetProto(batch, key, proto.ZeroTimestamp, true, nil, &txn)
	if err != nil {
		reply.SetGoError(err)
		return
	}
	// If no existing transaction record was found, initialize
	// to the transaction in the request header.
	if !ok {
		gogoproto.Merge(&txn, args.Txn)
	}

	if txn.Status == proto.PENDING {
		if txn.LastHeartbeat == nil {
			txn.LastHeartbeat = &proto.Timestamp{}
		}
		if txn.LastHeartbeat.Less(args.Header().Timestamp) {
			*txn.LastHeartbeat = args.Header().Timestamp
		}
		if err := engine.MVCCPutProto(batch, ms, key, proto.ZeroTimestamp, nil, &txn); err != nil {
			reply.SetGoError(err)
			return
		}
	}

	reply.Txn = &txn
}

// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}

	return hs, cs, nil
}

// loadLeaderLease loads the leader lease for the specified range from
// the provided engine.
func loadLeaderLease(eng engine.Engine, raftID proto.RaftID) (*proto.Lease, error) {
	lease := &proto.Lease{}
	if _, err := engine.MVCCGetProto(eng, keys.RaftLeaderLeaseKey(raftID), proto.ZeroTimestamp, true, nil, lease); err != nil {
		return nil, err
	}
	return lease, nil
}

// readBootstrapInfoLocked reads gossip bootstrap info from all known
// stores, initializes bi with the most recent copy, and rewrites any
// store holding a stale copy. It requires that the stores mutex is held.
func (ls *Stores) readBootstrapInfoLocked(bi *gossip.BootstrapInfo) error {
	latestTS := roachpb.ZeroTimestamp
	timestamps := map[roachpb.StoreID]roachpb.Timestamp{}

	// Find the most recent bootstrap info, collecting timestamps for
	// each store along the way.
	for id, s := range ls.storeMap {
		var storeBI gossip.BootstrapInfo
		ok, err := engine.MVCCGetProto(s.engine, keys.StoreGossipKey(), roachpb.ZeroTimestamp, true, nil, &storeBI)
		if err != nil {
			return err
		}
		timestamps[id] = storeBI.Timestamp
		if ok && latestTS.Less(storeBI.Timestamp) {
			latestTS = storeBI.Timestamp
			*bi = storeBI
		}
	}

	// Update all stores with an earlier timestamp.
	for id, s := range ls.storeMap {
		if timestamps[id].Less(latestTS) {
			if err := engine.MVCCPutProto(s.engine, nil, keys.StoreGossipKey(), roachpb.ZeroTimestamp, nil, bi); err != nil {
				return err
			}
			log.Infof("updated gossip bootstrap info to %s", s)
		}
	}

	ls.biLatestTS = latestTS
	return nil
}

// raftTruncatedStateLocked returns metadata about the log that preceded the
// first current entry. This includes both entries that have been compacted away
// and the dummy entries that make up the starting point of an empty log.
// raftTruncatedStateLocked requires that the replica lock be held.
func (r *Replica) raftTruncatedStateLocked() (roachpb.RaftTruncatedState, error) {
	if r.mu.truncatedState != nil {
		return *r.mu.truncatedState, nil
	}
	ts := roachpb.RaftTruncatedState{}
	ok, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftTruncatedStateKey(r.RangeID),
		roachpb.ZeroTimestamp, true, nil, &ts)
	if err != nil {
		return ts, err
	}
	if !ok {
		if r.isInitializedLocked() {
			// If we created this range, set the initial log index/term.
			ts.Index = raftInitialLogIndex
			ts.Term = raftInitialLogTerm
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			ts.Index = 0
			ts.Term = 0
		}
	}
	if ts.Index != 0 {
		r.mu.truncatedState = &ts
	}
	return ts, nil
}

// raftTruncatedState returns metadata about the log that preceded the first
// current entry. This includes both entries that have been compacted away
// and the dummy entries that make up the starting point of an empty log.
func (r *Replica) raftTruncatedState() (proto.RaftTruncatedState, error) {
	if ts := r.getCachedTruncatedState(); ts != nil {
		return *ts, nil
	}
	ts := proto.RaftTruncatedState{}
	ok, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftTruncatedStateKey(r.Desc().RangeID),
		proto.ZeroTimestamp, true, nil, &ts)
	if err != nil {
		return ts, err
	}
	if !ok {
		if r.isInitialized() {
			// If we created this range, set the initial log index/term.
			ts.Index = raftInitialLogIndex
			ts.Term = raftInitialLogTerm
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			ts.Index = 0
			ts.Term = 0
		}
	}
	if ts.Index != 0 {
		r.setCachedTruncatedState(&ts)
	}
	return ts, nil
}

// Snapshot implements the raft.Storage interface.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.rm.NewSnapshot()
	defer snap.Close()
	var snapData proto.RaftSnapshotData

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndex(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	var desc proto.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.Desc().StartKey),
		r.rm.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the response cache.
	for iter := newRangeDataIterator(r.Desc(), snap); iter.Valid(); iter.Next() {
		snapData.KV = append(snapData.KV,
			&proto.RaftSnapshotData_KeyValue{Key: iter.Key(), Value: iter.Value()})
	}

	data, err := gogoproto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}

// raftTruncatedState loads the truncated state for the specified range,
// returning a zero value if none has been persisted.
func raftTruncatedState(
	eng engine.Reader, rangeID roachpb.RangeID,
) (roachpb.RaftTruncatedState, error) {
	ts := roachpb.RaftTruncatedState{}
	_, err := engine.MVCCGetProto(context.Background(), eng, keys.RaftTruncatedStateKey(rangeID),
		hlc.ZeroTimestamp, true, nil, &ts)
	return ts /* zero if not found */, err
}

// loadGCThreshold loads the GC threshold timestamp for the specified
// range, returning a zero timestamp if none has been persisted.
func loadGCThreshold(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (hlc.Timestamp, error) {
	var t hlc.Timestamp
	_, err := engine.MVCCGetProto(ctx, reader, keys.RangeLastGCKey(rangeID),
		hlc.ZeroTimestamp, true, nil, &t)
	return t, err
}

// GetGCMetadata reads the latest GC metadata for this range.
func (r *Range) GetGCMetadata() (*proto.GCMetadata, error) {
	key := keys.RangeGCMetadataKey(r.Desc().RaftID)
	gcMeta := &proto.GCMetadata{}
	_, err := engine.MVCCGetProto(r.rm.Engine(), key, proto.ZeroTimestamp, true, nil, gcMeta)
	if err != nil {
		return nil, err
	}
	return gcMeta, nil
}

// GetLastVerificationTimestamp reads the timestamp at which the range's
// data was last verified.
func (r *Range) GetLastVerificationTimestamp() (proto.Timestamp, error) {
	key := keys.RangeLastVerificationTimestampKey(r.Desc().RaftID)
	timestamp := proto.Timestamp{}
	_, err := engine.MVCCGetProto(r.rm.Engine(), key, proto.ZeroTimestamp, true, nil, &timestamp)
	if err != nil {
		return proto.ZeroTimestamp, err
	}
	return timestamp, nil
}

// loadLease loads the leader lease for the specified range from the
// provided reader.
func loadLease(reader engine.Reader, rangeID roachpb.RangeID) (*roachpb.Lease, error) {
	lease := &roachpb.Lease{}
	_, err := engine.MVCCGetProto(context.Background(), reader, keys.RangeLeaderLeaseKey(rangeID),
		hlc.ZeroTimestamp, true, nil, lease)
	if err != nil {
		return nil, err
	}
	return lease, nil
}

// Get looks up an abort cache entry recorded for this transaction ID.
// Returns whether an abort record was found and any error.
func (sc *AbortCache) Get(e engine.Engine, txnID *uuid.UUID, entry *roachpb.AbortCacheEntry) (bool, error) {
	if txnID == nil {
		return false, errEmptyTxnID
	}

	// Pull response from disk and read into reply if available.
	key := keys.AbortCacheKey(sc.rangeID, txnID)
	ok, err := engine.MVCCGetProto(e, key, roachpb.ZeroTimestamp, true /* consistent */, nil /* txn */, entry)
	return ok, err
}

// TestStoreResolveWriteIntent adds a write intent and then verifies
// that a put returns success and aborts the intent's txn in the event the
// pushee has lower priority. Otherwise, verifies that a
// TransactionPushError is returned.
func TestStoreResolveWriteIntent(t *testing.T) {
	defer leaktest.AfterTest(t)
	store, _, stopper := createTestStore(t)
	defer stopper.Stop()

	for i, resolvable := range []bool{true, false} {
		key := proto.Key(fmt.Sprintf("key-%d", i))
		pusher := newTransaction("test", key, 1, proto.SERIALIZABLE, store.ctx.Clock)
		pushee := newTransaction("test", key, 1, proto.SERIALIZABLE, store.ctx.Clock)
		if resolvable {
			pushee.Priority = 1
			pusher.Priority = 2 // Pusher will win.
		} else {
			pushee.Priority = 2
			pusher.Priority = 1 // Pusher will lose.
		}

		// First lay down intent using the pushee's txn.
		pArgs := putArgs(key, []byte("value"), 1, store.StoreID())
		pArgs.Timestamp = store.ctx.Clock.Now()
		pArgs.Txn = pushee
		if err := store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()}); err != nil {
			t.Fatal(err)
		}

		// Now, try a put using the pusher's txn.
		pArgs.Timestamp = store.ctx.Clock.Now()
		pArgs.Txn = pusher
		err := store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()})
		if resolvable {
			if err != nil {
				t.Errorf("expected intent resolved; got unexpected error: %s", err)
			}
			txnKey := keys.TransactionKey(pushee.Key, pushee.ID)
			var txn proto.Transaction
			ok, err := engine.MVCCGetProto(store.Engine(), txnKey, proto.ZeroTimestamp, true, nil, &txn)
			if !ok || err != nil {
				t.Fatalf("not found or err: %s", err)
			}
			if txn.Status != proto.ABORTED {
				t.Errorf("expected pushee to be aborted; got %s", txn.Status)
			}
		} else {
			if rErr, ok := err.(*proto.TransactionPushError); !ok {
				t.Errorf("expected txn push error; got %s", err)
			} else if !bytes.Equal(rErr.PusheeTxn.ID, pushee.ID) {
				t.Errorf("expected txn to match pushee %q; got %s", pushee.ID, rErr)
			}
			// Trying again should fail again.
			if err = store.ExecuteCmd(context.Background(), proto.Call{Args: &pArgs, Reply: pArgs.CreateReply()}); err == nil {
				t.Errorf("expected another error on latent write intent but succeeded")
			}
		}
	}
}

// loadTruncatedState loads the truncated state for the specified range,
// returning a zero value if none has been persisted.
func loadTruncatedState(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (roachpb.RaftTruncatedState, error) {
	var truncState roachpb.RaftTruncatedState
	if _, err := engine.MVCCGetProto(ctx, reader, keys.RaftTruncatedStateKey(rangeID),
		hlc.ZeroTimestamp, true, nil, &truncState); err != nil {
		return roachpb.RaftTruncatedState{}, err
	}
	return truncState, nil
}

// loadHardState loads the raft HardState for the specified range,
// returning an empty HardState if none has been persisted.
func loadHardState(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (raftpb.HardState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(ctx, reader, keys.RaftHardStateKey(rangeID),
		hlc.ZeroTimestamp, true, nil, &hs)
	if !found || err != nil {
		return raftpb.HardState{}, err
	}
	return hs, nil
}

// loadReplicaDestroyedError loads the replica destroyed error for the specified
// range. If there is no error, nil is returned.
func loadReplicaDestroyedError(
	ctx context.Context, reader engine.Reader, rangeID roachpb.RangeID,
) (*roachpb.Error, error) {
	var v roachpb.Error
	found, err := engine.MVCCGetProto(ctx, reader, keys.RangeReplicaDestroyedErrorKey(rangeID),
		hlc.ZeroTimestamp, true /* consistent */, nil, &v)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}
	return &v, nil
}

// GetResponse looks up a response matching the specified cmdID and
// returns true if found. The response is deserialized into the
// supplied reply parameter. If no response is found, returns
// false. If a command is pending already for the cmdID, then this
// method will block until the command is completed or the
// response cache is cleared.
func (rc *ResponseCache) GetResponse(cmdID proto.ClientCmdID, reply proto.Response) (bool, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return false, nil
	}

	// If the response is in the cache or we experienced an error, return.
	rwResp := proto.ReadWriteCmdResponse{}
	key := keys.ResponseCacheKey(rc.raftID, &cmdID)
	ok, err := engine.MVCCGetProto(rc.engine, key, proto.ZeroTimestamp, true, nil, &rwResp)
	if ok && err == nil {
		gogoproto.Merge(reply, rwResp.GetValue().(gogoproto.Message))
	}
	return ok, err
}

// GetResponse looks up a response matching the specified cmdID and
// returns true if found. The response is deserialized into the
// supplied reply parameter. If no response is found, returns
// false. If a command is pending already for the cmdID, then this
// method will block until the command is completed or the
// response cache is cleared.
func (rc *ResponseCache) GetResponse(e engine.Engine, cmdID proto.ClientCmdID, reply proto.Response) (bool, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return false, nil
	}

	// Pull response from the cache and read into reply if available.
	rwResp := proto.ReadWriteCmdResponse{}
	key := keys.ResponseCacheKey(rc.raftID, &cmdID)
	ok, err := engine.MVCCGetProto(e, key, proto.ZeroTimestamp, true, nil, &rwResp)
	if ok && err == nil {
		gogoproto.Merge(reply, rwResp.GetValue().(gogoproto.Message))
	}
	return ok, err
}

// ReadBootstrapInfo implements the gossip.Storage interface. Read
// attempts to read gossip bootstrap info from every known store and
// finds the most recent from all stores to initialize the bootstrap
// info argument. Returns an error on any issues reading data for the
// stores (but excluding the case in which no data has been persisted
// yet).
func (ls *Stores) ReadBootstrapInfo(bi *gossip.BootstrapInfo) error {
	ls.mu.RLock()
	defer ls.mu.RUnlock()
	latestTS := hlc.ZeroTimestamp

	// Find the most recent bootstrap info.
	for _, s := range ls.storeMap {
		var storeBI gossip.BootstrapInfo
		ok, err := engine.MVCCGetProto(context.Background(), s.engine, keys.StoreGossipKey(),
			hlc.ZeroTimestamp, true, nil, &storeBI)
		if err != nil {
			return err
		}
		if ok && latestTS.Less(storeBI.Timestamp) {
			latestTS = storeBI.Timestamp
			*bi = storeBI
		}
	}
	log.Infof("read %d node addresses from persistent storage", len(bi.Addresses))
	return ls.updateBootstrapInfo(bi)
}

// GetResponse looks up a response matching the specified cmdID. If the
// response is found, it is returned along with its associated error.
// If the response is not found, nil is returned for both the response
// and its error. In all cases, the second return value is the error
// returned from the engine when reading the on-disk cache.
func (rc *ResponseCache) GetResponse(e engine.Engine, cmdID proto.ClientCmdID) (proto.ResponseWithError, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return proto.ResponseWithError{}, nil
	}

	// Pull response from the cache and read into reply if available.
	br := &proto.BatchResponse{}
	key := keys.ResponseCacheKey(rc.rangeID, &cmdID)
	ok, err := engine.MVCCGetProto(e, key, proto.ZeroTimestamp, true, nil, br)
	if err != nil {
		return proto.ResponseWithError{}, err
	}
	if ok {
		header := br.Header()
		defer func() { header.Error = nil }()
		return proto.ResponseWithError{Reply: br, Err: header.GoError()}, nil
	}
	return proto.ResponseWithError{}, nil
}

// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist, however, during initialization, which
		// occurs entirely in cockroach, raft has no knowledge of this.
		// By setting this to the initial log index, we avoid a panic in
		// raft caused by this inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}

	return hs, cs, nil
}

// GetResponse looks up a response matching the specified cmdID. If the
// response is found, it is returned along with its associated error.
// If the response is not found, nil is returned for both the response
// and its error. In all cases, the second return value is the error
// returned from the engine when reading the on-disk cache.
func (rc *ResponseCache) GetResponse(e engine.Engine, cmdID proto.ClientCmdID) (proto.ResponseWithError, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return proto.ResponseWithError{}, nil
	}

	// Pull response from the cache and read into reply if available.
	var rwResp proto.ReadWriteCmdResponse
	key := keys.ResponseCacheKey(rc.raftID, &cmdID)
	ok, err := engine.MVCCGetProto(e, key, proto.ZeroTimestamp, true, nil, &rwResp)
	if err != nil {
		return proto.ResponseWithError{}, err
	}
	if ok {
		resp := rwResp.GetValue().(proto.Response)
		header := resp.Header()
		defer func() { header.Error = nil }()
		return proto.ResponseWithError{Reply: resp, Err: header.GoError()}, nil
	}
	return proto.ResponseWithError{}, nil
}

// raftTruncatedState loads the truncated state for the specified range.
// If none has been persisted, defaults are synthesized based on whether
// the range is initialized.
func raftTruncatedState(eng engine.Engine, rangeID roachpb.RangeID, isInitialized bool) (roachpb.RaftTruncatedState, error) {
	ts := roachpb.RaftTruncatedState{}
	ok, err := engine.MVCCGetProto(context.Background(), eng, keys.RaftTruncatedStateKey(rangeID),
		roachpb.ZeroTimestamp, true, nil, &ts)
	if err != nil {
		return ts, err
	}
	if !ok {
		if isInitialized {
			// If we created this range, set the initial log index/term.
			ts.Index = raftInitialLogIndex
			ts.Term = raftInitialLogTerm
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			ts.Index = 0
			ts.Term = 0
		}
	}
	return ts, nil
}

// GetResponse looks up a response matching the specified cmdID and
// returns true if found. The response is deserialized into the
// supplied reply parameter. If no response is found, returns
// false. If a command is pending already for the cmdID, then this
// method will block until the command is completed or the
// response cache is cleared.
func (rc *ResponseCache) GetResponse(cmdID proto.ClientCmdID, reply proto.Response) (bool, error) {
	// Do nothing if command ID is empty.
	if cmdID.IsEmpty() {
		return false, nil
	}

	// If the command is inflight, wait for it to complete.
	rc.Lock()
	for {
		if cond, ok := rc.inflight[makeCmdIDKey(cmdID)]; ok {
			log.Infof("waiting on cmdID: %s", &cmdID)
			cond.Wait()
		} else {
			break
		}
	}
	// Adding inflight here is preemptive; we don't want to hold lock
	// while fetching from the on-disk cache. The vast, vast majority of
	// calls to GetResponse will be cache misses, so this saves us
	// from acquiring the lock twice: once here and once below in the
	// event we experience a cache miss.
	rc.addInflightLocked(cmdID)
	rc.Unlock()

	// If the response is in the cache or we experienced an error, return.
	rwResp := proto.ReadWriteCmdResponse{}
	key := engine.ResponseCacheKey(rc.raftID, &cmdID)
	if ok, err := engine.MVCCGetProto(rc.engine, key, proto.ZeroTimestamp, nil, &rwResp); ok || err != nil {
		rc.Lock() // Take lock after fetching response from cache.
		defer rc.Unlock()
		rc.removeInflightLocked(cmdID)
		if err == nil && rwResp.GetValue() != nil {
			gogoproto.Merge(reply.(gogoproto.Message), rwResp.GetValue().(gogoproto.Message))
		}
		return ok, err
	}
	// There's no command result cached for this ID; but inflight was added above.
	return false, nil
}

// Snapshot implements the raft.Storage interface.
// Snapshot requires that the replica lock is held.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.store.NewSnapshot()
	defer snap.Close()
	var snapData roachpb.RaftSnapshotData

	firstIndex, err := r.FirstIndex()
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndexLocked(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.mu.desc.StartKey),
		r.store.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the sequence cache.
	iter := newReplicaDataIterator(&desc, snap, true /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		key := iter.Key()
		snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{
			Key:       key.Key,
			Value:     iter.Value(),
			Timestamp: key.Timestamp,
		})
	}

	entries, err := r.entries(snap, firstIndex, appliedIndex+1, 0)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	snapData.LogEntries = entries

	data, err := proto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}