// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}

	return hs, cs, nil
}
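// For reference: the InitialState and Snapshot methods in these snippets
// implement etcd's raft.Storage interface. A sketch of that interface as it
// existed around the time of these snippets is shown below; the exact import
// path and parameter names may differ between releases, so treat this as an
// assumption rather than the vendored definition.
//
//	type Storage interface {
//		InitialState() (pb.HardState, pb.ConfState, error)
//		Entries(lo, hi, maxSize uint64) ([]pb.Entry, error)
//		Term(i uint64) (uint64, error)
//		LastIndex() (uint64, error)
//		FirstIndex() (uint64, error)
//		Snapshot() (pb.Snapshot, error)
//	}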
// New initializes a replication.LogReplicator using an already open kv.DB and
// registers a raft service with server. It is the caller's responsibility to
// call Serve.
func New(
	thisReplica uint64, initialReplicas []uint64, db kv.DB, prefix []byte,
	clk clock.Clock, tickInterval time.Duration,
	server *grpc.Server, dial func(id uint64) proto.RaftClient,
) replication.LogReplicator {
	confState := raftpb.ConfState{}
	for _, id := range initialReplicas {
		confState.Nodes = append(confState.Nodes, id)
	}
	storage := mkRaftStorage(db, prefix, confState)
	l := &raftLog{
		config: raft.Config{
			ID:              thisReplica,
			ElectionTick:    10,
			HeartbeatTick:   1,
			MaxSizePerMsg:   4 * 1024,
			MaxInflightMsgs: 256,
			Storage:         storage,
		},
		node:         nil,
		clk:          clk,
		tickInterval: tickInterval,
		grpcServer:   server,
		dial:         dial,
	}
	proto.RegisterRaftServer(l.grpcServer, l)
	return l
}
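// Hypothetical usage sketch (not part of the snippet above): New leaves
// l.node nil, so the raft.Config it builds is presumably handed to
// raft.StartNode elsewhere, e.g. in a Start/Serve method, with the initial
// replicas as peers. The names below are illustrative assumptions.
//
//	peers := make([]raft.Peer, 0, len(initialReplicas))
//	for _, id := range initialReplicas {
//		peers = append(peers, raft.Peer{ID: id})
//	}
//	l.node = raft.StartNode(&l.config, peers)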
// Snapshot implements the raft.Storage interface.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.rm.NewSnapshot()
	defer snap.Close()
	var snapData proto.RaftSnapshotData

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndex(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	var desc proto.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.Desc().StartKey),
		r.rm.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the response cache.
	for iter := newRangeDataIterator(r.Desc(), snap); iter.Valid(); iter.Next() {
		snapData.KV = append(snapData.KV,
			&proto.RaftSnapshotData_KeyValue{Key: iter.Key(), Value: iter.Value()})
	}

	data, err := gogoproto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
// InitialState implements the raft.Storage interface.
// InitialState requires that the replica lock be held.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	hs, err := loadHardState(context.Background(), r.store.Engine(), r.RangeID)
	// For uninitialized ranges, membership is unknown at this point.
	if raft.IsEmptyHardState(hs) || err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	var cs raftpb.ConfState
	for _, rep := range r.mu.state.Desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	return hs, cs, nil
}
// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist; however, during initialization, which
		// occurs entirely in cockroach, raft has no knowledge of this.
		// By setting this to the initial log index, we avoid a panic in
		// raft caused by this inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}

	return hs, cs, nil
}
// Snapshot implements the raft.Storage interface.
// Snapshot requires that the replica lock is held.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.store.NewSnapshot()
	defer snap.Close()
	var snapData roachpb.RaftSnapshotData

	firstIndex, err := r.FirstIndex()
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndexLocked(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.mu.desc.StartKey),
		r.store.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the sequence cache.
	iter := newReplicaDataIterator(&desc, snap, true /* !replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		key := iter.Key()
		snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{
			Key:       key.Key,
			Value:     iter.Value(),
			Timestamp: key.Timestamp,
		})
	}

	entries, err := r.entries(snap, firstIndex, appliedIndex+1, 0)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	snapData.LogEntries = entries

	data, err := proto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
// snapshot creates a raftpb.Snapshot for the given range, reading the range
// descriptor, all range data, and the raft log entries from the provided
// engine snapshot.
func snapshot(
	ctx context.Context,
	snap engine.Reader,
	rangeID roachpb.RangeID,
	eCache *raftEntryCache,
	startKey roachpb.RKey,
) (raftpb.Snapshot, error) {
	start := timeutil.Now()
	var snapData roachpb.RaftSnapshotData

	truncState, err := loadTruncatedState(ctx, snap, rangeID)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	firstIndex := truncState.Index + 1

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
		hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, errors.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, errors.Errorf("couldn't find range descriptor")
	}

	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the sequence cache.
	iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	var alloc bufalloc.ByteAllocator
	for ; iter.Valid(); iter.Next() {
		var key engine.MVCCKey
		var value []byte
		alloc, key, value = iter.allocIterKeyValue(alloc)
		snapData.KV = append(snapData.KV, roachpb.RaftSnapshotData_KeyValue{
			Key:       key.Key,
			Value:     value,
			Timestamp: key.Timestamp,
		})
	}

	endIndex := appliedIndex + 1
	snapData.LogEntries = make([][]byte, 0, endIndex-firstIndex)

	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		bytes, err := kv.Value.GetBytes()
		if err == nil {
			snapData.LogEntries = append(snapData.LogEntries, bytes)
		}
		return false, err
	}

	if err := iterateEntries(ctx, snap, rangeID, firstIndex, endIndex, scanFunc); err != nil {
		return raftpb.Snapshot{}, err
	}

	data, err := protoutil.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	log.Infof(ctx, "generated snapshot for range %s at index %d in %s. encoded size=%d, %d KV pairs, %d log entries",
		rangeID, appliedIndex, timeutil.Since(start), len(data), len(snapData.KV), len(snapData.LogEntries))

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
// snapshot creates an OutgoingSnapshot containing a rocksdb snapshot for the given range.
func snapshot(
	ctx context.Context,
	snapType string,
	snap engine.Reader,
	rangeID roachpb.RangeID,
	eCache *raftEntryCache,
	startKey roachpb.RKey,
) (OutgoingSnapshot, error) {
	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
		hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return OutgoingSnapshot{}, errors.Errorf("couldn't find range descriptor")
	}

	var snapData roachpb.RaftSnapshotData
	// Store RangeDescriptor as metadata, it will be retrieved by ApplySnapshot()
	snapData.RangeDescriptor = desc

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
	if err != nil {
		return OutgoingSnapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	state, err := loadState(ctx, snap, &desc)
	if err != nil {
		return OutgoingSnapshot{}, err
	}

	// Intentionally let this iterator and the snapshot escape so that the
	// streamer can send chunks from it bit by bit.
	iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	snapUUID := uuid.MakeV4()

	log.Infof(ctx, "generated %s snapshot %s at index %d", snapType, snapUUID.Short(), appliedIndex)
	return OutgoingSnapshot{
		EngineSnap: snap,
		Iter:       iter,
		State:      state,
		SnapUUID:   snapUUID,
		RaftSnap: raftpb.Snapshot{
			Data: snapUUID.GetBytes(),
			Metadata: raftpb.SnapshotMetadata{
				Index:     appliedIndex,
				Term:      term,
				ConfState: cs,
			},
		},
	}, nil
}