// InitialState implements the raft.Storage interface.
func (r *Range) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	found, err := engine.MVCCGetProto(r.rm.Engine(), keys.RaftHardStateKey(r.Desc().RaftID),
		proto.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if r.isInitialized() {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || r.isInitialized() {
		for _, rep := range r.Desc().Replicas {
			cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
		}
	}

	return hs, cs, nil
}
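// Editor's sketch (an assumption, not from the original source): how
// etcd/raft consumes InitialState. RestartNode takes no peer list; it
// recovers the persisted HardState and ConfState by calling
// Storage.InitialState internally. The helper name is hypothetical and the
// snippet assumes etcd's raft package of this era.
func restartFromStorage(id uint64, storage raft.Storage) raft.Node {
	return raft.RestartNode(&raft.Config{
		ID:              id,
		ElectionTick:    10,
		HeartbeatTick:   1,
		Storage:         storage, // InitialState is read from here
		MaxSizePerMsg:   4 * 1024,
		MaxInflightMsgs: 256,
	})
}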
// New initializes a replication.LogReplicator using an already open kv.DB and
// registers a raft service with server. It is the caller's responsibility to
// call Serve.
func New(
	thisReplica uint64, initialReplicas []uint64,
	db kv.DB, prefix []byte,
	clk clock.Clock, tickInterval time.Duration,
	server *grpc.Server, dial func(id uint64) proto.RaftClient,
) replication.LogReplicator {
	confState := raftpb.ConfState{}
	for _, id := range initialReplicas {
		confState.Nodes = append(confState.Nodes, id)
	}
	storage := mkRaftStorage(db, prefix, confState)
	l := &raftLog{
		config: raft.Config{
			ID:              thisReplica,
			ElectionTick:    10,
			HeartbeatTick:   1,
			MaxSizePerMsg:   4 * 1024,
			MaxInflightMsgs: 256,
			Storage:         storage,
		},
		node:         nil,
		clk:          clk,
		tickInterval: tickInterval,
		grpcServer:   server,
		dial:         dial,
	}
	proto.RegisterRaftServer(l.grpcServer, l)
	return l
}
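// Editor's sketch of the tick-driven loop a raftLog like the one above
// would run once Serve is called. This is the standard etcd/raft
// Tick/Ready/Advance pattern; the loop body is an assumption, not the
// project's actual code (assumes the time and raft packages).
func runTicks(n raft.Node, tickInterval time.Duration) {
	ticker := time.NewTicker(tickInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// With ElectionTick=10, a follower calls an election after
			// roughly 10*tickInterval without hearing from a leader.
			n.Tick()
		case rd := <-n.Ready():
			// A real loop persists rd.HardState and rd.Entries and sends
			// rd.Messages before acknowledging with Advance.
			_ = rd
			n.Advance()
		}
	}
}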
// Snapshot implements the raft.Storage interface.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.rm.NewSnapshot()
	defer snap.Close()
	var snapData proto.RaftSnapshotData

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndex(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	var desc proto.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.Desc().StartKey),
		r.rm.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store the RangeDescriptor as metadata; it will be retrieved by ApplySnapshot().
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the response cache.
	for iter := newRangeDataIterator(r.Desc(), snap); iter.Valid(); iter.Next() {
		snapData.KV = append(snapData.KV,
			&proto.RaftSnapshotData_KeyValue{Key: iter.Key(), Value: iter.Value()})
	}

	data, err := gogoproto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(proto.MakeRaftNodeID(rep.NodeID, rep.StoreID)))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
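// Editor's sketch of the inverse step an ApplySnapshot would perform:
// unmarshal the opaque snapshot Data back into a RaftSnapshotData. Field
// names are taken from the function above; the helper itself is
// hypothetical.
func decodeSnapshotData(raftSnap raftpb.Snapshot) (*proto.RaftSnapshotData, error) {
	var snapData proto.RaftSnapshotData
	if err := gogoproto.Unmarshal(raftSnap.Data, &snapData); err != nil {
		return nil, err
	}
	// snapData.RangeDescriptor restores the range bounds; snapData.KV holds
	// every key/value captured above, ready to be written into the engine.
	return &snapData, nil
}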
// InitialState implements the raft.Storage interface.
// InitialState requires that the replica lock be held.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	hs, err := loadHardState(context.Background(), r.store.Engine(), r.RangeID)
	// For uninitialized ranges, membership is unknown at this point.
	if raft.IsEmptyHardState(hs) || err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	var cs raftpb.ConfState
	for _, rep := range r.mu.state.Desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	return hs, cs, nil
}
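// raft.IsEmptyHardState compares against the zero value, so returning an
// empty HardState and ConfState above tells raft that this replica has no
// persisted state yet. A minimal illustration (editor's example; assumes
// fmt and etcd's raft/raftpb packages):
func hardStateIsEmpty() {
	var hs raftpb.HardState
	fmt.Println(raft.IsEmptyHardState(hs)) // true: zero value
	hs.Commit = 10
	fmt.Println(raft.IsEmptyHardState(hs)) // false: state was persisted
}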
// InitialState implements the raft.Storage interface.
func (r *Replica) InitialState() (raftpb.HardState, raftpb.ConfState, error) {
	var hs raftpb.HardState
	desc := r.Desc()
	found, err := engine.MVCCGetProto(r.store.Engine(), keys.RaftHardStateKey(desc.RangeID),
		roachpb.ZeroTimestamp, true, nil, &hs)
	if err != nil {
		return raftpb.HardState{}, raftpb.ConfState{}, err
	}
	initialized := r.isInitialized()
	if !found {
		// We don't have a saved HardState, so set up the defaults.
		if initialized {
			// Set the initial log term.
			hs.Term = raftInitialLogTerm
			hs.Commit = raftInitialLogIndex

			atomic.StoreUint64(&r.lastIndex, raftInitialLogIndex)
		} else {
			// This is a new range we are receiving from another node. Start
			// from zero so we will receive a snapshot.
			atomic.StoreUint64(&r.lastIndex, 0)
		}
	} else if initialized && hs.Commit == 0 {
		// Normally, when the commit index changes, raft gives us a new
		// commit index to persist. During initialization, however, which
		// occurs entirely within cockroach, raft has no knowledge of this.
		// By setting the commit index to the initial log index, we avoid a
		// panic in raft caused by the inconsistency.
		hs.Commit = raftInitialLogIndex
	}

	var cs raftpb.ConfState
	// For uninitialized ranges, membership is unknown at this point.
	if found || initialized {
		for _, rep := range desc.Replicas {
			cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
		}
	}

	return hs, cs, nil
}
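// For context (editor's note; the exact values are an assumption based on
// the CockroachDB source of this era): the defaults referenced above were
// plain constants. Starting the log above zero lets a bootstrapped range
// be told apart from one that must first be caught up via snapshot.
const (
	raftInitialLogIndex = 10 // assumed value
	raftInitialLogTerm  = 5  // assumed value
)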
// Snapshot implements the raft.Storage interface.
// Snapshot requires that the replica lock be held.
func (r *Replica) Snapshot() (raftpb.Snapshot, error) {
	// Copy all the data from a consistent RocksDB snapshot into a RaftSnapshotData.
	snap := r.store.NewSnapshot()
	defer snap.Close()
	var snapData roachpb.RaftSnapshotData

	firstIndex, err := r.FirstIndex()
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, err := r.loadAppliedIndexLocked(snap)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(snap, keys.RangeDescriptorKey(r.mu.desc.StartKey),
		r.store.Clock().Now(), false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, util.Errorf("couldn't find range descriptor")
	}

	// Store the RangeDescriptor as metadata; it will be retrieved by ApplySnapshot().
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the sequence cache.
	iter := newReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	for ; iter.Valid(); iter.Next() {
		key := iter.Key()
		snapData.KV = append(snapData.KV,
			roachpb.RaftSnapshotData_KeyValue{
				Key:       key.Key,
				Value:     iter.Value(),
				Timestamp: key.Timestamp,
			})
	}

	entries, err := r.entries(snap, firstIndex, appliedIndex+1, 0)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	snapData.LogEntries = entries

	data, err := proto.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := r.Term(appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, util.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
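// Editor's sketch of the Storage.Snapshot contract on the caller's side:
// etcd/raft tolerates ErrSnapshotTemporarilyUnavailable, so a Storage may
// decline while a snapshot is still being generated. The helper is
// hypothetical.
func trySnapshot(s raft.Storage) (raftpb.Snapshot, bool, error) {
	snap, err := s.Snapshot()
	if err == raft.ErrSnapshotTemporarilyUnavailable {
		return raftpb.Snapshot{}, false, nil // raft will simply retry later
	}
	if err != nil {
		return raftpb.Snapshot{}, false, err
	}
	return snap, true, nil
}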
func snapshot(
	ctx context.Context,
	snap engine.Reader,
	rangeID roachpb.RangeID,
	eCache *raftEntryCache,
	startKey roachpb.RKey,
) (raftpb.Snapshot, error) {
	start := timeutil.Now()
	var snapData roachpb.RaftSnapshotData

	truncState, err := loadTruncatedState(ctx, snap, rangeID)
	if err != nil {
		return raftpb.Snapshot{}, err
	}
	firstIndex := truncState.Index + 1

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
		hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
	if err != nil {
		return raftpb.Snapshot{}, errors.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return raftpb.Snapshot{}, errors.Errorf("couldn't find range descriptor")
	}

	// Store the RangeDescriptor as metadata; it will be retrieved by ApplySnapshot().
	snapData.RangeDescriptor = desc

	// Iterate over all the data in the range, including local-only data like
	// the sequence cache.
	iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	defer iter.Close()
	var alloc bufalloc.ByteAllocator
	for ; iter.Valid(); iter.Next() {
		var key engine.MVCCKey
		var value []byte
		alloc, key, value = iter.allocIterKeyValue(alloc)
		snapData.KV = append(snapData.KV,
			roachpb.RaftSnapshotData_KeyValue{
				Key:       key.Key,
				Value:     value,
				Timestamp: key.Timestamp,
			})
	}

	endIndex := appliedIndex + 1
	snapData.LogEntries = make([][]byte, 0, endIndex-firstIndex)

	scanFunc := func(kv roachpb.KeyValue) (bool, error) {
		bytes, err := kv.Value.GetBytes()
		if err == nil {
			snapData.LogEntries = append(snapData.LogEntries, bytes)
		}
		return false, err
	}

	if err := iterateEntries(ctx, snap, rangeID, firstIndex, endIndex, scanFunc); err != nil {
		return raftpb.Snapshot{}, err
	}

	data, err := protoutil.Marshal(&snapData)
	if err != nil {
		return raftpb.Snapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
	if err != nil {
		return raftpb.Snapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	log.Infof(ctx, "generated snapshot for range %s at index %d in %s. encoded size=%d, %d KV pairs, %d log entries",
		rangeID, appliedIndex, timeutil.Since(start), len(data), len(snapData.KV), len(snapData.LogEntries))

	return raftpb.Snapshot{
		Data: data,
		Metadata: raftpb.SnapshotMetadata{
			Index:     appliedIndex,
			Term:      term,
			ConfState: cs,
		},
	}, nil
}
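// Minimal illustration (editor's example) of the roachpb.Value round trip
// assumed by scanFunc above: raft log entries are stored as BYTES values,
// and GetBytes fails if a value carries a different tag.
func valueRoundTrip() {
	var v roachpb.Value
	v.SetBytes([]byte("raft entry payload"))
	b, err := v.GetBytes() // b == []byte("raft entry payload"), err == nil
	fmt.Println(string(b), err)
}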
// snapshot creates an OutgoingSnapshot containing a rocksdb snapshot for the given range.
func snapshot(
	ctx context.Context,
	snapType string,
	snap engine.Reader,
	rangeID roachpb.RangeID,
	eCache *raftEntryCache,
	startKey roachpb.RKey,
) (OutgoingSnapshot, error) {
	var desc roachpb.RangeDescriptor
	// We ignore intents on the range descriptor (consistent=false) because we
	// know they cannot be committed yet; operations that modify range
	// descriptors resolve their own intents when they commit.
	ok, err := engine.MVCCGetProto(ctx, snap, keys.RangeDescriptorKey(startKey),
		hlc.MaxTimestamp, false /* !consistent */, nil, &desc)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to get desc: %s", err)
	}
	if !ok {
		return OutgoingSnapshot{}, errors.Errorf("couldn't find range descriptor")
	}

	var snapData roachpb.RaftSnapshotData
	// Store the RangeDescriptor as metadata; it will be retrieved by ApplySnapshot().
	snapData.RangeDescriptor = desc

	// Read the range metadata from the snapshot instead of the members
	// of the Range struct because they might be changed concurrently.
	appliedIndex, _, err := loadAppliedIndex(ctx, snap, rangeID)
	if err != nil {
		return OutgoingSnapshot{}, err
	}

	// Synthesize our raftpb.ConfState from desc.
	var cs raftpb.ConfState
	for _, rep := range desc.Replicas {
		cs.Nodes = append(cs.Nodes, uint64(rep.ReplicaID))
	}

	term, err := term(ctx, snap, rangeID, eCache, appliedIndex)
	if err != nil {
		return OutgoingSnapshot{}, errors.Errorf("failed to fetch term of %d: %s", appliedIndex, err)
	}

	state, err := loadState(ctx, snap, &desc)
	if err != nil {
		return OutgoingSnapshot{}, err
	}

	// Intentionally let this iterator and the snapshot escape so that the
	// streamer can send chunks from it bit by bit.
	iter := NewReplicaDataIterator(&desc, snap, true /* replicatedOnly */)
	snapUUID := uuid.MakeV4()

	log.Infof(ctx, "generated %s snapshot %s at index %d",
		snapType, snapUUID.Short(), appliedIndex)
	return OutgoingSnapshot{
		EngineSnap: snap,
		Iter:       iter,
		State:      state,
		SnapUUID:   snapUUID,
		RaftSnap: raftpb.Snapshot{
			Data: snapUUID.GetBytes(),
			Metadata: raftpb.SnapshotMetadata{
				Index:     appliedIndex,
				Term:      term,
				ConfState: cs,
			},
		},
	}, nil
}
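// Editor's sketch of how a sender might drain an OutgoingSnapshot chunk by
// chunk. The send callback and the cleanup policy are assumptions, not the
// project's actual streamer; iterator methods mirror the loops above.
func streamSnapshot(snap OutgoingSnapshot, send func(key engine.MVCCKey, value []byte) error) error {
	defer snap.EngineSnap.Close() // release the engine snapshot
	defer snap.Iter.Close()
	for ; snap.Iter.Valid(); snap.Iter.Next() {
		if err := send(snap.Iter.Key(), snap.Iter.Value()); err != nil {
			return err
		}
	}
	return nil
}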