Example #1
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	var err error
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation;
			// skip it in advance to avoid potential bugs in the future.
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			shouldstop, err = s.applyConfChange(cc, confState)
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
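The s.w.Trigger calls in these apply loops resolve pending client proposals through a wait registry. Below is a minimal sketch of that pattern, assuming an interface shaped like etcd's pkg/wait; the proposeAndWait helper is hypothetical and only illustrates the proposer side.

type Wait interface {
	// Register returns a channel that will receive the result for id.
	Register(id uint64) <-chan interface{}
	// Trigger delivers x to the channel registered under id.
	Trigger(id uint64, x interface{})
}

// proposeAndWait sketches the proposer-side counterpart of apply: register
// the request ID before proposing, then block until apply() triggers it.
func proposeAndWait(w Wait, id uint64, propose func() error) (interface{}, error) {
	ch := w.Register(id)
	if err := propose(); err != nil {
		return nil, err
	}
	return <-ch, nil
}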
Example #2
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			shouldstop, err := s.applyConfChange(cc, confState)
			s.w.Trigger(cc.ID, err)
			if shouldstop {
				return applied, true
			}
		default:
			log.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.raftIndex, e.Index)
		atomic.StoreUint64(&s.raftTerm, e.Term)
		applied = e.Index
	}
	return applied, false
}
Example #3
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		// set the consistent index of the currently executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation;
			// skip it in advance to avoid potential bugs in the future.
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}

			var raftReq pb.InternalRaftRequest
			if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
				var r pb.Request
				pbutil.MustUnmarshal(&r, e.Data)
				s.w.Trigger(r.ID, s.applyRequest(r))
			} else {
				switch {
				case raftReq.V2 != nil:
					req := raftReq.V2
					s.w.Trigger(req.ID, s.applyRequest(*req))
				default:
					s.w.Trigger(raftReq.ID, s.applyV3Request(&raftReq))
				}
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
Example #4
// applyEntryNormal applies an EntryNormal type raftpb request to the EtcdServer
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	shouldApplyV3 := false
	if e.Index > s.consistIndex.ConsistentIndex() {
		// set the consistent index of the currently executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		shouldApplyV3 = true
	}

	// the raft state machine may generate a noop entry upon leader confirmation;
	// skip it in advance to avoid potential bugs in the future.
	if len(e.Data) == 0 {
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		return
	}

	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		var r pb.Request
		pbutil.MustUnmarshal(&r, e.Data)
		s.w.Trigger(r.ID, s.applyV2Request(&r))
		return
	}
	if raftReq.V2 != nil {
		req := raftReq.V2
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}

	ar := s.applyV3.Apply(&raftReq)
	s.setAppliedIndex(e.Index)
	if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	go func() {
		a := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		r := pb.InternalRaftRequest{Alarm: a}
		s.processInternalRaftRequest(context.TODO(), r)
		s.w.Trigger(id, ar)
	}()
}
Example #5
func readWAL(waldir string, snap walpb.Snapshot) (w *wal.WAL, id, cid types.ID, st raftpb.HardState, ents []raftpb.Entry) {
	var (
		err       error
		wmetadata []byte
	)

	repaired := false
	for {
		if w, err = wal.Open(waldir, snap); err != nil {
			plog.Fatalf("open wal error: %v", err)
		}
		if wmetadata, st, ents, err = w.ReadAll(); err != nil {
			w.Close()
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				plog.Fatalf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(waldir) {
				plog.Fatalf("WAL error (%v) cannot be repaired", err)
			} else {
				plog.Infof("repaired WAL error (%v)", err)
				repaired = true
			}
			continue
		}
		break
	}
	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
Example #6
func parseWALMetadata(b []byte) (id, cid types.ID) {
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, b)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
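Nearly every example here funnels through the same two pbutil helpers. Below is a minimal sketch of their contract, assuming the Unmarshaler interface from etcd's pkg/pbutil; the real package logs through its own logger before panicking and also wraps the marshal side.

package pbutil

import "fmt"

type Unmarshaler interface {
	Unmarshal(data []byte) error
}

// MustUnmarshal panics when decoding fails; callers reserve it for data
// that was written by the same code base and must be well-formed.
func MustUnmarshal(um Unmarshaler, data []byte) {
	if err := um.Unmarshal(data); err != nil {
		panic(fmt.Sprintf("unmarshal should never fail (%v)", err))
	}
}

// MaybeUnmarshal reports success instead of panicking, which is what the
// "backward compatible" branches in the examples rely on.
func MaybeUnmarshal(um Unmarshaler, data []byte) bool {
	return um.Unmarshal(data) == nil
}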
Example #7
// getIDs returns an ordered set of IDs included in the given snapshot and
// the entries. The given snapshot/entries can contain two kinds of
// ID-related entry:
// - ConfChangeAddNode, in which case the contained ID will be added into the set.
// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
	ids := make(map[uint64]bool)
	if snap != nil {
		for _, id := range snap.Metadata.ConfState.Nodes {
			ids[id] = true
		}
	}
	for _, e := range ents {
		if e.Type != raftpb.EntryConfChange {
			continue
		}
		var cc raftpb.ConfChange
		pbutil.MustUnmarshal(&cc, e.Data)
		switch cc.Type {
		case raftpb.ConfChangeAddNode:
			ids[cc.NodeID] = true
		case raftpb.ConfChangeRemoveNode:
			delete(ids, cc.NodeID)
		case raftpb.ConfChangeUpdateNode:
			// do nothing
		default:
			plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
		}
	}
	sids := make(types.Uint64Slice, 0)
	for id := range ids {
		sids = append(sids, id)
	}
	sort.Sort(sids)
	return []uint64(sids)
}
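A hypothetical call to getIDs, for illustration: a snapshot that contains members 1 and 2, followed by a conf-change entry removing member 2, yields the sorted set [1]. The entry data is built with pbutil.MustMarshal, the panicking counterpart of MustUnmarshal.

// exampleGetIDs is an illustrative, hypothetical caller of getIDs.
func exampleGetIDs() []uint64 {
	snap := &raftpb.Snapshot{}
	snap.Metadata.ConfState.Nodes = []uint64{1, 2}

	cc := raftpb.ConfChange{Type: raftpb.ConfChangeRemoveNode, NodeID: 2}
	ents := []raftpb.Entry{{
		Type: raftpb.EntryConfChange,
		Data: pbutil.MustMarshal(&cc),
	}}
	return getIDs(snap, ents) // -> [1]
}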
Example #8
func rebuildStoreV2() store.Store {
	waldir := migrateWALdir
	if len(waldir) == 0 {
		waldir = path.Join(migrateDatadir, "member", "wal")
	}
	snapdir := path.Join(migrateDatadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		ExitWithError(ExitError, err)
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		ExitWithError(ExitError, err)
	}
	defer w.Close()

	_, _, ents, err := w.ReadAll()
	if err != nil {
		ExitWithError(ExitError, err)
	}

	st := store.New()
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			ExitWithError(ExitError, err)
		}
	}

	applier := etcdserver.NewApplierV2(st, nil)
	for _, ent := range ents {
		if ent.Type != raftpb.EntryNormal {
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
	}

	return st
}
Example #9
// handleBackup handles a request to perform a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "member", "snap")
	destSnap := path.Join(c.String("backup-dir"), "member", "snap")
	srcWAL := path.Join(c.String("data-dir"), "member", "wal")
	destWAL := path.Join(c.String("backup-dir"), "member", "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		newss := snap.New(destSnap)
		if err = newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenForRead(srcWAL, walsnap)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	switch err {
	case nil:
	case wal.ErrSnapshotNotFound:
		fmt.Printf("Failed to find the match snapshot record %+v in wal %v.", walsnap, srcWAL)
		fmt.Printf("etcdctl will add it back. Start auto fixing...")
	default:
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	idgen := idutil.NewGenerator(0, time.Now())
	metadata.NodeID = idgen.Next()
	metadata.ClusterID = idgen.Next()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
	if err := neww.SaveSnapshot(walsnap); err != nil {
		log.Fatal(err)
	}
}
Example #10
func (s *EtcdServer) apply(es []raftpb.Entry, nodes []uint64) uint64 {
	var applied uint64
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			s.w.Trigger(cc.ID, s.applyConfChange(cc, nodes))
		default:
			panic("unexpected entry type")
		}
		atomic.StoreUint64(&s.raftIndex, e.Index)
		atomic.StoreUint64(&s.raftTerm, e.Term)
		applied = e.Index
	}
	return applied
}
Example #11
func retrieveMessage(e pb.Entry) serverpb.Request {
	var request serverpb.Request
	var raftReq serverpb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		pbutil.MustUnmarshal(&request, e.Data)
	} else {
		switch {
		case raftReq.V2 != nil:
			request = *raftReq.V2
		}
	}
	return request
}
Example #12
func readWAL(waldir string, index uint64) (w *wal.WAL, id, cid types.ID, st raftpb.HardState, ents []raftpb.Entry) {
	var err error
	if w, err = wal.Open(waldir, index); err != nil {
		log.Fatalf("etcdserver: open wal error: %v", err)
	}
	var wmetadata []byte
	if wmetadata, st, ents, err = w.ReadAll(); err != nil {
		log.Fatalf("etcdserver: read wal error: %v", err)
	}
	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
Example #13
// handleBackup handles a request to perform a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "snap")
	destSnap := path.Join(c.String("backup-dir"), "snap")
	srcWAL := path.Join(c.String("data-dir"), "wal")
	destWAL := path.Join(c.String("backup-dir"), "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var index uint64
	if snapshot != nil {
		index = snapshot.Metadata.Index
		newss := snap.New(destSnap)
		if err := newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenNotInUse(srcWAL, index)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	rand.Seed(time.Now().UnixNano())
	metadata.NodeID = etcdserver.GenID()
	metadata.ClusterID = etcdserver.GenID()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
}
Example #14
// handleBackup handles a request to perform a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "snap")
	destSnap := path.Join(c.String("backup-dir"), "snap")
	srcWAL := path.Join(c.String("data-dir"), "wal")
	destWAL := path.Join(c.String("backup-dir"), "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		newss := snap.New(destSnap)
		if err := newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenNotInUse(srcWAL, walsnap)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	idgen := idutil.NewGenerator(0, time.Now())
	metadata.NodeID = idgen.Next()
	metadata.ClusterID = idgen.Next()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
}
Example #15
func restartNode(cfg *ServerConfig, index uint64, snapshot *raftpb.Snapshot) (id uint64, n raft.Node, w *wal.WAL) {
	var err error
	// restart a node from the previous WAL
	if w, err = wal.OpenAtIndex(cfg.WALDir(), index); err != nil {
		log.Fatal(err)
	}
	wmetadata, st, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}

	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = metadata.NodeID
	cfg.Cluster.SetID(metadata.ClusterID)
	log.Printf("etcdserver: restart member %x in cluster %x at commit index %d", id, cfg.Cluster.ID(), st.Commit)
	n = raft.RestartNode(id, 10, 1, snapshot, st, ents)
	return
}
Example #16
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			s.applyEntryNormal(&e)
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
Example #17
// ReadAll reads out records of the current WAL.
// If opened in write mode, it must read out all records until EOF;
// otherwise an error will be returned.
// If opened in read mode, it will try to read all records if possible.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// all the records along with ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the matching check.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	w.mu.Lock()
	defer w.mu.Unlock()

	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// the current crc of the decoder must match the crc of the record;
			// there is no need to match a zero crc, since the decoder is new in that case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}

	switch w.f {
	case nil:
		// We do not have to read out all entries in read mode.
		// The last record may be a partially written one, so
		// ErrUnexpectedEOF might be returned.
		if err != io.EOF && err != io.ErrUnexpectedEOF {
			state.Reset()
			return nil, state, nil, err
		}
	default:
		// We must read all of the entries if WAL is opened in write mode.
		if err != io.EOF {
			state.Reset()
			return nil, state, nil, err
		}
	}

	err = nil
	if !match {
		err = ErrSnapshotNotFound
	}

	// close decoder, disable reading
	w.decoder.close()
	w.start = walpb.Snapshot{}

	w.metadata = metadata

	if w.f != nil {
		// create encoder (chain crc with the decoder), enable appending
		w.encoder = newEncoder(w.f, w.decoder.lastCRC())
		w.decoder = nil
		lastIndexSaved.Set(float64(w.enti))
	}

	return metadata, state, ents, err
}
Example #18
// applyEntryNormal applies an EntryNormal type raftpb request to the EtcdServer
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	shouldApplyV3 := false
	if e.Index > s.consistIndex.ConsistentIndex() {
		// set the consistent index of the currently executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		shouldApplyV3 = true
	}
	defer s.setAppliedIndex(e.Index)

	// the raft state machine may generate a noop entry upon leader confirmation;
	// skip it in advance to avoid potential bugs in the future.
	if len(e.Data) == 0 {
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		// promote the lessor when the local member is the leader and has
		// finished applying all entries from the last term.
		if s.isLeader() {
			s.lessor.Promote(s.Cfg.electionTimeout())
		}
		return
	}

	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		var r pb.Request
		pbutil.MustUnmarshal(&r, e.Data)
		s.w.Trigger(r.ID, s.applyV2Request(&r))
		return
	}
	if raftReq.V2 != nil {
		req := raftReq.V2
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}

	var ar *applyResult
	needResult := s.w.IsRegistered(id)
	if needResult || !noSideEffect(&raftReq) {
		if !needResult && raftReq.Txn != nil {
			removeNeedlessRangeReqs(raftReq.Txn)
		}
		ar = s.applyV3.Apply(&raftReq)
	}

	if ar == nil {
		return
	}

	if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	s.goAttach(func() {
		a := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		r := pb.InternalRaftRequest{Alarm: a}
		s.processInternalRaftRequest(context.TODO(), r)
		s.w.Trigger(id, ar)
	})
}
Example #19
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation;
			// skip it in advance to avoid potential bugs in the future.
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}

			var raftReq pb.InternalRaftRequest
			if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
				var r pb.Request
				pbutil.MustUnmarshal(&r, e.Data)
				s.w.Trigger(r.ID, s.applyRequest(r))
			} else if raftReq.V2 != nil {
				req := raftReq.V2
				s.w.Trigger(req.ID, s.applyRequest(*req))
			} else {
				// do not re-apply applied entries.
				if e.Index <= s.consistIndex.ConsistentIndex() {
					break
				}
				// set the consistent index of the currently executing entry
				s.consistIndex.setConsistentIndex(e.Index)
				ar := s.applyV3Request(&raftReq)
				if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
					s.w.Trigger(raftReq.ID, ar)
					break
				}
				plog.Errorf("applying raft message exceeded backend quota")
				go func() {
					a := &pb.AlarmRequest{
						MemberID: uint64(s.ID()),
						Action:   pb.AlarmRequest_ACTIVATE,
						Alarm:    pb.AlarmType_NOSPACE,
					}
					r := pb.InternalRaftRequest{Alarm: a}
					s.processInternalRaftRequest(context.TODO(), r)
					s.w.Trigger(raftReq.ID, ar)
				}()
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
Example #20
func mustUnmarshalState(d []byte) raftpb.HardState {
	var s raftpb.HardState
	pbutil.MustUnmarshal(&s, d)
	return s
}
Example #21
func mustUnmarshalEntry(d []byte) raftpb.Entry {
	var e raftpb.Entry
	pbutil.MustUnmarshal(&e, d)
	return e
}
Example #22
// ReadAll reads out all records of the current WAL.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the matching check.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// the current crc of the decoder must match the crc of the record;
			// there is no need to match a zero crc, since the decoder is new in that case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}
	if err != io.EOF {
		state.Reset()
		return nil, state, nil, err
	}
	if !match {
		state.Reset()
		return nil, state, nil, ErrSnapshotNotFound
	}

	// close decoder, disable reading
	w.decoder.close()
	w.start = walpb.Snapshot{}

	w.metadata = metadata
	// create encoder (chain crc with the decoder), enable appending
	w.encoder = newEncoder(w.f, w.decoder.lastCRC())
	w.decoder = nil
	return metadata, state, ents, nil
}
Example #23
func rebuild(datadir string) ([]byte, *raftpb.HardState, store.Store, error) {
	waldir := path.Join(datadir, "member", "wal")
	snapdir := path.Join(datadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		return nil, nil, nil, err
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		return nil, nil, nil, err
	}
	defer w.Close()

	meta, hardstate, ents, err := w.ReadAll()
	if err != nil {
		return nil, nil, nil, err
	}

	st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			return nil, nil, nil, err
		}
	}

	cluster := membership.NewCluster("")
	cluster.SetStore(st)
	cluster.Recover(func(*semver.Version) {})

	applier := etcdserver.NewApplierV2(st, cluster)
	for _, ent := range ents {
		if ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, ent.Data)
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				m := new(membership.Member)
				if err := json.Unmarshal(cc.Context, m); err != nil {
					return nil, nil, nil, err
				}
				cluster.AddMember(m)
			case raftpb.ConfChangeRemoveNode:
				id := types.ID(cc.NodeID)
				cluster.RemoveMember(id)
			case raftpb.ConfChangeUpdateNode:
				m := new(membership.Member)
				if err := json.Unmarshal(cc.Context, m); err != nil {
					return nil, nil, nil, err
				}
				cluster.UpdateRaftAttributes(m.ID, m.RaftAttributes)
			}
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
	}

	return meta, &hardstate, st, nil
}
Example #24
func rebuildStoreV2() (store.Store, uint64) {
	var index uint64
	cl := membership.NewCluster("")

	waldir := migrateWALdir
	if len(waldir) == 0 {
		waldir = path.Join(migrateDatadir, "member", "wal")
	}
	snapdir := path.Join(migrateDatadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		ExitWithError(ExitError, err)
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		index = snapshot.Metadata.Index
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		ExitWithError(ExitError, err)
	}
	defer w.Close()

	_, _, ents, err := w.ReadAll()
	if err != nil {
		ExitWithError(ExitError, err)
	}

	st := store.New()
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			ExitWithError(ExitError, err)
		}
	}

	cl.SetStore(st)
	cl.Recover(api.UpdateCapability)

	applier := etcdserver.NewApplierV2(st, cl)
	for _, ent := range ents {
		if ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, ent.Data)
			applyConf(cc, cl)
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
		if ent.Index > index {
			index = ent.Index
		}
	}

	return st, index
}
Example #25
// ReadAll reads out records of the current WAL.
// If opened in write mode, it must read out all records until EOF;
// otherwise an error will be returned.
// If opened in read mode, it will try to read all records if possible.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// all the records along with ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the matching check.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	w.mu.Lock()
	defer w.mu.Unlock()

	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !bytes.Equal(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// the current crc of the decoder must match the crc of the record;
			// there is no need to match a zero crc, since the decoder is new in that case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}

	switch w.tail() {
	case nil:
		// We do not have to read out all entries in read mode.
		// The last record may be a partially written one, so
		// ErrUnexpectedEOF might be returned.
		if err != io.EOF && err != io.ErrUnexpectedEOF {
			state.Reset()
			return nil, state, nil, err
		}
	default:
		// We must read all of the entries if WAL is opened in write mode.
		if err != io.EOF {
			state.Reset()
			return nil, state, nil, err
		}
		// decodeRecord() will return io.EOF if it detects a zero record,
		// but this zero record may be followed by non-zero records from
		// a torn write. Overwriting some of these non-zero records, but
		// not all, will cause CRC errors on WAL open. Since the records
		// were never fully synced to disk in the first place, it's safe
		// to zero them out to avoid any CRC errors from new writes.
		if _, err = w.tail().Seek(w.decoder.lastOffset(), os.SEEK_SET); err != nil {
			return nil, state, nil, err
		}
		if err = fileutil.ZeroToEnd(w.tail().File); err != nil {
			return nil, state, nil, err
		}
	}

	err = nil
	if !match {
		err = ErrSnapshotNotFound
	}

	// close decoder, disable reading
	if w.readClose != nil {
		w.readClose()
		w.readClose = nil
	}
	w.start = walpb.Snapshot{}

	w.metadata = metadata

	if w.tail() != nil {
		// create encoder (chain crc with the decoder), enable appending
		w.encoder = newEncoder(w.tail(), w.decoder.lastCRC())
	}
	w.decoder = nil

	return metadata, state, ents, err
}
Example #26
func (dec *msgAppV2Decoder) decode() (raftpb.Message, error) {
	var (
		m   raftpb.Message
		typ uint8
	)
	if _, err := io.ReadFull(dec.r, dec.uint8buf); err != nil {
		return m, err
	}
	typ = uint8(dec.uint8buf[0])
	switch typ {
	case msgTypeLinkHeartbeat:
		return linkHeartbeatMessage, nil
	case msgTypeAppEntries:
		m = raftpb.Message{
			Type:    raftpb.MsgApp,
			From:    uint64(dec.remote),
			To:      uint64(dec.local),
			Term:    dec.term,
			LogTerm: dec.term,
			Index:   dec.index,
		}

		// decode entries
		if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
			return m, err
		}
		l := binary.BigEndian.Uint64(dec.uint64buf)
		m.Entries = make([]raftpb.Entry, int(l))
		for i := 0; i < int(l); i++ {
			if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
				return m, err
			}
			size := binary.BigEndian.Uint64(dec.uint64buf)
			var buf []byte
			if size < msgAppV2BufSize {
				buf = dec.buf[:size]
				if _, err := io.ReadFull(dec.r, buf); err != nil {
					return m, err
				}
			} else {
				buf = make([]byte, int(size))
				if _, err := io.ReadFull(dec.r, buf); err != nil {
					return m, err
				}
			}
			dec.index++
			// 1 alloc
			pbutil.MustUnmarshal(&m.Entries[i], buf)
		}
		// decode commit index
		if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
			return m, err
		}
		m.Commit = binary.BigEndian.Uint64(dec.uint64buf)
	case msgTypeApp:
		var size uint64
		if err := binary.Read(dec.r, binary.BigEndian, &size); err != nil {
			return m, err
		}
		buf := make([]byte, int(size))
		if _, err := io.ReadFull(dec.r, buf); err != nil {
			return m, err
		}
		pbutil.MustUnmarshal(&m, buf)

		dec.term = m.Term
		dec.index = m.Index
		if l := len(m.Entries); l > 0 {
			dec.index = m.Entries[l-1].Index
		}
	default:
		return m, fmt.Errorf("failed to parse type %d in msgappv2 stream", typ)
	}
	return m, nil
}
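For reference, the msgTypeApp frame parsed above is: one type byte, a big-endian uint64 length, then the marshaled raftpb.Message. Below is a hedged sketch of the matching encode path; the real rafthttp encoder also implements the linkHeartbeat and appEntries fast paths seen in the decoder.

// encodeMsgApp is an illustrative writer for the msgTypeApp frame only.
func encodeMsgApp(w io.Writer, m raftpb.Message) error {
	// type byte
	if _, err := w.Write([]byte{msgTypeApp}); err != nil {
		return err
	}
	// big-endian length prefix, then the marshaled message body
	buf := pbutil.MustMarshal(&m)
	if err := binary.Write(w, binary.BigEndian, uint64(len(buf))); err != nil {
		return err
	}
	_, err := w.Write(buf)
	return err
}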