// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	var err error
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation.
			// skip it in advance to avoid any potential bugs in the future
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			shouldstop, err = s.applyConfChange(cc, confState)
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
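// The select with an empty default above is Go's non-blocking send idiom:
// if nothing is ready to receive on forceVersionC, the signal is dropped
// rather than stalling the apply loop. A minimal standalone sketch of the
// same idiom (names here are illustrative, not etcd's):

package main

import "fmt"

// notify performs a non-blocking send: it signals ch when a receiver is
// ready (or buffer space is free) and silently drops the signal otherwise.
func notify(ch chan struct{}) bool {
	select {
	case ch <- struct{}{}:
		return true
	default:
		return false
	}
}

func main() {
	ch := make(chan struct{}, 1)
	fmt.Println(notify(ch)) // true: the buffer has room
	fmt.Println(notify(ch)) // false: the buffer is full, signal dropped
}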
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			shouldstop, err := s.applyConfChange(cc, confState)
			s.w.Trigger(cc.ID, err)
			if shouldstop {
				return applied, true
			}
		default:
			log.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.raftIndex, e.Index)
		atomic.StoreUint64(&s.raftTerm, e.Term)
		applied = e.Index
	}
	return applied, false
}
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		// set the consistent index of current executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation.
			// skip it in advance to avoid any potential bugs in the future
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}
			var raftReq pb.InternalRaftRequest
			if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
				var r pb.Request
				pbutil.MustUnmarshal(&r, e.Data)
				s.w.Trigger(r.ID, s.applyRequest(r))
			} else {
				switch {
				case raftReq.V2 != nil:
					req := raftReq.V2
					s.w.Trigger(req.ID, s.applyRequest(*req))
				default:
					s.w.Trigger(raftReq.ID, s.applyV3Request(&raftReq))
				}
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
// applyEntryNormal applies an EntryNormal type raftpb request to the EtcdServer
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	shouldApplyV3 := false
	if e.Index > s.consistIndex.ConsistentIndex() {
		// set the consistent index of current executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		shouldApplyV3 = true
	}

	// the raft state machine may generate a noop entry upon leader confirmation.
	// skip it in advance to avoid any potential bugs in the future
	if len(e.Data) == 0 {
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		return
	}

	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		var r pb.Request
		pbutil.MustUnmarshal(&r, e.Data)
		s.w.Trigger(r.ID, s.applyV2Request(&r))
		return
	}
	if raftReq.V2 != nil {
		req := raftReq.V2
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}
	ar := s.applyV3.Apply(&raftReq)
	s.setAppliedIndex(e.Index)
	if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	go func() {
		a := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		r := pb.InternalRaftRequest{Alarm: a}
		s.processInternalRaftRequest(context.TODO(), r)
		s.w.Trigger(id, ar)
	}()
}
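// The shouldApplyV3 guard above makes the v3 apply path idempotent: an entry
// is applied only when its index is above the highest index already recorded.
// A standalone sketch of that guard; the Applier type and field names are
// illustrative, not etcd's:

package main

import "fmt"

// Applier skips any entry at or below the highest index it has already
// applied, so replaying a log after restart cannot apply an entry twice.
type Applier struct {
	consistentIndex uint64 // highest entry index applied so far
}

func (a *Applier) Apply(index uint64, data string) {
	if index <= a.consistentIndex {
		return // already applied; skip on replay
	}
	a.consistentIndex = index
	fmt.Printf("applied %d: %s\n", index, data)
}

func main() {
	a := &Applier{}
	a.Apply(1, "put k1")
	a.Apply(2, "put k2")
	a.Apply(2, "put k2") // duplicate delivery: silently skipped
}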
func readWAL(waldir string, snap walpb.Snapshot) (w *wal.WAL, id, cid types.ID, st raftpb.HardState, ents []raftpb.Entry) {
	var (
		err       error
		wmetadata []byte
	)

	repaired := false
	for {
		if w, err = wal.Open(waldir, snap); err != nil {
			plog.Fatalf("open wal error: %v", err)
		}
		if wmetadata, st, ents, err = w.ReadAll(); err != nil {
			w.Close()
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				plog.Fatalf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(waldir) {
				plog.Fatalf("WAL error (%v) cannot be repaired", err)
			} else {
				plog.Infof("repaired WAL error (%v)", err)
				repaired = true
			}
			continue
		}
		break
	}

	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
func parseWALMetadata(b []byte) (id, cid types.ID) {
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, b)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
// getIDs returns an ordered set of IDs included in the given snapshot and
// the entries. The given snapshot/entries can contain two kinds of
// ID-related entry:
// - ConfChangeAddNode, in which case the contained ID will be added into the set.
// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
	ids := make(map[uint64]bool)
	if snap != nil {
		for _, id := range snap.Metadata.ConfState.Nodes {
			ids[id] = true
		}
	}
	for _, e := range ents {
		if e.Type != raftpb.EntryConfChange {
			continue
		}
		var cc raftpb.ConfChange
		pbutil.MustUnmarshal(&cc, e.Data)
		switch cc.Type {
		case raftpb.ConfChangeAddNode:
			ids[cc.NodeID] = true
		case raftpb.ConfChangeRemoveNode:
			delete(ids, cc.NodeID)
		case raftpb.ConfChangeUpdateNode:
			// do nothing
		default:
			plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
		}
	}
	sids := make(types.Uint64Slice, 0)
	for id := range ids {
		sids = append(sids, id)
	}
	sort.Sort(sids)
	return []uint64(sids)
}
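// getIDs builds a map as a set, then sorts the keys, a common Go idiom for an
// ordered set. A self-contained sketch of the same add/remove/sort logic over
// plain uint64s (types and names are illustrative, not etcd's):

package main

import (
	"fmt"
	"sort"
)

// op mimics a conf change: add a node ID to the set or remove it again.
type op struct {
	add bool
	id  uint64
}

// orderedIDs folds ops into a set seeded from a snapshot's IDs, then
// returns the surviving members in ascending order.
func orderedIDs(seed []uint64, ops []op) []uint64 {
	ids := make(map[uint64]bool)
	for _, id := range seed {
		ids[id] = true
	}
	for _, o := range ops {
		if o.add {
			ids[o.id] = true
		} else {
			delete(ids, o.id)
		}
	}
	out := make([]uint64, 0, len(ids))
	for id := range ids {
		out = append(out, id)
	}
	sort.Slice(out, func(i, j int) bool { return out[i] < out[j] })
	return out
}

func main() {
	// start with {1, 2}, add 3, remove 1 -> [2 3]
	fmt.Println(orderedIDs([]uint64{1, 2}, []op{{true, 3}, {false, 1}}))
}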
func rebuildStoreV2() store.Store {
	waldir := migrateWALdir
	if len(waldir) == 0 {
		waldir = path.Join(migrateDatadir, "member", "wal")
	}
	snapdir := path.Join(migrateDatadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		ExitWithError(ExitError, err)
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		ExitWithError(ExitError, err)
	}
	defer w.Close()

	_, _, ents, err := w.ReadAll()
	if err != nil {
		ExitWithError(ExitError, err)
	}

	st := store.New()
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			ExitWithError(ExitError, err)
		}
	}

	applier := etcdserver.NewApplierV2(st, nil)
	for _, ent := range ents {
		if ent.Type != raftpb.EntryNormal {
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
	}

	return st
}
// handleBackup handles a request that intends to do a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "member", "snap")
	destSnap := path.Join(c.String("backup-dir"), "member", "snap")
	srcWAL := path.Join(c.String("data-dir"), "member", "wal")
	destWAL := path.Join(c.String("backup-dir"), "member", "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		newss := snap.New(destSnap)
		if err = newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenForRead(srcWAL, walsnap)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	switch err {
	case nil:
	case wal.ErrSnapshotNotFound:
		fmt.Printf("Failed to find the matching snapshot record %+v in wal %v.", walsnap, srcWAL)
		fmt.Printf("etcdctl will add it back. Start auto fixing...")
	default:
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	idgen := idutil.NewGenerator(0, time.Now())
	metadata.NodeID = idgen.Next()
	metadata.ClusterID = idgen.Next()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
	if err := neww.SaveSnapshot(walsnap); err != nil {
		log.Fatal(err)
	}
}
func (s *EtcdServer) apply(es []raftpb.Entry, nodes []uint64) uint64 {
	var applied uint64
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			var r pb.Request
			pbutil.MustUnmarshal(&r, e.Data)
			s.w.Trigger(r.ID, s.applyRequest(r))
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			s.w.Trigger(cc.ID, s.applyConfChange(cc, nodes))
		default:
			panic("unexpected entry type")
		}
		atomic.StoreUint64(&s.raftIndex, e.Index)
		atomic.StoreUint64(&s.raftTerm, e.Term)
		applied = e.Index
	}
	return applied
}
func retrieveMessage(e pb.Entry) serverpb.Request {
	var request serverpb.Request
	var raftReq serverpb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		pbutil.MustUnmarshal(&request, e.Data)
	} else {
		switch {
		case raftReq.V2 != nil:
			request = *raftReq.V2
		}
	}
	return request
}
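// The try-the-new-format-then-fall-back shape above recurs throughout these
// snippets: MaybeUnmarshal reports whether the bytes parse as the newer
// InternalRaftRequest envelope, and on failure the data is decoded as a
// legacy Request. A standalone sketch of the same pattern with encoding/json
// stand-ins (illustrative only; pbutil operates on protobufs, and with JSON
// the nil check on V2 does the discriminating):

package main

import (
	"encoding/json"
	"fmt"
)

type legacyReq struct{ Path string }
type wrappedReq struct{ V2 *legacyReq }

// maybeUnmarshal reports whether data parses into v, mirroring the
// "try, and tell me whether it worked" contract of pbutil.MaybeUnmarshal.
func maybeUnmarshal(v interface{}, data []byte) bool {
	return json.Unmarshal(data, v) == nil
}

func decode(data []byte) legacyReq {
	var w wrappedReq
	if maybeUnmarshal(&w, data) && w.V2 != nil {
		return *w.V2 // new envelope carrying a v2 request
	}
	var r legacyReq
	_ = json.Unmarshal(data, &r) // fall back to the legacy encoding
	return r
}

func main() {
	fmt.Println(decode([]byte(`{"V2":{"Path":"/new"}}`)))
	fmt.Println(decode([]byte(`{"Path":"/old"}`)))
}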
func readWAL(waldir string, index uint64) (w *wal.WAL, id, cid types.ID, st raftpb.HardState, ents []raftpb.Entry) {
	var err error
	if w, err = wal.Open(waldir, index); err != nil {
		log.Fatalf("etcdserver: open wal error: %v", err)
	}
	var wmetadata []byte
	if wmetadata, st, ents, err = w.ReadAll(); err != nil {
		log.Fatalf("etcdserver: read wal error: %v", err)
	}
	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
// handleBackup handles a request that intends to do a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "snap")
	destSnap := path.Join(c.String("backup-dir"), "snap")
	srcWAL := path.Join(c.String("data-dir"), "wal")
	destWAL := path.Join(c.String("backup-dir"), "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var index uint64
	if snapshot != nil {
		index = snapshot.Metadata.Index
		newss := snap.New(destSnap)
		if err := newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenNotInUse(srcWAL, index)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	rand.Seed(time.Now().UnixNano())
	metadata.NodeID = etcdserver.GenID()
	metadata.ClusterID = etcdserver.GenID()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
}
// handleBackup handles a request that intends to do a backup.
func handleBackup(c *cli.Context) {
	srcSnap := path.Join(c.String("data-dir"), "snap")
	destSnap := path.Join(c.String("backup-dir"), "snap")
	srcWAL := path.Join(c.String("data-dir"), "wal")
	destWAL := path.Join(c.String("backup-dir"), "wal")

	if err := os.MkdirAll(destSnap, 0700); err != nil {
		log.Fatalf("failed creating backup snapshot dir %v: %v", destSnap, err)
	}
	ss := snap.New(srcSnap)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		log.Fatal(err)
	}
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		newss := snap.New(destSnap)
		if err := newss.SaveSnap(*snapshot); err != nil {
			log.Fatal(err)
		}
	}

	w, err := wal.OpenNotInUse(srcWAL, walsnap)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	wmetadata, state, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	var metadata etcdserverpb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	idgen := idutil.NewGenerator(0, time.Now())
	metadata.NodeID = idgen.Next()
	metadata.ClusterID = idgen.Next()

	neww, err := wal.Create(destWAL, pbutil.MustMarshal(&metadata))
	if err != nil {
		log.Fatal(err)
	}
	defer neww.Close()
	if err := neww.Save(state, ents); err != nil {
		log.Fatal(err)
	}
}
func restartNode(cfg *ServerConfig, index uint64, snapshot *raftpb.Snapshot) (id uint64, n raft.Node, w *wal.WAL) {
	var err error
	// restart a node from previous wal
	if w, err = wal.OpenAtIndex(cfg.WALDir(), index); err != nil {
		log.Fatal(err)
	}
	wmetadata, st, ents, err := w.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = metadata.NodeID
	cfg.Cluster.SetID(metadata.ClusterID)

	log.Printf("etcdserver: restart member %x in cluster %x at commit index %d", id, cfg.Cluster.ID(), st.Commit)
	n = raft.RestartNode(id, 10, 1, snapshot, st, ents)
	return
}
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			s.applyEntryNormal(&e)
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
// ReadAll reads out records of the current WAL.
// If opened in write mode, it must read out all records until EOF;
// otherwise an error will be returned.
// If opened in read mode, it will try to read all records if possible.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// all the records and error ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the checking of match.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	w.mu.Lock()
	defer w.mu.Unlock()

	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// current crc of decoder must match the crc of the record.
			// no need to match a 0 crc, since the decoder is a new one in this case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}

	switch w.f {
	case nil:
		// We do not have to read out all entries in read mode.
		// The last record may be a partially written one, so
		// io.ErrUnexpectedEOF might be returned.
		if err != io.EOF && err != io.ErrUnexpectedEOF {
			state.Reset()
			return nil, state, nil, err
		}
	default:
		// We must read all of the entries if the WAL is opened in write mode.
		if err != io.EOF {
			state.Reset()
			return nil, state, nil, err
		}
	}

	err = nil
	if !match {
		err = ErrSnapshotNotFound
	}

	// close decoder, disable reading
	w.decoder.close()
	w.start = walpb.Snapshot{}

	w.metadata = metadata

	if w.f != nil {
		// create encoder (chain crc with the decoder), enable appending
		w.encoder = newEncoder(w.f, w.decoder.lastCRC())
		w.decoder = nil
		lastIndexSaved.Set(float64(w.enti))
	}

	return metadata, state, ents, err
}
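// A hedged sketch of the caller side of ReadAll, modeled on the readWAL and
// restartNode snippets above: open the WAL at the snapshot point, read
// everything back, and feed the results into raft's in-memory storage.
// Error handling is abbreviated and the import paths assume etcd's tree:

package walrecovery

import (
	"log"

	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/wal"
	"github.com/coreos/etcd/wal/walpb"
)

// recoverStorage replays a WAL into raft.MemoryStorage after a restart.
func recoverStorage(waldir string, snapshot *raftpb.Snapshot) (*raft.MemoryStorage, []byte) {
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
	}
	w, err := wal.Open(waldir, walsnap)
	if err != nil {
		log.Fatalf("open wal error: %v", err)
	}
	wmetadata, st, ents, err := w.ReadAll()
	if err != nil {
		log.Fatalf("read wal error: %v", err)
	}
	storage := raft.NewMemoryStorage()
	if snapshot != nil {
		storage.ApplySnapshot(*snapshot) // restore snapshot state first
	}
	storage.SetHardState(st) // term/vote/commit recovered from the WAL
	storage.Append(ents)     // log entries after the snapshot index
	return storage, wmetadata
}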
// applyEntryNormal applies an EntryNormal type raftpb request to the EtcdServer
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	shouldApplyV3 := false
	if e.Index > s.consistIndex.ConsistentIndex() {
		// set the consistent index of current executing entry
		s.consistIndex.setConsistentIndex(e.Index)
		shouldApplyV3 = true
	}
	defer s.setAppliedIndex(e.Index)

	// the raft state machine may generate a noop entry upon leader confirmation.
	// skip it in advance to avoid any potential bugs in the future
	if len(e.Data) == 0 {
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		// promote lessor when the local member is leader and finished
		// applying all entries from the last term.
		if s.isLeader() {
			s.lessor.Promote(s.Cfg.electionTimeout())
		}
		return
	}

	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
		var r pb.Request
		pbutil.MustUnmarshal(&r, e.Data)
		s.w.Trigger(r.ID, s.applyV2Request(&r))
		return
	}
	if raftReq.V2 != nil {
		req := raftReq.V2
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}

	var ar *applyResult
	needResult := s.w.IsRegistered(id)
	if needResult || !noSideEffect(&raftReq) {
		if !needResult && raftReq.Txn != nil {
			removeNeedlessRangeReqs(raftReq.Txn)
		}
		ar = s.applyV3.Apply(&raftReq)
	}

	if ar == nil {
		return
	}

	if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	s.goAttach(func() {
		a := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		r := pb.InternalRaftRequest{Alarm: a}
		s.processInternalRaftRequest(context.TODO(), r)
		s.w.Trigger(id, ar)
	})
}
// apply takes entries received from Raft (after they have been committed) and
// applies them to the current state of the EtcdServer.
// The given entries should not be empty.
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState) (uint64, bool) {
	var applied uint64
	var shouldstop bool
	for i := range es {
		e := es[i]
		switch e.Type {
		case raftpb.EntryNormal:
			// the raft state machine may generate a noop entry upon leader confirmation.
			// skip it in advance to avoid any potential bugs in the future
			if len(e.Data) == 0 {
				select {
				case s.forceVersionC <- struct{}{}:
				default:
				}
				break
			}
			var raftReq pb.InternalRaftRequest
			if !pbutil.MaybeUnmarshal(&raftReq, e.Data) { // backward compatible
				var r pb.Request
				pbutil.MustUnmarshal(&r, e.Data)
				s.w.Trigger(r.ID, s.applyRequest(r))
			} else if raftReq.V2 != nil {
				req := raftReq.V2
				s.w.Trigger(req.ID, s.applyRequest(*req))
			} else {
				// do not re-apply applied entries.
				if e.Index <= s.consistIndex.ConsistentIndex() {
					break
				}
				// set the consistent index of current executing entry
				s.consistIndex.setConsistentIndex(e.Index)
				ar := s.applyV3Request(&raftReq)
				if ar.err != ErrNoSpace || len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) > 0 {
					s.w.Trigger(raftReq.ID, ar)
					break
				}
				plog.Errorf("applying raft message exceeded backend quota")
				go func() {
					a := &pb.AlarmRequest{
						MemberID: uint64(s.ID()),
						Action:   pb.AlarmRequest_ACTIVATE,
						Alarm:    pb.AlarmType_NOSPACE,
					}
					r := pb.InternalRaftRequest{Alarm: a}
					s.processInternalRaftRequest(context.TODO(), r)
					s.w.Trigger(raftReq.ID, ar)
				}()
			}
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, e.Data)
			removedSelf, err := s.applyConfChange(cc, confState)
			shouldstop = shouldstop || removedSelf
			s.w.Trigger(cc.ID, err)
		default:
			plog.Panicf("entry type should be either EntryNormal or EntryConfChange")
		}
		atomic.StoreUint64(&s.r.index, e.Index)
		atomic.StoreUint64(&s.r.term, e.Term)
		applied = e.Index
	}
	return applied, shouldstop
}
func mustUnmarshalState(d []byte) raftpb.HardState {
	var s raftpb.HardState
	pbutil.MustUnmarshal(&s, d)
	return s
}
func mustUnmarshalEntry(d []byte) raftpb.Entry {
	var e raftpb.Entry
	pbutil.MustUnmarshal(&e, d)
	return e
}
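// Both helpers wrap pbutil.MustUnmarshal, which panics on a decode error;
// that contract fits here because WAL records are CRC-validated before being
// unmarshaled, so a failure means corruption rather than bad input. A
// standalone illustration of the must-pattern with an encoding/json stand-in
// (pbutil itself operates on protobufs):

package main

import (
	"encoding/json"
	"fmt"
)

// mustUnmarshal decodes into v and panics on failure, mirroring the
// contract of pbutil.MustUnmarshal for data that was already validated.
func mustUnmarshal(v interface{}, data []byte) {
	if err := json.Unmarshal(data, v); err != nil {
		panic(fmt.Sprintf("unmarshal should never fail (%v)", err))
	}
}

type hardState struct {
	Term   uint64
	Vote   uint64
	Commit uint64
}

func main() {
	var s hardState
	mustUnmarshal(&s, []byte(`{"Term":3,"Vote":1,"Commit":42}`))
	fmt.Printf("%+v\n", s)
}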
// ReadAll reads out all records of the current WAL.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the checking of match.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !reflect.DeepEqual(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// current crc of decoder must match the crc of the record.
			// no need to match a 0 crc, since the decoder is a new one in this case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}
	if err != io.EOF {
		state.Reset()
		return nil, state, nil, err
	}
	if !match {
		state.Reset()
		return nil, state, nil, ErrSnapshotNotFound
	}

	// close decoder, disable reading
	w.decoder.close()
	w.start = walpb.Snapshot{}

	w.metadata = metadata

	// create encoder (chain crc with the decoder), enable appending
	w.encoder = newEncoder(w.f, w.decoder.lastCRC())
	w.decoder = nil

	return metadata, state, ents, nil
}
func rebuild(datadir string) ([]byte, *raftpb.HardState, store.Store, error) {
	waldir := path.Join(datadir, "member", "wal")
	snapdir := path.Join(datadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		return nil, nil, nil, err
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		return nil, nil, nil, err
	}
	defer w.Close()

	meta, hardstate, ents, err := w.ReadAll()
	if err != nil {
		return nil, nil, nil, err
	}

	st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			return nil, nil, nil, err
		}
	}

	cluster := membership.NewCluster("")
	cluster.SetStore(st)
	cluster.Recover(func(*semver.Version) {})

	applier := etcdserver.NewApplierV2(st, cluster)
	for _, ent := range ents {
		if ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, ent.Data)
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				m := new(membership.Member)
				if err := json.Unmarshal(cc.Context, m); err != nil {
					return nil, nil, nil, err
				}
				cluster.AddMember(m)
			case raftpb.ConfChangeRemoveNode:
				id := types.ID(cc.NodeID)
				cluster.RemoveMember(id)
			case raftpb.ConfChangeUpdateNode:
				m := new(membership.Member)
				if err := json.Unmarshal(cc.Context, m); err != nil {
					return nil, nil, nil, err
				}
				cluster.UpdateRaftAttributes(m.ID, m.RaftAttributes)
			}
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
	}

	return meta, &hardstate, st, nil
}
func rebuildStoreV2() (store.Store, uint64) {
	var index uint64
	cl := membership.NewCluster("")

	waldir := migrateWALdir
	if len(waldir) == 0 {
		waldir = path.Join(migrateDatadir, "member", "wal")
	}
	snapdir := path.Join(migrateDatadir, "member", "snap")

	ss := snap.New(snapdir)
	snapshot, err := ss.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		ExitWithError(ExitError, err)
	}

	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
		index = snapshot.Metadata.Index
	}

	w, err := wal.OpenForRead(waldir, walsnap)
	if err != nil {
		ExitWithError(ExitError, err)
	}
	defer w.Close()

	_, _, ents, err := w.ReadAll()
	if err != nil {
		ExitWithError(ExitError, err)
	}

	st := store.New()
	if snapshot != nil {
		err := st.Recovery(snapshot.Data)
		if err != nil {
			ExitWithError(ExitError, err)
		}
	}

	cl.SetStore(st)
	cl.Recover(api.UpdateCapability)

	applier := etcdserver.NewApplierV2(st, cl)
	for _, ent := range ents {
		if ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			pbutil.MustUnmarshal(&cc, ent.Data)
			applyConf(cc, cl)
			continue
		}

		var raftReq pb.InternalRaftRequest
		if !pbutil.MaybeUnmarshal(&raftReq, ent.Data) { // backward compatible
			var r pb.Request
			pbutil.MustUnmarshal(&r, ent.Data)
			applyRequest(&r, applier)
		} else {
			if raftReq.V2 != nil {
				req := raftReq.V2
				applyRequest(req, applier)
			}
		}
		if ent.Index > index {
			index = ent.Index
		}
	}

	return st, index
}
// ReadAll reads out records of the current WAL.
// If opened in write mode, it must read out all records until EOF;
// otherwise an error will be returned.
// If opened in read mode, it will try to read all records if possible.
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match the expected one, it will return
// all the records and error ErrSnapshotMismatch.
// TODO: detect not-last-snap error.
// TODO: maybe loosen the checking of match.
// After ReadAll, the WAL will be ready for appending new records.
func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.Entry, err error) {
	w.mu.Lock()
	defer w.mu.Unlock()

	rec := &walpb.Record{}
	decoder := w.decoder

	var match bool
	for err = decoder.decode(rec); err == nil; err = decoder.decode(rec) {
		switch rec.Type {
		case entryType:
			e := mustUnmarshalEntry(rec.Data)
			if e.Index > w.start.Index {
				ents = append(ents[:e.Index-w.start.Index-1], e)
			}
			w.enti = e.Index
		case stateType:
			state = mustUnmarshalState(rec.Data)
		case metadataType:
			if metadata != nil && !bytes.Equal(metadata, rec.Data) {
				state.Reset()
				return nil, state, nil, ErrMetadataConflict
			}
			metadata = rec.Data
		case crcType:
			crc := decoder.crc.Sum32()
			// current crc of decoder must match the crc of the record.
			// no need to match a 0 crc, since the decoder is a new one in this case.
			if crc != 0 && rec.Validate(crc) != nil {
				state.Reset()
				return nil, state, nil, ErrCRCMismatch
			}
			decoder.updateCRC(rec.Crc)
		case snapshotType:
			var snap walpb.Snapshot
			pbutil.MustUnmarshal(&snap, rec.Data)
			if snap.Index == w.start.Index {
				if snap.Term != w.start.Term {
					state.Reset()
					return nil, state, nil, ErrSnapshotMismatch
				}
				match = true
			}
		default:
			state.Reset()
			return nil, state, nil, fmt.Errorf("unexpected block type %d", rec.Type)
		}
	}

	switch w.tail() {
	case nil:
		// We do not have to read out all entries in read mode.
		// The last record may be a partially written one, so
		// io.ErrUnexpectedEOF might be returned.
		if err != io.EOF && err != io.ErrUnexpectedEOF {
			state.Reset()
			return nil, state, nil, err
		}
	default:
		// We must read all of the entries if the WAL is opened in write mode.
		if err != io.EOF {
			state.Reset()
			return nil, state, nil, err
		}
		// decodeRecord() will return io.EOF if it detects a zero record,
		// but this zero record may be followed by non-zero records from
		// a torn write. Overwriting some of these non-zero records, but
		// not all, will cause CRC errors on WAL open. Since the records
		// were never fully synced to disk in the first place, it's safe
		// to zero them out to avoid any CRC errors from new writes.
		if _, err = w.tail().Seek(w.decoder.lastOffset(), os.SEEK_SET); err != nil {
			return nil, state, nil, err
		}
		if err = fileutil.ZeroToEnd(w.tail().File); err != nil {
			return nil, state, nil, err
		}
	}

	err = nil
	if !match {
		err = ErrSnapshotNotFound
	}

	// close decoder, disable reading
	if w.readClose != nil {
		w.readClose()
		w.readClose = nil
	}
	w.start = walpb.Snapshot{}

	w.metadata = metadata

	if w.tail() != nil {
		// create encoder (chain crc with the decoder), enable appending
		w.encoder = newEncoder(w.tail(), w.decoder.lastCRC())
	}
	w.decoder = nil

	return metadata, state, ents, err
}
func (dec *msgAppV2Decoder) decode() (raftpb.Message, error) {
	var (
		m   raftpb.Message
		typ uint8
	)
	if _, err := io.ReadFull(dec.r, dec.uint8buf); err != nil {
		return m, err
	}
	typ = uint8(dec.uint8buf[0])
	switch typ {
	case msgTypeLinkHeartbeat:
		return linkHeartbeatMessage, nil
	case msgTypeAppEntries:
		m = raftpb.Message{
			Type:    raftpb.MsgApp,
			From:    uint64(dec.remote),
			To:      uint64(dec.local),
			Term:    dec.term,
			LogTerm: dec.term,
			Index:   dec.index,
		}

		// decode entries
		if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
			return m, err
		}
		l := binary.BigEndian.Uint64(dec.uint64buf)
		m.Entries = make([]raftpb.Entry, int(l))
		for i := 0; i < int(l); i++ {
			if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
				return m, err
			}
			size := binary.BigEndian.Uint64(dec.uint64buf)
			var buf []byte
			if size < msgAppV2BufSize {
				buf = dec.buf[:size]
				if _, err := io.ReadFull(dec.r, buf); err != nil {
					return m, err
				}
			} else {
				buf = make([]byte, int(size))
				if _, err := io.ReadFull(dec.r, buf); err != nil {
					return m, err
				}
			}
			dec.index++
			// 1 alloc
			pbutil.MustUnmarshal(&m.Entries[i], buf)
		}

		// decode commit index
		if _, err := io.ReadFull(dec.r, dec.uint64buf); err != nil {
			return m, err
		}
		m.Commit = binary.BigEndian.Uint64(dec.uint64buf)
	case msgTypeApp:
		var size uint64
		if err := binary.Read(dec.r, binary.BigEndian, &size); err != nil {
			return m, err
		}
		buf := make([]byte, int(size))
		if _, err := io.ReadFull(dec.r, buf); err != nil {
			return m, err
		}
		pbutil.MustUnmarshal(&m, buf)

		dec.term = m.Term
		dec.index = m.Index
		if l := len(m.Entries); l > 0 {
			dec.index = m.Entries[l-1].Index
		}
	default:
		return m, fmt.Errorf("failed to parse type %d in msgappv2 stream", typ)
	}
	return m, nil
}
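// The decode path above fixes the msgTypeAppEntries wire framing: one type
// byte, a big-endian uint64 entry count, then a uint64 length prefix and
// payload per entry, and finally a uint64 commit index. A hedged encoder
// sketch for just that framing; the constant's value is illustrative, and
// []byte payloads stand in for marshaled raftpb.Entry data:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

const msgTypeAppEntries uint8 = 1 // illustrative; the real constant lives in rafthttp

// encodeAppEntries writes the framing the decoder above consumes:
// type byte, entry count, (length, payload) per entry, then commit index.
func encodeAppEntries(w *bytes.Buffer, entries [][]byte, commit uint64) {
	w.WriteByte(msgTypeAppEntries)
	binary.Write(w, binary.BigEndian, uint64(len(entries)))
	for _, e := range entries {
		binary.Write(w, binary.BigEndian, uint64(len(e)))
		w.Write(e)
	}
	binary.Write(w, binary.BigEndian, commit)
}

func main() {
	var buf bytes.Buffer
	encodeAppEntries(&buf, [][]byte{[]byte("entry-1"), []byte("entry-2")}, 7)
	fmt.Printf("% x\n", buf.Bytes())
}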