// readWAL reads the WAL at the position of the given snapshot, repairing a
// torn tail (io.ErrUnexpectedEOF) at most once, and returns the open WAL
// along with the node ID, cluster ID, hard state, and entries recovered
// from it.
func readWAL(waldir string, snap walpb.Snapshot) (w *wal.WAL, id, cid types.ID, st raftpb.HardState, ents []raftpb.Entry) {
	var (
		err       error
		wmetadata []byte
	)

	repaired := false
	for {
		if w, err = wal.Open(waldir, snap); err != nil {
			plog.Fatalf("open wal error: %v", err)
		}
		if wmetadata, st, ents, err = w.ReadAll(); err != nil {
			w.Close()
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				plog.Fatalf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(waldir) {
				plog.Fatalf("WAL error (%v) cannot be repaired", err)
			}
			plog.Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	var metadata pb.Metadata
	pbutil.MustUnmarshal(&metadata, wmetadata)
	id = types.ID(metadata.NodeID)
	cid = types.ID(metadata.ClusterID)
	return
}
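For context on how these recovered values are consumed: on restart, etcd replays them into raft's in-memory storage before starting the raft node. A minimal sketch of that wiring, assuming etcd's raft.MemoryStorage API (the surrounding variables and log line are paraphrased for illustration, not etcd's exact restart code):

var walsnap walpb.Snapshot
if snapshot != nil {
	walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
}
w, id, cid, st, ents := readWAL(waldir, walsnap)
plog.Infof("restarting member %s in cluster %s at commit index %d", id, cid, st.Commit)

s := raft.NewMemoryStorage()
if snapshot != nil {
	s.ApplySnapshot(*snapshot) // seed storage with the snapshot first
}
s.SetHardState(st) // then the recovered hard state
s.Append(ents)     // finally, the entries after the snapshot position
_ = w              // the open WAL is retained for subsequent writes

The ordering matters: the snapshot establishes the base index, so the hard state and the trailing entries must be applied on top of it.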
// ReadRepairWAL opens a WAL for reading, and attempts to read it. If we
// can't read it, attempts to repair and read again.
func ReadRepairWAL(
	ctx context.Context,
	walDir string,
	walsnap walpb.Snapshot,
	factory WALFactory,
) (WAL, WALData, error) {
	var (
		reader   WAL
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
		err      error
	)

	repaired := false
	for {
		if reader, err = factory.Open(walDir, walsnap); err != nil {
			return nil, WALData{}, errors.Wrap(err, "failed to open WAL")
		}
		if metadata, st, ents, err = reader.ReadAll(); err != nil {
			if closeErr := reader.Close(); closeErr != nil {
				return nil, WALData{}, closeErr
			}
			if _, ok := err.(encryption.ErrCannotDecrypt); ok {
				return nil, WALData{}, errors.Wrap(err, "failed to decrypt WAL")
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return nil, WALData{}, errors.Wrap(err, "irreparable WAL error")
			}
			if !wal.Repair(walDir) {
				return nil, WALData{}, errors.Wrap(err, "WAL error cannot be repaired")
			}
			log.G(ctx).WithError(err).Info("repaired WAL error")
			repaired = true
			continue
		}
		break
	}

	return reader, WALData{
		Metadata:  metadata,
		HardState: st,
		Entries:   ents,
	}, nil
}
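The WALFactory indirection exists so the same read-and-repair loop can serve both encrypted and plaintext logs (hence the encryption.ErrCannotDecrypt check). Below is a minimal sketch of a pass-through factory backed directly by etcd's wal package, assuming the interface exposes the Create/Open methods used above; the type name is hypothetical, and *wal.WAL is assumed to satisfy the WAL interface:

// passthroughWAL is a hypothetical WALFactory with no encryption: it
// delegates straight to the vanilla etcd wal package.
type passthroughWAL struct{}

func (passthroughWAL) Create(dirpath string, metadata []byte) (WAL, error) {
	return wal.Create(dirpath, metadata)
}

func (passthroughWAL) Open(dirpath string, snap walpb.Snapshot) (WAL, error) {
	return wal.Open(dirpath, snap)
}

A caller then recovers the log in one call:

	reader, walData, err := ReadRepairWAL(ctx, walDir, walsnap, passthroughWAL{})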
// readWAL opens the node's WAL at the position of the given snapshot,
// repairing a torn tail at most once, restores the node's identity and
// membership state from it, and loads the snapshot, hard state, and
// entries into the raft store. If forceNewCluster is set, uncommitted
// entries are discarded and config-change entries removing all other
// members are force-appended and committed.
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open WAL error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read WAL error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling WAL metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	// All members that are no longer part of the cluster must be added to
	// the removed list right away, so that we don't try to connect to them
	// before processing the configuration change entries, which could make
	// us get stuck.
	for _, ent := range ents {
		if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange {
			var cc raftpb.ConfChange
			if err := cc.Unmarshal(ent.Data); err != nil {
				return fmt.Errorf("error unmarshalling config change: %v", err)
			}
			if cc.Type == raftpb.ConfChangeRemoveNode {
				n.cluster.RemoveMember(cc.NodeID)
			}
		}
	}

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(ctx).Infof("discarding %d uncommitted WAL entries", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)

		// All members that are being removed as part of the
		// force-new-cluster process must be added to the removed list
		// right away, so that we don't try to connect to them before
		// processing the configuration change entries, which could make
		// us get stuck.
		for _, ccEnt := range toAppEnts {
			if ccEnt.Type == raftpb.EntryConfChange {
				var cc raftpb.ConfChange
				if err := cc.Unmarshal(ccEnt.Data); err != nil {
					return fmt.Errorf("error unmarshalling force-new-cluster config change: %v", err)
				}
				if cc.Type == raftpb.ConfChangeRemoveNode {
					n.cluster.RemoveMember(cc.NodeID)
				}
			}
		}
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		if err := n.wal.Save(st, toAppEnts); err != nil {
			log.G(ctx).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
// readWAL opens the node's WAL at the position of the given snapshot,
// repairing a torn tail at most once, restores the node's identity from
// it, and loads the snapshot, hard state, and entries into the raft
// store. If forceNewCluster is set, uncommitted entries are discarded and
// config-change entries removing all other members are force-appended and
// committed.
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) {
	var (
		walsnap  walpb.Snapshot
		metadata []byte
		st       raftpb.HardState
		ents     []raftpb.Entry
	)

	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	repaired := false
	for {
		if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil {
			return fmt.Errorf("open wal error: %v", err)
		}
		if metadata, st, ents, err = n.wal.ReadAll(); err != nil {
			if err := n.wal.Close(); err != nil {
				return err
			}
			// we can only repair ErrUnexpectedEOF and we never repair twice.
			if repaired || err != io.ErrUnexpectedEOF {
				return fmt.Errorf("read wal error (%v) and cannot be repaired", err)
			}
			if !wal.Repair(n.walDir()) {
				return fmt.Errorf("WAL error (%v) cannot be repaired", err)
			}
			log.G(ctx).Infof("repaired WAL error (%v)", err)
			repaired = true
			continue
		}
		break
	}

	defer func() {
		if err != nil {
			if walErr := n.wal.Close(); walErr != nil {
				n.Config.Logger.Errorf("error closing raft WAL: %v", walErr)
			}
		}
	}()

	var raftNode api.RaftMember
	if err := raftNode.Unmarshal(metadata); err != nil {
		return fmt.Errorf("error unmarshalling wal metadata: %v", err)
	}
	n.Config.ID = raftNode.RaftID

	if forceNewCluster {
		// discard the previously uncommitted entries
		for i, ent := range ents {
			if ent.Index > st.Commit {
				log.G(ctx).Infof("discarding %d uncommitted WAL entries", len(ents)-i)
				ents = ents[:i]
				break
			}
		}

		// force append the configuration change entries
		toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit)
		ents = append(ents, toAppEnts...)

		// force commit newly appended entries
		if err := n.wal.Save(st, toAppEnts); err != nil {
			log.G(ctx).Fatalf("%v", err)
		}
		if len(toAppEnts) != 0 {
			st.Commit = toAppEnts[len(toAppEnts)-1].Index
		}
	}

	if snapshot != nil {
		if err := n.raftStore.ApplySnapshot(*snapshot); err != nil {
			return err
		}
	}
	if err := n.raftStore.SetHardState(st); err != nil {
		return err
	}
	if err := n.raftStore.Append(ents); err != nil {
		return err
	}

	return nil
}
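Both readWAL variants rely on createConfigChangeEnts to shrink the recovered membership to a single node under forceNewCluster. Its body is not shown here; the hypothetical sketch below illustrates the shape of what it must emit, matching the signature and consumption above (a paraphrase for illustration, not the package's source):

// sketchConfigChangeEnts emits a ConfChangeRemoveNode entry for every
// known member except self. Entries are numbered from index+1 so they
// slot in directly after the recovered commit index, allowing st.Commit
// to be advanced over them once they are saved.
func sketchConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
	var ents []raftpb.Entry
	next := index + 1
	for _, id := range ids {
		if id == self {
			continue // never remove the local member
		}
		cc := raftpb.ConfChange{Type: raftpb.ConfChangeRemoveNode, NodeID: id}
		data, err := cc.Marshal()
		if err != nil {
			continue // marshalling a ConfChange should not fail; skip if it does
		}
		ents = append(ents, raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Term:  term,
			Index: next,
			Data:  data,
		})
		next++
	}
	return ents
}

etcd's version of this helper additionally re-adds the local member (ConfChangeAddNode) when it is missing from the recovered IDs.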