func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) { var ( walsnap walpb.Snapshot metadata []byte st raftpb.HardState ents []raftpb.Entry ) if snapshot != nil { walsnap.Index = snapshot.Metadata.Index walsnap.Term = snapshot.Metadata.Term } repaired := false for { if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil { return fmt.Errorf("open WAL error: %v", err) } if metadata, st, ents, err = n.wal.ReadAll(); err != nil { if err := n.wal.Close(); err != nil { return err } // we can only repair ErrUnexpectedEOF and we never repair twice. if repaired || err != io.ErrUnexpectedEOF { return fmt.Errorf("read WAL error (%v) and cannot be repaired", err) } if !wal.Repair(n.walDir()) { return fmt.Errorf("WAL error (%v) cannot be repaired", err) } log.G(ctx).Infof("repaired WAL error (%v)", err) repaired = true continue } break } defer func() { if err != nil { if walErr := n.wal.Close(); walErr != nil { n.Config.Logger.Errorf("error closing raft WAL: %v", walErr) } } }() var raftNode api.RaftMember if err := raftNode.Unmarshal(metadata); err != nil { return fmt.Errorf("error unmarshalling WAL metadata: %v", err) } n.Config.ID = raftNode.RaftID // All members that are no longer part of the cluster must be added to // the removed list right away, so that we don't try to connect to them // before processing the configuration change entries, which could make // us get stuck. for _, ent := range ents { if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange { var cc raftpb.ConfChange if err := cc.Unmarshal(ent.Data); err != nil { return fmt.Errorf("error unmarshalling config change: %v", err) } if cc.Type == raftpb.ConfChangeRemoveNode { n.cluster.RemoveMember(cc.NodeID) } } } if forceNewCluster { // discard the previously uncommitted entries for i, ent := range ents { if ent.Index > st.Commit { log.G(context.Background()).Infof("discarding %d uncommitted WAL entries ", len(ents)-i) ents = ents[:i] break } } // force append the configuration change entries toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit) // All members that are being removed as part of the // force-new-cluster process must be added to the // removed list right away, so that we don't try to // connect to them before processing the configuration // change entries, which could make us get stuck. for _, ccEnt := range toAppEnts { if ccEnt.Type == raftpb.EntryConfChange { var cc raftpb.ConfChange if err := cc.Unmarshal(ccEnt.Data); err != nil { return fmt.Errorf("error unmarshalling force-new-cluster config change: %v", err) } if cc.Type == raftpb.ConfChangeRemoveNode { n.cluster.RemoveMember(cc.NodeID) } } } ents = append(ents, toAppEnts...) // force commit newly appended entries err := n.wal.Save(st, toAppEnts) if err != nil { log.G(context.Background()).Fatalf("%v", err) } if len(toAppEnts) != 0 { st.Commit = toAppEnts[len(toAppEnts)-1].Index } } if snapshot != nil { if err := n.raftStore.ApplySnapshot(*snapshot); err != nil { return err } } if err := n.raftStore.SetHardState(st); err != nil { return err } if err := n.raftStore.Append(ents); err != nil { return err } return nil }
// bootstraps a node's raft store from the raft logs and snapshots on disk func (n *Node) loadAndStart(ctx context.Context, forceNewCluster bool) error { snapshot, waldata, err := n.readFromDisk(ctx) if err != nil { return err } // Read logs to fully catch up store var raftNode api.RaftMember if err := raftNode.Unmarshal(waldata.Metadata); err != nil { return errors.Wrap(err, "failed to unmarshal WAL metadata") } n.Config.ID = raftNode.RaftID if snapshot != nil { snapCluster, err := n.clusterSnapshot(snapshot.Data) if err != nil { return err } var bootstrapMembers []*api.RaftMember if forceNewCluster { for _, m := range snapCluster.Members { if m.RaftID != n.Config.ID { n.cluster.RemoveMember(m.RaftID) continue } bootstrapMembers = append(bootstrapMembers, m) } } else { bootstrapMembers = snapCluster.Members } n.bootstrapMembers = bootstrapMembers for _, removedMember := range snapCluster.Removed { n.cluster.RemoveMember(removedMember) } } ents, st := waldata.Entries, waldata.HardState // All members that are no longer part of the cluster must be added to // the removed list right away, so that we don't try to connect to them // before processing the configuration change entries, which could make // us get stuck. for _, ent := range ents { if ent.Index <= st.Commit && ent.Type == raftpb.EntryConfChange { var cc raftpb.ConfChange if err := cc.Unmarshal(ent.Data); err != nil { return errors.Wrap(err, "failed to unmarshal config change") } if cc.Type == raftpb.ConfChangeRemoveNode { n.cluster.RemoveMember(cc.NodeID) } } } if forceNewCluster { // discard the previously uncommitted entries for i, ent := range ents { if ent.Index > st.Commit { log.G(ctx).Infof("discarding %d uncommitted WAL entries", len(ents)-i) ents = ents[:i] break } } // force append the configuration change entries toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), n.Config.ID, st.Term, st.Commit) // All members that are being removed as part of the // force-new-cluster process must be added to the // removed list right away, so that we don't try to // connect to them before processing the configuration // change entries, which could make us get stuck. for _, ccEnt := range toAppEnts { if ccEnt.Type == raftpb.EntryConfChange { var cc raftpb.ConfChange if err := cc.Unmarshal(ccEnt.Data); err != nil { return errors.Wrap(err, "error unmarshalling force-new-cluster config change") } if cc.Type == raftpb.ConfChangeRemoveNode { n.cluster.RemoveMember(cc.NodeID) } } } ents = append(ents, toAppEnts...) // force commit newly appended entries err := n.raftLogger.SaveEntries(st, toAppEnts) if err != nil { log.G(ctx).WithError(err).Fatalf("failed to save WAL while forcing new cluster") } if len(toAppEnts) != 0 { st.Commit = toAppEnts[len(toAppEnts)-1].Index } } if snapshot != nil { if err := n.raftStore.ApplySnapshot(*snapshot); err != nil { return err } } if err := n.raftStore.SetHardState(st); err != nil { return err } return n.raftStore.Append(ents) }
func (n *Node) readWAL(ctx context.Context, snapshot *raftpb.Snapshot, forceNewCluster bool) (err error) { var ( walsnap walpb.Snapshot metadata []byte st raftpb.HardState ents []raftpb.Entry ) if snapshot != nil { walsnap.Index = snapshot.Metadata.Index walsnap.Term = snapshot.Metadata.Term } repaired := false for { if n.wal, err = wal.Open(n.walDir(), walsnap); err != nil { return fmt.Errorf("open wal error: %v", err) } if metadata, st, ents, err = n.wal.ReadAll(); err != nil { if err := n.wal.Close(); err != nil { return err } // we can only repair ErrUnexpectedEOF and we never repair twice. if repaired || err != io.ErrUnexpectedEOF { return fmt.Errorf("read wal error (%v) and cannot be repaired", err) } if !wal.Repair(n.walDir()) { return fmt.Errorf("WAL error (%v) cannot be repaired", err) } log.G(ctx).Infof("repaired WAL error (%v)", err) repaired = true continue } break } defer func() { if err != nil { if walErr := n.wal.Close(); walErr != nil { n.Config.Logger.Errorf("error closing raft WAL: %v", walErr) } } }() var raftNode api.RaftMember if err := raftNode.Unmarshal(metadata); err != nil { return fmt.Errorf("error unmarshalling wal metadata: %v", err) } n.Config.ID = raftNode.RaftID if forceNewCluster { // discard the previously uncommitted entries for i, ent := range ents { if ent.Index > st.Commit { log.G(context.Background()).Infof("discarding %d uncommitted WAL entries ", len(ents)-i) ents = ents[:i] break } } // force append the configuration change entries toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(n.Config.ID), st.Term, st.Commit) ents = append(ents, toAppEnts...) // force commit newly appended entries err := n.wal.Save(st, toAppEnts) if err != nil { log.G(context.Background()).Fatalf("%v", err) } if len(toAppEnts) != 0 { st.Commit = toAppEnts[len(toAppEnts)-1].Index } } if snapshot != nil { if err := n.raftStore.ApplySnapshot(*snapshot); err != nil { return err } } if err := n.raftStore.SetHardState(st); err != nil { return err } if err := n.raftStore.Append(ents); err != nil { return err } return nil }