func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
	if raft.IsEmptySnap(apply.snapshot) {
		return
	}
	if apply.snapshot.Metadata.Index <= ep.appliedi {
		plog.Panicf("snapshot index [%d] should > appliedi[%d] + 1",
			apply.snapshot.Metadata.Index, ep.appliedi)
	}

	if s.cfg.V3demo {
		snapfn, err := s.r.storage.DBFilePath(apply.snapshot.Metadata.Index)
		if err != nil {
			plog.Panicf("get database snapshot file path error: %v", err)
		}

		fn := path.Join(s.cfg.SnapDir(), databaseFilename)
		if err := os.Rename(snapfn, fn); err != nil {
			plog.Panicf("rename snapshot file error: %v", err)
		}

		newbe := backend.NewDefaultBackend(fn)
		newKV := dstorage.New(newbe, &s.consistIndex)
		if err := newKV.Restore(); err != nil {
			plog.Panicf("restore KV error: %v", err)
		}

		oldKV := s.swapKV(newKV)

		// Closing oldKV might block until all the txns
		// on the kv are finished.
		// We do not want to wait on closing the old kv.
		go func() {
			if err := oldKV.Close(); err != nil {
				plog.Panicf("close KV error: %v", err)
			}
		}()
	}

	if err := s.store.Recovery(apply.snapshot.Data); err != nil {
		plog.Panicf("recovery store error: %v", err)
	}
	s.cluster.Recover()

	// recover raft transport
	s.r.transport.RemoveAllPeers()
	for _, m := range s.cluster.Members() {
		if m.ID == s.ID() {
			continue
		}
		s.r.transport.AddPeer(m.ID, m.PeerURLs)
	}

	ep.appliedi = apply.snapshot.Metadata.Index
	ep.snapi = ep.appliedi
	ep.confState = apply.snapshot.Metadata.ConfState
	plog.Infof("recovered from incoming snapshot at index %d", ep.snapi)
}
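// applySnapshot above relies on a swapKV helper that is not shown in this
// excerpt. The following is a minimal sketch of what such a helper might look
// like, assuming the server guards its kv field with a mutex; the kvMu field
// name and the dstorage.KV interface choice are assumptions for illustration,
// not part of the original source.
func (s *EtcdServer) swapKV(newKV dstorage.KV) dstorage.KV {
	// Take the (assumed) lock so readers never observe a half-replaced store.
	s.kvMu.Lock()
	defer s.kvMu.Unlock()
	oldKV := s.kv
	s.kv = newKV
	// Return the old store so the caller can close it asynchronously.
	return oldKV
}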
// Applied > SnapCount should trigger a SaveSnap event
func TestTriggerSnap(t *testing.T) {
	be, tmpPath := backend.NewDefaultTmpBackend()
	defer func() {
		os.RemoveAll(tmpPath)
	}()

	snapc := 10
	st := mockstore.NewRecorder()
	p := mockstorage.NewStorageRecorderStream("")
	srv := &EtcdServer{
		cfg:       &ServerConfig{TickMs: 1},
		snapCount: uint64(snapc),
		r: raftNode{
			Node:        newNodeCommitter(),
			raftStorage: raft.NewMemoryStorage(),
			storage:     p,
			transport:   rafthttp.NewNopTransporter(),
		},
		store:    st,
		reqIDGen: idutil.NewGenerator(0, time.Time{}),
	}
	srv.kv = dstorage.New(be, &lease.FakeLessor{}, &srv.consistIndex)
	srv.be = be

	srv.start()

	donec := make(chan struct{})
	go func() {
		wcnt := 2 + snapc
		gaction, _ := p.Wait(wcnt)

		// each operation is recorded as a Save
		// (SnapCount+1) * Puts + SaveSnap = (SnapCount+1) * Save + SaveSnap
		if len(gaction) != wcnt {
			t.Fatalf("len(action) = %d, want %d", len(gaction), wcnt)
		}
		if !reflect.DeepEqual(gaction[wcnt-1], testutil.Action{Name: "SaveSnap"}) {
			t.Errorf("action = %s, want SaveSnap", gaction[wcnt-1])
		}
		close(donec)
	}()

	for i := 0; i < snapc+1; i++ {
		srv.Do(context.Background(), pb.Request{Method: "PUT"})
	}

	srv.Stop()
	<-donec
}
// snapshot should snapshot the store and cut the persisted raft log
func TestSnapshot(t *testing.T) {
	be, tmpPath := backend.NewDefaultTmpBackend()
	defer func() {
		os.RemoveAll(tmpPath)
	}()

	s := raft.NewMemoryStorage()
	s.Append([]raftpb.Entry{{Index: 1}})
	st := mockstore.NewRecorder()
	p := mockstorage.NewStorageRecorder("")
	srv := &EtcdServer{
		cfg: &ServerConfig{},
		r: raftNode{
			Node:        newNodeNop(),
			raftStorage: s,
			storage:     p,
		},
		store: st,
	}
	srv.kv = dstorage.New(be, &lease.FakeLessor{}, &srv.consistIndex)
	srv.be = be

	srv.snapshot(1, raftpb.ConfState{Nodes: []uint64{1}})
	gaction, _ := st.Wait(2)
	if len(gaction) != 2 {
		t.Fatalf("len(action) = %d, want 2", len(gaction))
	}
	if !reflect.DeepEqual(gaction[0], testutil.Action{Name: "Clone"}) {
		t.Errorf("action = %s, want Clone", gaction[0])
	}
	if !reflect.DeepEqual(gaction[1], testutil.Action{Name: "SaveNoCopy"}) {
		t.Errorf("action = %s, want SaveNoCopy", gaction[1])
	}

	gaction = p.Action()
	if len(gaction) != 1 {
		t.Fatalf("len(action) = %d, want 1", len(gaction))
	}
	if !reflect.DeepEqual(gaction[0], testutil.Action{Name: "SaveSnap"}) {
		t.Errorf("action = %s, want SaveSnap", gaction[0])
	}
}
// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
	st := store.New(StoreClusterPrefix, StoreKeysPrefix)

	var (
		w  *wal.WAL
		n  raft.Node
		s  *raft.MemoryStorage
		id types.ID
		cl *cluster
	)

	if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil {
		return nil, fmt.Errorf("cannot access data directory: %v", terr)
	}

	if !cfg.V3demo && fileutil.Exist(path.Join(cfg.SnapDir(), databaseFilename)) {
		return nil, errors.New("experimental-v3demo cannot be disabled once it is enabled")
	}

	// Run the migrations.
	dataVer, err := version.DetectDataDir(cfg.DataDir)
	if err != nil {
		return nil, err
	}
	if err := upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil {
		return nil, err
	}

	haveWAL := wal.Exist(cfg.WALDir())
	ss := snap.New(cfg.SnapDir())

	prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
	if err != nil {
		return nil, err
	}
	var remotes []*Member

	switch {
	case !haveWAL && !cfg.NewCluster:
		if err := cfg.VerifyJoinExisting(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		existingCluster, err := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
		if err != nil {
			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", err)
		}
		if err := ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
		}
		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) {
			return nil, fmt.Errorf("incompatible with current running cluster")
		}

		remotes = existingCluster.Members()
		cl.SetID(existingCluster.id)
		cl.SetStore(st)
		cfg.Print()
		id, n, s, w = startNode(cfg, cl, nil)
	case !haveWAL && cfg.NewCluster:
		if err := cfg.VerifyBootstrap(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		m := cl.MemberByName(cfg.Name)
		if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) {
			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
		}
		if cfg.ShouldDiscover() {
			var str string
			var err error
			str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
			if err != nil {
				return nil, &DiscoveryError{Op: "join", Err: err}
			}
			urlsmap, err := types.NewURLsMap(str)
			if err != nil {
				return nil, err
			}
			if checkDuplicateURL(urlsmap) {
				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
			}
			if cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil {
				return nil, err
			}
		}
		cl.SetStore(st)
		cfg.PrintWithInitial()
		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
	case haveWAL:
		if err := fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
			return nil, fmt.Errorf("cannot write to member directory: %v", err)
		}
		if err := fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
		}

		if cfg.ShouldDiscover() {
			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
		}

		var snapshot *raftpb.Snapshot
		var err error
		snapshot, err = ss.Load()
		if err != nil && err != snap.ErrNoSnapshot {
			return nil, err
		}
		if snapshot != nil {
			if err := st.Recovery(snapshot.Data); err != nil {
				plog.Panicf("recovered store from snapshot error: %v", err)
			}
			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
		}
		cfg.Print()
		if !cfg.ForceNewCluster {
			id, cl, n, s, w = restartNode(cfg, snapshot)
		} else {
			id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot)
		}
		cl.SetStore(st)
		cl.Recover()
	default:
		return nil, fmt.Errorf("unsupported bootstrap config")
	}

	if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil {
		return nil, fmt.Errorf("cannot access member directory: %v", terr)
	}

	sstats := &stats.ServerStats{
		Name: cfg.Name,
		ID:   id.String(),
	}
	sstats.Initialize()
	lstats := stats.NewLeaderStats(id.String())

	srv := &EtcdServer{
		cfg:       cfg,
		snapCount: cfg.SnapCount,
		errorc:    make(chan error, 1),
		store:     st,
		r: raftNode{
			Node:        n,
			ticker:      time.Tick(time.Duration(cfg.TickMs) * time.Millisecond),
			raftStorage: s,
			storage:     NewStorage(w, ss),
		},
		id:            id,
		attributes:    Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
		cluster:       cl,
		stats:         sstats,
		lstats:        lstats,
		SyncTicker:    time.Tick(500 * time.Millisecond),
		peerRt:        prt,
		reqIDGen:      idutil.NewGenerator(uint16(id), time.Now()),
		forceVersionC: make(chan struct{}),
		msgSnapC:      make(chan raftpb.Message, maxInFlightMsgSnap),
	}

	if cfg.V3demo {
		srv.be = backend.NewDefaultBackend(path.Join(cfg.SnapDir(), databaseFilename))
		srv.lessor = lease.NewLessor(srv.be)
		srv.kv = dstorage.New(srv.be, srv.lessor, &srv.consistIndex)
		srv.authStore = auth.NewAuthStore(srv)
		if h := cfg.AutoCompactionRetention; h != 0 {
			srv.compactor = compactor.NewPeriodic(h, srv.kv, srv)
			srv.compactor.Run()
		}
	}

	// TODO: move transport initialization near the definition of remote
	tr := &rafthttp.Transport{
		TLSInfo:     cfg.PeerTLSInfo,
		DialTimeout: cfg.peerDialTimeout(),
		ID:          id,
		URLs:        cfg.PeerURLs,
		ClusterID:   cl.ID(),
		Raft:        srv,
		Snapshotter: ss,
		ServerStats: sstats,
		LeaderStats: lstats,
		ErrorC:      srv.errorc,
		V3demo:      cfg.V3demo,
	}
	if err := tr.Start(); err != nil {
		return nil, err
	}
	// add all remotes into transport
	for _, m := range remotes {
		if m.ID != id {
			tr.AddRemote(m.ID, m.PeerURLs)
		}
	}
	for _, m := range cl.Members() {
		if m.ID != id {
			tr.AddPeer(m.ID, m.PeerURLs)
		}
	}
	srv.r.transport = tr

	return srv, nil
}
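// A minimal usage sketch for NewServer, not taken from the original source.
// It assumes only the ServerConfig fields that the function above actually
// reads (Name, DataDir, NewCluster, InitialClusterToken, InitialPeerURLsMap,
// TickMs, SnapCount) and an EtcdServer.Start method; exact field types and the
// start-up call may differ between revisions.
func exampleStartSingleNode() (*EtcdServer, error) {
	// One-member bootstrap map; the member name must match cfg.Name.
	urlsmap, err := types.NewURLsMap("infra1=http://127.0.0.1:2380")
	if err != nil {
		return nil, err
	}
	cfg := &ServerConfig{
		Name:                "infra1",
		DataDir:             "/var/lib/etcd",
		NewCluster:          true,
		InitialClusterToken: "etcd-cluster-1",
		InitialPeerURLsMap:  urlsmap,
		TickMs:              100,
		SnapCount:           10000,
	}
	srv, err := NewServer(cfg)
	if err != nil {
		return nil, err
	}
	// Start launches the server's raft and apply loops.
	srv.Start()
	return srv, nil
}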
// TestConcurrentApplyAndSnapshotV3 will send out snapshots concurrently with
// proposals.
func TestConcurrentApplyAndSnapshotV3(t *testing.T) {
	const (
		// snapshots that may queue up at once without dropping
		maxInFlightMsgSnap = 16
	)
	n := newNopReadyNode()
	st := store.New()
	cl := membership.NewCluster("abc")
	cl.SetStore(st)

	testdir, err := ioutil.TempDir(os.TempDir(), "testsnapdir")
	if err != nil {
		t.Fatalf("Couldn't open tempdir (%v)", err)
	}
	defer os.RemoveAll(testdir)
	if err := os.MkdirAll(testdir+"/member/snap", 0755); err != nil {
		t.Fatalf("Couldn't make snap dir (%v)", err)
	}

	rs := raft.NewMemoryStorage()
	tr, snapDoneC := rafthttp.NewSnapTransporter(testdir)
	s := &EtcdServer{
		cfg: &ServerConfig{
			DataDir: testdir,
		},
		r: raftNode{
			Node:        n,
			transport:   tr,
			storage:     mockstorage.NewStorageRecorder(testdir),
			raftStorage: rs,
		},
		store:    st,
		cluster:  cl,
		msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap),
	}

	be, tmpPath := backend.NewDefaultTmpBackend()
	defer func() {
		os.RemoveAll(tmpPath)
	}()
	s.kv = dstorage.New(be, &lease.FakeLessor{}, &s.consistIndex)
	s.be = be

	s.start()
	defer s.Stop()

	// submit applied entries and snap entries
	idx := uint64(0)
	outdated := 0
	accepted := 0
	for k := 1; k <= 101; k++ {
		idx++
		ch := s.w.Register(uint64(idx))
		req := &pb.Request{Method: "QGET", ID: uint64(idx)}
		ent := raftpb.Entry{Index: uint64(idx), Data: pbutil.MustMarshal(req)}
		ready := raft.Ready{Entries: []raftpb.Entry{ent}}
		n.readyc <- ready

		ready = raft.Ready{CommittedEntries: []raftpb.Entry{ent}}
		n.readyc <- ready

		// "idx" applied
		<-ch

		// one snapshot for every two messages
		if k%2 != 0 {
			continue
		}

		n.readyc <- raft.Ready{Messages: []raftpb.Message{{Type: raftpb.MsgSnap}}}
		// get the snapshot sent by the transport
		snapMsg := <-snapDoneC
		// If the snapshot trails applied records, recovery will panic
		// since there's no allocated snapshot at the place of the
		// snapshot record. This only happens when the applier and the
		// snapshot sender get out of sync.
		if snapMsg.Snapshot.Metadata.Index == idx {
			idx++
			snapMsg.Snapshot.Metadata.Index = idx
			ready = raft.Ready{Snapshot: snapMsg.Snapshot}
			n.readyc <- ready
			accepted++
		} else {
			outdated++
		}
		// don't wait for the snapshot to complete, move to next message
	}
	if accepted != 50 {
		t.Errorf("accepted=%v, want 50", accepted)
	}
	if outdated != 0 {
		t.Errorf("outdated=%v, want 0", outdated)
	}
}
func (s *EtcdServer) run() {
	snap, err := s.r.raftStorage.Snapshot()
	if err != nil {
		plog.Panicf("get snapshot from raft storage error: %v", err)
	}
	confState := snap.Metadata.ConfState
	snapi := snap.Metadata.Index
	appliedi := snapi

	s.r.start(s)
	defer func() {
		s.r.stop()
		close(s.done)
	}()

	var shouldstop bool
	for {
		select {
		case apply := <-s.r.apply():
			// apply snapshot
			if !raft.IsEmptySnap(apply.snapshot) {
				if apply.snapshot.Metadata.Index <= appliedi {
					plog.Panicf("snapshot index [%d] should > appliedi[%d] + 1",
						apply.snapshot.Metadata.Index, appliedi)
				}

				if s.cfg.V3demo {
					if err := s.kv.Close(); err != nil {
						plog.Panicf("close KV error: %v", err)
					}
					snapfn, err := s.r.raftStorage.snapStore.getSnapFilePath(apply.snapshot.Metadata.Index)
					if err != nil {
						plog.Panicf("get snapshot file path error: %v", err)
					}

					fn := path.Join(s.cfg.StorageDir(), databaseFilename)
					if err := os.Rename(snapfn, fn); err != nil {
						plog.Panicf("rename snapshot file error: %v", err)
					}

					s.kv = dstorage.New(fn, &s.consistIndex)
					if err := s.kv.Restore(); err != nil {
						plog.Panicf("restore KV error: %v", err)
					}
				}
				if err := s.store.Recovery(apply.snapshot.Data); err != nil {
					plog.Panicf("recovery store error: %v", err)
				}
				s.cluster.Recover()

				// recover raft transport
				s.r.transport.RemoveAllPeers()
				for _, m := range s.cluster.Members() {
					if m.ID == s.ID() {
						continue
					}
					s.r.transport.AddPeer(m.ID, m.PeerURLs)
				}

				appliedi = apply.snapshot.Metadata.Index
				snapi = appliedi
				confState = apply.snapshot.Metadata.ConfState
				plog.Infof("recovered from incoming snapshot at index %d", snapi)
			}

			// apply entries
			if len(apply.entries) != 0 {
				firsti := apply.entries[0].Index
				if firsti > appliedi+1 {
					plog.Panicf("first index of committed entry[%d] should <= appliedi[%d] + 1", firsti, appliedi)
				}
				var ents []raftpb.Entry
				if appliedi+1-firsti < uint64(len(apply.entries)) {
					ents = apply.entries[appliedi+1-firsti:]
				}
				if appliedi, shouldstop = s.apply(ents, &confState); shouldstop {
					go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
				}
			}

			// Wait for the raft routine to finish the disk writes before triggering a
			// snapshot. Otherwise the applied index might be greater than the last index
			// in raft storage, since the raft routine might be slower than the apply routine.
			apply.done <- struct{}{}

			// trigger snapshot
			if appliedi-snapi > s.snapCount {
				plog.Infof("start to snapshot (applied: %d, lastsnap: %d)", appliedi, snapi)
				s.snapshot(appliedi, confState)
				snapi = appliedi
			}
		case <-s.r.raftStorage.reqsnap():
			s.r.raftStorage.raftsnap() <- s.createRaftSnapshot(appliedi, confState)
			plog.Infof("requested snapshot created at %d", snapi)
		case err := <-s.errorc:
			plog.Errorf("%s", err)
			plog.Infof("the data-dir used by this member must be removed.")
			return
		case <-s.stop:
			return
		}
	}
}
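// The entry-slicing arithmetic in run() drops entries that were already applied:
// only the suffix starting at appliedi+1 is handed to s.apply. The helper below is
// a small self-contained illustration of that arithmetic; its name is made up for
// this sketch and it is not part of the server. Callers are expected to guarantee
// firsti <= appliedi+1, mirroring the panic check above.
func unappliedSuffix(appliedi uint64, entries []raftpb.Entry) []raftpb.Entry {
	if len(entries) == 0 {
		return nil
	}
	firsti := entries[0].Index
	// e.g. appliedi=5 and entries covering indexes 4..7: offset=2, so only the
	// entries at indexes 6 and 7 are returned.
	if offset := appliedi + 1 - firsti; offset < uint64(len(entries)) {
		return entries[offset:]
	}
	// Everything in this batch was already applied.
	return nil
}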
// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
	st := store.New(StoreClusterPrefix, StoreKeysPrefix)
	var w *wal.WAL
	var n raft.Node
	var s *raft.MemoryStorage
	var id types.ID
	var cl *cluster

	if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil {
		return nil, fmt.Errorf("cannot access data directory: %v", terr)
	}

	// Run the migrations.
	dataVer, err := version.DetectDataDir(cfg.DataDir)
	if err != nil {
		return nil, err
	}
	if err := upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil {
		return nil, err
	}

	haveWAL := wal.Exist(cfg.WALDir())
	ss := snap.New(cfg.SnapDir())
	var remotes []*Member

	switch {
	case !haveWAL && !cfg.NewCluster:
		if err := cfg.VerifyJoinExisting(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		existingCluster, err := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), cfg.Transport)
		if err != nil {
			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", err)
		}
		if err := ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
		}
		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, cfg.Transport) {
			return nil, fmt.Errorf("incompatible with current running cluster")
		}

		remotes = existingCluster.Members()
		cl.SetID(existingCluster.id)
		cl.SetStore(st)
		cfg.Print()
		id, n, s, w = startNode(cfg, cl, nil)
	case !haveWAL && cfg.NewCluster:
		if err := cfg.VerifyBootstrap(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		m := cl.MemberByName(cfg.Name)
		if isMemberBootstrapped(cl, cfg.Name, cfg.Transport) {
			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
		}
		if cfg.ShouldDiscover() {
			str, err := discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
			if err != nil {
				return nil, &DiscoveryError{Op: "join", Err: err}
			}
			urlsmap, err := types.NewURLsMap(str)
			if err != nil {
				return nil, err
			}
			if checkDuplicateURL(urlsmap) {
				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
			}
			if cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil {
				return nil, err
			}
		}
		cl.SetStore(st)
		cfg.PrintWithInitial()
		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
	case haveWAL:
		if err := fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
			return nil, fmt.Errorf("cannot write to member directory: %v", err)
		}
		if err := fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
		}

		if cfg.ShouldDiscover() {
			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
		}

		snapshot, err := ss.Load()
		if err != nil && err != snap.ErrNoSnapshot {
			return nil, err
		}
		if snapshot != nil {
			if err := st.Recovery(snapshot.Data); err != nil {
				plog.Panicf("recovered store from snapshot error: %v", err)
			}
			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
		}
		cfg.Print()
		if snapshot != nil {
			plog.Infof("loaded cluster information from store: %s", cl)
		}
		if !cfg.ForceNewCluster {
			id, cl, n, s, w = restartNode(cfg, snapshot)
		} else {
			id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot)
		}
		cl.SetStore(st)
		cl.Recover()
	default:
		return nil, fmt.Errorf("unsupported bootstrap config")
	}

	if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil {
		return nil, fmt.Errorf("cannot access member directory: %v", terr)
	}

	sstats := &stats.ServerStats{
		Name: cfg.Name,
		ID:   id.String(),
	}
	sstats.Initialize()
	lstats := stats.NewLeaderStats(id.String())

	srv := &EtcdServer{
		cfg:       cfg,
		snapCount: cfg.SnapCount,
		errorc:    make(chan error, 1),
		store:     st,
		r: raftNode{
			Node:        n,
			ticker:      time.Tick(time.Duration(cfg.TickMs) * time.Millisecond),
			raftStorage: s,
			storage:     NewStorage(w, ss),
		},
		id:            id,
		attributes:    Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
		cluster:       cl,
		stats:         sstats,
		lstats:        lstats,
		SyncTicker:    time.Tick(500 * time.Millisecond),
		reqIDGen:      idutil.NewGenerator(uint8(id), time.Now()),
		forceVersionC: make(chan struct{}),
	}

	if cfg.V3demo {
		srv.kv = dstorage.New(path.Join(cfg.DataDir, "member", "v3demo"))
	} else {
		// we do not care about the error of the removal
		os.RemoveAll(path.Join(cfg.DataDir, "member", "v3demo"))
	}

	// TODO: move transport initialization near the definition of remote
	tr := rafthttp.NewTransporter(cfg.Transport, id, cl.ID(), srv, srv.errorc, sstats, lstats)
	// add all remotes into transport
	for _, m := range remotes {
		if m.ID != id {
			tr.AddRemote(m.ID, m.PeerURLs)
		}
	}
	for _, m := range cl.Members() {
		if m.ID != id {
			tr.AddPeer(m.ID, m.PeerURLs)
		}
	}
	srv.r.transport = tr
	return srv, nil
}