func (n *Node) loadAndStart(ctx context.Context, forceNewCluster bool) error {
	walDir := n.walDir()
	snapDir := n.snapDir()

	if !fileutil.Exist(snapDir) {
		// If snapshots created by the etcd-v2 code exist, hard link
		// them at the new path. This prevents etcd-v2 creating
		// snapshots that are visible to us, but out of sync with our
		// WALs, after a downgrade.
		legacySnapDir := n.legacySnapDir()
		if fileutil.Exist(legacySnapDir) {
			if err := migrateSnapshots(legacySnapDir, snapDir); err != nil {
				return err
			}
		} else if err := os.MkdirAll(snapDir, 0700); err != nil {
			return errors.Wrap(err, "failed to create snapshot directory")
		}
	}

	// Create a snapshotter
	n.snapshotter = snap.New(snapDir)

	if !wal.Exist(walDir) {
		// If WALs created by the etcd-v2 wal code exist, copy them to
		// the new path to avoid adding backwards-incompatible entries
		// to those files.
		legacyWALDir := n.legacyWALDir()
		if !wal.Exist(legacyWALDir) {
			return errNoWAL
		}

		if err := migrateWALs(legacyWALDir, walDir); err != nil {
			return err
		}
	}

	// Load snapshot data
	snapshot, err := n.snapshotter.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		return err
	}

	if snapshot != nil {
		// Load the snapshot data into the store
		if err := n.restoreFromSnapshot(snapshot.Data, forceNewCluster); err != nil {
			return err
		}
	}

	// Read logs to fully catch up store
	if err := n.readWAL(ctx, snapshot, forceNewCluster); err != nil {
		return err
	}

	return nil
}
func TestDecrypt(t *testing.T) {
	tempdir, err := ioutil.TempDir("", "rafttool")
	require.NoError(t, err)
	defer os.RemoveAll(tempdir)

	kek := []byte("kek")
	dek := []byte("dek")
	unlockKey := encryption.HumanReadableKey(kek)

	// write a key to disk, else we won't be able to decrypt anything
	paths := certPaths(tempdir)
	krw := ca.NewKeyReadWriter(paths.Node, kek,
		manager.RaftDEKData{EncryptionKeys: raft.EncryptionKeys{CurrentDEK: dek}})
	cert, key, err := testutils.CreateRootCertAndKey("not really a root, just need cert and key")
	require.NoError(t, err)
	require.NoError(t, krw.Write(cert, key, nil))

	// create the encrypted v3 directory
	origSnapshot := raftpb.Snapshot{
		Data: []byte("snapshot"),
		Metadata: raftpb.SnapshotMetadata{
			Index: 1,
			Term:  1,
		},
	}
	e, d := encryption.Defaults(dek)
	writeFakeRaftData(t, tempdir, &origSnapshot, storage.NewWALFactory(e, d), storage.NewSnapFactory(e, d))

	outdir := filepath.Join(tempdir, "outdir")

	// if we use the wrong unlock key, we can't actually decrypt anything.
	// The output directory won't get created.
	err = decryptRaftData(tempdir, outdir, "")
	require.IsType(t, ca.ErrInvalidKEK{}, err)
	require.False(t, fileutil.Exist(outdir))

	// Using the right unlock key, we produce data that is unencrypted
	require.NoError(t, decryptRaftData(tempdir, outdir, unlockKey))
	require.True(t, fileutil.Exist(outdir))

	// The snapshot directory is readable by the regular snapshotter
	snapshot, err := storage.OriginalSnap.New(filepath.Join(outdir, "snap-decrypted")).Load()
	require.NoError(t, err)
	require.NotNil(t, snapshot)
	require.Equal(t, origSnapshot, *snapshot)

	// The wals are readable by the regular wal
	walreader, err := storage.OriginalWAL.Open(filepath.Join(outdir, "wal-decrypted"), walpb.Snapshot{Index: 1, Term: 1})
	require.NoError(t, err)
	metadata, _, entries, err := walreader.ReadAll()
	require.NoError(t, err)
	require.Equal(t, []byte("v3metadata"), metadata)
	require.Len(t, entries, 5)
}
func migrateSnapshots(legacySnapDir, snapDir string) error {
	// use a temporary snapshot directory so initialization appears atomic
	tmpdirpath := filepath.Clean(snapDir) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return errors.Wrap(err, "could not remove temporary snapshot directory")
		}
	}
	if err := fileutil.CreateDirAll(tmpdirpath); err != nil {
		return errors.Wrap(err, "could not create temporary snapshot directory")
	}

	snapshotNames, err := fileutil.ReadDir(legacySnapDir)
	if err != nil {
		return errors.Wrapf(err, "could not list snapshot directory %s", legacySnapDir)
	}

	for _, fname := range snapshotNames {
		err := os.Link(filepath.Join(legacySnapDir, fname), filepath.Join(tmpdirpath, fname))
		if err != nil {
			return errors.Wrap(err, "error linking snapshot file")
		}
	}

	if err := os.Rename(tmpdirpath, snapDir); err != nil {
		return err
	}

	return nil
}
// MigrateSnapshot reads the latest existing snapshot from one directory, encoded one way, and writes
// it to a new directory, encoded a different way.
func MigrateSnapshot(oldDir, newDir string, oldFactory, newFactory SnapFactory) error {
	// use temporary snapshot directory so initialization appears atomic
	oldSnapshotter := oldFactory.New(oldDir)
	snapshot, err := oldSnapshotter.Load()
	switch err {
	case snap.ErrNoSnapshot: // if there's no snapshot, the migration succeeded
		return nil
	case nil:
		break
	default:
		return err
	}

	tmpdirpath := filepath.Clean(newDir) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return errors.Wrap(err, "could not remove temporary snapshot directory")
		}
	}
	if err := fileutil.CreateDirAll(tmpdirpath); err != nil {
		return errors.Wrap(err, "could not create temporary snapshot directory")
	}
	tmpSnapshotter := newFactory.New(tmpdirpath)

	// write the new snapshot to the temporary location
	if err = tmpSnapshotter.SaveSnap(*snapshot); err != nil {
		return err
	}

	return os.Rename(tmpdirpath, newDir)
}
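// A minimal usage sketch of MigrateSnapshot, assuming the factories shown elsewhere in
// this collection (OriginalSnap, NewSnapFactory, encryption.Defaults); the wrapper name
// and directory arguments are hypothetical, not taken from the source.
func migrateToEncryptedSnapDir(legacyDir, encryptedDir string, dek []byte) error {
	// build an encrypted snapshot factory from the data encryption key
	e, d := encryption.Defaults(dek)
	// re-encode the latest plaintext snapshot (if any) into the new directory
	return MigrateSnapshot(legacyDir, encryptedDir, OriginalSnap, NewSnapFactory(e, d))
}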
// SaveDBFrom saves snapshot of the database from the given reader. It
// guarantees the save operation is atomic.
func (s *Snapshotter) SaveDBFrom(r io.Reader, id uint64) error {
	f, err := ioutil.TempFile(s.dir, "tmp")
	if err != nil {
		return err
	}
	var n int64
	n, err = io.Copy(f, r)
	if err == nil {
		err = fileutil.Fsync(f)
	}
	f.Close()
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	fn := path.Join(s.dir, fmt.Sprintf("%016x.snap.db", id))
	if fileutil.Exist(fn) {
		os.Remove(f.Name())
		return nil
	}
	err = os.Rename(f.Name(), fn)
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	plog.Infof("saved database snapshot to disk [total bytes: %d]", n)
	return nil
}
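// A rough usage sketch of SaveDBFrom (the helper name and arguments are assumptions,
// not from the source): persist the database payload that accompanies an incoming raft
// snapshot, keyed by the snapshot index so it can be located later.
func saveIncomingSnapshotDB(snapshotter *Snapshotter, body io.Reader, snapIndex uint64) error {
	// SaveDBFrom writes to a temp file, fsyncs, then renames, so a crash
	// mid-save never leaves a partially written .snap.db in place.
	return snapshotter.SaveDBFrom(body, snapIndex)
}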
func testCtlV2Rm(t *testing.T, cfg *etcdProcessClusterConfig, noSync bool) {
	defer testutil.AfterTest(t)

	if !fileutil.Exist("../bin/etcdctl") {
		t.Fatalf("could not find etcdctl binary")
	}

	epc, errC := newEtcdProcessCluster(cfg)
	if errC != nil {
		t.Fatalf("could not start etcd process cluster (%v)", errC)
	}
	defer func() {
		if errC := epc.Close(); errC != nil {
			t.Fatalf("error closing etcd processes (%v)", errC)
		}
	}()

	key, value := "foo", "bar"

	if err := etcdctlSet(epc, key, value, noSync); err != nil {
		t.Fatalf("failed set (%v)", err)
	}
	if err := etcdctlRm(epc, key, value, true, noSync); err != nil {
		t.Fatalf("failed rm (%v)", err)
	}
	if err := etcdctlRm(epc, key, value, false, noSync); err != nil {
		t.Fatalf("failed rm (%v)", err)
	}
}
func testCtlV2Watch(t *testing.T, cfg *etcdProcessClusterConfig, noSync bool) {
	defer testutil.AfterTest(t)

	if !fileutil.Exist("../bin/etcdctl") {
		t.Fatalf("could not find etcdctl binary")
	}

	epc, errC := newEtcdProcessCluster(cfg)
	if errC != nil {
		t.Fatalf("could not start etcd process cluster (%v)", errC)
	}
	defer func() {
		if errC := epc.Close(); errC != nil {
			t.Fatalf("error closing etcd processes (%v)", errC)
		}
	}()

	key, value := "foo", "bar"
	errc := etcdctlWatch(epc, key, value, noSync)
	if err := etcdctlSet(epc, key, value, noSync); err != nil {
		t.Fatalf("failed set (%v)", err)
	}

	select {
	case err := <-errc:
		if err != nil {
			t.Fatalf("failed watch (%v)", err)
		}
	case <-time.After(5 * time.Second):
		t.Fatalf("watch timed out")
	}
}
// SaveDBFrom saves snapshot of the database from the given reader. It
// guarantees the save operation is atomic.
func (s *Snapshotter) SaveDBFrom(r io.Reader, id uint64) error {
	f, err := ioutil.TempFile(s.dir, "tmp")
	if err != nil {
		return err
	}
	_, err = io.Copy(f, r)
	if err == nil {
		err = f.Sync()
	}
	f.Close()
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	fn := path.Join(s.dir, fmt.Sprintf("%016x.snap.db", id))
	if fileutil.Exist(fn) {
		os.Remove(f.Name())
		return nil
	}
	err = os.Rename(f.Name(), fn)
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	return nil
}
// TestReleaseUpgrade ensures that changes to the master branch do not affect
// upgrades from the latest etcd releases.
func TestReleaseUpgrade(t *testing.T) {
	lastReleaseBinary := binDir + "/etcd-last-release"
	if !fileutil.Exist(lastReleaseBinary) {
		t.Skipf("%q does not exist", lastReleaseBinary)
	}

	defer testutil.AfterTest(t)

	copiedCfg := configNoTLS
	copiedCfg.execPath = lastReleaseBinary
	copiedCfg.snapCount = 3
	copiedCfg.baseScheme = "unix" // to avoid port conflict

	epc, err := newEtcdProcessCluster(&copiedCfg)
	if err != nil {
		t.Fatalf("could not start etcd process cluster (%v)", err)
	}
	defer func() {
		if errC := epc.Close(); errC != nil {
			t.Fatalf("error closing etcd processes (%v)", errC)
		}
	}()

	os.Setenv("ETCDCTL_API", "3")
	defer os.Unsetenv("ETCDCTL_API")
	cx := ctlCtx{
		t:           t,
		cfg:         configNoTLS,
		dialTimeout: 7 * time.Second,
		quorum:      true,
		epc:         epc,
	}
	var kvs []kv
	for i := 0; i < 5; i++ {
		kvs = append(kvs, kv{key: fmt.Sprintf("foo%d", i), val: "bar"})
	}
	for i := range kvs {
		if err := ctlV3Put(cx, kvs[i].key, kvs[i].val, ""); err != nil {
			cx.t.Fatalf("#%d: ctlV3Put error (%v)", i, err)
		}
	}

	for i := range epc.procs {
		if err := epc.procs[i].Stop(); err != nil {
			t.Fatalf("#%d: error closing etcd process (%v)", i, err)
		}
		epc.procs[i].cfg.execPath = binDir + "/etcd"
		epc.procs[i].cfg.keepDataDir = true

		if err := epc.procs[i].Restart(); err != nil {
			t.Fatalf("error restarting etcd process (%v)", err)
		}

		for j := range kvs {
			if err := ctlV3Get(cx, []string{kvs[j].key}, []kv{kvs[j]}...); err != nil {
				cx.t.Fatalf("#%d-%d: ctlV3Get error (%v)", i, j, err)
			}
		}
	}
}
func testProcessClusterV2CtlWatch(t *testing.T, cfg *etcdProcessClusterConfig, noSync bool) {
	if !fileutil.Exist("../bin/etcdctl") {
		t.Fatalf("could not find etcdctl binary")
	}

	epc, errC := newEtcdProcessCluster(cfg)
	if errC != nil {
		t.Fatalf("could not start etcd process cluster (%v)", errC)
	}
	defer func() {
		if errC := epc.Close(); errC != nil {
			t.Fatalf("error closing etcd processes (%v)", errC)
		}
	}()

	key, value := "foo", "bar"
	done, errChan := make(chan struct{}), make(chan error)

	go etcdctlWatch(epc, key, value, noSync, done, errChan)

	if err := etcdctlSet(epc, key, value, noSync); err != nil {
		t.Fatalf("failed set (%v)", err)
	}

	select {
	case <-done:
		return
	case err := <-errChan:
		t.Fatalf("failed watch (%v)", err)
	case <-time.After(5 * time.Second):
		t.Fatalf("watch timed out!")
	}
}
func migrateWALs(legacyWALDir, walDir string) error {
	// use a temporary wal directory so WAL initialization appears atomic
	tmpdirpath := filepath.Clean(walDir) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return errors.Wrap(err, "could not remove temporary WAL directory")
		}
	}
	if err := fileutil.CreateDirAll(tmpdirpath); err != nil {
		return errors.Wrap(err, "could not create temporary WAL directory")
	}

	walNames, err := fileutil.ReadDir(legacyWALDir)
	if err != nil {
		return errors.Wrapf(err, "could not list WAL directory %s", legacyWALDir)
	}

	for _, fname := range walNames {
		_, err := copyFile(filepath.Join(legacyWALDir, fname), filepath.Join(tmpdirpath, fname), 0600)
		if err != nil {
			return errors.Wrap(err, "error copying WAL file")
		}
	}

	if err := os.Rename(tmpdirpath, walDir); err != nil {
		return err
	}

	return nil
}
func (rc *raftNode) startRaft() {
	if !fileutil.Exist(rc.snapdir) {
		if err := os.Mkdir(rc.snapdir, 0750); err != nil {
			log.Fatalf("raftexample: cannot create dir for snapshot (%v)", err)
		}
	}
	rc.snapshotter = snap.New(rc.snapdir)
	rc.snapshotterReady <- rc.snapshotter

	oldwal := wal.Exist(rc.waldir)
	rc.wal = rc.replayWAL()

	rpeers := make([]raft.Peer, len(rc.peers))
	for i := range rpeers {
		rpeers[i] = raft.Peer{ID: uint64(i + 1)}
	}
	c := &raft.Config{
		ID:              uint64(rc.id),
		ElectionTick:    10,
		HeartbeatTick:   1,
		Storage:         rc.raftStorage,
		MaxSizePerMsg:   1024 * 1024,
		MaxInflightMsgs: 256,
	}

	if oldwal {
		rc.node = raft.RestartNode(c)
	} else {
		startPeers := rpeers
		if rc.join {
			startPeers = nil
		}
		rc.node = raft.StartNode(c, startPeers)
	}

	ss := &stats.ServerStats{}
	ss.Initialize()

	rc.transport = &rafthttp.Transport{
		ID:          types.ID(rc.id),
		ClusterID:   0x1000,
		Raft:        rc,
		ServerStats: ss,
		LeaderStats: stats.NewLeaderStats(strconv.Itoa(rc.id)),
		ErrorC:      make(chan error),
	}

	rc.transport.Start()
	for i := range rc.peers {
		if i+1 != rc.id {
			rc.transport.AddPeer(types.ID(i+1), []string{rc.peers[i]})
		}
	}

	go rc.serveRaft()
	go rc.serveChannels()
}
func moveDirAside(dirname string) error {
	if fileutil.Exist(dirname) {
		tempdir, err := ioutil.TempDir(filepath.Dir(dirname), filepath.Base(dirname))
		if err != nil {
			return err
		}
		return os.Rename(dirname, tempdir)
	}
	return nil
}
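// A small usage sketch of moveDirAside (the wrapper name and path are hypothetical,
// not from the source): move an existing raft state directory out of the way before
// writing a freshly restored one, so the old data is preserved rather than deleted.
func replaceStateDir(stateDir string) error {
	if err := moveDirAside(stateDir); err != nil {
		return err
	}
	// recreate an empty directory for the new state
	return os.MkdirAll(stateDir, 0700)
}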
// Create creates a WAL ready for appending records. The given metadata is
// recorded at the head of each WAL file, and can be retrieved with ReadAll.
func Create(dirpath string, metadata []byte) (*WAL, error) {
	if Exist(dirpath) {
		return nil, os.ErrExist
	}

	// use a temporary wal directory so WAL initialization appears atomic
	tmpdirpath := path.Clean(dirpath) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return nil, err
		}
	}
	if err := os.MkdirAll(tmpdirpath, fileutil.PrivateDirMode); err != nil {
		return nil, err
	}

	p := path.Join(tmpdirpath, walName(0, 0))
	f, err := fileutil.LockFile(p, os.O_WRONLY|os.O_CREATE, fileutil.PrivateFileMode)
	if err != nil {
		return nil, err
	}
	if _, err := f.Seek(0, os.SEEK_END); err != nil {
		return nil, err
	}
	if err := fileutil.Preallocate(f.File, segmentSizeBytes, true); err != nil {
		return nil, err
	}

	w := &WAL{
		dir:      dirpath,
		metadata: metadata,
		encoder:  newEncoder(f, 0),
	}
	w.locks = append(w.locks, f)
	if err := w.saveCrc(0); err != nil {
		return nil, err
	}
	if err := w.encoder.encode(&walpb.Record{Type: metadataType, Data: metadata}); err != nil {
		return nil, err
	}
	if err := w.SaveSnapshot(walpb.Snapshot{}); err != nil {
		return nil, err
	}

	if err := os.RemoveAll(dirpath); err != nil {
		return nil, err
	}
	if err := os.Rename(tmpdirpath, dirpath); err != nil {
		return nil, err
	}

	w.fp = newFilePipeline(w.dir, segmentSizeBytes)
	return w, nil
}
func newEtcdProcess(cfg *etcdProcessConfig) (*etcdProcess, error) {
	if !fileutil.Exist("../bin/etcd") {
		return nil, fmt.Errorf("could not find etcd binary")
	}
	if err := os.RemoveAll(cfg.dataDirPath); err != nil {
		return nil, err
	}
	child, err := spawnCmd(append([]string{"../bin/etcd"}, cfg.args...))
	if err != nil {
		return nil, err
	}
	return &etcdProcess{cfg: cfg, proc: child, donec: make(chan struct{})}, nil
}
// saveSnap saves a snapshot to disk.
//
// If a snapshot already exists on disk, it keeps the original snapshot and returns an error.
// The function guarantees that it always saves either a complete snapshot or no snapshot,
// even if the call is aborted because the program is hard-killed.
func (ss *snapshotStore) saveSnap(s *snapshot) error {
	f, err := ioutil.TempFile(ss.dir, "tmp")
	if err != nil {
		return err
	}
	_, err = s.writeTo(f)
	f.Close()
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	fn := path.Join(ss.dir, fmt.Sprintf("%016x.db", s.raft().Metadata.Index))
	if fileutil.Exist(fn) {
		os.Remove(f.Name())
		return fmt.Errorf("snapshot to save has existed")
	}
	err = os.Rename(f.Name(), fn)
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	return nil
}
// SaveFrom saves a snapshot at the given index from the given reader.
// If a snapshot with the given index has already been saved successfully, it keeps
// the original saved snapshot and returns an error.
// The function guarantees that SaveFrom always saves either a complete
// snapshot or no snapshot, even if the call is aborted because the program
// is hard-killed.
func (ss *snapshotStore) SaveFrom(r io.Reader, index uint64) error {
	f, err := ioutil.TempFile(ss.dir, "tmp")
	if err != nil {
		return err
	}
	_, err = io.Copy(f, r)
	f.Close()
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	fn := path.Join(ss.dir, fmt.Sprintf("%016x.db", index))
	if fileutil.Exist(fn) {
		os.Remove(f.Name())
		return fmt.Errorf("snapshot to save has existed")
	}
	err = os.Rename(f.Name(), fn)
	if err != nil {
		os.Remove(f.Name())
		return err
	}
	return nil
}
func testCtlV2Watch(t *testing.T, cfg *etcdProcessClusterConfig, noSync bool) {
	defer testutil.AfterTest(t)

	if !fileutil.Exist("../bin/etcdctl") {
		t.Fatalf("could not find etcdctl binary")
	}

	epc, errC := newEtcdProcessCluster(cfg)
	if errC != nil {
		t.Fatalf("could not start etcd process cluster (%v)", errC)
	}
	defer func() {
		if errC := epc.Close(); errC != nil {
			t.Fatalf("error closing etcd processes (%v)", errC)
		}
	}()

	key, value := "foo", "bar"
	done, errChan := make(chan struct{}, 1), make(chan error, 1)

	go etcdctlWatch(epc, key, value, noSync, done, errChan)

	if err := etcdctlSet(epc, key, value, noSync); err != nil {
		t.Fatalf("failed set (%v)", err)
	}

	select {
	case <-done:
		return
	case err := <-errChan:
		t.Fatalf("failed watch (%v)", err)
	case <-time.After(5 * time.Second):
		// TODO: 'watch' sometimes times out in the Semaphore CI environment
		// but works fine in every other environment
		t.Logf("[WARNING] watch timed out!")
	}
}
// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
	st := store.New(StoreClusterPrefix, StoreKeysPrefix)

	var (
		w  *wal.WAL
		n  raft.Node
		s  *raft.MemoryStorage
		id types.ID
		cl *cluster
	)

	if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil {
		return nil, fmt.Errorf("cannot access data directory: %v", terr)
	}

	if !cfg.V3demo && fileutil.Exist(path.Join(cfg.SnapDir(), databaseFilename)) {
		return nil, errors.New("experimental-v3demo cannot be disabled once it is enabled")
	}

	// Run the migrations.
	dataVer, err := version.DetectDataDir(cfg.DataDir)
	if err != nil {
		return nil, err
	}
	if err := upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil {
		return nil, err
	}

	haveWAL := wal.Exist(cfg.WALDir())
	ss := snap.New(cfg.SnapDir())

	prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
	if err != nil {
		return nil, err
	}
	var remotes []*Member

	switch {
	case !haveWAL && !cfg.NewCluster:
		if err := cfg.VerifyJoinExisting(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		existingCluster, err := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
		if err != nil {
			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", err)
		}
		if err := ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
		}
		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) {
			return nil, fmt.Errorf("incompatible with current running cluster")
		}
		remotes = existingCluster.Members()
		cl.SetID(existingCluster.id)
		cl.SetStore(st)
		cfg.Print()
		id, n, s, w = startNode(cfg, cl, nil)
	case !haveWAL && cfg.NewCluster:
		if err := cfg.VerifyBootstrap(); err != nil {
			return nil, err
		}
		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		m := cl.MemberByName(cfg.Name)
		if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) {
			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
		}
		if cfg.ShouldDiscover() {
			var str string
			var err error
			str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
			if err != nil {
				return nil, &DiscoveryError{Op: "join", Err: err}
			}
			urlsmap, err := types.NewURLsMap(str)
			if err != nil {
				return nil, err
			}
			if checkDuplicateURL(urlsmap) {
				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
			}
			if cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil {
				return nil, err
			}
		}
		cl.SetStore(st)
		cfg.PrintWithInitial()
		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
	case haveWAL:
		if err := fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
			return nil, fmt.Errorf("cannot write to member directory: %v", err)
		}
		if err := fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
		}
		if cfg.ShouldDiscover() {
			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
		}
		var snapshot *raftpb.Snapshot
		var err error
		snapshot, err = ss.Load()
		if err != nil && err != snap.ErrNoSnapshot {
			return nil, err
		}
		if snapshot != nil {
			if err := st.Recovery(snapshot.Data); err != nil {
				plog.Panicf("recovered store from snapshot error: %v", err)
			}
			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
		}
		cfg.Print()
		if !cfg.ForceNewCluster {
			id, cl, n, s, w = restartNode(cfg, snapshot)
		} else {
			id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot)
		}
		cl.SetStore(st)
		cl.Recover()
	default:
		return nil, fmt.Errorf("unsupported bootstrap config")
	}

	if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil {
		return nil, fmt.Errorf("cannot access member directory: %v", terr)
	}

	sstats := &stats.ServerStats{
		Name: cfg.Name,
		ID:   id.String(),
	}
	sstats.Initialize()
	lstats := stats.NewLeaderStats(id.String())

	srv := &EtcdServer{
		cfg:       cfg,
		snapCount: cfg.SnapCount,
		errorc:    make(chan error, 1),
		store:     st,
		r: raftNode{
			Node:        n,
			ticker:      time.Tick(time.Duration(cfg.TickMs) * time.Millisecond),
			raftStorage: s,
			storage:     NewStorage(w, ss),
		},
		id:            id,
		attributes:    Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
		cluster:       cl,
		stats:         sstats,
		lstats:        lstats,
		SyncTicker:    time.Tick(500 * time.Millisecond),
		peerRt:        prt,
		reqIDGen:      idutil.NewGenerator(uint16(id), time.Now()),
		forceVersionC: make(chan struct{}),
		msgSnapC:      make(chan raftpb.Message, maxInFlightMsgSnap),
	}

	if cfg.V3demo {
		srv.be = backend.NewDefaultBackend(path.Join(cfg.SnapDir(), databaseFilename))
		srv.lessor = lease.NewLessor(srv.be)
		srv.kv = dstorage.New(srv.be, srv.lessor, &srv.consistIndex)
		srv.authStore = auth.NewAuthStore(srv)
		if h := cfg.AutoCompactionRetention; h != 0 {
			srv.compactor = compactor.NewPeriodic(h, srv.kv, srv)
			srv.compactor.Run()
		}
	}

	// TODO: move transport initialization near the definition of remote
	tr := &rafthttp.Transport{
		TLSInfo:     cfg.PeerTLSInfo,
		DialTimeout: cfg.peerDialTimeout(),
		ID:          id,
		URLs:        cfg.PeerURLs,
		ClusterID:   cl.ID(),
		Raft:        srv,
		Snapshotter: ss,
		ServerStats: sstats,
		LeaderStats: lstats,
		ErrorC:      srv.errorc,
		V3demo:      cfg.V3demo,
	}
	if err := tr.Start(); err != nil {
		return nil, err
	}
	// add all remotes into transport
	for _, m := range remotes {
		if m.ID != id {
			tr.AddRemote(m.ID, m.PeerURLs)
		}
	}
	for _, m := range cl.Members() {
		if m.ID != id {
			tr.AddPeer(m.ID, m.PeerURLs)
		}
	}
	srv.r.transport = tr
	return srv, nil
}
// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg *ServerConfig) (srv *EtcdServer, err error) {
	st := store.New(StoreClusterPrefix, StoreKeysPrefix)

	var (
		w  *wal.WAL
		n  raft.Node
		s  *raft.MemoryStorage
		id types.ID
		cl *membership.RaftCluster
	)

	if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil {
		return nil, fmt.Errorf("cannot access data directory: %v", terr)
	}

	haveWAL := wal.Exist(cfg.WALDir())

	if err = fileutil.TouchDirAll(cfg.SnapDir()); err != nil {
		plog.Fatalf("create snapshot directory error: %v", err)
	}
	ss := snap.New(cfg.SnapDir())

	bepath := path.Join(cfg.SnapDir(), databaseFilename)
	beExist := fileutil.Exist(bepath)

	var be backend.Backend
	beOpened := make(chan struct{})
	go func() {
		be = backend.NewDefaultBackend(bepath)
		beOpened <- struct{}{}
	}()

	select {
	case <-beOpened:
	case <-time.After(time.Second):
		plog.Warningf("another etcd process is running with the same data dir and holding the file lock.")
		plog.Warningf("waiting for it to exit before starting...")
		<-beOpened
	}

	defer func() {
		if err != nil {
			be.Close()
		}
	}()

	prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
	if err != nil {
		return nil, err
	}

	var (
		remotes  []*membership.Member
		snapshot *raftpb.Snapshot
	)

	switch {
	case !haveWAL && !cfg.NewCluster:
		if err = cfg.VerifyJoinExisting(); err != nil {
			return nil, err
		}
		cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		existingCluster, gerr := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
		if gerr != nil {
			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", gerr)
		}
		if err = membership.ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
		}
		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) {
			return nil, fmt.Errorf("incompatible with current running cluster")
		}
		remotes = existingCluster.Members()
		cl.SetID(existingCluster.ID())
		cl.SetStore(st)
		cl.SetBackend(be)
		cfg.Print()
		id, n, s, w = startNode(cfg, cl, nil)
	case !haveWAL && cfg.NewCluster:
		if err = cfg.VerifyBootstrap(); err != nil {
			return nil, err
		}
		cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
		if err != nil {
			return nil, err
		}
		m := cl.MemberByName(cfg.Name)
		if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) {
			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
		}
		if cfg.ShouldDiscover() {
			var str string
			str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
			if err != nil {
				return nil, &DiscoveryError{Op: "join", Err: err}
			}
			var urlsmap types.URLsMap
			urlsmap, err = types.NewURLsMap(str)
			if err != nil {
				return nil, err
			}
			if checkDuplicateURL(urlsmap) {
				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
			}
			if cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil {
				return nil, err
			}
		}
		cl.SetStore(st)
		cl.SetBackend(be)
		cfg.PrintWithInitial()
		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
	case haveWAL:
		if err = fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
			return nil, fmt.Errorf("cannot write to member directory: %v", err)
		}
		if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
		}
		if cfg.ShouldDiscover() {
			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
		}
		snapshot, err = ss.Load()
		if err != nil && err != snap.ErrNoSnapshot {
			return nil, err
		}
		if snapshot != nil {
			if err = st.Recovery(snapshot.Data); err != nil {
				plog.Panicf("recovered store from snapshot error: %v", err)
			}
			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
		}
		cfg.Print()
		if !cfg.ForceNewCluster {
			id, cl, n, s, w = restartNode(cfg, snapshot)
		} else {
			id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot)
		}
		cl.SetStore(st)
		cl.SetBackend(be)
		cl.Recover(api.UpdateCapability)
		if cl.Version() != nil && !cl.Version().LessThan(semver.Version{Major: 3}) && !beExist {
			os.RemoveAll(bepath)
			return nil, fmt.Errorf("database file (%v) of the backend is missing", bepath)
		}
	default:
		return nil, fmt.Errorf("unsupported bootstrap config")
	}

	if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil {
		return nil, fmt.Errorf("cannot access member directory: %v", terr)
	}

	sstats := &stats.ServerStats{
		Name: cfg.Name,
		ID:   id.String(),
	}
	sstats.Initialize()
	lstats := stats.NewLeaderStats(id.String())

	heartbeat := time.Duration(cfg.TickMs) * time.Millisecond
	srv = &EtcdServer{
		readych:   make(chan struct{}),
		Cfg:       cfg,
		snapCount: cfg.SnapCount,
		errorc:    make(chan error, 1),
		store:     st,
		r: raftNode{
			isIDRemoved: func(id uint64) bool { return cl.IsIDRemoved(types.ID(id)) },
			Node:        n,
			ticker:      time.Tick(heartbeat),
			// set up contention detectors for raft heartbeat message.
			// expect to send a heartbeat within 2 heartbeat intervals.
			td:          contention.NewTimeoutDetector(2 * heartbeat),
			heartbeat:   heartbeat,
			raftStorage: s,
			storage:     NewStorage(w, ss),
			msgSnapC:    make(chan raftpb.Message, maxInFlightMsgSnap),
			readStateC:  make(chan raft.ReadState, 1),
		},
		id:            id,
		attributes:    membership.Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
		cluster:       cl,
		stats:         sstats,
		lstats:        lstats,
		SyncTicker:    time.Tick(500 * time.Millisecond),
		peerRt:        prt,
		reqIDGen:      idutil.NewGenerator(uint16(id), time.Now()),
		forceVersionC: make(chan struct{}),
	}

	srv.applyV2 = &applierV2store{store: srv.store, cluster: srv.cluster}

	srv.be = be
	minTTL := time.Duration((3*cfg.ElectionTicks)/2) * heartbeat

	// always recover lessor before kv. When we recover the mvcc.KV it will reattach keys to its leases.
	// If we recover mvcc.KV first, it will attach the keys to the wrong lessor before it recovers.
	srv.lessor = lease.NewLessor(srv.be, int64(math.Ceil(minTTL.Seconds())))
	srv.kv = mvcc.New(srv.be, srv.lessor, &srv.consistIndex)
	if beExist {
		kvindex := srv.kv.ConsistentIndex()
		// TODO: remove kvindex != 0 checking when we do not expect users to upgrade
		// etcd from pre-3.0 release.
		if snapshot != nil && kvindex < snapshot.Metadata.Index {
			if kvindex != 0 {
				return nil, fmt.Errorf("database file (%v index %d) does not match with snapshot (index %d).", bepath, kvindex, snapshot.Metadata.Index)
			}
			plog.Warningf("consistent index never saved (snapshot index=%d)", snapshot.Metadata.Index)
		}
	}
	srv.consistIndex.setConsistentIndex(srv.kv.ConsistentIndex())

	srv.authStore = auth.NewAuthStore(srv.be)
	if h := cfg.AutoCompactionRetention; h != 0 {
		srv.compactor = compactor.NewPeriodic(h, srv.kv, srv)
		srv.compactor.Run()
	}

	srv.applyV3Base = &applierV3backend{srv}
	if err = srv.restoreAlarms(); err != nil {
		return nil, err
	}

	// TODO: move transport initialization near the definition of remote
	tr := &rafthttp.Transport{
		TLSInfo:     cfg.PeerTLSInfo,
		DialTimeout: cfg.peerDialTimeout(),
		ID:          id,
		URLs:        cfg.PeerURLs,
		ClusterID:   cl.ID(),
		Raft:        srv,
		Snapshotter: ss,
		ServerStats: sstats,
		LeaderStats: lstats,
		ErrorC:      srv.errorc,
	}
	if err = tr.Start(); err != nil {
		return nil, err
	}
	// add all remotes into transport
	for _, m := range remotes {
		if m.ID != id {
			tr.AddRemote(m.ID, m.PeerURLs)
		}
	}
	for _, m := range cl.Members() {
		if m.ID != id {
			tr.AddPeer(m.ID, m.PeerURLs)
		}
	}
	srv.r.transport = tr

	return srv, nil
}
func mustEtcdctl(t *testing.T) {
	if !fileutil.Exist("../bin/etcdctl") {
		t.Fatalf("could not find etcdctl binary")
	}
}
// Create creates a WAL ready for appending records. The given metadata is
// recorded at the head of each WAL file, and can be retrieved with ReadAll.
func Create(dirpath string, metadata []byte) (*WAL, error) {
	if Exist(dirpath) {
		return nil, os.ErrExist
	}

	// use a temporary wal directory so WAL initialization appears atomic
	tmpdirpath := path.Clean(dirpath) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return nil, err
		}
	}
	if err := fileutil.CreateDirAll(tmpdirpath); err != nil {
		return nil, err
	}

	p := path.Join(tmpdirpath, walName(0, 0))
	f, err := fileutil.LockFile(p, os.O_WRONLY|os.O_CREATE, fileutil.PrivateFileMode)
	if err != nil {
		return nil, err
	}
	if _, err = f.Seek(0, os.SEEK_END); err != nil {
		return nil, err
	}
	if err = fileutil.Preallocate(f.File, SegmentSizeBytes, true); err != nil {
		return nil, err
	}

	w := &WAL{
		dir:      dirpath,
		metadata: metadata,
		encoder:  newEncoder(f, 0),
	}
	w.locks = append(w.locks, f)
	if err = w.saveCrc(0); err != nil {
		return nil, err
	}
	if err = w.encoder.encode(&walpb.Record{Type: metadataType, Data: metadata}); err != nil {
		return nil, err
	}
	if err = w.SaveSnapshot(walpb.Snapshot{}); err != nil {
		return nil, err
	}

	if w, err = w.renameWal(tmpdirpath); err != nil {
		return nil, err
	}

	// directory was renamed; sync parent dir to persist rename
	pdir, perr := fileutil.OpenDir(path.Dir(w.dir))
	if perr != nil {
		return nil, perr
	}
	if perr = fileutil.Fsync(pdir); perr != nil {
		return nil, perr
	}
	if perr = pdir.Close(); perr != nil {
		return nil, perr
	}

	return w, nil
}
func mustCtlV3(t *testing.T) {
	if !fileutil.Exist("../bin/etcdctlv3") {
		t.Fatalf("could not find etcdctlv3 binary")
	}
}
// Create creates a WAL ready for appending records. The given metadata is
// recorded at the head of each WAL file, and can be retrieved with ReadAll.
func Create(dirpath string, metadata []byte) (*WAL, error) {
	if Exist(dirpath) {
		return nil, os.ErrExist
	}

	// use a temporary wal directory so WAL initialization appears atomic
	tmpdirpath := path.Clean(dirpath) + ".tmp"
	if fileutil.Exist(tmpdirpath) {
		if err := os.RemoveAll(tmpdirpath); err != nil {
			return nil, err
		}
	}
	if err := fileutil.CreateDirAll(tmpdirpath); err != nil {
		return nil, err
	}

	p := path.Join(tmpdirpath, walName(0, 0))
	f, err := fileutil.LockFile(p, os.O_WRONLY|os.O_CREATE, fileutil.PrivateFileMode)
	if err != nil {
		return nil, err
	}
	if _, err := f.Seek(0, os.SEEK_END); err != nil {
		return nil, err
	}
	if err := fileutil.Preallocate(f.File, SegmentSizeBytes, true); err != nil {
		return nil, err
	}

	w := &WAL{
		dir:      dirpath,
		metadata: metadata,
		encoder:  newEncoder(f, 0),
	}
	w.locks = append(w.locks, f)
	if err := w.saveCrc(0); err != nil {
		return nil, err
	}
	if err := w.encoder.encode(&walpb.Record{Type: metadataType, Data: metadata}); err != nil {
		return nil, err
	}
	if err := w.SaveSnapshot(walpb.Snapshot{}); err != nil {
		return nil, err
	}

	// rename of directory with locked files doesn't work on windows; close
	// the WAL to release the locks so the directory can be renamed
	w.Close()
	if err := os.Rename(tmpdirpath, dirpath); err != nil {
		return nil, err
	}

	// reopen and relock
	newWAL, oerr := Open(dirpath, walpb.Snapshot{})
	if oerr != nil {
		return nil, oerr
	}
	if _, _, _, err := newWAL.ReadAll(); err != nil {
		newWAL.Close()
		return nil, err
	}
	return newWAL, nil
}
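// A rough usage sketch of Create (the wrapper name and argument values are assumptions,
// not from the source): initialize a new WAL, append a batch of raft entries together
// with the current hard state, and close it. Save and Close are the standard WAL methods
// used alongside Create.
func writeInitialEntries(dir string, metadata []byte, st raftpb.HardState, ents []raftpb.Entry) error {
	w, err := Create(dir, metadata)
	if err != nil {
		return err
	}
	defer w.Close()
	// append the entries and hard state as a single record batch
	return w.Save(st, ents)
}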
func TestRaftLeaderLeave(t *testing.T) {
	t.Parallel()

	nodes, clockSource := raftutils.NewRaftCluster(t, tc)
	defer raftutils.TeardownCluster(t, nodes)

	// node 1 is the leader
	assert.Equal(t, nodes[1].Leader(), nodes[1].Config.ID)

	// Try to leave the raft
	// Use gRPC instead of calling handler directly because of
	// authorization check.
	client, err := nodes[1].ConnectToMember(nodes[1].Address, 10*time.Second)
	assert.NoError(t, err)
	defer client.Conn.Close()
	raftClient := api.NewRaftMembershipClient(client.Conn)
	ctx, _ := context.WithTimeout(context.Background(), 10*time.Second)
	resp, err := raftClient.Leave(ctx, &api.LeaveRequest{Node: &api.RaftMember{RaftID: nodes[1].Config.ID}})
	assert.NoError(t, err, "error sending message to leave the raft")
	assert.NotNil(t, resp, "leave response message is nil")

	newCluster := map[uint64]*raftutils.TestNode{
		2: nodes[2],
		3: nodes[3],
	}
	// Wait for election tick
	raftutils.WaitForCluster(t, clockSource, newCluster)

	// Node1's state should be cleared
	require.False(t, fileutil.Exist(filepath.Join(nodes[1].StateDir, "snap-v3-encrypted")))
	require.False(t, fileutil.Exist(filepath.Join(nodes[1].StateDir, "wal-v3-encrypted")))
	require.Equal(t, raft.EncryptionKeys{}, nodes[1].KeyRotator.GetKeys())

	// Leader should not be 1
	assert.NotEqual(t, nodes[2].Leader(), nodes[1].Config.ID)
	assert.Equal(t, nodes[2].Leader(), nodes[3].Leader())

	leader := nodes[2].Leader()

	// Find the leader node and a follower node
	var (
		leaderNode   *raftutils.TestNode
		followerNode *raftutils.TestNode
	)
	for i, n := range nodes {
		if n.Config.ID == leader {
			leaderNode = n
			if i == 2 {
				followerNode = nodes[3]
			} else {
				followerNode = nodes[2]
			}
		}
	}

	require.NotNil(t, leaderNode)
	require.NotNil(t, followerNode)

	// Propose a value
	value, err := raftutils.ProposeValue(t, leaderNode, DefaultProposalTime)
	assert.NoError(t, err, "failed to propose value")

	// The value should be replicated on all remaining nodes
	raftutils.CheckValue(t, clockSource, leaderNode, value)
	assert.Equal(t, len(leaderNode.GetMemberlist()), 2)

	raftutils.CheckValue(t, clockSource, followerNode, value)
	assert.Equal(t, len(followerNode.GetMemberlist()), 2)

	raftutils.TeardownCluster(t, newCluster)
}
// BootstrapFromDisk creates a new snapshotter and WAL, and also reads the latest snapshot and WALs from disk.
func (e *EncryptedRaftLogger) BootstrapFromDisk(ctx context.Context, oldEncryptionKeys ...[]byte) (*raftpb.Snapshot, WALData, error) {
	e.encoderMu.Lock()
	defer e.encoderMu.Unlock()

	walDir := e.walDir()
	snapDir := e.snapDir()

	encrypter, decrypter := encryption.Defaults(e.EncryptionKey)
	if oldEncryptionKeys != nil {
		decrypters := []encryption.Decrypter{decrypter}
		for _, key := range oldEncryptionKeys {
			_, d := encryption.Defaults(key)
			decrypters = append(decrypters, d)
		}
		decrypter = MultiDecrypter(decrypters)
	}

	snapFactory := NewSnapFactory(encrypter, decrypter)
	if !fileutil.Exist(snapDir) {
		// If snapshots created by the etcd-v2 code exist, or by a swarmkit development version,
		// read the latest snapshot and write it encoded to the new path. The new path
		// prevents etcd-v2 creating snapshots that are visible to us, but not encoded and
		// out of sync with our WALs, after a downgrade.
		for _, dirs := range versionedWALSnapDirs[1:] {
			legacySnapDir := filepath.Join(e.StateDir, dirs.snap)
			if fileutil.Exist(legacySnapDir) {
				if err := MigrateSnapshot(legacySnapDir, snapDir, OriginalSnap, snapFactory); err != nil {
					return nil, WALData{}, err
				}
				break
			}
		}
	}
	// ensure the new directory exists
	if err := os.MkdirAll(snapDir, 0700); err != nil {
		return nil, WALData{}, errors.Wrap(err, "failed to create snapshot directory")
	}

	var (
		snapshotter Snapshotter
		walObj      WAL
		err         error
	)

	// Create a snapshotter and load snapshot data
	snapshotter = snapFactory.New(snapDir)
	snapshot, err := snapshotter.Load()
	if err != nil && err != snap.ErrNoSnapshot {
		return nil, WALData{}, err
	}

	walFactory := NewWALFactory(encrypter, decrypter)
	var walsnap walpb.Snapshot
	if snapshot != nil {
		walsnap.Index = snapshot.Metadata.Index
		walsnap.Term = snapshot.Metadata.Term
	}

	if !wal.Exist(walDir) {
		var walExists bool
		// If WALs created by the etcd-v2 wal code exist, read the latest ones based
		// on this snapshot and encode them to WALs in the new path to avoid adding
		// backwards-incompatible entries to those files.
		for _, dirs := range versionedWALSnapDirs[1:] {
			legacyWALDir := filepath.Join(e.StateDir, dirs.wal)
			if !wal.Exist(legacyWALDir) {
				continue
			}
			if err = MigrateWALs(ctx, legacyWALDir, walDir, OriginalWAL, walFactory, walsnap); err != nil {
				return nil, WALData{}, err
			}
			walExists = true
			break
		}
		if !walExists {
			return nil, WALData{}, ErrNoWAL
		}
	}

	walObj, waldata, err := ReadRepairWAL(ctx, walDir, walsnap, walFactory)
	if err != nil {
		return nil, WALData{}, err
	}

	e.snapshotter = snapshotter
	e.wal = walObj

	return snapshot, waldata, nil
}
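// A minimal usage sketch of BootstrapFromDisk (the wrapper name, state directory, and key
// values are assumptions, not from the source): construct an EncryptedRaftLogger over an
// existing state dir and recover the latest snapshot plus WAL data, passing any previous
// DEK so entries written under an older key can still be decrypted after a rotation.
func recoverRaftState(ctx context.Context, stateDir string, currentDEK, previousDEK []byte) (*raftpb.Snapshot, WALData, error) {
	logger := &EncryptedRaftLogger{
		StateDir:      stateDir,
		EncryptionKey: currentDEK,
	}
	// previousDEK is optional; it is only needed when the WAL may contain
	// entries encrypted under an older key.
	return logger.BootstrapFromDisk(ctx, previousDEK)
}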