// getRemotePeerURLs returns peer urls of remote members in the cluster. The // returned list is sorted in ascending lexicographical order. func getRemotePeerURLs(cl *membership.RaftCluster, local string) []string { us := make([]string, 0) for _, m := range cl.Members() { if m.Name == local { continue } us = append(us, m.PeerURLs...) } sort.Strings(us) return us }
// isMemberBootstrapped tries to check if the given member has been bootstrapped // in the given cluster. func isMemberBootstrapped(cl *membership.RaftCluster, member string, rt http.RoundTripper, timeout time.Duration) bool { rcl, err := getClusterFromRemotePeers(getRemotePeerURLs(cl, member), timeout, false, rt) if err != nil { return false } id := cl.MemberByName(member).ID m := rcl.Member(id) if m == nil { return false } if len(m.ClientURLs) > 0 { return true } return false }
// makeWAL creates a WAL for the initial cluster func makeWAL(waldir string, cl *membership.RaftCluster) { if err := os.MkdirAll(waldir, 0755); err != nil { ExitWithError(ExitIO, err) } m := cl.MemberByName(restoreName) md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(cl.ID())} metadata, merr := md.Marshal() if merr != nil { ExitWithError(ExitInvalidInput, merr) } w, walerr := wal.Create(waldir, metadata) if walerr != nil { ExitWithError(ExitIO, walerr) } defer w.Close() peers := make([]raft.Peer, len(cl.MemberIDs())) for i, id := range cl.MemberIDs() { ctx, err := json.Marshal((*cl).Member(id)) if err != nil { ExitWithError(ExitInvalidInput, err) } peers[i] = raft.Peer{ID: uint64(id), Context: ctx} } ents := make([]raftpb.Entry, len(peers)) for i, p := range peers { cc := raftpb.ConfChange{ Type: raftpb.ConfChangeAddNode, NodeID: p.ID, Context: p.Context} d, err := cc.Marshal() if err != nil { ExitWithError(ExitInvalidInput, err) } e := raftpb.Entry{ Type: raftpb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: d, } ents[i] = e } w.Save(raftpb.HardState{ Term: 1, Vote: peers[0].ID, Commit: uint64(len(ents))}, ents) }
func applyConf(cc raftpb.ConfChange, cl *membership.RaftCluster) { if err := cl.ValidateConfigurationChange(cc); err != nil { return } switch cc.Type { case raftpb.ConfChangeAddNode: m := new(membership.Member) if err := json.Unmarshal(cc.Context, m); err != nil { panic(err) } cl.AddMember(m) case raftpb.ConfChangeRemoveNode: cl.RemoveMember(types.ID(cc.NodeID)) case raftpb.ConfChangeUpdateNode: m := new(membership.Member) if err := json.Unmarshal(cc.Context, m); err != nil { panic(err) } cl.UpdateRaftAttributes(m.ID, m.RaftAttributes) } }
func startNode(cfg *ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) { var err error member := cl.MemberByName(cfg.Name) metadata := pbutil.MustMarshal( &pb.Metadata{ NodeID: uint64(member.ID), ClusterID: uint64(cl.ID()), }, ) if err = os.MkdirAll(cfg.SnapDir(), privateDirMode); err != nil { plog.Fatalf("create snapshot directory error: %v", err) } if w, err = wal.Create(cfg.WALDir(), metadata); err != nil { plog.Fatalf("create wal error: %v", err) } peers := make([]raft.Peer, len(ids)) for i, id := range ids { ctx, err := json.Marshal((*cl).Member(id)) if err != nil { plog.Panicf("marshal member should never fail: %v", err) } peers[i] = raft.Peer{ID: uint64(id), Context: ctx} } id = member.ID plog.Infof("starting member %s in cluster %s", id, cl.ID()) s = raft.NewMemoryStorage() c := &raft.Config{ ID: uint64(id), ElectionTick: cfg.ElectionTicks, HeartbeatTick: 1, Storage: s, MaxSizePerMsg: maxSizePerMsg, MaxInflightMsgs: maxInflightMsgs, CheckQuorum: true, } n = raft.StartNode(c, peers) raftStatusMu.Lock() raftStatus = n.Status raftStatusMu.Unlock() advanceTicksForElection(n, c.ElectionTick) return }
// getVersions returns the versions of the members in the given cluster. // The key of the returned map is the member's ID. The value of the returned map // is the semver versions string, including server and cluster. // If it fails to get the version of a member, the key will be nil. func getVersions(cl *membership.RaftCluster, local types.ID, rt http.RoundTripper) map[string]*version.Versions { members := cl.Members() vers := make(map[string]*version.Versions) for _, m := range members { if m.ID == local { cv := "not_decided" if cl.Version() != nil { cv = cl.Version().String() } vers[m.ID.String()] = &version.Versions{Server: version.Version, Cluster: cv} continue } ver, err := getVersion(m, rt) if err != nil { plog.Warningf("cannot get the version of member %s (%v)", m.ID, err) vers[m.ID.String()] = nil } else { vers[m.ID.String()] = ver } } return vers }
// NewServer creates a new EtcdServer from the supplied configuration. The // configuration is considered static for the lifetime of the EtcdServer. func NewServer(cfg *ServerConfig) (srv *EtcdServer, err error) { st := store.New(StoreClusterPrefix, StoreKeysPrefix) var ( w *wal.WAL n raft.Node s *raft.MemoryStorage id types.ID cl *membership.RaftCluster ) if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil { return nil, fmt.Errorf("cannot access data directory: %v", terr) } // Run the migrations. dataVer, err := version.DetectDataDir(cfg.DataDir) if err != nil { return nil, err } if err = upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil { return nil, err } haveWAL := wal.Exist(cfg.WALDir()) if err = os.MkdirAll(cfg.SnapDir(), privateDirMode); err != nil && !os.IsExist(err) { plog.Fatalf("create snapshot directory error: %v", err) } ss := snap.New(cfg.SnapDir()) be := backend.NewDefaultBackend(path.Join(cfg.SnapDir(), databaseFilename)) defer func() { if err != nil { be.Close() } }() prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout()) if err != nil { return nil, err } var remotes []*membership.Member switch { case !haveWAL && !cfg.NewCluster: if err = cfg.VerifyJoinExisting(); err != nil { return nil, err } cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap) if err != nil { return nil, err } existingCluster, gerr := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt) if gerr != nil { return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", gerr) } if err = membership.ValidateClusterAndAssignIDs(cl, existingCluster); err != nil { return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err) } if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) { return nil, fmt.Errorf("incomptible with current running cluster") } remotes = existingCluster.Members() cl.SetID(existingCluster.ID()) cl.SetStore(st) cl.SetBackend(be) cfg.Print() id, n, s, w = startNode(cfg, cl, nil) case !haveWAL && cfg.NewCluster: if err = cfg.VerifyBootstrap(); err != nil { return nil, err } cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap) if err != nil { return nil, err } m := cl.MemberByName(cfg.Name) if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) { return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID) } if cfg.ShouldDiscover() { var str string str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String()) if err != nil { return nil, &DiscoveryError{Op: "join", Err: err} } var urlsmap types.URLsMap urlsmap, err = types.NewURLsMap(str) if err != nil { return nil, err } if checkDuplicateURL(urlsmap) { return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap) } if cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil { return nil, err } } cl.SetStore(st) cl.SetBackend(be) cfg.PrintWithInitial() id, n, s, w = startNode(cfg, cl, cl.MemberIDs()) case haveWAL: if err = fileutil.IsDirWriteable(cfg.MemberDir()); err != nil { return nil, fmt.Errorf("cannot write to member directory: %v", err) } if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil { return nil, fmt.Errorf("cannot write to WAL directory: %v", err) } if cfg.ShouldDiscover() { plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir()) } var snapshot *raftpb.Snapshot snapshot, err = ss.Load() if err != nil && err != snap.ErrNoSnapshot { return nil, err } if snapshot != nil { if err = st.Recovery(snapshot.Data); err != nil { plog.Panicf("recovered store from snapshot error: %v", err) } plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index) } cfg.Print() if !cfg.ForceNewCluster { id, cl, n, s, w = restartNode(cfg, snapshot) } else { id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot) } cl.SetStore(st) cl.SetBackend(be) cl.Recover() default: return nil, fmt.Errorf("unsupported bootstrap config") } if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil { return nil, fmt.Errorf("cannot access member directory: %v", terr) } sstats := &stats.ServerStats{ Name: cfg.Name, ID: id.String(), } sstats.Initialize() lstats := stats.NewLeaderStats(id.String()) srv = &EtcdServer{ readych: make(chan struct{}), Cfg: cfg, snapCount: cfg.SnapCount, errorc: make(chan error, 1), store: st, r: raftNode{ Node: n, ticker: time.Tick(time.Duration(cfg.TickMs) * time.Millisecond), raftStorage: s, storage: NewStorage(w, ss), }, id: id, attributes: membership.Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()}, cluster: cl, stats: sstats, lstats: lstats, SyncTicker: time.Tick(500 * time.Millisecond), peerRt: prt, reqIDGen: idutil.NewGenerator(uint16(id), time.Now()), forceVersionC: make(chan struct{}), msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap), } srv.applyV2 = &applierV2store{store: srv.store, cluster: srv.cluster} srv.be = be srv.lessor = lease.NewLessor(srv.be) srv.kv = mvcc.New(srv.be, srv.lessor, &srv.consistIndex) srv.consistIndex.setConsistentIndex(srv.kv.ConsistentIndex()) srv.authStore = auth.NewAuthStore(srv.be) if h := cfg.AutoCompactionRetention; h != 0 { srv.compactor = compactor.NewPeriodic(h, srv.kv, srv) srv.compactor.Run() } if err = srv.restoreAlarms(); err != nil { return nil, err } // TODO: move transport initialization near the definition of remote tr := &rafthttp.Transport{ TLSInfo: cfg.PeerTLSInfo, DialTimeout: cfg.peerDialTimeout(), ID: id, URLs: cfg.PeerURLs, ClusterID: cl.ID(), Raft: srv, Snapshotter: ss, ServerStats: sstats, LeaderStats: lstats, ErrorC: srv.errorc, } if err = tr.Start(); err != nil { return nil, err } // add all remotes into transport for _, m := range remotes { if m.ID != id { tr.AddRemote(m.ID, m.PeerURLs) } } for _, m := range cl.Members() { if m.ID != id { tr.AddPeer(m.ID, m.PeerURLs) } } srv.r.transport = tr return srv, nil }
// NewServer creates a new EtcdServer from the supplied configuration. The // configuration is considered static for the lifetime of the EtcdServer. func NewServer(cfg *ServerConfig) (srv *EtcdServer, err error) { st := store.New(StoreClusterPrefix, StoreKeysPrefix) var ( w *wal.WAL n raft.Node s *raft.MemoryStorage id types.ID cl *membership.RaftCluster ) if terr := fileutil.TouchDirAll(cfg.DataDir); terr != nil { return nil, fmt.Errorf("cannot access data directory: %v", terr) } haveWAL := wal.Exist(cfg.WALDir()) if err = fileutil.TouchDirAll(cfg.SnapDir()); err != nil { plog.Fatalf("create snapshot directory error: %v", err) } ss := snap.New(cfg.SnapDir()) bepath := path.Join(cfg.SnapDir(), databaseFilename) beExist := fileutil.Exist(bepath) var be backend.Backend beOpened := make(chan struct{}) go func() { be = backend.NewDefaultBackend(bepath) beOpened <- struct{}{} }() select { case <-beOpened: case <-time.After(time.Second): plog.Warningf("another etcd process is running with the same data dir and holding the file lock.") plog.Warningf("waiting for it to exit before starting...") <-beOpened } defer func() { if err != nil { be.Close() } }() prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout()) if err != nil { return nil, err } var ( remotes []*membership.Member snapshot *raftpb.Snapshot ) switch { case !haveWAL && !cfg.NewCluster: if err = cfg.VerifyJoinExisting(); err != nil { return nil, err } cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap) if err != nil { return nil, err } existingCluster, gerr := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt) if gerr != nil { return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", gerr) } if err = membership.ValidateClusterAndAssignIDs(cl, existingCluster); err != nil { return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err) } if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) { return nil, fmt.Errorf("incompatible with current running cluster") } remotes = existingCluster.Members() cl.SetID(existingCluster.ID()) cl.SetStore(st) cl.SetBackend(be) cfg.Print() id, n, s, w = startNode(cfg, cl, nil) case !haveWAL && cfg.NewCluster: if err = cfg.VerifyBootstrap(); err != nil { return nil, err } cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap) if err != nil { return nil, err } m := cl.MemberByName(cfg.Name) if isMemberBootstrapped(cl, cfg.Name, prt, cfg.bootstrapTimeout()) { return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID) } if cfg.ShouldDiscover() { var str string str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String()) if err != nil { return nil, &DiscoveryError{Op: "join", Err: err} } var urlsmap types.URLsMap urlsmap, err = types.NewURLsMap(str) if err != nil { return nil, err } if checkDuplicateURL(urlsmap) { return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap) } if cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, urlsmap); err != nil { return nil, err } } cl.SetStore(st) cl.SetBackend(be) cfg.PrintWithInitial() id, n, s, w = startNode(cfg, cl, cl.MemberIDs()) case haveWAL: if err = fileutil.IsDirWriteable(cfg.MemberDir()); err != nil { return nil, fmt.Errorf("cannot write to member directory: %v", err) } if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil { return nil, fmt.Errorf("cannot write to WAL directory: %v", err) } if cfg.ShouldDiscover() { plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir()) } snapshot, err = ss.Load() if err != nil && err != snap.ErrNoSnapshot { return nil, err } if snapshot != nil { if err = st.Recovery(snapshot.Data); err != nil { plog.Panicf("recovered store from snapshot error: %v", err) } plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index) } cfg.Print() if !cfg.ForceNewCluster { id, cl, n, s, w = restartNode(cfg, snapshot) } else { id, cl, n, s, w = restartAsStandaloneNode(cfg, snapshot) } cl.SetStore(st) cl.SetBackend(be) cl.Recover(api.UpdateCapability) if cl.Version() != nil && !cl.Version().LessThan(semver.Version{Major: 3}) && !beExist { os.RemoveAll(bepath) return nil, fmt.Errorf("database file (%v) of the backend is missing", bepath) } default: return nil, fmt.Errorf("unsupported bootstrap config") } if terr := fileutil.TouchDirAll(cfg.MemberDir()); terr != nil { return nil, fmt.Errorf("cannot access member directory: %v", terr) } sstats := &stats.ServerStats{ Name: cfg.Name, ID: id.String(), } sstats.Initialize() lstats := stats.NewLeaderStats(id.String()) heartbeat := time.Duration(cfg.TickMs) * time.Millisecond srv = &EtcdServer{ readych: make(chan struct{}), Cfg: cfg, snapCount: cfg.SnapCount, errorc: make(chan error, 1), store: st, r: raftNode{ isIDRemoved: func(id uint64) bool { return cl.IsIDRemoved(types.ID(id)) }, Node: n, ticker: time.Tick(heartbeat), // set up contention detectors for raft heartbeat message. // expect to send a heartbeat within 2 heartbeat intervals. td: contention.NewTimeoutDetector(2 * heartbeat), heartbeat: heartbeat, raftStorage: s, storage: NewStorage(w, ss), msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap), readStateC: make(chan raft.ReadState, 1), }, id: id, attributes: membership.Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()}, cluster: cl, stats: sstats, lstats: lstats, SyncTicker: time.Tick(500 * time.Millisecond), peerRt: prt, reqIDGen: idutil.NewGenerator(uint16(id), time.Now()), forceVersionC: make(chan struct{}), } srv.applyV2 = &applierV2store{store: srv.store, cluster: srv.cluster} srv.be = be minTTL := time.Duration((3*cfg.ElectionTicks)/2) * heartbeat // always recover lessor before kv. When we recover the mvcc.KV it will reattach keys to its leases. // If we recover mvcc.KV first, it will attach the keys to the wrong lessor before it recovers. srv.lessor = lease.NewLessor(srv.be, int64(math.Ceil(minTTL.Seconds()))) srv.kv = mvcc.New(srv.be, srv.lessor, &srv.consistIndex) if beExist { kvindex := srv.kv.ConsistentIndex() // TODO: remove kvindex != 0 checking when we do not expect users to upgrade // etcd from pre-3.0 release. if snapshot != nil && kvindex < snapshot.Metadata.Index { if kvindex != 0 { return nil, fmt.Errorf("database file (%v index %d) does not match with snapshot (index %d).", bepath, kvindex, snapshot.Metadata.Index) } plog.Warningf("consistent index never saved (snapshot index=%d)", snapshot.Metadata.Index) } } srv.consistIndex.setConsistentIndex(srv.kv.ConsistentIndex()) srv.authStore = auth.NewAuthStore(srv.be) if h := cfg.AutoCompactionRetention; h != 0 { srv.compactor = compactor.NewPeriodic(h, srv.kv, srv) srv.compactor.Run() } srv.applyV3Base = &applierV3backend{srv} if err = srv.restoreAlarms(); err != nil { return nil, err } // TODO: move transport initialization near the definition of remote tr := &rafthttp.Transport{ TLSInfo: cfg.PeerTLSInfo, DialTimeout: cfg.peerDialTimeout(), ID: id, URLs: cfg.PeerURLs, ClusterID: cl.ID(), Raft: srv, Snapshotter: ss, ServerStats: sstats, LeaderStats: lstats, ErrorC: srv.errorc, } if err = tr.Start(); err != nil { return nil, err } // add all remotes into transport for _, m := range remotes { if m.ID != id { tr.AddRemote(m.ID, m.PeerURLs) } } for _, m := range cl.Members() { if m.ID != id { tr.AddPeer(m.ID, m.PeerURLs) } } srv.r.transport = tr return srv, nil }
// makeWAL creates a WAL for the initial cluster func makeWALAndSnap(waldir, snapdir string, cl *membership.RaftCluster) { if err := fileutil.CreateDirAll(waldir); err != nil { ExitWithError(ExitIO, err) } // add members again to persist them to the store we create. st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix) cl.SetStore(st) for _, m := range cl.Members() { cl.AddMember(m) } m := cl.MemberByName(restoreName) md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(cl.ID())} metadata, merr := md.Marshal() if merr != nil { ExitWithError(ExitInvalidInput, merr) } w, walerr := wal.Create(waldir, metadata) if walerr != nil { ExitWithError(ExitIO, walerr) } defer w.Close() peers := make([]raft.Peer, len(cl.MemberIDs())) for i, id := range cl.MemberIDs() { ctx, err := json.Marshal((*cl).Member(id)) if err != nil { ExitWithError(ExitInvalidInput, err) } peers[i] = raft.Peer{ID: uint64(id), Context: ctx} } ents := make([]raftpb.Entry, len(peers)) nodeIDs := make([]uint64, len(peers)) for i, p := range peers { nodeIDs[i] = p.ID cc := raftpb.ConfChange{ Type: raftpb.ConfChangeAddNode, NodeID: p.ID, Context: p.Context} d, err := cc.Marshal() if err != nil { ExitWithError(ExitInvalidInput, err) } e := raftpb.Entry{ Type: raftpb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: d, } ents[i] = e } commit, term := uint64(len(ents)), uint64(1) if err := w.Save(raftpb.HardState{ Term: term, Vote: peers[0].ID, Commit: commit}, ents); err != nil { ExitWithError(ExitIO, err) } b, berr := st.Save() if berr != nil { ExitWithError(ExitError, berr) } raftSnap := raftpb.Snapshot{ Data: b, Metadata: raftpb.SnapshotMetadata{ Index: commit, Term: term, ConfState: raftpb.ConfState{ Nodes: nodeIDs, }, }, } snapshotter := snap.New(snapdir) if err := snapshotter.SaveSnap(raftSnap); err != nil { panic(err) } if err := w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term}); err != nil { ExitWithError(ExitIO, err) } }