Beispiel #1
0
// monitorCluster assumes that the machine has tried to join the cluster and
// failed, so it waits for the interval at the beginning.
func (s *StandbyServer) monitorCluster() {
	for {
		timer := time.NewTimer(time.Duration(int64(s.SyncInterval * float64(time.Second))))
		defer timer.Stop()
		select {
		case <-s.closeChan:
			return
		case <-timer.C:
		}

		if err := s.syncCluster(nil); err != nil {
			log.Warnf("fail syncing cluster(%v): %v", s.ClusterURLs(), err)
			continue
		}

		leader := s.ClusterLeader()
		if leader == nil {
			log.Warnf("fail getting leader from cluster(%v)", s.ClusterURLs())
			continue
		}

		if err := s.join(leader.PeerURL); err != nil {
			log.Debugf("fail joining through leader %v: %v", leader, err)
			continue
		}

		log.Infof("join through leader %v", leader.PeerURL)
		go func() {
			s.Stop()
			close(s.removeNotify)
		}()
		return
	}
}
func (c *Config) handleDiscovery() error {
	p, err := discovery.Do(c.Discovery, c.Name, c.Peer.Addr)

	// This is fatal, discovery encountered an unexpected error
	// and we have no peer list.
	if err != nil && len(c.Peers) == 0 {
		log.Fatalf("Discovery failed and a backup peer list wasn't provided: %v", err)
		return err
	}

	// Warn about errors coming from discovery, this isn't fatal
	// since the user might have provided a peer list elsewhere.
	if err != nil {
		log.Warnf("Discovery encountered an error but a backup peer list (%v) was provided: %v", c.Peers, err)
	}

	for i := range p {
		// Strip the scheme off of the peer if it has one
		// TODO(bp): clean this up!
		purl, err := url.Parse(p[i])
		if err == nil {
			p[i] = purl.Host
		}
	}

	c.Peers = p

	return nil
}
Beispiel #3
0
func (s *StandbyServer) syncCluster(peerURLs []string) error {
	peerURLs = append(s.ClusterURLs(), peerURLs...)

	for _, peerURL := range peerURLs {
		// Fetch current peer list
		machines, err := s.client.GetMachines(peerURL)
		if err != nil {
			log.Debugf("fail getting machine messages from %v", peerURL)
			continue
		}

		config, err := s.client.GetClusterConfig(peerURL)
		if err != nil {
			log.Debugf("fail getting cluster config from %v", peerURL)
			continue
		}

		s.setCluster(machines)
		s.SetSyncInterval(config.SyncInterval)
		if err := s.saveInfo(); err != nil {
			log.Warnf("fail saving cluster info into disk: %v", err)
		}
		return nil
	}
	return fmt.Errorf("unreachable cluster")
}
Beispiel #4
0
// Response to vote request
func (ps *PeerServer) VoteHttpHandler(w http.ResponseWriter, req *http.Request) {
	rvreq := &raft.RequestVoteRequest{}

	if _, err := rvreq.Decode(req.Body); err != nil {
		http.Error(w, "", http.StatusBadRequest)
		log.Warnf("[recv] BADREQUEST %s/vote [%v]", ps.Config.URL, err)
		return
	}

	log.Debugf("[recv] POST %s/vote [%s]", ps.Config.URL, rvreq.CandidateName)

	resp := ps.raftServer.RequestVote(rvreq)

	if resp == nil {
		log.Warn("[vote] Error: nil response")
		http.Error(w, "", http.StatusInternalServerError)
		return
	}

	if _, err := resp.Encode(w); err != nil {
		log.Warn("[vote] Error: %v", err)
		http.Error(w, "", http.StatusInternalServerError)
		return
	}
}
Beispiel #5
0
// Response to append entries request
func (ps *PeerServer) AppendEntriesHttpHandler(w http.ResponseWriter, req *http.Request) {
	start := time.Now()
	aereq := &raft.AppendEntriesRequest{}

	if _, err := aereq.Decode(req.Body); err != nil {
		http.Error(w, "", http.StatusBadRequest)
		log.Warnf("[recv] BADREQUEST %s/log/append [%v]", ps.Config.URL, err)
		return
	}

	log.Debugf("[recv] POST %s/log/append [%d]", ps.Config.URL, len(aereq.Entries))

	ps.serverStats.RecvAppendReq(aereq.LeaderName, int(req.ContentLength))

	resp := ps.raftServer.AppendEntries(aereq)

	if resp == nil {
		log.Warn("[ae] Error: nil response")
		http.Error(w, "", http.StatusInternalServerError)
		return
	}

	if !resp.Success {
		log.Debugf("[Append Entry] Step back")
	}

	if _, err := resp.Encode(w); err != nil {
		log.Warn("[ae] Error: %v", err)
		http.Error(w, "", http.StatusInternalServerError)
		return
	}

	(*ps.metrics).Timer("timer.appendentries.handle").UpdateSince(start)
}
Beispiel #6
0
// Response to recover from snapshot request
func (ps *PeerServer) SnapshotHttpHandler(w http.ResponseWriter, req *http.Request) {
	ssreq := &raft.SnapshotRequest{}

	if _, err := ssreq.Decode(req.Body); err != nil {
		http.Error(w, "", http.StatusBadRequest)
		log.Warnf("[recv] BADREQUEST %s/snapshot [%v]", ps.Config.URL, err)
		return
	}

	log.Debugf("[recv] POST %s/snapshot", ps.Config.URL)

	resp := ps.raftServer.RequestSnapshot(ssreq)

	if resp == nil {
		log.Warn("[ss] Error: nil response")
		http.Error(w, "", http.StatusInternalServerError)
		return
	}

	if _, err := resp.Encode(w); err != nil {
		log.Warn("[ss] Error: %v", err)
		http.Error(w, "", http.StatusInternalServerError)
		return
	}
}
Beispiel #7
0
// applyJoin attempts to join a machine to the cluster.
func applyJoin(c *JoinCommand, context raft.Context) (uint64, error) {
	ps, _ := context.Server().Context().(*PeerServer)
	commitIndex := context.CommitIndex()

	// Make sure we're not getting a cached value from the registry.
	ps.registry.Invalidate(c.Name)

	// Check if the join command is from a previous peer, who lost all its previous log.
	if peerURL, ok := ps.registry.PeerURL(c.Name); ok {
		// If previous node restarts with different peer URL,
		// update its information.
		if peerURL != c.RaftURL {
			log.Infof("Rejoin with %v instead of %v from %v", c.RaftURL, peerURL, c.Name)
			if err := updatePeerURL(c, ps); err != nil {
				return 0, err
			}
		}
		if c.Name == context.Server().Name() {
			ps.removedInLog = false
		}
		return commitIndex, nil
	}

	// Check if the join command adds an instance that collides with existing one on peer URL.
	peerURLs := ps.registry.PeerURLs(ps.raftServer.Leader(), c.Name)
	for _, peerURL := range peerURLs {
		if peerURL == c.RaftURL {
			log.Warnf("%v tries to join the cluster with existing URL %v", c.Name, c.EtcdURL)
			return 0, etcdErr.NewError(etcdErr.EcodeExistingPeerAddr, c.EtcdURL, context.CommitIndex())
		}
	}

	// Check peer number in the cluster
	count := ps.registry.Count()
	// ClusterConfig doesn't init until first machine is added
	if count > 0 && count >= ps.ClusterConfig().ActiveSize {
		log.Debug("Reject join request from ", c.Name)
		return 0, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", context.CommitIndex())
	}

	// Add to shared peer registry.
	ps.registry.Register(c.Name, c.RaftURL, c.EtcdURL)

	// Add peer in raft
	if err := context.Server().AddPeer(c.Name, ""); err != nil {
		return 0, err
	}

	// Add peer stats
	if c.Name != ps.RaftServer().Name() {
		ps.followersStats.Followers[c.Name] = &raftFollowerStats{}
		ps.followersStats.Followers[c.Name].Latency.Minimum = 1 << 63
	}

	if c.Name == context.Server().Name() {
		ps.removedInLog = false
	}
	return commitIndex, nil
}
Beispiel #8
0
func DecodeJsonRequest(req *http.Request, data interface{}) error {
	decoder := json.NewDecoder(req.Body)
	if err := decoder.Decode(&data); err != nil && err != io.EOF {
		log.Warnf("Malformed json request: %v", err)
		return fmt.Errorf("Malformed json request: %v", err)
	}
	return nil
}
Beispiel #9
0
func (s *StandbyServer) fullPeerURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		log.Warnf("fail parsing url %v", u)
		return urlStr
	}
	u.Scheme = s.Config.PeerScheme
	return u.String()
}
Beispiel #10
0
func NewStandbyServer(config StandbyServerConfig, client *Client) *StandbyServer {
	s := &StandbyServer{
		Config:      config,
		client:      client,
		standbyInfo: standbyInfo{SyncInterval: DefaultSyncInterval},
	}
	if err := s.loadInfo(); err != nil {
		log.Warnf("error load standby info file: %v", err)
	}
	return s
}
Beispiel #11
0
func (s *PeerServer) startAsFollower(cluster []string) {
	// start as a follower in a existing cluster
	for i := 0; i < s.Config.RetryTimes; i++ {
		ok := s.joinCluster(cluster)
		if ok {
			return
		}
		log.Warnf("Unable to join the cluster using any of the peers %v. Retrying in %.1f seconds", cluster, s.Config.RetryInterval)
		time.Sleep(time.Second * time.Duration(s.Config.RetryInterval))
	}

	log.Fatalf("Cannot join the cluster via given peers after %x retries", s.Config.RetryTimes)
}
Beispiel #12
0
func (s *PeerServer) startAsFollower(cluster []string) {
	// start as a follower in a existing cluster
	for i := 0; i < s.RetryTimes; i++ {
		ok := s.joinCluster(cluster)
		if ok {
			return
		}
		log.Warnf("cannot join to cluster via given peers, retry in %d seconds", RetryInterval)
		time.Sleep(time.Second * RetryInterval)
	}

	log.Fatalf("Cannot join the cluster via given peers after %x retries", s.RetryTimes)
}
Beispiel #13
0
// Join a server to the cluster
func (c *JoinCommandV1) Apply(context raft.Context) (interface{}, error) {
	ps, _ := context.Server().Context().(*PeerServer)

	b := make([]byte, 8)
	binary.PutUvarint(b, context.CommitIndex())

	// Make sure we're not getting a cached value from the registry.
	ps.registry.Invalidate(c.Name)

	// Check if the join command is from a previous peer, who lost all its previous log.
	if peerURL, ok := ps.registry.PeerURL(c.Name); ok {
		// If previous node restarts with different peer URL,
		// update its information.
		if peerURL != c.RaftURL {
			log.Infof("Rejoin with %v instead of %v from %v", c.RaftURL, peerURL, c.Name)
			if err := c.updatePeerURL(ps); err != nil {
				return []byte{0}, err
			}
		}
		return b, nil
	}

	// Check if the join command adds an instance that collides with existing one on peer URL.
	peerURLs := ps.registry.PeerURLs(ps.raftServer.Leader(), c.Name)
	for _, peerURL := range peerURLs {
		if peerURL == c.RaftURL {
			log.Warnf("%v tries to join the cluster with existing URL %v", c.Name, c.EtcdURL)
			return []byte{0}, etcdErr.NewError(etcdErr.EcodeExistingPeerAddr, c.EtcdURL, context.CommitIndex())
		}
	}

	// Check peer number in the cluster
	if ps.registry.PeerCount() >= ps.ClusterConfig().ActiveSize {
		log.Debug("Reject join request from ", c.Name)
		return []byte{0}, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", context.CommitIndex())
	}

	// Add to shared peer registry.
	ps.registry.RegisterPeer(c.Name, c.RaftURL, c.EtcdURL)

	// Add peer in raft
	err := context.Server().AddPeer(c.Name, "")

	// Add peer stats
	if c.Name != ps.RaftServer().Name() {
		ps.followersStats.Followers[c.Name] = &raftFollowerStats{}
		ps.followersStats.Followers[c.Name].Latency.Minimum = 1 << 63
	}

	return b, err
}
Beispiel #14
0
func (d *Discoverer) startHeartbeat() {
	// In case of errors we should attempt to heartbeat fairly frequently
	heartbeatInterval := defaultTTL / 8
	ticker := time.Tick(time.Second * time.Duration(heartbeatInterval))
	for {
		select {
		case <-ticker:
			err := d.heartbeat()
			if err != nil {
				log.Warnf("Discovery heartbeat failed: %v", err)
			}
		}
	}
}
Beispiel #15
0
// removeSelfFromList removes url of the peerServer from the peer list
func (s *PeerServer) removeSelfFromList(peers []string) []string {
	// Remove its own peer address from the peer list to join
	u, err := url.Parse(s.Config.URL)
	if err != nil {
		log.Warnf("failed parsing self peer address %v", s.Config.URL)
		u = nil
	}
	newPeers := make([]string, 0)
	for _, v := range peers {
		if u == nil || v != u.Host {
			newPeers = append(newPeers, v)
		}
	}
	return newPeers
}
Beispiel #16
0
// Stop stops the server gracefully.
func (s *StandbyServer) Stop() {
	s.Lock()
	defer s.Unlock()
	if !s.started {
		return
	}
	s.started = false

	close(s.closeChan)
	s.routineGroup.Wait()

	if err := s.saveInfo(); err != nil {
		log.Warnf("error saving cluster info for standby")
	}
	s.Running = false
}
Beispiel #17
0
func (s *PeerServer) startAsFollower(cluster []string, retryTimes int) error {
	// start as a follower in a existing cluster
	for i := 0; ; i++ {
		ok := s.joinCluster(cluster)
		if ok {
			break
		}
		if i == retryTimes-1 {
			return fmt.Errorf("Cannot join the cluster via given peers after %x retries", s.Config.RetryTimes)
		}
		log.Warnf("%v is unable to join the cluster using any of the peers %v at %dth time. Retrying in %.1f seconds", s.Config.Name, cluster, i, s.Config.RetryInterval)
		time.Sleep(time.Second * time.Duration(s.Config.RetryInterval))
	}

	s.raftServer.Start()
	return nil
}
Beispiel #18
0
func (d *Discoverer) startHeartbeat(closeChan <-chan bool) {
	// In case of errors we should attempt to heartbeat fairly frequently
	heartbeatInterval := defaultTTL / 8
	ticker := time.NewTicker(time.Second * time.Duration(heartbeatInterval))
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			err := d.heartbeat()
			if err != nil {
				log.Warnf("Discovery heartbeat failed: %v", err)
			}
		case <-closeChan:
			return
		}
	}
}
Beispiel #19
0
// Start starts the raft server.
// The function assumes that join has been accepted successfully.
func (s *PeerServer) Start(snapshot bool, discoverURL string, peers []string) error {
	s.Lock()
	defer s.Unlock()
	if s.started {
		return nil
	}
	s.started = true

	// LoadSnapshot
	if snapshot {
		err := s.raftServer.LoadSnapshot()

		if err == nil {
			log.Debugf("%s finished load snapshot", s.Config.Name)
		} else {
			log.Debug(err)
		}
	}

	s.raftServer.Init()

	// Set NOCOW for data directory in btrfs
	if btrfs.IsBtrfs(s.raftServer.LogPath()) {
		if err := btrfs.SetNOCOWFile(s.raftServer.LogPath()); err != nil {
			log.Warnf("Failed setting NOCOW: %v", err)
		}
	}

	s.findCluster(discoverURL, peers)

	s.stopNotify = make(chan bool)
	s.removeNotify = make(chan bool)
	s.closeChan = make(chan bool)

	s.startRoutine(s.monitorSync)
	s.startRoutine(s.monitorTimeoutThreshold)
	s.startRoutine(s.monitorActiveSize)
	s.startRoutine(s.monitorPeerActivity)

	// open the snapshot
	if snapshot {
		s.startRoutine(s.monitorSnapshot)
	}

	return nil
}
Beispiel #20
0
// IsBtrfs checks whether the file is in btrfs
func IsBtrfs(path string) bool {
	// btrfs is linux-only filesystem
	// exit on other platforms
	if runtime.GOOS != "linux" {
		return false
	}
	var buf syscall.Statfs_t
	if err := syscall.Statfs(path, &buf); err != nil {
		log.Warnf("Failed to statfs: %v", err)
		return false
	}
	log.Debugf("The type of path %v is %v", path, buf.Type)
	if buf.Type != BTRFS_SUPER_MAGIC {
		return false
	}
	log.Infof("The path %v is in btrfs", path)
	return true
}
Beispiel #21
0
func (s *PeerServer) SetRaftServer(raftServer raft.Server, snapshot bool) {
	s.snapConf = &snapshotConf{
		checkingInterval: time.Second * 3,
		// this is not accurate, we will update raft to provide an api
		lastIndex:   raftServer.CommitIndex(),
		snapshotThr: uint64(s.Config.SnapshotCount),
	}

	raftServer.AddEventListener(raft.StateChangeEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.LeaderChangeEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.TermChangeEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.AddPeerEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.RemovePeerEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.HeartbeatIntervalEventType, s.raftEventLogger)
	raftServer.AddEventListener(raft.ElectionTimeoutThresholdEventType, s.raftEventLogger)

	raftServer.AddEventListener(raft.HeartbeatEventType, s.recordMetricEvent)

	raftServer.AddEventListener(raft.RemovedEventType, s.removedEvent)

	s.raftServer = raftServer
	s.removedInLog = false

	// LoadSnapshot
	if snapshot {
		err := s.raftServer.LoadSnapshot()

		if err == nil {
			log.Debugf("%s finished load snapshot", s.Config.Name)
		} else {
			log.Debug(err)
		}
	}

	s.raftServer.Init()

	// Set NOCOW for data directory in btrfs
	if btrfs.IsBtrfs(s.raftServer.LogPath()) {
		if err := btrfs.SetNOCOWFile(s.raftServer.LogPath()); err != nil {
			log.Warnf("Failed setting NOCOW: %v", err)
		}
	}
}
Beispiel #22
0
func (s *PeerServer) joinCluster(cluster []string) bool {
	for _, peer := range cluster {
		if len(peer) == 0 {
			continue
		}

		err := s.joinByPeer(s.raftServer, peer, s.Config.Scheme)
		if err == nil {
			log.Debugf("%s joined the cluster via peer %s", s.Config.Name, peer)
			return true

		}

		if _, ok := err.(etcdErr.Error); ok {
			log.Fatal(err)
		}

		log.Warnf("Attempt to join via %s failed: %s", peer, err)
	}

	return false
}
Beispiel #23
0
func (s *StandbyServer) Start() {
	s.Lock()
	defer s.Unlock()
	if s.started {
		return
	}
	s.started = true

	s.removeNotify = make(chan bool)
	s.closeChan = make(chan bool)

	s.Running = true
	if err := s.saveInfo(); err != nil {
		log.Warnf("error saving cluster info for standby")
	}

	s.routineGroup.Add(1)
	go func() {
		defer s.routineGroup.Done()
		s.monitorCluster()
	}()
}
Beispiel #24
0
// raftEventLogger converts events from the Raft server into log messages.
func (s *PeerServer) raftEventLogger(event raft.Event) {
	value := event.Value()
	prevValue := event.PrevValue()
	if value == nil {
		value = "<nil>"
	}
	if prevValue == nil {
		prevValue = "<nil>"
	}

	switch event.Type() {
	case raft.StateChangeEventType:
		log.Infof("%s: state changed from '%v' to '%v'.", s.Config.Name, prevValue, value)
	case raft.TermChangeEventType:
		log.Infof("%s: term #%v started.", s.Config.Name, value)
	case raft.LeaderChangeEventType:
		log.Infof("%s: leader changed from '%v' to '%v'.", s.Config.Name, prevValue, value)
	case raft.AddPeerEventType:
		log.Infof("%s: peer added: '%v'", s.Config.Name, value)
	case raft.RemovePeerEventType:
		log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
	case raft.HeartbeatIntervalEventType:
		peer, ok := value.(*raft.Peer)
		if !ok {
			log.Warnf("%s: heatbeat timeout from unknown peer", s.Config.Name)
			return
		}
		s.logHeartbeatTimeout(peer)
	case raft.ElectionTimeoutThresholdEventType:
		select {
		case s.timeoutThresholdChan <- value:
		default:
		}

	}
}
Beispiel #25
0
// Helper function to do discovery and return results in expected format
func (s *PeerServer) handleDiscovery(discoverURL string) (peers []string, err error) {
	peers, err = discovery.Do(discoverURL, s.Config.Name, s.Config.URL)

	// Warn about errors coming from discovery, this isn't fatal
	// since the user might have provided a peer list elsewhere,
	// or there is some log in data dir.
	if err != nil {
		log.Warnf("Discovery encountered an error: %v", err)
		return
	}

	for i := range peers {
		// Strip the scheme off of the peer if it has one
		// TODO(bp): clean this up!
		purl, err := url.Parse(peers[i])
		if err == nil {
			peers[i] = purl.Host
		}
	}

	log.Infof("Discovery fetched back peer list: %v", peers)

	return
}
Beispiel #26
0
// Try all possible ways to find clusters to join
// Include log data in -data-dir, -discovery and -peers
//
// Peer discovery follows this order:
// 1. previous peers in -data-dir
// 2. -discovery
// 3. -peers
func (s *PeerServer) FindCluster(discoverURL string, peers []string) (toStart bool, possiblePeers []string, err error) {
	name := s.Config.Name
	isNewNode := s.raftServer.IsLogEmpty()

	// Try its best to find possible peers, and connect with them.
	if !isNewNode {
		// It is not allowed to join the cluster with existing peer address
		// This prevents old node joining with different name by mistake.
		if !s.checkPeerAddressNonconflict() {
			err = fmt.Errorf("%v is not allowed to join the cluster with existing URL %v", s.Config.Name, s.Config.URL)
			return
		}

		// Take old nodes into account.
		possiblePeers = s.getKnownPeers()
		// Discover registered peers.
		// TODO(yichengq): It may mess up discoverURL if this is
		// set wrong by mistake. This may need to refactor discovery
		// module. Fix it later.
		if discoverURL != "" {
			discoverPeers, _ := s.handleDiscovery(discoverURL)
			possiblePeers = append(possiblePeers, discoverPeers...)
		}
		possiblePeers = append(possiblePeers, peers...)
		possiblePeers = s.removeSelfFromList(possiblePeers)

		if s.removedInLog {
			return
		}

		// If there is possible peer list, use it to find cluster.
		if len(possiblePeers) > 0 {
			// TODO(yichengq): joinCluster may fail if there's no leader for
			// current cluster. It should wait if the cluster is under
			// leader election, or the node with changed IP cannot join
			// the cluster then.
			if rejected, ierr := s.startAsFollower(possiblePeers, 1); rejected {
				log.Debugf("%s should work as standby for the cluster %v: %v", name, possiblePeers, ierr)
				return
			} else if ierr != nil {
				log.Warnf("%s cannot connect to previous cluster %v: %v", name, possiblePeers, ierr)
			} else {
				log.Debugf("%s joins to the previous cluster %v", name, possiblePeers)
				toStart = true
				return
			}
		}

		// TODO(yichengq): Think about the action that should be done
		// if it cannot connect any of the previous known node.
		log.Debugf("%s is restarting the cluster %v", name, possiblePeers)
		toStart = true
		return
	}

	// Attempt cluster discovery
	if discoverURL != "" {
		discoverPeers, discoverErr := s.handleDiscovery(discoverURL)
		// It is not registered in discover url
		if discoverErr != nil {
			log.Warnf("%s failed to connect discovery service[%v]: %v", name, discoverURL, discoverErr)
			if len(peers) == 0 {
				err = fmt.Errorf("%s, the new instance, must register itself to discovery service as required", name)
				return
			}
			log.Debugf("%s is joining peers %v from -peers flag", name, peers)
		} else {
			log.Debugf("%s is joining a cluster %v via discover service", name, discoverPeers)
			peers = discoverPeers
		}
	}
	possiblePeers = peers

	if len(possiblePeers) > 0 {
		if rejected, ierr := s.startAsFollower(possiblePeers, s.Config.RetryTimes); rejected {
			log.Debugf("%s should work as standby for the cluster %v: %v", name, possiblePeers, ierr)
		} else if ierr != nil {
			log.Warnf("%s cannot connect to existing peers %v: %v", name, possiblePeers, ierr)
			err = ierr
		} else {
			toStart = true
		}
		return
	}

	// start as a leader in a new cluster
	s.isNewCluster = true
	log.Infof("%s is starting a new cluster", s.Config.Name)
	toStart = true
	return
}
Beispiel #27
0
func main() {
	parseFlags()

	// Load configuration.
	var config = server.NewConfig()
	if err := config.Load(os.Args[1:]); err != nil {
		log.Fatal("Configuration error:", err)
	}

	// Turn on logging.
	if config.VeryVerbose {
		log.Verbose = true
		raft.SetLogLevel(raft.Debug)
	} else if config.Verbose {
		log.Verbose = true
	}

	// Setup a default directory based on the machine name
	if config.DataDir == "" {
		config.DataDir = config.Name + ".etcd"
		log.Warnf("Using the directory %s as the etcd configuration directory because a directory was not specified. ", config.DataDir)
	}

	// Create data directory if it doesn't already exist.
	if err := os.MkdirAll(config.DataDir, 0744); err != nil {
		log.Fatalf("Unable to create path: %s", err)
	}

	// Load info object.
	info, err := config.Info()
	if err != nil {
		log.Fatal("info:", err)
	}
	if info.Name == "" {
		host, err := os.Hostname()
		if err != nil || host == "" {
			log.Fatal("Machine name required and hostname not set. e.g. '-n=machine_name'")
		}
		log.Warnf("Using hostname %s as the machine name. You must ensure this name is unique among etcd machines.", host)
		info.Name = host
	}

	// Retrieve TLS configuration.
	tlsConfig, err := info.EtcdTLS.Config()
	if err != nil {
		log.Fatal("Client TLS:", err)
	}
	peerTLSConfig, err := info.RaftTLS.Config()
	if err != nil {
		log.Fatal("Peer TLS:", err)
	}

	// Create etcd key-value store and registry.
	store := store.New()
	registry := server.NewRegistry(store)

	// Create peer server.
	ps := server.NewPeerServer(info.Name, config.DataDir, info.RaftURL, info.RaftListenHost, &peerTLSConfig, &info.RaftTLS, registry, store, config.SnapCount)
	ps.MaxClusterSize = config.MaxClusterSize
	ps.RetryTimes = config.MaxRetryAttempts

	// Create client server.
	s := server.New(info.Name, info.EtcdURL, info.EtcdListenHost, &tlsConfig, &info.EtcdTLS, ps, registry, store)
	if err := s.AllowOrigins(config.Cors); err != nil {
		panic(err)
	}

	ps.SetServer(s)

	// Run peer server in separate thread while the client server blocks.
	go func() {
		log.Fatal(ps.ListenAndServe(config.Snapshot, config.Machines))
	}()
	log.Fatal(s.ListenAndServe())
}
Beispiel #28
0
func main() {
	// Load configuration.
	var config = config.New()
	if err := config.Load(os.Args[1:]); err != nil {
		fmt.Println(server.Usage() + "\n")
		fmt.Println(err.Error() + "\n")
		os.Exit(1)
	} else if config.ShowVersion {
		fmt.Println("etcd version", server.ReleaseVersion)
		os.Exit(0)
	} else if config.ShowHelp {
		fmt.Println(server.Usage() + "\n")
		os.Exit(0)
	}

	// Enable options.
	if config.VeryVeryVerbose {
		log.Verbose = true
		raft.SetLogLevel(raft.Trace)
	} else if config.VeryVerbose {
		log.Verbose = true
		raft.SetLogLevel(raft.Debug)
	} else if config.Verbose {
		log.Verbose = true
	}
	if config.CPUProfileFile != "" {
		profile(config.CPUProfileFile)
	}

	if config.DataDir == "" {
		log.Fatal("The data dir was not set and could not be guessed from machine name")
	}

	// Create data directory if it doesn't already exist.
	if err := os.MkdirAll(config.DataDir, 0744); err != nil {
		log.Fatalf("Unable to create path: %s", err)
	}

	// Warn people if they have an info file
	info := filepath.Join(config.DataDir, "info")
	if _, err := os.Stat(info); err == nil {
		log.Warnf("All cached configuration is now ignored. The file %s can be removed.", info)
	}

	var mbName string
	if config.Trace() {
		mbName = config.MetricsBucketName()
		runtime.SetBlockProfileRate(1)
	}

	mb := metrics.NewBucket(mbName)

	if config.GraphiteHost != "" {
		err := mb.Publish(config.GraphiteHost)
		if err != nil {
			panic(err)
		}
	}

	// Retrieve CORS configuration
	corsInfo, err := ehttp.NewCORSInfo(config.CorsOrigins)
	if err != nil {
		log.Fatal("CORS:", err)
	}

	// Create etcd key-value store and registry.
	store := store.New()
	registry := server.NewRegistry(store)

	// Create stats objects
	followersStats := server.NewRaftFollowersStats(config.Name)
	serverStats := server.NewRaftServerStats(config.Name)

	// Calculate all of our timeouts
	heartbeatInterval := time.Duration(config.Peer.HeartbeatInterval) * time.Millisecond
	electionTimeout := time.Duration(config.Peer.ElectionTimeout) * time.Millisecond
	dialTimeout := (3 * heartbeatInterval) + electionTimeout
	responseHeaderTimeout := (3 * heartbeatInterval) + electionTimeout

	// Create peer server
	psConfig := server.PeerServerConfig{
		Name:           config.Name,
		Scheme:         config.PeerTLSInfo().Scheme(),
		URL:            config.Peer.Addr,
		SnapshotCount:  config.SnapshotCount,
		MaxClusterSize: config.MaxClusterSize,
		RetryTimes:     config.MaxRetryAttempts,
		RetryInterval:  config.RetryInterval,
	}
	ps := server.NewPeerServer(psConfig, registry, store, &mb, followersStats, serverStats)

	var psListener net.Listener
	if psConfig.Scheme == "https" {
		peerServerTLSConfig, err := config.PeerTLSInfo().ServerConfig()
		if err != nil {
			log.Fatal("peer server TLS error: ", err)
		}

		psListener, err = server.NewTLSListener(config.Peer.BindAddr, peerServerTLSConfig)
		if err != nil {
			log.Fatal("Failed to create peer listener: ", err)
		}
	} else {
		psListener, err = server.NewListener(config.Peer.BindAddr)
		if err != nil {
			log.Fatal("Failed to create peer listener: ", err)
		}
	}

	// Create raft transporter and server
	raftTransporter := server.NewTransporter(followersStats, serverStats, registry, heartbeatInterval, dialTimeout, responseHeaderTimeout)
	if psConfig.Scheme == "https" {
		raftClientTLSConfig, err := config.PeerTLSInfo().ClientConfig()
		if err != nil {
			log.Fatal("raft client TLS error: ", err)
		}
		raftTransporter.SetTLSConfig(*raftClientTLSConfig)
	}
	raftServer, err := raft.NewServer(config.Name, config.DataDir, raftTransporter, store, ps, "")
	if err != nil {
		log.Fatal(err)
	}
	raftServer.SetElectionTimeout(electionTimeout)
	raftServer.SetHeartbeatInterval(heartbeatInterval)
	ps.SetRaftServer(raftServer)

	// Create etcd server
	s := server.New(config.Name, config.Addr, ps, registry, store, &mb)

	if config.Trace() {
		s.EnableTracing()
	}

	var sListener net.Listener
	if config.EtcdTLSInfo().Scheme() == "https" {
		etcdServerTLSConfig, err := config.EtcdTLSInfo().ServerConfig()
		if err != nil {
			log.Fatal("etcd TLS error: ", err)
		}

		sListener, err = server.NewTLSListener(config.BindAddr, etcdServerTLSConfig)
		if err != nil {
			log.Fatal("Failed to create TLS etcd listener: ", err)
		}
	} else {
		sListener, err = server.NewListener(config.BindAddr)
		if err != nil {
			log.Fatal("Failed to create etcd listener: ", err)
		}
	}

	ps.SetServer(s)
	ps.Start(config.Snapshot, config.Discovery, config.Peers)

	go func() {
		log.Infof("peer server [name %s, listen on %s, advertised url %s]", ps.Config.Name, psListener.Addr(), ps.Config.URL)
		sHTTP := &ehttp.CORSHandler{ps.HTTPHandler(), corsInfo}
		log.Fatal(http.Serve(psListener, sHTTP))
	}()

	log.Infof("etcd server [name %s, listen on %s, advertised url %s]", s.Name, sListener.Addr(), s.URL())
	sHTTP := &ehttp.CORSHandler{s.HTTPHandler(), corsInfo}
	log.Fatal(http.Serve(sListener, sHTTP))
}
Beispiel #29
0
// Try all possible ways to find clusters to join
// Include log data in -data-dir, -discovery and -peers
//
// Peer discovery follows this order:
// 1. previous peers in -data-dir
// 2. -discovery
// 3. -peers
//
// TODO(yichengq): RaftServer should be started as late as possible.
// Current implementation to start it is not that good,
// and should be refactored later.
func (s *PeerServer) findCluster(discoverURL string, peers []string) {
	name := s.Config.Name
	isNewNode := s.raftServer.IsLogEmpty()

	// Try its best to find possible peers, and connect with them.
	if !isNewNode {
		// It is not allowed to join the cluster with existing peer address
		// This prevents old node joining with different name by mistake.
		if !s.checkPeerAddressNonconflict() {
			log.Fatalf("%v is not allowed to join the cluster with existing URL %v", s.Config.Name, s.Config.URL)
		}

		// Take old nodes into account.
		allPeers := s.getKnownPeers()
		// Discover registered peers.
		// TODO(yichengq): It may mess up discoverURL if this is
		// set wrong by mistake. This may need to refactor discovery
		// module. Fix it later.
		if discoverURL != "" {
			discoverPeers, _ := s.handleDiscovery(discoverURL)
			allPeers = append(allPeers, discoverPeers...)
		}
		allPeers = append(allPeers, peers...)
		allPeers = s.removeSelfFromList(allPeers)

		// If there is possible peer list, use it to find cluster.
		if len(allPeers) > 0 {
			// TODO(yichengq): joinCluster may fail if there's no leader for
			// current cluster. It should wait if the cluster is under
			// leader election, or the node with changed IP cannot join
			// the cluster then.
			if err := s.startAsFollower(allPeers, 1); err == nil {
				log.Debugf("%s joins to the previous cluster %v", name, allPeers)
				return
			}

			log.Warnf("%s cannot connect to previous cluster %v", name, allPeers)
		}

		// TODO(yichengq): Think about the action that should be done
		// if it cannot connect any of the previous known node.
		s.raftServer.Start()
		log.Debugf("%s is restarting the cluster %v", name, allPeers)
		return
	}

	// Attempt cluster discovery
	if discoverURL != "" {
		discoverPeers, discoverErr := s.handleDiscovery(discoverURL)
		// It is registered in discover url
		if discoverErr == nil {
			// start as a leader in a new cluster
			if len(discoverPeers) == 0 {
				log.Debugf("%s is starting a new cluster via discover service", name)
				s.startAsLeader()
			} else {
				log.Debugf("%s is joining a cluster %v via discover service", name, discoverPeers)
				if err := s.startAsFollower(discoverPeers, s.Config.RetryTimes); err != nil {
					log.Fatal(err)
				}
			}
			return
		}
		log.Warnf("%s failed to connect discovery service[%v]: %v", name, discoverURL, discoverErr)

		if len(peers) == 0 {
			log.Fatalf("%s, the new leader, must register itself to discovery service as required", name)
		}
	}

	if len(peers) > 0 {
		if err := s.startAsFollower(peers, s.Config.RetryTimes); err != nil {
			log.Fatalf("%s cannot connect to existing cluster %v", name, peers)
		}
		return
	}

	log.Infof("%s is starting a new cluster.", s.Config.Name)
	s.startAsLeader()
	return
}
Beispiel #30
0
// DataDirFromName sets the data dir from a machine name and issue a warning
// that etcd is guessing.
func (c *Config) DataDirFromName() {
	c.DataDir = c.Name + ".etcd"
	log.Warnf("Using the directory %s as the etcd curation directory because a directory was not specified. ", c.DataDir)

	return
}