// monitorPeerActivity has the leader periodically check for dead nodes and demote them.
func (s *PeerServer) monitorPeerActivity(closeChan chan bool) {
	for {
		select {
		case <-time.After(PeerActivityMonitorTimeout):
		case <-closeChan:
			return
		}

		// Ignore while this peer is not a leader.
		if s.raftServer.State() != raft.Leader {
			continue
		}

		// Check last activity for all peers.
		now := time.Now()
		promoteDelay := time.Duration(s.ClusterConfig().PromoteDelay) * time.Second
		peers := s.raftServer.Peers()
		for _, peer := range peers {
			// If the last response from the peer is longer ago than the promote delay
			// then automatically demote the peer.
			if !peer.LastActivity().IsZero() && now.Sub(peer.LastActivity()) > promoteDelay {
				log.Infof("%s: demoting node: %v; last activity %v ago", s.Config.Name, peer.Name, now.Sub(peer.LastActivity()))
				if _, err := s.raftServer.Do(&DemoteCommand{Name: peer.Name}); err != nil {
					log.Infof("%s: warning: autodemotion error: %v", s.Config.Name, err)
				}
				continue
			}
		}
	}
}
func (e *Etcd) runServer() {
	var removeNotify <-chan bool

	for {
		if e.mode == PeerMode {
			log.Infof("%v starting in peer mode", e.Config.Name)
			// Starting the peer server should be followed closely by listening on its port.
			// If not, it may leave many requests unaccepted, or fail to receive heartbeats from the cluster.
			// One severe problem caused by failing to receive heartbeats: when a second node joins a
			// one-node cluster, the cluster can stop working as long as the two nodes cannot exchange messages.
			e.PeerServer.Start(e.Config.Snapshot, e.Config.ClusterConfig())
			removeNotify = e.PeerServer.RemoveNotify()
		} else {
			log.Infof("%v starting in standby mode", e.Config.Name)
			e.StandbyServer.Start()
			removeNotify = e.StandbyServer.RemoveNotify()
		}

		// etcd server is ready to accept connections, notify waiters.
		e.onceReady.Do(func() { close(e.readyNotify) })

		select {
		case <-e.closeChan:
			e.PeerServer.Stop()
			e.StandbyServer.Stop()
			return
		case <-removeNotify:
		}

		if e.mode == PeerMode {
			peerURLs := e.Registry.PeerURLs(e.PeerServer.RaftServer().Leader(), e.Config.Name)
			e.StandbyServer.SyncCluster(peerURLs)
			e.setMode(StandbyMode)
		} else {
			// Create etcd key-value store and registry.
			e.Store = store.New()
			e.Registry = server.NewRegistry(e.Store)
			e.PeerServer.SetStore(e.Store)
			e.PeerServer.SetRegistry(e.Registry)
			e.Server.SetStore(e.Store)
			e.Server.SetRegistry(e.Registry)

			// Generate new peer server here.
			// TODO(yichengq): raft server cannot be started after stopped.
			// It should be removed when raft restart is implemented.
			heartbeatInterval := time.Duration(e.Config.Peer.HeartbeatInterval) * time.Millisecond
			electionTimeout := time.Duration(e.Config.Peer.ElectionTimeout) * time.Millisecond
			raftServer, err := raft.NewServer(e.Config.Name, e.Config.DataDir, e.PeerServer.RaftServer().Transporter(), e.Store, e.PeerServer, "")
			if err != nil {
				log.Fatal(err)
			}
			raftServer.SetElectionTimeout(electionTimeout)
			raftServer.SetHeartbeatInterval(heartbeatInterval)
			e.PeerServer.SetRaftServer(raftServer, e.Config.Snapshot)
			e.StandbyServer.SetRaftServer(raftServer)

			e.PeerServer.SetJoinIndex(e.StandbyServer.JoinIndex())
			e.setMode(PeerMode)
		}
	}
}
// monitorActiveSize has the leader periodically check the status of cluster
// nodes and swap them out for proxies as needed.
func (s *PeerServer) monitorActiveSize(closeChan chan bool) {
	for {
		select {
		case <-time.After(ActiveMonitorTimeout):
		case <-closeChan:
			return
		}

		// Ignore while this peer is not a leader.
		if s.raftServer.State() != raft.Leader {
			continue
		}

		// Retrieve target active size and actual active size.
		activeSize := s.ClusterConfig().ActiveSize
		peerCount := s.registry.PeerCount()
		proxies := s.registry.Proxies()
		peers := s.registry.Peers()
		if index := sort.SearchStrings(peers, s.Config.Name); index < len(peers) && peers[index] == s.Config.Name {
			peers = append(peers[:index], peers[index+1:]...)
		}

		// If we have more active nodes than we should then demote.
		if peerCount > activeSize {
			peer := peers[rand.Intn(len(peers))]
			log.Infof("%s: demoting: %v", s.Config.Name, peer)
			if _, err := s.raftServer.Do(&DemoteCommand{Name: peer}); err != nil {
				log.Infof("%s: warning: demotion error: %v", s.Config.Name, err)
			}
			continue
		}

		// If we don't have enough active nodes then try to promote a proxy.
		if peerCount < activeSize && len(proxies) > 0 {
		loop:
			for _, i := range rand.Perm(len(proxies)) {
				proxy := proxies[i]
				proxyPeerURL, _ := s.registry.ProxyPeerURL(proxy)
				log.Infof("%s: attempting to promote: %v (%s)", s.Config.Name, proxy, proxyPeerURL)

				// Notify proxy to promote itself.
				client := &http.Client{
					Transport: &http.Transport{
						DisableKeepAlives:     false,
						ResponseHeaderTimeout: ActiveMonitorTimeout,
					},
				}
				resp, err := client.Post(fmt.Sprintf("%s/promote", proxyPeerURL), "application/json", nil)
				if err != nil {
					log.Infof("%s: warning: promotion error: %v", s.Config.Name, err)
					continue
				} else if resp.StatusCode != http.StatusOK {
					log.Infof("%s: warning: promotion failure: %v", s.Config.Name, resp.StatusCode)
					continue
				}
				break loop
			}
		}
	}
}
// monitorPeerActivity has the leader periodically check for dead nodes and remove them.
func (s *PeerServer) monitorPeerActivity() {
	for {
		timer := time.NewTimer(PeerActivityMonitorTimeout)
		defer timer.Stop()
		select {
		case <-s.closeChan:
			return
		case <-timer.C:
		}

		// Ignore while this peer is not a leader.
		if s.raftServer.State() != raft.Leader {
			continue
		}

		// Check last activity for all peers.
		now := time.Now()
		removeDelay := time.Duration(int64(s.ClusterConfig().RemoveDelay * float64(time.Second)))
		peers := s.raftServer.Peers()
		for _, peer := range peers {
			// If the last response from the peer is longer ago than the remove delay
			// then automatically remove the peer.
			if !peer.LastActivity().IsZero() && now.Sub(peer.LastActivity()) > removeDelay {
				log.Infof("%s: removing node: %v; last activity %v ago", s.Config.Name, peer.Name, now.Sub(peer.LastActivity()))
				if _, err := s.raftServer.Do(&RemoveCommand{Name: peer.Name}); err != nil {
					log.Infof("%s: warning: autodemotion error: %v", s.Config.Name, err)
				}
				continue
			}
		}
	}
}
// monitorActiveSize has the leader periodically check the status of cluster
// nodes and swap them out for standbys as needed.
func (s *PeerServer) monitorActiveSize() {
	for {
		timer := time.NewTimer(ActiveMonitorTimeout)
		defer timer.Stop()
		select {
		case <-s.closeChan:
			return
		case <-timer.C:
		}

		// Ignore while this peer is not a leader.
		if s.raftServer.State() != raft.Leader {
			continue
		}

		// Retrieve target active size and actual active size.
		activeSize := s.ClusterConfig().ActiveSize
		peers := s.registry.Names()
		peerCount := len(peers)
		if index := sort.SearchStrings(peers, s.Config.Name); index < len(peers) && peers[index] == s.Config.Name {
			peers = append(peers[:index], peers[index+1:]...)
		}

		// If we have more active nodes than we should then remove.
		if peerCount > activeSize {
			peer := peers[rand.Intn(len(peers))]
			log.Infof("%s: removing: %v", s.Config.Name, peer)
			if _, err := s.raftServer.Do(&RemoveCommand{Name: peer}); err != nil {
				log.Infof("%s: warning: remove error: %v", s.Config.Name, err)
			}
			continue
		}
	}
}
// logSnapshot logs about the snapshot that was taken.
func (s *PeerServer) logSnapshot(err error, currentIndex, count uint64) {
	info := fmt.Sprintf("%s: snapshot of %d events at index %d", s.Config.Name, count, currentIndex)

	if err != nil {
		log.Infof("%s attempted and failed: %v", info, err)
	} else {
		log.Infof("%s completed", info)
	}
}
func (d *Discoverer) Do(discoveryURL string, name string, peer string) (peers []string, err error) {
	d.name = name
	d.peer = peer
	d.discoveryURL = discoveryURL

	u, err := url.Parse(discoveryURL)
	if err != nil {
		return
	}

	// prefix is prepended to all keys for this discovery
	d.prefix = strings.TrimPrefix(u.Path, "/v2/keys/")

	// keep the old path in case we need to set the KeyPrefix below
	oldPath := u.Path
	u.Path = ""

	// Connect to a scheme://host, not a full URL with path
	log.Infof("Discovery via %s using prefix %s.", u.String(), d.prefix)
	d.client = etcd.NewClient([]string{u.String()})

	if !strings.HasPrefix(oldPath, "/v2/keys") {
		d.client.SetKeyPrefix("")
	}

	// Register this machine first and announce that we are a member of
	// this cluster
	err = d.heartbeat()
	if err != nil {
		return
	}

	// Start the very slow heartbeat to the cluster now, in anticipation
	// that everything is going to go all right from here on
	go d.startHeartbeat()

	// Attempt to take the leadership role; if there is no error, we are it!
	resp, err := d.client.Create(path.Join(d.prefix, stateKey), startedState, 0)

	// Bail out on unexpected errors
	if err != nil {
		if clientErr, ok := err.(*etcd.EtcdError); !ok || clientErr.ErrorCode != etcdErr.EcodeNodeExist {
			return nil, err
		}
	}

	// If we got a response then the CAS was successful, so we are the leader
	if resp != nil && resp.Node.Value == startedState {
		// We are the leader, we have no peers
		log.Infof("Discovery _state was empty, so this machine is the initial leader.")
		return nil, nil
	}

	// Fall through to finding the other discovery peers
	return d.findPeers()
}
func (s *StandbyServer) SyncCluster(peers []string) error {
	for i, url := range peers {
		peers[i] = s.fullPeerURL(url)
	}

	if err := s.syncCluster(peers); err != nil {
		log.Infof("fail syncing cluster(%v): %v", s.ClusterURLs(), err)
		return err
	}

	log.Infof("set cluster(%v) for standby server", s.ClusterURLs())
	return nil
}
// logHeartbeatTimeout logs about the edge triggered heartbeat timeout event
// only if we haven't warned within a reasonable interval.
func (s *PeerServer) logHeartbeatTimeout(peer *raft.Peer) {
	b, ok := s.logBackoffs[peer.Name]
	if !ok {
		b = &logBackoff{time.Time{}, time.Second, 1}
		s.logBackoffs[peer.Name] = b
	}

	if peer.LastActivity().After(b.next) {
		b.next = time.Time{}
		b.backoff = time.Second
		b.count = 1
	}

	if b.next.After(time.Now()) {
		b.count++
		return
	}

	b.backoff = 2 * b.backoff
	if b.backoff > MaxHeartbeatTimeoutBackoff {
		b.backoff = MaxHeartbeatTimeoutBackoff
	}
	b.next = time.Now().Add(b.backoff)

	log.Infof("%s: warning: heartbeat time out peer=%q missed=%d backoff=%q", s.Config.Name, peer.Name, b.count, b.backoff)
}
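// NOTE: the logBackoff type is not shown in this excerpt. The following is a
// minimal sketch inferred from how logHeartbeatTimeout uses it; the field
// names come from the accesses b.next, b.backoff, and b.count, while the
// exact types and comments are assumptions.
type logBackoff struct {
	next    time.Time     // do not warn again before this instant
	backoff time.Duration // current backoff, doubled up to MaxHeartbeatTimeoutBackoff
	count   int           // heartbeat timeouts observed since the last warning
}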
// Retrieves the URLs for all nodes using url function.
func (r *Registry) urls(leaderName, selfName string, url func(name string) (string, bool)) []string {
	r.Lock()
	defer r.Unlock()

	// Build list including the leader and self.
	urls := make([]string, 0)
	if url, _ := url(leaderName); len(url) > 0 {
		urls = append(urls, url)
	}

	// Retrieve a list of all nodes.
	if e, _ := r.store.Get(RegistryKey, false, false); e != nil {
		// Lookup the URL for each one.
		for _, pair := range e.Node.Nodes {
			_, name := filepath.Split(pair.Key)
			if url, _ := url(name); len(url) > 0 && name != leaderName {
				urls = append(urls, url)
			}
		}
	}

	log.Infof("URLs: %s / %s (%s)", leaderName, selfName, strings.Join(urls, ","))

	return urls
}
func (d *Discoverer) findPeers() (peers []string, err error) {
	resp, err := d.client.Get(path.Join(d.prefix), false, true)
	if err != nil {
		return nil, err
	}

	node := resp.Node
	if node == nil {
		return nil, fmt.Errorf("%s key doesn't exist.", d.prefix)
	}

	for _, n := range node.Nodes {
		// Skip our own entry in the list, there is no point
		if strings.HasSuffix(n.Key, "/"+d.name) {
			continue
		}
		peers = append(peers, n.Value)
	}

	if len(peers) == 0 {
		return nil, errors.New("Discovery found an initialized cluster but no reachable peers are registered.")
	}

	log.Infof("Discovery found peers %v", peers)

	return
}
// Start to listen and respond to raft commands
func (s *PeerServer) startTransport(scheme string, tlsConf tls.Config) error {
	log.Infof("raft server [name %s, listen on %s, advertised url %s]", s.name, s.bindAddr, s.url)

	router := mux.NewRouter()

	s.httpServer = &http.Server{
		Handler:   router,
		TLSConfig: &tlsConf,
		Addr:      s.bindAddr,
	}

	// internal commands
	router.HandleFunc("/name", s.NameHttpHandler)
	router.HandleFunc("/version", s.VersionHttpHandler)
	router.HandleFunc("/version/{version:[0-9]+}/check", s.VersionCheckHttpHandler)
	router.HandleFunc("/upgrade", s.UpgradeHttpHandler)
	router.HandleFunc("/join", s.JoinHttpHandler)
	router.HandleFunc("/remove/{name:.+}", s.RemoveHttpHandler)
	router.HandleFunc("/vote", s.VoteHttpHandler)
	router.HandleFunc("/log", s.GetLogHttpHandler)
	router.HandleFunc("/log/append", s.AppendEntriesHttpHandler)
	router.HandleFunc("/snapshot", s.SnapshotHttpHandler)
	router.HandleFunc("/snapshotRecovery", s.SnapshotRecoveryHttpHandler)
	router.HandleFunc("/etcdURL", s.EtcdURLHttpHandler)

	if scheme == "http" {
		return s.listenAndServe()
	} else {
		return s.listenAndServeTLS(s.tlsInfo.CertFile, s.tlsInfo.KeyFile)
	}
}
// monitorCluster assumes that the machine has tried to join the cluster and
// failed, so it waits for the interval at the beginning.
func (s *StandbyServer) monitorCluster() {
	for {
		timer := time.NewTimer(time.Duration(int64(s.SyncInterval * float64(time.Second))))
		defer timer.Stop()
		select {
		case <-s.closeChan:
			return
		case <-timer.C:
		}

		if err := s.syncCluster(nil); err != nil {
			log.Warnf("fail syncing cluster(%v): %v", s.ClusterURLs(), err)
			continue
		}

		leader := s.ClusterLeader()
		if leader == nil {
			log.Warnf("fail getting leader from cluster(%v)", s.ClusterURLs())
			continue
		}

		if err := s.join(leader.PeerURL); err != nil {
			log.Debugf("fail joining through leader %v: %v", leader, err)
			continue
		}

		log.Infof("join through leader %v", leader.PeerURL)
		go func() {
			s.Stop()
			close(s.removeNotify)
		}()
		return
	}
}
// SetNOCOWFile sets NOCOW flag for file
func SetNOCOWFile(path string) error {
	file, err := os.Open(path)
	if err != nil {
		return err
	}
	defer file.Close()

	fileinfo, err := file.Stat()
	if err != nil {
		return err
	}
	if fileinfo.IsDir() {
		return fmt.Errorf("skip directory")
	}
	if fileinfo.Size() != 0 {
		return fmt.Errorf("skip nonempty file")
	}

	var attr int
	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), FS_IOC_GETFLAGS, uintptr(unsafe.Pointer(&attr))); errno != 0 {
		return errno
	}
	attr |= FS_NOCOW_FL
	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), FS_IOC_SETFLAGS, uintptr(unsafe.Pointer(&attr))); errno != 0 {
		return errno
	}
	log.Infof("Set NOCOW to path %v succeeded", path)
	return nil
}
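// NOTE: the constants used by SetNOCOWFile and IsBtrfs (FS_IOC_GETFLAGS,
// FS_IOC_SETFLAGS, FS_NOCOW_FL, BTRFS_SUPER_MAGIC) are declared elsewhere in
// the package. A sketch of plausible declarations is shown below; the values
// are the standard Linux ones for the 64-bit ioctl encoding, and the exact
// form in the original source may differ.
const (
	FS_IOC_GETFLAGS   = 0x80086601 // ioctl: read inode flags
	FS_IOC_SETFLAGS   = 0x40086602 // ioctl: write inode flags
	FS_NOCOW_FL       = 0x00800000 // inode flag: do not copy-on-write
	BTRFS_SUPER_MAGIC = 0x9123683E // f_type reported by statfs for btrfs
)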
// Parses non-configuration flags.
func parseFlags() {
	var versionFlag bool
	var cpuprofile string

	f := flag.NewFlagSet(os.Args[0], -1)
	f.SetOutput(ioutil.Discard)
	f.BoolVar(&versionFlag, "version", false, "print the version and exit")
	f.StringVar(&cpuprofile, "cpuprofile", "", "write cpu profile to file")
	f.Parse(os.Args[1:])

	// Print version if necessary.
	if versionFlag {
		fmt.Println(server.ReleaseVersion)
		os.Exit(0)
	}

	// Begin CPU profiling if specified.
	if cpuprofile != "" {
		f, err := os.Create(cpuprofile)
		if err != nil {
			log.Fatal(err)
		}

		pprof.StartCPUProfile(f)

		c := make(chan os.Signal, 1)
		signal.Notify(c, os.Interrupt)
		go func() {
			sig := <-c
			log.Infof("captured %v, stopping profiler and exiting..", sig)
			pprof.StopCPUProfile()
			os.Exit(1)
		}()
	}
}
// applyJoin attempts to join a machine to the cluster.
func applyJoin(c *JoinCommand, context raft.Context) (uint64, error) {
	ps, _ := context.Server().Context().(*PeerServer)
	commitIndex := context.CommitIndex()

	// Make sure we're not getting a cached value from the registry.
	ps.registry.Invalidate(c.Name)

	// Check if the join command is from a previous peer, who lost all its previous log.
	if peerURL, ok := ps.registry.PeerURL(c.Name); ok {
		// If the previous node restarts with a different peer URL,
		// update its information.
		if peerURL != c.RaftURL {
			log.Infof("Rejoin with %v instead of %v from %v", c.RaftURL, peerURL, c.Name)
			if err := updatePeerURL(c, ps); err != nil {
				return 0, err
			}
		}
		if c.Name == context.Server().Name() {
			ps.removedInLog = false
		}
		return commitIndex, nil
	}

	// Check if the join command adds an instance that collides with an existing one on peer URL.
	peerURLs := ps.registry.PeerURLs(ps.raftServer.Leader(), c.Name)
	for _, peerURL := range peerURLs {
		if peerURL == c.RaftURL {
			log.Warnf("%v tries to join the cluster with existing URL %v", c.Name, c.EtcdURL)
			return 0, etcdErr.NewError(etcdErr.EcodeExistingPeerAddr, c.EtcdURL, context.CommitIndex())
		}
	}

	// Check peer number in the cluster.
	count := ps.registry.Count()
	// ClusterConfig doesn't init until the first machine is added.
	if count > 0 && count >= ps.ClusterConfig().ActiveSize {
		log.Debug("Reject join request from ", c.Name)
		return 0, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", context.CommitIndex())
	}

	// Add to shared peer registry.
	ps.registry.Register(c.Name, c.RaftURL, c.EtcdURL)

	// Add peer in raft.
	if err := context.Server().AddPeer(c.Name, ""); err != nil {
		return 0, err
	}

	// Add peer stats.
	if c.Name != ps.RaftServer().Name() {
		ps.followersStats.Followers[c.Name] = &raftFollowerStats{}
		ps.followersStats.Followers[c.Name].Latency.Minimum = 1 << 63
	}

	if c.Name == context.Server().Name() {
		ps.removedInLog = false
	}
	return commitIndex, nil
}
// Attempt to rejoin the cluster as a peer.
func (ps *PeerServer) PromoteHttpHandler(w http.ResponseWriter, req *http.Request) {
	log.Infof("%s attempting to promote in cluster: %s", ps.Config.Name, ps.standbyPeerURL)
	url, err := url.Parse(ps.standbyPeerURL)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	err = ps.joinByPeer(ps.raftServer, url.Host, ps.Config.Scheme)
	if err != nil {
		log.Infof("%s error while promoting: %v", ps.Config.Name, err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	log.Infof("%s promoted in the cluster", ps.Config.Name)
	w.WriteHeader(http.StatusOK)
}
// Try all possible ways to find clusters to join,
// including -discovery, -peers, and log data in -data-dir.
//
// Peer discovery follows this order:
// 1. -discovery
// 2. -peers
// 3. previous peers in -data-dir
func (s *PeerServer) findCluster(discoverURL string, peers []string) {
	// Attempt cluster discovery
	toDiscover := discoverURL != ""
	if toDiscover {
		discoverPeers, discoverErr := s.handleDiscovery(discoverURL)
		// It is registered in the discover url
		if discoverErr == nil {
			// start as a leader in a new cluster
			if len(discoverPeers) == 0 {
				log.Debug("This peer is starting a brand new cluster based on discover URL.")
				s.startAsLeader()
			} else {
				s.startAsFollower(discoverPeers)
			}
			return
		}
	}

	hasPeerList := len(peers) > 0
	// if there is log data in the data dir, append previous peers to the configured peers
	// to find the cluster
	prevPeers := s.registry.PeerURLs(s.raftServer.Leader(), s.Config.Name)
	for i := 0; i < len(prevPeers); i++ {
		u, err := url.Parse(prevPeers[i])
		if err != nil {
			log.Debug("rejoin cannot parse url: ", err)
		}
		prevPeers[i] = u.Host
	}
	peers = append(peers, prevPeers...)

	// if there is a backup peer list, use it to find the cluster
	if len(peers) > 0 {
		ok := s.joinCluster(peers)
		if !ok {
			log.Warn("No living peers are found!")
		} else {
			log.Debugf("%s restart as a follower based on peers[%v]", s.Config.Name, peers)
			return
		}
	}

	if !s.raftServer.IsLogEmpty() {
		log.Debugf("Entire cluster is down! %v will restart the cluster.", s.Config.Name)
		return
	}

	if toDiscover {
		log.Fatalf("Discovery failed, no available peers in backup list, and no log data")
	}

	if hasPeerList {
		log.Fatalf("No available peers in backup list, and no log data")
	}

	log.Infof("This peer is starting a brand new cluster now.")
	s.startAsLeader()
}
// Start to listen and respond to etcd client commands
func (s *Server) ListenAndServe() error {
	log.Infof("etcd server [name %s, listen on %s, advertised url %s]", s.name, s.Server.Addr, s.url)

	if s.tlsConf.Scheme == "http" {
		return s.listenAndServe()
	} else {
		return s.listenAndServeTLS(s.tlsInfo.CertFile, s.tlsInfo.KeyFile)
	}
}
func (s *PeerServer) joinCluster(cluster []string) (bool, error) {
	for _, peer := range cluster {
		if len(peer) == 0 {
			continue
		}

		if rejected, err := s.joinByPeer(s.raftServer, peer, s.Config.Scheme); rejected {
			return true, fmt.Errorf("rejected by peer %s: %v", peer, err)
		} else if err == nil {
			log.Infof("%s joined the cluster via peer %s", s.Config.Name, peer)
			return false, nil
		} else {
			log.Infof("%s attempted to join via %s failed: %v", s.Config.Name, peer, err)
		}
	}
	return false, fmt.Errorf("unreachable cluster")
}
// monitorTimeoutThreshold groups timeout threshold events together and prints
// them as a single log line.
func (s *PeerServer) monitorTimeoutThreshold(closeChan chan bool) {
	for {
		select {
		case value := <-s.timeoutThresholdChan:
			log.Infof("%s: warning: heartbeat near election timeout: %v", s.Config.Name, value)
		case <-closeChan:
			return
		}

		time.Sleep(ThresholdMonitorTimeout)
	}
}
// raftEventLogger converts events from the Raft server into log messages.
func (s *PeerServer) raftEventLogger(event raft.Event) {
	value := event.Value()
	prevValue := event.PrevValue()
	if value == nil {
		value = "<nil>"
	}
	if prevValue == nil {
		prevValue = "<nil>"
	}

	switch event.Type() {
	case raft.StateChangeEventType:
		log.Infof("%s: state changed from '%v' to '%v'.", s.Config.Name, prevValue, value)
	case raft.TermChangeEventType:
		log.Infof("%s: term #%v started.", s.Config.Name, value)
	case raft.LeaderChangeEventType:
		log.Infof("%s: leader changed from '%v' to '%v'.", s.Config.Name, prevValue, value)
	case raft.AddPeerEventType:
		log.Infof("%s: peer added: '%v'", s.Config.Name, value)
	case raft.RemovePeerEventType:
		log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
	case raft.HeartbeatIntervalEventType:
		var name = "<unknown>"
		if peer, ok := value.(*raft.Peer); ok {
			name = peer.Name
		}
		log.Infof("%s: warning: heartbeat timed out: '%v'", s.Config.Name, name)
	case raft.ElectionTimeoutThresholdEventType:
		select {
		case s.timeoutThresholdChan <- value:
		default:
		}
	}
}
// getKnownPeers gets the previous peers from log
func (s *PeerServer) getKnownPeers() []string {
	peers := s.registry.PeerURLs(s.raftServer.Leader(), s.Config.Name)
	log.Infof("Peer URLs in log: %s / %s (%s)", s.raftServer.Leader(), s.Config.Name, strings.Join(peers, ","))

	for i := range peers {
		u, err := url.Parse(peers[i])
		if err != nil {
			log.Debugf("getKnownPeers cannot parse url %v", peers[i])
		}
		peers[i] = u.Host
	}
	return peers
}
// Join a server to the cluster
func (c *JoinCommandV1) Apply(context raft.Context) (interface{}, error) {
	ps, _ := context.Server().Context().(*PeerServer)

	b := make([]byte, 8)
	binary.PutUvarint(b, context.CommitIndex())

	// Make sure we're not getting a cached value from the registry.
	ps.registry.Invalidate(c.Name)

	// Check if the join command is from a previous peer, who lost all its previous log.
	if peerURL, ok := ps.registry.PeerURL(c.Name); ok {
		// If the previous node restarts with a different peer URL,
		// update its information.
		if peerURL != c.RaftURL {
			log.Infof("Rejoin with %v instead of %v from %v", c.RaftURL, peerURL, c.Name)
			if err := c.updatePeerURL(ps); err != nil {
				return []byte{0}, err
			}
		}
		return b, nil
	}

	// Check if the join command adds an instance that collides with an existing one on peer URL.
	peerURLs := ps.registry.PeerURLs(ps.raftServer.Leader(), c.Name)
	for _, peerURL := range peerURLs {
		if peerURL == c.RaftURL {
			log.Warnf("%v tries to join the cluster with existing URL %v", c.Name, c.EtcdURL)
			return []byte{0}, etcdErr.NewError(etcdErr.EcodeExistingPeerAddr, c.EtcdURL, context.CommitIndex())
		}
	}

	// Check peer number in the cluster.
	if ps.registry.PeerCount() >= ps.ClusterConfig().ActiveSize {
		log.Debug("Reject join request from ", c.Name)
		return []byte{0}, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", context.CommitIndex())
	}

	// Add to shared peer registry.
	ps.registry.RegisterPeer(c.Name, c.RaftURL, c.EtcdURL)

	// Add peer in raft.
	err := context.Server().AddPeer(c.Name, "")

	// Add peer stats.
	if c.Name != ps.RaftServer().Name() {
		ps.followersStats.Followers[c.Name] = &raftFollowerStats{}
		ps.followersStats.Followers[c.Name].Latency.Minimum = 1 << 63
	}

	return b, err
}
// profile starts CPU profiling.
func profile(path string) {
	f, err := os.Create(path)
	if err != nil {
		log.Fatal(err)
	}
	pprof.StartCPUProfile(f)

	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt)
	go func() {
		sig := <-c
		log.Infof("captured %v, stopping profiler and exiting..", sig)
		pprof.StopCPUProfile()
		os.Exit(1)
	}()
}
func (s *PeerServer) startAsFollower(cluster []string, retryTimes int) (bool, error) {
	// start as a follower in an existing cluster
	for i := 0; ; i++ {
		if rejected, err := s.joinCluster(cluster); rejected {
			return true, err
		} else if err == nil {
			return false, nil
		}

		if i == retryTimes-1 {
			break
		}

		log.Infof("%v is unable to join the cluster using any of the peers %v at %dth time. Retrying in %.1f seconds", s.Config.Name, cluster, i, s.Config.RetryInterval)
		time.Sleep(time.Second * time.Duration(s.Config.RetryInterval))
		continue
	}

	return false, fmt.Errorf("fail joining the cluster via given peers after %d retries", retryTimes)
}
// IsBtrfs checks whether the file is on btrfs.
func IsBtrfs(path string) bool {
	// btrfs is a linux-only filesystem;
	// exit early on other platforms
	if runtime.GOOS != "linux" {
		return false
	}
	var buf syscall.Statfs_t
	if err := syscall.Statfs(path, &buf); err != nil {
		log.Warnf("Failed to statfs: %v", err)
		return false
	}
	log.Debugf("The type of path %v is %v", path, buf.Type)
	if buf.Type != BTRFS_SUPER_MAGIC {
		return false
	}
	log.Infof("The path %v is in btrfs", path)
	return true
}
// monitorTimeoutThreshold groups timeout threshold events together and prints
// them as a single log line.
func (s *PeerServer) monitorTimeoutThreshold() {
	for {
		select {
		case <-s.closeChan:
			return
		case value := <-s.timeoutThresholdChan:
			log.Infof("%s: warning: heartbeat near election timeout: %v", s.Config.Name, value)
		}

		timer := time.NewTimer(ThresholdMonitorTimeout)
		defer timer.Stop()
		select {
		case <-s.closeChan:
			return
		case <-timer.C:
		}
	}
}
// Apply executes the command.
func (c *DemoteCommand) Apply(context raft.Context) (interface{}, error) {
	ps, _ := context.Server().Context().(*PeerServer)

	// Ignore this command if there is no peer.
	if !ps.registry.PeerExists(c.Name) {
		return nil, fmt.Errorf("peer does not exist: %s", c.Name)
	}

	// Save URLs.
	clientURL, _ := ps.registry.ClientURL(c.Name)
	peerURL, _ := ps.registry.PeerURL(c.Name)

	// Remove node from the shared registry.
	err := ps.registry.UnregisterPeer(c.Name)
	if err != nil {
		log.Debugf("Demote peer %s: Error while unregistering (%v)", c.Name, err)
		return nil, err
	}

	// Delete from stats
	delete(ps.followersStats.Followers, c.Name)

	// Remove peer in raft
	err = context.Server().RemovePeer(c.Name)
	if err != nil {
		log.Debugf("Demote peer %s: (%v)", c.Name, err)
		return nil, err
	}

	// Register node as a standby.
	ps.registry.RegisterStandby(c.Name, peerURL, clientURL)

	// Update mode if this change applies to this server.
	if c.Name == ps.Config.Name {
		log.Infof("Demote peer %s: Set mode to standby with %s", c.Name, ps.server.Leader())
		ps.standbyPeerURL, _ = ps.registry.PeerURL(ps.server.Leader())
		go ps.setMode(StandbyMode)
	}

	return nil, nil
}
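// NOTE: the DemoteCommand type itself is not part of this excerpt. A minimal
// sketch is shown below, inferred from its construction as
// &DemoteCommand{Name: peer.Name} in monitorPeerActivity/monitorActiveSize.
// The json tag and the string returned by CommandName are assumptions.
type DemoteCommand struct {
	Name string `json:"name"`
}

// CommandName returns the name used to identify this command in the Raft log.
func (c *DemoteCommand) CommandName() string {
	return "etcd:demote"
}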
// Helper function to do discovery and return results in the expected format.
func (s *PeerServer) handleDiscovery(discoverURL string) (peers []string, err error) {
	peers, err = discovery.Do(discoverURL, s.Config.Name, s.Config.URL)

	// Warn about errors coming from discovery. This isn't fatal
	// since the user might have provided a peer list elsewhere,
	// or there is some log in the data dir.
	if err != nil {
		log.Warnf("Discovery encountered an error: %v", err)
		return
	}

	for i := range peers {
		// Strip the scheme off of the peer if it has one
		// TODO(bp): clean this up!
		purl, err := url.Parse(peers[i])
		if err == nil {
			peers[i] = purl.Host
		}
	}

	log.Infof("Discovery fetched back peer list: %v", peers)

	return
}