// RunTask returns nil when the underlying task ends or the error it
// generated.
func (ze *zElector) RunTask(task *electorTask) error {
	leaderPath := path.Join(ze.path, "leader")
	for {
		_, err := zkhelper.CreateRecursive(ze.zconn, leaderPath, "", 0, zk.WorldACL(zkhelper.PERM_FILE))
		if err == nil || zkhelper.ZkErrorEqual(err, zk.ErrNodeExists) {
			break
		}
		log.Warnf("election leader create failed: %v", err)
		time.Sleep(500 * time.Millisecond)
	}

	for {
		err := ze.Lock("RunTask")
		if err != nil {
			log.Warnf("election lock failed: %v", err)
			if err == zkhelper.ErrInterrupted {
				return zkhelper.ErrInterrupted
			}
			continue
		}

		// Confirm your win and deliver acceptance speech. This notifies
		// listeners who will have been watching the leader node for
		// changes.
		_, err = ze.zconn.Set(leaderPath, []byte(ze.contents), -1)
		if err != nil {
			log.Warnf("election promotion failed: %v", err)
			continue
		}

		log.Infof("election promote leader %v", leaderPath)
		taskErrChan := make(chan error)
		go func() {
			taskErrChan <- task.Run()
		}()

	watchLeader:
		// Watch the leader so we can get notified if something goes wrong.
		data, _, watch, err := ze.zconn.GetW(leaderPath)
		if err != nil {
			log.Warnf("election unable to watch leader node %v %v", leaderPath, err)
			// FIXME(msolo) Add delay
			goto watchLeader
		}

		if string(data) != ze.contents {
			log.Warnf("election unable to promote leader")
			task.Stop()
			// We won the election, but we didn't become the leader. How is that possible?
			// (see Bush v. Gore for some inspiration)
			// It means:
			//  1. Someone isn't playing by the election rules (a bad actor).
			//     Hard to detect - let's assume we don't have this problem. :)
			//  2. We lost our connection somehow and the ephemeral lock was cleared,
			//     allowing someone else to win the election.
			continue
		}

		// This is where we start our target process and watch for its failure.
	waitForEvent:
		select {
		case <-ze.interrupted:
			log.Warn("election interrupted - stop child process")
			task.Stop()
			// Once the process dies from the signal, this will all tear down.
			goto waitForEvent
		case taskErr := <-taskErrChan:
			// If our code fails, unlock to trigger an election.
			log.Infof("election child process ended: %v", taskErr)
			ze.Unlock()
			if task.Interrupted() {
				log.Warnf("election child process interrupted - stepping down")
				return zkhelper.ErrInterrupted
			}
			continue
		case zevent := <-watch:
			// We had a zk connection hiccup. We have a few choices,
			// but it depends on the constraints and the events.
			//
			// If we get SESSION_EXPIRED our connection loss triggered an
			// election that we won't have won and thus the lock was
			// automatically freed. We have no choice but to start over.
			if zevent.State == zk.StateExpired {
				log.Warnf("election leader watch expired")
				task.Stop()
				continue
			}

			// Otherwise, we had an intermittent issue or something touched
			// the node. Either we lost our position or someone broke
			// protocol and touched the leader node. We just reconnect and
			// revalidate. In the meantime, assume we are still the leader
			// until we determine otherwise.
			//
			// On a reconnect we will be able to see the leader
			// information. If we still hold the position, great. If not, we
			// kill the associated process.
			//
			// On a leader node change, we need to perform the same
			// validation. It's possible an election completes without the
			// old leader realizing he is out of touch.
			log.Warnf("election leader watch event %v", zevent)
			goto watchLeader
		}
	}
	panic("unreachable")
}
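// RunTask only relies on three methods of the task it is given: Run, which
// blocks until the leader-only work ends, Stop, which asks it to end early,
// and Interrupted, which reports whether Stop was called. The sketch below is
// a hypothetical task with that shape -- the real electorTask type is defined
// elsewhere, so the exampleTask name and its ticker loop are purely
// illustrative, not the actual implementation.

import (
	"sync"
	"time"
)

// exampleTask is an illustrative stand-in for electorTask.
type exampleTask struct {
	stop     chan struct{}
	stopOnce sync.Once
}

func newExampleTask() *exampleTask {
	return &exampleTask{stop: make(chan struct{})}
}

// Run blocks, doing leader-only work, until Stop is called.
func (t *exampleTask) Run() error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-t.stop:
			return nil
		case <-ticker.C:
			// Perform one unit of leader-only work here.
		}
	}
}

// Stop asks a running task to shut down. It is safe to call more than once,
// since RunTask may invoke it from several branches.
func (t *exampleTask) Stop() {
	t.stopOnce.Do(func() { close(t.stop) })
}

// Interrupted reports whether Stop has been called.
func (t *exampleTask) Interrupted() bool {
	select {
	case <-t.stop:
		return true
	default:
		return false
	}
}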
// newRaft builds a Raft node from the configuration and the supplied FSM.
// It returns nil (and no error) when no raft address is configured.
func newRaft(c *Config, fsm raft.FSM) (Cluster, error) {
	r := new(Raft)

	if len(c.Raft.Addr) == 0 {
		return nil, nil
	}

	// Resolve the local raft address and every configured peer address.
	peers := make([]net.Addr, 0, len(c.Raft.Cluster))

	r.raftAddr = c.Raft.Addr

	a, err := net.ResolveTCPAddr("tcp", r.raftAddr)
	if err != nil {
		return nil, fmt.Errorf("invalid raft addr format %s, must be host:port, err:%v", r.raftAddr, err)
	}

	peers = raft.AddUniquePeer(peers, a)

	for _, cluster := range c.Raft.Cluster {
		a, err = net.ResolveTCPAddr("tcp", cluster)
		if err != nil {
			return nil, fmt.Errorf("invalid cluster format %s, must be host:port, err:%v", cluster, err)
		}

		peers = raft.AddUniquePeer(peers, a)
	}

	os.MkdirAll(c.Raft.DataDir, 0755)

	cfg := raft.DefaultConfig()

	// Send raft's own logging to stdout unless a log directory is configured.
	if len(c.Raft.LogDir) == 0 {
		r.log = os.Stdout
	} else {
		os.MkdirAll(c.Raft.LogDir, 0755)
		logFile := path.Join(c.Raft.LogDir, "raft.log")
		f, err := os.OpenFile(logFile, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0644)
		if err != nil {
			return nil, err
		}
		r.log = f
		cfg.LogOutput = r.log
	}

	// Log and stable stores are backed by BoltDB; snapshots go to plain files.
	raftDBPath := path.Join(c.Raft.DataDir, "raft_db")
	r.dbStore, err = raftboltdb.NewBoltStore(raftDBPath)
	if err != nil {
		return nil, err
	}

	fileStore, err := raft.NewFileSnapshotStore(c.Raft.DataDir, 1, r.log)
	if err != nil {
		return nil, err
	}

	r.trans, err = raft.NewTCPTransport(r.raftAddr, nil, 3, 5*time.Second, r.log)
	if err != nil {
		return nil, err
	}

	// For a new cluster, use the configured peers as-is; for an existing
	// cluster, merge the configured peers into the persisted peer set.
	r.peerStore = raft.NewJSONPeers(c.Raft.DataDir, r.trans)

	if c.Raft.ClusterState == ClusterStateNew {
		log.Infof("cluster state is new, use new cluster config")
		r.peerStore.SetPeers(peers)
	} else {
		log.Infof("cluster state is existing, use previous + new cluster config")
		ps, err := r.peerStore.Peers()
		if err != nil {
			log.Errorf("get store peers error %v", err)
			return nil, err
		}

		for _, peer := range peers {
			ps = raft.AddUniquePeer(ps, peer)
		}

		r.peerStore.SetPeers(ps)
	}

	if peers, _ := r.peerStore.Peers(); len(peers) <= 1 {
		cfg.EnableSingleNode = true
		log.Warn("raft will run in single node mode, should only be used for testing")
	}

	r.r, err = raft.NewRaft(cfg, fsm, r.dbStore, r.dbStore, fileStore, r.peerStore, r.trans)

	return r, err
}
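// newRaft takes a raft.FSM supplied by the caller; hashicorp/raft invokes it
// to apply committed log entries and to take and restore snapshots. The
// sketch below is a minimal no-op FSM, assuming the same (older)
// hashicorp/raft API used above. Only the raft.FSM and raft.FSMSnapshot
// interfaces are real; the noopFSM and noopSnapshot names are made up for
// illustration.

import (
	"io"

	"github.com/hashicorp/raft"
)

// noopFSM accepts committed log entries and discards them. A real FSM would
// decode each entry and apply it to the replicated state machine.
type noopFSM struct{}

// Apply is called once a log entry has been committed by a quorum.
func (f *noopFSM) Apply(l *raft.Log) interface{} {
	// Decode l.Data and mutate local state here.
	return nil
}

// Snapshot returns a point-in-time view of the state, used to compact the log.
func (f *noopFSM) Snapshot() (raft.FSMSnapshot, error) {
	return &noopSnapshot{}, nil
}

// Restore replaces local state from a previously persisted snapshot.
func (f *noopFSM) Restore(rc io.ReadCloser) error {
	return rc.Close()
}

type noopSnapshot struct{}

// Persist writes the snapshot to the sink; with no state there is nothing to
// write, so the sink is simply closed.
func (s *noopSnapshot) Persist(sink raft.SnapshotSink) error {
	return sink.Close()
}

// Release is called when raft is finished with the snapshot.
func (s *noopSnapshot) Release() {}

// A caller would then wire it in with something like:
//	cluster, err := newRaft(cfg, &noopFSM{})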