func (self *Meta) HasSeed(seed *topo.Node) bool { for _, s := range self.seeds { if s.Addr() == seed.Addr() { if s.Id == "" { *s = *seed } return true } } return false }
func checkMasterRole(node *topo.Node, ismaster bool) (bool, error) { addr := node.Addr() info, err := redis.FetchInfo(addr, "replication") if err != nil { return false, fmt.Errorf("Connect %s failed", addr) } if info.Get("role") == "master" { return true, nil } else { return false, nil } }
func checkSlaveRepliStatusOk(node *topo.Node) (bool, error) { addr := node.Addr() info, err := redis.FetchInfo(addr, "all") if err != nil { return false, err } if info.Get("role") == "master" { return false, nil } if info.Get("master_link_status") == "down" { return false, nil } else if info.Get("loading") == "1" { return false, nil } else { return true, nil } }
func (self *Inspector) isFreeNode(seed *topo.Node) (bool, *topo.Node) { resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion) if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") { //server version do not support 'cluster nodes extra [region]' resp, err = redis.ClusterNodes(seed.Addr()) } if err != nil { return false, nil } numNode := 0 lines := strings.Split(resp, "\n") for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "# ") { continue } numNode++ } if numNode != 1 { return false, nil } for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "# ") { continue } node, myself, err := self.buildNode(line) if node.Ip == "127.0.0.1" { node.Ip = seed.Ip } // 只看到自己,是主,且没有slots,才认为是FreeNode if !myself { return false, nil } if err != nil || len(node.Ranges) > 0 || !node.IsMaster() { return false, nil } else { return true, node } } return false, nil }
func configAofAndRdb(node *topo.Node, state bool) error { addr := node.Addr() var err error var err1 error var err2 error if state { _, err = redis.RedisCli(addr, "config", "set", "appendonly", "yes") _, err1 = redis.RedisCli(addr, "config", "set", "dbfilename", "dump.rdb") } else { _, err = redis.RedisCli(addr, "config", "set", "appendonly", "no") _, err1 = redis.RedisCli(addr, "config", "set", "dbfilename", "tmp.rdb") } _, err2 = redis.RedisCli(addr, "config", "rewrite") if err != nil { return err } if err1 != nil { return err1 } if err2 != nil { return err2 } return nil }
func upgradeMaster(c *cli.Context) { pid := context.GetAppName() addr := context.GetLeaderAddr() url_fr := "http://" + addr + api.FetchReplicaSetsPath url_fl := "http://" + addr + api.NodeSetAsMasterPath extraHeader := &utils.ExtraHeader{ User: context.Config.User, Role: context.Config.Role, Token: context.Config.Token, } resp, err := utils.HttpGet(url_fr, nil, 5*time.Second) if err != nil { fmt.Println(err) return } var rss command.FetchReplicaSetsResult err = utils.InterfaceToStruct(resp.Body, &rss) if err != nil { fmt.Println(err) return } sort.Sort(topo.ByMasterId(rss.ReplicaSets)) sort.Sort(topo.ByNodeState(rss.ReplicaSets)) iidx, err := getIdx(IdxServerAddr, pid, "master") if err != nil { fmt.Println(err) return } fmt.Printf("Get last idx record: %d\n", iidx) var old_master *topo.Node var new_master *topo.Node //used to check status var new_slaves []*topo.Node old_master = nil new_master = nil for idx, rs := range rss.ReplicaSets { if rs.Master.IsArbiter() { continue } if idx <= iidx { fmt.Printf("Skipping replica(id:%s) (%d/%d) master\n", rs.Master.Id, idx, len(rss.ReplicaSets)) continue } //select a slave in the same IDC old_master = rs.Master old_master_r := getRegion(old_master) if old_master_r == "" { return } new_slaves = append(new_slaves, old_master) fmt.Printf("Upgrading replica(id:%s) (%d/%d) master\n", rs.Master.Id, idx, len(rss.ReplicaSets)) skip := false for _, s := range rs.Slaves { re := getRegion(s) if re == "" { return } if re == old_master_r && !skip { new_master = s skip = true } else { new_slaves = append(new_slaves, s) } } if new_master == nil { fmt.Printf("Select new master failed for master(%s) replica\n", old_master.Id) return } //send failover to the new master req := api.FailoverTakeoverParams{ NodeId: new_master.Id, } resp, err := utils.HttpPostExtra(url_fl, req, 10*time.Second, extraHeader) if err != nil { fmt.Println(err) return } if resp.Errno != 0 { fmt.Println(resp.Errmsg) return } //send failover request done,check the new_master role to a real master for { ismaster, err := checkMasterRole(new_master, true) if err != nil { fmt.Println(err) time.Sleep(10 * time.Second) continue } if ismaster == true { //to be a new master break } else { //wait for next check time.Sleep(10 * time.Second) } } //disable read flag of the all new slaves,including old master for _, s := range new_slaves { resp, err = configRead(s, false) if err != nil { fmt.Println(err) return } if resp.Errno != 0 { fmt.Println(resp.Errmsg) return } } //disable aof and rdb to speed up start err = configAofAndRdb(old_master, false) if err != nil { fmt.Println(err) return } //shutdown server err = shutdownServer(old_master) if err != nil { fmt.Printf("server %s restart\n", old_master.Addr()) } //check the status of old master cnt := 1 for { fmt.Printf("Check slave status %d times\n", cnt) cnt++ inner := func(nodes []*topo.Node) bool { rok := true for _, n := range nodes { ok, err := checkSlaveRepliStatusOk(n) if ok { //replica status ok,enable read flag,ignore result configRead(n, true) continue } if !ok || err != nil { rok = false } } return rok } ok := inner(new_slaves) if !ok { //not ok, wait for next trun check time.Sleep(10 * time.Second) } else { break } } //enable aof and rdb err = configAofAndRdb(old_master, true) if err != nil { fmt.Println(err) return } //save the idx of the process err = saveIdx(IdxServerAddr, pid, "master", idx) if err != nil { fmt.Println(err) return } } }
func shutdownServer(node *topo.Node) error { addr := node.Addr() _, err := redis.RedisCli(addr, "shutdown", "nosave") return err }
// 生成ClusterSnapshot func (self *Inspector) BuildClusterTopo() (*topo.Cluster, []*topo.Node, error) { self.mutex.Lock() defer self.mutex.Unlock() if len(meta.Seeds()) == 0 { return nil, nil, ErrNoSeed } // 过滤掉连接不上的节点 seeds := []*topo.Node{} for _, s := range meta.Seeds() { if redis.IsAlive(s.Addr()) { seeds = append(seeds, s) } else { // remove this seed from meta seeds // will re-add to seeds if join the cluster again meta.RemoveSeed(s.Addr()) } } if len(seeds) == 0 { return nil, seeds, ErrNoSeed } // 顺序选一个节点,获取nodes数据作为基准,再用其他节点的数据与基准做对比 if self.SeedIndex >= len(seeds) { self.SeedIndex = len(seeds) - 1 } var seed *topo.Node for i := 0; i < len(seeds); i++ { seed = seeds[self.SeedIndex] self.SeedIndex++ self.SeedIndex %= len(seeds) if seed.Free { glog.Info("Seed node is free ", seed.Addr()) } else { break } } cluster, err := self.initClusterTopo(seed) if err != nil { glog.Infof("InitClusterTopo failed") return nil, seeds, err } // 检查所有节点返回的信息是不是相同,如果不同说明正在变化中,直接返回等待重试 if len(seeds) > 1 { for _, s := range seeds { if s == seed { continue } err := self.checkClusterTopo(s, cluster) if err != nil { free, node := self.isFreeNode(s) if free { node.Free = true glog.Infof("Found free node %s", node.Addr()) cluster.AddNode(node) } else { glog.Infof("checkClusterTopo failed") return cluster, seeds, err } } else { s.Free = false } } } // 构造LocalRegion视图 for _, s := range cluster.LocalRegionNodes() { if s.PFailCount() > cluster.NumLocalRegionNode()/2 { glog.Infof("Found %d/%d PFAIL state on %s, set FAIL", s.PFailCount(), cluster.NumLocalRegionNode(), s.Addr()) s.SetFail(true) } } if meta.IsClusterLeader() { cluster.BuildReplicaSets() } meta.MergeSeeds(cluster.LocalRegionNodes()) self.ClusterTopo = cluster return cluster, seeds, nil }
func (self *Inspector) checkClusterTopo(seed *topo.Node, cluster *topo.Cluster) error { resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion) if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") { //server version do not support 'cluster nodes extra [region]' resp, err = redis.ClusterNodes(seed.Addr()) } //this may lead to BuildClusterTopo update failed for a time //the node is step into this state after check IsAlive if err != nil && strings.HasPrefix(err.Error(), "LOADING") { return nil } if err != nil { return err } var summary topo.SummaryInfo lines := strings.Split(resp, "\n") for _, line := range lines { if strings.HasPrefix(line, "# ") { summary.ReadLine(line) continue } line = strings.TrimSpace(line) if line == "" { continue } s, myself, err := self.buildNode(line) if err == ErrNodeInHandShake || err == ErrNodeNoAddr { continue } // Fix 'cluster nodes extra' & 'cluster nodes extra region' compatiable if s.Region != self.LocalRegion { continue } if err != nil { return err } if s.Ip == "127.0.0.1" { s.Ip = seed.Ip } node := cluster.FindNode(s.Id) if node == nil { if s.PFail { glog.Warningf("forget dead node %s(%s) should be forgoten", s.Id, s.Addr()) //redis.ClusterForget(seed.Addr(), s.Id) } return fmt.Errorf("node not exist %s(%s)", s.Id, s.Addr()) } // 对比节点数据是否相同 if !node.Compare(s) { glog.Infof("%#v vs %#v different", s, node) if s.Tag == "-" && node.Tag != "-" { // 可能存在处于不被Cluster接受的节点,节点可以看见Cluster,但Cluster看不到它。 // 一种复现情况情况:某个节点已经死了,系统将其Forget,但是OP并未被摘除该节点, // 而是恢复了该节点。 glog.Warningf("remeet node %s", seed.Addr()) self.MeetNode(seed) } return ErrNodesInfoNotSame } if len(node.Ranges) == 0 && len(s.Ranges) > 0 { glog.Warningf("Ranges not equal, use nonempty ranges.") node.Ranges = s.Ranges } if myself { info, err := redis.FetchClusterInfo(node.Addr()) if err != nil { return err } node.ClusterInfo = info node.SummaryInfo = summary } if len(s.Migrating) != 0 { node.Migrating = s.Migrating } if len(s.Importing) != 0 { node.Importing = s.Importing } if s.PFail { node.IncrPFailCount() } } return nil }
func (self *Inspector) initClusterTopo(seed *topo.Node) (*topo.Cluster, error) { resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion) if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") { //server version do not support 'cluster nodes extra [region]' resp, err = redis.ClusterNodes(seed.Addr()) } if err != nil { return nil, err } cluster := topo.NewCluster(self.LocalRegion) var summary topo.SummaryInfo var nodeidx *topo.Node var cnt int lines := strings.Split(resp, "\n") cnt = 0 for _, line := range lines { if strings.HasPrefix(line, "# ") { summary.ReadLine(line) continue } line = strings.TrimSpace(line) if line == "" { continue } node, myself, err := self.buildNode(line) if err == ErrNodeInHandShake || err == ErrNodeNoAddr { continue } // Fix 'cluster nodes extra' & 'cluster nodes extra region' compatiable if node.Region != self.LocalRegion { continue } if err != nil { return nil, err } if node.Ip == "127.0.0.1" { node.Ip = seed.Ip } // 遇到myself,读取该节点的ClusterInfo if myself { info, err := redis.FetchClusterInfo(node.Addr()) if err != nil { return nil, err } node.ClusterInfo = info node.SummaryInfo = summary } cluster.AddNode(node) nodeidx = node cnt++ } if cnt == 1 { if nodeidx.IsMaster() && len(nodeidx.Ranges) == 0 { glog.Infof("Node %s is free node", nodeidx.Addr()) nodeidx.SetFree(true) } } return cluster, nil }