Beispiel #1
0
func (self *Meta) HasSeed(seed *topo.Node) bool {
	for _, s := range self.seeds {
		if s.Addr() == seed.Addr() {
			if s.Id == "" {
				*s = *seed
			}
			return true
		}
	}
	return false
}
Beispiel #2
0
func checkMasterRole(node *topo.Node, ismaster bool) (bool, error) {
	addr := node.Addr()
	info, err := redis.FetchInfo(addr, "replication")
	if err != nil {
		return false, fmt.Errorf("Connect %s failed", addr)
	}
	if info.Get("role") == "master" {
		return true, nil
	} else {
		return false, nil
	}
}
Beispiel #3
0
func checkSlaveRepliStatusOk(node *topo.Node) (bool, error) {
	addr := node.Addr()
	info, err := redis.FetchInfo(addr, "all")
	if err != nil {
		return false, err
	}
	if info.Get("role") == "master" {
		return false, nil
	}
	if info.Get("master_link_status") == "down" {
		return false, nil
	} else if info.Get("loading") == "1" {
		return false, nil
	} else {
		return true, nil
	}
}
Beispiel #4
0
func (self *Inspector) isFreeNode(seed *topo.Node) (bool, *topo.Node) {
	resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion)
	if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") {
		//server version do not support 'cluster nodes extra [region]'
		resp, err = redis.ClusterNodes(seed.Addr())
	}
	if err != nil {
		return false, nil
	}
	numNode := 0
	lines := strings.Split(resp, "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "# ") {
			continue
		}
		numNode++
	}
	if numNode != 1 {
		return false, nil
	}
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "# ") {
			continue
		}
		node, myself, err := self.buildNode(line)
		if node.Ip == "127.0.0.1" {
			node.Ip = seed.Ip
		}
		// 只看到自己,是主,且没有slots,才认为是FreeNode
		if !myself {
			return false, nil
		}
		if err != nil || len(node.Ranges) > 0 || !node.IsMaster() {
			return false, nil
		} else {
			return true, node
		}
	}
	return false, nil
}
Beispiel #5
0
func configAofAndRdb(node *topo.Node, state bool) error {
	addr := node.Addr()
	var err error
	var err1 error
	var err2 error
	if state {
		_, err = redis.RedisCli(addr, "config", "set", "appendonly", "yes")
		_, err1 = redis.RedisCli(addr, "config", "set", "dbfilename", "dump.rdb")
	} else {
		_, err = redis.RedisCli(addr, "config", "set", "appendonly", "no")
		_, err1 = redis.RedisCli(addr, "config", "set", "dbfilename", "tmp.rdb")
	}
	_, err2 = redis.RedisCli(addr, "config", "rewrite")
	if err != nil {
		return err
	}
	if err1 != nil {
		return err1
	}
	if err2 != nil {
		return err2
	}
	return nil
}
Beispiel #6
0
func upgradeMaster(c *cli.Context) {
	pid := context.GetAppName()
	addr := context.GetLeaderAddr()
	url_fr := "http://" + addr + api.FetchReplicaSetsPath
	url_fl := "http://" + addr + api.NodeSetAsMasterPath
	extraHeader := &utils.ExtraHeader{
		User:  context.Config.User,
		Role:  context.Config.Role,
		Token: context.Config.Token,
	}

	resp, err := utils.HttpGet(url_fr, nil, 5*time.Second)
	if err != nil {
		fmt.Println(err)
		return
	}

	var rss command.FetchReplicaSetsResult
	err = utils.InterfaceToStruct(resp.Body, &rss)
	if err != nil {
		fmt.Println(err)
		return
	}
	sort.Sort(topo.ByMasterId(rss.ReplicaSets))
	sort.Sort(topo.ByNodeState(rss.ReplicaSets))

	iidx, err := getIdx(IdxServerAddr, pid, "master")
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Printf("Get last idx record: %d\n", iidx)
	var old_master *topo.Node
	var new_master *topo.Node

	//used to check status
	var new_slaves []*topo.Node
	old_master = nil
	new_master = nil

	for idx, rs := range rss.ReplicaSets {
		if rs.Master.IsArbiter() {
			continue
		}
		if idx <= iidx {
			fmt.Printf("Skipping replica(id:%s) (%d/%d) master\n", rs.Master.Id, idx, len(rss.ReplicaSets))
			continue
		}
		//select a slave in the same IDC
		old_master = rs.Master
		old_master_r := getRegion(old_master)
		if old_master_r == "" {
			return
		}
		new_slaves = append(new_slaves, old_master)

		fmt.Printf("Upgrading replica(id:%s) (%d/%d) master\n", rs.Master.Id, idx, len(rss.ReplicaSets))
		skip := false
		for _, s := range rs.Slaves {
			re := getRegion(s)
			if re == "" {
				return
			}
			if re == old_master_r && !skip {
				new_master = s
				skip = true
			} else {
				new_slaves = append(new_slaves, s)
			}
		}
		if new_master == nil {
			fmt.Printf("Select new master failed for master(%s) replica\n", old_master.Id)
			return
		}
		//send failover to the new master
		req := api.FailoverTakeoverParams{
			NodeId: new_master.Id,
		}
		resp, err := utils.HttpPostExtra(url_fl, req, 10*time.Second, extraHeader)
		if err != nil {
			fmt.Println(err)
			return
		}
		if resp.Errno != 0 {
			fmt.Println(resp.Errmsg)
			return
		}
		//send failover request done,check the new_master role to a real master
		for {
			ismaster, err := checkMasterRole(new_master, true)
			if err != nil {
				fmt.Println(err)
				time.Sleep(10 * time.Second)
				continue
			}
			if ismaster == true {
				//to be a new master
				break
			} else {
				//wait for next check
				time.Sleep(10 * time.Second)
			}
		}
		//disable read flag of the all new slaves,including old master
		for _, s := range new_slaves {
			resp, err = configRead(s, false)
			if err != nil {
				fmt.Println(err)
				return
			}
			if resp.Errno != 0 {
				fmt.Println(resp.Errmsg)
				return
			}
		}
		//disable aof and rdb to speed up start
		err = configAofAndRdb(old_master, false)
		if err != nil {
			fmt.Println(err)
			return
		}
		//shutdown server
		err = shutdownServer(old_master)
		if err != nil {
			fmt.Printf("server %s restart\n", old_master.Addr())
		}
		//check the status of old master
		cnt := 1
		for {
			fmt.Printf("Check slave status %d times\n", cnt)
			cnt++
			inner := func(nodes []*topo.Node) bool {
				rok := true
				for _, n := range nodes {
					ok, err := checkSlaveRepliStatusOk(n)
					if ok {
						//replica status ok,enable read flag,ignore result
						configRead(n, true)
						continue
					}
					if !ok || err != nil {
						rok = false
					}
				}
				return rok
			}

			ok := inner(new_slaves)
			if !ok {
				//not ok, wait for next trun check
				time.Sleep(10 * time.Second)
			} else {
				break
			}
		}
		//enable aof and rdb
		err = configAofAndRdb(old_master, true)
		if err != nil {
			fmt.Println(err)
			return
		}
		//save the idx of the process
		err = saveIdx(IdxServerAddr, pid, "master", idx)
		if err != nil {
			fmt.Println(err)
			return
		}
	}
}
Beispiel #7
0
func shutdownServer(node *topo.Node) error {
	addr := node.Addr()
	_, err := redis.RedisCli(addr, "shutdown", "nosave")
	return err
}
Beispiel #8
0
// 生成ClusterSnapshot
func (self *Inspector) BuildClusterTopo() (*topo.Cluster, []*topo.Node, error) {
	self.mutex.Lock()
	defer self.mutex.Unlock()
	if len(meta.Seeds()) == 0 {
		return nil, nil, ErrNoSeed
	}

	// 过滤掉连接不上的节点
	seeds := []*topo.Node{}
	for _, s := range meta.Seeds() {
		if redis.IsAlive(s.Addr()) {
			seeds = append(seeds, s)
		} else {
			// remove this seed from meta seeds
			// will re-add to seeds if join the cluster again
			meta.RemoveSeed(s.Addr())
		}
	}

	if len(seeds) == 0 {
		return nil, seeds, ErrNoSeed
	}

	// 顺序选一个节点,获取nodes数据作为基准,再用其他节点的数据与基准做对比
	if self.SeedIndex >= len(seeds) {
		self.SeedIndex = len(seeds) - 1
	}
	var seed *topo.Node
	for i := 0; i < len(seeds); i++ {
		seed = seeds[self.SeedIndex]
		self.SeedIndex++
		self.SeedIndex %= len(seeds)
		if seed.Free {
			glog.Info("Seed node is free ", seed.Addr())
		} else {
			break
		}
	}
	cluster, err := self.initClusterTopo(seed)
	if err != nil {
		glog.Infof("InitClusterTopo failed")
		return nil, seeds, err
	}

	// 检查所有节点返回的信息是不是相同,如果不同说明正在变化中,直接返回等待重试
	if len(seeds) > 1 {
		for _, s := range seeds {
			if s == seed {
				continue
			}
			err := self.checkClusterTopo(s, cluster)
			if err != nil {
				free, node := self.isFreeNode(s)
				if free {
					node.Free = true
					glog.Infof("Found free node %s", node.Addr())
					cluster.AddNode(node)
				} else {
					glog.Infof("checkClusterTopo failed")
					return cluster, seeds, err
				}
			} else {
				s.Free = false
			}
		}
	}

	// 构造LocalRegion视图
	for _, s := range cluster.LocalRegionNodes() {
		if s.PFailCount() > cluster.NumLocalRegionNode()/2 {
			glog.Infof("Found %d/%d PFAIL state on %s, set FAIL",
				s.PFailCount(), cluster.NumLocalRegionNode(), s.Addr())
			s.SetFail(true)
		}
	}

	if meta.IsClusterLeader() {
		cluster.BuildReplicaSets()
	}

	meta.MergeSeeds(cluster.LocalRegionNodes())
	self.ClusterTopo = cluster
	return cluster, seeds, nil
}
Beispiel #9
0
func (self *Inspector) checkClusterTopo(seed *topo.Node, cluster *topo.Cluster) error {
	resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion)
	if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") {
		//server version do not support 'cluster nodes extra [region]'
		resp, err = redis.ClusterNodes(seed.Addr())
	}

	//this may lead to BuildClusterTopo update failed for a time
	//the node is step into this state after check IsAlive
	if err != nil && strings.HasPrefix(err.Error(), "LOADING") {
		return nil
	}
	if err != nil {
		return err
	}

	var summary topo.SummaryInfo
	lines := strings.Split(resp, "\n")
	for _, line := range lines {
		if strings.HasPrefix(line, "# ") {
			summary.ReadLine(line)
			continue
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		s, myself, err := self.buildNode(line)
		if err == ErrNodeInHandShake || err == ErrNodeNoAddr {
			continue
		}
		// Fix 'cluster nodes extra' & 'cluster nodes extra region' compatiable
		if s.Region != self.LocalRegion {
			continue
		}
		if err != nil {
			return err
		}
		if s.Ip == "127.0.0.1" {
			s.Ip = seed.Ip
		}
		node := cluster.FindNode(s.Id)
		if node == nil {
			if s.PFail {
				glog.Warningf("forget dead node %s(%s) should be forgoten", s.Id, s.Addr())
				//redis.ClusterForget(seed.Addr(), s.Id)
			}
			return fmt.Errorf("node not exist %s(%s)", s.Id, s.Addr())
		}

		// 对比节点数据是否相同
		if !node.Compare(s) {
			glog.Infof("%#v vs %#v different", s, node)
			if s.Tag == "-" && node.Tag != "-" {
				// 可能存在处于不被Cluster接受的节点,节点可以看见Cluster,但Cluster看不到它。
				// 一种复现情况情况:某个节点已经死了,系统将其Forget,但是OP并未被摘除该节点,
				// 而是恢复了该节点。
				glog.Warningf("remeet node %s", seed.Addr())
				self.MeetNode(seed)
			}
			return ErrNodesInfoNotSame
		}
		if len(node.Ranges) == 0 && len(s.Ranges) > 0 {
			glog.Warningf("Ranges not equal, use nonempty ranges.")
			node.Ranges = s.Ranges
		}

		if myself {
			info, err := redis.FetchClusterInfo(node.Addr())
			if err != nil {
				return err
			}
			node.ClusterInfo = info
			node.SummaryInfo = summary
		}

		if len(s.Migrating) != 0 {
			node.Migrating = s.Migrating
		}
		if len(s.Importing) != 0 {
			node.Importing = s.Importing
		}

		if s.PFail {
			node.IncrPFailCount()
		}
	}
	return nil
}
Beispiel #10
0
func (self *Inspector) initClusterTopo(seed *topo.Node) (*topo.Cluster, error) {
	resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion)
	if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") {
		//server version do not support 'cluster nodes extra [region]'
		resp, err = redis.ClusterNodes(seed.Addr())
	}
	if err != nil {
		return nil, err
	}

	cluster := topo.NewCluster(self.LocalRegion)

	var summary topo.SummaryInfo
	var nodeidx *topo.Node
	var cnt int

	lines := strings.Split(resp, "\n")
	cnt = 0
	for _, line := range lines {
		if strings.HasPrefix(line, "# ") {
			summary.ReadLine(line)
			continue
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		node, myself, err := self.buildNode(line)
		if err == ErrNodeInHandShake || err == ErrNodeNoAddr {
			continue
		}
		// Fix 'cluster nodes extra' & 'cluster nodes extra region' compatiable
		if node.Region != self.LocalRegion {
			continue
		}
		if err != nil {
			return nil, err
		}
		if node.Ip == "127.0.0.1" {
			node.Ip = seed.Ip
		}
		// 遇到myself,读取该节点的ClusterInfo
		if myself {
			info, err := redis.FetchClusterInfo(node.Addr())
			if err != nil {
				return nil, err
			}
			node.ClusterInfo = info
			node.SummaryInfo = summary
		}
		cluster.AddNode(node)
		nodeidx = node
		cnt++
	}
	if cnt == 1 {
		if nodeidx.IsMaster() && len(nodeidx.Ranges) == 0 {
			glog.Infof("Node %s is free node", nodeidx.Addr())
			nodeidx.SetFree(true)
		}
	}

	return cluster, nil
}