Example #1
0
// 似乎,只有同时进行Forget和Reset才有意义,否则都是一个不一致的状态
func (self *ForgetAndResetNodeCommand) Execute(c *cc.Controller) (cc.Result, error) {
	cs := c.ClusterState
	target := cs.FindNode(self.NodeId)
	if target == nil {
		return nil, ErrNodeNotExist
	}
	if !target.Free == false {
		return nil, ErrNodeIsFree
	}
	if len(target.Ranges) > 0 {
		return nil, ErrNodeNotEmpty
	}
	var err error
	forgetCount := 0
	allForgetDone := true
	// 1. 所有节点发送Forget
	for _, ns := range cs.AllNodeStates() {
		if ns.Id() == target.Id {
			continue
		}
		node := ns.Node()
		_, err = redis.ClusterForget(ns.Addr(), target.Id)
		if !node.Fail && err != nil && !strings.HasPrefix(err.Error(), "ERR Unknown node") {
			allForgetDone = false
			log.Warningf(target.Addr(), "Forget node %s(%s) failed, %v", ns.Addr(), ns.Id(), err)
			continue
		} else if !node.Fail && err != nil {
			//try again
			for try := redis.NUM_RETRY; try >= 0; try-- {
				_, err = redis.ClusterForget(ns.Addr(), target.Id)
				if err == nil {
					break
				}
			}
			//execute failed after retry
			if err != nil {
				allForgetDone = false
				log.Warningf(target.Addr(), "Forget node %s(%s) failed after retry, %v", ns.Addr(), ns.Id(), err)
				continue
			}

		}
		log.Eventf(target.Addr(), "Forget by %s(%s).", ns.Addr(), ns.Id())
		forgetCount++
	}
	if !allForgetDone {
		return nil, fmt.Errorf("Not all forget done, only (%d/%d) success",
			forgetCount, len(cs.AllNodeStates())-1)
	}
	// 2. 重置
	if !target.Fail {
		_, err = redis.ClusterReset(target.Addr(), false)
		if err != nil {
			return nil, fmt.Errorf("Reset node %s(%s) failed, %v", target.Id, target.Addr(), err)
		}
		log.Eventf(target.Addr(), "Reset.")
	}

	// remove seed in leader contrller
	meta.RemoveSeed(target.Addr())
	return nil, nil
}
Example #2
0
// 生成ClusterSnapshot
func (self *Inspector) BuildClusterTopo() (*topo.Cluster, []*topo.Node, error) {
	self.mutex.Lock()
	defer self.mutex.Unlock()
	if len(meta.Seeds()) == 0 {
		return nil, nil, ErrNoSeed
	}

	// 过滤掉连接不上的节点
	seeds := []*topo.Node{}
	for _, s := range meta.Seeds() {
		if redis.IsAlive(s.Addr()) {
			seeds = append(seeds, s)
		} else {
			// remove this seed from meta seeds
			// will re-add to seeds if join the cluster again
			meta.RemoveSeed(s.Addr())
		}
	}

	if len(seeds) == 0 {
		return nil, seeds, ErrNoSeed
	}

	// 顺序选一个节点,获取nodes数据作为基准,再用其他节点的数据与基准做对比
	if self.SeedIndex >= len(seeds) {
		self.SeedIndex = len(seeds) - 1
	}
	var seed *topo.Node
	for i := 0; i < len(seeds); i++ {
		seed = seeds[self.SeedIndex]
		self.SeedIndex++
		self.SeedIndex %= len(seeds)
		if seed.Free {
			glog.Info("Seed node is free ", seed.Addr())
		} else {
			break
		}
	}
	cluster, err := self.initClusterTopo(seed)
	if err != nil {
		glog.Infof("InitClusterTopo failed")
		return nil, seeds, err
	}

	// 检查所有节点返回的信息是不是相同,如果不同说明正在变化中,直接返回等待重试
	if len(seeds) > 1 {
		for _, s := range seeds {
			if s == seed {
				continue
			}
			err := self.checkClusterTopo(s, cluster)
			if err != nil {
				free, node := self.isFreeNode(s)
				if free {
					node.Free = true
					glog.Infof("Found free node %s", node.Addr())
					cluster.AddNode(node)
				} else {
					glog.Infof("checkClusterTopo failed")
					return cluster, seeds, err
				}
			} else {
				s.Free = false
			}
		}
	}

	// 构造LocalRegion视图
	for _, s := range cluster.LocalRegionNodes() {
		if s.PFailCount() > cluster.NumLocalRegionNode()/2 {
			glog.Infof("Found %d/%d PFAIL state on %s, set FAIL",
				s.PFailCount(), cluster.NumLocalRegionNode(), s.Addr())
			s.SetFail(true)
		}
	}

	if meta.IsClusterLeader() {
		cluster.BuildReplicaSets()
	}

	meta.MergeSeeds(cluster.LocalRegionNodes())
	self.ClusterTopo = cluster
	return cluster, seeds, nil
}