// 似乎,只有同时进行Forget和Reset才有意义,否则都是一个不一致的状态 func (self *ForgetAndResetNodeCommand) Execute(c *cc.Controller) (cc.Result, error) { cs := c.ClusterState target := cs.FindNode(self.NodeId) if target == nil { return nil, ErrNodeNotExist } if !target.Free == false { return nil, ErrNodeIsFree } if len(target.Ranges) > 0 { return nil, ErrNodeNotEmpty } var err error forgetCount := 0 allForgetDone := true // 1. 所有节点发送Forget for _, ns := range cs.AllNodeStates() { if ns.Id() == target.Id { continue } node := ns.Node() _, err = redis.ClusterForget(ns.Addr(), target.Id) if !node.Fail && err != nil && !strings.HasPrefix(err.Error(), "ERR Unknown node") { allForgetDone = false log.Warningf(target.Addr(), "Forget node %s(%s) failed, %v", ns.Addr(), ns.Id(), err) continue } else if !node.Fail && err != nil { //try again for try := redis.NUM_RETRY; try >= 0; try-- { _, err = redis.ClusterForget(ns.Addr(), target.Id) if err == nil { break } } //execute failed after retry if err != nil { allForgetDone = false log.Warningf(target.Addr(), "Forget node %s(%s) failed after retry, %v", ns.Addr(), ns.Id(), err) continue } } log.Eventf(target.Addr(), "Forget by %s(%s).", ns.Addr(), ns.Id()) forgetCount++ } if !allForgetDone { return nil, fmt.Errorf("Not all forget done, only (%d/%d) success", forgetCount, len(cs.AllNodeStates())-1) } // 2. 重置 if !target.Fail { _, err = redis.ClusterReset(target.Addr(), false) if err != nil { return nil, fmt.Errorf("Reset node %s(%s) failed, %v", target.Id, target.Addr(), err) } log.Eventf(target.Addr(), "Reset.") } // remove seed in leader contrller meta.RemoveSeed(target.Addr()) return nil, nil }
// 生成ClusterSnapshot func (self *Inspector) BuildClusterTopo() (*topo.Cluster, []*topo.Node, error) { self.mutex.Lock() defer self.mutex.Unlock() if len(meta.Seeds()) == 0 { return nil, nil, ErrNoSeed } // 过滤掉连接不上的节点 seeds := []*topo.Node{} for _, s := range meta.Seeds() { if redis.IsAlive(s.Addr()) { seeds = append(seeds, s) } else { // remove this seed from meta seeds // will re-add to seeds if join the cluster again meta.RemoveSeed(s.Addr()) } } if len(seeds) == 0 { return nil, seeds, ErrNoSeed } // 顺序选一个节点,获取nodes数据作为基准,再用其他节点的数据与基准做对比 if self.SeedIndex >= len(seeds) { self.SeedIndex = len(seeds) - 1 } var seed *topo.Node for i := 0; i < len(seeds); i++ { seed = seeds[self.SeedIndex] self.SeedIndex++ self.SeedIndex %= len(seeds) if seed.Free { glog.Info("Seed node is free ", seed.Addr()) } else { break } } cluster, err := self.initClusterTopo(seed) if err != nil { glog.Infof("InitClusterTopo failed") return nil, seeds, err } // 检查所有节点返回的信息是不是相同,如果不同说明正在变化中,直接返回等待重试 if len(seeds) > 1 { for _, s := range seeds { if s == seed { continue } err := self.checkClusterTopo(s, cluster) if err != nil { free, node := self.isFreeNode(s) if free { node.Free = true glog.Infof("Found free node %s", node.Addr()) cluster.AddNode(node) } else { glog.Infof("checkClusterTopo failed") return cluster, seeds, err } } else { s.Free = false } } } // 构造LocalRegion视图 for _, s := range cluster.LocalRegionNodes() { if s.PFailCount() > cluster.NumLocalRegionNode()/2 { glog.Infof("Found %d/%d PFAIL state on %s, set FAIL", s.PFailCount(), cluster.NumLocalRegionNode(), s.Addr()) s.SetFail(true) } } if meta.IsClusterLeader() { cluster.BuildReplicaSets() } meta.MergeSeeds(cluster.LocalRegionNodes()) self.ClusterTopo = cluster return cluster, seeds, nil }