Exemplo n.º 1
0
func (m *MigrateManager) CreateTask(sourceId, targetId string, ranges []topo.Range, cluster *topo.Cluster) (*MigrateTask, error) {
	sourceRS := cluster.FindReplicaSetByNode(sourceId)
	targetRS := cluster.FindReplicaSetByNode(targetId)
	if sourceRS == nil || targetRS == nil {
		return nil, ErrReplicatSetNotFound
	}
	task := NewMigrateTask(cluster, sourceRS, targetRS, ranges)
	err := m.AddTask(task)
	if err != nil {
		return nil, err
	}
	return task, nil
}
Exemplo n.º 2
0
func (self *Inspector) IsClusterDamaged(cluster *topo.Cluster, seeds []*topo.Node) bool {
	// more than half masters dead
	numFail := 0
	for _, node := range cluster.MasterNodes() {
		if node.Fail {
			numFail++
		}
	}
	if numFail >= (cluster.Size()+1)/2 {
		return true
	}

	// more than half nodes dead
	if len(seeds) > cluster.NumLocalRegionNode()/2 {
		return false
	}
	for _, seed := range seeds {
		c, err := self.initClusterTopo(seed)
		if err != nil {
			return false
		}
		for _, node := range c.LocalRegionNodes() {
			// nodes not in seeds must be pfail
			if !containsNode(node, seeds) && !node.PFail {
				return false
			}
		}
	}
	glog.Info("more than half nodes dead")
	return true
}
Exemplo n.º 3
0
func GenerateRebalancePlan(method string, cluster *topo.Cluster, targetIds []string, ratio int) ([]*MigratePlan, error) {
	rss := cluster.ReplicaSets()
	regions := meta.AllRegions()

	ss := []*topo.Node{}          // 有slots的Master
	tm := map[string]*topo.Node{} // 空slots的Master

	for _, rs := range rss {
		master := rs.Master
		// 忽略主挂掉和region覆盖不全的rs
		if master.Fail || !rs.IsCoverAllRegions(regions) || master.Free {
			continue
		}
		if master.Empty() {
			tm[master.Id] = master
		} else {
			ss = append(ss, master)
		}
	}
	if method == "mergetail" {
		merger := MergerTable[method]
		if merger == nil {
			return nil, fmt.Errorf("Rebalancing method %s not exist.", method)
		}
		plans := merger(ss, 0)
		return plans, nil
	} else if method == "mergeall" {
		merger := MergerTable[method]
		if merger == nil {
			return nil, fmt.Errorf("Rebalancing method %s not exist.", method)
		}
		plans := merger(ss, ratio)
		return plans, nil
	} else {

		var ts []*topo.Node
		// 如果没传TargetId,则选择所有可以作为迁移目标的rs
		if len(targetIds) == 0 {
			for _, node := range tm {
				ts = append(ts, node)
			}
		} else {
			for _, id := range targetIds {
				if tm[id] == nil {
					return nil, fmt.Errorf("Master %s not found.", id)
				}
				ts = append(ts, tm[id])
			}
		}

		if len(ts) == 0 {
			return nil, fmt.Errorf("No available empty target replicasets.")
		}

		rebalancer := RebalancerTable[method]
		if rebalancer == nil {
			return nil, fmt.Errorf("Rebalancing method %s not exist.", method)
		}
		plans := rebalancer(ss, ts)
		return plans, nil
	}
	return nil, nil
}
Exemplo n.º 4
0
func (m *MigrateManager) HandleNodeStateChange(cluster *topo.Cluster) {
	// 如果存在迁移任务,先跳过,等结束后再处理
	if len(m.tasks) > 0 {
		goto done
	}

	// 处理主节点的迁移任务重建
	for _, node := range cluster.AllNodes() {
		if node.Fail {
			continue
		}
		// Wait a while
		if time.Now().Sub(m.lastTaskEndTime) < 5*time.Second {
			continue
		}
		for id, slots := range node.Migrating {
			// 根据slot生成ranges
			ranges := []topo.Range{}
			for _, slot := range slots {
				// 如果是自己
				if id == node.Id {
					redis.SetSlot(node.Addr(), slot, redis.SLOT_STABLE, "")
				} else {
					ranges = append(ranges, topo.Range{Left: slot, Right: slot})
				}
			}
			// Source
			source := node
			if !node.IsMaster() {
				srs := cluster.FindReplicaSetByNode(node.Id)
				if srs != nil {
					source = srs.Master
				}
			}
			// Target
			rs := cluster.FindReplicaSetByNode(id)
			if source.Fail || rs.Master.Fail {
				continue
			}

			_, err := m.CreateTask(source.Id, rs.Master.Id, ranges, cluster)
			if err != nil {
				log.Warningf(node.Addr(), "Can not recover migrate task, %v", err)
			} else {
				log.Warningf(node.Addr(), "Will recover migrating task for node %s(%s) with MIGRATING info"+
					", Task(Source:%s, Target:%s).", node.Id, node.Addr(), source.Addr(), rs.Master.Addr())
				goto done
			}
		}
		for id, slots := range node.Importing {
			// 根据slot生成ranges
			ranges := []topo.Range{}
			for _, slot := range slots {
				// 如果是自己
				if id == node.Id {
					redis.SetSlot(node.Addr(), slot, redis.SLOT_STABLE, "")
				} else {
					ranges = append(ranges, topo.Range{Left: slot, Right: slot})
				}
			}
			// Target
			target := node
			if !node.IsMaster() {
				trs := cluster.FindReplicaSetByNode(node.Id)
				if trs != nil {
					target = trs.Master
				}
			}
			if target.IsStandbyMaster() {
				s := cluster.FindNodeBySlot(ranges[0].Left)
				if s != nil {
					log.Warningf(node.Addr(), "Reset migrate task target to %s(%s)", s.Id, s.Addr())
					target = s
				}
			}
			// Source
			rs := cluster.FindReplicaSetByNode(id)
			if target.Fail || rs.Master.Fail {
				continue
			}
			_, err := m.CreateTask(rs.Master.Id, target.Id, ranges, cluster)
			if err != nil {
				log.Warningf(node.Addr(), "Can not recover migrate task, %v", err)
			} else {
				log.Warningf(node.Addr(), "Will recover migrating task for node %s(%s) with IMPORTING info"+
					", Task(Source:%s,Target:%s).", node.Id, node.Addr(), rs.Master.Addr(), target.Addr())
				goto done
			}
		}
	}

done:
	for _, task := range m.tasks {
		if task.CurrentState() != StateDone {
			m.handleTaskChange(task, cluster)
		}
	}
}
Exemplo n.º 5
0
// 更新任务状态机
func (m *MigrateManager) handleTaskChange(task *MigrateTask, cluster *topo.Cluster) error {
	fromNode := cluster.FindNode(task.SourceNode().Id)
	toNode := cluster.FindNode(task.TargetNode().Id)
	tname := task.TaskName()

	if fromNode == nil {
		log.Infof(tname, "Source node %s(%s) not exist", fromNode.Addr(), fromNode.Id)
		return ErrNodeNotFound
	}
	if toNode == nil {
		log.Infof(tname, "Target node %s(%s) not exist", toNode.Addr(), toNode.Id)
		return ErrNodeNotFound
	}

	// 角色变化说明该分片进行了主从切换
	if !fromNode.IsMaster() || !toNode.IsMaster() {
		log.Warningf(tname, "%s role change, cancel migration task %s\n", fromNode.Id[:6], task.TaskName())
		task.SetState(StateCancelling)
		return ErrSourceNodeFail
	}

	// 如果是源节点挂了,直接取消,等待主从切换之后重建任务
	if fromNode.Fail {
		log.Infof(tname, "Cancel migration task %s\n", task.TaskName())
		task.SetState(StateCancelling)
		return ErrSourceNodeFail
	}
	// 如果目标节点挂了,需要记录当前的ReplicaSet,观察等待主从切换
	if toNode.Fail {
		if task.CurrentState() == StateRunning {
			task.SetState(StateTargetNodeFailure)
			task.SetBackupReplicaSet(task.TargetReplicaSet())
			return ErrTargetNodeFail
		}
	} else if task.CurrentState() != StateNew {
		task.SetState(StateRunning)
		task.SetBackupReplicaSet(nil)
	}
	// 如果目标节点已经进行了Failover(重新选主),我们需要找到对应的新主
	// 方法是从BackupReplicaSet里取一个从来查找
	if toNode.IsStandbyMaster() {
		brs := task.BackupReplicaSet()
		if brs == nil {
			task.SetState(StateCancelling)
			log.Info(tname, "No backup replicaset found, controller maybe restarted after target master failure, can not do recovery.")
			return ErrCanNotRecover
		}
		slaves := brs.Slaves
		if len(slaves) == 0 {
			task.SetState(StateCancelling)
			log.Info(tname, "The dead target master has no slave, cannot do recovery.")
			return ErrCanNotRecover
		} else {
			rs := cluster.FindReplicaSetByNode(slaves[0].Id)
			if rs == nil {
				task.SetState(StateCancelling)
				log.Info(tname, "No replicaset for slave of dead target master found")
				return ErrCanNotRecover
			}
			task.ReplaceTargetReplicaSet(rs)
			log.Infof(tname, "Recover dead target node to %s(%s)",
				rs.Master.Id, rs.Master.Addr())
		}
	}
	return nil
}
Exemplo n.º 6
0
func (self *Inspector) checkClusterTopo(seed *topo.Node, cluster *topo.Cluster) error {
	resp, err := redis.ClusterNodesInRegion(seed.Addr(), self.LocalRegion)
	if err != nil && strings.HasPrefix(err.Error(), "ERR Wrong CLUSTER subcommand or number of arguments") {
		//server version do not support 'cluster nodes extra [region]'
		resp, err = redis.ClusterNodes(seed.Addr())
	}

	//this may lead to BuildClusterTopo update failed for a time
	//the node is step into this state after check IsAlive
	if err != nil && strings.HasPrefix(err.Error(), "LOADING") {
		return nil
	}
	if err != nil {
		return err
	}

	var summary topo.SummaryInfo
	lines := strings.Split(resp, "\n")
	for _, line := range lines {
		if strings.HasPrefix(line, "# ") {
			summary.ReadLine(line)
			continue
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		s, myself, err := self.buildNode(line)
		if err == ErrNodeInHandShake || err == ErrNodeNoAddr {
			continue
		}
		// Fix 'cluster nodes extra' & 'cluster nodes extra region' compatiable
		if s.Region != self.LocalRegion {
			continue
		}
		if err != nil {
			return err
		}
		if s.Ip == "127.0.0.1" {
			s.Ip = seed.Ip
		}
		node := cluster.FindNode(s.Id)
		if node == nil {
			if s.PFail {
				glog.Warningf("forget dead node %s(%s) should be forgoten", s.Id, s.Addr())
				//redis.ClusterForget(seed.Addr(), s.Id)
			}
			return fmt.Errorf("node not exist %s(%s)", s.Id, s.Addr())
		}

		// 对比节点数据是否相同
		if !node.Compare(s) {
			glog.Infof("%#v vs %#v different", s, node)
			if s.Tag == "-" && node.Tag != "-" {
				// 可能存在处于不被Cluster接受的节点,节点可以看见Cluster,但Cluster看不到它。
				// 一种复现情况情况:某个节点已经死了,系统将其Forget,但是OP并未被摘除该节点,
				// 而是恢复了该节点。
				glog.Warningf("remeet node %s", seed.Addr())
				self.MeetNode(seed)
			}
			return ErrNodesInfoNotSame
		}
		if len(node.Ranges) == 0 && len(s.Ranges) > 0 {
			glog.Warningf("Ranges not equal, use nonempty ranges.")
			node.Ranges = s.Ranges
		}

		if myself {
			info, err := redis.FetchClusterInfo(node.Addr())
			if err != nil {
				return err
			}
			node.ClusterInfo = info
			node.SummaryInfo = summary
		}

		if len(s.Migrating) != 0 {
			node.Migrating = s.Migrating
		}
		if len(s.Importing) != 0 {
			node.Importing = s.Importing
		}

		if s.PFail {
			node.IncrPFailCount()
		}
	}
	return nil
}