func (m *MigrateManager) rebalance(rbtask *RebalanceTask, cluster *topo.Cluster) {
    // Start all tasks; if any creation fails, wait a while and retry.
    for {
        allRunning := true
        for _, plan := range rbtask.Plans {
            if plan.task == nil {
                task, err := m.CreateTask(plan.SourceId, plan.TargetId, plan.Ranges, cluster)
                if err == nil {
                    log.Infof(task.TaskName(), "Rebalance task created, %v", task)
                    plan.task = task
                } else {
                    allRunning = false
                }
            }
        }
        if allRunning {
            break
        }
        streams.RebalanceStateStream.Pub(*m.rebalanceTask)
        time.Sleep(5 * time.Second)
    }

    // Wait for all tasks to finish.
    for {
        allDone := true
        for _, plan := range rbtask.Plans {
            state := plan.task.CurrentState()
            if state != StateDone && state != StateCancelled {
                allDone = false
            }
        }
        if allDone {
            break
        }
        streams.RebalanceStateStream.Pub(*m.rebalanceTask)
        time.Sleep(5 * time.Second)
    }

    now := time.Now()
    m.rebalanceTask.EndTime = &now
    streams.RebalanceStateStream.Pub(*m.rebalanceTask)
    m.rebalanceTask = nil
}
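Both loops in rebalance follow the same poll-publish-sleep shape: evaluate a predicate over all plans, publish the current RebalanceTask to the state stream, and sleep five seconds before checking again. A minimal standalone sketch of that pattern (the predicate and publish hooks here are hypothetical stand-ins, not the project's types):

package main

import (
    "fmt"
    "time"
)

// pollAndPublish keeps evaluating done(); while it is false it calls publish()
// and sleeps, mirroring the two wait loops in rebalance above.
func pollAndPublish(done func() bool, publish func(), interval time.Duration) {
    for !done() {
        publish()
        time.Sleep(interval)
    }
}

func main() {
    remaining := 3
    pollAndPublish(
        func() bool { remaining--; return remaining <= 0 }, // stand-in for "all tasks done"
        func() { fmt.Println("progress published") },       // stand-in for RebalanceStateStream.Pub
        10*time.Millisecond,
    )
    fmt.Println("rebalance finished")
}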
func (t *MigrateTask) Run() {
    if t.CurrentState() == StateNew {
        t.SetState(StateRunning)
    }
    if t.CurrentState() == StateCancelling {
        t.SetState(StateCancelled)
        return
    }
    prev_key := ""
    timeout_cnt := 0
    for i, r := range t.ranges {
        if r.Left < 0 {
            r.Left = 0
        }
        if r.Right > 16383 {
            r.Right = 16383
        }
        t.currRangeIndex = i
        t.currSlot = r.Left
        t.totalKeysInSlot = 0
        for t.currSlot <= r.Right {
            t.streamPub(true)

            // Prefer to switch states only after a whole slot has been
            // migrated or an error has occurred.
            if t.CurrentState() == StateCancelling {
                t.SetState(StateCancelled)
                t.streamPub(false)
                return
            }
            // Paused: sleep for a while and check again.
            if t.CurrentState() == StatePausing {
                t.SetState(StatePaused)
            }
            if t.CurrentState() == StatePaused {
                time.Sleep(100 * time.Millisecond)
                continue
            }

            // Normal running.
            app := meta.GetAppConfig()
            nkeys, err, key := t.migrateSlot(t.currSlot, app.MigrateKeysEachTime)
            t.totalKeysInSlot += nkeys
            // Check the remaining keys again.
            seed := t.SourceNode()
            remains, err2 := redis.CountKeysInSlot(seed.Addr(), t.currSlot)
            if err2 != nil {
                remains = -1
            }
            if err != nil || remains > 0 {
                log.Warningf(t.TaskName(), "Migrate slot %d error, %d keys done, total %d keys, remains %d keys, %v",
                    t.currSlot, nkeys, t.totalKeysInSlot, remains, err)
                if err != nil && strings.HasPrefix(err.Error(), "READONLY") {
                    log.Warningf(t.TaskName(), "Migrating across slave nodes. "+
                        "Maybe a manual failover just happened; "+
                        "if the cluster marks down after this point, "+
                        "we need to recover it ourselves using CLI commands.")
                    t.SetState(StateCancelled)
                    goto quit
                } else if err != nil && strings.HasPrefix(err.Error(), "CLUSTERDOWN") {
                    log.Warningf(t.TaskName(), "The cluster is down, please check it yourself, migrating task cancelled.")
                    t.SetState(StateCancelled)
                    goto quit
                } else if err != nil && strings.HasPrefix(err.Error(), "IOERR") {
                    log.Warningf(t.TaskName(), "Migrating key:%s timeout", key)
                    if timeout_cnt > 10 {
                        log.Warningf(t.TaskName(), "Migrating key:%s timeout too frequently, task cancelled", key)
                        t.SetState(StateCancelled)
                        goto quit
                    }
                    if prev_key == key {
                        timeout_cnt++
                    } else {
                        timeout_cnt = 0
                        prev_key = key
                    }
                }
                time.Sleep(500 * time.Millisecond)
            } else {
                log.Infof(t.TaskName(), "Migrate slot %d done, %d keys done, total %d keys, remains %d keys",
                    t.currSlot, nkeys, t.totalKeysInSlot, remains)
                t.currSlot++
                t.totalKeysInSlot = 0
            }
        }
    }
    t.currSlot--
    t.SetState(StateDone)
quit:
    t.streamPub(false)
}
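The IOERR branch cancels the task only after the same key has timed out repeatedly: timeout_cnt is reset whenever a different key times out and incremented only when the previously timed-out key shows up again. A standalone sketch of that consecutive-timeout counter, assuming this is the intended semantics (the type and key names here are illustrative):

package main

import "fmt"

// timeoutTracker cancels only after the same key times out more than
// limit consecutive times, mirroring prev_key/timeout_cnt in Run above.
type timeoutTracker struct {
    prevKey string
    count   int
    limit   int
}

// observe records one timed-out key and reports whether the task should be cancelled.
func (t *timeoutTracker) observe(key string) bool {
    if t.count > t.limit {
        return true
    }
    if key == t.prevKey {
        t.count++
    } else {
        // A different key timed out: restart the streak.
        t.count = 0
        t.prevKey = key
    }
    return false
}

func main() {
    tr := &timeoutTracker{limit: 10}
    for i := 0; i < 20; i++ {
        if tr.observe("user:42") {
            fmt.Printf("task cancelled on timeout #%d of the same key\n", i+1)
            return
        }
    }
}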
func (self *FixClusterCommand) Execute(c *cc.Controller) (cc.Result, error) {
    cs := c.ClusterState
    snapshot := cs.GetClusterSnapshot()
    if snapshot == nil {
        return nil, nil
    }
    snapshot.BuildReplicaSets()

    nodeStates := map[string]string{}
    nss := cs.AllNodeStates()
    for id, n := range nss {
        nodeStates[id] = n.CurrentState()
    }

    rss := snapshot.ReplicaSets()
    totalNum := 0 // total number of nodes
    totalRepli := 0
    failedNodes := []*topo.Node{}
    freeNodes := []*topo.Node{}
    defectMaster := []*topo.Node{}
    // Collect failed nodes and free nodes.
    for _, rs := range rss {
        if rs.Master != nil && rs.Master.IsArbiter() {
            continue
        }
        totalNum = totalNum + len(rs.AllNodes())
        if len(rs.Master.Ranges) == 0 && nodeStates[rs.Master.Id] == state.StateRunning {
            // A free node: a running master that owns no slots.
            freeNodes = append(freeNodes, rs.Master)
        } else {
            if len(rs.AllNodes()) > 1 {
                totalRepli = totalRepli + 1
            }
            for _, node := range rs.AllNodes() {
                if nodeStates[node.Id] != state.StateRunning {
                    failedNodes = append(failedNodes, node)
                }
            }
        }
    }
    log.Infof("CLUSTER", "freeNodes=%d failedNodes=%d", len(freeNodes), len(failedNodes))

    if len(freeNodes) == 0 && len(failedNodes) == 0 {
        return nil, nil
    }
    if len(freeNodes) != len(failedNodes) || (totalNum-len(failedNodes))%totalRepli != 0 {
        log.Infof("CLUSTER", "totalNum=%d totalRepli=%d freeNodes=%d failedNodes=%d",
            totalNum-len(failedNodes), totalRepli, len(freeNodes), len(failedNodes))
        return nil, errors.New("cluster fix check error, please check")
    }
    avgReplica := (totalNum - len(failedNodes)) / totalRepli

    replicaBroken := func(rs *topo.ReplicaSet) bool {
        for _, n := range rs.AllNodes() {
            if nodeStates[n.Id] != state.StateRunning {
                return true
            }
        }
        return false
    }
    // Find masters whose replica sets are short of replicas or contain failed nodes.
    for _, rs := range rss {
        if (rs.Master != nil && rs.Master.IsArbiter()) || nodeStates[rs.Master.Id] != state.StateRunning {
            continue
        }
        if len(rs.AllNodes()) < avgReplica && len(rs.Master.Ranges) > 0 && nodeStates[rs.Master.Id] == state.StateRunning {
            defectMaster = append(defectMaster, rs.Master)
        }
        if len(rs.AllNodes()) == avgReplica && replicaBroken(rs) {
            defectMaster = append(defectMaster, rs.Master)
        }
    }

    // Forget offline nodes.
    for _, node := range failedNodes {
        forgetCmd := ForgetAndResetNodeCommand{
            NodeId: node.Id,
        }
        forgetCmd.Execute(c)
        log.Eventf(node.Addr(), "Forget and reset failed node")
    }

    // Meet and replicate the free nodes.
    for _, node := range freeNodes {
        meetCmd := MeetNodeCommand{
            NodeId: node.Id,
        }
        meetCmd.Execute(c)
        log.Eventf(node.Addr(), "Meet cluster")
        // Give the gossip protocol some time to propagate.
        time.Sleep(5 * time.Second)
    }
    for idx, node := range freeNodes {
        // Disable reads on the new slave.
        disableReadCmd := DisableReadCommand{
            NodeId: node.Id,
        }
        disableReadCmd.Execute(c)
        log.Eventf(node.Addr(), "Disable read flag")
        // Replicate the free node to a defective master.
        replicateCmd := ReplicateCommand{
            ChildId:  node.Id,
            ParentId: defectMaster[idx].Id,
        }
        replicateCmd.Execute(c)
        log.Eventf(node.Addr(), "Replicate %s to %s", node.Addr(), defectMaster[idx].Addr())
    }

    result := FixClusterResult{Result: true}
    return result, nil
}
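The sanity check before repair relies on the surviving nodes forming replica sets of equal size: with every failed node replaced one-to-one by a free node, the remaining node count must divide evenly by the number of data-bearing replica sets, and the quotient is the expected replicas per set (avgReplica). A small worked example of that arithmetic (the numbers are made up for illustration):

package main

import (
    "errors"
    "fmt"
)

// expectedReplicaSize mirrors the pre-repair check in FixClusterCommand:
// free nodes must match failed nodes one-to-one, and the surviving nodes
// must split evenly across the replica sets.
func expectedReplicaSize(totalNum, totalRepli, freeNodes, failedNodes int) (int, error) {
    if totalRepli == 0 || freeNodes != failedNodes || (totalNum-failedNodes)%totalRepli != 0 {
        return 0, errors.New("cluster fix check error, please check")
    }
    return (totalNum - failedNodes) / totalRepli, nil
}

func main() {
    // 3 replica sets of 3 nodes each plus 1 free node = 10 nodes total;
    // 1 slave has failed and the free node will replace it.
    size, err := expectedReplicaSize(10, 3, 1, 1)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("each replica set should hold %d nodes\n", size) // prints 3
}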
// Is another failover already in progress?
doing, err := meta.IsDoingFailover()
if err != nil {
    log.Warningf(ns.Addr(), "Fetch failover status failed, %v", err)
    return false
}
if doing {
    // Fetch the doing-failover record; if it has lasted for more than
    // one minute, delete the record.
    record, err := meta.DoingFailoverRecord()
    if err == nil {
        if record.Timestamp.Add(1 * time.Minute).Before(time.Now()) {
            err = meta.UnmarkFailoverDoing()
            if err != nil {
                log.Warningf(ns.Addr(), "UnmarkFailoverDoing failed, %v", err)
            }
            log.Infof(ns.Addr(), "Doing-failover record lasted more than 1 min, unmarked it")
        }
    }
    log.Warning(ns.Addr(), "There is another failover doing")
    return false
}

// Has a failover happened recently?
lastTime, err := meta.LastFailoverTime()
if err != nil {
    log.Warningf(ns.Addr(), "Get last failover time failed, %v", err)
    return false
}
app := meta.GetAppConfig()
if lastTime != nil && time.Since(*lastTime) < app.AutoFailoverInterval {
    log.Warningf(ns.Addr(), "Failover too soon, lastTime: %v", *lastTime)
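This guard combines two time-based checks: a stale doing-failover record older than one minute gets unmarked, and a new failover is refused if the previous one happened less than AutoFailoverInterval ago. A minimal sketch of the second check in isolation (the helper name is made up):

package main

import (
    "fmt"
    "time"
)

// failoverTooSoon reports whether a previous failover happened within the
// configured interval, mirroring the LastFailoverTime check above.
func failoverTooSoon(lastTime *time.Time, interval time.Duration) bool {
    return lastTime != nil && time.Since(*lastTime) < interval
}

func main() {
    last := time.Now().Add(-30 * time.Second)
    fmt.Println(failoverTooSoon(&last, 5*time.Minute)) // true: only 30s since the last failover
    fmt.Println(failoverTooSoon(nil, 5*time.Minute))   // false: no failover recorded yet
}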
// handleTaskChange updates the migration task state machine according to the
// latest cluster topology.
func (m *MigrateManager) handleTaskChange(task *MigrateTask, cluster *topo.Cluster) error {
    fromNode := cluster.FindNode(task.SourceNode().Id)
    toNode := cluster.FindNode(task.TargetNode().Id)
    tname := task.TaskName()

    if fromNode == nil {
        log.Infof(tname, "Source node %s(%s) does not exist", task.SourceNode().Addr(), task.SourceNode().Id)
        return ErrNodeNotFound
    }
    if toNode == nil {
        log.Infof(tname, "Target node %s(%s) does not exist", task.TargetNode().Addr(), task.TargetNode().Id)
        return ErrNodeNotFound
    }

    // A role change means a master-slave switchover happened in this shard.
    if !fromNode.IsMaster() || !toNode.IsMaster() {
        log.Warningf(tname, "%s role change, cancel migration task %s\n", fromNode.Id[:6], task.TaskName())
        task.SetState(StateCancelling)
        return ErrSourceNodeFail
    }
    // If the source node is down, cancel directly; the task will be recreated
    // after the master-slave switchover.
    if fromNode.Fail {
        log.Infof(tname, "Cancel migration task %s\n", task.TaskName())
        task.SetState(StateCancelling)
        return ErrSourceNodeFail
    }
    // If the target node is down, record the current ReplicaSet and wait for
    // the master-slave switchover.
    if toNode.Fail {
        if task.CurrentState() == StateRunning {
            task.SetState(StateTargetNodeFailure)
            task.SetBackupReplicaSet(task.TargetReplicaSet())
            return ErrTargetNodeFail
        }
    } else if task.CurrentState() != StateNew {
        task.SetState(StateRunning)
        task.SetBackupReplicaSet(nil)
    }
    // If the target node has already failed over (a new master was elected),
    // find the new master by looking up a slave from the BackupReplicaSet.
    if toNode.IsStandbyMaster() {
        brs := task.BackupReplicaSet()
        if brs == nil {
            task.SetState(StateCancelling)
            log.Info(tname, "No backup replicaset found, controller maybe restarted after target master failure, can not do recovery.")
            return ErrCanNotRecover
        }
        slaves := brs.Slaves
        if len(slaves) == 0 {
            task.SetState(StateCancelling)
            log.Info(tname, "The dead target master has no slave, cannot do recovery.")
            return ErrCanNotRecover
        } else {
            rs := cluster.FindReplicaSetByNode(slaves[0].Id)
            if rs == nil {
                task.SetState(StateCancelling)
                log.Info(tname, "No replicaset for slave of dead target master found")
                return ErrCanNotRecover
            }
            task.ReplaceTargetReplicaSet(rs)
            log.Infof(tname, "Recover dead target node to %s(%s)", rs.Master.Id, rs.Master.Addr())
        }
    }
    return nil
}
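For reference, the task states this handler and Run move between all appear in the code above. A hedged sketch of them as an enumeration (the names come from the source; declaring them as strings with these values is an assumption of this sketch):

package migrate

// Migration task states referenced throughout this section.
const (
    StateNew               = "new"                 // task created, not yet started
    StateRunning           = "running"             // actively migrating slots
    StatePausing           = "pausing"             // pause requested; Run switches to paused
    StatePaused            = "paused"              // migration suspended, polled every 100ms
    StateCancelling        = "cancelling"          // cancel requested; handleTaskChange sets this on failures
    StateCancelled         = "cancelled"           // terminal: task aborted
    StateTargetNodeFailure = "target_node_failure" // target master down, waiting for its failover
    StateDone              = "done"                // terminal: all ranges migrated
)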