// RebuildTasks rebuilds migration tasks from the metadata stored in ZooKeeper.
func (m *MigrateManager) RebuildTasks(migrateMetas []*meta.MigrateMeta, cluster *topo.Cluster) {
    for _, mm := range migrateMetas {
        task, err := m.CreateTask(mm.SourceId, mm.TargetId, mm.Ranges, cluster)
        if err != nil {
            // task may be nil on failure, so log the meta identifiers instead.
            log.Warningf(mm.SourceId, "CreateTask failed, %v", err)
            continue
        }
        log.Info(task.TaskName(), "Load task from zk")
    }
}
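// For context, a minimal sketch of how RebuildTasks might be wired up at controller
// startup. FetchMigrateMetas is a hypothetical helper assumed here for illustration;
// only RebuildTasks and the meta/topo types come from the code above.
func restoreMigrations(m *MigrateManager, cluster *topo.Cluster) error {
    migrateMetas, err := meta.FetchMigrateMetas() // assumed: reads persisted migration metadata from zk
    if err != nil {
        return err
    }
    m.RebuildTasks(migrateMetas, cluster)
    return nil
}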
func (cs *ClusterState) BuildClusterSnapshot() {
    // "__CC__" has no special meaning; it just needs to be distinct from a real Region name.
    cluster := topo.NewCluster("__CC__")
    for _, ns := range cs.nodeStates {
        cluster.AddNode(ns.node)
    }
    err := cluster.BuildReplicaSets()
    // If this fails, it is most likely because not all nodes have been discovered yet during startup.
    if err != nil {
        log.Info("CLUSTER ", "Build cluster snapshot failed ", err)
        return
    }
    cs.cluster = cluster
}
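// A sketch of how a controller loop might refresh the snapshot periodically so that
// failover and migration decisions see a consistent topology. The one-second interval
// and the updateNodeStates step are assumptions; only ClusterState and
// BuildClusterSnapshot come from the code above.
func snapshotLoop(cs *ClusterState, quit <-chan struct{}) {
    ticker := time.NewTicker(time.Second)
    defer ticker.Stop()
    for {
        select {
        case <-ticker.C:
            updateNodeStates(cs) // hypothetical: merge the latest heartbeat results into cs.nodeStates
            cs.BuildClusterSnapshot()
        case <-quit:
            return
        }
    }
}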
/// Slot migration procedure:
/// 1. Mark the master of the target replica set as IMPORTING
/// 2. Mark every node of the source replica set as MIGRATING
/// 3. Move keys from the source master batch by batch until the slot is empty
/// 4. Assign the slot to the target on the target slaves
/// 5. Assign the slot to the target on the target master
/// 6. Assign the slot to the target on every source node
/// Commands:
/// 1. <Target Master> setslot $slot IMPORTING $sourceId
/// 2. <Source Slaves> setslot $slot MIGRATING $targetId
/// 3. <Source Master> setslot $slot MIGRATING $targetId
///    ... migrate all keys
/// 4. <Target Slaves> setslot $slot node $targetId
/// 5. <Target Master> setslot $slot node $targetId
/// 6. <Source Slaves> setslot $slot node $targetId
/// 7. <Source Master> setslot $slot node $targetId
func (t *MigrateTask) migrateSlot(slot int, keysPer int) (int, error, string) {
    rs := t.SourceReplicaSet()
    sourceNode := t.SourceNode()
    targetNode := t.TargetNode()

    err := redis.SetSlot(targetNode.Addr(), slot, redis.SLOT_IMPORTING, sourceNode.Id)
    if err != nil {
        if strings.HasPrefix(err.Error(), "ERR I'm already the owner of hash slot") {
            log.Warningf(t.TaskName(), "%s already the owner of hash slot %d", targetNode.Id[:6], slot)
            // Reaching this point means the target already owns the slot while the source
            // is still in MIGRATING state. The migration has effectively finished; we only
            // need to clear the source's MIGRATING state.
            srs := t.SourceReplicaSet()
            err = SetSlotToNode(srs, slot, targetNode.Id)
            if err != nil {
                return 0, err, ""
            }
            err = SetSlotStable(srs, slot)
            if err != nil {
                return 0, err, ""
            }
            trs := t.TargetReplicaSet()
            err = SetSlotToNode(trs, slot, targetNode.Id)
            if err != nil {
                return 0, err, ""
            }
            err = SetSlotStable(trs, slot)
            return 0, err, ""
        }
        return 0, err, ""
    }

    // Mark every node of the source replica set as MIGRATING, to minimize data
    // inconsistency caused by reads served from slave regions.
    for _, node := range rs.AllNodes() {
        err := redis.SetSlot(node.Addr(), slot, redis.SLOT_MIGRATING, targetNode.Id)
        if err != nil {
            if strings.HasPrefix(err.Error(), "ERR I'm not the owner of hash slot") {
                log.Warningf(t.TaskName(), "%s is not the owner of hash slot %d", sourceNode.Id, slot)
                srs := t.SourceReplicaSet()
                err = SetSlotStable(srs, slot)
                if err != nil {
                    log.Warningf(t.TaskName(), "Failed to clean MIGRATING state of source server.")
                    return 0, err, ""
                }
                trs := t.TargetReplicaSet()
                err = SetSlotStable(trs, slot)
                if err != nil {
                    log.Warningf(t.TaskName(), "Failed to clean MIGRATING state of target server.")
                    return 0, err, ""
                }
                return 0, fmt.Errorf("mig: %s is not the owner of hash slot %d", sourceNode.Id, slot), ""
            }
            return 0, err, ""
        }
    }

    nkeys := 0
    app := meta.GetAppConfig()
    for {
        keys, err := redis.GetKeysInSlot(sourceNode.Addr(), slot, keysPer)
        if err != nil {
            return nkeys, err, ""
        }
        for _, key := range keys {
            _, err := redis.Migrate(sourceNode.Addr(), targetNode.Ip, targetNode.Port, key, app.MigrateTimeout)
            if err != nil {
                return nkeys, err, key
            }
            nkeys++
        }
        if len(keys) == 0 {
            // The master is fully migrated; wait for the source slaves to replicate the
            // DELs, i.e. until the slot holds no keys on any source slave.
            slaveSyncDone := true
            srs := t.SourceReplicaSet()
            for _, node := range srs.AllNodes() {
                n, err := redis.CountKeysInSlot(node.Addr(), slot)
                if err != nil {
                    return nkeys, err, ""
                }
                if n > 0 {
                    slaveSyncDone = false
                }
            }
            if !slaveSyncDone {
                // FIXME
                // The master is done but some slaves still hold keys in the slot;
                // SETSLOT below will make them clear the data.
                log.Info(t.TaskName(), "source node not empty, setslot will clear")
                //return nkeys, fmt.Errorf("mig: source nodes not all empty, will retry."), ""
            }

            // Assign the slot to its new owner; this automatically clears the IMPORTING
            // and MIGRATING flags. When applied to a source node, Redis ensures no keys
            // remain in the slot before changing ownership.
            trs := t.TargetReplicaSet()
            // Update the slaves first, so that if the master dies before the new slot
            // layout is broadcast to its slaves, the slot ownership is not lost.
            for _, node := range trs.Slaves {
                if node.Fail {
                    continue
                }
                err = redis.SetSlot(node.Addr(), slot, redis.SLOT_NODE, targetNode.Id)
                if err != nil {
                    return nkeys, err, ""
                }
            }
            // This bumps the config epoch and broadcasts it to the cluster.
            err = redis.SetSlot(trs.Master.Addr(), slot, redis.SLOT_NODE, targetNode.Id)
            if err != nil {
                return nkeys, err, ""
            }
            // Update slot ownership on the remaining nodes.
            for _, rs := range t.cluster.ReplicaSets() {
                if rs.Master.IsStandbyMaster() {
                    continue
                }
                err = SetSlotToNode(rs, slot, targetNode.Id)
                if err != nil {
                    return nkeys, err, ""
                }
            }
            break
        }
    }
    return nkeys, nil, ""
}
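// For illustration, a sketch of how a task might drive migrateSlot across its slot
// ranges. The loop below, the batch size parameter, and a Range type with Left/Right
// bounds are assumptions; migrateSlot, TaskName and the logging come from above.
func runRanges(t *MigrateTask, ranges []topo.Range, keysPerBatch int) error {
    for _, r := range ranges {
        for slot := r.Left; slot <= r.Right; slot++ {
            nkeys, err, key := t.migrateSlot(slot, keysPerBatch)
            if err != nil {
                // key is the key being moved when MIGRATE failed (empty otherwise).
                return fmt.Errorf("mig: slot %d stopped after %d keys (key %q): %v", slot, nkeys, key, err)
            }
            log.Infof(t.TaskName(), "Slot %d done, %d keys moved", slot, nkeys)
        }
    }
    return nil
}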
func (cs *ClusterState) RunFailoverTask(oldMasterId, newMasterId string) {
    new := cs.FindNodeState(newMasterId)
    old := cs.FindNodeState(oldMasterId)

    if old == nil {
        log.Warningf(oldMasterId, "Can't run failover task, the old dead master lost")
        return
    }
    if new == nil {
        log.Warningf(oldMasterId, "Can't run failover task, new master lost (%s)", newMasterId)
        old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)
        return
    }

    // Disable reads and writes on the old master, broadcasting through the new master.
    redis.DisableRead(new.Addr(), old.Id())
    redis.DisableWrite(new.Addr(), old.Id())

    c := make(chan error, 1)

    go func() {
        // Choose FAILOVER TAKEOVER or FAILOVER FORCE, depending on whether the cluster has an arbiter.
        cluster := cs.cluster
        rs := cluster.FindReplicaSetByNode(old.Id())
        if cluster.HasArbiter() || cluster.IsClusterDown() {
            // Use CLUSTER FAILOVER TAKEOVER.
            c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, true, rs)
        } else {
            // Use CLUSTER FAILOVER FORCE.
            c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, false, rs)
        }
    }()

    select {
    case err := <-c:
        if err != nil {
            log.Eventf(old.Addr(), "Failover request done with error(%v).", err)
        } else {
            log.Eventf(old.Addr(), "Failover request done, new master %s(%s).", new.Id(), new.Addr())
        }
    case <-time.After(20 * time.Minute):
        log.Eventf(old.Addr(), "Failover timed out, new master %s(%s)", new.Id(), new.Addr())
    }

    // Re-read the node, since its role may already have been updated.
    roleChanged := false
    node := cs.FindNode(newMasterId)
    if node.IsMaster() {
        roleChanged = true
    } else {
        for i := 0; i < 10; i++ {
            info, err := redis.FetchInfo(node.Addr(), "Replication")
            if err == nil && info.Get("role") == "master" {
                roleChanged = true
                break
            }
            log.Warningf(old.Addr(), "Role of new master %s(%s) has not yet changed, will check again in 5 seconds.", new.Id(), new.Addr())
            time.Sleep(5 * time.Second)
        }
    }

    if roleChanged {
        log.Eventf(old.Addr(), "New master %s(%s) role change success", node.Id, node.Addr())
        // Handle anomalies left over from migration: the slots carried by the failed
        // old master must be moved over to the new master.
        oldNode := cs.FindNode(oldMasterId)
        if oldNode != nil && oldNode.Fail && oldNode.IsMaster() && len(oldNode.Ranges) != 0 {
            log.Warningf(old.Addr(), "Some node still carries slot info(%v) about the old master, waiting for MigrateManager to fix it.", oldNode.Ranges)
        } else {
            log.Info(old.Addr(), "Good, no slots need to be fixed after failover.")
        }
    } else {
        log.Warningf(old.Addr(), "Failover failed, please check cluster state.")
        log.Warningf(old.Addr(), "The dead master will go to OFFLINE state and then to WAIT_FAILOVER_BEGIN state to try failover again.")
    }

    old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)

    // Re-enable writes on the new master. Enabling writes on a slave has no effect,
    // so even if the failover failed this does no harm.
    redis.EnableWrite(new.Addr(), new.Id())
}
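// The TAKEOVER/FORCE choice above maps onto the two variants of the Redis
// CLUSTER FAILOVER command. A small sketch of that decision; chooseFailoverCmd is a
// hypothetical helper, and only the arbiter/cluster-down check mirrors RunFailoverTask.
func chooseFailoverCmd(cluster *topo.Cluster) string {
    if cluster.HasArbiter() || cluster.IsClusterDown() {
        // No healthy majority to agree on an election; take the mastership unconditionally.
        return "CLUSTER FAILOVER TAKEOVER"
    }
    // The old master is unreachable but the cluster still has quorum; force the election.
    return "CLUSTER FAILOVER FORCE"
}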
        localRegionNodes := rs.RegionNodes(ns.node.Region)
        if len(localRegionNodes) < 2 {
            return false
        }
        // Allow at most one faulty node (FAIL or not in the Running state).
        for _, node := range localRegionNodes {
            if node.Id == ns.Id() {
                continue
            }
            nodeState := cs.FindNodeState(node.Id)
            if node.Fail || nodeState.CurrentState() != StateRunning {
                return false
            }
        }
    }
    log.Info(getNodeState(i).Addr(), "Can failover slave")
    return true
}

MasterAutoFailoverConstraint = func(i interface{}) bool {
    ctx := i.(StateContext)
    cs := ctx.ClusterState
    ns := ctx.NodeState

    // If AutoFailover is disabled and this is not an explicit failover-begin signal, refuse.
    if !meta.AutoFailover() && ctx.Input.Command != CMD_FAILOVER_BEGIN_SIGNAL {
        log.Warning(ns.Addr(), "Check constraint failed, autofailover off or no FL begin signal")
        return false
    }

    rs := cs.FindReplicaSetByNode(ns.Id())
// handleTaskChange drives the task state machine according to cluster changes.
func (m *MigrateManager) handleTaskChange(task *MigrateTask, cluster *topo.Cluster) error {
    fromNode := cluster.FindNode(task.SourceNode().Id)
    toNode := cluster.FindNode(task.TargetNode().Id)
    tname := task.TaskName()

    if fromNode == nil {
        // fromNode is nil here, so log the identifiers kept by the task itself.
        log.Infof(tname, "Source node %s(%s) does not exist", task.SourceNode().Addr(), task.SourceNode().Id)
        return ErrNodeNotFound
    }
    if toNode == nil {
        log.Infof(tname, "Target node %s(%s) does not exist", task.TargetNode().Addr(), task.TargetNode().Id)
        return ErrNodeNotFound
    }

    // A role change means the replica set has gone through a master-slave switch.
    if !fromNode.IsMaster() || !toNode.IsMaster() {
        log.Warningf(tname, "%s role change, cancel migration task %s\n", fromNode.Id[:6], task.TaskName())
        task.SetState(StateCancelling)
        return ErrSourceNodeFail
    }
    // If the source node is down, cancel directly; the task will be rebuilt after the failover.
    if fromNode.Fail {
        log.Infof(tname, "Cancel migration task %s\n", task.TaskName())
        task.SetState(StateCancelling)
        return ErrSourceNodeFail
    }
    // If the target node is down, remember the current replica set and wait for its failover.
    if toNode.Fail {
        if task.CurrentState() == StateRunning {
            task.SetState(StateTargetNodeFailure)
            task.SetBackupReplicaSet(task.TargetReplicaSet())
            return ErrTargetNodeFail
        }
    } else if task.CurrentState() != StateNew {
        task.SetState(StateRunning)
        task.SetBackupReplicaSet(nil)
    }
    // If the target replica set has already failed over (a new master was elected),
    // find the new master by looking up one of the slaves recorded in the BackupReplicaSet.
    if toNode.IsStandbyMaster() {
        brs := task.BackupReplicaSet()
        if brs == nil {
            task.SetState(StateCancelling)
            log.Info(tname, "No backup replicaset found, controller maybe restarted after target master failure, can not do recovery.")
            return ErrCanNotRecover
        }
        slaves := brs.Slaves
        if len(slaves) == 0 {
            task.SetState(StateCancelling)
            log.Info(tname, "The dead target master has no slave, cannot do recovery.")
            return ErrCanNotRecover
        }
        rs := cluster.FindReplicaSetByNode(slaves[0].Id)
        if rs == nil {
            task.SetState(StateCancelling)
            log.Info(tname, "No replicaset found for the slave of the dead target master")
            return ErrCanNotRecover
        }
        task.ReplaceTargetReplicaSet(rs)
        log.Infof(tname, "Recover dead target node to %s(%s)", rs.Master.Id, rs.Master.Addr())
    }
    return nil
}
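// A sketch of where handleTaskChange might be invoked: each time a fresh cluster
// snapshot arrives, walk the active tasks and drop the ones that can no longer make
// progress. The Tasks and RemoveTask accessors are hypothetical; handleTaskChange and
// the error values come from the code above.
func (m *MigrateManager) reconcile(cluster *topo.Cluster) {
    for _, task := range m.Tasks() {
        err := m.handleTaskChange(task, cluster)
        if err == ErrNodeNotFound || err == ErrCanNotRecover {
            // The task cannot proceed or recover; drop it from the manager.
            m.RemoveTask(task)
        }
    }
}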