func (self *UpdateRegionCommand) Execute(c *cc.Controller) (cc.Result, error) {
	if len(self.Nodes) == 0 {
		return nil, nil
	}
	// Update the cluster topology.
	cs := c.ClusterState
	cs.UpdateRegionNodes(self.Region, self.Nodes)

	// Update the migration task states first, so that when a failure is detected
	// the migration tasks are paused before the failure itself is handled.
	cluster := cs.GetClusterSnapshot()
	if cluster != nil {
		mm := c.MigrateManager
		mm.HandleNodeStateChange(cluster)
	}

	for _, ns := range cs.AllNodeStates() {
		node := ns.Node()

		// Slave auto enable read?
		if !node.IsMaster() && !node.Fail && !node.Readable && node.MasterLinkStatus == "up" {
			if meta.GetAppConfig().AutoEnableSlaveRead {
				redis.EnableRead(node.Addr(), node.Id)
			}
		}
		// Master auto enable write?
		if node.IsMaster() && !node.Fail && !node.Writable {
			if meta.GetAppConfig().AutoEnableMasterWrite {
				redis.EnableWrite(node.Addr(), node.Id)
			}
		}

		// Fix chained replication: the slave's parent is itself a slave.
		if meta.LocalRegion() == self.Region && !node.IsMaster() {
			parent := cs.FindNode(node.ParentId)
			// Parent is not a master?
			if parent != nil && !parent.IsMaster() {
				grandpa := cs.FindNode(parent.ParentId)
				if grandpa != nil {
					_, err := redis.ClusterReplicate(node.Addr(), grandpa.Id)
					if err == nil {
						log.Warningf(node.Addr(), "Fix chained replication, (%s->%s->%s)=>(%s->%s)",
							node, parent, grandpa, node, grandpa)
					}
				} else {
					log.Warningf(node.Addr(), "Found chained replication, (%s->%s->nil), cannot fix.",
						node, parent)
				}
			}
		}

		// Advance the state machine of each node in the region.
		ns.AdvanceFSM(cs, state.CMD_NONE)
	}
	return nil, nil
}
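// Illustration only: what the chained-replication fix above amounts to at the Redis
// level. A hedged sketch using a plain redigo connection (assumes
// redigo "github.com/gomodule/redigo/redis"), not this repo's redis.ClusterReplicate
// helper: reattaching a chained slave (node -> parent -> grandpa) directly under
// grandpa is a single CLUSTER REPLICATE sent to the slave; the function name and
// parameters here are made up for the example.
func reattachSlave(slaveAddr, topMasterId string) error {
	conn, err := redigo.Dial("tcp", slaveAddr)
	if err != nil {
		return err
	}
	defer conn.Close()
	// Point the slave at the top-level master, collapsing the replication chain.
	_, err = conn.Do("CLUSTER", "REPLICATE", topMasterId)
	return err
}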
func (self *AppInfoCommand) Execute(c *cc.Controller) (cc.Result, error) {
	result := &AppInfoResult{
		AppConfig: meta.GetAppConfig(),
		Leader:    meta.ClusterLeaderConfig(),
	}
	return result, nil
}
func (self *Inspector) Run() {
	appconfig := meta.GetAppConfig()
	// FetchClusterNodesInterval does not support hot reloading.
	tickChan := time.NewTicker(appconfig.FetchClusterNodesInterval).C
	for {
		select {
		case <-tickChan:
			if !meta.IsRegionLeader() {
				continue
			}
			cluster, seeds, err := self.BuildClusterTopo()
			if err != nil {
				glog.Infof("build cluster topo failed, %v", err)
			}
			if cluster == nil {
				continue
			}
			var failureInfo *topo.FailureInfo
			if meta.IsInMasterRegion() && self.IsClusterDamaged(cluster, seeds) {
				failureInfo = &topo.FailureInfo{Seeds: seeds}
			}
			var nodes []*topo.Node
			if err == nil {
				nodes = cluster.LocalRegionNodes()
			}
			err = SendRegionTopoSnapshot(nodes, failureInfo)
			if err != nil {
				glog.Infof("send snapshot failed, %v", err)
			}
		}
	}
}
func (m *MigrateManager) CheckAndRunTask() {
	tickCh := time.NewTicker(time.Second * 5).C
	for {
		select {
		case <-tickCh:
			glog.Info("Check new migrate task, task queue length: ", len(m.tasks))
			app := meta.GetAppConfig()
			for idx, task := range m.tasks {
				glog.Infof("Task (%s) status:%s ", task.TaskName(), stateNames[task.CurrentState()])
				if idx >= app.MigrateConcurrency {
					break
				}
				if task.CurrentState() == StateNew {
					glog.Info("Set task running ", task)
					// Update the task state now, so the state check on the next tick
					// does not start the same task again.
					task.SetState(StateRunning)
					go task.Run()
				} else if task.CurrentState() == StateDone || task.CurrentState() == StateCancelled {
					glog.Info("Remove task that is done or cancelled ", task)
					m.RemoveTask(task, true)
				}
			}
		}
	}
}
func (t *MigrateTask) Run() {
	if t.CurrentState() == StateNew {
		t.SetState(StateRunning)
	}
	if t.CurrentState() == StateCancelling {
		t.SetState(StateCancelled)
		return
	}
	prev_key := ""
	timeout_cnt := 0
	for i, r := range t.ranges {
		if r.Left < 0 {
			r.Left = 0
		}
		if r.Right > 16383 {
			r.Right = 16383
		}
		t.currRangeIndex = i
		t.currSlot = r.Left
		t.totalKeysInSlot = 0
		for t.currSlot <= r.Right {
			t.streamPub(true)

			// Try to switch state only after a whole slot has been migrated
			// or an error has occurred.
			if t.CurrentState() == StateCancelling {
				t.SetState(StateCancelled)
				t.streamPub(false)
				return
			}
			// Paused: sleep a while and check again.
			if t.CurrentState() == StatePausing {
				t.SetState(StatePaused)
			}
			if t.CurrentState() == StatePaused {
				time.Sleep(100 * time.Millisecond)
				continue
			}

			// Normal operation.
			app := meta.GetAppConfig()
			nkeys, err, key := t.migrateSlot(t.currSlot, app.MigrateKeysEachTime)
			t.totalKeysInSlot += nkeys
			// Check the remaining keys again.
			seed := t.SourceNode()
			remains, err2 := redis.CountKeysInSlot(seed.Addr(), t.currSlot)
			if err2 != nil {
				remains = -1
			}
			if err != nil || remains > 0 {
				log.Warningf(t.TaskName(), "Migrate slot %d error, %d keys done, total %d keys, remains %d keys, %v",
					t.currSlot, nkeys, t.totalKeysInSlot, remains, err)
				if err != nil && strings.HasPrefix(err.Error(), "READONLY") {
					log.Warningf(t.TaskName(), "Migrating across slave nodes. "+
						"Maybe a manual failover just happened, "+
						"if the cluster marks down after this point, "+
						"we need to recover it ourselves using cli commands.")
					t.SetState(StateCancelled)
					goto quit
				} else if err != nil && strings.HasPrefix(err.Error(), "CLUSTERDOWN") {
					log.Warningf(t.TaskName(), "The cluster is down, please check it yourself, migrating task cancelled.")
					t.SetState(StateCancelled)
					goto quit
				} else if err != nil && strings.HasPrefix(err.Error(), "IOERR") {
					log.Warningf(t.TaskName(), "Migrating key:%s timeout", key)
					if timeout_cnt > 10 {
						log.Warningf(t.TaskName(), "Migrating key:%s timeout too frequently, task cancelled", key)
						t.SetState(StateCancelled)
						goto quit
					}
					if prev_key == key {
						timeout_cnt++
					} else {
						timeout_cnt = 0
						prev_key = key
					}
				}
				time.Sleep(500 * time.Millisecond)
			} else {
				log.Infof(t.TaskName(), "Migrate slot %d done, %d keys done, total %d keys, remains %d keys",
					t.currSlot, nkeys, t.totalKeysInSlot, remains)
				t.currSlot++
				t.totalKeysInSlot = 0
			}
		}
	}
	t.currSlot--
	t.SetState(StateDone)
quit:
	t.streamPub(false)
}
/// Slot migration procedure:
/// 1. Mark the master of the Target shard as IMPORTING
/// 2. Mark all nodes of the Source shard as MIGRATING
/// 3. Fetch keys from the Source master and migrate them until the slot is empty
/// 4. Assign slot ownership to Target on the Target slaves
/// 5. Assign slot ownership to Target on the Target master
/// 6. Assign slot ownership to Target on all Source nodes
/// Commands:
/// 1. <Target Master> setslot $slot IMPORTING $sourceId
/// 2. <Source Slaves> setslot $slot MIGRATING $targetId
/// 3. <Source Master> setslot $slot MIGRATING $targetId
/// ... migrating all keys
/// 4. <Target Slaves> setslot $slot node $targetId
/// 5. <Target Master> setslot $slot node $targetId
/// 6. <Source Slaves> setslot $slot node $targetId
/// 7. <Source Master> setslot $slot node $targetId
func (t *MigrateTask) migrateSlot(slot int, keysPer int) (int, error, string) {
	rs := t.SourceReplicaSet()
	sourceNode := t.SourceNode()
	targetNode := t.TargetNode()

	err := redis.SetSlot(targetNode.Addr(), slot, redis.SLOT_IMPORTING, sourceNode.Id)
	if err != nil {
		if strings.HasPrefix(err.Error(), "ERR I'm already the owner of hash slot") {
			log.Warningf(t.TaskName(), "%s already the owner of hash slot %d", targetNode.Id[:6], slot)
			// Reaching here means the Target already owns the slot while the Source
			// is still in MIGRATING state. The migration has in fact finished, so we
			// only need to clean up the MIGRATING state on the Source.
			srs := t.SourceReplicaSet()
			err = SetSlotToNode(srs, slot, targetNode.Id)
			if err != nil {
				return 0, err, ""
			}
			err = SetSlotStable(srs, slot)
			if err != nil {
				return 0, err, ""
			}
			trs := t.TargetReplicaSet()
			err = SetSlotToNode(trs, slot, targetNode.Id)
			if err != nil {
				return 0, err, ""
			}
			err = SetSlotStable(trs, slot)
			return 0, err, ""
		}
		return 0, err, ""
	}

	// Mark every node of the Source shard as MIGRATING, to minimize data
	// inconsistency caused by reads from the slave region.
	for _, node := range rs.AllNodes() {
		err := redis.SetSlot(node.Addr(), slot, redis.SLOT_MIGRATING, targetNode.Id)
		if err != nil {
			if strings.HasPrefix(err.Error(), "ERR I'm not the owner of hash slot") {
				log.Warningf(t.TaskName(), "%s is not the owner of hash slot %d", sourceNode.Id, slot)
				srs := t.SourceReplicaSet()
				err = SetSlotStable(srs, slot)
				if err != nil {
					log.Warningf(t.TaskName(), "Failed to clean MIGRATING state of source server.")
					return 0, err, ""
				}
				trs := t.TargetReplicaSet()
				err = SetSlotStable(trs, slot)
				if err != nil {
					log.Warningf(t.TaskName(), "Failed to clean MIGRATING state of target server.")
					return 0, err, ""
				}
				return 0, fmt.Errorf("mig: %s is not the owner of hash slot %d", sourceNode.Id, slot), ""
			}
			return 0, err, ""
		}
	}

	nkeys := 0
	app := meta.GetAppConfig()
	for {
		keys, err := redis.GetKeysInSlot(sourceNode.Addr(), slot, keysPer)
		if err != nil {
			return nkeys, err, ""
		}
		for _, key := range keys {
			_, err := redis.Migrate(sourceNode.Addr(), targetNode.Ip, targetNode.Port, key, app.MigrateTimeout)
			if err != nil {
				return nkeys, err, key
			}
			nkeys++
		}
		if len(keys) == 0 {
			// Migration done. Wait until the Source slaves have replicated the DELs,
			// i.e. the slot holds no keys on any Source slave.
			slaveSyncDone := true
			srs := t.SourceReplicaSet()
			for _, node := range srs.AllNodes() {
				nkeys, err := redis.CountKeysInSlot(node.Addr(), slot)
				if err != nil {
					return nkeys, err, ""
				}
				if nkeys > 0 {
					slaveSyncDone = false
				}
			}
			if !slaveSyncDone {
				// FIXME
				// The master has finished migrating, but a slave still holds keys in
				// the slot; SETSLOT will make the slave clear the remaining data.
				log.Info(t.TaskName(), "source node not empty, setslot will clear")
				//return nkeys, fmt.Errorf("mig: source nodes not all empty, will retry."), ""
			}

			// Assign the slot to the new owner; this automatically clears the
			// IMPORTING and MIGRATING states. When run against a Source node,
			// Redis ensures the slot holds no remaining keys before reassigning it.
			trs := t.TargetReplicaSet()
			// Set the slaves first, so that if the master dies before the new slot
			// ownership has been broadcast to its slaves, the slot info is not lost.
			for _, node := range trs.Slaves {
				if node.Fail {
					continue
				}
				err = redis.SetSlot(node.Addr(), slot, redis.SLOT_NODE, targetNode.Id)
				if err != nil {
					return nkeys, err, ""
				}
			}
			// This bumps the config epoch and broadcasts it.
			err = redis.SetSlot(trs.Master.Addr(), slot, redis.SLOT_NODE, targetNode.Id)
			if err != nil {
				return nkeys, err, ""
			}
			// Update slot ownership on the remaining nodes.
			for _, rs := range t.cluster.ReplicaSets() {
				if rs.Master.IsStandbyMaster() {
					continue
				}
				err = SetSlotToNode(rs, slot, targetNode.Id)
				if err != nil {
					return nkeys, err, ""
				}
			}
			break
		}
	}
	return nkeys, nil, ""
}
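// Illustration only: the command sequence documented above migrateSlot, issued as raw
// Redis Cluster commands through a plain redigo connection (assumes
// import ("net"; redigo "github.com/gomodule/redigo/redis")). A hedged sketch of the
// underlying protocol, not this repo's redis helper package; it touches only the two
// masters (the real task also marks the Source slaves and assigns the slot on the
// Target/Source slaves), and the batch size, timeout and function name are example values.
func migrateSlotByHand(sourceAddr, sourceId, targetAddr, targetId string, slot int) error {
	src, err := redigo.Dial("tcp", sourceAddr)
	if err != nil {
		return err
	}
	defer src.Close()
	dst, err := redigo.Dial("tcp", targetAddr)
	if err != nil {
		return err
	}
	defer dst.Close()

	// 1. <Target Master> setslot $slot IMPORTING $sourceId
	if _, err := dst.Do("CLUSTER", "SETSLOT", slot, "IMPORTING", sourceId); err != nil {
		return err
	}
	// 2./3. <Source Master> setslot $slot MIGRATING $targetId
	if _, err := src.Do("CLUSTER", "SETSLOT", slot, "MIGRATING", targetId); err != nil {
		return err
	}
	// ... migrate keys in batches until CLUSTER GETKEYSINSLOT returns nothing.
	host, port, err := net.SplitHostPort(targetAddr)
	if err != nil {
		return err
	}
	for {
		keys, err := redigo.Strings(src.Do("CLUSTER", "GETKEYSINSLOT", slot, 100))
		if err != nil {
			return err
		}
		if len(keys) == 0 {
			break
		}
		for _, key := range keys {
			// MIGRATE host port key destination-db timeout(ms)
			if _, err := src.Do("MIGRATE", host, port, key, 0, 60000); err != nil {
				return err
			}
		}
	}
	// 4.-7. Assign the slot to the Target; this clears IMPORTING/MIGRATING state.
	if _, err := dst.Do("CLUSTER", "SETSLOT", slot, "NODE", targetId); err != nil {
		return err
	}
	_, err = src.Do("CLUSTER", "SETSLOT", slot, "NODE", targetId)
	return err
}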
	}
)

/// Constraints
var (
	SlaveAutoFailoverConstraint = func(i interface{}) bool {
		ctx := i.(StateContext)
		cs := ctx.ClusterState
		ns := ctx.NodeState

		rs := cs.FindReplicaSetByNode(ns.Id())
		if rs == nil {
			return false
		}
		app := meta.GetAppConfig()
		if app.SlaveFailoverLimit {
			// At least one other node must remain in the local region.
			localRegionNodes := rs.RegionNodes(ns.node.Region)
			if len(localRegionNodes) < 2 {
				return false
			}
			// At most one failed node (FAIL or not in the Running state).
			for _, node := range localRegionNodes {
				if node.Id == ns.Id() {
					continue
				}
				nodeState := cs.FindNodeState(node.Id)
				if node.Fail || nodeState.CurrentState() != StateRunning {
					return false
				}