func (self *DisableWriteCommand) Execute(c *cc.Controller) (cc.Result, error) { cs := c.ClusterState target := cs.FindNode(self.NodeId) if target == nil { return nil, ErrNodeNotExist } if target.Fail { return nil, ErrNodeIsDead } var err error for _, ns := range cs.AllNodeStates() { _, err = redis.DisableWrite(ns.Addr(), target.Id) if err == nil { return nil, nil } } return nil, err }
func (cs *ClusterState) RunFailoverTask(oldMasterId, newMasterId string) { new := cs.FindNodeState(newMasterId) old := cs.FindNodeState(oldMasterId) if old == nil { log.Warningf(oldMasterId, "Can't run failover task, the old dead master lost") return } if new == nil { log.Warningf(oldMasterId, "Can't run failover task, new master lost (%s)", newMasterId) old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL) return } // 通过新主广播消息 redis.DisableRead(new.Addr(), old.Id()) redis.DisableWrite(new.Addr(), old.Id()) c := make(chan error, 1) go func() { //choose failover force or takeover in case of arbiter cluster := cs.cluster rs := cluster.FindReplicaSetByNode(old.Id()) if cluster.HasArbiter() || cluster.IsClusterDown() { //use failover takeover c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, true, rs) } else { //use failover force c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, false, rs) } }() select { case err := <-c: if err != nil { log.Eventf(old.Addr(), "Failover request done with error(%v).", err) } else { log.Eventf(old.Addr(), "Failover request done, new master %s(%s).", new.Id(), new.Addr()) } case <-time.After(20 * time.Minute): log.Eventf(old.Addr(), "Failover timedout, new master %s(%s)", new.Id(), new.Addr()) } // 重新读取一次,因为可能已经更新了 roleChanged := false node := cs.FindNode(newMasterId) if node.IsMaster() { roleChanged = true } else { for i := 0; i < 10; i++ { info, err := redis.FetchInfo(node.Addr(), "Replication") if err == nil && info.Get("role") == "master" { roleChanged = true break } log.Warningf(old.Addr(), "Role of new master %s(%s) has not yet changed, will check 5 seconds later.", new.Id(), new.Addr()) time.Sleep(5 * time.Second) } } if roleChanged { log.Eventf(old.Addr(), "New master %s(%s) role change success", node.Id, node.Addr()) // 处理迁移过程中的异常问题,将故障节点(旧主)的slots转移到新主上 oldNode := cs.FindNode(oldMasterId) if oldNode != nil && oldNode.Fail && oldNode.IsMaster() && len(oldNode.Ranges) != 0 { log.Warningf(old.Addr(), "Some node carries slots info(%v) about the old master, waiting for MigrateManager to fix it.", oldNode.Ranges) } else { log.Info(old.Addr(), "Good, no slot need to be fix after failover.") } } else { log.Warningf(old.Addr(), "Failover failed, please check cluster state.") log.Warningf(old.Addr(), "The dead master will goto OFFLINE state and then goto WAIT_FAILOVER_BEGIN state to try failover again.") } old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL) // 打开新主的写入,因为给slave加Write没有效果 // 所以即便Failover失败,也不会产生错误 redis.EnableWrite(new.Addr(), new.Id()) }
} else { go cs.RunFailoverTask(ns.Id(), masterId) } } MasterGotoOfflineHandler = func(i interface{}) { ctx := i.(StateContext) cs := ctx.ClusterState ns := ctx.NodeState for _, n := range cs.AllNodeStates() { resp, err := redis.DisableRead(n.Addr(), ns.Id()) if err == nil { log.Infof(ns.Addr(), "Disable read of the already dead master: %s %s", resp, ns.Id()) } resp, err = redis.DisableWrite(n.Addr(), ns.Id()) if err == nil { log.Infof(ns.Addr(), "Disable read of the already dead master: %s %s", resp, ns.Id()) break } } } ) var ( RedisNodeStateModel = fsm.NewStateModel() ) func init() { RedisNodeStateModel.AddState(RunningState) RedisNodeStateModel.AddState(WaitFailoverBeginState)