func (self *SetAsMasterCommand) Execute(c *cc.Controller) (cc.Result, error) { cs := c.ClusterState node := cs.FindNode(self.NodeId) if node == nil { return nil, ErrNodeNotExist } if node.IsMaster() { return nil, ErrNodeIsMaster } mm := c.MigrateManager if len(mm.AllTasks()) > 0 { return nil, fmt.Errorf("Migrate task exists, cancel task to continue.") } //FIXME rs := cs.FindReplicaSetByNode(self.NodeId) _, err := redis.ClusterFailover(node.Addr(), rs) if err != nil { return nil, err } for _, ns := range cs.AllNodeStates() { _, err = redis.EnableWrite(ns.Addr(), self.NodeId) if err == nil { return nil, nil } } return nil, err }
func (self *UpdateRegionCommand) Execute(c *cc.Controller) (cc.Result, error) { if len(self.Nodes) == 0 { return nil, nil } // 更新Cluster拓扑 cs := c.ClusterState cs.UpdateRegionNodes(self.Region, self.Nodes) // 首先更新迁移任务状态,以便发现故障时,在处理故障之前就暂停迁移任务 cluster := cs.GetClusterSnapshot() if cluster != nil { mm := c.MigrateManager mm.HandleNodeStateChange(cluster) } for _, ns := range cs.AllNodeStates() { node := ns.Node() // Slave auto enable read ? if !node.IsMaster() && !node.Fail && !node.Readable && node.MasterLinkStatus == "up" { if meta.GetAppConfig().AutoEnableSlaveRead { redis.EnableRead(node.Addr(), node.Id) } } // Master auto enable write ? if node.IsMaster() && !node.Fail && !node.Writable { if meta.GetAppConfig().AutoEnableMasterWrite { redis.EnableWrite(node.Addr(), node.Id) } } // Fix chained replication: slave's parent is slave. if meta.LocalRegion() == self.Region && !node.IsMaster() { parent := cs.FindNode(node.ParentId) // Parent is not master? if parent != nil && !parent.IsMaster() { grandpa := cs.FindNode(parent.ParentId) if grandpa != nil { _, err := redis.ClusterReplicate(node.Addr(), grandpa.Id) if err == nil { log.Warningf(node.Addr(), "Fix chained replication, (%s->%s->%s)=>(%s->%s)", node, parent, grandpa, node, grandpa) } } else { log.Warningf(node.Addr(), "Found chained replication, (%s->%s->nil), cannot fix.", node, parent) } } } // 更新Region内Node的状态机 ns.AdvanceFSM(cs, state.CMD_NONE) } return nil, nil }
func (self *EnableWriteCommand) Execute(c *cc.Controller) (cc.Result, error) { cs := c.ClusterState target := cs.FindNode(self.NodeId) if target == nil { return nil, ErrNodeNotExist } if target.Fail { return nil, ErrNodeIsDead } var err error for _, ns := range cs.AllNodeStates() { _, err = redis.EnableWrite(ns.Addr(), target.Id) if err == nil { return nil, nil } } return nil, err }
func (cs *ClusterState) RunFailoverTask(oldMasterId, newMasterId string) { new := cs.FindNodeState(newMasterId) old := cs.FindNodeState(oldMasterId) if old == nil { log.Warningf(oldMasterId, "Can't run failover task, the old dead master lost") return } if new == nil { log.Warningf(oldMasterId, "Can't run failover task, new master lost (%s)", newMasterId) old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL) return } // 通过新主广播消息 redis.DisableRead(new.Addr(), old.Id()) redis.DisableWrite(new.Addr(), old.Id()) c := make(chan error, 1) go func() { //choose failover force or takeover in case of arbiter cluster := cs.cluster rs := cluster.FindReplicaSetByNode(old.Id()) if cluster.HasArbiter() || cluster.IsClusterDown() { //use failover takeover c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, true, rs) } else { //use failover force c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, false, rs) } }() select { case err := <-c: if err != nil { log.Eventf(old.Addr(), "Failover request done with error(%v).", err) } else { log.Eventf(old.Addr(), "Failover request done, new master %s(%s).", new.Id(), new.Addr()) } case <-time.After(20 * time.Minute): log.Eventf(old.Addr(), "Failover timedout, new master %s(%s)", new.Id(), new.Addr()) } // 重新读取一次,因为可能已经更新了 roleChanged := false node := cs.FindNode(newMasterId) if node.IsMaster() { roleChanged = true } else { for i := 0; i < 10; i++ { info, err := redis.FetchInfo(node.Addr(), "Replication") if err == nil && info.Get("role") == "master" { roleChanged = true break } log.Warningf(old.Addr(), "Role of new master %s(%s) has not yet changed, will check 5 seconds later.", new.Id(), new.Addr()) time.Sleep(5 * time.Second) } } if roleChanged { log.Eventf(old.Addr(), "New master %s(%s) role change success", node.Id, node.Addr()) // 处理迁移过程中的异常问题,将故障节点(旧主)的slots转移到新主上 oldNode := cs.FindNode(oldMasterId) if oldNode != nil && oldNode.Fail && oldNode.IsMaster() && len(oldNode.Ranges) != 0 { log.Warningf(old.Addr(), "Some node carries slots info(%v) about the old master, waiting for MigrateManager to fix it.", oldNode.Ranges) } else { log.Info(old.Addr(), "Good, no slot need to be fix after failover.") } } else { log.Warningf(old.Addr(), "Failover failed, please check cluster state.") log.Warningf(old.Addr(), "The dead master will goto OFFLINE state and then goto WAIT_FAILOVER_BEGIN state to try failover again.") } old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL) // 打开新主的写入,因为给slave加Write没有效果 // 所以即便Failover失败,也不会产生错误 redis.EnableWrite(new.Addr(), new.Id()) }