Example #1
0
func (self *SetAsMasterCommand) Execute(c *cc.Controller) (cc.Result, error) {
	cs := c.ClusterState
	node := cs.FindNode(self.NodeId)
	if node == nil {
		return nil, ErrNodeNotExist
	}
	if node.IsMaster() {
		return nil, ErrNodeIsMaster
	}
	mm := c.MigrateManager
	if len(mm.AllTasks()) > 0 {
		return nil, fmt.Errorf("Migrate task exists, cancel task to continue.")
	}
	//FIXME
	rs := cs.FindReplicaSetByNode(self.NodeId)

	_, err := redis.ClusterFailover(node.Addr(), rs)
	if err != nil {
		return nil, err
	}
	for _, ns := range cs.AllNodeStates() {
		_, err = redis.EnableWrite(ns.Addr(), self.NodeId)
		if err == nil {
			return nil, nil
		}
	}
	return nil, err
}
Example #2
0
func (self *UpdateRegionCommand) Execute(c *cc.Controller) (cc.Result, error) {
	if len(self.Nodes) == 0 {
		return nil, nil
	}

	// 更新Cluster拓扑
	cs := c.ClusterState
	cs.UpdateRegionNodes(self.Region, self.Nodes)

	// 首先更新迁移任务状态,以便发现故障时,在处理故障之前就暂停迁移任务
	cluster := cs.GetClusterSnapshot()
	if cluster != nil {
		mm := c.MigrateManager
		mm.HandleNodeStateChange(cluster)
	}

	for _, ns := range cs.AllNodeStates() {
		node := ns.Node()
		// Slave auto enable read ?
		if !node.IsMaster() && !node.Fail && !node.Readable && node.MasterLinkStatus == "up" {
			if meta.GetAppConfig().AutoEnableSlaveRead {
				redis.EnableRead(node.Addr(), node.Id)
			}
		}
		// Master auto enable write ?
		if node.IsMaster() && !node.Fail && !node.Writable {
			if meta.GetAppConfig().AutoEnableMasterWrite {
				redis.EnableWrite(node.Addr(), node.Id)
			}
		}
		// Fix chained replication: slave's parent is slave.
		if meta.LocalRegion() == self.Region && !node.IsMaster() {
			parent := cs.FindNode(node.ParentId)
			// Parent is not master?
			if parent != nil && !parent.IsMaster() {
				grandpa := cs.FindNode(parent.ParentId)
				if grandpa != nil {
					_, err := redis.ClusterReplicate(node.Addr(), grandpa.Id)
					if err == nil {
						log.Warningf(node.Addr(), "Fix chained replication, (%s->%s->%s)=>(%s->%s)",
							node, parent, grandpa, node, grandpa)
					}
				} else {
					log.Warningf(node.Addr(), "Found chained replication, (%s->%s->nil), cannot fix.",
						node, parent)
				}
			}
		}

		// 更新Region内Node的状态机
		ns.AdvanceFSM(cs, state.CMD_NONE)
	}

	return nil, nil
}
Example #3
0
func (self *EnableWriteCommand) Execute(c *cc.Controller) (cc.Result, error) {
	cs := c.ClusterState
	target := cs.FindNode(self.NodeId)
	if target == nil {
		return nil, ErrNodeNotExist
	}
	if target.Fail {
		return nil, ErrNodeIsDead
	}
	var err error
	for _, ns := range cs.AllNodeStates() {
		_, err = redis.EnableWrite(ns.Addr(), target.Id)
		if err == nil {
			return nil, nil
		}
	}
	return nil, err
}
Example #4
0
func (cs *ClusterState) RunFailoverTask(oldMasterId, newMasterId string) {
	new := cs.FindNodeState(newMasterId)
	old := cs.FindNodeState(oldMasterId)

	if old == nil {
		log.Warningf(oldMasterId, "Can't run failover task, the old dead master lost")
		return
	}
	if new == nil {
		log.Warningf(oldMasterId, "Can't run failover task, new master lost (%s)", newMasterId)
		old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)
		return
	}

	// 通过新主广播消息
	redis.DisableRead(new.Addr(), old.Id())
	redis.DisableWrite(new.Addr(), old.Id())

	c := make(chan error, 1)

	go func() {
		//choose failover force or takeover in case of arbiter
		cluster := cs.cluster
		rs := cluster.FindReplicaSetByNode(old.Id())
		if cluster.HasArbiter() || cluster.IsClusterDown() {
			//use failover takeover
			c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, true, rs)
		} else {
			//use failover force
			c <- redis.SetAsMasterWaitSyncDone(new.Addr(), true, false, rs)
		}
	}()

	select {
	case err := <-c:
		if err != nil {
			log.Eventf(old.Addr(), "Failover request done with error(%v).", err)
		} else {
			log.Eventf(old.Addr(), "Failover request done, new master %s(%s).", new.Id(), new.Addr())
		}
	case <-time.After(20 * time.Minute):
		log.Eventf(old.Addr(), "Failover timedout, new master %s(%s)", new.Id(), new.Addr())
	}

	// 重新读取一次,因为可能已经更新了
	roleChanged := false
	node := cs.FindNode(newMasterId)
	if node.IsMaster() {
		roleChanged = true
	} else {
		for i := 0; i < 10; i++ {
			info, err := redis.FetchInfo(node.Addr(), "Replication")
			if err == nil && info.Get("role") == "master" {
				roleChanged = true
				break
			}
			log.Warningf(old.Addr(),
				"Role of new master %s(%s) has not yet changed, will check 5 seconds later.",
				new.Id(), new.Addr())
			time.Sleep(5 * time.Second)
		}
	}

	if roleChanged {
		log.Eventf(old.Addr(), "New master %s(%s) role change success", node.Id, node.Addr())
		// 处理迁移过程中的异常问题,将故障节点(旧主)的slots转移到新主上
		oldNode := cs.FindNode(oldMasterId)
		if oldNode != nil && oldNode.Fail && oldNode.IsMaster() && len(oldNode.Ranges) != 0 {
			log.Warningf(old.Addr(),
				"Some node carries slots info(%v) about the old master, waiting for MigrateManager to fix it.",
				oldNode.Ranges)
		} else {
			log.Info(old.Addr(), "Good, no slot need to be fix after failover.")
		}
	} else {
		log.Warningf(old.Addr(), "Failover failed, please check cluster state.")
		log.Warningf(old.Addr(), "The dead master will goto OFFLINE state and then goto WAIT_FAILOVER_BEGIN state to try failover again.")
	}

	old.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)

	// 打开新主的写入,因为给slave加Write没有效果
	// 所以即便Failover失败,也不会产生错误
	redis.EnableWrite(new.Addr(), new.Id())
}