Example #1
0
// checkMasterRole reports whether the Redis instance behind node describes
// itself as a master in the "replication" section of its INFO output.
//
// The ismaster argument is not consulted by this function; it is kept only so
// that existing callers continue to compile.
//
// A non-nil error is returned when the INFO section cannot be fetched. The
// error keeps the original "Connect <addr> failed" prefix and wraps the
// underlying cause so callers can inspect it with errors.Is / errors.As.
func checkMasterRole(node *topo.Node, ismaster bool) (bool, error) {
	addr := node.Addr()
	info, err := redis.FetchInfo(addr, "replication")
	if err != nil {
		// Keep the root cause instead of replacing it with a generic message.
		return false, fmt.Errorf("Connect %s failed: %w", addr, err)
	}
	return info.Get("role") == "master", nil
}
Example #2
0
// checkSlaveRepliStatusOk reports whether node looks like a replica whose
// replication is usable, judged from its full INFO output.
//
// It returns false when the instance reports role "master", when its
// master_link_status is "down", or when its loading flag is "1"; otherwise it
// returns true. An error from fetching INFO is returned unchanged.
func checkSlaveRepliStatusOk(node *topo.Node) (bool, error) {
	info, err := redis.FetchInfo(node.Addr(), "all")
	if err != nil {
		return false, err
	}

	// Cases are evaluated top to bottom; the first match decides the result.
	switch {
	case info.Get("role") == "master":
		return false, nil
	case info.Get("master_link_status") == "down":
		return false, nil
	case info.Get("loading") == "1":
		return false, nil
	}
	return true, nil
}
Example #3
0
// fetchReplOffset returns the replication offset reported by the Redis
// instance at addr, read from the "Replication" INFO section.
//
// When the instance reports role "master" the value comes from
// master_repl_offset, otherwise from slave_repl_offset. It returns -1 on any
// failure: INFO could not be fetched, or the field could not be read as an
// int64.
func fetchReplOffset(addr string) int64 {
	info, err := redis.FetchInfo(addr, "Replication")
	if err != nil {
		return -1
	}

	// Pick the offset field that matches the role the instance reports.
	field := "slave_repl_offset"
	if info.Get("role") == "master" {
		field = "master_repl_offset"
	}

	offset, err := info.GetInt64(field)
	if err != nil {
		return -1
	}
	return offset
}
Example #4
0
// RunFailoverTask asks the node identified by newMasterId to take over from
// the failed master identified by oldMasterId, then checks whether the new
// node actually reports itself as a master.
//
// Steps, in order:
//  1. Look up both node states. If the old master is unknown the task is
//     abandoned; if the new master is unknown the old master's FSM is advanced
//     with CMD_FAILOVER_END_SIGNAL and the task returns.
//  2. Disable read and write for the old master id through the new master.
//  3. Issue the failover request in a goroutine and wait for it, giving up
//     after 20 minutes.
//  4. Re-read the new master node and, if it is not yet a master, poll its
//     "Replication" INFO up to 10 times, 5 seconds apart.
//  5. Log the outcome, advance the old master's FSM with
//     CMD_FAILOVER_END_SIGNAL and enable write for the new master.
//
// The function blocks for the whole duration and reports results only through
// the log; it has no return value.
func (cs *ClusterState) RunFailoverTask(oldMasterId, newMasterId string) {
	const (
		failoverTimeout   = 20 * time.Minute
		roleCheckAttempts = 10
		roleCheckInterval = 5 * time.Second
	)

	newMaster := cs.FindNodeState(newMasterId)
	oldMaster := cs.FindNodeState(oldMasterId)

	if oldMaster == nil {
		log.Warningf(oldMasterId, "Can't run failover task, the old dead master lost")
		return
	}
	if newMaster == nil {
		log.Warningf(oldMasterId, "Can't run failover task, new master lost (%s)", newMasterId)
		oldMaster.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)
		return
	}

	// Broadcast the read/write disabling of the old master via the new master.
	redis.DisableRead(newMaster.Addr(), oldMaster.Id())
	redis.DisableWrite(newMaster.Addr(), oldMaster.Id())

	// Buffered so the goroutine can always deliver its result and exit, even
	// if the select below has already given up on the timeout.
	done := make(chan error, 1)

	go func() {
		// Choose between "failover takeover" and "failover force": takeover is
		// used when the cluster has an arbiter or is down, force otherwise.
		cluster := cs.cluster
		rs := cluster.FindReplicaSetByNode(oldMaster.Id())
		takeover := cluster.HasArbiter() || cluster.IsClusterDown()
		done <- redis.SetAsMasterWaitSyncDone(newMaster.Addr(), true, takeover, rs)
	}()

	select {
	case err := <-done:
		if err != nil {
			log.Eventf(oldMaster.Addr(), "Failover request done with error(%v).", err)
		} else {
			log.Eventf(oldMaster.Addr(), "Failover request done, new master %s(%s).", newMaster.Id(), newMaster.Addr())
		}
	case <-time.After(failoverTimeout):
		log.Eventf(oldMaster.Addr(), "Failover timedout, new master %s(%s)", newMaster.Id(), newMaster.Addr())
	}

	// Re-read the node, because it may have been updated in the meantime.
	roleChanged := false
	node := cs.FindNode(newMasterId)
	switch {
	case node == nil:
		// The lookup can come back empty after the (possibly long) wait above.
		// Treat that as an unconfirmed failover instead of dereferencing nil,
		// so the FSM advance and write re-enabling below still run.
		log.Warningf(oldMaster.Addr(),
			"New master %s(%s) can no longer be found, unable to verify its role.",
			newMaster.Id(), newMaster.Addr())
	case node.IsMaster():
		roleChanged = true
	default:
		for i := 0; i < roleCheckAttempts; i++ {
			info, err := redis.FetchInfo(node.Addr(), "Replication")
			if err == nil && info.Get("role") == "master" {
				roleChanged = true
				break
			}
			log.Warningf(oldMaster.Addr(),
				"Role of new master %s(%s) has not yet changed, will check 5 seconds later.",
				newMaster.Id(), newMaster.Addr())
			time.Sleep(roleCheckInterval)
		}
	}

	if roleChanged {
		// node is non-nil here: roleChanged is only set in branches that used it.
		log.Eventf(oldMaster.Addr(), "New master %s(%s) role change success", node.Id, node.Addr())
		// Handle anomalies left over from migration: slots still attributed to
		// the failed node (the old master) must end up on the new master.
		oldNode := cs.FindNode(oldMasterId)
		if oldNode != nil && oldNode.Fail && oldNode.IsMaster() && len(oldNode.Ranges) != 0 {
			log.Warningf(oldMaster.Addr(),
				"Some node carries slots info(%v) about the old master, waiting for MigrateManager to fix it.",
				oldNode.Ranges)
		} else {
			log.Info(oldMaster.Addr(), "Good, no slot need to be fix after failover.")
		}
	} else {
		log.Warningf(oldMaster.Addr(), "Failover failed, please check cluster state.")
		log.Warningf(oldMaster.Addr(), "The dead master will goto OFFLINE state and then goto WAIT_FAILOVER_BEGIN state to try failover again.")
	}

	oldMaster.AdvanceFSM(cs, CMD_FAILOVER_END_SIGNAL)

	// Re-enable writes on the new master. Enabling write on a slave has no
	// effect, so this does no harm even when the failover failed.
	redis.EnableWrite(newMaster.Addr(), newMaster.Id())
}