func testNsqLookupNsqdNodesChange(t *testing.T, useFakeLeadership bool) { if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_INFO) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4", "id5"} lookupCoord1, nodeInfoList := prepareCluster(t, idList, useFakeLeadership) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } topic := "test-nsqlookup-topic-unit-test" lookupLeadership := lookupCoord1.leadership lookupCoord1.DeleteTopic(topic, "**") topic3 := topic + topic lookupCoord1.DeleteTopic(topic3, "**") time.Sleep(time.Second) defer func() { lookupCoord1.DeleteTopic(topic, "**") lookupCoord1.DeleteTopic(topic3, "**") time.Sleep(time.Second * 3) lookupCoord1.Stop() }() // test new topic create err := lookupCoord1.CreateTopic(topic, TopicMetaInfo{2, 2, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*3) pmeta, _, err := lookupLeadership.GetTopicMetaInfo(topic) pn := pmeta.PartitionNum test.Nil(t, err) test.Equal(t, pn, 2) t0, err := lookupLeadership.GetTopicInfo(topic, 0) test.Nil(t, err) t1, err := lookupLeadership.GetTopicInfo(topic, 1) test.Nil(t, err) test.Equal(t, len(t0.ISR), 2) test.Equal(t, len(t1.ISR), 2) t.Log(t0) t.Log(t1) test.NotEqual(t, t0.Leader, t1.Leader) t0LeaderCoord := nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr := t0LeaderCoord.getTopicCoord(topic, 0) test.Nil(t, coordErr) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) test.Equal(t, len(tc0.topicInfo.ISR), 2) t1LeaderCoord := nodeInfoList[t1.Leader].nsqdCoord test.NotNil(t, t1LeaderCoord) tc1, coordErr := t1LeaderCoord.getTopicCoord(topic, 1) test.Nil(t, coordErr) test.Equal(t, tc1.topicInfo.Leader, t1.Leader) test.Equal(t, len(tc1.topicInfo.ISR), 2) coordLog.Warningf("============= begin test isr node failed ====") // test isr node lost lostNodeID := t0.ISR[1] atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 1) nodeInfoList[lostNodeID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic, 0) if len(t0.ISR) < t0.Replica { waitClusterStable(lookupCoord1, time.Second*3) } t0, err = lookupLeadership.GetTopicInfo(topic, 0) test.Nil(t, err) test.Equal(t, FindSlice(t0.ISR, lostNodeID) == -1, true) test.Equal(t, len(t0.ISR), t0.Replica) test.Equal(t, t0.Leader, t0.ISR[0]) // clear topic info on failed node, test the reload for failed node nodeInfoList[lostNodeID].nsqdCoord.topicCoords = make(map[string]map[int]*TopicCoordinator) // test new catchup and new isr atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 0) nodeInfoList[lostNodeID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) if len(t0.ISR) < t0.Replica { waitClusterStable(lookupCoord1, time.Second*3) } t0, _ = lookupLeadership.GetTopicInfo(topic, 0) test.Equal(t, len(t0.CatchupList), 0) test.Equal(t, len(t0.ISR) >= t0.Replica, true) test.Equal(t, len(tc0.topicInfo.ISR), len(t0.ISR)) test.Equal(t, t0.Leader, t0.ISR[0]) lookupCoord1.triggerCheckTopics("", 0, time.Second) time.Sleep(time.Second) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) // should remove the unnecessary node test.Equal(t, len(t0.ISR), t0.Replica) coordLog.Warningf("============= begin test leader failed ====") // test leader node lost lostNodeID = t0.Leader atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 1) nodeInfoList[lostNodeID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) if len(t0.ISR) < t0.Replica { waitClusterStable(lookupCoord1, time.Second*3) } t0, _ = lookupLeadership.GetTopicInfo(topic, 0) t.Log(t0) test.Equal(t, t0.Replica, len(t0.ISR)) test.Equal(t, t0.Leader, t0.ISR[0]) test.NotEqual(t, t0.Leader, lostNodeID) //test.Equal(t, len(t0.CatchupList), 1) test.Equal(t, FindSlice(t0.ISR, lostNodeID) == -1, true) t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic, 0) test.Nil(t, coordErr) test.Equal(t, len(tc0.topicInfo.ISR), len(t0.ISR)) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) // test lost leader node rejoin atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 0) nodeInfoList[lostNodeID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) if len(t0.ISR) < t0.Replica { waitClusterStable(lookupCoord1, time.Second*3) } t0, _ = lookupLeadership.GetTopicInfo(topic, 0) t.Log(t0) test.Equal(t, len(t0.CatchupList), 0) test.Equal(t, len(t0.ISR) >= t0.Replica, true) t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic, 0) test.Nil(t, coordErr) test.Equal(t, len(tc0.topicInfo.ISR), len(t0.ISR)) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) waitClusterStable(lookupCoord1, time.Second*3) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) // should remove the unnecessary node test.Equal(t, len(t0.ISR), t0.Replica) // test old leader failed and begin elect new and then new leader failed coordLog.Warningf("============= begin test old leader failed and then new leader failed ====") lostNodeID = t0.Leader lostISRID := t0.ISR[1] if lostISRID == lostNodeID { lostISRID = t0.ISR[0] } atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 1) nodeInfoList[lostNodeID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostNodeID].nodeInfo) time.Sleep(time.Millisecond) atomic.StoreInt32(&nodeInfoList[lostISRID].nsqdCoord.stopping, 1) nodeInfoList[lostISRID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostISRID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 0) atomic.StoreInt32(&nodeInfoList[lostISRID].nsqdCoord.stopping, 0) nodeInfoList[lostNodeID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostNodeID].nodeInfo) nodeInfoList[lostISRID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostISRID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*3) waitClusterStable(lookupCoord1, time.Second*5) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) test.Equal(t, true, len(t0.ISR) >= t0.Replica) test.Equal(t, t0.Leader == t0.ISR[0] || t0.Leader == t0.ISR[1], true) t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic, 0) test.Nil(t, coordErr) test.Equal(t, len(tc0.topicInfo.ISR), len(t0.ISR)) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) waitClusterStable(lookupCoord1, time.Second*5) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) // should remove the unnecessary node test.Equal(t, t0.Replica, len(t0.ISR)) // test join isr timeout lostNodeID = t1.ISR[1] atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 1) nodeInfoList[lostNodeID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*5) atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 0) nodeInfoList[lostNodeID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*5) // with only 2 replica, the isr join fail should not change the isr list nodeInfoList[lostNodeID].nsqdCoord.rpcServer.toggleDisableRpcTest(true) waitClusterStable(lookupCoord1, time.Second*10) t1, _ = lookupLeadership.GetTopicInfo(topic, 1) test.Equal(t, true, len(t1.ISR)+len(t1.CatchupList) >= t1.Replica) test.Equal(t, t1.Leader == t1.ISR[0] || t1.Leader == t1.ISR[1], true) nodeInfoList[lostNodeID].nsqdCoord.rpcServer.toggleDisableRpcTest(false) waitClusterStable(lookupCoord1, time.Second*5) // test new topic create coordLog.Warningf("============= begin test 3 replicas ====") err = lookupCoord1.CreateTopic(topic3, TopicMetaInfo{1, 3, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*5) // with 3 replica, the isr join timeout will change the isr list if the isr has the quorum nodes t3, err := lookupLeadership.GetTopicInfo(topic3, 0) test.Nil(t, err) test.Equal(t, len(t3.ISR), t3.Replica) lostNodeID = t3.ISR[1] atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 1) nodeInfoList[lostNodeID].nsqdCoord.leadership.UnregisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*5) atomic.StoreInt32(&nodeInfoList[lostNodeID].nsqdCoord.stopping, 0) nodeInfoList[lostNodeID].nsqdCoord.leadership.RegisterNsqd(nodeInfoList[lostNodeID].nodeInfo) waitClusterStable(lookupCoord1, time.Second*5) nodeInfoList[lostNodeID].nsqdCoord.rpcServer.toggleDisableRpcTest(true) waitClusterStable(lookupCoord1, time.Second*5) t3, _ = lookupLeadership.GetTopicInfo(topic3, 0) test.Equal(t, true, len(t3.ISR) >= t3.Replica-1) test.Equal(t, true, len(t3.ISR) <= t3.Replica) test.Equal(t, t3.Leader == t3.ISR[0] || t3.Leader == t3.ISR[1], true) nodeInfoList[lostNodeID].nsqdCoord.rpcServer.toggleDisableRpcTest(false) waitClusterStable(lookupCoord1, time.Second*5) glog.Flush() t0, _ = lookupLeadership.GetTopicInfo(topic, 0) test.Equal(t, true, len(t0.ISR) >= t0.Replica) t1, _ = lookupLeadership.GetTopicInfo(topic, 1) test.Equal(t, true, len(t1.ISR) >= t0.Replica) // before migrate really start, the isr should not reach the replica factor // however, catch up may start early while check leadership or enable topic write t3, _ = lookupLeadership.GetTopicInfo(topic3, 0) test.Equal(t, true, len(t3.ISR)+len(t3.CatchupList) >= t3.Replica) t0IsrNum := 2 t1IsrNum := 2 coordLog.Warningf("========== begin test quit ====") quitList := make([]*NsqdCoordinator, 0) quitList = append(quitList, nodeInfoList[t0.Leader].nsqdCoord) if t1.Leader != t0.Leader { quitList = append(quitList, nodeInfoList[t1.Leader].nsqdCoord) } if t3.Leader != t0.Leader && t3.Leader != t1.Leader { quitList = append(quitList, nodeInfoList[t3.Leader].nsqdCoord) } for id, n := range nodeInfoList { if id == t0.Leader || id == t1.Leader || id == t3.Leader { continue } quitList = append(quitList, n.nsqdCoord) } test.Equal(t, len(nodeInfoList), len(quitList)) for _, nsqdCoord := range quitList { failedID := nsqdCoord.myNode.GetID() delete(nodeInfoList, failedID) nsqdCoord.Stop() if t0IsrNum > 1 { if FindSlice(t0.ISR, failedID) != -1 { t0IsrNum-- } } if t1IsrNum > 1 { if FindSlice(t1.ISR, failedID) != -1 { t1IsrNum-- } } waitClusterStable(lookupCoord1, time.Second*5) t0, _ = lookupLeadership.GetTopicInfo(topic, 0) // we have no failed node in isr or we got the last failed node leaving in isr. t.Log(t0) test.Equal(t, FindSlice(t0.ISR, failedID) == -1 || (len(t0.ISR) == 1 && t0.ISR[0] == failedID), true) test.Equal(t, true, len(t0.ISR) >= t0IsrNum) t1, _ = lookupLeadership.GetTopicInfo(topic, 1) t.Log(t1) test.Equal(t, FindSlice(t1.ISR, failedID) == -1 || (len(t1.ISR) == 1 && t1.ISR[0] == failedID), true) test.Equal(t, true, len(t1.ISR) >= t1IsrNum) t3, _ = lookupLeadership.GetTopicInfo(topic3, 0) t.Log(t3) test.Equal(t, FindSlice(t3.ISR, failedID) == -1 || (len(t3.ISR) == 1 && t3.ISR[0] == failedID), true) } }
func TestNsqLookupNsqdCreateTopic(t *testing.T) { // on 4 nodes, we should test follow cases // 1 partition 1 replica // 1 partition 3 replica // 3 partition 1 replica // 2 partition 2 replica if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_WARN) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4"} lookupCoord1, nodeInfoList := prepareCluster(t, idList, false) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } test.Equal(t, 4, len(nodeInfoList)) topic_p1_r1 := "test-nsqlookup-topic-unit-testcreate-p1-r1" topic_p1_r3 := "test-nsqlookup-topic-unit-testcreate-p1-r3" topic_p3_r1 := "test-nsqlookup-topic-unit-testcreate-p3-r1" topic_p2_r2 := "test-nsqlookup-topic-unit-testcreate-p2-r2" lookupLeadership := lookupCoord1.leadership time.Sleep(time.Second) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p1_r3, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p3_r1, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p2_r2, "**")) time.Sleep(time.Second * 3) defer func() { checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p1_r3, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p3_r1, "**")) checkDeleteErr(t, lookupCoord1.DeleteTopic(topic_p2_r2, "**")) time.Sleep(time.Second * 3) lookupCoord1.Stop() }() // test new topic create err := lookupCoord1.CreateTopic(topic_p1_r1, TopicMetaInfo{1, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*3) pmeta, _, err := lookupLeadership.GetTopicMetaInfo(topic_p1_r1) pn := pmeta.PartitionNum test.Nil(t, err) test.Equal(t, pn, 1) t0, err := lookupLeadership.GetTopicInfo(topic_p1_r1, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 1) t.Logf("t0 leader is: %v", t0.Leader) if nodeInfoList[t0.Leader] == nil { t.Fatalf("no leader: %v, %v", t0, nodeInfoList) } t0LeaderCoord := nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr := t0LeaderCoord.getTopicCoord(topic_p1_r1, 0) test.Nil(t, coordErr) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) test.Equal(t, len(tc0.topicInfo.ISR), 1) err = lookupCoord1.CreateTopic(topic_p1_r3, TopicMetaInfo{1, 3, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*5) lookupCoord1.triggerCheckTopics("", 0, 0) waitClusterStable(lookupCoord1, time.Second*5) pmeta, _, err = lookupLeadership.GetTopicMetaInfo(topic_p1_r3) pn = pmeta.PartitionNum test.Nil(t, err) test.Equal(t, pn, 1) t0, err = lookupLeadership.GetTopicInfo(topic_p1_r3, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 3) t.Logf("t0 leader is: %v", t0.Leader) if nodeInfoList[t0.Leader] == nil { t.Fatalf("no leader: %v, %v", t0, nodeInfoList) } t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic_p1_r3, 0) test.Nil(t, coordErr) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) test.Equal(t, len(tc0.topicInfo.ISR), 3) err = lookupCoord1.CreateTopic(topic_p3_r1, TopicMetaInfo{3, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*2) waitClusterStable(lookupCoord1, time.Second*5) pmeta, _, err = lookupLeadership.GetTopicMetaInfo(topic_p3_r1) pn = pmeta.PartitionNum test.Nil(t, err) test.Equal(t, pn, 3) t0, err = lookupLeadership.GetTopicInfo(topic_p3_r1, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 1) t.Logf("t0 leader is: %v", t0.Leader) if nodeInfoList[t0.Leader] == nil { t.Fatalf("no leader: %v, %v", t0, nodeInfoList) } t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic_p3_r1, 0) test.Nil(t, coordErr) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) test.Equal(t, len(tc0.topicInfo.ISR), 1) t1, err := lookupLeadership.GetTopicInfo(topic_p3_r1, 1) t1LeaderCoord := nodeInfoList[t1.Leader].nsqdCoord test.NotNil(t, t1LeaderCoord) tc1, coordErr := t1LeaderCoord.getTopicCoord(topic_p3_r1, 1) test.Nil(t, coordErr) test.Equal(t, tc1.topicInfo.Leader, t1.Leader) test.Equal(t, len(tc1.topicInfo.ISR), 1) err = lookupCoord1.CreateTopic(topic_p2_r2, TopicMetaInfo{2, 2, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord1, time.Second*3) waitClusterStable(lookupCoord1, time.Second*5) pmeta, _, err = lookupLeadership.GetTopicMetaInfo(topic_p2_r2) pn = pmeta.PartitionNum test.Nil(t, err) test.Equal(t, pn, 2) t0, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 2) t.Logf("t0 leader is: %v", t0.Leader) if nodeInfoList[t0.Leader] == nil { t.Fatalf("no leader: %v, %v", t0, nodeInfoList) } t0LeaderCoord = nodeInfoList[t0.Leader].nsqdCoord test.NotNil(t, t0LeaderCoord) tc0, coordErr = t0LeaderCoord.getTopicCoord(topic_p2_r2, 0) test.Nil(t, coordErr) test.Equal(t, tc0.topicInfo.Leader, t0.Leader) test.Equal(t, len(tc0.topicInfo.ISR), 2) t1, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 1) t1LeaderCoord = nodeInfoList[t1.Leader].nsqdCoord test.NotNil(t, t1LeaderCoord) tc1, coordErr = t1LeaderCoord.getTopicCoord(topic_p2_r2, 1) test.Nil(t, coordErr) test.Equal(t, tc1.topicInfo.Leader, t1.Leader) test.Equal(t, len(tc1.topicInfo.ISR), 2) // test create on exist topic, create on partial partition oldMeta, _, err := lookupCoord1.leadership.GetTopicMetaInfo(topic_p2_r2) test.Nil(t, err) err = lookupCoord1.CreateTopic(topic_p2_r2, TopicMetaInfo{2, 2, 0, 0, 1, 1}) test.NotNil(t, err) waitClusterStable(lookupCoord1, time.Second) waitClusterStable(lookupCoord1, time.Second*5) newMeta, _, err := lookupCoord1.leadership.GetTopicMetaInfo(topic_p2_r2) test.Nil(t, err) test.Equal(t, oldMeta, newMeta) }
func TestNsqLookupMovePartition(t *testing.T) { if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_WARN) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4", "id5"} lookupCoord, nodeInfoList := prepareCluster(t, idList, false) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } topic_p1_r1 := "test-nsqlookup-topic-unit-test-move-p1-r1" topic_p2_r2 := "test-nsqlookup-topic-unit-test-move-p2-r2" lookupLeadership := lookupCoord.leadership checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r2, "**")) time.Sleep(time.Second * 3) defer func() { checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r2, "**")) time.Sleep(time.Second * 3) lookupCoord.Stop() }() // test new topic create err := lookupCoord.CreateTopic(topic_p1_r1, TopicMetaInfo{1, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) err = lookupCoord.CreateTopic(topic_p2_r2, TopicMetaInfo{2, 2, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) lookupCoord.triggerCheckTopics("", 0, 0) waitClusterStable(lookupCoord, time.Second*3) // test move leader to other isr; // test move leader to other catchup; // test move non-leader to other node; t0, err := lookupLeadership.GetTopicInfo(topic_p1_r1, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 1) // move p1_r1 leader to other node toNode := "" for _, node := range nodeInfoList { if node.nodeInfo.GetID() == t0.Leader { continue } toNode = node.nodeInfo.GetID() break } lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second) err = lookupCoord.MoveTopicPartitionDataByManual(topic_p1_r1, 0, true, t0.Leader, toNode) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p1_r1, 0) test.Nil(t, err) // it may be two nodes in isr if the moved leader rejoin as isr test.Equal(t, len(t0.ISR) >= 1, true) test.Equal(t, t0.Leader, toNode) t0, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 2) toNode = "" for _, nid := range t0.ISR { if nid == t0.Leader { continue } toNode = nid break } waitClusterStable(lookupCoord, time.Second*3) // move leader to other isr node oldLeader := t0.Leader err = lookupCoord.MoveTopicPartitionDataByManual(topic_p2_r2, 0, true, t0.Leader, toNode) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR) >= 2, true) test.NotEqual(t, t0.Leader, oldLeader) test.Equal(t, t0.Leader, toNode) // move leader to other non-isr node toNode = "" for _, node := range nodeInfoList { if FindSlice(t0.ISR, node.nodeInfo.GetID()) != -1 { continue } // check other partition t1, err := lookupLeadership.GetTopicInfo(topic_p2_r2, 1) if err == nil { if FindSlice(t1.ISR, node.nodeInfo.GetID()) != -1 { continue } } toNode = node.nodeInfo.GetID() break } lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second) err = lookupCoord.MoveTopicPartitionDataByManual(topic_p2_r2, 0, true, t0.Leader, toNode) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 0) test.Nil(t, err) test.Equal(t, t0.Leader, toNode) // move non-leader to other non-isr node toNode = "" toNodeInvalid := "" fromNode := "" for _, nid := range t0.ISR { if nid != t0.Leader { fromNode = nid } } for _, node := range nodeInfoList { if FindSlice(t0.ISR, node.nodeInfo.GetID()) != -1 { continue } // check other partition t1, err := lookupLeadership.GetTopicInfo(topic_p2_r2, 1) if err == nil { toNodeInvalid = t1.Leader if FindSlice(t1.ISR, node.nodeInfo.GetID()) != -1 { continue } } toNode = node.nodeInfo.GetID() break } lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second) err = lookupCoord.MoveTopicPartitionDataByManual(topic_p2_r2, 0, false, fromNode, toNodeInvalid) test.NotNil(t, err) test.Equal(t, ErrNodeIsExcludedForTopicData, err) lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second) err = lookupCoord.MoveTopicPartitionDataByManual(topic_p2_r2, 0, false, fromNode, toNode) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p2_r2, 0) test.Nil(t, err) test.Equal(t, FindSlice(t0.ISR, toNode) != -1, true) test.Equal(t, -1, FindSlice(t0.ISR, fromNode)) }
func TestNsqLookupExpandPartition(t *testing.T) { if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_WARN) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4", "id5", "id6"} lookupCoord, nodeInfoList := prepareCluster(t, idList, false) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } topic_p1_r1 := "test-nsqlookup-topic-unit-test-expand-p1-r1" topic_p1_r2 := "test-nsqlookup-topic-unit-test-expand-p1-r2" topic_p1_r3 := "test-nsqlookup-topic-unit-test-expand-p1-r3" lookupLeadership := lookupCoord.leadership checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r2, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r3, "**")) time.Sleep(time.Second * 3) defer func() { checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r2, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r3, "**")) time.Sleep(time.Second * 3) lookupCoord.Stop() }() err := lookupCoord.CreateTopic(topic_p1_r1, TopicMetaInfo{1, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) err = lookupCoord.CreateTopic(topic_p1_r2, TopicMetaInfo{1, 2, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) err = lookupCoord.CreateTopic(topic_p1_r3, TopicMetaInfo{1, 3, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) waitClusterStable(lookupCoord, time.Second) waitClusterStable(lookupCoord, time.Second*3) err = lookupCoord.ExpandTopicPartition(topic_p1_r1, 3) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err := lookupLeadership.GetTopicInfo(topic_p1_r1, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), 1) t1, err := lookupLeadership.GetTopicInfo(topic_p1_r1, 1) test.Nil(t, err) test.Equal(t, len(t1.ISR), 1) t2, err := lookupLeadership.GetTopicInfo(topic_p1_r1, 2) test.Nil(t, err) test.Equal(t, len(t2.ISR), 1) lookupCoord.triggerCheckTopics("", 0, 0) waitClusterStable(lookupCoord, time.Second*3) err = lookupCoord.ExpandTopicPartition(topic_p1_r2, 2) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p1_r2, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), t0.Replica) t1, err = lookupLeadership.GetTopicInfo(topic_p1_r2, 1) test.Nil(t, err) test.Equal(t, len(t1.ISR), t1.Replica) lookupCoord.triggerCheckTopics("", 0, 0) waitClusterStable(lookupCoord, time.Second*3) err = lookupCoord.ExpandTopicPartition(topic_p1_r2, 3) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p1_r2, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), t0.Replica) t1, err = lookupLeadership.GetTopicInfo(topic_p1_r2, 1) test.Nil(t, err) test.Equal(t, len(t1.ISR), t1.Replica) t2, err = lookupLeadership.GetTopicInfo(topic_p1_r2, 2) test.Nil(t, err) test.Equal(t, len(t2.ISR), t2.Replica) waitClusterStable(lookupCoord, time.Second*3) // should fail err = lookupCoord.ExpandTopicPartition(topic_p1_r2, 4) test.NotNil(t, err) err = lookupCoord.ExpandTopicPartition(topic_p1_r3, 2) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*3) t0, err = lookupLeadership.GetTopicInfo(topic_p1_r3, 0) test.Nil(t, err) test.Equal(t, len(t0.ISR), t0.Replica) t1, err = lookupLeadership.GetTopicInfo(topic_p1_r3, 1) test.Nil(t, err) test.Equal(t, len(t1.ISR), t1.Replica) waitClusterStable(lookupCoord, time.Second*3) // should fail err = lookupCoord.ExpandTopicPartition(topic_p1_r3, 3) test.NotNil(t, err) }
func TestNsqLookupMarkNodeRemove(t *testing.T) { if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_WARN) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4", "id5"} lookupCoord, nodeInfoList := prepareCluster(t, idList, false) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } topic_p4_r1 := "test-nsqlookup-topic-unit-test-removenode-p4-r1" topic_p2_r2 := "test-nsqlookup-topic-unit-test-removenode-p2-r2" topic_p1_r3 := "test-nsqlookup-topic-unit-test-removenode-p1-r3" lookupLeadership := lookupCoord.leadership checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p4_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r2, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r3, "**")) time.Sleep(time.Second * 3) defer func() { checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p4_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r2, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r3, "**")) time.Sleep(time.Second * 3) lookupCoord.Stop() }() err := lookupCoord.CreateTopic(topic_p4_r1, TopicMetaInfo{4, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) err = lookupCoord.CreateTopic(topic_p2_r2, TopicMetaInfo{2, 2, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) err = lookupCoord.CreateTopic(topic_p1_r3, TopicMetaInfo{1, 3, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second) waitClusterStable(lookupCoord, time.Second*5) nid := "" for _, n := range nodeInfoList { nid = n.nodeInfo.GetID() break } err = lookupCoord.MarkNodeAsRemoving(nid) test.Nil(t, err) checkStart := time.Now() for time.Since(checkStart) < time.Minute*2 { time.Sleep(time.Second) isDone := true for i := 0; i < 4; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p4_r1, i) test.Nil(t, err) if FindSlice(info.ISR, nid) != -1 { t.Logf("still waiting remove: %v", info) isDone = false break } } if !isDone { continue } time.Sleep(time.Second) for i := 0; i < 2; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p2_r2, i) test.Nil(t, err) if FindSlice(info.ISR, nid) != -1 { t.Logf("still waiting remove: %v", info) isDone = false break } } if !isDone { continue } time.Sleep(time.Second) info, err := lookupLeadership.GetTopicInfo(topic_p1_r3, 0) test.Nil(t, err) if FindSlice(info.ISR, nid) != -1 { t.Logf("still waiting remove: %v from removing node", info) isDone = false } t.Logf("all done") if isDone { break } } for time.Since(checkStart) < time.Minute*2 { lookupCoord.nodesMutex.Lock() state := lookupCoord.removingNodes[nid] lookupCoord.nodesMutex.Unlock() if state == "data_transfered" || state == "done" { break } else { t.Logf("still waiting state: %v ", state) } time.Sleep(time.Second) } if time.Since(checkStart) >= time.Minute*2 { t.Error("remove node timeout") } }
func TestNsqLookupUpdateTopicMeta(t *testing.T) { if testing.Verbose() { SetCoordLogger(&levellogger.GLogger{}, levellogger.LOG_WARN) glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } else { SetCoordLogger(newTestLogger(t), levellogger.LOG_DEBUG) } idList := []string{"id1", "id2", "id3", "id4"} lookupCoord, nodeInfoList := prepareCluster(t, idList, false) for _, n := range nodeInfoList { defer os.RemoveAll(n.dataPath) defer n.localNsqd.Exit() defer n.nsqdCoord.Stop() } topic_p1_r1 := "test-nsqlookup-topic-unit-test-updatemeta-p1-r1" topic_p2_r1 := "test-nsqlookup-topic-unit-test-updatemeta-p2-r1" lookupLeadership := lookupCoord.leadership checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r1, "**")) time.Sleep(time.Second * 3) defer func() { checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p1_r1, "**")) checkDeleteErr(t, lookupCoord.DeleteTopic(topic_p2_r1, "**")) time.Sleep(time.Second * 3) lookupCoord.Stop() }() err := lookupCoord.CreateTopic(topic_p1_r1, TopicMetaInfo{1, 1, 0, 0, 0, 0}) test.Nil(t, err) time.Sleep(time.Second) err = lookupCoord.CreateTopic(topic_p2_r1, TopicMetaInfo{2, 1, 0, 0, 0, 0}) test.Nil(t, err) waitClusterStable(lookupCoord, time.Second*5) // test increase replicator and decrease the replicator err = lookupCoord.ChangeTopicMetaParam(topic_p1_r1, -1, -1, 3) lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second * 5) tmeta, _, _ := lookupLeadership.GetTopicMetaInfo(topic_p1_r1) test.Equal(t, 3, tmeta.Replica) for i := 0; i < tmeta.PartitionNum; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p1_r1, i) test.Nil(t, err) test.Equal(t, tmeta.Replica, len(info.ISR)) } err = lookupCoord.ChangeTopicMetaParam(topic_p1_r1, -1, -1, 2) lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second * 3) tmeta, _, _ = lookupLeadership.GetTopicMetaInfo(topic_p1_r1) test.Equal(t, 2, tmeta.Replica) for i := 0; i < tmeta.PartitionNum; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p1_r1, i) test.Nil(t, err) test.Equal(t, tmeta.Replica, len(info.ISR)) } err = lookupCoord.ChangeTopicMetaParam(topic_p2_r1, -1, -1, 2) lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second * 3) tmeta, _, _ = lookupLeadership.GetTopicMetaInfo(topic_p2_r1) test.Equal(t, 2, tmeta.Replica) for i := 0; i < tmeta.PartitionNum; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p2_r1, i) test.Nil(t, err) test.Equal(t, tmeta.Replica, len(info.ISR)) } // should fail err = lookupCoord.ChangeTopicMetaParam(topic_p2_r1, -1, -1, 3) test.NotNil(t, err) err = lookupCoord.ChangeTopicMetaParam(topic_p2_r1, -1, -1, 1) lookupCoord.triggerCheckTopics("", 0, 0) time.Sleep(time.Second * 3) tmeta, _, _ = lookupLeadership.GetTopicMetaInfo(topic_p2_r1) test.Equal(t, 1, tmeta.Replica) for i := 0; i < tmeta.PartitionNum; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p2_r1, i) test.Nil(t, err) test.Equal(t, tmeta.Replica, len(info.ISR)) } // test update the sync and retention , all partition and replica should be updated err = lookupCoord.ChangeTopicMetaParam(topic_p1_r1, 1234, 3, -1) time.Sleep(time.Second) tmeta, _, _ = lookupLeadership.GetTopicMetaInfo(topic_p1_r1) test.Equal(t, 1234, tmeta.SyncEvery) test.Equal(t, int32(3), tmeta.RetentionDay) for i := 0; i < tmeta.PartitionNum; i++ { info, err := lookupLeadership.GetTopicInfo(topic_p1_r1, i) test.Nil(t, err) for _, nid := range info.ISR { localNsqd := nodeInfoList[nid].localNsqd localTopic, err := localNsqd.GetExistingTopic(topic_p1_r1, i) test.Nil(t, err) dinfo := localTopic.GetDynamicInfo() test.Equal(t, int64(1234), dinfo.SyncEvery) test.Equal(t, int32(3), dinfo.RetentionDay) } } }
func TestTopicResetWithQueueStart(t *testing.T) { opts := NewOptions() opts.Logger = newTestLogger(t) if testing.Verbose() { opts.Logger = &levellogger.GLogger{} opts.LogLevel = 3 glog.SetFlags(0, "", "", true, true, 1) glog.StartWorker(time.Second) } opts.MaxBytesPerFile = 1024 * 1024 _, _, nsqd := mustStartNSQD(opts) defer os.RemoveAll(opts.DataPath) defer nsqd.Exit() topic := nsqd.GetTopic("test", 0) topic.dynamicConf.AutoCommit = 1 topic.dynamicConf.SyncEvery = 10 msgNum := 5000 channel := topic.GetChannel("ch") test.NotNil(t, channel) msg := NewMessage(0, make([]byte, 1000)) msg.Timestamp = time.Now().Add(-1 * time.Hour * time.Duration(24*4)).UnixNano() msgSize := int32(0) var dend BackendQueueEnd for i := 0; i <= msgNum; i++ { msg.ID = 0 _, _, msgSize, dend, _ = topic.PutMessage(msg) msg.Timestamp = time.Now().Add(-1 * time.Hour * 24 * time.Duration(4-dend.(*diskQueueEndInfo).EndOffset.FileNum)).UnixNano() } topic.ForceFlush() fileNum := topic.backend.diskWriteEnd.EndOffset.FileNum test.Equal(t, int64(0), topic.backend.GetQueueReadStart().(*diskQueueEndInfo).EndOffset.FileNum) test.Equal(t, true, fileNum >= 4) nsqLog.Warningf("reading the topic %v backend ", topic.GetFullName()) for i := 0; i < 100; i++ { msg := <-channel.clientMsgChan channel.ConfirmBackendQueue(msg) } topic.dynamicConf.RetentionDay = 2 oldEnd := topic.backend.GetQueueWriteEnd().(*diskQueueEndInfo) // reset with new start resetStart := &diskQueueEndInfo{} resetStart.virtualEnd = topic.backend.GetQueueWriteEnd().Offset() + BackendOffset(msgSize*10) resetStart.totalMsgCnt = topic.backend.GetQueueWriteEnd().TotalMsgCnt() + 10 err := topic.ResetBackendWithQueueStartNoLock(int64(resetStart.Offset()), resetStart.TotalMsgCnt()) test.NotNil(t, err) topic.DisableForSlave() err = topic.ResetBackendWithQueueStartNoLock(int64(resetStart.Offset()), resetStart.TotalMsgCnt()) test.Nil(t, err) topic.EnableForMaster() nsqLog.Warningf("reset the topic %v backend with queue start: %v", topic.GetFullName(), resetStart) test.Equal(t, resetStart.Offset(), BackendOffset(topic.GetQueueReadStart())) newEnd := topic.backend.GetQueueWriteEnd().(*diskQueueEndInfo) test.Equal(t, resetStart.Offset(), newEnd.Offset()) test.Equal(t, resetStart.TotalMsgCnt(), newEnd.TotalMsgCnt()) test.Equal(t, true, newEnd.EndOffset.GreatThan(&oldEnd.EndOffset)) test.Equal(t, int64(0), newEnd.EndOffset.Pos) test.Equal(t, resetStart.Offset(), channel.GetConfirmed().Offset()) test.Equal(t, resetStart.TotalMsgCnt(), channel.GetChannelEnd().TotalMsgCnt()) for i := 0; i < msgNum; i++ { msg.ID = 0 _, _, msgSize, _, _ = topic.PutMessage(msg) } topic.ForceFlush() newEnd = topic.backend.GetQueueWriteEnd().(*diskQueueEndInfo) test.Equal(t, resetStart.TotalMsgCnt()+int64(msgNum), newEnd.TotalMsgCnt()) for i := 0; i < 100; i++ { msg := <-channel.clientMsgChan channel.ConfirmBackendQueue(msg) test.Equal(t, msg.offset+msg.rawMoveSize, channel.GetConfirmed().Offset()) } // reset with old start topic.DisableForSlave() err = topic.ResetBackendWithQueueStartNoLock(int64(resetStart.Offset()), resetStart.TotalMsgCnt()) test.Nil(t, err) topic.EnableForMaster() test.Equal(t, resetStart.Offset(), BackendOffset(topic.GetQueueReadStart())) newEnd = topic.backend.GetQueueWriteEnd().(*diskQueueEndInfo) test.Equal(t, resetStart.Offset(), newEnd.Offset()) test.Equal(t, resetStart.TotalMsgCnt(), newEnd.TotalMsgCnt()) test.Equal(t, true, newEnd.EndOffset.GreatThan(&oldEnd.EndOffset)) test.Equal(t, int64(0), newEnd.EndOffset.Pos) test.Equal(t, resetStart.Offset(), channel.GetConfirmed().Offset()) test.Equal(t, resetStart.TotalMsgCnt(), channel.GetChannelEnd().TotalMsgCnt()) for i := 0; i < msgNum; i++ { msg.ID = 0 _, _, msgSize, dend, _ = topic.PutMessage(msg) msg.Timestamp = time.Now().Add(-1 * time.Hour * 24 * time.Duration(4-dend.(*diskQueueEndInfo).EndOffset.FileNum)).UnixNano() } topic.ForceFlush() newEnd = topic.backend.GetQueueWriteEnd().(*diskQueueEndInfo) test.Equal(t, resetStart.TotalMsgCnt()+int64(msgNum), newEnd.TotalMsgCnt()) for i := 0; i < 100; i++ { msg := <-channel.clientMsgChan channel.ConfirmBackendQueue(msg) test.Equal(t, msg.offset+msg.rawMoveSize, channel.GetConfirmed().Offset()) } }