// Ensure that the cluster configuration can be reloaded. func TestClusterConfigReload(t *testing.T) { procAttr := &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}} argGroup, etcds, err := CreateCluster(3, procAttr, false) assert.NoError(t, err) defer DestroyCluster(etcds) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":3, "removeDelay":60}`)) assert.Equal(t, resp.StatusCode, 200) time.Sleep(1 * time.Second) resp, _ = tests.Get("http://localhost:7002/v2/admin/config") body := tests.ReadBodyJSON(resp) assert.Equal(t, resp.StatusCode, 200) assert.Equal(t, body["activeSize"], 3) assert.Equal(t, body["removeDelay"], 60) // kill all DestroyCluster(etcds) for i := 0; i < 3; i++ { etcds[i], err = os.StartProcess(EtcdBinPath, argGroup[i], procAttr) } time.Sleep(1 * time.Second) resp, _ = tests.Get("http://localhost:7002/v2/admin/config") body = tests.ReadBodyJSON(resp) assert.Equal(t, resp.StatusCode, 200) assert.Equal(t, body["activeSize"], 3) assert.Equal(t, body["removeDelay"], 60) }
// Create a full cluster, disconnect a peer, wait for autodemotion, wait for autopromotion. func TestStandbyAutoPromote(t *testing.T) { t.Skip("functionality unimplemented") clusterSize := 10 // DefaultActiveSize + 1 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) if err != nil { t.Fatal("cannot create cluster") } defer func() { // Wrap this in a closure so that it picks up the updated version of // the "etcds" variable. DestroyCluster(etcds) }() c := etcd.NewClient(nil) c.SyncCluster() time.Sleep(1 * time.Second) // Verify that we have one standby. result, err := c.Get("_etcd/standbys", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 1) // Reconfigure with a short promote delay (2 second). resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":9, "promoteDelay":2}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Remove peer. etcd := etcds[1] etcds = append(etcds[:1], etcds[2:]...) if err := etcd.Kill(); err != nil { panic(err.Error()) } etcd.Release() // Wait for it to get dropped. time.Sleep(server.PeerActivityMonitorTimeout + (2 * time.Second)) // Wait for the standby to be promoted. time.Sleep(server.ActiveMonitorTimeout + (2 * time.Second)) // Verify that we have 9 peers. result, err = c.Get("_etcd/machines", true, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 9) // Verify that node10 is one of those peers. result, err = c.Get("_etcd/machines/node10", false, false) assert.NoError(t, err) // Verify that there are no more standbys. result, err = c.Get("_etcd/standbys", false, true) assert.NoError(t, err) if assert.Equal(t, len(result.Node.Nodes), 1) { assert.Equal(t, result.Node.Nodes[0].Key, "/_etcd/standbys/node2") } }
// Create a full cluster, disconnect a peer, wait for removal, wait for standby join. func TestStandbyAutoJoin(t *testing.T) { clusterSize := 5 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) if err != nil { t.Fatal("cannot create cluster") } defer func() { // Wrap this in a closure so that it picks up the updated version of // the "etcds" variable. DestroyCluster(etcds) }() c := etcd.NewClient(nil) c.SyncCluster() time.Sleep(1 * time.Second) // Verify that we have five machines. result, err := c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 5) // Reconfigure with a short remove delay (2 second). resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":4, "removeDelay":2, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Wait for a monitor cycle before checking for removal. time.Sleep(server.ActiveMonitorTimeout + (1 * time.Second)) // Verify that we now have four peers. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 4) // Remove peer. etcd := etcds[1] etcds = append(etcds[:1], etcds[2:]...) if err := etcd.Kill(); err != nil { panic(err.Error()) } etcd.Release() // Wait for it to get dropped. time.Sleep(server.PeerActivityMonitorTimeout + (1 * time.Second)) // Wait for the standby to join. time.Sleep((1 * time.Second) + (1 * time.Second)) // Verify that we have 4 peers. result, err = c.Get("_etcd/machines", true, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 4) // Verify that node2 is not one of those peers. _, err = c.Get("_etcd/machines/node2", false, false) assert.Error(t, err) }
func TestStandbyJoinMiss(t *testing.T) { clusterSize := 2 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) if err != nil { t.Fatal("cannot create cluster") } defer DestroyCluster(etcds) c := etcd.NewClient(nil) c.SyncCluster() time.Sleep(1 * time.Second) // Verify that we have two machines. result, err := c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), clusterSize) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"removeDelay":4, "syncInterval":4}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } time.Sleep(time.Second) resp, _ = tests.Delete("http://localhost:7001/v2/admin/machines/node2", "application/json", nil) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Wait for a monitor cycle before checking for removal. time.Sleep(server.ActiveMonitorTimeout + (1 * time.Second)) // Verify that we now have one peer. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 1) // Simulate the join failure _, err = server.NewClient(nil).AddMachine("http://localhost:7001", &server.JoinCommand{ MinVersion: store.MinVersion(), MaxVersion: store.MaxVersion(), Name: "node2", RaftURL: "http://127.0.0.1:7002", EtcdURL: "http://127.0.0.1:4002", }) assert.NoError(t, err) time.Sleep(6 * time.Second) go tests.Delete("http://localhost:7001/v2/admin/machines/node2", "application/json", nil) time.Sleep(time.Second) result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 1) }
// Ensure that the cluster configuration can be updated. func TestClusterConfigSet(t *testing.T) { _, etcds, err := CreateCluster(3, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) assert.NoError(t, err) defer DestroyCluster(etcds) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":3, "removeDelay":60}`)) assert.Equal(t, resp.StatusCode, 200) time.Sleep(1 * time.Second) resp, _ = tests.Get("http://localhost:7002/v2/admin/config") body := tests.ReadBodyJSON(resp) assert.Equal(t, resp.StatusCode, 200) assert.Equal(t, body["activeSize"], 3) assert.Equal(t, body["removeDelay"], 60) }
// Create a full cluster and then add extra an extra standby node. func TestStandby(t *testing.T) { t.Skip("functionality unimplemented") clusterSize := 10 // DefaultActiveSize + 1 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) assert.NoError(t, err) defer DestroyCluster(etcds) if err != nil { t.Fatal("cannot create cluster") } c := etcd.NewClient(nil) c.SyncCluster() // Set key. time.Sleep(time.Second) if _, err := c.Set("foo", "bar", 0); err != nil { panic(err) } time.Sleep(time.Second) // Check that all peers and standbys have the value. for i := range etcds { resp, err := tests.Get(fmt.Sprintf("http://localhost:%d/v2/keys/foo", 4000+(i+1))) if assert.NoError(t, err) { body := tests.ReadBodyJSON(resp) if node, _ := body["node"].(map[string]interface{}); assert.NotNil(t, node) { assert.Equal(t, node["value"], "bar") } } } // Verify that we have one standby. result, err := c.Get("_etcd/standbys", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 1) // Reconfigure with larger active size (10 nodes) and wait for promotion. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":10, "promoteDelay":1800}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } time.Sleep(server.ActiveMonitorTimeout + (1 * time.Second)) // Verify that the standby node is now a peer. result, err = c.Get("_etcd/standbys", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 0) // Reconfigure with a smaller active size (8 nodes). resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":8, "promoteDelay":1800}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Wait for two monitor cycles before checking for demotion. time.Sleep((2 * server.ActiveMonitorTimeout) + (1 * time.Second)) // Verify that we now have eight peers. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 8) // Verify that we now have two standbys. result, err = c.Get("_etcd/standbys", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 2) }
// Create a five-node cluster // Kill all the nodes and restart func TestMultiNodeKillAllAndRecoveryWithStandbys(t *testing.T) { procAttr := new(os.ProcAttr) procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr} stop := make(chan bool) leaderChan := make(chan string, 1) all := make(chan bool, 1) clusterSize := 15 argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false) defer DestroyCluster(etcds) if err != nil { t.Fatal("cannot create cluster") } c := etcd.NewClient(nil) go Monitor(clusterSize, clusterSize, leaderChan, all, stop) <-all <-leaderChan stop <- true c.SyncCluster() // Reconfigure with smaller active size (7 nodes) and wait for remove. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } time.Sleep(2*server.ActiveMonitorTimeout + (1 * time.Second)) // Verify that there is three machines in peer mode. result, err := c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 7) // send set commands for i := 0; i < 2*clusterSize; i++ { // Test Set _, err := c.Set("foo", "bar", 0) if err != nil { panic(err) } } time.Sleep(time.Second) // kill all DestroyCluster(etcds) time.Sleep(time.Second) stop = make(chan bool) leaderChan = make(chan string, 1) all = make(chan bool, 1) time.Sleep(time.Second) for i := 0; i < clusterSize; i++ { etcds[i], err = os.StartProcess(EtcdBinPath, append(argGroup[i], "-peers="), procAttr) } time.Sleep(2 * time.Second) // send set commands for i := 0; i < 2*clusterSize; i++ { // Test Set _, err := c.Set("foo", "bar", 0) if err != nil { t.Fatalf("Recovery error: %s", err) } } // Verify that we have seven machines. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 7) }
// Create a full cluster and then change the active size dramatically. func TestStandbyDramaticChange(t *testing.T) { clusterSize := 9 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) assert.NoError(t, err) defer DestroyCluster(etcds) if err != nil { t.Fatal("cannot create cluster") } time.Sleep(time.Second) c := etcd.NewClient(nil) c.SyncCluster() num := clusterSize for i := 0; i < 3; i++ { for inc := 0; inc < 2; inc++ { // Verify that we just have i machines. result, err := c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), num) if inc == 0 { num -= 6 } else { num += 6 } t.Log("Reconfigure with active size", num) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num))) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } if inc == 0 { // Wait for monitor cycles before checking for demotion. time.Sleep(6*server.ActiveMonitorTimeout + (1 * time.Second)) } else { time.Sleep(time.Second + (1 * time.Second)) } // Verify that we now have peers. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), num) t.Log("Test the functionality of all servers") // Set key. if _, err := c.Set("foo", "bar", 0); err != nil { panic(err) } time.Sleep(100 * time.Millisecond) // Check that all peers and standbys have the value. for i := range etcds { resp, err := tests.Get(fmt.Sprintf("http://localhost:%d/v2/keys/foo", 4000+(i+1))) if assert.NoError(t, err) { body := tests.ReadBodyJSON(resp) if node, _ := body["node"].(map[string]interface{}); assert.NotNil(t, node) { assert.Equal(t, node["value"], "bar") } } } } } }
// Create a full cluster and then change the active size. func TestStandby(t *testing.T) { clusterSize := 15 _, etcds, err := CreateCluster(clusterSize, &os.ProcAttr{Files: []*os.File{nil, os.Stdout, os.Stderr}}, false) if !assert.NoError(t, err) { t.Fatal("cannot create cluster") } defer DestroyCluster(etcds) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } time.Sleep(time.Second) c := etcd.NewClient(nil) c.SyncCluster() // Verify that we just have default machines. result, err := c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 9) t.Log("Reconfigure with a smaller active size") resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Wait for two monitor cycles before checking for demotion. time.Sleep((2 * server.ActiveMonitorTimeout) + (2 * time.Second)) // Verify that we now have seven peers. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 7) t.Log("Test the functionality of all servers") // Set key. time.Sleep(time.Second) if _, err := c.Set("foo", "bar", 0); err != nil { panic(err) } time.Sleep(time.Second) // Check that all peers and standbys have the value. for i := range etcds { resp, err := tests.Get(fmt.Sprintf("http://localhost:%d/v2/keys/foo", 4000+(i+1))) if assert.NoError(t, err) { body := tests.ReadBodyJSON(resp) if node, _ := body["node"].(map[string]interface{}); assert.NotNil(t, node) { assert.Equal(t, node["value"], "bar") } } } t.Log("Reconfigure with larger active size and wait for join") resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":8, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } time.Sleep((1 * time.Second) + (1 * time.Second)) // Verify that exactly eight machines are in the cluster. result, err = c.Get("_etcd/machines", false, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 8) }
// This test will kill the current leader and wait for the etcd cluster to elect a new leader for 200 times. // It will print out the election time and the average election time. // It runs in a cluster with standby nodes. func TestKillLeaderWithStandbys(t *testing.T) { // https://github.com/goraft/raft/issues/222 t.Skip("stuck on raft issue") procAttr := new(os.ProcAttr) procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr} clusterSize := 5 argGroup, etcds, err := CreateCluster(clusterSize, procAttr, false) if err != nil { t.Fatal("cannot create cluster") } defer DestroyCluster(etcds) stop := make(chan bool) leaderChan := make(chan string, 1) all := make(chan bool, 1) time.Sleep(time.Second) go Monitor(clusterSize, 1, leaderChan, all, stop) c := etcd.NewClient(nil) c.SyncCluster() // Reconfigure with a small active size. resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":3, "removeDelay":2, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } // Wait for two monitor cycles before checking for demotion. time.Sleep((2 * server.ActiveMonitorTimeout) + (2 * time.Second)) // Verify that we have 3 peers. result, err := c.Get("_etcd/machines", true, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 3) var totalTime time.Duration leader := "http://127.0.0.1:7001" for i := 0; i < clusterSize; i++ { t.Log("leader is ", leader) port, _ := strconv.Atoi(strings.Split(leader, ":")[2]) num := port - 7001 t.Log("kill server ", num) etcds[num].Kill() etcds[num].Release() start := time.Now() for { newLeader := <-leaderChan if newLeader != leader { leader = newLeader break } } take := time.Now().Sub(start) totalTime += take avgTime := totalTime / (time.Duration)(i+1) fmt.Println("Total time:", totalTime, "; Avg time:", avgTime) time.Sleep(server.ActiveMonitorTimeout + (1 * time.Second)) time.Sleep(2 * time.Second) // Verify that we have 3 peers. result, err = c.Get("_etcd/machines", true, true) assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 3) // Verify that killed node is not one of those peers. _, err = c.Get(fmt.Sprintf("_etcd/machines/node%d", num+1), false, false) assert.Error(t, err) etcds[num], err = os.StartProcess(EtcdBinPath, argGroup[num], procAttr) } stop <- true }
// remove the node and node rejoin with previous log func TestRemoveNode(t *testing.T) { procAttr := new(os.ProcAttr) procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr} clusterSize := 4 argGroup, etcds, _ := CreateCluster(clusterSize, procAttr, false) defer DestroyCluster(etcds) time.Sleep(time.Second) c := etcd.NewClient(nil) c.SyncCluster() resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":4, "syncInterval":5}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() } rmReq, _ := http.NewRequest("DELETE", "http://127.0.0.1:7001/remove/node3", nil) client := &http.Client{} for i := 0; i < 2; i++ { for i := 0; i < 2; i++ { client.Do(rmReq) fmt.Println("send remove to node3 and wait for its exiting") time.Sleep(100 * time.Millisecond) resp, err := c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 3 { t.Fatal("cannot remove peer") } etcds[2].Kill() etcds[2].Wait() if i == 1 { // rejoin with log etcds[2], err = os.StartProcess(EtcdBinPath, argGroup[2], procAttr) } else { // rejoin without log etcds[2], err = os.StartProcess(EtcdBinPath, append(argGroup[2], "-f"), procAttr) } if err != nil { panic(err) } time.Sleep(time.Second + 5*time.Second) resp, err = c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 4 { t.Fatalf("add peer fails #1 (%d != 4)", len(resp.Node.Nodes)) } } // first kill the node, then remove it, then add it back for i := 0; i < 2; i++ { etcds[2].Kill() fmt.Println("kill node3 and wait for its exiting") etcds[2].Wait() client.Do(rmReq) time.Sleep(100 * time.Millisecond) resp, err := c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 3 { t.Fatal("cannot remove peer") } if i == 1 { // rejoin with log etcds[2], err = os.StartProcess(EtcdBinPath, append(argGroup[2]), procAttr) } else { // rejoin without log etcds[2], err = os.StartProcess(EtcdBinPath, append(argGroup[2], "-f"), procAttr) } if err != nil { panic(err) } time.Sleep(time.Second + time.Second) resp, err = c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 4 { t.Fatalf("add peer fails #2 (%d != 4)", len(resp.Node.Nodes)) } } } }
func TestRemovePausedNode(t *testing.T) { procAttr := new(os.ProcAttr) procAttr.Files = []*os.File{nil, os.Stdout, os.Stderr} clusterSize := 4 _, etcds, _ := CreateCluster(clusterSize, procAttr, false) defer DestroyCluster(etcds) time.Sleep(time.Second) c := etcd.NewClient(nil) c.SyncCluster() r, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":3, "removeDelay":1, "syncInterval":1}`)) if !assert.Equal(t, r.StatusCode, 200) { t.FailNow() } // Wait for standby instances to update its cluster config time.Sleep(6 * time.Second) resp, err := c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 3 { t.Fatal("cannot remove peer") } for i := 0; i < clusterSize; i++ { // first pause the node, then remove it, then resume it idx := rand.Int() % clusterSize etcds[idx].Signal(syscall.SIGSTOP) fmt.Printf("pause node%d and let standby node take its place\n", idx+1) time.Sleep(4 * time.Second) etcds[idx].Signal(syscall.SIGCONT) // let it change its state to candidate at least time.Sleep(time.Second) stop := make(chan bool) leaderChan := make(chan string, 1) all := make(chan bool, 1) go Monitor(clusterSize, clusterSize, leaderChan, all, stop) <-all <-leaderChan stop <- true resp, err = c.Get("_etcd/machines", false, false) if err != nil { panic(err) } if len(resp.Node.Nodes) != 3 { t.Fatalf("add peer fails (%d != 3)", len(resp.Node.Nodes)) } for i := 0; i < 3; i++ { if resp.Node.Nodes[i].Key == fmt.Sprintf("node%d", idx+1) { t.Fatal("node should be removed") } } } }