func postFreeze( c cluster.Cluster, freeze bool, timeout time.Duration, ) (serverpb.ClusterFreezeResponse, error) { httpClient := cluster.HTTPClient httpClient.Timeout = timeout var resp serverpb.ClusterFreezeResponse log.Infof(context.Background(), "requesting: freeze=%t, timeout=%s", freeze, timeout) cb := func(v proto.Message) { oldNum := resp.RangesAffected resp = *v.(*serverpb.ClusterFreezeResponse) if oldNum > resp.RangesAffected { resp.RangesAffected = oldNum } if (resp != serverpb.ClusterFreezeResponse{}) { log.Infof(context.Background(), "%+v", &resp) } } err := httputil.StreamJSON( httpClient, c.URL(0)+"/_admin/v1/cluster/freeze", &serverpb.ClusterFreezeRequest{Freeze: freeze}, &serverpb.ClusterFreezeResponse{}, cb, ) return resp, err }
func cutNetwork(t *testing.T, c cluster.Cluster, closer <-chan struct{}, partitions ...[]int) { defer func() { if errs := restoreNetwork(t, c); len(errs) > 0 { t.Fatalf("errors restoring the network: %+v", errs) } }() addrs, addrsToNode := mustGetHosts(t, c) ipPartitions := make([][]iptables.IP, 0, len(partitions)) for _, partition := range partitions { ipPartition := make([]iptables.IP, 0, len(partition)) for _, nodeIndex := range partition { ipPartition = append(ipPartition, addrs[nodeIndex]) } ipPartitions = append(ipPartitions, ipPartition) } log.Warningf(context.TODO(), "partitioning: %v (%v)", partitions, ipPartitions) for host, cmds := range iptables.Rules(iptables.Bidirectional(ipPartitions...)) { for _, cmd := range cmds { if err := c.ExecRoot(addrsToNode[host], cmd); err != nil { t.Fatal(err) } } } <-closer log.Warningf(context.TODO(), "resolved all partitions") }
func testStatusServerInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	// Get the ids for each node.
	idMap := make(map[int]roachpb.NodeID)
	for i := 0; i < c.NumNodes(); i++ {
		var details serverpb.DetailsResponse
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/details/local", &details); err != nil {
			t.Fatal(err)
		}
		idMap[i] = details.NodeID
	}

	// Check the local response for every node.
	for i := 0; i < c.NumNodes(); i++ {
		id := idMap[i]
		checkNode(ctx, t, c, i, id, id, id)
		get(ctx, t, c.URL(ctx, i), "/_status/nodes")
	}

	// Proxy from the first node to the last node.
	firstNode := 0
	lastNode := c.NumNodes() - 1
	firstID := idMap[firstNode]
	lastID := idMap[lastNode]
	checkNode(ctx, t, c, firstNode, firstID, lastID, lastID)

	// And from the last node to the first node.
	checkNode(ctx, t, c, lastNode, lastID, firstID, firstID)

	// And from the last node to the last node.
	checkNode(ctx, t, c, lastNode, lastID, lastID, lastID)
}
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes())) var details serverpb.DetailsResponse util.SucceedsSoon(t, func() error { select { case <-stopper: t.Fatalf("interrupted") default: } return httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details) }) bi := details.BuildInfo testData := map[string]string{ "go_version": bi.GoVersion, "tag": bi.Tag, "time": bi.Time, "dependencies": bi.Dependencies, } for key, val := range testData { if val == "" { t.Errorf("build info not set for \"%s\"", key) } } }
// CheckGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func CheckGossip(
	ctx context.Context, t testing.TB, c cluster.Cluster, d time.Duration, f CheckGossipFunc,
) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/gossip/local", &infoStatus); err != nil {
				return errors.Wrapf(err, "failed to get gossip status from node %d", i)
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
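// What follows is a minimal sketch of a CheckGossipFunc, assuming the helper
// used by the callers below is named HasPeers. It passes once the gossip
// infoStore contains a "node:" entry for the expected number of nodes. This is
// an illustrative reconstruction based on how HasPeers is invoked in this
// section, not necessarily the package's exact implementation.
func HasPeers(expected int) CheckGossipFunc {
	return func(infos map[string]gossip.Info) error {
		count := 0
		for k := range infos {
			if strings.HasPrefix(k, "node:") {
				count++
			}
		}
		if count != expected {
			return errors.Errorf("expected %d peers, found %d", expected, count)
		}
		return nil
	}
}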
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := httputil.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/gossip/local", &infoStatus); err != nil {
				return err
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
func mustGetHosts(t *testing.T, c cluster.Cluster) ([]iptables.IP, map[iptables.IP]int) { var addrs []iptables.IP addrsToNode := make(map[iptables.IP]int) for i := 0; i < c.NumNodes(); i++ { addr := iptables.IP(c.InternalIP(i).String()) addrsToNode[addr] = i addrs = append(addrs, addr) } return addrs, addrsToNode }
func restoreNetwork(t *testing.T, c cluster.Cluster) []error { var errs []error for i := 0; i < c.NumNodes(); i++ { for _, cmd := range iptables.Reset() { if err := c.ExecRoot(i, cmd); err != nil { errs = append(errs, err) } } } return errs }
func testPutInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { db, err := c.NewClient(ctx, 0) if err != nil { t.Fatal(err) } errs := make(chan error, c.NumNodes()) start := timeutil.Now() deadline := start.Add(cfg.Duration) var count int64 for i := 0; i < c.NumNodes(); i++ { go func() { r, _ := randutil.NewPseudoRand() value := randutil.RandBytes(r, 8192) for timeutil.Now().Before(deadline) { k := atomic.AddInt64(&count, 1) v := value[:r.Intn(len(value))] if err := db.Put(ctx, fmt.Sprintf("%08d", k), v); err != nil { errs <- err return } } errs <- nil }() } for i := 0; i < c.NumNodes(); { baseCount := atomic.LoadInt64(&count) select { case <-stopper.ShouldStop(): t.Fatalf("interrupted") case err := <-errs: if err != nil { t.Fatal(err) } i++ case <-time.After(1 * time.Second): // Periodically print out progress so that we know the test is still // running. loadedCount := atomic.LoadInt64(&count) log.Infof(ctx, "%d (%d/s)", loadedCount, loadedCount-baseCount) c.Assert(ctx, t) if err := cluster.Consistent(ctx, c, 0); err != nil { t.Fatal(err) } } } elapsed := timeutil.Since(start) log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds()) }
// checkNode checks all the endpoints of the status server hosted by node i and
// requests info for the node with otherNodeID. That node could be a different
// node, the same node, or "local".
func checkNode(
	ctx context.Context,
	t *testing.T,
	c cluster.Cluster,
	i int,
	nodeID, otherNodeID, expectedNodeID roachpb.NodeID,
) {
	urlIDs := []string{otherNodeID.String()}
	if nodeID == otherNodeID {
		urlIDs = append(urlIDs, "local")
	}
	var details serverpb.DetailsResponse
	for _, urlID := range urlIDs {
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/details/"+urlID, &details); err != nil {
			t.Fatal(errors.Errorf("unable to parse details - %s", err))
		}
		if details.NodeID != expectedNodeID {
			t.Fatal(errors.Errorf("%d calling %s: node ids don't match - expected %d, actual %d",
				nodeID, urlID, expectedNodeID, details.NodeID))
		}

		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/gossip/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/nodes/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/logfiles/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/logs/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/stacks/%s", urlID))
	}
}
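// The status checks above rely on a small get helper. The sketch below is an
// assumed implementation inferred from the call sites (ctx, t, base URL, path);
// it needs the standard net/http and io/ioutil imports, and the real helper may
// additionally honor ctx for cancellation.
func get(ctx context.Context, t *testing.T, base, path string) []byte {
	fullURL := base + path
	resp, err := cluster.HTTPClient.Get(fullURL)
	if err != nil {
		t.Fatalf("could not GET %s: %s", fullURL, err)
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("could not read body of %s: %s", fullURL, err)
	}
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("GET %s: status %d, body %q", fullURL, resp.StatusCode, body)
	}
	return body
}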
func testClusterRecoveryInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { num := c.NumNodes() // One client for each node. initBank(t, c.PGUrl(ctx, 0)) start := timeutil.Now() state := testState{ t: t, errChan: make(chan error, num), teardown: make(chan struct{}), deadline: start.Add(cfg.Duration), clients: make([]testClient, num), } for i := 0; i < num; i++ { state.clients[i].Lock() state.initClient(ctx, t, c, i) state.clients[i].Unlock() go transferMoneyLoop(ctx, i, &state, *numAccounts, *maxTransfer) } defer func() { <-state.teardown }() // Chaos monkey. rnd, seed := randutil.NewPseudoRand() log.Warningf(ctx, "monkey starts (seed %d)", seed) pickNodes := func() []int { return rnd.Perm(num)[:rnd.Intn(num)+1] } go chaosMonkey(ctx, &state, c, true, pickNodes, 0) waitClientsStop(ctx, num, &state, stall) // Verify accounts. verifyAccounts(t, &state.clients[0]) elapsed := timeutil.Since(start) var count uint64 counts := state.counts() for _, c := range counts { count += c } log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds()) }
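// The bank-style recovery tests share a testClient/testState pair whose
// definitions are not shown in this section. The shapes below are assumptions
// inferred purely from the call sites (Lock/Unlock, db, count, clients,
// counts); the real types very likely carry additional fields and methods
// (deadline handling, done, errChan, monkeyIteration).
type testClient struct {
	sync.RWMutex           // clients are locked while their node restarts
	db    *gosql.DB        // SQL connection to the client's node
	count uint64           // number of successful transfers, updated atomically
}

// counts returns a snapshot of each client's transfer count.
func (s *testState) counts() []uint64 {
	counts := make([]uint64, len(s.clients))
	for i := range s.clients {
		counts[i] = atomic.LoadUint64(&s.clients[i].count)
	}
	return counts
}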
func testRepairInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { testStopper := stop.NewStopper() dc := newDynamicClient(c, testStopper) testStopper.AddCloser(dc) defer testStopper.Stop() // Add some loads. for i := 0; i < c.NumNodes()*2; i++ { ID := i testStopper.RunWorker(func() { insertLoad(t, dc, ID) }) } // TODO(bram): #5345 add repair mechanism. select { case <-stopper: case <-time.After(cfg.Duration): } }
func testRepairInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { dc := newDynamicClient(c, stopper) stopper.AddCloser(stop.CloserFn(func() { dc.Close(ctx) })) // Add some loads. for i := 0; i < c.NumNodes()*2; i++ { ID := i stopper.RunWorker(func() { insertLoad(ctx, t, dc, ID) }) } // TODO(bram): #5345 add repair mechanism. select { case <-stopper.ShouldStop(): case <-time.After(cfg.Duration): } }
func testGossipPeeringsInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { num := c.NumNodes() deadline := timeutil.Now().Add(cfg.Duration) waitTime := longWaitTime if cfg.Duration < waitTime { waitTime = shortWaitTime } for timeutil.Now().Before(deadline) { CheckGossip(ctx, t, c, waitTime, HasPeers(num)) // Restart the first node. log.Infof(ctx, "restarting node 0") if err := c.Restart(ctx, 0); err != nil { t.Fatal(err) } CheckGossip(ctx, t, c, waitTime, HasPeers(num)) // Restart another node (if there is one). var pickedNode int if num > 1 { pickedNode = rand.Intn(num-1) + 1 } log.Infof(ctx, "restarting node %d", pickedNode) if err := c.Restart(ctx, pickedNode); err != nil { t.Fatal(err) } CheckGossip(ctx, t, c, waitTime, HasPeers(num)) } }
// BidirectionalPartitionNemesis is a nemesis which randomly severs the network
// symmetrically between two random groups of nodes. Partitioned and connected
// modes alternate, each lasting a random duration of up to 15s.
func BidirectionalPartitionNemesis(t *testing.T, stop <-chan struct{}, c cluster.Cluster) {
	randSec := func() time.Duration { return time.Duration(rand.Int63n(15 * int64(time.Second))) }
	log.Infof(context.Background(), "cleaning up any previous rules")
	_ = restoreNetwork(t, c) // clean up any potential leftovers
	log.Infof(context.Background(), "starting partition nemesis")
	for {
		ch := make(chan struct{})
		go func() {
			select {
			case <-time.After(randSec()):
			case <-stop:
			}
			close(ch)
		}()
		cutNetwork(t, c, ch, randomBidirectionalPartition(c.NumNodes())...)
		select {
		case <-stop:
			return
		case <-time.After(randSec()):
		}
	}
}
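// randomBidirectionalPartition is used above but not defined in this section.
// A plausible implementation, assumed from its use with cutNetwork: split the
// node indices into two random, non-empty groups that will be isolated from
// each other.
func randomBidirectionalPartition(numNodes int) [][]int {
	if numNodes < 2 {
		return nil // nothing to partition
	}
	perm := rand.Perm(numNodes)
	cut := rand.Intn(numNodes-1) + 1 // keep both sides non-empty
	return [][]int{perm[:cut], perm[cut:]}
}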
func checkRangeReplication(t *testing.T, c cluster.Cluster, d time.Duration) { if c.NumNodes() < 1 { // Looks silly, but we actually start zero-node clusters in the // reference tests. t.Log("replication test is a no-op for empty cluster") return } wantedReplicas := 3 if c.NumNodes() < 3 { wantedReplicas = c.NumNodes() } log.Infof(context.Background(), "waiting for first range to have %d replicas", wantedReplicas) util.SucceedsSoon(t, func() error { // Reconnect on every iteration; gRPC will eagerly tank the connection // on transport errors. Always talk to node 0 because it's guaranteed // to exist. client, dbStopper := c.NewClient(t, 0) defer dbStopper.Stop() select { case <-stopper: t.Fatalf("interrupted") return nil case <-time.After(1 * time.Second): } foundReplicas, err := countRangeReplicas(client) if err != nil { return err } if log.V(1) { log.Infof(context.Background(), "found %d replicas", foundReplicas) } if foundReplicas >= wantedReplicas { return nil } return fmt.Errorf("expected %d replicas, only found %d", wantedReplicas, foundReplicas) }) log.Infof(context.Background(), "found %d replicas", wantedReplicas) }
func testNodeRestartInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { num := c.NumNodes() if minNum := 3; num < minNum { t.Skipf("need at least %d nodes, got %d", minNum, num) } // One client for each node. initBank(t, c.PGUrl(ctx, 0)) start := timeutil.Now() state := testState{ t: t, errChan: make(chan error, 1), teardown: make(chan struct{}), deadline: start.Add(cfg.Duration), clients: make([]testClient, 1), } clientIdx := num - 1 client := &state.clients[0] client.Lock() client.db = makePGClient(t, c.PGUrl(ctx, clientIdx)) client.Unlock() go transferMoneyLoop(ctx, 0, &state, *numAccounts, *maxTransfer) defer func() { <-state.teardown }() // Chaos monkey. rnd, seed := randutil.NewPseudoRand() log.Warningf(ctx, "monkey starts (seed %d)", seed) pickNodes := func() []int { return []int{rnd.Intn(clientIdx)} } go chaosMonkey(ctx, &state, c, false, pickNodes, clientIdx) waitClientsStop(ctx, 1, &state, stall) // Verify accounts. verifyAccounts(t, client) elapsed := timeutil.Since(start) count := atomic.LoadUint64(&client.count) log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds()) }
func testAdminLossOfQuorumInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { if c.NumNodes() < 2 { t.Logf("skipping test %s because given cluster has too few nodes", cfg.Name) return } // Get the ids for each node. nodeIDs := make([]roachpb.NodeID, c.NumNodes()) for i := 0; i < c.NumNodes(); i++ { var details serverpb.DetailsResponse if err := httputil.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/details/local", &details); err != nil { t.Fatal(err) } nodeIDs[i] = details.NodeID } // Leave only the first node alive. for i := 1; i < c.NumNodes(); i++ { if err := c.Kill(i); err != nil { t.Fatal(err) } } // Retrieve node statuses. var nodes serverpb.NodesResponse if err := httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes", &nodes); err != nil { t.Fatal(err) } for _, nodeID := range nodeIDs { var nodeStatus status.NodeStatus if err := httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes/"+strconv.Itoa(int(nodeID)), &nodeStatus); err != nil { t.Fatal(err) } } // Retrieve time-series data. nowNanos := timeutil.Now().UnixNano() queryRequest := tspb.TimeSeriesQueryRequest{ StartNanos: nowNanos - 10*time.Second.Nanoseconds(), EndNanos: nowNanos, Queries: []tspb.Query{ {Name: "doesn't_matter", Sources: []string{}}, }, } var queryResponse tspb.TimeSeriesQueryResponse if err := httputil.PostJSON(cluster.HTTPClient, c.URL(0)+"/ts/query", &queryRequest, &queryResponse); err != nil { t.Fatal(err) } // TODO(cdo): When we're able to issue SQL queries without a quorum, test all // admin endpoints that issue SQL queries here. }
// initClient initializes the client talking to node "i". // It requires that the caller hold the client's write lock. func (state *testState) initClient(t *testing.T, c cluster.Cluster, i int) { state.clients[i].db = makePGClient(t, c.PGUrl(i)) }
// initClient initializes the client talking to node "i". // It requires that the caller hold the client's write lock. func (state *testState) initClient(ctx context.Context, t *testing.T, c cluster.Cluster, i int) { state.clients[i].db = makePGClient(t, c.PGUrl(ctx, i)) }
func testMonotonicInsertsInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { var clients []mtClient for i := 0; i < c.NumNodes(); i++ { clients = append(clients, mtClient{ID: i, DB: makePGClient(t, c.PGUrl(ctx, i))}) } // We will insert into this table by selecting MAX(val) and increasing by // one and expect that val and sts (the commit timestamp) are both // simultaneously increasing. if _, err := clients[0].Exec(` CREATE DATABASE mono; CREATE TABLE IF NOT EXISTS mono.mono (val INT, sts STRING, node INT, tb INT); INSERT INTO mono.mono VALUES(-1, '0', -1, -1)`); err != nil { t.Fatal(err) } var idGen uint64 invoke := func(client mtClient) { logPrefix := fmt.Sprintf("%03d.%03d: ", atomic.AddUint64(&idGen, 1), client.ID) l := func(msg string, args ...interface{}) { log.Infof(ctx, logPrefix+msg, args...) if log.V(2) { t.Logf(logPrefix+msg, args...) } } l("begin") defer l("done") var exRow, insRow mtRow var attempt int if err := crdb.ExecuteTx(client.DB, func(tx *gosql.Tx) error { attempt++ l("attempt %d", attempt) if err := tx.QueryRow(`SELECT cluster_logical_timestamp()`).Scan( &insRow.sts, ); err != nil { l(err.Error()) return err } l("read max val") if err := tx.QueryRow(`SELECT MAX(val) AS m FROM mono.mono`).Scan( &exRow.val, ); err != nil { l(err.Error()) return err } l("read max row for val=%d", exRow.val) if err := tx.QueryRow(`SELECT sts, node, tb FROM mono.mono WHERE val = $1`, exRow.val, ).Scan( &exRow.sts, &exRow.node, &exRow.tb, ); err != nil { l(err.Error()) return err } l("insert") if err := tx.QueryRow(` INSERT INTO mono.mono (val, sts, node, tb) VALUES($1, $2, $3, $4) RETURNING val, sts, node, tb`, exRow.val+1, insRow.sts, client.ID, 0, ).Scan( &insRow.val, &insRow.sts, &insRow.node, &insRow.tb, ); err != nil { l(err.Error()) return err } l("commit") return nil }); err != nil { t.Errorf("%T: %v", err, err) } } verify := func() { client := clients[0] var numDistinct int if err := client.QueryRow("SELECT COUNT(DISTINCT(val)) FROM mono.mono").Scan( &numDistinct, ); err != nil { t.Fatal(err) } rows, err := client.Query("SELECT val, sts, node, tb FROM mono.mono ORDER BY val ASC, sts ASC") if err != nil { t.Fatal(err) } var results mtRows for rows.Next() { var row mtRow if err := rows.Scan(&row.val, &row.sts, &row.node, &row.tb); err != nil { t.Fatal(err) } results = append(results, row) } if !sort.IsSorted(results) { t.Errorf("results are not sorted:\n%s", results) } if numDistinct != len(results) { t.Errorf("'val' column is not unique: %d results, but %d distinct:\n%s", len(results), numDistinct, results) } } concurrency := 2 * c.NumNodes() sem := make(chan struct{}, concurrency) timer := time.After(cfg.Duration) defer verify() defer func() { // Now that consuming has stopped, fill up the semaphore (i.e. wait for // still-running goroutines to stop) for i := 0; i < concurrency; i++ { sem <- struct{}{} } }() for { select { case sem <- struct{}{}: case <-stopper.ShouldStop(): return case <-timer: return } go func(client mtClient) { invoke(client) <-sem }(clients[rand.Intn(c.NumNodes())]) } }
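// The monotonic-insert test above references mtClient, mtRow, and mtRows
// without showing them. The sketches below are assumptions reconstructed from
// the queries and the sort.IsSorted call (and need the bytes import): mtClient
// wraps a SQL connection plus an ID, and mtRows orders rows by (val, sts). The
// real Less and String methods may compare timestamps more carefully.
type mtClient struct {
	*gosql.DB
	ID int
}

type mtRow struct {
	val      int64
	sts      string
	node, tb int64
}

type mtRows []mtRow

func (r mtRows) Len() int      { return len(r) }
func (r mtRows) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

func (r mtRows) Less(i, j int) bool {
	if r[i].val == r[j].val {
		return r[i].sts < r[j].sts
	}
	return r[i].val < r[j].val
}

func (r mtRows) String() string {
	var buf bytes.Buffer
	for _, row := range r {
		fmt.Fprintf(&buf, "%d @ %s (node %d, tbl %d)\n", row.val, row.sts, row.node, row.tb)
	}
	return buf.String()
}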
// chaosMonkey picks a set of nodes and restarts them. If stopClients is set // all the clients are locked before the nodes are restarted. func chaosMonkey( ctx context.Context, state *testState, c cluster.Cluster, stopClients bool, pickNodes func() []int, consistentIdx int, ) { defer close(state.teardown) for curRound := uint64(1); !state.done(); curRound++ { atomic.StoreUint64(&state.monkeyIteration, curRound) select { case <-stopper.ShouldStop(): return default: } // Pick nodes to be restarted. nodes := pickNodes() if stopClients { // Prevent all clients from writing while nodes are being restarted. for i := 0; i < len(state.clients); i++ { state.clients[i].Lock() } } log.Infof(ctx, "round %d: restarting nodes %v", curRound, nodes) for _, i := range nodes { // Two early exit conditions. select { case <-stopper.ShouldStop(): break default: } if state.done() { break } log.Infof(ctx, "round %d: restarting %d", curRound, i) if err := c.Kill(ctx, i); err != nil { state.t.Error(err) } if err := c.Restart(ctx, i); err != nil { state.t.Error(err) } if stopClients { // Reinitialize the client talking to the restarted node. state.initClient(ctx, state.t, c, i) } } if stopClients { for i := 0; i < len(state.clients); i++ { state.clients[i].Unlock() } } preCount := state.counts() madeProgress := func() bool { newCounts := state.counts() for i := range newCounts { if newCounts[i] > preCount[i] { return true } } return false } // Sleep until at least one client is writing successfully. log.Warningf(ctx, "round %d: monkey sleeping while cluster recovers...", curRound) for !state.done() && !madeProgress() { time.Sleep(time.Second) } c.Assert(ctx, state.t) if err := cluster.Consistent(ctx, c, consistentIdx); err != nil { state.t.Error(err) } log.Warningf(ctx, "round %d: cluster recovered", curRound) } }
func testSingleKeyInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { num := c.NumNodes() // Initialize the value for our test key to zero. const key = "test-key" initDB := c.NewClient(ctx, t, 0) if err := initDB.Put(ctx, key, 0); err != nil { t.Fatal(err) } type result struct { err error maxLatency time.Duration } resultCh := make(chan result, num) deadline := timeutil.Now().Add(cfg.Duration) var expected int64 // Start up num workers each reading and writing the same // key. Each worker is configured to talk to a different node in the // cluster. for i := 0; i < num; i++ { db := c.NewClient(ctx, t, i) go func() { var r result for timeutil.Now().Before(deadline) { start := timeutil.Now() err := db.Txn(ctx, func(txn *client.Txn) error { minExp := atomic.LoadInt64(&expected) r, err := txn.Get(key) if err != nil { return err } b := txn.NewBatch() v := r.ValueInt() b.Put(key, v+1) err = txn.CommitInBatch(b) // Atomic updates after the fact mean that we should read // exp or larger (since concurrent writers might have // committed but not yet performed their atomic update). if err == nil && v < minExp { return errors.Errorf("unexpected read: %d, expected >= %d", v, minExp) } return err }) if err != nil { resultCh <- result{err: err} return } atomic.AddInt64(&expected, 1) latency := timeutil.Since(start) if r.maxLatency < latency { r.maxLatency = latency } } resultCh <- r }() } // Verify that none of the workers encountered an error. var results []result for len(results) < num { select { case <-stopper.ShouldStop(): t.Fatalf("interrupted") case r := <-resultCh: if r.err != nil { t.Fatal(r.err) } results = append(results, r) case <-time.After(1 * time.Second): // Periodically print out progress so that we know the test is still // running. log.Infof(ctx, "%d", atomic.LoadInt64(&expected)) } } // Verify the resulting value stored at the key is what we expect. r, err := initDB.Get(ctx, key) if err != nil { t.Fatal(err) } v := r.ValueInt() if expected != v { t.Fatalf("expected %d, but found %d", expected, v) } var maxLatency []time.Duration for _, r := range results { maxLatency = append(maxLatency, r.maxLatency) } log.Infof(ctx, "%d increments: %s", v, maxLatency) }
func testEventLogInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if num <= 0 {
		t.Fatalf("%d nodes in cluster", num)
	}

	var confirmedClusterID uuid.UUID
	type nodeEventInfo struct {
		Descriptor roachpb.NodeDescriptor
		ClusterID  uuid.UUID
	}

	// Verify that a node_join message was logged for each node in the cluster.
	// We expect there to eventually be one such message for each node in the
	// cluster, and each message must be correctly formatted.
	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node join events. There should be one for each node in the
		// cluster.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeJoin))
		if err != nil {
			return err
		}
		seenIds := make(map[int64]struct{})
		var clusterID uuid.UUID
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node join, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node join with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify cluster ID is recorded, and is the same for all nodes.
			if (info.ClusterID == uuid.UUID{}) {
				t.Fatalf("Node join recorded nil cluster id, info: %v", info)
			}
			if (clusterID == uuid.UUID{}) {
				clusterID = info.ClusterID
			} else if clusterID != info.ClusterID {
				t.Fatalf(
					"Node join recorded different cluster ID than earlier node. Expected %s, got %s. Info: %v",
					clusterID, info.ClusterID, info)
			}

			// Verify that all NodeIDs are different.
			if _, ok := seenIds[targetID]; ok {
				t.Fatalf("Node ID %d seen in two different node join messages", targetID)
			}
			seenIds[targetID] = struct{}{}
		}
		if err := rows.Err(); err != nil {
			return err
		}

		if a, e := len(seenIds), c.NumNodes(); a != e {
			return errors.Errorf("expected %d node join messages, found %d: %v", e, a, seenIds)
		}

		confirmedClusterID = clusterID
		return nil
	})

	// Stop and Start Node 0, and verify the node restart message.
	if err := c.Kill(ctx, 0); err != nil {
		t.Fatal(err)
	}
	if err := c.Restart(ctx, 0); err != nil {
		t.Fatal(err)
	}

	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node restart events. There should only be one.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeRestart))
		if err != nil {
			return err
		}

		seenCount := 0
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node restart, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node restart with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify cluster ID is recorded, and is the same for all nodes.
			if confirmedClusterID != info.ClusterID {
				t.Fatalf(
					"Node restart recorded different cluster ID than earlier join. Expected %s, got %s. Info: %v",
					confirmedClusterID, info.ClusterID, info)
			}
			seenCount++
		}
		if err := rows.Err(); err != nil {
			return err
		}
		if seenCount != 1 {
			return errors.Errorf("Expected only one node restart event, found %d", seenCount)
		}
		return nil
	})
}
func testFreezeClusterInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) { minAffected := int64(server.ExpectedInitialRangeCount()) const long = time.Minute const short = 10 * time.Second mustPost := func(freeze bool) serverpb.ClusterFreezeResponse { reply, err := postFreeze(c, freeze, long) if err != nil { t.Fatal(errors.Errorf("%v", err)) } return reply } if reply := mustPost(false); reply.RangesAffected != 0 { t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected) } if reply := mustPost(true); reply.RangesAffected < minAffected { t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected) } if reply := mustPost(true); reply.RangesAffected != 0 { t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected) } if reply := mustPost(false); reply.RangesAffected < minAffected { t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected) } num := c.NumNodes() if num < 3 { t.Skip("skipping remainder of test; needs at least 3 nodes") } // Kill the last node. if err := c.Kill(num - 1); err != nil { t.Fatal(err) } // Attempt to freeze should get stuck (since it does not get confirmation // of the last node receiving the freeze command). // Note that this is the freeze trigger stalling on the Replica, not the // Store-polling mechanism. acceptErrs := strings.Join([]string{ "timed out waiting for Range", "Timeout exceeded while", "connection is closing", "deadline", // error returned via JSON when the server-side gRPC stream times out (due to // lack of new input). Unmarshaling that JSON fails with a message referencing // unknown fields, unfortunately in map order. "unknown field .*", }, "|") if reply, err := postFreeze(c, true, short); !testutils.IsError(err, acceptErrs) { t.Fatalf("expected timeout, got %v: %v", err, reply) } // Shut down the remaining nodes and restart them. for i := 0; i < num-1; i++ { if err := c.Kill(i); err != nil { t.Fatal(err) } } for i := 0; i < num; i++ { if err := c.Restart(i); err != nil { t.Fatal(err) } } // The cluster should now be fully operational (at least after waiting // a little bit) since each node tries to unfreeze everything when it // starts. if err := util.RetryForDuration(time.Minute, func() error { if _, err := postFreeze(c, false, short); err != nil { if testutils.IsError(err, "404 Not Found") { // It can take a bit until the endpoint is available. return err } t.Fatal(err) } // TODO(tschottdorf): moving the client creation outside of the retry // loop will break the test with the following message: // // client/rpc_sender.go:61: roachpb.Batch RPC failed as client // connection was closed // // Perhaps the cluster updates the address too late after restarting // the node. db, dbStopper := c.NewClient(t, 0) defer dbStopper.Stop() if _, err := db.Scan(context.TODO(), keys.LocalMax, roachpb.KeyMax, 0); err != nil { t.Fatal(err) } return nil }); err != nil { t.Fatal(err) } // Unfreezing again should be a no-op. if reply, err := postFreeze(c, false, long); err != nil { t.Fatal(err) } else if reply.RangesAffected > 0 { t.Fatalf("still %d frozen ranges", reply.RangesAffected) } }
func testGossipRestartInner( ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, ) { // This already replicates the first range (in the local setup). // The replication of the first range is important: as long as the // first range only exists on one node, that node can trivially // acquire the range lease. Once the range is replicated, however, // nodes must be able to discover each other over gossip before the // lease can be acquired. num := c.NumNodes() deadline := timeutil.Now().Add(cfg.Duration) waitTime := longWaitTime if cfg.Duration < waitTime { waitTime = shortWaitTime } for timeutil.Now().Before(deadline) { log.Infof(ctx, "waiting for initial gossip connections") CheckGossip(ctx, t, c, waitTime, HasPeers(num)) CheckGossip(ctx, t, c, waitTime, hasClusterID) CheckGossip(ctx, t, c, waitTime, hasSentinel) log.Infof(ctx, "killing all nodes") for i := 0; i < num; i++ { if err := c.Kill(ctx, i); err != nil { t.Fatal(err) } } log.Infof(ctx, "restarting all nodes") for i := 0; i < num; i++ { if err := c.Restart(ctx, i); err != nil { t.Fatal(err) } } log.Infof(ctx, "waiting for gossip to be connected") CheckGossip(ctx, t, c, waitTime, HasPeers(num)) CheckGossip(ctx, t, c, waitTime, hasClusterID) CheckGossip(ctx, t, c, waitTime, hasSentinel) for i := 0; i < num; i++ { db, err := c.NewClient(ctx, i) if err != nil { t.Fatal(err) } if i == 0 { if err := db.Del(ctx, "count"); err != nil { t.Fatal(err) } } var kv client.KeyValue if err := db.Txn(ctx, func(txn *client.Txn) error { var err error kv, err = txn.Inc("count", 1) return err }); err != nil { t.Fatal(err) } else if v := kv.ValueInt(); v != int64(i+1) { t.Fatalf("unexpected value %d for write #%d (expected %d)", v, i, i+1) } } } }
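// testGossipRestartInner also uses two gossip checks not shown above. Hedged
// sketches, assuming the gossip package's KeyClusterID and KeySentinel
// constants: each passes once the corresponding key has been gossiped to the
// node being queried. The real helpers may validate the gossiped values as
// well.
func hasClusterID(infos map[string]gossip.Info) error {
	if _, ok := infos[gossip.KeyClusterID]; !ok {
		return errors.Errorf("no cluster ID gossiped")
	}
	return nil
}

func hasSentinel(infos map[string]gossip.Info) error {
	if _, ok := infos[gossip.KeySentinel]; !ok {
		return errors.Errorf("no sentinel gossiped")
	}
	return nil
}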