func testClusterRecoveryInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// One client for each node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, num),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, num),
	}

	for i := 0; i < num; i++ {
		state.clients[i].Lock()
		state.initClient(ctx, t, c, i)
		state.clients[i].Unlock()
		go transferMoneyLoop(ctx, i, &state, *numAccounts, *maxTransfer)
	}

	defer func() {
		<-state.teardown
	}()

	// Chaos monkey.
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return rnd.Perm(num)[:rnd.Intn(num)+1]
	}
	go chaosMonkey(ctx, &state, c, true, pickNodes, 0)

	waitClientsStop(ctx, num, &state, stall)

	// Verify accounts.
	verifyAccounts(t, &state.clients[0])

	elapsed := timeutil.Since(start)
	var count uint64
	counts := state.counts()
	for _, c := range counts {
		count += c
	}
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
func testNodeRestartInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if minNum := 3; num < minNum {
		t.Skipf("need at least %d nodes, got %d", minNum, num)
	}

	// A single client, connected to the last node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, 1),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, 1),
	}

	clientIdx := num - 1
	client := &state.clients[0]
	client.Lock()
	client.db = makePGClient(t, c.PGUrl(ctx, clientIdx))
	client.Unlock()
	go transferMoneyLoop(ctx, 0, &state, *numAccounts, *maxTransfer)

	defer func() {
		<-state.teardown
	}()

	// Chaos monkey.
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return []int{rnd.Intn(clientIdx)}
	}
	go chaosMonkey(ctx, &state, c, false, pickNodes, clientIdx)

	waitClientsStop(ctx, 1, &state, stall)

	// Verify accounts.
	verifyAccounts(t, client)

	elapsed := timeutil.Since(start)
	count := atomic.LoadUint64(&client.count)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
// initClient initializes the client talking to node "i".
// It requires that the caller hold the client's write lock.
func (state *testState) initClient(ctx context.Context, t *testing.T, c cluster.Cluster, i int) {
	state.clients[i].db = makePGClient(t, c.PGUrl(ctx, i))
}
func testEventLogInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if num <= 0 {
		t.Fatalf("%d nodes in cluster", num)
	}

	var confirmedClusterID uuid.UUID
	type nodeEventInfo struct {
		Descriptor roachpb.NodeDescriptor
		ClusterID  uuid.UUID
	}

	// Verify that a node_join message was logged for each node in the cluster.
	// We expect there to eventually be one such message for each node in the
	// cluster, and each message must be correctly formatted.
	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node join events. There should be one for each node in the
		// cluster.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeJoin))
		if err != nil {
			return err
		}
		seenIds := make(map[int64]struct{})
		var clusterID uuid.UUID
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node join, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node join with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify cluster ID is recorded, and is the same for all nodes.
			if (info.ClusterID == uuid.UUID{}) {
				t.Fatalf("Node join recorded nil cluster id, info: %v", info)
			}
			if (clusterID == uuid.UUID{}) {
				clusterID = info.ClusterID
			} else if clusterID != info.ClusterID {
				t.Fatalf(
					"Node join recorded different cluster ID than earlier node. Expected %s, got %s. Info: %v",
					clusterID, info.ClusterID, info)
			}

			// Verify that all NodeIDs are different.
			if _, ok := seenIds[targetID]; ok {
				t.Fatalf("Node ID %d seen in two different node join messages", targetID)
			}
			seenIds[targetID] = struct{}{}
		}
		if err := rows.Err(); err != nil {
			return err
		}

		if a, e := len(seenIds), c.NumNodes(); a != e {
			return errors.Errorf("expected %d node join messages, found %d: %v", e, a, seenIds)
		}

		confirmedClusterID = clusterID
		return nil
	})

	// Stop and restart node 0, and verify the node restart message.
	if err := c.Kill(ctx, 0); err != nil {
		t.Fatal(err)
	}
	if err := c.Restart(ctx, 0); err != nil {
		t.Fatal(err)
	}

	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node restart events. There should only be one.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeRestart))
		if err != nil {
			return err
		}

		seenCount := 0
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node restart, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node restart with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify the recorded cluster ID matches the one confirmed at join time.
			if confirmedClusterID != info.ClusterID {
				t.Fatalf(
					"Node restart recorded different cluster ID than earlier join. Expected %s, got %s. Info: %v",
					confirmedClusterID, info.ClusterID, info)
			}

			seenCount++
		}
		if err := rows.Err(); err != nil {
			return err
		}
		if seenCount != 1 {
			return errors.Errorf("expected only one node restart event, found %d", seenCount)
		}
		return nil
	})
}
func testMonotonicInsertsInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	var clients []mtClient
	for i := 0; i < c.NumNodes(); i++ {
		clients = append(clients, mtClient{ID: i, DB: makePGClient(t, c.PGUrl(ctx, i))})
	}
	// We will insert into this table by selecting MAX(val) and increasing by
	// one and expect that val and sts (the commit timestamp) are both
	// simultaneously increasing.
	if _, err := clients[0].Exec(`
CREATE DATABASE mono;
CREATE TABLE IF NOT EXISTS mono.mono (val INT, sts STRING, node INT, tb INT);
INSERT INTO mono.mono VALUES(-1, '0', -1, -1)`); err != nil {
		t.Fatal(err)
	}

	var idGen uint64

	invoke := func(client mtClient) {
		logPrefix := fmt.Sprintf("%03d.%03d: ", atomic.AddUint64(&idGen, 1), client.ID)
		l := func(msg string, args ...interface{}) {
			log.Infof(ctx, logPrefix+msg, args...)
			if log.V(2) {
				t.Logf(logPrefix+msg, args...)
			}
		}
		l("begin")
		defer l("done")

		var exRow, insRow mtRow
		var attempt int
		if err := crdb.ExecuteTx(client.DB, func(tx *gosql.Tx) error {
			attempt++
			l("attempt %d", attempt)
			if err := tx.QueryRow(`SELECT cluster_logical_timestamp()`).Scan(
				&insRow.sts,
			); err != nil {
				l(err.Error())
				return err
			}

			l("read max val")
			if err := tx.QueryRow(`SELECT MAX(val) AS m FROM mono.mono`).Scan(
				&exRow.val,
			); err != nil {
				l(err.Error())
				return err
			}

			l("read max row for val=%d", exRow.val)
			if err := tx.QueryRow(`SELECT sts, node, tb FROM mono.mono WHERE val = $1`,
				exRow.val,
			).Scan(
				&exRow.sts, &exRow.node, &exRow.tb,
			); err != nil {
				l(err.Error())
				return err
			}

			l("insert")
			if err := tx.QueryRow(`
INSERT INTO mono.mono (val, sts, node, tb) VALUES($1, $2, $3, $4)
RETURNING val, sts, node, tb`,
				exRow.val+1, insRow.sts, client.ID, 0,
			).Scan(
				&insRow.val, &insRow.sts, &insRow.node, &insRow.tb,
			); err != nil {
				l(err.Error())
				return err
			}
			l("commit")
			return nil
		}); err != nil {
			t.Errorf("%T: %v", err, err)
		}
	}

	verify := func() {
		client := clients[0]
		var numDistinct int
		if err := client.QueryRow("SELECT COUNT(DISTINCT(val)) FROM mono.mono").Scan(
			&numDistinct,
		); err != nil {
			t.Fatal(err)
		}
		rows, err := client.Query("SELECT val, sts, node, tb FROM mono.mono ORDER BY val ASC, sts ASC")
		if err != nil {
			t.Fatal(err)
		}
		var results mtRows
		for rows.Next() {
			var row mtRow
			if err := rows.Scan(&row.val, &row.sts, &row.node, &row.tb); err != nil {
				t.Fatal(err)
			}
			results = append(results, row)
		}

		if !sort.IsSorted(results) {
			t.Errorf("results are not sorted:\n%s", results)
		}

		if numDistinct != len(results) {
			t.Errorf("'val' column is not unique: %d results, but %d distinct:\n%s",
				len(results), numDistinct, results)
		}
	}

	concurrency := 2 * c.NumNodes()
	sem := make(chan struct{}, concurrency)
	timer := time.After(cfg.Duration)
	defer verify()
	defer func() {
		// Now that consuming has stopped, fill up the semaphore (i.e. wait for
		// still-running goroutines to stop).
		for i := 0; i < concurrency; i++ {
			sem <- struct{}{}
		}
	}()

	for {
		select {
		case sem <- struct{}{}:
		case <-stopper.ShouldStop():
			return
		case <-timer:
			return
		}
		go func(client mtClient) {
			invoke(client)
			<-sem
		}(clients[rand.Intn(c.NumNodes())])
	}
}