Example #1
1
func postFreeze(
	c cluster.Cluster, freeze bool, timeout time.Duration,
) (serverpb.ClusterFreezeResponse, error) {
	httpClient := cluster.HTTPClient
	httpClient.Timeout = timeout

	var resp serverpb.ClusterFreezeResponse
	log.Infof(context.Background(), "requesting: freeze=%t, timeout=%s", freeze, timeout)
	cb := func(v proto.Message) {
		oldNum := resp.RangesAffected
		resp = *v.(*serverpb.ClusterFreezeResponse)
		if oldNum > resp.RangesAffected {
			resp.RangesAffected = oldNum
		}
		if (resp != serverpb.ClusterFreezeResponse{}) {
			log.Infof(context.Background(), "%+v", &resp)
		}
	}
	err := httputil.StreamJSON(
		httpClient,
		c.URL(0)+"/_admin/v1/cluster/freeze",
		&serverpb.ClusterFreezeRequest{Freeze: freeze},
		&serverpb.ClusterFreezeResponse{},
		cb,
	)
	return resp, err
}
Example #2
0
func cutNetwork(t *testing.T, c cluster.Cluster, closer <-chan struct{}, partitions ...[]int) {
	defer func() {
		if errs := restoreNetwork(t, c); len(errs) > 0 {
			t.Fatalf("errors restoring the network: %+v", errs)
		}
	}()
	addrs, addrsToNode := mustGetHosts(t, c)
	ipPartitions := make([][]iptables.IP, 0, len(partitions))
	for _, partition := range partitions {
		ipPartition := make([]iptables.IP, 0, len(partition))
		for _, nodeIndex := range partition {
			ipPartition = append(ipPartition, addrs[nodeIndex])
		}
		ipPartitions = append(ipPartitions, ipPartition)
	}
	log.Warningf(context.TODO(), "partitioning: %v (%v)", partitions, ipPartitions)
	for host, cmds := range iptables.Rules(iptables.Bidirectional(ipPartitions...)) {
		for _, cmd := range cmds {
			if err := c.ExecRoot(addrsToNode[host], cmd); err != nil {
				t.Fatal(err)
			}
		}
	}
	<-closer
	log.Warningf(context.TODO(), "resolved all partitions")
}
Example #3
0
func testStatusServerInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	// Get the ids for each node.
	idMap := make(map[int]roachpb.NodeID)
	for i := 0; i < c.NumNodes(); i++ {
		var details serverpb.DetailsResponse
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/details/local", &details); err != nil {
			t.Fatal(err)
		}
		idMap[i] = details.NodeID
	}

	// Check local response for the every node.
	for i := 0; i < c.NumNodes(); i++ {
		id := idMap[i]
		checkNode(ctx, t, c, i, id, id, id)
		get(ctx, t, c.URL(ctx, i), "/_status/nodes")
	}

	// Proxy from the first node to the last node.
	firstNode := 0
	lastNode := c.NumNodes() - 1
	firstID := idMap[firstNode]
	lastID := idMap[lastNode]
	checkNode(ctx, t, c, firstNode, firstID, lastID, lastID)

	// And from the last node to the first node.
	checkNode(ctx, t, c, lastNode, lastID, firstID, firstID)

	// And from the last node to the last node.
	checkNode(ctx, t, c, lastNode, lastID, lastID, lastID)
}
Example #4
0
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	var details serverpb.DetailsResponse
	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		default:
		}
		return httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details)
	})

	bi := details.BuildInfo
	testData := map[string]string{
		"go_version":   bi.GoVersion,
		"tag":          bi.Tag,
		"time":         bi.Time,
		"dependencies": bi.Dependencies,
	}
	for key, val := range testData {
		if val == "" {
			t.Errorf("build info not set for \"%s\"", key)
		}
	}
}
Example #5
0
// CheckGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns 0 for every node,
// retrying for up to the given duration.
func CheckGossip(
	ctx context.Context, t testing.TB, c cluster.Cluster, d time.Duration, f CheckGossipFunc,
) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/gossip/local", &infoStatus); err != nil {
				return errors.Wrapf(err, "failed to get gossip status from node %d", i)
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
Example #6
0
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns 0 for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := httputil.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/gossip/local", &infoStatus); err != nil {
				return err
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}

		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
Example #7
0
func mustGetHosts(t *testing.T, c cluster.Cluster) ([]iptables.IP, map[iptables.IP]int) {
	var addrs []iptables.IP
	addrsToNode := make(map[iptables.IP]int)
	for i := 0; i < c.NumNodes(); i++ {
		addr := iptables.IP(c.InternalIP(i).String())
		addrsToNode[addr] = i
		addrs = append(addrs, addr)
	}
	return addrs, addrsToNode
}
Example #8
0
func restoreNetwork(t *testing.T, c cluster.Cluster) []error {
	var errs []error
	for i := 0; i < c.NumNodes(); i++ {
		for _, cmd := range iptables.Reset() {
			if err := c.ExecRoot(i, cmd); err != nil {
				errs = append(errs, err)
			}
		}
	}
	return errs
}
Example #9
0
func testPutInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	db, err := c.NewClient(ctx, 0)
	if err != nil {
		t.Fatal(err)
	}

	errs := make(chan error, c.NumNodes())
	start := timeutil.Now()
	deadline := start.Add(cfg.Duration)
	var count int64
	for i := 0; i < c.NumNodes(); i++ {
		go func() {
			r, _ := randutil.NewPseudoRand()
			value := randutil.RandBytes(r, 8192)

			for timeutil.Now().Before(deadline) {
				k := atomic.AddInt64(&count, 1)
				v := value[:r.Intn(len(value))]
				if err := db.Put(ctx, fmt.Sprintf("%08d", k), v); err != nil {
					errs <- err
					return
				}
			}
			errs <- nil
		}()
	}

	for i := 0; i < c.NumNodes(); {
		baseCount := atomic.LoadInt64(&count)
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case err := <-errs:
			if err != nil {
				t.Fatal(err)
			}
			i++
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			loadedCount := atomic.LoadInt64(&count)
			log.Infof(ctx, "%d (%d/s)", loadedCount, loadedCount-baseCount)
			c.Assert(ctx, t)
			if err := cluster.Consistent(ctx, c, 0); err != nil {
				t.Fatal(err)
			}
		}
	}

	elapsed := timeutil.Since(start)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
Example #10
0
// checkNode checks all the endpoints of the status server hosted by node and
// requests info for the node with otherNodeID. That node could be the same
// other node, the same node or "local".
func checkNode(
	ctx context.Context,
	t *testing.T,
	c cluster.Cluster,
	i int,
	nodeID,
	otherNodeID, expectedNodeID roachpb.NodeID,
) {
	urlIDs := []string{otherNodeID.String()}
	if nodeID == otherNodeID {
		urlIDs = append(urlIDs, "local")
	}
	var details serverpb.DetailsResponse
	for _, urlID := range urlIDs {
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/details/"+urlID, &details); err != nil {
			t.Fatal(errors.Errorf("unable to parse details - %s", err))
		}
		if details.NodeID != expectedNodeID {
			t.Fatal(errors.Errorf("%d calling %s: node ids don't match - expected %d, actual %d", nodeID, urlID, expectedNodeID, details.NodeID))
		}

		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/gossip/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/nodes/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/logfiles/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/logs/%s", urlID))
		get(ctx, t, c.URL(ctx, i), fmt.Sprintf("/_status/stacks/%s", urlID))
	}
}
Example #11
0
func testClusterRecoveryInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// One client for each node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, num),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, num),
	}

	for i := 0; i < num; i++ {
		state.clients[i].Lock()
		state.initClient(ctx, t, c, i)
		state.clients[i].Unlock()
		go transferMoneyLoop(ctx, i, &state, *numAccounts, *maxTransfer)
	}

	defer func() {
		<-state.teardown
	}()

	// Chaos monkey.
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return rnd.Perm(num)[:rnd.Intn(num)+1]
	}
	go chaosMonkey(ctx, &state, c, true, pickNodes, 0)

	waitClientsStop(ctx, num, &state, stall)

	// Verify accounts.
	verifyAccounts(t, &state.clients[0])

	elapsed := timeutil.Since(start)
	var count uint64
	counts := state.counts()
	for _, c := range counts {
		count += c
	}
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
Example #12
0
func testRepairInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	testStopper := stop.NewStopper()
	dc := newDynamicClient(c, testStopper)
	testStopper.AddCloser(dc)
	defer testStopper.Stop()

	// Add some loads.
	for i := 0; i < c.NumNodes()*2; i++ {
		ID := i
		testStopper.RunWorker(func() {
			insertLoad(t, dc, ID)
		})
	}

	// TODO(bram): #5345 add repair mechanism.

	select {
	case <-stopper:
	case <-time.After(cfg.Duration):
	}
}
Example #13
0
func testRepairInner(ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	dc := newDynamicClient(c, stopper)
	stopper.AddCloser(stop.CloserFn(func() {
		dc.Close(ctx)
	}))

	// Add some loads.
	for i := 0; i < c.NumNodes()*2; i++ {
		ID := i
		stopper.RunWorker(func() {
			insertLoad(ctx, t, dc, ID)
		})
	}

	// TODO(bram): #5345 add repair mechanism.

	select {
	case <-stopper.ShouldStop():
	case <-time.After(cfg.Duration):
	}
}
func testGossipPeeringsInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	deadline := timeutil.Now().Add(cfg.Duration)

	waitTime := longWaitTime
	if cfg.Duration < waitTime {
		waitTime = shortWaitTime
	}

	for timeutil.Now().Before(deadline) {
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))

		// Restart the first node.
		log.Infof(ctx, "restarting node 0")
		if err := c.Restart(ctx, 0); err != nil {
			t.Fatal(err)
		}
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))

		// Restart another node (if there is one).
		var pickedNode int
		if num > 1 {
			pickedNode = rand.Intn(num-1) + 1
		}
		log.Infof(ctx, "restarting node %d", pickedNode)
		if err := c.Restart(ctx, pickedNode); err != nil {
			t.Fatal(err)
		}
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))
	}
}
Example #15
0
// BidirectionalPartitionNemesis is a nemesis which randomly severs the network
// symmetrically between two random groups of nodes. Partitioned and connected
// mode take alternating turns, with random durations of up to 15s.
func BidirectionalPartitionNemesis(t *testing.T, stop <-chan struct{}, c cluster.Cluster) {
	randSec := func() time.Duration { return time.Duration(rand.Int63n(15 * int64(time.Second))) }
	log.Infof(context.Background(), "cleaning up any previous rules")
	_ = restoreNetwork(t, c) // clean up any potential leftovers
	log.Infof(context.Background(), "starting partition nemesis")
	for {
		ch := make(chan struct{})
		go func() {
			select {
			case <-time.After(randSec()):
			case <-stop:
			}
			close(ch)
		}()
		cutNetwork(t, c, ch, randomBidirectionalPartition(c.NumNodes())...)
		select {
		case <-stop:
			return
		case <-time.After(randSec()):
		}
	}
}
Example #16
0
func checkRangeReplication(t *testing.T, c cluster.Cluster, d time.Duration) {
	if c.NumNodes() < 1 {
		// Looks silly, but we actually start zero-node clusters in the
		// reference tests.
		t.Log("replication test is a no-op for empty cluster")
		return
	}

	wantedReplicas := 3
	if c.NumNodes() < 3 {
		wantedReplicas = c.NumNodes()
	}

	log.Infof(context.Background(), "waiting for first range to have %d replicas", wantedReplicas)

	util.SucceedsSoon(t, func() error {
		// Reconnect on every iteration; gRPC will eagerly tank the connection
		// on transport errors. Always talk to node 0 because it's guaranteed
		// to exist.
		client, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		foundReplicas, err := countRangeReplicas(client)
		if err != nil {
			return err
		}

		if log.V(1) {
			log.Infof(context.Background(), "found %d replicas", foundReplicas)
		}
		if foundReplicas >= wantedReplicas {
			return nil
		}
		return fmt.Errorf("expected %d replicas, only found %d", wantedReplicas, foundReplicas)
	})

	log.Infof(context.Background(), "found %d replicas", wantedReplicas)
}
Example #17
0
func testNodeRestartInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if minNum := 3; num < minNum {
		t.Skipf("need at least %d nodes, got %d", minNum, num)
	}

	// One client for each node.
	initBank(t, c.PGUrl(ctx, 0))

	start := timeutil.Now()
	state := testState{
		t:        t,
		errChan:  make(chan error, 1),
		teardown: make(chan struct{}),
		deadline: start.Add(cfg.Duration),
		clients:  make([]testClient, 1),
	}

	clientIdx := num - 1
	client := &state.clients[0]
	client.Lock()
	client.db = makePGClient(t, c.PGUrl(ctx, clientIdx))
	client.Unlock()
	go transferMoneyLoop(ctx, 0, &state, *numAccounts, *maxTransfer)

	defer func() {
		<-state.teardown
	}()

	// Chaos monkey.
	rnd, seed := randutil.NewPseudoRand()
	log.Warningf(ctx, "monkey starts (seed %d)", seed)
	pickNodes := func() []int {
		return []int{rnd.Intn(clientIdx)}
	}
	go chaosMonkey(ctx, &state, c, false, pickNodes, clientIdx)

	waitClientsStop(ctx, 1, &state, stall)

	// Verify accounts.
	verifyAccounts(t, client)

	elapsed := timeutil.Since(start)
	count := atomic.LoadUint64(&client.count)
	log.Infof(ctx, "%d %.1f/sec", count, float64(count)/elapsed.Seconds())
}
Example #18
0
func testAdminLossOfQuorumInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	if c.NumNodes() < 2 {
		t.Logf("skipping test %s because given cluster has too few nodes", cfg.Name)
		return
	}

	// Get the ids for each node.
	nodeIDs := make([]roachpb.NodeID, c.NumNodes())
	for i := 0; i < c.NumNodes(); i++ {
		var details serverpb.DetailsResponse
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/details/local", &details); err != nil {
			t.Fatal(err)
		}
		nodeIDs[i] = details.NodeID
	}

	// Leave only the first node alive.
	for i := 1; i < c.NumNodes(); i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}

	// Retrieve node statuses.
	var nodes serverpb.NodesResponse
	if err := httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes", &nodes); err != nil {
		t.Fatal(err)
	}

	for _, nodeID := range nodeIDs {
		var nodeStatus status.NodeStatus
		if err := httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes/"+strconv.Itoa(int(nodeID)), &nodeStatus); err != nil {
			t.Fatal(err)
		}
	}

	// Retrieve time-series data.
	nowNanos := timeutil.Now().UnixNano()
	queryRequest := tspb.TimeSeriesQueryRequest{
		StartNanos: nowNanos - 10*time.Second.Nanoseconds(),
		EndNanos:   nowNanos,
		Queries: []tspb.Query{
			{Name: "doesn't_matter", Sources: []string{}},
		},
	}
	var queryResponse tspb.TimeSeriesQueryResponse
	if err := httputil.PostJSON(cluster.HTTPClient, c.URL(0)+"/ts/query",
		&queryRequest, &queryResponse); err != nil {
		t.Fatal(err)
	}

	// TODO(cdo): When we're able to issue SQL queries without a quorum, test all
	// admin endpoints that issue SQL queries here.
}
Example #19
0
// initClient initializes the client talking to node "i".
// It requires that the caller hold the client's write lock.
func (state *testState) initClient(t *testing.T, c cluster.Cluster, i int) {
	state.clients[i].db = makePGClient(t, c.PGUrl(i))
}
Example #20
0
// initClient initializes the client talking to node "i".
// It requires that the caller hold the client's write lock.
func (state *testState) initClient(ctx context.Context, t *testing.T, c cluster.Cluster, i int) {
	state.clients[i].db = makePGClient(t, c.PGUrl(ctx, i))
}
Example #21
0
func testMonotonicInsertsInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	var clients []mtClient
	for i := 0; i < c.NumNodes(); i++ {
		clients = append(clients, mtClient{ID: i, DB: makePGClient(t, c.PGUrl(ctx, i))})
	}
	// We will insert into this table by selecting MAX(val) and increasing by
	// one and expect that val and sts (the commit timestamp) are both
	// simultaneously increasing.
	if _, err := clients[0].Exec(`
CREATE DATABASE mono;
CREATE TABLE IF NOT EXISTS mono.mono (val INT, sts STRING, node INT, tb INT);
INSERT INTO mono.mono VALUES(-1, '0', -1, -1)`); err != nil {
		t.Fatal(err)
	}

	var idGen uint64

	invoke := func(client mtClient) {
		logPrefix := fmt.Sprintf("%03d.%03d: ", atomic.AddUint64(&idGen, 1), client.ID)
		l := func(msg string, args ...interface{}) {
			log.Infof(ctx, logPrefix+msg, args...)
			if log.V(2) {
				t.Logf(logPrefix+msg, args...)
			}
		}
		l("begin")
		defer l("done")

		var exRow, insRow mtRow
		var attempt int
		if err := crdb.ExecuteTx(client.DB, func(tx *gosql.Tx) error {
			attempt++
			l("attempt %d", attempt)
			if err := tx.QueryRow(`SELECT cluster_logical_timestamp()`).Scan(
				&insRow.sts,
			); err != nil {
				l(err.Error())
				return err
			}

			l("read max val")
			if err := tx.QueryRow(`SELECT MAX(val) AS m FROM mono.mono`).Scan(
				&exRow.val,
			); err != nil {
				l(err.Error())
				return err
			}

			l("read max row for val=%d", exRow.val)
			if err := tx.QueryRow(`SELECT sts, node, tb FROM mono.mono WHERE val = $1`,
				exRow.val,
			).Scan(
				&exRow.sts, &exRow.node, &exRow.tb,
			); err != nil {
				l(err.Error())
				return err
			}

			l("insert")
			if err := tx.QueryRow(`
INSERT INTO mono.mono (val, sts, node, tb) VALUES($1, $2, $3, $4)
RETURNING val, sts, node, tb`,
				exRow.val+1, insRow.sts, client.ID, 0,
			).Scan(
				&insRow.val, &insRow.sts, &insRow.node, &insRow.tb,
			); err != nil {
				l(err.Error())
				return err
			}
			l("commit")
			return nil
		}); err != nil {
			t.Errorf("%T: %v", err, err)
		}
	}

	verify := func() {
		client := clients[0]
		var numDistinct int
		if err := client.QueryRow("SELECT COUNT(DISTINCT(val)) FROM mono.mono").Scan(
			&numDistinct,
		); err != nil {
			t.Fatal(err)
		}
		rows, err := client.Query("SELECT val, sts, node, tb FROM mono.mono ORDER BY val ASC, sts ASC")
		if err != nil {
			t.Fatal(err)
		}
		var results mtRows
		for rows.Next() {
			var row mtRow
			if err := rows.Scan(&row.val, &row.sts, &row.node, &row.tb); err != nil {
				t.Fatal(err)
			}
			results = append(results, row)
		}

		if !sort.IsSorted(results) {
			t.Errorf("results are not sorted:\n%s", results)
		}

		if numDistinct != len(results) {
			t.Errorf("'val' column is not unique: %d results, but %d distinct:\n%s",
				len(results), numDistinct, results)
		}
	}

	concurrency := 2 * c.NumNodes()

	sem := make(chan struct{}, concurrency)
	timer := time.After(cfg.Duration)

	defer verify()
	defer func() {
		// Now that consuming has stopped, fill up the semaphore (i.e. wait for
		// still-running goroutines to stop)
		for i := 0; i < concurrency; i++ {
			sem <- struct{}{}
		}
	}()

	for {
		select {
		case sem <- struct{}{}:
		case <-stopper.ShouldStop():
			return
		case <-timer:
			return
		}
		go func(client mtClient) {
			invoke(client)
			<-sem
		}(clients[rand.Intn(c.NumNodes())])
	}
}
Example #22
0
// chaosMonkey picks a set of nodes and restarts them. If stopClients is set
// all the clients are locked before the nodes are restarted.
func chaosMonkey(
	ctx context.Context,
	state *testState,
	c cluster.Cluster,
	stopClients bool,
	pickNodes func() []int,
	consistentIdx int,
) {
	defer close(state.teardown)
	for curRound := uint64(1); !state.done(); curRound++ {
		atomic.StoreUint64(&state.monkeyIteration, curRound)
		select {
		case <-stopper.ShouldStop():
			return
		default:
		}

		// Pick nodes to be restarted.
		nodes := pickNodes()

		if stopClients {
			// Prevent all clients from writing while nodes are being restarted.
			for i := 0; i < len(state.clients); i++ {
				state.clients[i].Lock()
			}
		}
		log.Infof(ctx, "round %d: restarting nodes %v", curRound, nodes)
		for _, i := range nodes {
			// Two early exit conditions.
			select {
			case <-stopper.ShouldStop():
				break
			default:
			}
			if state.done() {
				break
			}
			log.Infof(ctx, "round %d: restarting %d", curRound, i)
			if err := c.Kill(ctx, i); err != nil {
				state.t.Error(err)
			}
			if err := c.Restart(ctx, i); err != nil {
				state.t.Error(err)
			}
			if stopClients {
				// Reinitialize the client talking to the restarted node.
				state.initClient(ctx, state.t, c, i)
			}
		}
		if stopClients {
			for i := 0; i < len(state.clients); i++ {
				state.clients[i].Unlock()
			}
		}

		preCount := state.counts()

		madeProgress := func() bool {
			newCounts := state.counts()
			for i := range newCounts {
				if newCounts[i] > preCount[i] {
					return true
				}
			}
			return false
		}

		// Sleep until at least one client is writing successfully.
		log.Warningf(ctx, "round %d: monkey sleeping while cluster recovers...", curRound)
		for !state.done() && !madeProgress() {
			time.Sleep(time.Second)
		}
		c.Assert(ctx, state.t)

		if err := cluster.Consistent(ctx, c, consistentIdx); err != nil {
			state.t.Error(err)
		}
		log.Warningf(ctx, "round %d: cluster recovered", curRound)
	}
}
Example #23
0
func testSingleKeyInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()

	// Initialize the value for our test key to zero.
	const key = "test-key"
	initDB := c.NewClient(ctx, t, 0)
	if err := initDB.Put(ctx, key, 0); err != nil {
		t.Fatal(err)
	}

	type result struct {
		err        error
		maxLatency time.Duration
	}

	resultCh := make(chan result, num)
	deadline := timeutil.Now().Add(cfg.Duration)
	var expected int64

	// Start up num workers each reading and writing the same
	// key. Each worker is configured to talk to a different node in the
	// cluster.
	for i := 0; i < num; i++ {
		db := c.NewClient(ctx, t, i)
		go func() {
			var r result
			for timeutil.Now().Before(deadline) {
				start := timeutil.Now()
				err := db.Txn(ctx, func(txn *client.Txn) error {
					minExp := atomic.LoadInt64(&expected)
					r, err := txn.Get(key)
					if err != nil {
						return err
					}
					b := txn.NewBatch()
					v := r.ValueInt()
					b.Put(key, v+1)
					err = txn.CommitInBatch(b)
					// Atomic updates after the fact mean that we should read
					// exp or larger (since concurrent writers might have
					// committed but not yet performed their atomic update).
					if err == nil && v < minExp {
						return errors.Errorf("unexpected read: %d, expected >= %d", v, minExp)
					}
					return err
				})
				if err != nil {
					resultCh <- result{err: err}
					return
				}
				atomic.AddInt64(&expected, 1)
				latency := timeutil.Since(start)
				if r.maxLatency < latency {
					r.maxLatency = latency
				}
			}
			resultCh <- r
		}()
	}

	// Verify that none of the workers encountered an error.
	var results []result
	for len(results) < num {
		select {
		case <-stopper.ShouldStop():
			t.Fatalf("interrupted")
		case r := <-resultCh:
			if r.err != nil {
				t.Fatal(r.err)
			}
			results = append(results, r)
		case <-time.After(1 * time.Second):
			// Periodically print out progress so that we know the test is still
			// running.
			log.Infof(ctx, "%d", atomic.LoadInt64(&expected))
		}
	}

	// Verify the resulting value stored at the key is what we expect.
	r, err := initDB.Get(ctx, key)
	if err != nil {
		t.Fatal(err)
	}
	v := r.ValueInt()
	if expected != v {
		t.Fatalf("expected %d, but found %d", expected, v)
	}
	var maxLatency []time.Duration
	for _, r := range results {
		maxLatency = append(maxLatency, r.maxLatency)
	}
	log.Infof(ctx, "%d increments: %s", v, maxLatency)
}
Example #24
0
func testEventLogInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	num := c.NumNodes()
	if num <= 0 {
		t.Fatalf("%d nodes in cluster", num)
	}

	var confirmedClusterID uuid.UUID
	type nodeEventInfo struct {
		Descriptor roachpb.NodeDescriptor
		ClusterID  uuid.UUID
	}

	// Verify that a node_join message was logged for each node in the cluster.
	// We expect there to eventually be one such message for each node in the
	// cluster, and each message must be correctly formatted.
	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node join events. There should be one for each node in the
		// cluster.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeJoin))
		if err != nil {
			return err
		}
		seenIds := make(map[int64]struct{})
		var clusterID uuid.UUID
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node join, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node join with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify cluster ID is recorded, and is the same for all nodes.
			if (info.ClusterID == uuid.UUID{}) {
				t.Fatalf("Node join recorded nil cluster id, info: %v", info)
			}
			if (clusterID == uuid.UUID{}) {
				clusterID = info.ClusterID
			} else if clusterID != info.ClusterID {
				t.Fatalf(
					"Node join recorded different cluster ID than earlier node. Expected %s, got %s. Info: %v",
					clusterID, info.ClusterID, info)
			}

			// Verify that all NodeIDs are different.
			if _, ok := seenIds[targetID]; ok {
				t.Fatalf("Node ID %d seen in two different node join messages", targetID)
			}
			seenIds[targetID] = struct{}{}
		}
		if err := rows.Err(); err != nil {
			return err
		}

		if a, e := len(seenIds), c.NumNodes(); a != e {
			return errors.Errorf("expected %d node join messages, found %d: %v", e, a, seenIds)
		}

		confirmedClusterID = clusterID
		return nil
	})

	// Stop and Start Node 0, and verify the node restart message.
	if err := c.Kill(ctx, 0); err != nil {
		t.Fatal(err)
	}
	if err := c.Restart(ctx, 0); err != nil {
		t.Fatal(err)
	}

	util.SucceedsSoon(t, func() error {
		db := makePGClient(t, c.PGUrl(ctx, 0))
		defer db.Close()

		// Query all node restart events. There should only be one.
		rows, err := db.Query(
			"SELECT targetID, info FROM system.eventlog WHERE eventType = $1",
			string(csql.EventLogNodeRestart))
		if err != nil {
			return err
		}

		seenCount := 0
		for rows.Next() {
			var targetID int64
			var infoStr gosql.NullString
			if err := rows.Scan(&targetID, &infoStr); err != nil {
				t.Fatal(err)
			}

			// Verify the stored node descriptor.
			if !infoStr.Valid {
				t.Fatalf("info not recorded for node join, target node %d", targetID)
			}
			var info nodeEventInfo
			if err := json.Unmarshal([]byte(infoStr.String), &info); err != nil {
				t.Fatal(err)
			}
			if a, e := int64(info.Descriptor.NodeID), targetID; a != e {
				t.Fatalf("Node join with targetID %d had descriptor for wrong node %d", e, a)
			}

			// Verify cluster ID is recorded, and is the same for all nodes.
			if confirmedClusterID != info.ClusterID {
				t.Fatalf(
					"Node restart recorded different cluster ID than earlier join. Expected %s, got %s. Info: %v",
					confirmedClusterID, info.ClusterID, info)
			}

			seenCount++
		}
		if err := rows.Err(); err != nil {
			return err
		}
		if seenCount != 1 {
			return errors.Errorf("Expected only one node restart event, found %d", seenCount)
		}
		return nil
	})
}
Example #25
0
func testFreezeClusterInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	minAffected := int64(server.ExpectedInitialRangeCount())

	const long = time.Minute
	const short = 10 * time.Second

	mustPost := func(freeze bool) serverpb.ClusterFreezeResponse {
		reply, err := postFreeze(c, freeze, long)
		if err != nil {
			t.Fatal(errors.Errorf("%v", err))
		}
		return reply
	}

	if reply := mustPost(false); reply.RangesAffected != 0 {
		t.Fatalf("expected initial unfreeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d frozen ranges, got %d", minAffected, reply.RangesAffected)
	}

	if reply := mustPost(true); reply.RangesAffected != 0 {
		t.Fatalf("expected second freeze to affect no ranges, got %d", reply.RangesAffected)
	}

	if reply := mustPost(false); reply.RangesAffected < minAffected {
		t.Fatalf("expected >=%d thawed ranges, got %d", minAffected, reply.RangesAffected)
	}

	num := c.NumNodes()
	if num < 3 {
		t.Skip("skipping remainder of test; needs at least 3 nodes")
	}

	// Kill the last node.
	if err := c.Kill(num - 1); err != nil {
		t.Fatal(err)
	}

	// Attempt to freeze should get stuck (since it does not get confirmation
	// of the last node receiving the freeze command).
	// Note that this is the freeze trigger stalling on the Replica, not the
	// Store-polling mechanism.
	acceptErrs := strings.Join([]string{
		"timed out waiting for Range",
		"Timeout exceeded while",
		"connection is closing",
		"deadline",
		// error returned via JSON when the server-side gRPC stream times out (due to
		// lack of new input). Unmarshaling that JSON fails with a message referencing
		// unknown fields, unfortunately in map order.
		"unknown field .*",
	}, "|")
	if reply, err := postFreeze(c, true, short); !testutils.IsError(err, acceptErrs) {
		t.Fatalf("expected timeout, got %v: %v", err, reply)
	}

	// Shut down the remaining nodes and restart them.
	for i := 0; i < num-1; i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < num; i++ {
		if err := c.Restart(i); err != nil {
			t.Fatal(err)
		}
	}

	// The cluster should now be fully operational (at least after waiting
	// a little bit) since each node tries to unfreeze everything when it
	// starts.
	if err := util.RetryForDuration(time.Minute, func() error {
		if _, err := postFreeze(c, false, short); err != nil {
			if testutils.IsError(err, "404 Not Found") {
				// It can take a bit until the endpoint is available.
				return err
			}
			t.Fatal(err)
		}

		// TODO(tschottdorf): moving the client creation outside of the retry
		// loop will break the test with the following message:
		//
		//   client/rpc_sender.go:61: roachpb.Batch RPC failed as client
		//   connection was closed
		//
		// Perhaps the cluster updates the address too late after restarting
		// the node.
		db, dbStopper := c.NewClient(t, 0)
		defer dbStopper.Stop()

		if _, err := db.Scan(context.TODO(), keys.LocalMax, roachpb.KeyMax, 0); err != nil {
			t.Fatal(err)
		}
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	// Unfreezing again should be a no-op.
	if reply, err := postFreeze(c, false, long); err != nil {
		t.Fatal(err)
	} else if reply.RangesAffected > 0 {
		t.Fatalf("still %d frozen ranges", reply.RangesAffected)
	}
}
func testGossipRestartInner(
	ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig,
) {
	// This already replicates the first range (in the local setup).
	// The replication of the first range is important: as long as the
	// first range only exists on one node, that node can trivially
	// acquire the range lease. Once the range is replicated, however,
	// nodes must be able to discover each other over gossip before the
	// lease can be acquired.
	num := c.NumNodes()

	deadline := timeutil.Now().Add(cfg.Duration)

	waitTime := longWaitTime
	if cfg.Duration < waitTime {
		waitTime = shortWaitTime
	}

	for timeutil.Now().Before(deadline) {
		log.Infof(ctx, "waiting for initial gossip connections")
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))
		CheckGossip(ctx, t, c, waitTime, hasClusterID)
		CheckGossip(ctx, t, c, waitTime, hasSentinel)

		log.Infof(ctx, "killing all nodes")
		for i := 0; i < num; i++ {
			if err := c.Kill(ctx, i); err != nil {
				t.Fatal(err)
			}
		}

		log.Infof(ctx, "restarting all nodes")
		for i := 0; i < num; i++ {
			if err := c.Restart(ctx, i); err != nil {
				t.Fatal(err)
			}
		}

		log.Infof(ctx, "waiting for gossip to be connected")
		CheckGossip(ctx, t, c, waitTime, HasPeers(num))
		CheckGossip(ctx, t, c, waitTime, hasClusterID)
		CheckGossip(ctx, t, c, waitTime, hasSentinel)

		for i := 0; i < num; i++ {
			db, err := c.NewClient(ctx, i)
			if err != nil {
				t.Fatal(err)
			}
			if i == 0 {
				if err := db.Del(ctx, "count"); err != nil {
					t.Fatal(err)
				}
			}
			var kv client.KeyValue
			if err := db.Txn(ctx, func(txn *client.Txn) error {
				var err error
				kv, err = txn.Inc("count", 1)
				return err
			}); err != nil {
				t.Fatal(err)
			} else if v := kv.ValueInt(); v != int64(i+1) {
				t.Fatalf("unexpected value %d for write #%d (expected %d)", v, i, i+1)
			}
		}
	}
}