func testAdminLossOfQuorumInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	if c.NumNodes() < 2 {
		t.Logf("skipping test %s because given cluster has too few nodes", cfg.Name)
		return
	}

	// Get the ids for each node.
	nodeIDs := make([]roachpb.NodeID, c.NumNodes())
	for i := 0; i < c.NumNodes(); i++ {
		var details serverpb.DetailsResponse
		if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/details/local", &details); err != nil {
			t.Fatal(err)
		}
		nodeIDs[i] = details.NodeID
	}

	// Leave only the first node alive.
	for i := 1; i < c.NumNodes(); i++ {
		if err := c.Kill(i); err != nil {
			t.Fatal(err)
		}
	}

	// Retrieve node statuses.
	var nodes serverpb.NodesResponse
	if err := util.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes", &nodes); err != nil {
		t.Fatal(err)
	}
	for _, nodeID := range nodeIDs {
		var nodeStatus status.NodeStatus
		if err := util.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/nodes/"+strconv.Itoa(int(nodeID)), &nodeStatus); err != nil {
			t.Fatal(err)
		}
	}

	// Retrieve time-series data.
	nowNanos := timeutil.Now().UnixNano()
	queryRequest := tspb.TimeSeriesQueryRequest{
		StartNanos: nowNanos - 10*time.Second.Nanoseconds(),
		EndNanos:   nowNanos,
		Queries: []tspb.Query{
			{Name: "doesnt_matter", Sources: []string{}},
		},
	}
	var queryResponse tspb.TimeSeriesQueryResponse
	if err := util.PostJSON(cluster.HTTPClient, c.URL(0)+"/ts/query", &queryRequest, &queryResponse); err != nil {
		t.Fatal(err)
	}

	// TODO(cdo): When we're able to issue SQL queries without a quorum, test all
	// admin endpoints that issue SQL queries here.
}
// checkGossip fetches the gossip infoStore from each node and invokes the given
// function. The test passes if the function returns nil for every node,
// retrying for up to the given duration.
func checkGossip(t *testing.T, c cluster.Cluster, d time.Duration, f checkGossipFunc) {
	err := util.RetryForDuration(d, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		var infoStatus gossip.InfoStatus
		for i := 0; i < c.NumNodes(); i++ {
			if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/gossip/local", &infoStatus); err != nil {
				return err
			}
			if err := f(infoStatus.Infos); err != nil {
				return errors.Errorf("node %d: %s", i, err)
			}
		}
		return nil
	})
	if err != nil {
		t.Fatal(errors.Errorf("condition failed to evaluate within %s: %s", d, err))
	}
}
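// checkGossipFunc is the predicate type consumed by checkGossip above; it
// returns nil once the gossip contents look acceptable. Below is a hedged
// sketch of one such predicate, hasPeers (referenced by testBuildInfoInner
// later in this section). It assumes per-node gossip entries carry a "node:"
// key prefix; the prefix and the helper's exact shape are assumptions, not
// the verified implementation.
type checkGossipFunc func(map[string]gossip.Info) error

func hasPeers(expected int) checkGossipFunc {
	return func(infos map[string]gossip.Info) error {
		// Count the gossip entries that describe cluster nodes.
		count := 0
		for k := range infos {
			if strings.HasPrefix(k, "node:") {
				count++
			}
		}
		if count != expected {
			return errors.Errorf("%d gossiped peers, want %d", count, expected)
		}
		return nil
	}
}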
// GetAddress returns a net.Addr or error.
func (nl *nodeLookupResolver) GetAddress() (net.Addr, error) {
	if nl.httpClient == nil {
		tlsConfig, err := nl.context.GetClientTLSConfig()
		if err != nil {
			return nil, err
		}
		nl.httpClient = &http.Client{
			Transport: &http.Transport{TLSClientConfig: tlsConfig},
			Timeout:   base.NetworkTimeout,
		}
	}

	local := struct {
		Address util.UnresolvedAddr `json:"address"`
		// We ignore all other fields.
	}{}

	log.Infof("querying %s for gossip nodes", nl.addr)
	// TODO(marc): put common URIs in base and reuse everywhere.
	if err := util.GetJSON(nl.httpClient, nl.context.HTTPRequestScheme(), nl.addr, "/_status/details/local", &local); err != nil {
		return nil, err
	}

	addr, err := resolveAddress(local.Address.Network(), local.Address.String())
	if err != nil {
		return nil, err
	}
	return addr, nil
}
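// resolveAddress is called above but not shown in this section. A minimal
// sketch of what it could look like, assuming only TCP and Unix networks need
// handling; the real helper may cover more cases.
func resolveAddress(network, address string) (net.Addr, error) {
	switch network {
	case "tcp", "tcp4", "tcp6":
		// Resolve a host:port pair into a *net.TCPAddr.
		return net.ResolveTCPAddr(network, address)
	case "unix":
		// Resolve a filesystem path into a *net.UnixAddr.
		return net.ResolveUnixAddr(network, address)
	}
	return nil, util.Errorf("unknown address network %q", network)
}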
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	var details server.DetailsResponse
	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		default:
		}
		return util.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details)
	})

	bi := details.BuildInfo
	testData := map[string]string{
		"go_version":   bi.GoVersion,
		"tag":          bi.Tag,
		"time":         bi.Time,
		"dependencies": bi.Dependencies,
	}
	for key, val := range testData {
		if val == "" {
			t.Errorf("build info not set for \"%s\"", key)
		}
	}
}
func testStatusServerInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	// Get the ids for each node.
	idMap := make(map[int]roachpb.NodeID)
	for i := 0; i < c.NumNodes(); i++ {
		var details server.DetailsResponse
		if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/details/local", &details); err != nil {
			t.Fatal(err)
		}
		idMap[i] = details.NodeID
	}

	// Check local response for every node.
	for i := 0; i < c.NumNodes(); i++ {
		id := idMap[i]
		checkNode(t, c, i, id, id, id)
		get(t, c.URL(i), "/_status/nodes")
	}

	// Proxy from the first node to the last node.
	firstNode := 0
	lastNode := c.NumNodes() - 1
	firstID := idMap[firstNode]
	lastID := idMap[lastNode]
	checkNode(t, c, firstNode, firstID, lastID, lastID)

	// And from the last node to the first node.
	checkNode(t, c, lastNode, lastID, firstID, firstID)

	// And from the last node to the last node.
	checkNode(t, c, lastNode, lastID, lastID, lastID)
}
// apiGet issues a GET to the provided server using the given API path and
// unmarshals the result into the v parameter.
func apiGet(s *TestServer, path string, v interface{}) error {
	apiPath := apiEndpoint + path
	client, err := s.Ctx.GetHTTPClient()
	if err != nil {
		return err
	}
	return util.GetJSON(client, s.Ctx.HTTPRequestScheme(), s.HTTPAddr(), apiPath, v)
}
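// A hedged usage sketch for apiGet above. The helper name is hypothetical and
// "health" stands in for any admin API path relative to apiEndpoint; it is
// illustrative, not part of the original code.
func getAdminHealth(s *TestServer) (map[string]interface{}, error) {
	var v map[string]interface{}
	// apiGet prepends apiEndpoint, builds a TLS-aware client from the server
	// context, and unmarshals the JSON response into v.
	err := apiGet(s, "health", &v)
	return v, err
}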
// getJSON is a convenience wrapper around cockroach/util.GetJSON(), which retrieves
// a URL specified by the parameters and unmarshals the result into the supplied
// interface.
func getJSON(tls bool, hostport, path string, v interface{}) error {
	scheme := "https"
	if !tls {
		scheme = "http"
	}
	return util.GetJSON(&HTTPClient, scheme, hostport, path, v)
}
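// A short usage sketch for the getJSON wrapper above. The helper name is
// hypothetical; the status path mirrors the ones used elsewhere in this
// section.
func localDetails(hostport string, secure bool) (map[string]interface{}, error) {
	var details map[string]interface{}
	// getJSON picks http or https based on the tls flag and decodes the
	// response body into the supplied interface.
	if err := getJSON(secure, hostport, "/_status/details/local", &details); err != nil {
		return nil, err
	}
	return details, nil
}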
// apiGet issues a GET to the provided server using the given API path and
// unmarshals the result into response.
func apiGet(s TestServer, path string, response proto.Message) error {
	apiPath := apiEndpoint + path
	client, err := s.Ctx.GetHTTPClient()
	if err != nil {
		return err
	}
	return util.GetJSON(client, s.Ctx.AdminURL()+apiPath, response)
}
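// A hedged usage sketch for the proto-aware apiGet above, reusing the
// serverpb.HealthResponse message that appears elsewhere in this section.
// The helper name and the "health" path are assumptions.
func adminHealth(s TestServer) (*serverpb.HealthResponse, error) {
	var resp serverpb.HealthResponse
	// The JSON response is decoded directly into the proto message.
	if err := apiGet(s, "health", &resp); err != nil {
		return nil, err
	}
	return &resp, nil
}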
// printRebalanceStats prints the time it took for rebalancing to finish and the
// final standard deviation of replica counts across stores.
func (at *allocatorTest) printRebalanceStats(db *gosql.DB, host string, adminPort int) error {
	// TODO(cuongdo): Output these in a machine-friendly way and graph.

	// Output time it took to rebalance.
	{
		var rebalanceIntervalStr string
		var rebalanceInterval time.Duration
		q := `SELECT (SELECT MAX(timestamp) FROM rangelog) - ` +
			`(SELECT MAX(timestamp) FROM eventlog WHERE eventType='` + string(sql.EventLogNodeJoin) + `')`
		if err := db.QueryRow(q).Scan(&rebalanceIntervalStr); err != nil {
			return err
		}
		rebalanceInterval, err := time.ParseDuration(rebalanceIntervalStr)
		if err != nil {
			return err
		}
		if rebalanceInterval < 0 {
			// This can happen with single-node clusters.
			rebalanceInterval = time.Duration(0)
		}
		log.Infof("cluster took %s to rebalance", rebalanceInterval)
	}

	// Output # of range events that occurred. All other things being equal,
	// larger numbers are worse and potentially indicate thrashing.
	{
		var rangeEvents int64
		q := `SELECT COUNT(*) FROM rangelog`
		if err := db.QueryRow(q).Scan(&rangeEvents); err != nil {
			return err
		}
		log.Infof("%d range events", rangeEvents)
	}

	// Output standard deviation of the replica counts for all stores.
	{
		var client http.Client
		var nodesResp serverpb.NodesResponse
		url := fmt.Sprintf("http://%s:%d/_status/nodes", host, adminPort)
		if err := util.GetJSON(client, url, &nodesResp); err != nil {
			return err
		}
		var replicaCounts stats.Float64Data
		for _, node := range nodesResp.Nodes {
			for _, ss := range node.StoreStatuses {
				replicaCounts = append(replicaCounts, float64(ss.Metrics["replicas"]))
			}
		}
		stddev, err := stats.StdDevP(replicaCounts)
		if err != nil {
			return err
		}
		log.Infof("stddev(replica count) = %.2f", stddev)
	}

	return nil
}
// queryCount returns the total SQL queries executed by the cluster.
func (cl continuousLoadTest) queryCount(f *terrafarm.Farmer) (float64, error) {
	var client http.Client
	var resp status.NodeStatus
	host := f.Nodes()[0]
	if err := util.GetJSON(client, "http://"+host+":8080/_status/nodes/local", &resp); err != nil {
		return 0, err
	}
	count, ok := resp.Metrics["sql.query.count"]
	if !ok {
		return 0, errors.New("couldn't find SQL query count metric")
	}
	return count, nil
}
// GetAddress returns a net.Addr or error.
// Upon errors, we set exhausted=true, then flip it back when called again.
func (nl *nodeLookupResolver) GetAddress() (net.Addr, error) {
	// TODO(marc): this is a bit of a hack to allow the server to start.
	// In single-node setups, this resolver will never return anything since
	// the status handlers are not serving yet. Instead, we specify multiple
	// gossip addresses (--gossip=localhost,http-lb=lb). We need this one to
	// be exhausted from time to time so that we have a chance to hit the fixed address.
	// Remove once the status pages are served before we've established a connection to
	// the gossip network.
	if nl.exhausted {
		nl.exhausted = false
		return nil, util.Errorf("skipping temporarily-exhausted resolver")
	}

	if nl.httpClient == nil {
		tlsConfig, err := nl.context.GetClientTLSConfig()
		if err != nil {
			return nil, err
		}
		nl.httpClient = &http.Client{
			Transport: &http.Transport{TLSClientConfig: tlsConfig},
			Timeout:   base.NetworkTimeout,
		}
	}

	nl.exhausted = true
	local := struct {
		Address util.UnresolvedAddr `json:"address"`
		// We ignore all other fields.
	}{}

	log.Infof("querying %s for gossip nodes", nl.addr)
	// TODO(marc): put common URIs in base and reuse everywhere.
	if err := util.GetJSON(nl.httpClient, nl.context.HTTPRequestScheme(), nl.addr, "/_status/details/local", &local); err != nil {
		return nil, err
	}

	addr, err := resolveAddress(local.Address.Network(), local.Address.String())
	if err != nil {
		return nil, err
	}
	nl.exhausted = false
	log.Infof("found gossip node: %+v", addr)
	return addr, nil
}
func (at *allocatorTest) stdDev() (float64, error) {
	host := at.f.Nodes()[0]
	var client http.Client
	var nodesResp serverpb.NodesResponse
	url := fmt.Sprintf("http://%s:%s/_status/nodes", host, adminPort)
	if err := util.GetJSON(client, url, &nodesResp); err != nil {
		return 0, err
	}
	var replicaCounts stats.Float64Data
	for _, node := range nodesResp.Nodes {
		for _, ss := range node.StoreStatuses {
			replicaCounts = append(replicaCounts, float64(ss.Metrics["replicas"]))
		}
	}
	stdDev, err := stats.StdDevP(replicaCounts)
	if err != nil {
		return 0, err
	}
	return stdDev, nil
}
// checkNode checks all the endpoints of the status server hosted by node and
// requests info for the node with otherNodeID. That node could be another
// node, the same node, or "local".
func checkNode(t *testing.T, c cluster.Cluster, i int, nodeID, otherNodeID, expectedNodeID roachpb.NodeID) {
	urlIDs := []string{otherNodeID.String()}
	if nodeID == otherNodeID {
		urlIDs = append(urlIDs, "local")
	}
	var details server.DetailsResponse
	for _, urlID := range urlIDs {
		if err := util.GetJSON(cluster.HTTPClient, c.URL(i)+"/_status/details/"+urlID, &details); err != nil {
			t.Fatal(util.ErrorfSkipFrames(1, "unable to parse details - %s", err))
		}
		if details.NodeID != expectedNodeID {
			t.Fatal(util.ErrorfSkipFrames(1, "%d calling %s: node ids don't match - expected %d, actual %d",
				nodeID, urlID, expectedNodeID, details.NodeID))
		}

		get(t, c.URL(i), fmt.Sprintf("/_status/gossip/%s", urlID))
		get(t, c.URL(i), fmt.Sprintf("/_status/nodes/%s", urlID))
		get(t, c.URL(i), fmt.Sprintf("/_status/logfiles/%s", urlID))
		get(t, c.URL(i), fmt.Sprintf("/_status/logs/%s", urlID))
		get(t, c.URL(i), fmt.Sprintf("/_status/stacks/%s", urlID))
	}
}
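// The get helper called above (and in testStatusServerInner) is not shown in
// this section. A minimal sketch, assuming it only needs to verify that the
// endpoint answers with a 200 and return the body; the real helper may also
// inspect the response further.
func get(t *testing.T, base, path string) []byte {
	resp, err := cluster.HTTPClient.Get(base + path)
	if err != nil {
		t.Fatal(err)
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		t.Fatal(err)
	}
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("GET %s%s failed: status %d, body %s", base, path, resp.StatusCode, body)
	}
	return body
}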
func TestStopServer(t *testing.T) {
	defer leaktest.AfterTest(t)()

	tc := StartTestCluster(t, 3, base.TestClusterArgs{ReplicationMode: base.ReplicationAuto})
	defer tc.Stopper().Stop()
	if err := tc.WaitForFullReplication(); err != nil {
		t.Fatal(err)
	}

	// Connect to server 1, ensure it is answering requests over HTTP and GRPC.
	server1 := tc.Server(1)
	var response serverpb.HealthResponse

	httpClient1, err := server1.GetHTTPClient()
	if err != nil {
		t.Fatal(err)
	}
	url := server1.AdminURL() + "/_admin/v1/health"
	if err := util.GetJSON(httpClient1, url, &response); err != nil {
		t.Fatal(err)
	}

	rpcContext := rpc.NewContext(
		tc.Server(1).RPCContext().Context, tc.Server(1).Clock(), tc.Stopper(),
	)
	conn, err := rpcContext.GRPCDial(server1.ServingAddr())
	if err != nil {
		t.Fatal(err)
	}
	adminClient1 := serverpb.NewAdminClient(conn)
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	if _, err := adminClient1.Health(ctx, &serverpb.HealthRequest{}); err != nil {
		t.Fatal(err)
	}

	// Stop server 1.
	tc.StopServer(1)

	// Verify HTTP and GRPC requests to server now fail.
	httpErrorText := "connection refused"
	if err := util.GetJSON(httpClient1, url, &response); err == nil {
		t.Fatal("Expected HTTP Request to fail after server stopped")
	} else if !testutils.IsError(err, httpErrorText) {
		t.Fatalf("Expected error from server with text %q, got error with text %q", httpErrorText, err.Error())
	}

	grpcErrorText := "rpc error"
	if _, err := adminClient1.Health(ctx, &serverpb.HealthRequest{}); err == nil {
		t.Fatal("Expected GRPC Request to fail after server stopped")
	} else if !testutils.IsError(err, grpcErrorText) {
		t.Fatalf("Expected error from GRPC with text %q, got error with text %q", grpcErrorText, err.Error())
	}

	// Verify that request to Server 0 still works.
	httpClient1, err = tc.Server(0).GetHTTPClient()
	if err != nil {
		t.Fatal(err)
	}
	url = tc.Server(0).AdminURL() + "/_admin/v1/health"
	if err := util.GetJSON(httpClient1, url, &response); err != nil {
		t.Fatal(err)
	}
}
func TestAdminAPITableStats(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const nodeCount = 3
	tc := testcluster.StartTestCluster(t, nodeCount, base.TestClusterArgs{
		ReplicationMode: base.ReplicationAuto,
		ServerArgs: base.TestServerArgs{
			ScanInterval:    time.Millisecond,
			ScanMaxIdleTime: time.Millisecond,
		},
	})
	defer tc.Stopper().Stop()
	if err := tc.WaitForFullReplication(); err != nil {
		t.Fatal(err)
	}
	server0 := tc.Server(0)

	// Create clients (SQL, HTTP) connected to server 0.
	db := tc.ServerConn(0)

	client, err := server0.GetHTTPClient()
	if err != nil {
		t.Fatal(err)
	}
	client.Timeout = base.NetworkTimeout * 3

	// Make a single table and insert some data. The database and test have
	// names which require escaping, in order to verify that database and
	// table names are being handled correctly.
	if _, err := db.Exec(`CREATE DATABASE "test test"`); err != nil {
		t.Fatal(err)
	}
	if _, err := db.Exec(`
		CREATE TABLE "test test"."foo foo" (
			id INT PRIMARY KEY,
			val STRING
		)`,
	); err != nil {
		t.Fatal(err)
	}
	for i := 0; i < 10; i++ {
		if _, err := db.Exec(`
			INSERT INTO "test test"."foo foo" VALUES(
				$1, $2
			)`, i, "test",
		); err != nil {
			t.Fatal(err)
		}
	}

	url := server0.AdminURL() + "/_admin/v1/databases/test test/tables/foo foo/stats"
	var tsResponse serverpb.TableStatsResponse

	// The new SQL table may not yet have split into its own range. Wait for
	// this to occur, and for full replication.
	util.SucceedsSoon(t, func() error {
		if err := util.GetJSON(client, url, &tsResponse); err != nil {
			return err
		}
		if tsResponse.RangeCount != 1 {
			return errors.Errorf("Table range not yet separated.")
		}
		if tsResponse.NodeCount != nodeCount {
			return errors.Errorf("Table range not yet replicated to %d nodes.", nodeCount)
		}
		if a, e := tsResponse.ReplicaCount, int64(nodeCount); a != e {
			return errors.Errorf("expected %d replicas, found %d", e, a)
		}
		return nil
	})

	// These two conditions *must* be true, given that the above
	// SucceedsSoon has succeeded.
	if a, e := tsResponse.Stats.KeyCount, int64(20); a < e {
		t.Fatalf("expected at least 20 total keys, found %d", a)
	}
	if len(tsResponse.MissingNodes) > 0 {
		t.Fatalf("expected no missing nodes, found %v", tsResponse.MissingNodes)
	}

	// Kill a node, ensure it shows up in MissingNodes and that ReplicaCount is
	// lower.
	tc.StopServer(1)
	if err := util.GetJSON(client, url, &tsResponse); err != nil {
		t.Fatal(err)
	}
	if a, e := tsResponse.NodeCount, int64(nodeCount); a != e {
		t.Errorf("expected %d nodes, found %d", e, a)
	}
	if a, e := tsResponse.RangeCount, int64(1); a != e {
		t.Errorf("expected %d ranges, found %d", e, a)
	}
	if a, e := tsResponse.ReplicaCount, int64((nodeCount/2)+1); a != e {
		t.Errorf("expected %d replicas, found %d", e, a)
	}
	if a, e := tsResponse.Stats.KeyCount, int64(10); a < e {
		t.Errorf("expected at least 10 total keys, found %d", a)
	}
	if len(tsResponse.MissingNodes) != 1 {
		t.Errorf("expected one missing node, found %v", tsResponse.MissingNodes)
	}

	// Call TableStats with a very low timeout. This tests that fan-out queries
	// do not leak goroutines if the calling context is abandoned.
	// Interestingly, the call can actually sometimes succeed, despite the small
	// timeout; however, in aggregate (or in stress tests) this will suffice for
	// detecting leaks.
	client.Timeout = 1 * time.Nanosecond
	_ = util.GetJSON(client, url, &tsResponse)
}