func TestOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	serverTime := time.Unix(0, 20)
	serverClock := hlc.NewClock(serverTime.UnixNano)
	serverCtx := newNodeTestContext(serverClock, stopper)
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()

	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              serverClock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	// Create a client clock that is behind the server clock.
	clientAdvancing := AdvancingClock{time: time.Unix(0, 10)}
	clientClock := hlc.NewClock(clientAdvancing.UnixNano)
	clientClock.SetMaxOffset(time.Millisecond)
	clientCtx := newNodeTestContext(clientClock, stopper)
	clientCtx.RemoteClocks.offsetTTL = 5 * clientAdvancing.getAdvancementInterval()
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}

	expectedOffset := RemoteOffset{Offset: 10, Uncertainty: 0, MeasuredAt: 10}
	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		} else if o != expectedOffset {
			return errors.Errorf("expected:\n%v\nactual:\n%v", expectedOffset, o)
		}
		return nil
	})

	// Change the client such that it receives a heartbeat right after the
	// maximum clock reading delay.
	clientAdvancing.setAdvancementInterval(
		maximumPingDurationMult*clientClock.MaxOffset() + 1*time.Nanosecond)

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset to have been cleared, but found %s", o)
		}
		return nil
	})
}
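// The AdvancingClock helper used by TestOffsetMeasurement above is a manual
// clock whose readings advance by a configurable interval on every call to
// UnixNano. The sketch below illustrates that shape only; the field names,
// default interval, and locking are assumptions, not the real rpc test helper.
const sketchDefaultAdvancementInterval = time.Millisecond // assumed default

type advancingClockSketch struct {
	syncutil.Mutex
	time                time.Time
	advancementInterval time.Duration
}

func (ac *advancingClockSketch) setAdvancementInterval(d time.Duration) {
	ac.Lock()
	defer ac.Unlock()
	ac.advancementInterval = d
}

func (ac *advancingClockSketch) getAdvancementInterval() time.Duration {
	ac.Lock()
	defer ac.Unlock()
	if ac.advancementInterval == 0 {
		// Assumed: a non-zero default so offsetTTL can be derived before
		// setAdvancementInterval has ever been called.
		return sketchDefaultAdvancementInterval
	}
	return ac.advancementInterval
}

// UnixNano returns the current fake time and advances it for the next reading.
func (ac *advancingClockSketch) UnixNano() int64 {
	ac.Lock()
	defer ac.Unlock()
	interval := ac.advancementInterval
	if interval == 0 {
		interval = sketchDefaultAdvancementInterval
	}
	now := ac.time
	ac.time = now.Add(interval)
	return now.UnixNano()
}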
// TestRaftTransportCircuitBreaker verifies that messages will be
// dropped waiting for raft node connection to be established.
func TestRaftTransportCircuitBreaker(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	_, serverAddr := rttc.AddNodeWithoutGossip(serverReplica.NodeID)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	clientTransport := rttc.AddNode(clientReplica.NodeID)

	// The transport is set up asynchronously, so we expect the first
	// Send to return true here.
	if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
		t.Errorf("unexpectedly failed sending while connection is being asynchronously established")
	}

	// However, sending repeated messages should begin dropping once
	// the circuit breaker does trip.
	util.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.Errorf("expected circuit breaker to trip")
		}
		return nil
	})

	// Now, gossip address of server.
	rttc.GossipNode(serverReplica.NodeID, serverAddr)

	// Keep sending commit=2 until breaker resets and we receive the
	// first instance. It's possible an earlier message for commit=1
	// snuck in.
	util.SucceedsSoon(t, func() error {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 2}) {
			clientTransport.GetCircuitBreaker(serverReplica.NodeID).Reset()
		}
		select {
		case req := <-serverChannel.ch:
			if req.Message.Commit == 2 {
				return nil
			}
		default:
		}
		return errors.Errorf("expected message commit=2")
	})
}
// TestScannerDisabled verifies that disabling a scanner prevents
// replicas from being added to queues.
func TestScannerDisabled(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	ranges := newTestRangeSet(count, t)
	q := &testQueue{}
	s := newReplicaScanner(log.AmbientContext{}, 1*time.Millisecond, 0, ranges)
	s.AddQueues(q)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	stopper := stop.NewStopper()
	s.Start(clock, stopper)
	defer stopper.Stop()

	// Verify queue gets all ranges.
	util.SucceedsSoon(t, func() error {
		if q.count() != count {
			return errors.Errorf("expected %d replicas; have %d", count, q.count())
		}
		if s.scanCount() == 0 {
			return errors.Errorf("expected scanner count to increment")
		}
		return nil
	})

	lastWaitEnabledCount := s.waitEnabledCount()

	// Now, disable the scanner.
	s.SetDisabled(true)
	util.SucceedsSoon(t, func() error {
		if s.waitEnabledCount() == lastWaitEnabledCount {
			return errors.Errorf("expected scanner to stop when disabled")
		}
		return nil
	})

	lastScannerCount := s.scanCount()

	// Remove the replicas and verify the scanner still removes them while disabled.
	ranges.Visit(func(repl *Replica) bool {
		s.RemoveReplica(repl)
		return true
	})

	util.SucceedsSoon(t, func() error {
		if qc := q.count(); qc != 0 {
			return errors.Errorf("expected queue to be empty after replicas removed from scanner; got %d", qc)
		}
		return nil
	})

	if sc := s.scanCount(); sc != lastScannerCount {
		t.Errorf("expected scanner count to not increment: %d != %d", sc, lastScannerCount)
	}
}
// TestRangeSplitsWithWritePressure sets the zone config max bytes for
// a range to 256K and writes data until there are five ranges.
func TestRangeSplitsWithWritePressure(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// Override default zone config.
	cfg := config.DefaultZoneConfig()
	cfg.RangeMaxBytes = 1 << 18
	defer config.TestingSetDefaultZoneConfig(cfg)()

	dbCtx := client.DefaultDBContext()
	dbCtx.TxnRetryOptions = retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     10 * time.Millisecond,
		Multiplier:     2,
	}
	s, _ := createTestDBWithContext(t, dbCtx)
	// This is purely to silence log spam.
	config.TestingSetupZoneConfigHook(s.Stopper)
	defer s.Stop()

	// Start the test writer, writing about 32K per key so that only a few
	// writes are necessary to exceed the range's max bytes and trigger splits.
	done := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(1)
	go startTestWriter(s.DB, int64(0), 1<<15, &wg, nil, nil, done, t)

	// Check that we split 5 times in allotted time.
	util.SucceedsSoon(t, func() error {
		// Scan the meta2 records.
		rows, err := s.DB.Scan(context.TODO(), keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return errors.Errorf("failed to scan meta2 keys: %s", err)
		}
		if lr := len(rows); lr < 5 {
			return errors.Errorf("expected >= 5 ranges; got %d", lr)
		}
		return nil
	})
	close(done)
	wg.Wait()

	// This write pressure test often causes splits while resolve
	// intents are in flight, causing them to fail with range key
	// mismatch errors. However, LocalSender should retry in these
	// cases. Check here via MVCC scan that there are no dangling write
	// intents. We do this using a SucceedsSoon construct to account
	// for timing of finishing the test writer and a possibly-ongoing
	// asynchronous split.
	util.SucceedsSoon(t, func() error {
		if _, _, _, err := engine.MVCCScan(context.Background(), s.Eng, keys.LocalMax, roachpb.KeyMax, math.MaxInt64, hlc.MaxTimestamp, true, nil); err != nil {
			return errors.Errorf("failed to verify no dangling intents: %s", err)
		}
		return nil
	})
}
// TestReplicaGCQueueDropReplicaDirect verifies that a removed replica is
// immediately cleaned up.
func TestReplicaGCQueueDropReplicaDirect(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	const numStores = 3
	rangeID := roachpb.RangeID(1)

	// In this test, the Replica on the second Node is removed, and the test
	// verifies that that Node adds this Replica to its RangeGCQueue. However,
	// the queue does a consistent lookup which will usually be read from
	// Node 1. Hence, if Node 1 hasn't processed the removal when Node 2 has,
	// no GC will take place since the consistent RangeLookup hits the first
	// Node. We use the TestingCommandFilter to make sure that the second Node
	// waits for the first.
	cfg := storage.TestStoreConfig()
	mtc.storeConfig = &cfg
	mtc.storeConfig.TestingKnobs.TestingCommandFilter =
		func(filterArgs storagebase.FilterArgs) *roachpb.Error {
			et, ok := filterArgs.Req.(*roachpb.EndTransactionRequest)
			if !ok || filterArgs.Sid != 2 {
				return nil
			}
			rct := et.InternalCommitTrigger.GetChangeReplicasTrigger()
			if rct == nil || rct.ChangeType != roachpb.REMOVE_REPLICA {
				return nil
			}
			util.SucceedsSoon(t, func() error {
				r, err := mtc.stores[0].GetReplica(rangeID)
				if err != nil {
					return err
				}
				if _, ok := r.Desc().GetReplicaDescriptor(2); ok {
					return errors.New("expected second node gone from first node's known replicas")
				}
				return nil
			})
			return nil
		}

	mtc.Start(t, numStores)
	defer mtc.Stop()

	mtc.replicateRange(rangeID, 1, 2)
	mtc.unreplicateRange(rangeID, 1)

	// Make sure the range is removed from the store.
	util.SucceedsSoon(t, func() error {
		if _, err := mtc.stores[1].GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return errors.Errorf("expected range removal: %v", err) // NB: errors.Wrapf(nil, ...) returns nil.
		}
		return nil
	})
}
func TestFailedOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(time.Unix(0, 1).UnixNano)

	serverCtx := newNodeTestContext(clock, stopper)
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()
	heartbeat := &ManualHeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		ready:              make(chan struct{}),
		stopper:            stopper,
	}
	RegisterHeartbeatServer(s, heartbeat)

	// Create a client that never receives a heartbeat after the first.
	clientCtx := newNodeTestContext(clock, stopper)
	// Increase the timeout so that failure arises from exceeding the maximum
	// clock reading delay, not the timeout.
	clientCtx.HeartbeatTimeout = 20 * clientCtx.HeartbeatInterval
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}
	heartbeat.ready <- struct{}{} // Allow one heartbeat for initialization.

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if _, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		}
		return nil
	})

	util.SucceedsSoon(t, func() error {
		serverCtx.RemoteClocks.mu.Lock()
		defer serverCtx.RemoteClocks.mu.Unlock()

		if o, ok := serverCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset of %s to not be initialized, but it was: %v", remoteAddr, o)
		}
		return nil
	})
}
func TestEagerReplication(t *testing.T) {
	defer leaktest.AfterTest(t)()

	store, stopper, _ := createTestStore(t)
	defer stopper.Stop()

	// Disable the replica scanner so that we rely on the eager replication code
	// path that occurs after splits.
	store.SetReplicaScannerActive(false)

	if err := server.WaitForInitialSplits(store.DB()); err != nil {
		t.Fatal(err)
	}

	// WaitForInitialSplits will return as soon as the meta2 span contains the
	// expected number of descriptors. But the addition of replicas to the
	// replicateQueue after a split occurs happens after the update of the
	// descriptors in meta2, leaving a tiny window of time in which the newly
	// split replica will not have been added to purgatory. Thus we loop.
	util.SucceedsSoon(t, func() error {
		// After the initial splits have been performed, all of the resulting ranges
		// should be present in replicate queue purgatory (because we only have a
		// single store in the test and thus replication cannot succeed).
		expected := server.ExpectedInitialRangeCount()
		if n := store.ReplicateQueuePurgatoryLength(); expected != n {
			return errors.Errorf("expected %d replicas in purgatory, but found %d", expected, n)
		}
		return nil
	})
}
func TestSplitAtTableBoundary(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testClusterArgs := base.TestClusterArgs{
		ReplicationMode: base.ReplicationAuto,
	}
	tc := testcluster.StartTestCluster(t, 3, testClusterArgs)
	defer tc.Stopper().Stop()

	runner := sqlutils.MakeSQLRunner(t, tc.Conns[0])
	runner.Exec(`CREATE DATABASE test`)
	runner.Exec(`CREATE TABLE test.t (k SERIAL PRIMARY KEY, v INT)`)

	const tableIDQuery = `
SELECT tables.id FROM system.namespace tables
  JOIN system.namespace dbs ON dbs.id = tables.parentid
  WHERE dbs.name = $1 AND tables.name = $2
`
	var tableID uint32
	runner.QueryRow(tableIDQuery, "test", "t").Scan(&tableID)
	tableStartKey := keys.MakeTablePrefix(tableID)

	// Wait for new table to split.
	util.SucceedsSoon(t, func() error {
		desc, err := tc.LookupRange(keys.MakeRowSentinelKey(tableStartKey))
		if err != nil {
			t.Fatal(err)
		}
		if !desc.StartKey.Equal(tableStartKey) {
			log.Infof(context.TODO(), "waiting on split results")
			return errors.Errorf("expected range start key %s; got %s", tableStartKey, desc.StartKey)
		}
		return nil
	})
}
// TestClientDisconnectRedundant verifies that the gossip server
// will drop an outgoing client connection that is already an
// inbound client connection of another node.
func TestClientDisconnectRedundant(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	remote := startGossip(2, stopper, t, metric.NewRegistry())
	// startClient requires locks are held, so acquire here.
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	lAddr := local.mu.is.NodeAddr
	local.startClient(&rAddr, remote.NodeID.Get())
	remote.startClient(&lAddr, local.NodeID.Get())
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()
	util.SucceedsSoon(t, func() error {
		// Check which of the clients is connected to the other.
		ok1 := local.findClient(func(c *client) bool {
			return c.addr.String() == rAddr.String()
		}) != nil
		ok2 := remote.findClient(func(c *client) bool {
			return c.addr.String() == lAddr.String()
		}) != nil
		// We expect node 2 to disconnect; if both are still connected,
		// it's possible that node 1 gossiped before node 2 connected, in
		// which case we have to gossip from node 1 to trigger the
		// disconnect redundant client code.
		if ok1 && ok2 {
			if err := local.AddInfo("local-key", nil, time.Second); err != nil {
				t.Fatal(err)
			}
		} else if ok1 && !ok2 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.New("local client to remote not yet closed as redundant")
	})
}
// Verify that when we enqueue the same range multiple times for the same
// reason, it is only processed once.
func TestSchedulerBuffering(t *testing.T) {
	defer leaktest.AfterTest(t)()

	p := newTestProcessor()
	s := newRaftScheduler(log.AmbientContext{}, nil, p, 1)

	stopper := stop.NewStopper()
	defer stopper.Stop()
	s.Start(stopper)

	testCases := []struct {
		state    raftScheduleState
		expected string
	}{
		{stateRaftReady, "ready=[1:1] request=[] tick=[]"},
		{stateRaftRequest, "ready=[1:1] request=[1:1] tick=[]"},
		{stateRaftTick, "ready=[1:1] request=[1:1] tick=[1:1]"},
		{stateRaftReady | stateRaftRequest | stateRaftTick, "ready=[1:2] request=[1:2] tick=[1:2]"},
	}

	for _, c := range testCases {
		s.signal(s.enqueueN(c.state, 1, 1, 1, 1, 1))

		util.SucceedsSoon(t, func() error {
			if s := p.String(); c.expected != s {
				return errors.Errorf("expected %s, but got %s", c.expected, s)
			}
			return nil
		})
	}
}
func TestComputeStatsForKeySpan(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 3)

	// Create a number of ranges using splits.
	splitKeys := []string{"a", "c", "e", "g", "i"}
	for _, k := range splitKeys {
		key := []byte(k)
		repl := mtc.stores[0].LookupReplica(key, roachpb.RKeyMin)
		args := adminSplitArgs(key, key)
		header := roachpb.Header{
			RangeID: repl.RangeID,
		}
		if _, err := client.SendWrappedWith(context.Background(), mtc.stores[0], header, args); err != nil {
			t.Fatal(err)
		}
	}

	// Wait for splits to finish.
	util.SucceedsSoon(t, func() error {
		repl := mtc.stores[0].LookupReplica(roachpb.RKey("z"), nil)
		if actualRSpan := repl.Desc().RSpan(); !actualRSpan.Key.Equal(roachpb.RKey("i")) {
			return errors.Errorf("expected range %s to begin at key 'i'", repl)
		}
		return nil
	})

	// Create some keys across the ranges.
	incKeys := []string{"b", "bb", "bbb", "d", "dd", "h"}
	for _, k := range incKeys {
		if _, err := mtc.dbs[0].Inc(context.TODO(), []byte(k), 5); err != nil {
			t.Fatal(err)
		}
	}

	// Verify stats across different spans.
	for _, tcase := range []struct {
		startKey       string
		endKey         string
		expectedRanges int
		expectedKeys   int64
	}{
		{"a", "i", 4, 6},
		{"a", "c", 1, 3},
		{"b", "e", 2, 5},
		{"e", "i", 2, 1},
	} {
		start, end := tcase.startKey, tcase.endKey
		stats, count := mtc.stores[0].ComputeStatsForKeySpan(
			roachpb.RKey(start), roachpb.RKey(end))
		if a, e := count, tcase.expectedRanges; a != e {
			t.Errorf("Expected %d ranges in span [%s - %s], found %d", e, start, end, a)
		}
		if a, e := stats.LiveCount, tcase.expectedKeys; a != e {
			t.Errorf("Expected %d keys in span [%s - %s], found %d", e, start, end, a)
		}
	}
}
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	var details serverpb.DetailsResponse
	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		default:
		}
		return httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details)
	})

	bi := details.BuildInfo
	testData := map[string]string{
		"go_version":   bi.GoVersion,
		"tag":          bi.Tag,
		"time":         bi.Time,
		"dependencies": bi.Dependencies,
	}
	for key, val := range testData {
		if val == "" {
			t.Errorf("build info not set for \"%s\"", key)
		}
	}
}
// TestTxnCoordSenderGCTimeout verifies that the coordinator cleans up extant
// transactions and intents after the lastUpdateNanos exceeds the timeout.
func TestTxnCoordSenderGCTimeout(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	txn := client.NewTxn(context.Background(), *s.DB)
	key := roachpb.Key("a")
	if err := txn.Put(key, []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Now, advance clock past the default client timeout.
	// Locking the TxnCoordSender to prevent a data race.
	sender.Lock()
	s.Manual.Increment(defaultClientTimeout.Nanoseconds() + 1)
	sender.Unlock()

	txnID := *txn.Proto.ID

	util.SucceedsSoon(t, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if ok {
			return errors.Errorf("expected garbage collection")
		}
		return nil
	})

	verifyCleanup(key, sender, s.Eng, t)
}
// TestMetricsRecording verifies that Node statistics are periodically recorded
// as time series data.
func TestMetricsRecording(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
		MetricsSampleInterval: 5 * time.Millisecond,
	})
	defer s.Stopper().Stop()

	checkTimeSeriesKey := func(now int64, keyName string) error {
		key := ts.MakeDataKey(keyName, "", ts.Resolution10s, now)
		data := roachpb.InternalTimeSeriesData{}
		return kvDB.GetProto(context.TODO(), key, &data)
	}

	// Verify that metrics for the current timestamp are recorded. This should
	// be true very quickly.
	util.SucceedsSoon(t, func() error {
		now := s.Clock().PhysicalNow()
		if err := checkTimeSeriesKey(now, "cr.store.livebytes.1"); err != nil {
			return err
		}
		if err := checkTimeSeriesKey(now, "cr.node.sys.go.allocbytes.1"); err != nil {
			return err
		}
		return nil
	})
}
func startBankTransfers(t testing.TB, stopper *stop.Stopper, sqlDB *gosql.DB, numAccounts int) {
	const maxTransfer = 999
	for {
		select {
		case <-stopper.ShouldQuiesce():
			return // All done.
		default:
			// Keep going.
		}

		from := rand.Intn(numAccounts)
		to := rand.Intn(numAccounts - 1)
		for from == to {
			to = numAccounts - 1
		}
		amount := rand.Intn(maxTransfer)

		const update = `UPDATE bench.bank
			SET balance = CASE id WHEN $1 THEN balance-$3 WHEN $2 THEN balance+$3 END
			WHERE id IN ($1, $2)`
		util.SucceedsSoon(t, func() error {
			select {
			case <-stopper.ShouldQuiesce():
				return nil // All done.
			default:
				// Keep going.
			}
			_, err := sqlDB.Exec(update, from, to, amount)
			return err
		})
	}
}
// TestScannerTiming verifies that ranges are scanned, regardless
// of how many, to match scanInterval.
func TestScannerTiming(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	const runTime = 100 * time.Millisecond
	const maxError = 7500 * time.Microsecond
	durations := []time.Duration{
		15 * time.Millisecond,
		25 * time.Millisecond,
	}
	for i, duration := range durations {
		util.SucceedsSoon(t, func() error {
			ranges := newTestRangeSet(count, t)
			q := &testQueue{}
			s := newReplicaScanner(log.AmbientContext{}, duration, 0, ranges)
			s.AddQueues(q)
			mc := hlc.NewManualClock(0)
			clock := hlc.NewClock(mc.UnixNano)
			stopper := stop.NewStopper()
			s.Start(clock, stopper)
			time.Sleep(runTime)
			stopper.Stop()

			avg := s.avgScan()
			log.Infof(context.Background(), "%d: average scan: %s", i, avg)
			if avg.Nanoseconds()-duration.Nanoseconds() > maxError.Nanoseconds() ||
				duration.Nanoseconds()-avg.Nanoseconds() > maxError.Nanoseconds() {
				return errors.Errorf("expected %s, got %s: exceeds max error of %s", duration, avg, maxError)
			}
			return nil
		})
	}
}
// TestClientDisallowMultipleConns verifies that the server disallows
// multiple connections from the same client node ID.
func TestClientDisallowMultipleConns(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	remote := startGossip(2, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	// Start two clients from local to remote. RPC client cache is
	// disabled via the context, so we'll start two different outgoing
	// connections.
	local.startClient(&rAddr, remote.NodeID.Get())
	local.startClient(&rAddr, remote.NodeID.Get())
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()
	util.SucceedsSoon(t, func() error {
		// Verify that the remote server has only a single incoming
		// connection and the local server has only a single outgoing
		// connection.
		local.mu.Lock()
		remote.mu.Lock()
		outgoing := local.outgoing.len()
		incoming := remote.mu.incoming.len()
		local.mu.Unlock()
		remote.mu.Unlock()
		if outgoing == 1 && incoming == 1 &&
			verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.Errorf("incorrect number of incoming (%d) or outgoing (%d) connections", incoming, outgoing)
	})
}
func gossipSucceedsSoon(
	t *testing.T,
	stopper *stop.Stopper,
	disconnected chan *client,
	gossip map[*client]*Gossip,
	f func() error,
) {
	// Use an insecure context since we don't need a valid cert.
	rpcContext := rpc.NewContext(log.AmbientContext{}, &base.Config{Insecure: true}, nil, stopper)

	for c := range gossip {
		disconnected <- c
	}

	util.SucceedsSoon(t, func() error {
		select {
		case client := <-disconnected:
			// If the client wasn't able to connect, restart it.
			client.start(gossip[client], disconnected, rpcContext, stopper, gossip[client].NodeID.Get(), rpcContext.NewBreaker())
		default:
		}
		return f()
	})
}
func gossipSucceedsSoon(
	t *testing.T,
	stopper *stop.Stopper,
	disconnected chan *client,
	gossip map[*client]*Gossip,
	f func() error,
) {
	// Use an insecure context since we don't need a valid cert.
	rpcContext := newInsecureRPCContext(stopper)

	for c := range gossip {
		disconnected <- c
	}

	util.SucceedsSoon(t, func() error {
		select {
		case client := <-disconnected:
			// If the client wasn't able to connect, restart it.
			client.start(gossip[client], disconnected, rpcContext, stopper, rpcContext.NewBreaker())
		default:
		}
		return f()
	})
}
// TestGossipStorageCleanup verifies that bad resolvers are purged
// from the bootstrap info after gossip has successfully connected.
func TestGossipStorageCleanup(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	const numNodes = 3
	network := simulation.NewNetwork(stopper, numNodes, false)

	const notReachableAddr = "localhost:0"
	const invalidAddr = "10.0.0.1000:3333333"
	// Set storage for each of the nodes.
	addresses := make(unresolvedAddrSlice, len(network.Nodes))
	stores := make([]testStorage, len(network.Nodes))
	for i, n := range network.Nodes {
		addresses[i] = util.MakeUnresolvedAddr(n.Addr().Network(), n.Addr().String())
		// Pre-add an invalid address to each gossip storage.
		if err := stores[i].WriteBootstrapInfo(&gossip.BootstrapInfo{
			Addresses: []util.UnresolvedAddr{
				util.MakeUnresolvedAddr("tcp", network.Nodes[(i+1)%numNodes].Addr().String()), // node i+1 address
				util.MakeUnresolvedAddr("tcp", notReachableAddr),                              // unreachable address
				util.MakeUnresolvedAddr("tcp", invalidAddr),                                   // invalid address
			},
		}); err != nil {
			t.Fatal(err)
		}
		if err := n.Gossip.SetStorage(&stores[i]); err != nil {
			t.Fatal(err)
		}
		n.Gossip.SetStallInterval(1 * time.Millisecond)
		n.Gossip.SetBootstrapInterval(1 * time.Millisecond)
	}

	// Wait for the gossip network to connect.
	network.RunUntilFullyConnected()

	// Let the gossip network continue running in the background without the
	// simulation cycler preventing it from operating.
	for _, node := range network.Nodes {
		node.Gossip.EnableSimulationCycler(false)
	}

	// Wait long enough for storage to get the expected number of
	// addresses and no pending cleanups.
	util.SucceedsSoon(t, func() error {
		for i := range stores {
			p := &stores[i]
			if expected, actual := len(network.Nodes)-1 /* -1 is ourself */, p.Len(); expected != actual {
				return errors.Errorf("expected %v, got %v (info: %#v)", expected, actual, p.Info().Addresses)
			}
			for _, addr := range p.Info().Addresses {
				if addr.String() == invalidAddr {
					return errors.Errorf("node %d still needs bootstrap cleanup", i)
				}
			}
		}
		return nil
	})
}
// TestScannerAddToQueues verifies that ranges are added to and
// removed from multiple queues.
func TestScannerAddToQueues(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	ranges := newTestRangeSet(count, t)
	q1, q2 := &testQueue{}, &testQueue{}
	// We don't want to actually consume entries from the queues during this test.
	q1.setDisabled(true)
	q2.setDisabled(true)
	s := newReplicaScanner(log.AmbientContext{}, 1*time.Millisecond, 0, ranges)
	s.AddQueues(q1, q2)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	stopper := stop.NewStopper()

	// Start scanner and verify that all ranges are added to both queues.
	s.Start(clock, stopper)
	util.SucceedsSoon(t, func() error {
		if q1.count() != count || q2.count() != count {
			return errors.Errorf("q1 or q2 count != %d; got %d, %d", count, q1.count(), q2.count())
		}
		return nil
	})

	// Remove the first range and verify that it no longer exists in either queue.
	rng := ranges.remove(0, t)
	util.SucceedsSoon(t, func() error {
		// This is intentionally inside the loop, otherwise this test races as
		// our removal of the range may be processed before a stray re-queue.
		// Removing on each attempt makes sure we clean this up as we retry.
		s.RemoveReplica(rng)
		c1 := q1.count()
		c2 := q2.count()
		if c1 != count-1 || c2 != count-1 {
			return errors.Errorf("q1 or q2 count != %d; got %d, %d", count-1, c1, c2)
		}
		return nil
	})

	// Stop scanner and verify both queues are stopped.
	stopper.Stop()
	if !q1.isDone() || !q2.isDone() {
		t.Errorf("expected all queues to stop; got %t, %t", q1.isDone(), q2.isDone())
	}
}
// Test that abruptly closing a pgwire connection releases all leases held by
// that session.
func TestPGWireConnectionCloseReleasesLeases(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop()
	// Name the URL variable pgURL so it doesn't shadow the net/url package.
	pgURL, cleanupConn := sqlutils.PGUrl(t, s.ServingAddr(), "SetupServer", url.User(security.RootUser))
	defer cleanupConn()
	conn, err := pq.Open(pgURL.String())
	if err != nil {
		t.Fatal(err)
	}
	ex := conn.(driver.Execer)
	if _, err := ex.Exec("CREATE DATABASE test", nil); err != nil {
		t.Fatal(err)
	}
	if _, err := ex.Exec("CREATE TABLE test.t (i INT PRIMARY KEY)", nil); err != nil {
		t.Fatal(err)
	}
	// Start a txn so leases are accumulated by queries.
	if _, err := ex.Exec("BEGIN", nil); err != nil {
		t.Fatal(err)
	}
	// Get a table lease.
	if _, err := ex.Exec("SELECT * FROM test.t", nil); err != nil {
		t.Fatal(err)
	}
	// Abruptly close the connection.
	if err := conn.Close(); err != nil {
		t.Fatal(err)
	}
	// Verify that there are no leases held.
	tableDesc := sqlbase.GetTableDescriptor(kvDB, "test", "t")
	lm := s.LeaseManager().(*LeaseManager)
	// Looking for a table state validates that there used to be a lease on the
	// table.
	ts := lm.findTableState(tableDesc.ID, false /* create */)
	if ts == nil {
		t.Fatal("table state not found")
	}
	ts.mu.Lock()
	leases := ts.active.data
	ts.mu.Unlock()
	if len(leases) != 1 {
		t.Fatalf("expected one lease, found: %d", len(leases))
	}
	// Wait for the lease to be released.
	util.SucceedsSoon(t, func() error {
		ts.mu.Lock()
		refcount := ts.active.data[0].refcount
		ts.mu.Unlock()
		if refcount != 0 {
			return errors.Errorf(
				"expected lease to be unused, found refcount: %d", refcount)
		}
		return nil
	})
}
func TestStopperNumTasks(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s := stop.NewStopper()
	var tasks []chan bool
	for i := 0; i < 3; i++ {
		c := make(chan bool)
		tasks = append(tasks, c)
		if err := s.RunAsyncTask(context.Background(), func(_ context.Context) {
			// Wait for channel to close.
			<-c
		}); err != nil {
			t.Fatal(err)
		}
		tm := s.RunningTasks()
		if numTypes, numTasks := len(tm), s.NumTasks(); numTypes != 1 || numTasks != i+1 {
			t.Errorf("stopper should have %d running tasks, got %d / %+v", i+1, numTasks, tm)
		}
		m := s.RunningTasks()
		if len(m) != 1 {
			t.Fatalf("expected exactly one task map entry: %+v", m)
		}
		for _, v := range m {
			if expNum := len(tasks); v != expNum {
				t.Fatalf("%d: expected %d tasks, got %d", i, expNum, v)
			}
		}
	}
	for i, c := range tasks {
		m := s.RunningTasks()
		if len(m) != 1 {
			t.Fatalf("%d: expected exactly one task map entry: %+v", i, m)
		}
		for _, v := range m {
			if expNum := len(tasks[i:]); v != expNum {
				t.Fatalf("%d: expected %d tasks, got %d:\n%s", i, expNum, v, m)
			}
		}
		// Close the channel to let the task proceed.
		close(c)
		expNum := len(tasks[i+1:])
		util.SucceedsSoon(t, func() error {
			if nt := s.NumTasks(); nt != expNum {
				return errors.Errorf("%d: stopper should have %d running tasks, got %d", i, expNum, nt)
			}
			return nil
		})
	}
	// The task map should've been cleared out.
	if m := s.RunningTasks(); len(m) != 0 {
		t.Fatalf("task map not empty: %+v", m)
	}
	s.Stop()
}
func TestGCQueueLastProcessedTimestamps(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop()
	tc.Start(t, stopper)

	// Create two last processed times both at the range start key and
	// also at some mid-point key in order to simulate a merge.
	lastProcessedVals := []struct {
		key   roachpb.Key
		expGC bool
	}{
		{keys.QueueLastProcessedKey(roachpb.RKeyMin, "timeSeriesMaintenance"), false},
		{keys.QueueLastProcessedKey(roachpb.RKeyMin, "replica consistency checker"), false},
		{keys.QueueLastProcessedKey(roachpb.RKey("a"), "timeSeriesMaintenance"), true},
		{keys.QueueLastProcessedKey(roachpb.RKey("b"), "replica consistency checker"), true},
	}

	ts := tc.Clock().Now()
	for _, lpv := range lastProcessedVals {
		if err := engine.MVCCPutProto(context.Background(), tc.engine, nil, lpv.key, hlc.ZeroTimestamp, nil, &ts); err != nil {
			t.Fatal(err)
		}
	}

	cfg, ok := tc.gossip.GetSystemConfig()
	if !ok {
		t.Fatal("config not set")
	}

	// Process through a scan queue.
	gcQ := newGCQueue(tc.store, tc.gossip)
	if err := gcQ.process(context.Background(), tc.Clock().Now(), tc.repl, cfg); err != nil {
		t.Fatal(err)
	}

	// Verify GC.
	util.SucceedsSoon(t, func() error {
		for _, lpv := range lastProcessedVals {
			ok, err := engine.MVCCGetProto(context.Background(), tc.engine, lpv.key, hlc.ZeroTimestamp, true, nil, &ts)
			if err != nil {
				return err
			}
			if ok == lpv.expGC {
				return errors.Errorf("expected GC of %s: %t; got %t", lpv.key, lpv.expGC, ok)
			}
		}
		return nil
	})
}
// TestNodeLivenessRestart verifies that if nodes are shut down and
// restarted, the node liveness records are re-gossiped immediately.
func TestNodeLivenessRestart(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 2)

	// After verifying node is in liveness table, stop store.
	verifyLiveness(t, mtc)
	mtc.stopStore(0)

	// Clear the liveness records in store 1's gossip to make sure we're
	// seeing the liveness record properly gossiped at store startup.
	var expKeys []string
	for _, g := range mtc.gossips {
		key := gossip.MakeNodeLivenessKey(g.NodeID.Get())
		expKeys = append(expKeys, key)
		if err := g.AddInfoProto(key, &storage.Liveness{}, 0); err != nil {
			t.Fatal(err)
		}
	}
	sort.Strings(expKeys)

	// Register a callback to gossip in order to verify liveness records
	// are re-gossiped.
	var keysMu struct {
		syncutil.Mutex
		keys []string
	}
	livenessRegex := gossip.MakePrefixPattern(gossip.KeyNodeLivenessPrefix)
	mtc.gossips[0].RegisterCallback(livenessRegex, func(key string, _ roachpb.Value) {
		keysMu.Lock()
		defer keysMu.Unlock()
		for _, k := range keysMu.keys {
			if k == key {
				return
			}
		}
		keysMu.keys = append(keysMu.keys, key)
	})

	// Restart store and verify gossip contains liveness record for nodes 1&2.
	mtc.restartStore(0)
	util.SucceedsSoon(t, func() error {
		keysMu.Lock()
		defer keysMu.Unlock()
		sort.Strings(keysMu.keys)
		if !reflect.DeepEqual(keysMu.keys, expKeys) {
			return errors.Errorf("expected keys %+v != keys %+v", expKeys, keysMu.keys)
		}
		return nil
	})
}
// TestRangeCommandClockUpdate verifies that followers update their
// clocks when executing a command, even if the lease holder's clock is far
// in the future.
func TestRangeCommandClockUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const numNodes = 3
	var manuals []*hlc.ManualClock
	var clocks []*hlc.Clock
	for i := 0; i < numNodes; i++ {
		manuals = append(manuals, hlc.NewManualClock(1))
		clocks = append(clocks, hlc.NewClock(manuals[i].UnixNano))
		clocks[i].SetMaxOffset(100 * time.Millisecond)
	}
	mtc := &multiTestContext{clocks: clocks}
	mtc.Start(t, numNodes)
	defer mtc.Stop()
	mtc.replicateRange(1, 1, 2)

	// Advance the lease holder's clock ahead of the followers (by more than
	// MaxOffset but less than the range lease) and execute a command.
	manuals[0].Increment(int64(500 * time.Millisecond))
	incArgs := incrementArgs([]byte("a"), 5)
	ts := clocks[0].Now()
	if _, err := client.SendWrappedWith(context.Background(), rg1(mtc.stores[0]), roachpb.Header{Timestamp: ts}, &incArgs); err != nil {
		t.Fatal(err)
	}

	// Wait for that command to execute on all the followers.
	util.SucceedsSoon(t, func() error {
		values := []int64{}
		for _, eng := range mtc.engines {
			val, _, err := engine.MVCCGet(context.Background(), eng, roachpb.Key("a"), clocks[0].Now(), true, nil)
			if err != nil {
				return err
			}
			values = append(values, mustGetInt(val))
		}
		if !reflect.DeepEqual(values, []int64{5, 5, 5}) {
			return errors.Errorf("expected (5, 5, 5), got %v", values)
		}
		return nil
	})

	// Verify that all the followers have accepted the clock update from
	// node 0 even though it comes from outside the usual max offset.
	now := clocks[0].Now()
	for i, clock := range clocks {
		// Only compare the WallTimes: it's normal for clock 0 to be a few logical ticks ahead.
		if clock.Now().WallTime < now.WallTime {
			t.Errorf("clock %d is behind clock 0: %s vs %s", i, clock.Now(), now)
		}
	}
}
// TestRetryableError verifies that Send returns a retryable error
// when it hits an RPC error.
func TestRetryableError(t *testing.T) {
	defer leaktest.AfterTest(t)()

	clientStopper := stop.NewStopper()
	defer clientStopper.Stop()
	clientContext := newNodeTestContext(hlc.NewClock(hlc.UnixNano, time.Nanosecond), clientStopper)

	serverStopper := stop.NewStopper()
	serverContext := newNodeTestContext(hlc.NewClock(hlc.UnixNano, time.Nanosecond), serverStopper)

	s, ln := newTestServer(t, serverContext)
	roachpb.RegisterInternalServer(s, Node(0))

	addr := ln.Addr().String()
	if _, err := clientContext.GRPCDial(addr); err != nil {
		t.Fatal(err)
	}
	// Wait until the client becomes healthy and shut down the server.
	util.SucceedsSoon(t, func() error {
		if !clientContext.IsConnHealthy(addr) {
			return errors.Errorf("client not yet healthy")
		}
		return nil
	})
	serverStopper.Stop()
	// Wait until the client becomes unhealthy.
	util.SucceedsSoon(t, func() error {
		if clientContext.IsConnHealthy(addr) {
			return errors.Errorf("client not yet unhealthy")
		}
		return nil
	})

	opts := SendOptions{ctx: context.Background()}
	if _, err := sendBatch(opts, []net.Addr{ln.Addr()}, clientContext); err == nil {
		t.Fatalf("Unexpected success")
	}
}
// TestNodeLivenessSelf verifies that a node keeps its own most
// recent liveness heartbeat info in preference to anything which
// might be received belatedly through gossip.
func TestNodeLivenessSelf(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 1)

	// Verify liveness of all nodes for all nodes.
	stopNodeLivenessHeartbeats(mtc)
	if err := mtc.nodeLivenesses[0].ManualHeartbeat(); err != nil {
		t.Fatal(err)
	}

	// Gossip random nonsense for liveness and verify that asking for
	// the node's own node ID returns the "correct" value.
	g := mtc.gossips[0]
	key := gossip.MakeNodeLivenessKey(g.NodeID.Get())
	var count int32
	g.RegisterCallback(key, func(_ string, val roachpb.Value) {
		atomic.AddInt32(&count, 1)
	})
	util.SucceedsSoon(t, func() error {
		if err := g.AddInfoProto(key, &storage.Liveness{
			NodeID: 1,
			Epoch:  2,
		}, 0); err != nil {
			t.Fatal(err)
		}
		if atomic.LoadInt32(&count) < 2 {
			return errors.New("expected count >= 2")
		}
		return nil
	})

	// Self should not see new epoch.
	l := mtc.nodeLivenesses[0]
	lGet, err := l.GetLiveness(g.NodeID.Get())
	if err != nil {
		t.Fatal(err)
	}
	lSelf, err := l.Self()
	if err != nil {
		t.Fatal(err)
	}
	if lGet != lSelf {
		t.Errorf("expected GetLiveness() to return same value as Self(): %+v != %+v", lGet, lSelf)
	}
	if lGet.Epoch == 2 || lSelf.NodeID == 2 {
		t.Errorf("expected GetLiveness() and Self() not to return artificially gossiped liveness: %+v, %+v", lGet, lSelf)
	}
}
// TestNodeLivenessEpochIncrement verifies that incrementing the epoch
// of a node requires the node to be considered not-live and that on
// increment, no other nodes believe the epoch-incremented node to be
// live.
func TestNodeLivenessEpochIncrement(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	verifyLiveness(t, mtc)
	stopNodeLivenessHeartbeats(mtc)

	// First try to increment the epoch of a known-live node.
	deadNodeID := mtc.gossips[1].NodeID.Get()
	if err := mtc.nodeLivenesses[0].IncrementEpoch(
		context.Background(), deadNodeID,
	); !testutils.IsError(err, "cannot increment epoch on live node") {
		t.Fatalf("expected error incrementing a live node: %v", err)
	}

	// Advance clock past liveness threshold & increment epoch.
	oldLiveness, err := mtc.nodeLivenesses[0].GetLiveness(deadNodeID)
	if err != nil {
		t.Fatal(err)
	}
	active, _ := storage.RangeLeaseDurations(
		storage.RaftElectionTimeout(base.DefaultRaftTickInterval, 0))
	mtc.manualClock.Increment(active.Nanoseconds() + 1)
	if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), deadNodeID); err != nil {
		t.Fatalf("unexpected error incrementing a non-live node: %s", err)
	}

	// Verify that the epoch has been advanced.
	util.SucceedsSoon(t, func() error {
		newLiveness, err := mtc.nodeLivenesses[0].GetLiveness(deadNodeID)
		if err != nil {
			return err
		}
		if newLiveness.Epoch != oldLiveness.Epoch+1 {
			return errors.Errorf("expected epoch to increment")
		}
		if newLiveness.Expiration != oldLiveness.Expiration {
			return errors.Errorf("expected expiration to remain unchanged")
		}
		if live, err := mtc.nodeLivenesses[0].IsLive(deadNodeID); live || err != nil {
			return errors.Errorf("expected dead node to remain dead after epoch increment %t: %v", live, err)
		}
		return nil
	})

	// Verify epoch increment metric count.
	if c := mtc.nodeLivenesses[0].Metrics().EpochIncrements.Count(); c != 1 {
		t.Errorf("expected epoch increment == 1; got %d", c)
	}
}
func verifyLiveness(t *testing.T, mtc *multiTestContext) {
	util.SucceedsSoon(t, func() error {
		for _, nl := range mtc.nodeLivenesses {
			for _, g := range mtc.gossips {
				live, err := nl.IsLive(g.NodeID.Get())
				if err != nil {
					return err
				} else if !live {
					return errors.Errorf("node %d not live", g.NodeID.Get())
				}
			}
		}
		return nil
	})
}
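// Every snippet above polls through util.SucceedsSoon. As a rough sketch of
// the semantics these tests rely on (an illustration, not the actual util
// implementation): the condition function is retried with a growing backoff
// until it returns nil or an overall budget expires, at which point the test
// fails with the last error. The budget and backoff values below are assumptions.
func succeedsSoonSketch(t testing.TB, fn func() error) {
	const budget = 45 * time.Second // assumed overall deadline
	wait := 10 * time.Millisecond   // assumed initial backoff
	start := time.Now()
	for {
		err := fn()
		if err == nil {
			return
		}
		if time.Since(start) > budget {
			t.Fatalf("condition failed to evaluate within %s: %s", budget, err)
		}
		time.Sleep(wait)
		if wait < time.Second {
			wait *= 2 // back off, capped at roughly one second between retries
		}
	}
}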