Example #1
0
func TestOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	serverTime := time.Unix(0, 20)
	serverClock := hlc.NewClock(serverTime.UnixNano)
	serverCtx := newNodeTestContext(serverClock, stopper)
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()

	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              serverClock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	// Create a client clock that is behind the server clock.
	clientAdvancing := AdvancingClock{time: time.Unix(0, 10)}
	clientClock := hlc.NewClock(clientAdvancing.UnixNano)
	clientClock.SetMaxOffset(time.Millisecond)
	clientCtx := newNodeTestContext(clientClock, stopper)
	clientCtx.RemoteClocks.offsetTTL = 5 * clientAdvancing.getAdvancementInterval()
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}

	expectedOffset := RemoteOffset{Offset: 10, Uncertainty: 0, MeasuredAt: 10}
	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		} else if o != expectedOffset {
			return errors.Errorf("expected:\n%v\nactual:\n%v", expectedOffset, o)
		}
		return nil
	})

	// Change the client such that it receives a heartbeat right after the
	// maximum clock reading delay.
	clientAdvancing.setAdvancementInterval(
		maximumPingDurationMult*clientClock.MaxOffset() + 1*time.Nanosecond)

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset to have been cleared, but found %s", o)
		}
		return nil
	})
}
// TestRaftTransportCircuitBreaker verifies that messages will be
// dropped waiting for raft node connection to be established.
func TestRaftTransportCircuitBreaker(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	_, serverAddr := rttc.AddNodeWithoutGossip(serverReplica.NodeID)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	clientTransport := rttc.AddNode(clientReplica.NodeID)

	// The transport is set up asynchronously, so we expect the first
	// Send to return true here.
	if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
		t.Errorf("unexpectedly failed sending while connection is being asynchronously established")
	}

	// However, sending repeated messages should begin dropping once
	// the circuit breaker does trip.
	util.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.Errorf("expected circuit breaker to trip")
		}
		return nil
	})

	// Now, gossip address of server.
	rttc.GossipNode(serverReplica.NodeID, serverAddr)

	// Keep sending commit=2 until breaker resets and we receive the
	// first instance. It's possible an earlier message for commit=1
	// snuck in.
	util.SucceedsSoon(t, func() error {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 2}) {
			clientTransport.GetCircuitBreaker(serverReplica.NodeID).Reset()
		}
		select {
		case req := <-serverChannel.ch:
			if req.Message.Commit == 2 {
				return nil
			}
		default:
		}
		return errors.Errorf("expected message commit=2")
	})
}
Example #3
0
// TestScannerDisabled verifies that disabling a scanner prevents
// replicas from being added to queues.
func TestScannerDisabled(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	ranges := newTestRangeSet(count, t)
	q := &testQueue{}
	s := newReplicaScanner(log.AmbientContext{}, 1*time.Millisecond, 0, ranges)
	s.AddQueues(q)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	stopper := stop.NewStopper()
	s.Start(clock, stopper)
	defer stopper.Stop()

	// Verify queue gets all ranges.
	util.SucceedsSoon(t, func() error {
		if q.count() != count {
			return errors.Errorf("expected %d replicas; have %d", count, q.count())
		}
		if s.scanCount() == 0 {
			return errors.Errorf("expected scanner count to increment")
		}
		return nil
	})

	lastWaitEnabledCount := s.waitEnabledCount()

	// Now, disable the scanner.
	s.SetDisabled(true)
	util.SucceedsSoon(t, func() error {
		if s.waitEnabledCount() == lastWaitEnabledCount {
			return errors.Errorf("expected scanner to stop when disabled")
		}
		return nil
	})

	lastScannerCount := s.scanCount()

	// Remove the replicas and verify the scanner still removes them while disabled.
	ranges.Visit(func(repl *Replica) bool {
		s.RemoveReplica(repl)
		return true
	})

	util.SucceedsSoon(t, func() error {
		if qc := q.count(); qc != 0 {
			return errors.Errorf("expected queue to be empty after replicas removed from scanner; got %d", qc)
		}
		return nil
	})
	if sc := s.scanCount(); sc != lastScannerCount {
		t.Errorf("expected scanner count to not increment: %d != %d", sc, lastScannerCount)
	}
}
Example #4
0
// TestRangeSplitsWithWritePressure sets the zone config max bytes for
// a range to 256K and writes data until there are five ranges.
func TestRangeSplitsWithWritePressure(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// Override default zone config.
	cfg := config.DefaultZoneConfig()
	cfg.RangeMaxBytes = 1 << 18
	defer config.TestingSetDefaultZoneConfig(cfg)()

	dbCtx := client.DefaultDBContext()
	dbCtx.TxnRetryOptions = retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     10 * time.Millisecond,
		Multiplier:     2,
	}
	s, _ := createTestDBWithContext(t, dbCtx)
	// This is purely to silence log spam.
	config.TestingSetupZoneConfigHook(s.Stopper)
	defer s.Stop()

	// Start test writer write about a 32K/key so there aren't too many writes necessary to split 64K range.
	done := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(1)
	go startTestWriter(s.DB, int64(0), 1<<15, &wg, nil, nil, done, t)

	// Check that we split 5 times in allotted time.
	util.SucceedsSoon(t, func() error {
		// Scan the txn records.
		rows, err := s.DB.Scan(context.TODO(), keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return errors.Errorf("failed to scan meta2 keys: %s", err)
		}
		if lr := len(rows); lr < 5 {
			return errors.Errorf("expected >= 5 scans; got %d", lr)
		}
		return nil
	})
	close(done)
	wg.Wait()

	// This write pressure test often causes splits while resolve
	// intents are in flight, causing them to fail with range key
	// mismatch errors. However, LocalSender should retry in these
	// cases. Check here via MVCC scan that there are no dangling write
	// intents. We do this using a SucceedsSoon construct to account
	// for timing of finishing the test writer and a possibly-ongoing
	// asynchronous split.
	util.SucceedsSoon(t, func() error {
		if _, _, _, err := engine.MVCCScan(context.Background(), s.Eng, keys.LocalMax, roachpb.KeyMax, math.MaxInt64, hlc.MaxTimestamp, true, nil); err != nil {
			return errors.Errorf("failed to verify no dangling intents: %s", err)
		}
		return nil
	})
}
Example #5
0
// TestReplicaGCQueueDropReplica verifies that a removed replica is
// immediately cleaned up.
func TestReplicaGCQueueDropReplicaDirect(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	const numStores = 3
	rangeID := roachpb.RangeID(1)

	// In this test, the Replica on the second Node is removed, and the test
	// verifies that that Node adds this Replica to its RangeGCQueue. However,
	// the queue does a consistent lookup which will usually be read from
	// Node 1. Hence, if Node 1 hasn't processed the removal when Node 2 has,
	// no GC will take place since the consistent RangeLookup hits the first
	// Node. We use the TestingCommandFilter to make sure that the second Node
	// waits for the first.
	cfg := storage.TestStoreConfig()
	mtc.storeConfig = &cfg
	mtc.storeConfig.TestingKnobs.TestingCommandFilter =
		func(filterArgs storagebase.FilterArgs) *roachpb.Error {
			et, ok := filterArgs.Req.(*roachpb.EndTransactionRequest)
			if !ok || filterArgs.Sid != 2 {
				return nil
			}
			rct := et.InternalCommitTrigger.GetChangeReplicasTrigger()
			if rct == nil || rct.ChangeType != roachpb.REMOVE_REPLICA {
				return nil
			}
			util.SucceedsSoon(t, func() error {
				r, err := mtc.stores[0].GetReplica(rangeID)
				if err != nil {
					return err
				}
				if _, ok := r.Desc().GetReplicaDescriptor(2); ok {
					return errors.New("expected second node gone from first node's known replicas")
				}
				return nil
			})
			return nil
		}

	mtc.Start(t, numStores)
	defer mtc.Stop()

	mtc.replicateRange(rangeID, 1, 2)
	mtc.unreplicateRange(rangeID, 1)

	// Make sure the range is removed from the store.
	util.SucceedsSoon(t, func() error {
		if _, err := mtc.stores[1].GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return errors.Errorf("expected range removal: %v", err) // NB: errors.Wrapf(nil, ...) returns nil.
		}
		return nil
	})
}
Example #6
0
func TestFailedOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(time.Unix(0, 1).UnixNano)

	serverCtx := newNodeTestContext(clock, stopper)
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()

	heartbeat := &ManualHeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		ready:              make(chan struct{}),
		stopper:            stopper,
	}
	RegisterHeartbeatServer(s, heartbeat)

	// Create a client that never receives a heartbeat after the first.
	clientCtx := newNodeTestContext(clock, stopper)
	// Increase the timeout so that failure arises from exceeding the maximum
	// clock reading delay, not the timeout.
	clientCtx.HeartbeatTimeout = 20 * clientCtx.HeartbeatInterval
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}
	heartbeat.ready <- struct{}{} // Allow one heartbeat for initialization.

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if _, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return errors.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		}
		return nil
	})

	util.SucceedsSoon(t, func() error {
		serverCtx.RemoteClocks.mu.Lock()
		defer serverCtx.RemoteClocks.mu.Unlock()

		if o, ok := serverCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return errors.Errorf("expected offset of %s to not be initialized, but it was: %v", remoteAddr, o)
		}
		return nil
	})
}
Example #7
0
func TestEagerReplication(t *testing.T) {
	defer leaktest.AfterTest(t)()
	store, stopper, _ := createTestStore(t)
	defer stopper.Stop()

	// Disable the replica scanner so that we rely on the eager replication code
	// path that occurs after splits.
	store.SetReplicaScannerActive(false)

	if err := server.WaitForInitialSplits(store.DB()); err != nil {
		t.Fatal(err)
	}

	// WaitForInitialSplits will return as soon as the meta2 span contains the
	// expected number of descriptors. But the addition of replicas to the
	// replicateQueue after a split occurs happens after the update of the
	// descriptors in meta2 leaving a tiny window of time in which the newly
	// split replica will not have been added to purgatory. Thus we loop.
	util.SucceedsSoon(t, func() error {
		// After the initial splits have been performed, all of the resulting ranges
		// should be present in replicate queue purgatory (because we only have a
		// single store in the test and thus replication cannot succeed).
		expected := server.ExpectedInitialRangeCount()
		if n := store.ReplicateQueuePurgatoryLength(); expected != n {
			return errors.Errorf("expected %d replicas in purgatory, but found %d", expected, n)
		}
		return nil
	})
}
Example #8
0
func TestSplitAtTableBoundary(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testClusterArgs := base.TestClusterArgs{
		ReplicationMode: base.ReplicationAuto,
	}
	tc := testcluster.StartTestCluster(t, 3, testClusterArgs)
	defer tc.Stopper().Stop()

	runner := sqlutils.MakeSQLRunner(t, tc.Conns[0])
	runner.Exec(`CREATE DATABASE test`)
	runner.Exec(`CREATE TABLE test.t (k SERIAL PRIMARY KEY, v INT)`)

	const tableIDQuery = `
SELECT tables.id FROM system.namespace tables
  JOIN system.namespace dbs ON dbs.id = tables.parentid
  WHERE dbs.name = $1 AND tables.name = $2
`
	var tableID uint32
	runner.QueryRow(tableIDQuery, "test", "t").Scan(&tableID)
	tableStartKey := keys.MakeTablePrefix(tableID)

	// Wait for new table to split.
	util.SucceedsSoon(t, func() error {
		desc, err := tc.LookupRange(keys.MakeRowSentinelKey(tableStartKey))
		if err != nil {
			t.Fatal(err)
		}
		if !desc.StartKey.Equal(tableStartKey) {
			log.Infof(context.TODO(), "waiting on split results")
			return errors.Errorf("expected range start key %s; got %s", tableStartKey, desc.StartKey)
		}
		return nil
	})
}
Example #9
0
// TestClientDisconnectRedundant verifies that the gossip server
// will drop an outgoing client connection that is already an
// inbound client connection of another node.
func TestClientDisconnectRedundant(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	remote := startGossip(2, stopper, t, metric.NewRegistry())
	// startClient requires locks are held, so acquire here.
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	lAddr := local.mu.is.NodeAddr
	local.startClient(&rAddr, remote.NodeID.Get())
	remote.startClient(&lAddr, local.NodeID.Get())
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()
	util.SucceedsSoon(t, func() error {
		// Check which of the clients is connected to the other.
		ok1 := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() }) != nil
		ok2 := remote.findClient(func(c *client) bool { return c.addr.String() == lAddr.String() }) != nil
		// We expect node 2 to disconnect; if both are still connected,
		// it's possible that node 1 gossiped before node 2 connected, in
		// which case we have to gossip from node 1 to trigger the
		// disconnect redundant client code.
		if ok1 && ok2 {
			if err := local.AddInfo("local-key", nil, time.Second); err != nil {
				t.Fatal(err)
			}
		} else if ok1 && !ok2 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.New("local client to remote not yet closed as redundant")
	})
}
Example #10
0
// Verify that when we enqueue the same range multiple times for the same
// reason, it is only processed once.
func TestSchedulerBuffering(t *testing.T) {
	defer leaktest.AfterTest(t)()

	p := newTestProcessor()
	s := newRaftScheduler(log.AmbientContext{}, nil, p, 1)
	stopper := stop.NewStopper()
	defer stopper.Stop()
	s.Start(stopper)

	testCases := []struct {
		state    raftScheduleState
		expected string
	}{
		{stateRaftReady, "ready=[1:1] request=[] tick=[]"},
		{stateRaftRequest, "ready=[1:1] request=[1:1] tick=[]"},
		{stateRaftTick, "ready=[1:1] request=[1:1] tick=[1:1]"},
		{stateRaftReady | stateRaftRequest | stateRaftTick, "ready=[1:2] request=[1:2] tick=[1:2]"},
	}

	for _, c := range testCases {
		s.signal(s.enqueueN(c.state, 1, 1, 1, 1, 1))

		util.SucceedsSoon(t, func() error {
			if s := p.String(); c.expected != s {
				return errors.Errorf("expected %s, but got %s", c.expected, s)
			}
			return nil
		})
	}
}
Example #11
0
func TestComputeStatsForKeySpan(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 3)

	// Create a number of ranges using splits.
	splitKeys := []string{"a", "c", "e", "g", "i"}
	for _, k := range splitKeys {
		key := []byte(k)
		repl := mtc.stores[0].LookupReplica(key, roachpb.RKeyMin)
		args := adminSplitArgs(key, key)
		header := roachpb.Header{
			RangeID: repl.RangeID,
		}
		if _, err := client.SendWrappedWith(context.Background(), mtc.stores[0], header, args); err != nil {
			t.Fatal(err)
		}
	}

	// Wait for splits to finish.
	util.SucceedsSoon(t, func() error {
		repl := mtc.stores[0].LookupReplica(roachpb.RKey("z"), nil)
		if actualRSpan := repl.Desc().RSpan(); !actualRSpan.Key.Equal(roachpb.RKey("i")) {
			return errors.Errorf("expected range %s to begin at key 'i'", repl)
		}
		return nil
	})

	// Create some keys across the ranges.
	incKeys := []string{"b", "bb", "bbb", "d", "dd", "h"}
	for _, k := range incKeys {
		if _, err := mtc.dbs[0].Inc(context.TODO(), []byte(k), 5); err != nil {
			t.Fatal(err)
		}
	}

	// Verify stats across different spans.
	for _, tcase := range []struct {
		startKey       string
		endKey         string
		expectedRanges int
		expectedKeys   int64
	}{
		{"a", "i", 4, 6},
		{"a", "c", 1, 3},
		{"b", "e", 2, 5},
		{"e", "i", 2, 1},
	} {
		start, end := tcase.startKey, tcase.endKey
		stats, count := mtc.stores[0].ComputeStatsForKeySpan(
			roachpb.RKey(start), roachpb.RKey(end))
		if a, e := count, tcase.expectedRanges; a != e {
			t.Errorf("Expected %d ranges in span [%s - %s], found %d", e, start, end, a)
		}
		if a, e := stats.LiveCount, tcase.expectedKeys; a != e {
			t.Errorf("Expected %d keys in span [%s - %s], found %d", e, start, end, a)
		}
	}
}
Example #12
0
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	var details serverpb.DetailsResponse
	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		default:
		}
		return httputil.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details)
	})

	bi := details.BuildInfo
	testData := map[string]string{
		"go_version":   bi.GoVersion,
		"tag":          bi.Tag,
		"time":         bi.Time,
		"dependencies": bi.Dependencies,
	}
	for key, val := range testData {
		if val == "" {
			t.Errorf("build info not set for \"%s\"", key)
		}
	}
}
Example #13
0
// TestTxnCoordSenderGCTimeout verifies that the coordinator cleans up extant
// transactions and intents after the lastUpdateNanos exceeds the timeout.
func TestTxnCoordSenderGCTimeout(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	txn := client.NewTxn(context.Background(), *s.DB)
	key := roachpb.Key("a")
	if err := txn.Put(key, []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Now, advance clock past the default client timeout.
	// Locking the TxnCoordSender to prevent a data race.
	sender.Lock()
	s.Manual.Increment(defaultClientTimeout.Nanoseconds() + 1)
	sender.Unlock()

	txnID := *txn.Proto.ID

	util.SucceedsSoon(t, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if ok {
			return errors.Errorf("expected garbage collection")
		}
		return nil
	})

	verifyCleanup(key, sender, s.Eng, t)
}
Example #14
0
// TestMetricsRecording verifies that Node statistics are periodically recorded
// as time series data.
func TestMetricsRecording(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
		MetricsSampleInterval: 5 * time.Millisecond})
	defer s.Stopper().Stop()

	checkTimeSeriesKey := func(now int64, keyName string) error {
		key := ts.MakeDataKey(keyName, "", ts.Resolution10s, now)
		data := roachpb.InternalTimeSeriesData{}
		return kvDB.GetProto(context.TODO(), key, &data)
	}

	// Verify that metrics for the current timestamp are recorded. This should
	// be true very quickly.
	util.SucceedsSoon(t, func() error {
		now := s.Clock().PhysicalNow()
		if err := checkTimeSeriesKey(now, "cr.store.livebytes.1"); err != nil {
			return err
		}
		if err := checkTimeSeriesKey(now, "cr.node.sys.go.allocbytes.1"); err != nil {
			return err
		}
		return nil
	})
}
Example #15
0
func startBankTransfers(t testing.TB, stopper *stop.Stopper, sqlDB *gosql.DB, numAccounts int) {
	const maxTransfer = 999
	for {
		select {
		case <-stopper.ShouldQuiesce():
			return // All done.
		default:
			// Keep going.
		}

		from := rand.Intn(numAccounts)
		to := rand.Intn(numAccounts - 1)
		for from == to {
			to = numAccounts - 1
		}

		amount := rand.Intn(maxTransfer)

		const update = `UPDATE bench.bank
				SET balance = CASE id WHEN $1 THEN balance-$3 WHEN $2 THEN balance+$3 END
				WHERE id IN ($1, $2)`
		util.SucceedsSoon(t, func() error {
			select {
			case <-stopper.ShouldQuiesce():
				return nil // All done.
			default:
				// Keep going.
			}
			_, err := sqlDB.Exec(update, from, to, amount)
			return err
		})
	}
}
Example #16
0
// TestScannerTiming verifies that ranges are scanned, regardless
// of how many, to match scanInterval.
func TestScannerTiming(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	const runTime = 100 * time.Millisecond
	const maxError = 7500 * time.Microsecond
	durations := []time.Duration{
		15 * time.Millisecond,
		25 * time.Millisecond,
	}
	for i, duration := range durations {
		util.SucceedsSoon(t, func() error {
			ranges := newTestRangeSet(count, t)
			q := &testQueue{}
			s := newReplicaScanner(log.AmbientContext{}, duration, 0, ranges)
			s.AddQueues(q)
			mc := hlc.NewManualClock(0)
			clock := hlc.NewClock(mc.UnixNano)
			stopper := stop.NewStopper()
			s.Start(clock, stopper)
			time.Sleep(runTime)
			stopper.Stop()

			avg := s.avgScan()
			log.Infof(context.Background(), "%d: average scan: %s", i, avg)
			if avg.Nanoseconds()-duration.Nanoseconds() > maxError.Nanoseconds() ||
				duration.Nanoseconds()-avg.Nanoseconds() > maxError.Nanoseconds() {
				return errors.Errorf("expected %s, got %s: exceeds max error of %s", duration, avg, maxError)
			}
			return nil
		})
	}
}
Example #17
0
// TestClientDisallowMultipleConns verifies that the server disallows
// multiple connections from the same client node ID.
func TestClientDisallowMultipleConns(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	remote := startGossip(2, stopper, t, metric.NewRegistry())
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	// Start two clients from local to remote. RPC client cache is
	// disabled via the context, so we'll start two different outgoing
	// connections.
	local.startClient(&rAddr, remote.NodeID.Get())
	local.startClient(&rAddr, remote.NodeID.Get())
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()
	util.SucceedsSoon(t, func() error {
		// Verify that the remote server has only a single incoming
		// connection and the local server has only a single outgoing
		// connection.
		local.mu.Lock()
		remote.mu.Lock()
		outgoing := local.outgoing.len()
		incoming := remote.mu.incoming.len()
		local.mu.Unlock()
		remote.mu.Unlock()
		if outgoing == 1 && incoming == 1 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.Errorf("incorrect number of incoming (%d) or outgoing (%d) connections", incoming, outgoing)
	})
}
Example #18
0
func gossipSucceedsSoon(
	t *testing.T,
	stopper *stop.Stopper,
	disconnected chan *client,
	gossip map[*client]*Gossip,
	f func() error,
) {
	// Use an insecure context since we don't need a valid cert.
	rpcContext := rpc.NewContext(log.AmbientContext{}, &base.Config{Insecure: true}, nil, stopper)

	for c := range gossip {
		disconnected <- c
	}

	util.SucceedsSoon(t, func() error {
		select {
		case client := <-disconnected:
			// If the client wasn't able to connect, restart it.
			client.start(gossip[client], disconnected, rpcContext, stopper, gossip[client].NodeID.Get(), rpcContext.NewBreaker())
		default:
		}

		return f()
	})
}
Example #19
0
func gossipSucceedsSoon(
	t *testing.T,
	stopper *stop.Stopper,
	disconnected chan *client,
	gossip map[*client]*Gossip,
	f func() error,
) {
	// Use an insecure context since we don't need a valid cert.
	rpcContext := newInsecureRPCContext(stopper)

	for c := range gossip {
		disconnected <- c
	}

	util.SucceedsSoon(t, func() error {
		select {
		case client := <-disconnected:
			// If the client wasn't able to connect, restart it.
			client.start(gossip[client], disconnected, rpcContext, stopper, rpcContext.NewBreaker())
		default:
		}

		return f()
	})
}
Example #20
0
// TestGossipStorageCleanup verifies that bad resolvers are purged
// from the bootstrap info after gossip has successfully connected.
func TestGossipStorageCleanup(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	const numNodes = 3
	network := simulation.NewNetwork(stopper, numNodes, false)

	const notReachableAddr = "localhost:0"
	const invalidAddr = "10.0.0.1000:3333333"
	// Set storage for each of the nodes.
	addresses := make(unresolvedAddrSlice, len(network.Nodes))
	stores := make([]testStorage, len(network.Nodes))
	for i, n := range network.Nodes {
		addresses[i] = util.MakeUnresolvedAddr(n.Addr().Network(), n.Addr().String())
		// Pre-add an invalid address to each gossip storage.
		if err := stores[i].WriteBootstrapInfo(&gossip.BootstrapInfo{
			Addresses: []util.UnresolvedAddr{
				util.MakeUnresolvedAddr("tcp", network.Nodes[(i+1)%numNodes].Addr().String()), // node i+1 address
				util.MakeUnresolvedAddr("tcp", notReachableAddr),                              // unreachable address
				util.MakeUnresolvedAddr("tcp", invalidAddr),                                   // invalid address
			},
		}); err != nil {
			t.Fatal(err)
		}
		if err := n.Gossip.SetStorage(&stores[i]); err != nil {
			t.Fatal(err)
		}
		n.Gossip.SetStallInterval(1 * time.Millisecond)
		n.Gossip.SetBootstrapInterval(1 * time.Millisecond)
	}

	// Wait for the gossip network to connect.
	network.RunUntilFullyConnected()

	// Let the gossip network continue running in the background without the
	// simulation cycler preventing it from operating.
	for _, node := range network.Nodes {
		node.Gossip.EnableSimulationCycler(false)
	}

	// Wait long enough for storage to get the expected number of
	// addresses and no pending cleanups.
	util.SucceedsSoon(t, func() error {
		for i := range stores {
			p := &stores[i]
			if expected, actual := len(network.Nodes)-1 /* -1 is ourself */, p.Len(); expected != actual {
				return errors.Errorf("expected %v, got %v (info: %#v)", expected, actual, p.Info().Addresses)
			}
			for _, addr := range p.Info().Addresses {
				if addr.String() == invalidAddr {
					return errors.Errorf("node %d still needs bootstrap cleanup", i)
				}
			}
		}
		return nil
	})
}
Example #21
0
// TestScannerAddToQueues verifies that ranges are added to and
// removed from multiple queues.
func TestScannerAddToQueues(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	ranges := newTestRangeSet(count, t)
	q1, q2 := &testQueue{}, &testQueue{}
	// We don't want to actually consume entries from the queues during this test.
	q1.setDisabled(true)
	q2.setDisabled(true)
	s := newReplicaScanner(log.AmbientContext{}, 1*time.Millisecond, 0, ranges)
	s.AddQueues(q1, q2)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	stopper := stop.NewStopper()

	// Start scanner and verify that all ranges are added to both queues.
	s.Start(clock, stopper)
	util.SucceedsSoon(t, func() error {
		if q1.count() != count || q2.count() != count {
			return errors.Errorf("q1 or q2 count != %d; got %d, %d", count, q1.count(), q2.count())
		}
		return nil
	})

	// Remove first range and verify it does not exist in either range.
	rng := ranges.remove(0, t)
	util.SucceedsSoon(t, func() error {
		// This is intentionally inside the loop, otherwise this test races as
		// our removal of the range may be processed before a stray re-queue.
		// Removing on each attempt makes sure we clean this up as we retry.
		s.RemoveReplica(rng)
		c1 := q1.count()
		c2 := q2.count()
		if c1 != count-1 || c2 != count-1 {
			return errors.Errorf("q1 or q2 count != %d; got %d, %d", count-1, c1, c2)
		}
		return nil
	})

	// Stop scanner and verify both queues are stopped.
	stopper.Stop()
	if !q1.isDone() || !q2.isDone() {
		t.Errorf("expected all queues to stop; got %t, %t", q1.isDone(), q2.isDone())
	}
}
Example #22
0
// Test that abruptly closing a pgwire connection releases all leases held by
// that session.
func TestPGWireConnectionCloseReleasesLeases(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop()
	url, cleanupConn := sqlutils.PGUrl(t, s.ServingAddr(), "SetupServer", url.User(security.RootUser))
	defer cleanupConn()
	conn, err := pq.Open(url.String())
	if err != nil {
		t.Fatal(err)
	}
	ex := conn.(driver.Execer)
	if _, err := ex.Exec("CREATE DATABASE test", nil); err != nil {
		t.Fatal(err)
	}
	if _, err := ex.Exec("CREATE TABLE test.t (i INT PRIMARY KEY)", nil); err != nil {
		t.Fatal(err)
	}
	// Start a txn so leases are accumulated by queries.
	if _, err := ex.Exec("BEGIN", nil); err != nil {
		t.Fatal(err)
	}
	// Get a table lease.
	if _, err := ex.Exec("SELECT * FROM test.t", nil); err != nil {
		t.Fatal(err)
	}
	// Abruptly close the connection.
	if err := conn.Close(); err != nil {
		t.Fatal(err)
	}
	// Verify that there are no leases held.
	tableDesc := sqlbase.GetTableDescriptor(kvDB, "test", "t")
	lm := s.LeaseManager().(*LeaseManager)
	// Looking for a table state validates that there used to be a lease on the
	// table.
	ts := lm.findTableState(tableDesc.ID, false /* create */)
	if ts == nil {
		t.Fatal("table state not found")
	}
	ts.mu.Lock()
	leases := ts.active.data
	ts.mu.Unlock()
	if len(leases) != 1 {
		t.Fatalf("expected one lease, found: %d", len(leases))
	}
	// Wait for the lease to be released.
	util.SucceedsSoon(t, func() error {
		ts.mu.Lock()
		refcount := ts.active.data[0].refcount
		ts.mu.Unlock()
		if refcount != 0 {
			return errors.Errorf(
				"expected lease to be unused, found refcount: %d", refcount)
		}
		return nil
	})
}
Example #23
0
func TestStopperNumTasks(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s := stop.NewStopper()
	var tasks []chan bool
	for i := 0; i < 3; i++ {
		c := make(chan bool)
		tasks = append(tasks, c)
		if err := s.RunAsyncTask(context.Background(), func(_ context.Context) {
			// Wait for channel to close
			<-c
		}); err != nil {
			t.Fatal(err)
		}
		tm := s.RunningTasks()
		if numTypes, numTasks := len(tm), s.NumTasks(); numTypes != 1 || numTasks != i+1 {
			t.Errorf("stopper should have %d running tasks, got %d / %+v", i+1, numTasks, tm)
		}
		m := s.RunningTasks()
		if len(m) != 1 {
			t.Fatalf("expected exactly one task map entry: %+v", m)
		}
		for _, v := range m {
			if expNum := len(tasks); v != expNum {
				t.Fatalf("%d: expected %d tasks, got %d", i, expNum, v)
			}
		}
	}
	for i, c := range tasks {
		m := s.RunningTasks()
		if len(m) != 1 {
			t.Fatalf("%d: expected exactly one task map entry: %+v", i, m)
		}
		for _, v := range m {
			if expNum := len(tasks[i:]); v != expNum {
				t.Fatalf("%d: expected %d tasks, got %d:\n%s", i, expNum, v, m)
			}
		}
		// Close the channel to let the task proceed.
		close(c)
		expNum := len(tasks[i+1:])
		util.SucceedsSoon(t, func() error {
			if nt := s.NumTasks(); nt != expNum {
				return errors.Errorf("%d: stopper should have %d running tasks, got %d", i, expNum, nt)
			}
			return nil
		})
	}
	// The taskmap should've been cleared out.
	if m := s.RunningTasks(); len(m) != 0 {
		t.Fatalf("task map not empty: %+v", m)
	}
	s.Stop()
}
Example #24
0
func TestGCQueueLastProcessedTimestamps(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop()
	tc.Start(t, stopper)

	// Create two last processed times both at the range start key and
	// also at some mid-point key in order to simulate a merge.
	// Two transactions.
	lastProcessedVals := []struct {
		key   roachpb.Key
		expGC bool
	}{
		{keys.QueueLastProcessedKey(roachpb.RKeyMin, "timeSeriesMaintenance"), false},
		{keys.QueueLastProcessedKey(roachpb.RKeyMin, "replica consistency checker"), false},
		{keys.QueueLastProcessedKey(roachpb.RKey("a"), "timeSeriesMaintenance"), true},
		{keys.QueueLastProcessedKey(roachpb.RKey("b"), "replica consistency checker"), true},
	}

	ts := tc.Clock().Now()
	for _, lpv := range lastProcessedVals {
		if err := engine.MVCCPutProto(context.Background(), tc.engine, nil, lpv.key, hlc.ZeroTimestamp, nil, &ts); err != nil {
			t.Fatal(err)
		}
	}

	cfg, ok := tc.gossip.GetSystemConfig()
	if !ok {
		t.Fatal("config not set")
	}

	// Process through a scan queue.
	gcQ := newGCQueue(tc.store, tc.gossip)
	if err := gcQ.process(context.Background(), tc.Clock().Now(), tc.repl, cfg); err != nil {
		t.Fatal(err)
	}

	// Verify GC.
	util.SucceedsSoon(t, func() error {
		for _, lpv := range lastProcessedVals {
			ok, err := engine.MVCCGetProto(context.Background(), tc.engine, lpv.key, hlc.ZeroTimestamp, true, nil, &ts)
			if err != nil {
				return err
			}
			if ok == lpv.expGC {
				return errors.Errorf("expected GC of %s: %t; got %t", lpv.key, lpv.expGC, ok)
			}
		}
		return nil
	})
}
Example #25
0
// TestNodeLivenessRestart verifies that if nodes are shutdown and
// restarted, the node liveness records are re-gossiped immediately.
func TestNodeLivenessRestart(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 2)

	// After verifying node is in liveness table, stop store.
	verifyLiveness(t, mtc)
	mtc.stopStore(0)

	// Clear the liveness records in store 1's gossip to make sure we're
	// seeing the liveness record properly gossiped at store startup.
	var expKeys []string
	for _, g := range mtc.gossips {
		key := gossip.MakeNodeLivenessKey(g.NodeID.Get())
		expKeys = append(expKeys, key)
		if err := g.AddInfoProto(key, &storage.Liveness{}, 0); err != nil {
			t.Fatal(err)
		}
	}
	sort.Strings(expKeys)

	// Register a callback to gossip in order to verify liveness records
	// are re-gossiped.
	var keysMu struct {
		syncutil.Mutex
		keys []string
	}
	livenessRegex := gossip.MakePrefixPattern(gossip.KeyNodeLivenessPrefix)
	mtc.gossips[0].RegisterCallback(livenessRegex, func(key string, _ roachpb.Value) {
		keysMu.Lock()
		defer keysMu.Unlock()
		for _, k := range keysMu.keys {
			if k == key {
				return
			}
		}
		keysMu.keys = append(keysMu.keys, key)
	})

	// Restart store and verify gossip contains liveness record for nodes 1&2.
	mtc.restartStore(0)
	util.SucceedsSoon(t, func() error {
		keysMu.Lock()
		defer keysMu.Unlock()
		sort.Strings(keysMu.keys)
		if !reflect.DeepEqual(keysMu.keys, expKeys) {
			return errors.Errorf("expected keys %+v != keys %+v", expKeys, keysMu.keys)
		}
		return nil
	})
}
Example #26
0
// TestRangeCommandClockUpdate verifies that followers update their
// clocks when executing a command, even if the lease holder's clock is far
// in the future.
func TestRangeCommandClockUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()

	const numNodes = 3
	var manuals []*hlc.ManualClock
	var clocks []*hlc.Clock
	for i := 0; i < numNodes; i++ {
		manuals = append(manuals, hlc.NewManualClock(1))
		clocks = append(clocks, hlc.NewClock(manuals[i].UnixNano))
		clocks[i].SetMaxOffset(100 * time.Millisecond)
	}
	mtc := &multiTestContext{clocks: clocks}
	mtc.Start(t, numNodes)
	defer mtc.Stop()
	mtc.replicateRange(1, 1, 2)

	// Advance the lease holder's clock ahead of the followers (by more than
	// MaxOffset but less than the range lease) and execute a command.
	manuals[0].Increment(int64(500 * time.Millisecond))
	incArgs := incrementArgs([]byte("a"), 5)
	ts := clocks[0].Now()
	if _, err := client.SendWrappedWith(context.Background(), rg1(mtc.stores[0]), roachpb.Header{Timestamp: ts}, &incArgs); err != nil {
		t.Fatal(err)
	}

	// Wait for that command to execute on all the followers.
	util.SucceedsSoon(t, func() error {
		values := []int64{}
		for _, eng := range mtc.engines {
			val, _, err := engine.MVCCGet(context.Background(), eng, roachpb.Key("a"), clocks[0].Now(), true, nil)
			if err != nil {
				return err
			}
			values = append(values, mustGetInt(val))
		}
		if !reflect.DeepEqual(values, []int64{5, 5, 5}) {
			return errors.Errorf("expected (5, 5, 5), got %v", values)
		}
		return nil
	})

	// Verify that all the followers have accepted the clock update from
	// node 0 even though it comes from outside the usual max offset.
	now := clocks[0].Now()
	for i, clock := range clocks {
		// Only compare the WallTimes: it's normal for clock 0 to be a few logical ticks ahead.
		if clock.Now().WallTime < now.WallTime {
			t.Errorf("clock %d is behind clock 0: %s vs %s", i, clock.Now(), now)
		}
	}
}
Example #27
0
// TestRetryableError verifies that Send returns a retryable error
// when it hits an RPC error.
func TestRetryableError(t *testing.T) {
	defer leaktest.AfterTest(t)()

	clientStopper := stop.NewStopper()
	defer clientStopper.Stop()
	clientContext := newNodeTestContext(hlc.NewClock(hlc.UnixNano, time.Nanosecond), clientStopper)

	serverStopper := stop.NewStopper()
	serverContext := newNodeTestContext(hlc.NewClock(hlc.UnixNano, time.Nanosecond), serverStopper)

	s, ln := newTestServer(t, serverContext)
	roachpb.RegisterInternalServer(s, Node(0))

	addr := ln.Addr().String()
	if _, err := clientContext.GRPCDial(addr); err != nil {
		t.Fatal(err)
	}
	// Wait until the client becomes healthy and shut down the server.
	util.SucceedsSoon(t, func() error {
		if !clientContext.IsConnHealthy(addr) {
			return errors.Errorf("client not yet healthy")
		}
		return nil
	})
	serverStopper.Stop()
	// Wait until the client becomes unhealthy.
	util.SucceedsSoon(t, func() error {
		if clientContext.IsConnHealthy(addr) {
			return errors.Errorf("client not yet unhealthy")
		}
		return nil
	})

	opts := SendOptions{ctx: context.Background()}
	if _, err := sendBatch(opts, []net.Addr{ln.Addr()}, clientContext); err == nil {
		t.Fatalf("Unexpected success")
	}
}
Example #28
0
// TestNodeLivenessSelf verifies that a node keeps its own most
// recent liveness heartbeat info in preference to anything which
// might be received belatedly through gossip.
func TestNodeLivenessSelf(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 1)

	// Verify liveness of all nodes for all nodes.
	stopNodeLivenessHeartbeats(mtc)
	if err := mtc.nodeLivenesses[0].ManualHeartbeat(); err != nil {
		t.Fatal(err)
	}

	// Gossip random nonsense for liveness and verify that asking for
	// the node's own node ID returns the "correct" value.
	g := mtc.gossips[0]
	key := gossip.MakeNodeLivenessKey(g.NodeID.Get())
	var count int32
	g.RegisterCallback(key, func(_ string, val roachpb.Value) {
		atomic.AddInt32(&count, 1)
	})
	util.SucceedsSoon(t, func() error {
		if err := g.AddInfoProto(key, &storage.Liveness{
			NodeID: 1,
			Epoch:  2,
		}, 0); err != nil {
			t.Fatal(err)
		}
		if atomic.LoadInt32(&count) < 2 {
			return errors.New("expected count >= 2")
		}
		return nil
	})

	// Self should not see new epoch.
	l := mtc.nodeLivenesses[0]
	lGet, err := l.GetLiveness(g.NodeID.Get())
	if err != nil {
		t.Fatal(err)
	}
	lSelf, err := l.Self()
	if err != nil {
		t.Fatal(err)
	}
	if lGet != lSelf {
		t.Errorf("expected GetLiveness() to return same value as Self(): %+v != %+v", lGet, lSelf)
	}
	if lGet.Epoch == 2 || lSelf.NodeID == 2 {
		t.Errorf("expected GetLiveness() and Self() not to return artificially gossiped liveness: %+v, %+v", lGet, lSelf)
	}
}
Example #29
0
// TestNodeLivenessEpochIncrement verifies that incrementing the epoch
// of a node requires the node to be considered not-live and that on
// increment, no other nodes believe the epoch-incremented node to be
// live.
func TestNodeLivenessEpochIncrement(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	verifyLiveness(t, mtc)
	stopNodeLivenessHeartbeats(mtc)

	// First try to increment the epoch of a known-live node.
	deadNodeID := mtc.gossips[1].NodeID.Get()
	if err := mtc.nodeLivenesses[0].IncrementEpoch(
		context.Background(), deadNodeID); !testutils.IsError(err, "cannot increment epoch on live node") {
		t.Fatalf("expected error incrementing a live node: %v", err)
	}

	// Advance clock past liveness threshold & increment epoch.
	oldLiveness, err := mtc.nodeLivenesses[0].GetLiveness(deadNodeID)
	if err != nil {
		t.Fatal(err)
	}
	active, _ := storage.RangeLeaseDurations(
		storage.RaftElectionTimeout(base.DefaultRaftTickInterval, 0))
	mtc.manualClock.Increment(active.Nanoseconds() + 1)
	if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), deadNodeID); err != nil {
		t.Fatalf("unexpected error incrementing a live node: %s", err)
	}

	// Verify that the epoch has been advanced.
	util.SucceedsSoon(t, func() error {
		newLiveness, err := mtc.nodeLivenesses[0].GetLiveness(deadNodeID)
		if err != nil {
			return err
		}
		if newLiveness.Epoch != oldLiveness.Epoch+1 {
			return errors.Errorf("expected epoch to increment")
		}
		if newLiveness.Expiration != oldLiveness.Expiration {
			return errors.Errorf("expected expiration to remain unchanged")
		}
		if live, err := mtc.nodeLivenesses[0].IsLive(deadNodeID); live || err != nil {
			return errors.Errorf("expected dead node to remain dead after epoch increment %t: %v", live, err)
		}
		return nil
	})

	// Verify epoch increment metric count.
	if c := mtc.nodeLivenesses[0].Metrics().EpochIncrements.Count(); c != 1 {
		t.Errorf("expected epoch increment == 1; got %d", c)
	}
}
Example #30
0
func verifyLiveness(t *testing.T, mtc *multiTestContext) {
	util.SucceedsSoon(t, func() error {
		for _, nl := range mtc.nodeLivenesses {
			for _, g := range mtc.gossips {
				live, err := nl.IsLive(g.NodeID.Get())
				if err != nil {
					return err
				} else if !live {
					return errors.Errorf("node %d not live", g.NodeID.Get())
				}
			}
		}
		return nil
	})
}