// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	local := startGossip(1, stopper, t)
	local.SetCullInterval(5 * time.Millisecond)

	local.mu.Lock()
	for i := 0; i < minPeers; i++ {
		peer := startGossip(roachpb.NodeID(i+2), stopper, t)
		local.startClient(&peer.is.NodeAddr, stopper)
	}
	local.mu.Unlock()

	util.SucceedsSoon(t, func() error {
		if len(local.Outgoing()) == minPeers {
			return nil
		}
		return errors.New("some peers not yet connected")
	})

	local.manage()

	util.SucceedsSoon(t, func() error {
		// Verify that a client is closed within the cull interval.
		if len(local.Outgoing()) == minPeers-1 {
			return nil
		}
		return errors.New("no network culling occurred")
	})
}
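// These snippets all poll a condition with util.SucceedsSoon until it holds
// or a deadline expires. As a rough, hypothetical sketch of what such a
// helper does (the 45s deadline and the 1ms-to-1s doubling backoff below are
// illustrative assumptions, not the actual util.SucceedsSoon implementation):
func succeedsSoon(t *testing.T, fn func() error) {
	deadline := time.Now().Add(45 * time.Second)
	backoff := time.Millisecond
	var err error
	for {
		// Re-evaluate the condition; stop as soon as it holds.
		if err = fn(); err == nil {
			return
		}
		// Give up once the deadline has passed, reporting the last error.
		if time.Now().After(deadline) {
			t.Fatalf("condition failed to hold within deadline: %v", err)
		}
		// Sleep with a capped, doubling backoff before retrying.
		time.Sleep(backoff)
		if backoff < time.Second {
			backoff *= 2
		}
	}
}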
// TestReplicateRange verifies basic replication functionality by creating two stores
// and a range, replicating the range to the second store, and reading its data there.
func TestReplicateRange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	// Issue a command on the first node before replicating.
	incArgs := incrementArgs([]byte("a"), 5)
	if _, err := client.SendWrapped(rg1(mtc.stores[0]), nil, &incArgs); err != nil {
		t.Fatal(err)
	}

	rng, err := mtc.stores[0].GetReplica(1)
	if err != nil {
		t.Fatal(err)
	}

	if err := rng.ChangeReplicas(roachpb.ADD_REPLICA, roachpb.ReplicaDescriptor{
		NodeID:  mtc.stores[1].Ident.NodeID,
		StoreID: mtc.stores[1].Ident.StoreID,
	}, rng.Desc()); err != nil {
		t.Fatal(err)
	}

	// Verify no intent remains on range descriptor key.
	key := keys.RangeDescriptorKey(rng.Desc().StartKey)
	desc := roachpb.RangeDescriptor{}
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key,
		mtc.stores[0].Clock().Now(), true, nil, &desc); !ok || err != nil {
		t.Fatalf("fetching range descriptor yielded %t, %s", ok, err)
	}

	// Verify that in time, no intents remain on meta addressing
	// keys, and that range descriptor on the meta records is correct.
	util.SucceedsSoon(t, func() error {
		meta2 := keys.Addr(keys.RangeMetaKey(roachpb.RKeyMax))
		meta1 := keys.Addr(keys.RangeMetaKey(meta2))
		for _, key := range []roachpb.RKey{meta2, meta1} {
			metaDesc := roachpb.RangeDescriptor{}
			if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), key.AsRawKey(),
				mtc.stores[0].Clock().Now(), true, nil, &metaDesc); !ok || err != nil {
				return util.Errorf("failed to resolve %s", key.AsRawKey())
			}
			if !reflect.DeepEqual(metaDesc, desc) {
				return util.Errorf("descs not equal: %+v != %+v", metaDesc, desc)
			}
		}
		return nil
	})

	// Verify that the same data is available on the replica.
	util.SucceedsSoon(t, func() error {
		getArgs := getArgs([]byte("a"))
		if reply, err := client.SendWrappedWith(rg1(mtc.stores[1]), nil, roachpb.Header{
			ReadConsistency: roachpb.INCONSISTENT,
		}, &getArgs); err != nil {
			return util.Errorf("failed to read data: %s", err)
		} else if e, v := int64(5), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
			return util.Errorf("failed to read correct data: expected %d, got %d", e, v)
		}
		return nil
	})
}
func TestGossipOrphanedStallDetection(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	local := startGossip(1, stopper, t, metric.NewRegistry())
	local.SetStallInterval(5 * time.Millisecond)

	// Make sure we have the sentinel to ensure that its absence is not the
	// cause of stall detection.
	if err := local.AddInfo(KeySentinel, nil, time.Hour); err != nil {
		t.Fatal(err)
	}

	peerStopper := stop.NewStopper()
	peer := startGossip(2, peerStopper, t, metric.NewRegistry())
	peerNodeID := peer.GetNodeID()
	peerAddr := peer.GetNodeAddr()
	local.startClient(peerAddr, peerNodeID)

	util.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("%d not yet connected", peerNodeID)
	})

	local.bootstrap()
	local.manage()

	peerStopper.Stop()

	util.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return errors.Errorf("%d still connected", peerNodeID)
			}
		}
		return nil
	})

	peerStopper = stop.NewStopper()
	defer peerStopper.Stop()
	peer = startGossipAtAddr(peerNodeID, peerAddr, peerStopper, t, metric.NewRegistry())

	util.SucceedsSoon(t, func() error {
		for _, peerID := range local.Outgoing() {
			if peerID == peerNodeID {
				return nil
			}
		}
		return errors.Errorf("%d not yet connected", peerNodeID)
	})
}
func TestOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	serverTime := time.Unix(0, 20)
	serverClock := hlc.NewClock(serverTime.UnixNano)
	serverCtx := newNodeTestContext(serverClock, stopper)
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()

	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              serverClock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	// Create a client clock that is behind the server clock.
	clientAdvancing := AdvancingClock{time: time.Unix(0, 10)}
	clientClock := hlc.NewClock(clientAdvancing.UnixNano)
	clientClock.SetMaxOffset(time.Millisecond)
	clientCtx := newNodeTestContext(clientClock, stopper)
	clientCtx.RemoteClocks.offsetTTL = 5 * clientAdvancing.advancementInterval
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}

	expectedOffset := RemoteOffset{Offset: 10, Uncertainty: 0, MeasuredAt: 10}
	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return util.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		} else if o != expectedOffset {
			return util.Errorf("expected:\n%v\nactual:\n%v", expectedOffset, o)
		}
		return nil
	})

	// Change the client such that it receives a heartbeat right after the
	// maximum clock reading delay.
	clientAdvancing.Lock()
	clientAdvancing.advancementInterval = maximumPingDurationMult*clientClock.MaxOffset() + 1*time.Nanosecond
	clientAdvancing.Unlock()

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if o, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return util.Errorf("expected offset to have been cleared, but found %s", o)
		}
		return nil
	})
}
// TestRaftTransportCircuitBreaker verifies that messages will be
// dropped while waiting for the raft node connection to be established.
func TestRaftTransportCircuitBreaker(t *testing.T) {
	defer leaktest.AfterTest(t)()
	rttc := newRaftTransportTestContext(t)
	defer rttc.Stop()

	serverReplica := roachpb.ReplicaDescriptor{
		NodeID:    2,
		StoreID:   2,
		ReplicaID: 2,
	}
	_, serverAddr := rttc.AddNodeWithoutGossip(serverReplica.NodeID)
	serverChannel := rttc.ListenStore(serverReplica.NodeID, serverReplica.StoreID)

	clientReplica := roachpb.ReplicaDescriptor{
		NodeID:    1,
		StoreID:   1,
		ReplicaID: 1,
	}
	clientTransport := rttc.AddNode(clientReplica.NodeID)

	// The transport is set up asynchronously, so we expect the first
	// Send to return true here.
	if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
		t.Errorf("unexpectedly failed sending while connection is being asynchronously established")
	}

	// However, sending repeated messages should begin dropping once
	// the circuit breaker does trip.
	util.SucceedsSoon(t, func() error {
		if rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 1}) {
			return errors.Errorf("expected circuit breaker to trip")
		}
		return nil
	})

	// Now, gossip the address of the server.
	rttc.GossipNode(serverReplica.NodeID, serverAddr)

	// Keep sending commit=2 until the breaker resets and we receive the
	// first instance. It's possible an earlier message for commit=1
	// snuck in.
	util.SucceedsSoon(t, func() error {
		if !rttc.Send(clientReplica, serverReplica, 1, raftpb.Message{Commit: 2}) {
			clientTransport.GetCircuitBreaker(serverReplica.NodeID).Reset()
		}
		select {
		case req := <-serverChannel.ch:
			if req.Message.Commit == 2 {
				return nil
			}
		default:
		}
		return errors.Errorf("expected message commit=2")
	})
}
// TestRangeSplitsWithWritePressure sets the zone config max bytes for
// a range to 256K and writes data until there are five ranges.
func TestRangeSplitsWithWritePressure(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// Override the default zone config.
	cfg := config.DefaultZoneConfig()
	cfg.RangeMaxBytes = 1 << 18
	defer config.TestingSetDefaultZoneConfig(cfg)()

	dbCtx := client.DefaultDBContext()
	dbCtx.TxnRetryOptions = retry.Options{
		InitialBackoff: 1 * time.Millisecond,
		MaxBackoff:     10 * time.Millisecond,
		Multiplier:     2,
	}
	s, _ := createTestDBWithContext(t, dbCtx)
	// This is purely to silence log spam.
	config.TestingSetupZoneConfigHook(s.Stopper)
	defer s.Stop()

	// Start a test writer that writes about 32K per key so that not too many
	// writes are necessary to split the range.
	done := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(1)
	go startTestWriter(s.DB, int64(0), 1<<15, &wg, nil, nil, done, t)

	// Check that we split 5 times in the allotted time.
	util.SucceedsSoon(t, func() error {
		// Scan the meta2 range descriptors.
		rows, err := s.DB.Scan(keys.Meta2Prefix, keys.MetaMax, 0)
		if err != nil {
			return util.Errorf("failed to scan meta2 keys: %s", err)
		}
		if lr := len(rows); lr < 5 {
			return util.Errorf("expected >= 5 scans; got %d", lr)
		}
		return nil
	})
	close(done)
	wg.Wait()

	// This write pressure test often causes splits while resolve
	// intents are in flight, causing them to fail with range key
	// mismatch errors. However, LocalSender should retry in these
	// cases. Check here via MVCC scan that there are no dangling write
	// intents. We do this using a SucceedsSoon construct to account
	// for timing of finishing the test writer and a possibly-ongoing
	// asynchronous split.
	util.SucceedsSoon(t, func() error {
		if _, _, err := engine.MVCCScan(context.Background(), s.Eng, keys.LocalMax,
			roachpb.KeyMax, 0, hlc.MaxTimestamp, true, nil); err != nil {
			return util.Errorf("failed to verify no dangling intents: %s", err)
		}
		return nil
	})
}
// TestScannerDisabled verifies that disabling a scanner prevents
// replicas from being added to queues.
func TestScannerDisabled(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	ranges := newTestRangeSet(count, t)
	q := &testQueue{}
	s := newReplicaScanner(1*time.Millisecond, 0, ranges)
	s.AddQueues(q)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	stopper := stop.NewStopper()
	s.Start(clock, stopper)
	defer stopper.Stop()

	// Verify queue gets all ranges.
	util.SucceedsSoon(t, func() error {
		if q.count() != count {
			return errors.Errorf("expected %d replicas; have %d", count, q.count())
		}
		if s.scanCount() == 0 {
			return errors.Errorf("expected scanner count to increment")
		}
		return nil
	})

	lastWaitEnabledCount := s.waitEnabledCount()

	// Now, disable the scanner.
	s.SetDisabled(true)
	util.SucceedsSoon(t, func() error {
		if s.waitEnabledCount() == lastWaitEnabledCount {
			return errors.Errorf("expected scanner to stop when disabled")
		}
		return nil
	})

	lastScannerCount := s.scanCount()

	// Remove the replicas and verify the scanner still removes them while disabled.
	ranges.Visit(func(repl *Replica) bool {
		s.RemoveReplica(repl)
		return true
	})

	util.SucceedsSoon(t, func() error {
		if qc := q.count(); qc != 0 {
			return errors.Errorf("expected queue to be empty after replicas removed from scanner; got %d", qc)
		}
		return nil
	})

	if sc := s.scanCount(); sc != lastScannerCount {
		t.Errorf("expected scanner count to not increment: %d != %d", sc, lastScannerCount)
	}
}
func TestFailedOffsetMeasurement(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(hlc.NewManualClock(1).UnixNano)

	serverCtx := newNodeTestContext(clock, stopper)
	serverCtx.RemoteClocks.monitorInterval = 100 * time.Millisecond
	s, ln := newTestServer(t, serverCtx, true)
	remoteAddr := ln.Addr().String()

	heartbeat := &ManualHeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
		ready:              make(chan struct{}),
		stopper:            stopper,
	}
	RegisterHeartbeatServer(s, heartbeat)

	// Create a client that never receives a heartbeat after the first.
	clientCtx := newNodeTestContext(clock, stopper)
	// Increase the timeout so that failure arises from exceeding the maximum
	// clock reading delay, not the timeout.
	clientCtx.HeartbeatTimeout = 20 * clientCtx.HeartbeatInterval
	_, err := clientCtx.GRPCDial(remoteAddr)
	if err != nil {
		t.Fatal(err)
	}
	heartbeat.ready <- struct{}{} // Allow one heartbeat for initialization.

	util.SucceedsSoon(t, func() error {
		clientCtx.RemoteClocks.mu.Lock()
		defer clientCtx.RemoteClocks.mu.Unlock()

		if _, ok := clientCtx.RemoteClocks.mu.offsets[remoteAddr]; !ok {
			return util.Errorf("expected offset of %s to be initialized, but it was not", remoteAddr)
		}
		return nil
	})

	util.SucceedsSoon(t, func() error {
		serverCtx.RemoteClocks.mu.Lock()
		defer serverCtx.RemoteClocks.mu.Unlock()

		if o, ok := serverCtx.RemoteClocks.mu.offsets[remoteAddr]; ok {
			return util.Errorf("expected offset of %s to not be initialized, but it was: %v", remoteAddr, o)
		}
		return nil
	})
}
// TestReplicaGCQueueDropReplicaDirect verifies that a removed replica is
// immediately cleaned up.
func TestReplicaGCQueueDropReplicaDirect(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := &multiTestContext{}
	const numStores = 3
	rangeID := roachpb.RangeID(1)

	// In this test, the Replica on the second Node is removed, and the test
	// verifies that that Node adds this Replica to its RangeGCQueue. However,
	// the queue does a consistent lookup which will usually be read from
	// Node 1. Hence, if Node 1 hasn't processed the removal when Node 2 has,
	// no GC will take place since the consistent RangeLookup hits the first
	// Node. We use the TestingCommandFilter to make sure that the second Node
	// waits for the first.
	ctx := storage.TestStoreContext()
	mtc.storeContext = &ctx
	mtc.storeContext.TestingKnobs.TestingCommandFilter = func(filterArgs storageutils.FilterArgs) error {
		et, ok := filterArgs.Req.(*roachpb.EndTransactionRequest)
		if !ok || filterArgs.Sid != 2 {
			return nil
		}
		rct := et.InternalCommitTrigger.GetChangeReplicasTrigger()
		if rct == nil || rct.ChangeType != roachpb.REMOVE_REPLICA {
			return nil
		}
		util.SucceedsSoon(t, func() error {
			r, err := mtc.stores[0].GetReplica(rangeID)
			if err != nil {
				return err
			}
			if i, _ := r.Desc().FindReplica(2); i >= 0 {
				return errors.New("expected second node gone from first node's known replicas")
			}
			return nil
		})
		return nil
	}

	mtc.Start(t, numStores)
	defer mtc.Stop()

	mtc.replicateRange(rangeID, 1, 2)
	mtc.unreplicateRange(rangeID, 1)

	// Make sure the range is removed from the store.
	util.SucceedsSoon(t, func() error {
		if _, err := mtc.stores[1].GetReplica(rangeID); !testutils.IsError(err, "range .* was not found") {
			return util.Errorf("expected range removal")
		}
		return nil
	})
}
// TestBaseQueueProcess verifies that items from the queue are
// processed according to the timer function.
func TestBaseQueueProcess(t *testing.T) {
	defer leaktest.AfterTest(t)()
	g, stopper := gossipForTest(t)
	defer stopper.Stop()

	r1 := &Replica{RangeID: 1}
	if err := r1.setDesc(&roachpb.RangeDescriptor{RangeID: 1}); err != nil {
		t.Fatal(err)
	}
	r2 := &Replica{RangeID: 2}
	if err := r2.setDesc(&roachpb.RangeDescriptor{RangeID: 2}); err != nil {
		t.Fatal(err)
	}
	testQueue := &testQueueImpl{
		blocker: make(chan struct{}, 1),
		shouldQueueFn: func(now roachpb.Timestamp, r *Replica) (shouldQueue bool, priority float64) {
			shouldQueue = true
			priority = float64(r.RangeID)
			return
		},
	}
	bq := makeBaseQueue("test", testQueue, g, 2)
	mc := hlc.NewManualClock(0)
	clock := hlc.NewClock(mc.UnixNano)
	bq.Start(clock, stopper)

	bq.MaybeAdd(r1, roachpb.ZeroTimestamp)
	bq.MaybeAdd(r2, roachpb.ZeroTimestamp)
	if pc := atomic.LoadInt32(&testQueue.processed); pc != 0 {
		t.Errorf("expected no processed ranges; got %d", pc)
	}

	testQueue.blocker <- struct{}{}
	util.SucceedsSoon(t, func() error {
		if pc := atomic.LoadInt32(&testQueue.processed); pc != int32(1) {
			return util.Errorf("expected %d processed replicas; got %d", 1, pc)
		}
		return nil
	})

	testQueue.blocker <- struct{}{}
	util.SucceedsSoon(t, func() error {
		if pc := atomic.LoadInt32(&testQueue.processed); pc < int32(2) {
			return util.Errorf("expected >= %d processed replicas; got %d", 2, pc)
		}
		return nil
	})

	// Ensure the test queue is not blocked on a stray call to
	// testQueueImpl.timer().
	close(testQueue.blocker)
}
// TestStoreZoneUpdateAndRangeSplit verifies that modifying the zone
// configuration changes a range's max bytes and that Range.maybeSplit()
// takes max bytes into account when deciding whether to enqueue a range
// for splitting. It further verifies that the range is in fact split once
// it exceeds the zone's RangeMaxBytes.
func TestStoreZoneUpdateAndRangeSplit(t *testing.T) {
	defer leaktest.AfterTest(t)()
	store, stopper, _ := createTestStore(t)
	config.TestingSetupZoneConfigHook(stopper)
	defer stopper.Stop()

	maxBytes := int64(1 << 16)
	// Set max bytes.
	descID := uint32(keys.MaxReservedDescID + 1)
	config.TestingSetZoneConfig(descID, &config.ZoneConfig{RangeMaxBytes: maxBytes})

	// Trigger gossip callback.
	if err := store.Gossip().AddInfoProto(gossip.KeySystemConfig, &config.SystemConfig{}, 0); err != nil {
		t.Fatal(err)
	}

	tableBoundary := keys.MakeTablePrefix(descID)

	{
		var rng *storage.Replica

		// Wait for the range to be split along table boundaries.
		expectedRSpan := roachpb.RSpan{Key: roachpb.RKey(tableBoundary), EndKey: roachpb.RKeyMax}
		util.SucceedsSoon(t, func() error {
			rng = store.LookupReplica(tableBoundary, nil)
			if actualRSpan := rng.Desc().RSpan(); !actualRSpan.Equal(expectedRSpan) {
				return util.Errorf("expected range %s to span %s", rng, expectedRSpan)
			}
			return nil
		})

		// Check the range's max bytes settings.
		if actualMaxBytes := rng.GetMaxBytes(); actualMaxBytes != maxBytes {
			t.Fatalf("range %s max bytes mismatch, got: %d, expected: %d", rng, actualMaxBytes, maxBytes)
		}

		// Fill the range past max bytes within the table prefix we're writing to.
		fillRange(store, rng.RangeID, tableBoundary, maxBytes, t)
	}

	// Verify that the range is in fact split.
	util.SucceedsSoon(t, func() error {
		rng := store.LookupReplica(keys.MakeTablePrefix(descID+1), nil)
		rngDesc := rng.Desc()
		rngStart, rngEnd := rngDesc.StartKey, rngDesc.EndKey
		if rngStart.Equal(tableBoundary) || !rngEnd.Equal(roachpb.RKeyMax) {
			return util.Errorf("range %s has not yet split", rng)
		}
		return nil
	})
}
// TestNodeJoin verifies a new node is able to join a bootstrapped
// cluster consisting of one node.
func TestNodeJoin(t *testing.T) {
	defer leaktest.AfterTest(t)()
	engineStopper := stop.NewStopper()
	defer engineStopper.Stop()
	e := engine.NewInMem(roachpb.Attributes{}, 1<<20, engineStopper)
	if _, err := bootstrapCluster([]engine.Engine{e}, kv.NewTxnMetrics(metric.NewRegistry())); err != nil {
		t.Fatal(err)
	}

	// Start the bootstrap node.
	engines1 := []engine.Engine{e}
	addr1 := util.CreateTestAddr("tcp")
	_, server1Addr, node1, stopper1 := createAndStartTestNode(addr1, engines1, addr1, t)
	defer stopper1.Stop()

	// Create a new node.
	engines2 := []engine.Engine{engine.NewInMem(roachpb.Attributes{}, 1<<20, engineStopper)}
	addr2 := util.CreateTestAddr("tcp")
	_, server2Addr, node2, stopper2 := createAndStartTestNode(addr2, engines2, server1Addr, t)
	defer stopper2.Stop()

	// Verify new node is able to bootstrap its store.
	util.SucceedsSoon(t, func() error {
		if sc := node2.stores.GetStoreCount(); sc != 1 {
			return util.Errorf("GetStoreCount() expected 1; got %d", sc)
		}
		return nil
	})

	// Verify node1 sees node2 via gossip and vice versa.
	node1Key := gossip.MakeNodeIDKey(node1.Descriptor.NodeID)
	node2Key := gossip.MakeNodeIDKey(node2.Descriptor.NodeID)
	util.SucceedsSoon(t, func() error {
		var nodeDesc1 roachpb.NodeDescriptor
		if err := node1.ctx.Gossip.GetInfoProto(node2Key, &nodeDesc1); err != nil {
			return err
		}
		if addr2Str, server2AddrStr := nodeDesc1.Address.String(), server2Addr.String(); addr2Str != server2AddrStr {
			return util.Errorf("addr2 gossip %s doesn't match addr2 address %s", addr2Str, server2AddrStr)
		}
		var nodeDesc2 roachpb.NodeDescriptor
		if err := node2.ctx.Gossip.GetInfoProto(node1Key, &nodeDesc2); err != nil {
			return err
		}
		if addr1Str, server1AddrStr := nodeDesc2.Address.String(), server1Addr.String(); addr1Str != server1AddrStr {
			return util.Errorf("addr1 gossip %s doesn't match addr1 address %s", addr1Str, server1AddrStr)
		}
		return nil
	})
}
// TestScannerTiming verifies that ranges are scanned, regardless
// of how many, to match scanInterval.
func TestScannerTiming(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const count = 3
	const runTime = 100 * time.Millisecond
	const maxError = 7500 * time.Microsecond
	durations := []time.Duration{
		15 * time.Millisecond,
		25 * time.Millisecond,
	}
	for i, duration := range durations {
		util.SucceedsSoon(t, func() error {
			ranges := newTestRangeSet(count, t)
			q := &testQueue{}
			s := newReplicaScanner(duration, 0, ranges)
			s.AddQueues(q)
			mc := hlc.NewManualClock(0)
			clock := hlc.NewClock(mc.UnixNano)
			stopper := stop.NewStopper()
			s.Start(clock, stopper)
			time.Sleep(runTime)
			stopper.Stop()

			avg := s.avgScan()
			log.Infof("%d: average scan: %s", i, avg)
			if avg.Nanoseconds()-duration.Nanoseconds() > maxError.Nanoseconds() ||
				duration.Nanoseconds()-avg.Nanoseconds() > maxError.Nanoseconds() {
				return errors.Errorf("expected %s, got %s: exceeds max error of %s", duration, avg, maxError)
			}
			return nil
		})
	}
}
// TestMetricsRecording verifies that Node statistics are periodically recorded
// as time series data.
func TestMetricsRecording(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tsrv := TestServer{}
	tsrv.Ctx = NewTestContext()
	tsrv.Ctx.MetricsFrequency = 5 * time.Millisecond
	if err := tsrv.Start(); err != nil {
		t.Fatal(err)
	}
	defer tsrv.Stop()

	checkTimeSeriesKey := func(now int64, keyName string) error {
		key := ts.MakeDataKey(keyName, "", ts.Resolution10s, now)
		data := roachpb.InternalTimeSeriesData{}
		return tsrv.db.GetProto(key, &data).GoError()
	}

	// Verify that metrics for the current timestamp are recorded. This should
	// be true very quickly.
	util.SucceedsSoon(t, func() error {
		now := tsrv.Clock().PhysicalNow()
		if err := checkTimeSeriesKey(now, "cr.store.livebytes.1"); err != nil {
			return err
		}
		if err := checkTimeSeriesKey(now, "cr.node.sys.allocbytes.1"); err != nil {
			return err
		}
		return nil
	})
}
func gossipForTest(t *testing.T) (*gossip.Gossip, *stop.Stopper) {
	stopper := stop.NewStopper()

	// Set up a fake zone config handler.
	config.TestingSetupZoneConfigHook(stopper)

	rpcContext := rpc.NewContext(&base.Context{}, hlc.NewClock(hlc.UnixNano), stopper)
	g := gossip.New(rpcContext, gossip.TestBootstrap, stopper)
	// g.SetNodeID must be called before g.AddInfo.
	g.SetNodeID(roachpb.NodeID(1))

	// Put an empty system config into gossip.
	if err := g.AddInfoProto(gossip.KeySystemConfig, &config.SystemConfig{}, 0); err != nil {
		t.Fatal(err)
	}

	// Wait for the SystemConfig.
	util.SucceedsSoon(t, func() error {
		if g.GetSystemConfig() == nil {
			return util.Errorf("expected non-nil system config")
		}
		return nil
	})

	return g, stopper
}
func checkRangeReplication(t *testing.T, c cluster.Cluster, d time.Duration) {
	// Always talk to node 0.
	client, dbStopper := makeClient(t, c.ConnString(0))
	defer dbStopper.Stop()

	wantedReplicas := 3
	if c.NumNodes() < 3 {
		wantedReplicas = c.NumNodes()
	}

	log.Infof("waiting for first range to have %d replicas", wantedReplicas)

	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		case <-time.After(1 * time.Second):
		}

		foundReplicas, err := countRangeReplicas(client)
		if err != nil {
			return err
		}

		if log.V(1) {
			log.Infof("found %d replicas", foundReplicas)
		}
		if foundReplicas >= wantedReplicas {
			return nil
		}
		return fmt.Errorf("expected %d replicas, only found %d", wantedReplicas, foundReplicas)
	})
}
// TestGossipCullNetwork verifies that a client will be culled from
// the network periodically (at cullInterval duration intervals).
func TestGossipCullNetwork(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// Create the local gossip and minPeers peers.
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t)
	local.SetCullInterval(5 * time.Millisecond)
	peers := []*Gossip{}
	for i := 0; i < minPeers; i++ {
		peers = append(peers, startGossip(roachpb.NodeID(i+2), stopper, t))
	}

	// Start clients to all peers and start the local gossip's manage routine.
	local.mu.Lock()
	for _, p := range peers {
		pAddr := p.is.NodeAddr
		local.startClient(&pAddr, stopper)
	}
	local.mu.Unlock()
	local.manage()

	util.SucceedsSoon(t, func() error {
		// Verify that a client is closed within the cull interval.
		if len(local.Outgoing()) == minPeers-1 {
			return nil
		}
		return errors.New("no network culling occurred")
	})
}
func TestEagerReplication(t *testing.T) {
	defer leaktest.AfterTest(t)()
	store, stopper, _ := createTestStore(t)
	defer stopper.Stop()

	// Disable the replica scanner so that we rely on the eager replication code
	// path that occurs after splits.
	store.SetReplicaScannerDisabled(true)

	if err := server.WaitForInitialSplits(store.DB()); err != nil {
		t.Fatal(err)
	}

	// WaitForInitialSplits will return as soon as the meta2 span contains the
	// expected number of descriptors. But the addition of replicas to the
	// replicateQueue after a split occurs happens after the update of the
	// descriptors in meta2, leaving a tiny window of time in which the newly
	// split replica will not have been added to purgatory. Thus we loop.
	util.SucceedsSoon(t, func() error {
		// After the initial splits have been performed, all of the resulting ranges
		// should be present in replicate queue purgatory (because we only have a
		// single store in the test and thus replication cannot succeed).
		expected := server.ExpectedInitialRangeCount()
		if n := store.ReplicateQueuePurgatoryLength(); expected != n {
			return errors.Errorf("expected %d replicas in purgatory, but found %d", expected, n)
		}
		return nil
	})
}
// TestTxnCoordSenderHeartbeat verifies periodic heartbeat of the
// transaction record.
func TestTxnCoordSenderHeartbeat(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s := createTestDB(t)
	defer s.Stop()
	defer teardownHeartbeats(s.Sender)

	// Set heartbeat interval to 1ms for testing.
	s.Sender.heartbeatInterval = 1 * time.Millisecond

	initialTxn := client.NewTxn(*s.DB)
	if err := initialTxn.Put(roachpb.Key("a"), []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Verify 3 heartbeats.
	var heartbeatTS roachpb.Timestamp
	for i := 0; i < 3; i++ {
		util.SucceedsSoon(t, func() error {
			ok, txn, pErr := getTxn(s.Sender, &initialTxn.Proto)
			if !ok || pErr != nil {
				t.Fatalf("got txn: %t: %s", ok, pErr)
			}
			// Advance clock by 1ns.
			// Locking the TxnCoordSender to prevent a data race.
			s.Sender.Lock()
			s.Manual.Increment(1)
			s.Sender.Unlock()
			if heartbeatTS.Less(*txn.LastHeartbeat) {
				heartbeatTS = *txn.LastHeartbeat
				return nil
			}
			return util.Errorf("expected heartbeat")
		})
	}
}
// TestStoreRangeUpReplicate verifies that the replication queue will notice
// under-replicated ranges and replicate them.
func TestStoreRangeUpReplicate(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := startMultiTestContext(t, 3)
	defer mtc.Stop()

	// Initialize the gossip network.
	var wg sync.WaitGroup
	wg.Add(len(mtc.stores))
	key := gossip.MakePrefixPattern(gossip.KeyStorePrefix)
	mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ roachpb.Value) { wg.Done() })
	for _, s := range mtc.stores {
		s.GossipStore()
	}
	wg.Wait()

	// Once we know our peers, trigger a scan.
	mtc.stores[0].ForceReplicationScanAndProcess()

	// The range should become available on every node.
	util.SucceedsSoon(t, func() error {
		for _, s := range mtc.stores {
			r := s.LookupReplica(roachpb.RKey("a"), roachpb.RKey("b"))
			if r == nil {
				return util.Errorf("expected replica for keys \"a\" - \"b\"")
			}
		}
		return nil
	})
}
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
			return nil
		default:
		}

		var r struct {
			BuildInfo map[string]string
		}
		if err := getJSON(c.URL(0), "/_status/details/local", &r); err != nil {
			return err
		}
		for _, key := range []string{"goVersion", "tag", "time", "dependencies"} {
			if val, ok := r.BuildInfo[key]; !ok {
				t.Errorf("build info missing for \"%s\"", key)
			} else if val == "" {
				t.Errorf("build info not set for \"%s\"", key)
			}
		}
		return nil
	})
}
// TestStoreRangeSplitWithMaxBytesUpdate tests a scenario where a new
// zone config that updates the max bytes is set and triggers a range
// split.
func TestStoreRangeSplitWithMaxBytesUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()
	store, stopper, _ := createTestStore(t)
	config.TestingSetupZoneConfigHook(stopper)
	defer stopper.Stop()

	origRng := store.LookupReplica(roachpb.RKeyMin, nil)

	// Set max bytes.
	maxBytes := int64(1 << 16)
	descID := uint32(keys.MaxReservedDescID + 1)
	config.TestingSetZoneConfig(descID, &config.ZoneConfig{RangeMaxBytes: maxBytes})

	// Trigger gossip callback.
	if err := store.Gossip().AddInfoProto(gossip.KeySystemConfig, &config.SystemConfig{}, 0); err != nil {
		t.Fatal(err)
	}

	// Verify that the range is split and the new range has the correct max bytes.
	util.SucceedsSoon(t, func() error {
		newRng := store.LookupReplica(keys.MakeTablePrefix(descID), nil)
		if newRng.RangeID == origRng.RangeID {
			return util.Errorf("expected new range created by split")
		}
		if newRng.GetMaxBytes() != maxBytes {
			return util.Errorf("expected %d max bytes for the new range, but got %d",
				maxBytes, newRng.GetMaxBytes())
		}
		return nil
	})
}
func testBuildInfoInner(t *testing.T, c cluster.Cluster, cfg cluster.TestConfig) {
	checkGossip(t, c, 20*time.Second, hasPeers(c.NumNodes()))

	var details server.DetailsResponse
	util.SucceedsSoon(t, func() error {
		select {
		case <-stopper:
			t.Fatalf("interrupted")
		default:
		}
		return util.GetJSON(cluster.HTTPClient, c.URL(0)+"/_status/details/local", &details)
	})

	bi := details.BuildInfo
	testData := map[string]string{
		"go_version":   bi.GoVersion,
		"tag":          bi.Tag,
		"time":         bi.Time,
		"dependencies": bi.Dependencies,
	}
	for key, val := range testData {
		if val == "" {
			t.Errorf("build info not set for \"%s\"", key)
		}
	}
}
// TestClientDisconnectRedundant verifies that the gossip server
// will drop an outgoing client connection that is already an
// inbound client connection of another node.
func TestClientDisconnectRedundant(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t, metric.NewRegistry())
	remote := startGossip(2, stopper, t, metric.NewRegistry())
	// startClient requires locks are held, so acquire here.
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.mu.is.NodeAddr
	lAddr := local.mu.is.NodeAddr
	local.startClient(&rAddr, remote.mu.is.NodeID)
	remote.startClient(&lAddr, local.mu.is.NodeID)
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	util.SucceedsSoon(t, func() error {
		// Check which of the clients is connected to the other.
		ok1 := local.findClient(func(c *client) bool { return c.addr.String() == rAddr.String() }) != nil
		ok2 := remote.findClient(func(c *client) bool { return c.addr.String() == lAddr.String() }) != nil
		// We expect node 2 to disconnect; if both are still connected,
		// it's possible that node 1 gossiped before node 2 connected, in
		// which case we have to gossip from node 1 to trigger the
		// disconnect redundant client code.
		if ok1 && ok2 {
			if err := local.AddInfo("local-key", nil, time.Second); err != nil {
				t.Fatal(err)
			}
		} else if ok1 && !ok2 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return errors.New("local client to remote not yet closed as redundant")
	})
}
// TestTxnCoordSenderGCTimeout verifies that the coordinator cleans up extant
// transactions and intents after the lastUpdateNanos exceeds the timeout.
func TestTxnCoordSenderGCTimeout(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, sender := createTestDB(t)
	defer s.Stop()

	// Set heartbeat interval to 1ms for testing.
	sender.heartbeatInterval = 1 * time.Millisecond

	txn := client.NewTxn(context.Background(), *s.DB)
	key := roachpb.Key("a")
	if err := txn.Put(key, []byte("value")); err != nil {
		t.Fatal(err)
	}

	// Now, advance clock past the default client timeout.
	// Locking the TxnCoordSender to prevent a data race.
	sender.Lock()
	s.Manual.Set(defaultClientTimeout.Nanoseconds() + 1)
	sender.Unlock()

	txnID := *txn.Proto.ID

	util.SucceedsSoon(t, func() error {
		// Locking the TxnCoordSender to prevent a data race.
		sender.Lock()
		_, ok := sender.txns[txnID]
		sender.Unlock()
		if ok {
			return util.Errorf("expected garbage collection")
		}
		return nil
	})

	verifyCleanup(key, sender, s.Eng, t)
}
// TestClientDisallowMultipleConns verifies that the server disallows
// multiple connections from the same client node ID.
func TestClientDisallowMultipleConns(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t)
	remote := startGossip(2, stopper, t)
	local.mu.Lock()
	remote.mu.Lock()
	rAddr := remote.is.NodeAddr
	// Start two clients from local to remote. RPC client cache is
	// disabled via the context, so we'll start two different outgoing
	// connections.
	local.startClient(&rAddr, stopper)
	local.startClient(&rAddr, stopper)
	local.mu.Unlock()
	remote.mu.Unlock()
	local.manage()
	remote.manage()

	util.SucceedsSoon(t, func() error {
		// Verify that the remote server has only a single incoming
		// connection and the local server has only a single outgoing
		// connection.
		local.mu.Lock()
		remote.mu.Lock()
		outgoing := local.outgoing.len()
		incoming := remote.incoming.len()
		local.mu.Unlock()
		remote.mu.Unlock()
		if outgoing == 1 && incoming == 1 && verifyServerMaps(local, 0) && verifyServerMaps(remote, 1) {
			return nil
		}
		return util.Errorf("incorrect number of incoming (%d) or outgoing (%d) connections", incoming, outgoing)
	})
}
func gossipForTest(t *testing.T) (*gossip.Gossip, *stop.Stopper) {
	stopper := stop.NewStopper()

	// Set up a fake zone config handler.
	config.TestingSetupZoneConfigHook(stopper)

	rpcContext := rpc.NewContext(nil, nil, stopper)
	g := gossip.New(rpcContext, nil, stopper)
	// g.SetNodeID must be called before g.AddInfo.
	g.SetNodeID(roachpb.NodeID(1))

	// Put an empty system config into gossip.
	if err := g.AddInfoProto(gossip.KeySystemConfig, &config.SystemConfig{}, 0); err != nil {
		t.Fatal(err)
	}

	// Wait for the SystemConfig.
	util.SucceedsSoon(t, func() error {
		if _, ok := g.GetSystemConfig(); !ok {
			return util.Errorf("expected system config to be set")
		}
		return nil
	})

	return g, stopper
}
// TestMetricsRecording verifies that Node statistics are periodically recorded
// as time series data.
func TestMetricsRecording(t *testing.T) {
	defer leaktest.AfterTest(t)()
	s, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
		MetricsSampleInterval: 5 * time.Millisecond})
	defer s.Stopper().Stop()

	checkTimeSeriesKey := func(now int64, keyName string) error {
		key := ts.MakeDataKey(keyName, "", ts.Resolution10s, now)
		data := roachpb.InternalTimeSeriesData{}
		return kvDB.GetProto(key, &data)
	}

	// Verify that metrics for the current timestamp are recorded. This should
	// be true very quickly.
	util.SucceedsSoon(t, func() error {
		now := s.Clock().PhysicalNow()
		if err := checkTimeSeriesKey(now, "cr.store.livebytes.1"); err != nil {
			return err
		}
		if err := checkTimeSeriesKey(now, "cr.node.sys.go.allocbytes.1"); err != nil {
			return err
		}
		return nil
	})
}
// TestClientRetryBootstrap verifies that an initial failure to connect
// to a bootstrap host doesn't stall the bootstrapping process in the
// absence of any additional activity. This can happen during acceptance
// tests if the DNS can't look up hostnames when gossip is started.
func TestClientRetryBootstrap(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper := stop.NewStopper()
	defer stopper.Stop()
	local := startGossip(1, stopper, t)
	remote := startGossip(2, stopper, t)
	remote.mu.Lock()
	rAddr := remote.is.NodeAddr
	remote.mu.Unlock()

	if err := local.AddInfo("local-key", []byte("hello"), 0*time.Second); err != nil {
		t.Fatal(err)
	}

	local.SetBootstrapInterval(10 * time.Millisecond)
	local.SetResolvers([]resolver.Resolver{
		&testResolver{addr: rAddr.String(), numFails: 3, numSuccesses: 1},
	})
	local.bootstrap()
	local.manage()

	util.SucceedsSoon(t, func() error {
		if _, err := remote.GetInfo("local-key"); err != nil {
			return err
		}
		return nil
	})
}
// TestRemoveRangeWithoutGC ensures that we do not panic when a
// replica has been removed but not yet GC'd (and therefore
// does not have an active raft group).
func TestRemoveRangeWithoutGC(t *testing.T) {
	defer leaktest.AfterTest(t)()
	mtc := startMultiTestContext(t, 2)
	defer mtc.Stop()

	// Disable the GC queue and move the range from store 0 to 1.
	mtc.stores[0].DisableReplicaGCQueue(true)
	const rangeID roachpb.RangeID = 1
	mtc.replicateRange(rangeID, 1)
	mtc.unreplicateRange(rangeID, 0)

	// Wait for store 0 to process the removal.
	util.SucceedsSoon(t, func() error {
		rep, err := mtc.stores[0].GetReplica(rangeID)
		if err != nil {
			return err
		}
		desc := rep.Desc()
		if len(desc.Replicas) != 1 {
			return util.Errorf("range has %d replicas", len(desc.Replicas))
		}
		return nil
	})

	// The replica's data is still on disk even though the Replica
	// object is removed.
	var desc roachpb.RangeDescriptor
	descKey := keys.RangeDescriptorKey(roachpb.RKeyMin)
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey,
		mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil {
		t.Fatal(err)
	} else if !ok {
		t.Fatal("expected range descriptor to be present")
	}

	// Stop and restart the store to reset the replica's raftGroup
	// pointer to nil. As long as the store has not been restarted it
	// can continue to use its last known replica ID.
	mtc.stopStore(0)
	mtc.restartStore(0)
	// Turn off the GC queue to ensure that the replica is deleted at
	// startup instead of by the scanner. This is not 100% guaranteed
	// since the scanner could have already run at this point, but it
	// should be enough to prevent us from accidentally relying on the
	// scanner.
	mtc.stores[0].DisableReplicaGCQueue(true)

	// The Replica object is not recreated.
	if _, err := mtc.stores[0].GetReplica(rangeID); err == nil {
		t.Fatalf("expected replica to be missing")
	}

	// And the data is no longer on disk.
	if ok, err := engine.MVCCGetProto(mtc.stores[0].Engine(), descKey,
		mtc.stores[0].Clock().Now(), true, nil, &desc); err != nil {
		t.Fatal(err)
	} else if ok {
		t.Fatal("expected range descriptor to be absent")
	}
}