// TestRequestToUninitializedRange tests the behavior when a request
// is sent to a node which should be a replica of the correct range
// but has not yet received its initial snapshot. This would
// previously panic due to a malformed error response from the server,
// as seen in https://github.com/cockroachdb/cockroach/issues/6027.
//
// Prior to the other changes in the commit that introduced it, this
// test would reliably trigger the panic from #6027. However, it
// relies on some hacky tricks to both trigger the panic and shut down
// cleanly. If this test needs a lot of maintenance in the future we
// should be willing to get rid of it.
func TestRequestToUninitializedRange(t *testing.T) {
	defer leaktest.AfterTest(t)()
	srv := server.TestServer{StoresPerNode: 2}
	if err := srv.Start(); err != nil {
		t.Fatalf("Could not start server: %v", err)
	}
	defer srv.Stop()

	// Use a range ID far beyond anything the initial splits would
	// allocate.
	const rangeID = roachpb.RangeID(1000)

	// Both replicas live on node 1, on different stores. That way the
	// DistSender treats both as healthy and tries each of them, which
	// lets the second store hand back a non-retryable error.
	localReplica := roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1}
	missingReplica := roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 2, ReplicaID: 2}

	// HACK: detach store 2 from the node so that any request addressed
	// to it fails with a non-retryable error.
	removedStore, err := srv.Stores().GetStore(2)
	if err != nil {
		t.Fatal(err)
	}
	srv.Stores().RemoveStore(removedStore)

	// Bring the uninitialized range into existence by delivering a
	// single, isolated raft message to store 1.
	conn, err := srv.RPCContext().GRPCDial(srv.ServingAddr())
	if err != nil {
		t.Fatal(err)
	}
	streamCtx, cancelStream := context.WithCancel(context.Background())
	defer cancelStream()
	stream, err := storage.NewMultiRaftClient(conn).RaftMessage(streamCtx)
	if err != nil {
		t.Fatal(err)
	}
	if err := stream.Send(&storage.RaftMessageRequest{
		GroupID:     rangeID,
		ToReplica:   localReplica,
		FromReplica: missingReplica,
		Message: raftpb.Message{
			Type: raftpb.MsgApp,
			To:   1,
		},
	}); err != nil {
		t.Fatal(err)
	}

	// Wait until store 1 knows about the replica, and check that it is
	// still uninitialized.
	firstStore, err := srv.Stores().GetStore(1)
	if err != nil {
		t.Fatal(err)
	}
	util.SucceedsSoon(t, func() error {
		replica, err := firstStore.GetReplica(rangeID)
		if err != nil {
			return util.Errorf("failed to look up replica: %s", err)
		}
		if replica.IsInitialized() {
			return util.Errorf("expected replica to be uninitialized")
		}
		return nil
	})

	// Build a private DistSender whose descriptor lookups steer requests
	// to the bogus range. The variable is declared before the lookup
	// function because that function calls back into the sender.
	var sender *kv.DistSender
	descriptorDB := kv.MockRangeDescriptorDB(
		func(key roachpb.RKey, considerIntents, useReverseScan bool,
		) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, *roachpb.Error) {
			if !key.Equal(roachpb.RKeyMin) {
				// Everything but the first range resolves to the bogus range.
				return []roachpb.RangeDescriptor{{
					RangeID:  rangeID,
					StartKey: roachpb.RKey(keys.Meta2Prefix),
					EndKey:   roachpb.RKeyMax,
					Replicas: []roachpb.ReplicaDescriptor{localReplica, missingReplica},
				}}, nil, nil
			}
			// Lookups for the first range are answered by the real sender.
			firstDesc, err := sender.FirstRange()
			if err != nil {
				return nil, nil, roachpb.NewError(err)
			}
			return []roachpb.RangeDescriptor{*firstDesc}, nil, nil
		})
	sender = kv.NewDistSender(&kv.DistSenderContext{
		Clock:             srv.Clock(),
		RPCContext:        srv.RPCContext(),
		RangeDescriptorDB: descriptorDB,
	}, srv.Gossip())

	// The panic in #6027 was only reachable through inconsistent reads.
	header := roachpb.Header{ReadConsistency: roachpb.INCONSISTENT}
	getReq := roachpb.NewGet(roachpb.Key("asdf"))

	// Replica selection is randomized, so a single attempt only had a
	// 50% chance of hitting the panic; run several.
	const attempts = 5
	for attempt := 0; attempt < attempts; attempt++ {
		_, pErr := client.SendWrappedWith(sender, context.Background(), header, getReq)
		// "store 2 not found" is the non-retryable error, so it is what
		// every attempt must end with.
		if !testutils.IsPError(pErr, "store 2 not found") {
			t.Fatal(pErr)
		}
	}
}
Example #2
0
// processQueue looks up the outgoing message queue registered for storeID,
// dials nodeID, and forwards every queued message over a Raft message stream.
// It returns when no queue is registered, when setting up the client fails,
// when a send fails, when the context created via
// grpcutil.NewContextWithStopper is done, or when no message arrives within
// raftIdleTimeout. Once the queue has been found it is unregistered on
// return, so messages still sitting in it are lost; the next message to be
// sent should start a new instance of processQueue.
// TODO(tschottdorf) should let raft know if the node is down;
// need a feedback mechanism for that. Potentially easiest is to arrange for
// the next call to Send() to fail appropriately.
func (t *rpcTransport) processQueue(nodeID roachpb.NodeID, storeID roachpb.StoreID) {
	t.mu.Lock()
	queue, registered := t.queues[storeID]
	t.mu.Unlock()
	if !registered {
		return
	}
	// From here on, every exit path unregisters the queue.
	defer func() {
		t.mu.Lock()
		delete(t.queues, storeID)
		t.mu.Unlock()
	}()

	nodeAddr, err := t.gossip.GetNodeIDAddress(nodeID)
	if err != nil {
		if log.V(1) {
			log.Errorf("could not get address for node %d: %s", nodeID, err)
		}
		return
	}

	// Choose transport security to match the RPC context's mode.
	var securityOpt grpc.DialOption
	if !t.rpcContext.Insecure {
		tlsConfig, err := t.rpcContext.GetClientTLSConfig()
		if err != nil {
			log.Error(err)
			return
		}
		securityOpt = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig))
	} else {
		securityOpt = grpc.WithInsecure()
	}

	conn, err := grpc.Dial(nodeAddr.String(), securityOpt)
	if err != nil {
		log.Errorf("failed to dial: %v", err)
		return
	}
	defer func() {
		if closeErr := conn.Close(); closeErr != nil {
			log.Error(closeErr)
		}
	}()

	ctx := grpcutil.NewContextWithStopper(context.Background(), t.rpcContext.Stopper)
	stream, err := storage.NewMultiRaftClient(conn).RaftMessage(ctx)
	if err != nil {
		log.Error(err)
		return
	}
	defer func() {
		if closeErr := stream.CloseSend(); closeErr != nil {
			log.Error(closeErr)
		}
	}()

	// Forward messages until the context is done, a send fails, or the
	// queue stays empty for raftIdleTimeout. The timer is re-armed on
	// every iteration, so the timeout measures idleness between messages.
	var idleTimer util.Timer
	defer idleTimer.Stop()
	for {
		idleTimer.Reset(raftIdleTimeout)
		select {
		case <-ctx.Done():
			return
		case <-idleTimer.C:
			// Flag the tick as read before giving up on the connection.
			idleTimer.Read = true
			if log.V(1) {
				log.Infof("closing Raft transport to %d due to inactivity", nodeID)
			}
			return
		case msg := <-queue:
			sendErr := stream.Send(msg)
			if sendErr == nil {
				continue
			}
			log.Error(sendErr)
			return
		}
	}
}