// TestRequestToUninitializedRange tests the behavior when a request // is sent to a node which should be a replica of the correct range // but has not yet received its initial snapshot. This would // previously panic due to a malformed error response from the server, // as seen in https://github.com/cockroachdb/cockroach/issues/6027. // // Prior to the other changes in the commit that introduced it, this // test would reliable trigger the panic from #6027. However, it // relies on some hacky tricks to both trigger the panic and shut down // cleanly. If this test needs a lot of maintenance in the future we // should be willing to get rid of it. func TestRequestToUninitializedRange(t *testing.T) { defer leaktest.AfterTest(t)() s := server.TestServer{StoresPerNode: 2} if err := s.Start(); err != nil { t.Fatalf("Could not start server: %v", err) } defer s.Stop() // Choose a range ID that is much larger than any that would be // created by initial splits. const rangeID = roachpb.RangeID(1000) // Set up a range with replicas on two stores of the same node. This // ensures that the DistSender will consider both replicas healthy // and will try to talk to both (so we can get a non-retryable error // from the second store). replica1 := roachpb.ReplicaDescriptor{ NodeID: 1, StoreID: 1, ReplicaID: 1, } replica2 := roachpb.ReplicaDescriptor{ NodeID: 1, StoreID: 2, ReplicaID: 2, } // HACK: remove the second store from the node to generate a // non-retryable error when we try to talk to it. store2, err := s.Stores().GetStore(2) if err != nil { t.Fatal(err) } s.Stores().RemoveStore(store2) // Create the uninitialized range by sending an isolated raft // message to the first store. conn, err := s.RPCContext().GRPCDial(s.ServingAddr()) if err != nil { t.Fatal(err) } raftClient := storage.NewMultiRaftClient(conn) ctx, cancel := context.WithCancel(context.Background()) defer cancel() stream, err := raftClient.RaftMessage(ctx) if err != nil { t.Fatal(err) } msg := storage.RaftMessageRequest{ GroupID: rangeID, ToReplica: replica1, FromReplica: replica2, Message: raftpb.Message{ Type: raftpb.MsgApp, To: 1, }, } if err := stream.Send(&msg); err != nil { t.Fatal(err) } // Make sure the replica was created. store1, err := s.Stores().GetStore(1) if err != nil { t.Fatal(err) } util.SucceedsSoon(t, func() error { if replica, err := store1.GetReplica(rangeID); err != nil { return util.Errorf("failed to look up replica: %s", err) } else if replica.IsInitialized() { return util.Errorf("expected replica to be uninitialized") } return nil }) // Create our own DistSender so we can force some requests to the // bogus range. The DistSender needs to be in scope for its own // MockRangeDescriptorDB closure. var sender *kv.DistSender sender = kv.NewDistSender(&kv.DistSenderContext{ Clock: s.Clock(), RPCContext: s.RPCContext(), RangeDescriptorDB: kv.MockRangeDescriptorDB( func(key roachpb.RKey, considerIntents, useReverseScan bool, ) ([]roachpb.RangeDescriptor, []roachpb.RangeDescriptor, *roachpb.Error) { if key.Equal(roachpb.RKeyMin) { // Pass through requests for the first range to the real sender. desc, err := sender.FirstRange() if err != nil { return nil, nil, roachpb.NewError(err) } return []roachpb.RangeDescriptor{*desc}, nil, nil } return []roachpb.RangeDescriptor{{ RangeID: rangeID, StartKey: roachpb.RKey(keys.Meta2Prefix), EndKey: roachpb.RKeyMax, Replicas: []roachpb.ReplicaDescriptor{replica1, replica2}, }}, nil, nil }), }, s.Gossip()) // Only inconsistent reads triggered the panic in #6027. hdr := roachpb.Header{ ReadConsistency: roachpb.INCONSISTENT, } req := roachpb.NewGet(roachpb.Key("asdf")) // Repeat the test a few times: due to the randomization between the // two replicas, each attempt only had a 50% chance of triggering // the panic. for i := 0; i < 5; i++ { _, pErr := client.SendWrappedWith(sender, context.Background(), hdr, req) // Each attempt fails with "store 2 not found" because that is the // non-retryable error. if !testutils.IsPError(pErr, "store 2 not found") { t.Fatal(pErr) } } }
// processQueue creates a client and sends messages from its designated queue // via that client, exiting when the client fails or when it idles out. All // messages remaining in the queue at that point are lost and a new instance of // processQueue should be started by the next message to be sent. // TODO(tschottdorf) should let raft know if the node is down; // need a feedback mechanism for that. Potentially easiest is to arrange for // the next call to Send() to fail appropriately. func (t *rpcTransport) processQueue(nodeID roachpb.NodeID, storeID roachpb.StoreID) { t.mu.Lock() ch, ok := t.queues[storeID] t.mu.Unlock() if !ok { return } // Clean-up when the loop below shuts down. defer func() { t.mu.Lock() delete(t.queues, storeID) t.mu.Unlock() }() addr, err := t.gossip.GetNodeIDAddress(nodeID) if err != nil { if log.V(1) { log.Errorf("could not get address for node %d: %s", nodeID, err) } return } var dialOpt grpc.DialOption if t.rpcContext.Insecure { dialOpt = grpc.WithInsecure() } else { tlsConfig, err := t.rpcContext.GetClientTLSConfig() if err != nil { log.Error(err) return } dialOpt = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)) } conn, err := grpc.Dial(addr.String(), dialOpt) if err != nil { log.Errorf("failed to dial: %v", err) return } defer func() { if err := conn.Close(); err != nil { log.Error(err) } }() client := storage.NewMultiRaftClient(conn) ctx := grpcutil.NewContextWithStopper(context.Background(), t.rpcContext.Stopper) stream, err := client.RaftMessage(ctx) if err != nil { log.Error(err) return } defer func() { if err := stream.CloseSend(); err != nil { log.Error(err) } }() var raftIdleTimer util.Timer defer raftIdleTimer.Stop() for { raftIdleTimer.Reset(raftIdleTimeout) select { case <-ctx.Done(): return case <-raftIdleTimer.C: raftIdleTimer.Read = true if log.V(1) { log.Infof("closing Raft transport to %d due to inactivity", nodeID) } return case req := <-ch: if err := stream.Send(req); err != nil { log.Error(err) return } } } }