// CreateNode creates a simulation node and starts an RPC server for it.
func (n *Network) CreateNode() (*Node, error) {
	server := rpc.NewServer(n.rpcContext)
	ln, err := net.Listen(util.TestAddr.Network(), util.TestAddr.String())
	if err != nil {
		return nil, err
	}
	node := &Node{Server: server, Listener: ln, Registry: metric.NewRegistry()}
	node.Gossip = gossip.NewTest(0, n.rpcContext, server, nil, n.Stopper, node.Registry)
	n.Stopper.RunWorker(func() {
		<-n.Stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(ln.Close())
		<-n.Stopper.ShouldStop()
		server.Stop()
		node.Gossip.EnableSimulationCycler(false)
	})
	n.Nodes = append(n.Nodes, node)
	return node, nil
}
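// The worker above encodes the stopper's two-phase shutdown: ShouldQuiesce
// fires first, so the listener is closed and no new connections are accepted
// while in-flight work drains; ShouldStop fires afterwards, once it is safe
// to tear down the server and disable the simulation cycler. A minimal
// sketch of the same pattern, assuming an arbitrary listener ln and a server
// srv with a Stop method (both hypothetical names):
//
//	stopper.RunWorker(func() {
//		<-stopper.ShouldQuiesce() // phase 1: stop accepting new work
//		netutil.FatalIfUnexpected(ln.Close())
//		<-stopper.ShouldStop() // phase 2: in-flight work drained; tear down
//		srv.Stop()
//	})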
// StartNode initializes a gossip instance for the simulation node and
// starts it.
func (n *Network) StartNode(node *Node) error {
	node.Gossip.Start(node.Addr())
	node.Gossip.EnableSimulationCycler(true)
	n.nodeIDAllocator++
	node.Gossip.NodeID.Set(context.TODO(), n.nodeIDAllocator)
	if err := node.Gossip.SetNodeDescriptor(&roachpb.NodeDescriptor{
		NodeID:  node.Gossip.NodeID.Get(),
		Address: util.MakeUnresolvedAddr(node.Addr().Network(), node.Addr().String()),
	}); err != nil {
		return err
	}
	if err := node.Gossip.AddInfo(node.Addr().String(),
		encoding.EncodeUint64Ascending(nil, 0), time.Hour); err != nil {
		return err
	}
	n.Stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(node.Server.Serve(node.Listener))
	})
	return nil
}
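// startAll is a hypothetical driver loop, not part of the package, sketching
// how the two methods above compose: each node is first created (allocating
// its listener, RPC server, and gossip instance) and then started (assigning
// a NodeID, starting gossip, and serving RPCs). Only CreateNode and
// StartNode from this excerpt are assumed to exist.
func startAll(n *Network, nodeCount int) error {
	for i := 0; i < nodeCount; i++ {
		node, err := n.CreateNode() // listener + RPC server + gossip instance
		if err != nil {
			return err
		}
		if err := n.StartNode(node); err != nil { // NodeID, gossip, serve
			return err
		}
	}
	return nil
}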
// Start starts the server on the specified port, starts gossip and initializes
// the node using the engines from the server's context.
//
// The passed context can be used to trace the server startup. The context
// should represent the general startup operation.
func (s *Server) Start(ctx context.Context) error {
	ctx = s.AnnotateCtx(ctx)

	startTime := timeutil.Now()

	tlsConfig, err := s.cfg.GetServerTLSConfig()
	if err != nil {
		return err
	}

	httpServer := netutil.MakeServer(s.stopper, tlsConfig, s)
	plainRedirectServer := netutil.MakeServer(s.stopper, tlsConfig, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "https://"+r.Host+r.RequestURI, http.StatusPermanentRedirect)
	}))

	// The following code is a specialization of util/net.go's ListenAndServe
	// which adds pgwire support. A single port is used to serve all protocols
	// (pg, http, h2) via the following construction:
	//
	// non-TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// TLS case:
	// net.Listen -> cmux.New
	//               |
	//               -  -> pgwire.Match -> pgwire.Server.ServeConn
	//               -  -> cmux.Any -> grpc.(*Server).Serve
	//
	// Note that the difference between the TLS and non-TLS cases exists due to
	// Go's lack of an h2c (HTTP2 Clear Text) implementation. See inline comments
	// in util.ListenAndServe for an explanation of how h2c is implemented there
	// and here.
	ln, err := net.Listen("tcp", s.cfg.Addr)
	if err != nil {
		return err
	}
	log.Eventf(ctx, "listening on %s", s.cfg.Addr)
	unresolvedListenAddr, err := officialAddr(s.cfg.Addr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.Addr = unresolvedListenAddr.String()
	unresolvedAdvertAddr, err := officialAddr(s.cfg.AdvertiseAddr, ln.Addr())
	if err != nil {
		return err
	}
	s.cfg.AdvertiseAddr = unresolvedAdvertAddr.String()

	s.rpcContext.SetLocalInternalServer(s.node)

	m := cmux.New(ln)
	pgL := m.Match(pgwire.Match)
	anyL := m.Match(cmux.Any())

	httpLn, err := net.Listen("tcp", s.cfg.HTTPAddr)
	if err != nil {
		return err
	}
	unresolvedHTTPAddr, err := officialAddr(s.cfg.HTTPAddr, httpLn.Addr())
	if err != nil {
		return err
	}
	s.cfg.HTTPAddr = unresolvedHTTPAddr.String()

	workersCtx := s.AnnotateCtx(context.Background())

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		if err := httpLn.Close(); err != nil {
			log.Fatal(workersCtx, err)
		}
	})

	if tlsConfig != nil {
		httpMux := cmux.New(httpLn)
		clearL := httpMux.Match(cmux.HTTP1())
		tlsL := httpMux.Match(cmux.Any())

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(httpMux.Serve())
		})

		s.stopper.RunWorker(func() {
			netutil.FatalIfUnexpected(plainRedirectServer.Serve(clearL))
		})

		httpLn = tls.NewListener(tlsL, tlsConfig)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(httpServer.Serve(httpLn))
	})

	s.stopper.RunWorker(func() {
		<-s.stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(anyL.Close())
		<-s.stopper.ShouldStop()
		s.grpc.Stop()
	})

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.grpc.Serve(anyL))
	})

	s.stopper.RunWorker(func() {
		pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
		netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, pgL, func(conn net.Conn) {
			connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
			if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
				// Report the error on this connection's context, so that we
				// know which remote client caused the error when looking at
				// the logs.
				log.Error(connCtx, err)
			}
		}))
	})

	if len(s.cfg.SocketFile) != 0 {
		// Unix socket enabled: postgres protocol only.
		unixLn, err := net.Listen("unix", s.cfg.SocketFile)
		if err != nil {
			return err
		}

		s.stopper.RunWorker(func() {
			<-s.stopper.ShouldQuiesce()
			if err := unixLn.Close(); err != nil {
				log.Fatal(workersCtx, err)
			}
		})

		s.stopper.RunWorker(func() {
			pgCtx := s.pgServer.AmbientCtx.AnnotateCtx(context.Background())
			netutil.FatalIfUnexpected(httpServer.ServeWith(s.stopper, unixLn, func(conn net.Conn) {
				connCtx := log.WithLogTagStr(pgCtx, "client", conn.RemoteAddr().String())
				if err := s.pgServer.ServeConn(connCtx, conn); err != nil && !netutil.IsClosedConnection(err) {
					// Report the error on this connection's context, so that we
					// know which remote client caused the error when looking at
					// the logs.
					log.Error(connCtx, err)
				}
			}))
		})
	}

	// Enable the debug endpoints first to provide an earlier window
	// into what's going on with the node in advance of exporting node
	// functionality.
	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.HandleFunc(debugEndpoint, http.HandlerFunc(handleDebug))

	s.gossip.Start(unresolvedAdvertAddr)
	log.Event(ctx, "started gossip")

	s.engines, err = s.cfg.CreateEngines()
	if err != nil {
		return errors.Wrap(err, "failed to create engines")
	}
	s.stopper.AddCloser(&s.engines)

	// We might have to sleep a bit to protect against this node producing non-
	// monotonic timestamps. Before restarting, its clock might have been driven
	// by other nodes' fast clocks, but when we restarted, we lost all this
	// information. For example, a client might have written a value at a
	// timestamp that's in the future of the restarted node's clock, and if we
	// don't do something, the same client's read would not return the written
	// value. So, we wait up to MaxOffset; we couldn't have served timestamps more
	// than MaxOffset in the future (assuming that MaxOffset was not changed, see
	// #9733).
	//
	// As an optimization for tests, we don't sleep if all the stores are brand
	// new. In this case, the node will not serve anything anyway until it
	// synchronizes with other nodes.
	{
		anyStoreBootstrapped := false
		for _, e := range s.engines {
			if _, err := storage.ReadStoreIdent(ctx, e); err != nil {
				// NotBootstrappedError is expected.
				if _, ok := err.(*storage.NotBootstrappedError); !ok {
					return err
				}
			} else {
				anyStoreBootstrapped = true
				break
			}
		}
		if anyStoreBootstrapped {
			sleepDuration := s.clock.MaxOffset() - timeutil.Since(startTime)
			if sleepDuration > 0 {
				log.Infof(ctx, "sleeping for %s to guarantee HLC monotonicity", sleepDuration)
				time.Sleep(sleepDuration)
			}
		}
	}

	// Now that we have a monotonic HLC wrt previous incarnations of the process,
	// init all the replicas.
	err = s.node.start(
		ctx,
		unresolvedAdvertAddr,
		s.engines,
		s.cfg.NodeAttributes,
		s.cfg.Locality,
	)
	if err != nil {
		return err
	}
	log.Event(ctx, "started node")

	s.nodeLiveness.StartHeartbeat(ctx, s.stopper)

	// We can now add the node registry.
	s.recorder.AddNode(s.registry, s.node.Descriptor, s.node.startedAt)

	// Begin recording runtime statistics.
	s.startSampleEnvironment(s.cfg.MetricsSampleInterval)

	// Begin recording time series data collected by the status monitor.
	s.tsDB.PollSource(
		s.cfg.AmbientCtx, s.recorder, s.cfg.MetricsSampleInterval, ts.Resolution10s, s.stopper,
	)

	// Begin recording status summaries.
	s.node.startWriteSummaries(s.cfg.MetricsSampleInterval)

	// Create and start the schema change manager only after a NodeID
	// has been assigned.
	testingKnobs := &sql.SchemaChangerTestingKnobs{}
	if s.cfg.TestingKnobs.SQLSchemaChanger != nil {
		testingKnobs = s.cfg.TestingKnobs.SQLSchemaChanger.(*sql.SchemaChangerTestingKnobs)
	}
	sql.NewSchemaChangeManager(testingKnobs, *s.db, s.gossip, s.leaseMgr).Start(s.stopper)

	s.distSQLServer.Start()

	log.Infof(ctx, "starting %s server at %s", s.cfg.HTTPRequestScheme(), unresolvedHTTPAddr)
	log.Infof(ctx, "starting grpc/postgres server at %s", unresolvedListenAddr)
	log.Infof(ctx, "advertising CockroachDB node at %s", unresolvedAdvertAddr)
	if len(s.cfg.SocketFile) != 0 {
		log.Infof(ctx, "starting postgres server at unix:%s", s.cfg.SocketFile)
	}

	s.stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(m.Serve())
	})

	log.Event(ctx, "accepting connections")

	// Initialize grpc-gateway mux and context.
	jsonpb := &protoutil.JSONPb{
		EnumsAsInts:  true,
		EmitDefaults: true,
		Indent:       "  ",
	}
	protopb := new(protoutil.ProtoPb)
	gwMux := gwruntime.NewServeMux(
		gwruntime.WithMarshalerOption(gwruntime.MIMEWildcard, jsonpb),
		gwruntime.WithMarshalerOption(httputil.JSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.AltJSONContentType, jsonpb),
		gwruntime.WithMarshalerOption(httputil.ProtoContentType, protopb),
		gwruntime.WithMarshalerOption(httputil.AltProtoContentType, protopb),
	)
	gwCtx, gwCancel := context.WithCancel(s.AnnotateCtx(context.Background()))
	s.stopper.AddCloser(stop.CloserFn(gwCancel))

	// Setup HTTP<->gRPC handlers.
	conn, err := s.rpcContext.GRPCDial(s.cfg.Addr)
	if err != nil {
		return errors.Errorf("error constructing grpc-gateway: %s; are your certificates valid?", err)
	}

	for _, gw := range []grpcGatewayServer{s.admin, s.status, &s.tsServer} {
		if err := gw.RegisterGateway(gwCtx, gwMux, conn); err != nil {
			return err
		}
	}

	var uiFileSystem http.FileSystem
	uiDebug := envutil.EnvOrDefaultBool("COCKROACH_DEBUG_UI", false)
	if uiDebug {
		uiFileSystem = http.Dir("pkg/ui")
	} else {
		uiFileSystem = &assetfs.AssetFS{
			Asset:     ui.Asset,
			AssetDir:  ui.AssetDir,
			AssetInfo: ui.AssetInfo,
		}
	}
	uiFileServer := http.FileServer(uiFileSystem)

	s.mux.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/" {
			if uiDebug {
				r.URL.Path = "debug.html"
			} else {
				r.URL.Path = "release.html"
			}
		}
		uiFileServer.ServeHTTP(w, r)
	}))

	// TODO(marc): when cookie-based authentication exists,
	// apply it for all web endpoints.
	s.mux.Handle(adminPrefix, gwMux)
	s.mux.Handle(ts.URLPrefix, gwMux)
	s.mux.Handle(statusPrefix, gwMux)
	s.mux.Handle("/health", gwMux)
	s.mux.Handle(statusVars, http.HandlerFunc(s.status.handleVars))
	log.Event(ctx, "added http endpoints")

	if err := sdnotify.Ready(); err != nil {
		log.Errorf(ctx, "failed to signal readiness using systemd protocol: %s", err)
	}
	log.Event(ctx, "server ready")

	return nil
}
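// demoSinglePortMux is a hypothetical, minimal sketch (not part of the
// server) of the single-port construction documented in Start above, using
// the upstream cmux package (github.com/soheilhy/cmux; CockroachDB vendors a
// fork of it). Assumed imports: "io", "net", "net/http", and cmux. Matchers
// are tried in registration order, so the protocol-specific matcher
// (pgwire.Match in Start, HTTP1Fast here) must precede the catch-all
// cmux.Any(), which in Start feeds grpc.(*Server).Serve.
func demoSinglePortMux(addr string) error {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return err
	}
	m := cmux.New(ln)
	httpL := m.Match(cmux.HTTP1Fast()) // HTTP/1.x connections
	anyL := m.Match(cmux.Any())        // everything else (gRPC, pg, ...)

	go http.Serve(httpL, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		io.WriteString(w, "hello over http\n")
	}))
	go func() {
		// A real server hands anyL to something like grpc.(*Server).Serve;
		// here we just accept and drop connections.
		for {
			conn, err := anyL.Accept()
			if err != nil {
				return
			}
			conn.Close()
		}
	}()
	return m.Serve() // blocks, routing each conn by sniffing its first bytes
}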
// TestHeartbeatHealthTransport verifies that the health status changes after
// heartbeats succeed or fail due to transport failures.
func TestHeartbeatHealthTransport(t *testing.T) {
	defer leaktest.AfterTest(t)()

	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Can't be zero because that'd be an empty offset.
	clock := hlc.NewClock(time.Unix(0, 1).UnixNano, time.Nanosecond)

	serverCtx := newNodeTestContext(clock, stopper)
	// newTestServer with a custom listener.
	tlsConfig, err := serverCtx.GetServerTLSConfig()
	if err != nil {
		t.Fatal(err)
	}
	s := grpc.NewServer(grpc.Creds(credentials.NewTLS(tlsConfig)))
	ln, err := net.Listen("tcp", util.TestAddr.String())
	if err != nil {
		t.Fatal(err)
	}
	mu := struct {
		syncutil.Mutex
		conns []net.Conn
	}{}
	connectChan := make(chan struct{})
	defer close(connectChan)
	ln = &interceptingListener{Listener: ln, connectChan: connectChan, connCB: func(conn net.Conn) {
		mu.Lock()
		mu.conns = append(mu.conns, conn)
		mu.Unlock()
	}}
	stopper.RunWorker(func() {
		<-stopper.ShouldQuiesce()
		netutil.FatalIfUnexpected(ln.Close())
		<-stopper.ShouldStop()
		s.Stop()
	})

	stopper.RunWorker(func() {
		netutil.FatalIfUnexpected(s.Serve(ln))
	})

	remoteAddr := ln.Addr().String()
	RegisterHeartbeatServer(s, &HeartbeatService{
		clock:              clock,
		remoteClockMonitor: serverCtx.RemoteClocks,
	})

	clientCtx := newNodeTestContext(clock, stopper)
	// Make the intervals shorter to speed up the tests.
	clientCtx.HeartbeatInterval = 1 * time.Millisecond
	if _, err := clientCtx.GRPCDial(remoteAddr); err != nil {
		t.Fatal(err)
	}
	// Allow the connection to go through.
	connectChan <- struct{}{}

	// Everything is normal; should become healthy.
	testutils.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	closeConns := func() {
		mu.Lock()
		for _, conn := range mu.conns {
			if err := conn.Close(); err != nil {
				t.Fatal(err)
			}
		}
		mu.conns = mu.conns[:0]
		mu.Unlock()
	}

	// Close all the connections.
	closeConns()

	// Should become unhealthy now that the connection was closed.
	testutils.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should become healthy again after GRPC reconnects.
	connectChan <- struct{}{}
	testutils.SucceedsSoon(t, func() error {
		if !clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be healthy", remoteAddr)
		}
		return nil
	})

	// Close the listener and all the connections.
	if err := ln.Close(); err != nil {
		t.Fatal(err)
	}
	closeConns()

	// Should become unhealthy again now that the connection was closed.
	testutils.SucceedsSoon(t, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return nil
	})

	// Should stay unhealthy despite reconnection attempts.
	errUnhealthy := errors.New("connection is still unhealthy")
	if err := util.RetryForDuration(100*clientCtx.HeartbeatInterval, func() error {
		if clientCtx.IsConnHealthy(remoteAddr) {
			return errors.Errorf("expected %s to be unhealthy", remoteAddr)
		}
		return errUnhealthy
	}); err != errUnhealthy {
		t.Fatal(err)
	}
}
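// interceptingListener's definition is not part of this excerpt; the sketch
// below is a reconstruction from the call sites above and is an assumption,
// not the test file's actual code. Accept waits for a token on connectChan,
// which lets the test admit each (re)connection attempt explicitly, and it
// reports every accepted conn through connCB so closeConns can later close
// them to simulate a transport failure.

type interceptingListener struct {
	net.Listener
	connectChan chan struct{}
	connCB      func(net.Conn)
}

// Accept blocks until the test sends on connectChan (or the channel is
// closed), then delegates to the wrapped listener and records the accepted
// connection via connCB.
func (ln *interceptingListener) Accept() (net.Conn, error) {
	<-ln.connectChan // the test's `connectChan <- struct{}{}` unblocks this
	conn, err := ln.Listener.Accept()
	if err == nil {
		ln.connCB(conn)
	}
	return conn, err
}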