Ejemplo n.º 1
0
// makeBaseQueue returns a new instance of baseQueue with the
// specified shouldQueue function to determine which replicas to queue
// and maxSize to limit the growth of the queue. Note that
// maxSize doesn't prevent new replicas from being added, it just
// limits the total size. Higher priority replicas can still be
// added; their addition simply removes the lowest priority replica.
func makeBaseQueue(
	name string,
	impl queueImpl,
	store *Store,
	gossip *gossip.Gossip,
	cfg queueConfig,
) baseQueue {
	bq := baseQueue{
		name:        name,
		impl:        impl,
		store:       store,
		gossip:      gossip,
		queueConfig: cfg,
		incoming:    make(chan struct{}, 1),
	}
	bq.mu.Locker = new(syncutil.Mutex)
	bq.mu.replicas = map[roachpb.RangeID]*replicaItem{}

	bq.ctx = context.TODO()
	// Prepend [name] to logs.
	bq.ctx = log.WithLogTag(bq.ctx, name, nil)
	bq.ctx = log.WithEventLog(bq.ctx, "queue", name)

	bq.processMu = new(syncutil.Mutex)
	return bq
}
Ejemplo n.º 2
0
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := log.WithLogTag(g.ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.mu.is.NodeID)
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
Ejemplo n.º 3
0
// NewServer creates a Server from a server.Context.
func NewServer(srvCtx Context, stopper *stop.Stopper) (*Server, error) {
	if _, err := net.ResolveTCPAddr("tcp", srvCtx.Addr); err != nil {
		return nil, errors.Errorf("unable to resolve RPC address %q: %v", srvCtx.Addr, err)
	}

	if srvCtx.Ctx == nil {
		srvCtx.Ctx = context.Background()
	}
	if srvCtx.Ctx.Done() != nil {
		panic("context with cancel or deadline")
	}
	if tracing.TracerFromCtx(srvCtx.Ctx) == nil {
		// TODO(radu): instead of modifying srvCtx.Ctx, we should have a separate
		// context.Context inside Server. We will need to rename server.Context
		// though.
		srvCtx.Ctx = tracing.WithTracer(srvCtx.Ctx, tracing.NewTracer())
	}

	if srvCtx.Insecure {
		log.Warning(srvCtx.Ctx, "running in insecure mode, this is strongly discouraged. See --insecure.")
	}
	// Try loading the TLS configs before anything else.
	if _, err := srvCtx.GetServerTLSConfig(); err != nil {
		return nil, err
	}
	if _, err := srvCtx.GetClientTLSConfig(); err != nil {
		return nil, err
	}

	s := &Server{
		mux:     http.NewServeMux(),
		clock:   hlc.NewClock(hlc.UnixNano),
		stopper: stopper,
	}
	// Add a dynamic log tag value for the node ID.
	//
	// We need to pass the server's Ctx as a base context for the various server
	// components, but we won't know the node ID until we Start(). At that point
	// it's too late to change the contexts in the components (various background
	// processes will have already started using the contexts).
	//
	// The dynamic value allows us to add the log tag to the context now and
	// update the value asynchronously. It's not significantly more expensive than
	// a regular tag since it's just doing an (atomic) load when a log/trace
	// message is constructed.
	s.nodeLogTagVal.Set(log.DynamicIntValueUnknown)
	srvCtx.Ctx = log.WithLogTag(srvCtx.Ctx, "n", &s.nodeLogTagVal)
	s.ctx = srvCtx

	s.clock.SetMaxOffset(srvCtx.MaxOffset)

	s.rpcContext = rpc.NewContext(srvCtx.Context, s.clock, s.stopper)
	s.rpcContext.HeartbeatCB = func() {
		if err := s.rpcContext.RemoteClocks.VerifyClockOffset(); err != nil {
			log.Fatal(s.Ctx(), err)
		}
	}
	s.grpc = rpc.NewServer(s.rpcContext)

	s.registry = metric.NewRegistry()
	s.gossip = gossip.New(
		s.Ctx(), s.rpcContext, s.grpc, s.ctx.GossipBootstrapResolvers, s.stopper, s.registry)
	s.storePool = storage.NewStorePool(
		s.gossip,
		s.clock,
		s.rpcContext,
		srvCtx.ReservationsEnabled,
		srvCtx.TimeUntilStoreDead,
		s.stopper,
	)

	// A custom RetryOptions is created which uses stopper.ShouldQuiesce() as
	// the Closer. This prevents infinite retry loops from occurring during
	// graceful server shutdown
	//
	// Such a loop loop occurs with the DistSender attempts a connection to the
	// local server during shutdown, and receives an internal server error (HTTP
	// Code 5xx). This is the correct error for a server to return when it is
	// shutting down, and is normally retryable in a cluster environment.
	// However, on a single-node setup (such as a test), retries will never
	// succeed because the only server has been shut down; thus, thus the
	// DistSender needs to know that it should not retry in this situation.
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = s.stopper.ShouldQuiesce()
	distSenderCfg := kv.DistSenderConfig{
		Ctx:             s.Ctx(),
		Clock:           s.clock,
		RPCContext:      s.rpcContext,
		RPCRetryOptions: &retryOpts,
	}
	s.distSender = kv.NewDistSender(&distSenderCfg, s.gossip)

	txnMetrics := kv.MakeTxnMetrics()
	s.registry.AddMetricStruct(txnMetrics)
	s.txnCoordSender = kv.NewTxnCoordSender(s.Ctx(), s.distSender, s.clock, srvCtx.Linearizable,
		s.stopper, txnMetrics)
	s.db = client.NewDB(s.txnCoordSender)

	s.raftTransport = storage.NewRaftTransport(storage.GossipAddressResolver(s.gossip), s.grpc, s.rpcContext)

	s.kvDB = kv.NewDBServer(s.ctx.Context, s.txnCoordSender, s.stopper)
	roachpb.RegisterExternalServer(s.grpc, s.kvDB)

	// Set up Lease Manager
	var lmKnobs sql.LeaseManagerTestingKnobs
	if srvCtx.TestingKnobs.SQLLeaseManager != nil {
		lmKnobs = *srvCtx.TestingKnobs.SQLLeaseManager.(*sql.LeaseManagerTestingKnobs)
	}
	s.leaseMgr = sql.NewLeaseManager(0, *s.db, s.clock, lmKnobs, s.stopper)
	s.leaseMgr.RefreshLeases(s.stopper, s.db, s.gossip)

	// Set up the DistSQL server
	distSQLCfg := distsql.ServerConfig{
		Context:    s.Ctx(),
		DB:         s.db,
		RPCContext: s.rpcContext,
	}
	s.distSQLServer = distsql.NewServer(distSQLCfg)
	distsql.RegisterDistSQLServer(s.grpc, s.distSQLServer)

	// Set up Executor
	execCfg := sql.ExecutorConfig{
		Context:      s.Ctx(),
		DB:           s.db,
		Gossip:       s.gossip,
		LeaseManager: s.leaseMgr,
		Clock:        s.clock,
		DistSQLSrv:   s.distSQLServer,
	}
	if srvCtx.TestingKnobs.SQLExecutor != nil {
		execCfg.TestingKnobs = srvCtx.TestingKnobs.SQLExecutor.(*sql.ExecutorTestingKnobs)
	} else {
		execCfg.TestingKnobs = &sql.ExecutorTestingKnobs{}
	}

	s.sqlExecutor = sql.NewExecutor(execCfg, s.stopper)
	s.registry.AddMetricStruct(s.sqlExecutor)

	s.pgServer = pgwire.MakeServer(s.ctx.Context, s.sqlExecutor)
	s.registry.AddMetricStruct(s.pgServer.Metrics())

	// TODO(bdarnell): make StoreConfig configurable.
	nCtx := storage.StoreContext{
		Ctx:                            s.Ctx(),
		Clock:                          s.clock,
		DB:                             s.db,
		Gossip:                         s.gossip,
		Transport:                      s.raftTransport,
		RaftTickInterval:               s.ctx.RaftTickInterval,
		ScanInterval:                   s.ctx.ScanInterval,
		ScanMaxIdleTime:                s.ctx.ScanMaxIdleTime,
		ConsistencyCheckInterval:       s.ctx.ConsistencyCheckInterval,
		ConsistencyCheckPanicOnFailure: s.ctx.ConsistencyCheckPanicOnFailure,
		StorePool:                      s.storePool,
		SQLExecutor: sql.InternalExecutor{
			LeaseManager: s.leaseMgr,
		},
		LogRangeEvents: true,
		AllocatorOptions: storage.AllocatorOptions{
			AllowRebalance: true,
		},
	}
	if srvCtx.TestingKnobs.Store != nil {
		nCtx.TestingKnobs = *srvCtx.TestingKnobs.Store.(*storage.StoreTestingKnobs)
	}

	s.recorder = status.NewMetricsRecorder(s.clock)
	s.registry.AddMetricStruct(s.rpcContext.RemoteClocks.Metrics())

	s.runtime = status.MakeRuntimeStatSampler(s.clock)
	s.registry.AddMetricStruct(s.runtime)

	s.node = NewNode(nCtx, s.recorder, s.registry, s.stopper, txnMetrics, sql.MakeEventLogger(s.leaseMgr))
	roachpb.RegisterInternalServer(s.grpc, s.node)
	storage.RegisterStoresServer(s.grpc, s.node.storesServer)

	s.tsDB = ts.NewDB(s.db)
	s.tsServer = ts.MakeServer(s.tsDB)

	s.admin = makeAdminServer(s)
	s.status = newStatusServer(s.db, s.gossip, s.recorder, s.ctx.Context, s.rpcContext, s.node.stores)
	for _, gw := range []grpcGatewayServer{&s.admin, s.status, &s.tsServer} {
		gw.RegisterService(s.grpc)
	}

	return s, nil
}