// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed on response, the client addr
// is modified and the method returns for forwarding by the caller.
func (c *client) gossip(g *Gossip, gossipClient GossipClient, stopper *stop.Stopper) error {
	// For an unbootstrapped node, g.is.NodeID is 0 when the client starts
	// gossiping, so read the node address from g.is fresh on every call.
	g.mu.Lock()
	addr := g.is.NodeAddr
	g.mu.Unlock()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	stream, err := gossipClient.Gossip(ctx)
	if err != nil {
		return err
	}

	if err := c.requestGossip(g, addr, stream); err != nil {
		return err
	}

	sendGossipChan := make(chan struct{}, 1)

	// Register a callback for gossip updates.
	updateCallback := func(_ string, _ roachpb.Value) {
		select {
		case sendGossipChan <- struct{}{}:
		default:
		}
	}
	// Defer calling "undoer" callback returned from registration.
	defer g.RegisterCallback(".*", updateCallback)()

	errCh := make(chan error, 1)
	stopper.RunWorker(func() {
		errCh <- func() error {
			for {
				reply, err := stream.Recv()
				if err != nil {
					return err
				}
				if err := c.handleResponse(g, reply); err != nil {
					return err
				}
			}
		}()
	})

	for {
		select {
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		case err := <-errCh:
			return err
		case <-sendGossipChan:
			if err := c.sendGossip(g, addr, stream); err != nil {
				return err
			}
		}
	}
}
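// The sendGossipChan idiom above coalesces bursts of callback firings into at
// most one pending wakeup: a buffered channel of capacity 1 combined with a
// non-blocking send means the consumer loop runs once per burst rather than
// once per update, and producers never block. A minimal standalone sketch of
// the pattern (assumed imports: "fmt", "time"; the names signal and notify
// are illustrative, not from the source):
func exampleCoalescedNotifications() {
	notify := make(chan struct{}, 1)

	// Producer side: record that "something changed" without blocking.
	signal := func() {
		select {
		case notify <- struct{}{}:
		default: // a wakeup is already pending; this update piggybacks on it
		}
	}

	// A burst of ten updates queues only a single wakeup.
	for i := 0; i < 10; i++ {
		signal()
	}

	// Consumer side: one receive drains the whole burst.
	consumed := 0
	for {
		select {
		case <-notify:
			consumed++
		case <-time.After(100 * time.Millisecond):
			fmt.Printf("woke up %d time(s) for 10 updates\n", consumed) // prints 1
			return
		}
	}
}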
// start will run continuously and mark stores as offline if they haven't been
// heard from in longer than timeUntilStoreDead.
func (sp *StorePool) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			var timeout time.Duration
			sp.mu.Lock()
			detail := sp.queue.peek()
			if detail == nil {
				// No stores yet, wait the full timeout.
				timeout = sp.timeUntilStoreDead
			} else {
				// Check to see if the store should be marked as dead.
				deadAsOf := detail.lastUpdatedTime.GoTime().Add(sp.timeUntilStoreDead)
				now := sp.clock.Now()
				if now.GoTime().After(deadAsOf) {
					deadDetail := sp.queue.dequeue()
					deadDetail.markDead(now)
					// The next store might be dead as well, set the timeout to
					// 0 to process it immediately.
					timeout = 0
				} else {
					// Store is still alive, schedule the next check for when
					// it should timeout.
					timeout = deadAsOf.Sub(now.GoTime())
				}
			}
			sp.mu.Unlock()
			select {
			case <-time.After(timeout):
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,  // first backoff at 5s
			MaxBackoff:     60 * time.Second, // max backoff is 60s
			Multiplier:     2,                // doubles
			Stopper:        stopper,          // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
		}
	})
}
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			g.mu.Lock()
			if g.closed {
				g.mu.Unlock()
				return
			}
			// Check whether or not we need to bootstrap.
			haveClients := g.outgoing.len() > 0
			haveSentinel := g.is.getInfo(KeySentinel) != nil
			if !haveClients || !haveSentinel {
				// Try to get another bootstrap address from the resolvers.
				if addr := g.getNextBootstrapAddress(); addr != nil {
					g.startClient(addr, g.bsRPCContext, stopper)
				}
			}
			g.mu.Unlock()

			// Block until we need bootstrapping again.
			select {
			case <-g.stalled:
				// continue
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(ctx context.Context, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		gossipStoresInterval := envutil.EnvOrDefaultDuration("gossip_stores_interval",
			gossip.DefaultGossipStoresInterval)
		statusTicker := time.NewTicker(gossipStatusInterval)
		storesTicker := time.NewTicker(gossipStoresInterval)
		nodeTicker := time.NewTicker(gossipNodeDescriptorInterval)
		defer statusTicker.Stop()
		defer storesTicker.Stop()
		defer nodeTicker.Stop()
		n.gossipStores(ctx) // one-off run before going to sleep
		for {
			select {
			case <-statusTicker.C:
				n.ctx.Gossip.LogStatus()
			case <-storesTicker.C:
				n.gossipStores(ctx)
			case <-nodeTicker.C:
				if err := n.ctx.Gossip.SetNodeDescriptor(&n.Descriptor); err != nil {
					log.Warningf(ctx, "couldn't gossip descriptor for node %d: %s", n.Descriptor.NodeID, err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// waitAndProcess waits for the pace interval and processes the replica
// if repl is not nil. The method returns true when the scanner needs
// to be stopped. The method also removes a replica from queues when it
// is signaled via the removed channel.
func (rs *replicaScanner) waitAndProcess(start time.Time, clock *hlc.Clock, stopper *stop.Stopper,
	repl *Replica) bool {
	waitInterval := rs.paceInterval(start, timeutil.Now())
	rs.waitTimer.Reset(waitInterval)
	if log.V(6) {
		log.Infof("Wait time interval set to %s", waitInterval)
	}
	for {
		select {
		case <-rs.waitTimer.C:
			rs.waitTimer.Read = true
			if repl == nil {
				return false
			}
			return !stopper.RunTask(func() {
				// Try adding replica to all queues.
				for _, q := range rs.queues {
					q.MaybeAdd(repl, clock.Now())
				}
			})

		case repl := <-rs.removed:
			// Remove replica from all queues as applicable.
			for _, q := range rs.queues {
				q.MaybeRemove(repl)
			}
			if log.V(6) {
				log.Infof("removed replica %s", repl)
			}

		case <-stopper.ShouldStop():
			return true
		}
	}
}
// waitAndProcess waits for the pace interval and processes the replica
// if repl is not nil. The method returns true when the scanner needs
// to be stopped. The method also removes a replica from queues when it
// is signaled via the removed channel.
func (rs *replicaScanner) waitAndProcess(
	start time.Time, clock *hlc.Clock, stopper *stop.Stopper, repl *Replica,
) bool {
	waitInterval := rs.paceInterval(start, timeutil.Now())
	rs.waitTimer.Reset(waitInterval)
	if log.V(6) {
		log.Infof(context.TODO(), "wait timer interval set to %s", waitInterval)
	}
	for {
		select {
		case <-rs.waitTimer.C:
			if log.V(6) {
				log.Infof(context.TODO(), "wait timer fired")
			}
			rs.waitTimer.Read = true
			if repl == nil {
				return false
			}
			return nil != stopper.RunTask(func() {
				// Try adding replica to all queues.
				for _, q := range rs.queues {
					q.MaybeAdd(repl, clock.Now())
				}
			})

		case repl := <-rs.removed:
			rs.removeReplica(repl)

		case <-stopper.ShouldStop():
			return true
		}
	}
}
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(ctx ExecutorContext, stopper *stop.Stopper, registry *metric.Registry) *Executor {
	exec := &Executor{
		ctx:     ctx,
		reCache: parser.NewRegexpCache(512),

		registry:         registry,
		latency:          registry.Latency("latency"),
		txnBeginCount:    registry.Counter("txn.begin.count"),
		txnCommitCount:   registry.Counter("txn.commit.count"),
		txnAbortCount:    registry.Counter("txn.abort.count"),
		txnRollbackCount: registry.Counter("txn.rollback.count"),
		selectCount:      registry.Counter("select.count"),
		updateCount:      registry.Counter("update.count"),
		insertCount:      registry.Counter("insert.count"),
		deleteCount:      registry.Counter("delete.count"),
		ddlCount:         registry.Counter("ddl.count"),
		miscCount:        registry.Counter("misc.count"),
	}
	exec.systemConfigCond = sync.NewCond(exec.systemConfigMu.RLocker())

	gossipUpdateC := ctx.Gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := ctx.Gossip.GetSystemConfig()
				exec.updateSystemConfig(cfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	return exec
}
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(db client.DB, gossip *gossip.Gossip, leaseMgr *LeaseManager,
	metaRegistry *metric.Registry, stopper *stop.Stopper) *Executor {
	exec := &Executor{
		db:       db,
		reCache:  parser.NewRegexpCache(512),
		leaseMgr: leaseMgr,

		latency: metaRegistry.Latency("sql.latency"),
	}
	exec.systemConfigCond = sync.NewCond(&exec.systemConfigMu)

	gossipUpdateC := gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				cfg := gossip.GetSystemConfig()
				exec.updateSystemConfig(cfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	return exec
}
// NewExecutor creates an Executor and registers a callback on the
// system config.
func NewExecutor(db client.DB, gossip *gossip.Gossip, leaseMgr *LeaseManager, stopper *stop.Stopper) *Executor {
	registry := metric.NewRegistry()
	exec := &Executor{
		db:       db,
		reCache:  parser.NewRegexpCache(512),
		leaseMgr: leaseMgr,

		registry:      registry,
		latency:       registry.Latency("latency"),
		txnBeginCount: registry.Counter("transaction.begincount"),
		selectCount:   registry.Counter("select.count"),
		updateCount:   registry.Counter("update.count"),
		insertCount:   registry.Counter("insert.count"),
		deleteCount:   registry.Counter("delete.count"),
		ddlCount:      registry.Counter("ddl.count"),
		miscCount:     registry.Counter("misc.count"),
	}
	exec.systemConfigCond = sync.NewCond(&exec.systemConfigMu)

	gossipUpdateC := gossip.RegisterSystemConfigChannel()
	stopper.RunWorker(func() {
		for {
			select {
			case <-gossipUpdateC:
				cfg := gossip.GetSystemConfig()
				exec.updateSystemConfig(cfg)
			case <-stopper.ShouldStop():
				return
			}
		}
	})

	return exec
}
// start initializes the infostore with the rpc server address and
// then begins processing connecting clients in an infinite select
// loop via goroutine. Periodically, clients connected and awaiting
// the next round of gossip are awoken via the conditional variable.
func (s *server) start(rpcServer *rpc.Server, stopper *stop.Stopper) {
	addr := rpcServer.Addr()
	s.is.NodeAddr = util.MakeUnresolvedAddr(addr.Network(), addr.String())
	if err := rpcServer.Register("Gossip.Gossip", s.Gossip, &Request{}); err != nil {
		log.Fatalf("unable to register gossip service with RPC server: %s", err)
	}
	rpcServer.AddCloseCallback(s.onClose)

	updateCallback := func(_ string, _ roachpb.Value) {
		// Wakeup all pending clients.
		s.ready.Broadcast()
	}
	unregister := s.is.registerCallback(".*", updateCallback)

	stopper.RunWorker(func() {
		// Periodically wakeup blocked client gossip requests.
		for {
			select {
			case <-stopper.ShouldStop():
				s.stop(unregister)
				return
			}
		}
	})
}
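// The server above wakes every blocked gossip request with a single
// s.ready.Broadcast() whenever the infostore changes. A minimal sketch of the
// same sync.Cond broadcast pattern (assumed imports: "fmt", "sync"; all names
// here are illustrative, not from the source):
func exampleCondBroadcast() {
	var mu sync.Mutex
	ready := sync.NewCond(&mu)
	updated := false

	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(id int) { // waiter: blocks until an update is published
			defer wg.Done()
			mu.Lock()
			for !updated {
				ready.Wait() // releases mu while blocked, re-acquires on wakeup
			}
			mu.Unlock()
			fmt.Printf("waiter %d woke up\n", id)
		}(i)
	}

	// Publisher: flip the condition under the lock, then wake all waiters.
	mu.Lock()
	updated = true
	mu.Unlock()
	ready.Broadcast()

	wg.Wait()
}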
// RefreshLeases starts a goroutine that refreshes the lease manager
// leases for tables received in the latest system configuration via gossip.
func (m *LeaseManager) RefreshLeases(s *stop.Stopper, db *client.DB, gossip *gossip.Gossip) {
	s.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(sqlbase.DescriptorTable.ID))
		gossipUpdateC := gossip.RegisterSystemConfigChannel()
		for {
			select {
			case <-gossipUpdateC:
				cfg, _ := gossip.GetSystemConfig()
				if m.testingKnobs.GossipUpdateEvent != nil {
					m.testingKnobs.GossipUpdateEvent(cfg)
				}
				// Read all tables and their versions.
				if log.V(2) {
					log.Info("received a new config; will refresh leases")
				}

				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor sqlbase.Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf("%s: unable to unmarshal descriptor %v", kv.Key, kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *sqlbase.Descriptor_Table:
						table := union.Table
						if err := table.Validate(); err != nil {
							log.Errorf("%s: received invalid table descriptor: %v", kv.Key, table)
							continue
						}
						if log.V(2) {
							log.Infof("%s: refreshing lease table: %d (%s), version: %d",
								kv.Key, table.ID, table.Name, table.Version)
						}
						// Try to refresh the table lease to one >= this version.
						if t := m.findTableState(table.ID, false /* create */, nil); t != nil {
							if err := t.purgeOldLeases(
								db, table.Deleted(), table.Version, m.LeaseStore); err != nil {
								log.Warningf("error purging leases for table %d(%s): %s",
									table.ID, table.Name, err)
							}
						}
					case *sqlbase.Descriptor_Database:
						// Ignore.
					}
				}
				if m.testingKnobs.TestingLeasesRefreshedEvent != nil {
					m.testingKnobs.TestingLeasesRefreshedEvent(cfg)
				}

			case <-s.ShouldStop():
				return
			}
		}
	})
}
// waitAndProcess waits for the pace interval and processes the range
// if rng is not nil. The method returns true when the scanner needs
// to be stopped. The method also removes a range from queues when it
// is signaled via the removed channel.
func (rs *rangeScanner) waitAndProcess(start time.Time, clock *hlc.Clock, stopper *stop.Stopper,
	rng *Replica) bool {
	waitInterval := rs.paceInterval(start, time.Now())
	nextTime := time.After(waitInterval)
	if log.V(6) {
		log.Infof("Wait time interval set to %s", waitInterval)
	}
	for {
		select {
		case <-nextTime:
			if rng == nil {
				return false
			}
			return !stopper.RunTask(func() {
				// Try adding range to all queues.
				for _, q := range rs.queues {
					q.MaybeAdd(rng, clock.Now())
				}
			})

		case rng := <-rs.removed:
			// Remove range from all queues as applicable.
			for _, q := range rs.queues {
				q.MaybeRemove(rng)
			}
			if log.V(6) {
				log.Infof("removed range %s", rng)
			}

		case <-stopper.ShouldStop():
			return true
		}
	}
}
// start runs a worker that demultiplexes incoming raft events onto their
// typed channels until the stopper signals shutdown, at which point the
// output channels are closed.
func (e *eventDemux) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case events := <-e.events:
				for _, event := range events {
					switch event := event.(type) {
					case *EventLeaderElection:
						e.LeaderElection <- event

					case *EventCommandCommitted:
						e.CommandCommitted <- event

					case *EventMembershipChangeCommitted:
						e.MembershipChangeCommitted <- event

					default:
						panic(fmt.Sprintf("got unknown event type %T", event))
					}
				}

			case <-stopper.ShouldStop():
				close(e.CommandCommitted)
				close(e.MembershipChangeCommitted)
				close(e.LeaderElection)
				return
			}
		}
	})
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,      // first backoff at 5s
			MaxBackoff:     60 * time.Second,     // max backoff is 60s
			Multiplier:     2,                    // doubles
			Closer:         stopper.ShouldStop(), // stop no matter what on stopper
		}
		// This will never error because of infinite retries.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasConnections := g.outgoing.len()+g.incoming.len() > 0
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			triedAll := g.triedAll
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			if !hasConnections {
				log.Warningf("not connected to gossip; check that gossip flag is set appropriately")
			} else if triedAll {
				log.Warningf("missing gossip sentinel; first range unavailable or cluster not initialized")
			}
		}
	})
}
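// retry.Options with a Closer (or Stopper) gives an exponential backoff loop
// that aborts as soon as shutdown begins. A rough stdlib-only approximation of
// that behavior, for readers unfamiliar with the retry package (assumed
// import: "time"; all names here are illustrative, not from the source):
func exampleBackoffWithCloser(closer <-chan struct{}, attempt func() bool) {
	backoff := 5 * time.Second
	const maxBackoff = 60 * time.Second

	for {
		if attempt() { // e.g. "do we have the sentinel yet?"
			return
		}
		select {
		case <-time.After(backoff):
			// Double the backoff up to the cap, mirroring Multiplier: 2.
			if backoff *= 2; backoff > maxBackoff {
				backoff = maxBackoff
			}
		case <-closer:
			return // shutdown requested; stop retrying
		}
	}
}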
// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed on response, the client addr
// is modified and the method returns for forwarding by the caller.
func (c *client) gossip(g *Gossip, stopper *stop.Stopper) error {
	// For an unbootstrapped node, g.is.NodeID is 0 when the client starts
	// gossiping, so read the node address from g.is fresh on every call.
	g.mu.Lock()
	addr := util.MakeUnresolvedAddr(g.is.NodeAddr.Network(), g.is.NodeAddr.String())
	g.mu.Unlock()
	lAddr := util.MakeUnresolvedAddr(c.rpcClient.LocalAddr().Network(), c.rpcClient.LocalAddr().String())
	done := make(chan *netrpc.Call, 10)
	c.getGossip(g, addr, lAddr, done)

	// Register a callback for gossip updates.
	updateCallback := func(_ string, _ roachpb.Value) {
		c.sendGossip(g, addr, lAddr, done)
	}
	// Defer calling "undoer" callback returned from registration.
	defer g.RegisterCallback(".*", updateCallback)()

	// Loop until the stopper is signalled, or until either the gossip or
	// RPC clients are closed. getGossip is a hanging get, returning
	// results only when the remote server has new gossip information to
	// share. sendGossip is sent to the remote server when this node has
	// new gossip information to share with the server.
	//
	// Nodes "pull" gossip in order to guarantee that they're connected
	// to the sentinel and not too distant from other nodes in the
	// network. They also "push" their own gossip which guarantees that
	// the sentinel node will contain their info, and therefore every
	// node connected to the sentinel. Just pushing or just pulling
	// wouldn't guarantee a fully connected network.
	for {
		select {
		case call := <-done:
			if err := c.handleGossip(g, call); err != nil {
				return err
			}
			req := call.Args.(*Request)
			// If this was from a gossip pull request, fetch again.
			if req.Delta == nil {
				c.getGossip(g, addr, lAddr, done)
			} else {
				// Otherwise, it's a gossip push request; set the sendingGossip
				// flag false and maybe send more gossip if there have been
				// additional updates.
				g.mu.Lock()
				c.sendingGossip = false
				g.mu.Unlock()
				c.sendGossip(g, addr, lAddr, done)
			}
		case <-c.rpcClient.Closed:
			return util.Errorf("client closed")
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		}
	}
}
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into replica processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		defer func() {
			bq.mu.Lock()
			bq.mu.stopped = true
			bq.mu.Unlock()
			log.FinishEventLog(bq.ctx)
		}()

		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Exit on stopper.
			case <-stopper.ShouldStop():
				return

			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer()
				}

			// Process replicas as the timer expires.
			case <-nextTime:
				repl := bq.pop()
				if repl != nil {
					if stopper.RunTask(func() {
						if err := bq.processReplica(repl, clock); err != nil {
							// Maybe add failing replica to purgatory if the queue supports it.
							bq.maybeAddToPurgatory(repl, err, clock, stopper)
						}
					}) != nil {
						return
					}
				}
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}
			}
		}
	})
}
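// processLoop relies on a small channel trick worth calling out: a nil channel
// disables its select case entirely, while a pre-closed channel of time.Time
// ("immediately") is always ready to receive, so assigning it to nextTime makes
// the timer case fire on the very next loop iteration without sleeping. A
// minimal sketch of the idiom (assumed imports: "fmt", "time"; wakeNow is an
// illustrative name, not from the source):
func exampleImmediateTimer() {
	var next <-chan time.Time // nil: the timer case is effectively disarmed

	wakeNow := make(chan time.Time)
	close(wakeNow) // closed: receives succeed immediately with the zero value

	for i := 0; i < 2; i++ {
		if i == 1 {
			next = wakeNow // arm for an immediate wakeup
		}
		select {
		case <-next:
			fmt.Println("woke immediately on iteration", i) // i == 1
		default:
			fmt.Println("timer disarmed on iteration", i) // i == 0
		}
	}
}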
// NewContextWithStopper returns a context whose Done() channel is closed when
// base's Done() channel is closed or when stopper's ShouldStop() channel is
// closed, whichever is first.
func NewContextWithStopper(base context.Context, stopper *stop.Stopper) context.Context {
	ctx, cancel := context.WithCancel(base)
	go func() {
		select {
		case <-ctx.Done():
		case <-stopper.ShouldStop():
			cancel()
		}
	}()
	return ctx
}
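// A sketch of how a caller might use NewContextWithStopper to tie work to
// server shutdown. The stopper constructor and Stop method shown here
// (stop.NewStopper / stopper.Stop) are assumed from the stop package's usual
// API and should be treated as an assumption, not a guarantee (assumed
// imports: "time", "golang.org/x/net/context"):
func exampleContextWithStopper() {
	stopper := stop.NewStopper()
	ctx := NewContextWithStopper(context.Background(), stopper)

	go func() {
		// Simulate server shutdown a moment later.
		time.Sleep(10 * time.Millisecond)
		stopper.Stop()
	}()

	<-ctx.Done() // unblocks once the stopper begins shutting down
}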
// ListenAndServe creates a listener and serves handler on it, closing
// the listener when signalled by the stopper.
func ListenAndServe(stopper *stop.Stopper, handler http.Handler, addr net.Addr, config *tls.Config) (net.Listener, error) {
	ln, err := net.Listen(addr.Network(), addr.String())
	if err != nil {
		return nil, err
	}

	newAddr, err := updatedAddr(addr, ln.Addr())
	if err != nil {
		return nil, err
	}

	if config != nil {
		ln = tls.NewListener(ln, config)
	}

	stopper.RunWorker(func() {
		var mu sync.Mutex
		activeConns := make(map[net.Conn]struct{})

		httpServer := http.Server{
			Handler: handler,
			ConnState: func(conn net.Conn, state http.ConnState) {
				mu.Lock()
				switch state {
				case http.StateNew:
					activeConns[conn] = struct{}{}
				case http.StateClosed:
					delete(activeConns, conn)
				}
				mu.Unlock()
			},
		}
		if err := httpServer.Serve(ln); err != nil && !IsClosedConnection(err) {
			log.Fatal(err)
		}

		mu.Lock()
		for conn := range activeConns {
			conn.Close()
		}
		mu.Unlock()
	})

	stopper.RunWorker(func() {
		<-stopper.ShouldStop()
		// Some unit tests manually close `ln`, so it may already be closed
		// when we get here.
		if err := ln.Close(); err != nil && !IsClosedConnection(err) {
			log.Fatal(err)
		}
	})

	return listener{newAddr, ln}, nil
}
// RefreshLeases starts a goroutine that refreshes the lease manager
// leases for tables received in the latest system configuration via gossip.
func (m *LeaseManager) RefreshLeases(s *stop.Stopper, db *client.DB, gossip *gossip.Gossip) {
	s.RunWorker(func() {
		descKeyPrefix := keys.MakeTablePrefix(uint32(DescriptorTable.ID))
		gossip.RegisterSystemConfigCallback(m.updateSystemConfig)
		for {
			select {
			case <-m.newConfig:
				// Read all tables and their versions.
				cfg := m.getSystemConfig()
				if log.V(2) {
					log.Infof("received a new config %v", cfg)
				}

				// Loop through the configuration to find all the tables.
				for _, kv := range cfg.Values {
					if kv.Value.Tag != roachpb.ValueType_BYTES {
						continue
					}
					if !bytes.HasPrefix(kv.Key, descKeyPrefix) {
						continue
					}
					// Attempt to unmarshal config into a table/database descriptor.
					var descriptor Descriptor
					if err := kv.Value.GetProto(&descriptor); err != nil {
						log.Warningf("unable to unmarshal descriptor %v", kv.Value)
						continue
					}
					switch union := descriptor.Union.(type) {
					case *Descriptor_Table:
						table := union.Table
						if err := table.Validate(); err != nil {
							log.Errorf("received invalid table descriptor: %v", table)
							continue
						}
						if log.V(2) {
							log.Infof("refreshing lease table: %d, version: %d", table.ID, table.Version)
						}
						// Try to refresh the table lease to one >= this version.
						if err := m.refreshLease(db, table.ID, table.Version); err != nil {
							log.Warning(err)
						}
					case *Descriptor_Database:
						// Ignore.
					}
				}

			case <-s.ShouldStop():
				return
			}
		}
	})
}
// start runs the storage loop in a goroutine.
func (w *writeTask) start(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		for {
			var request *writeRequest
			select {
			case <-w.ready:
				continue
			case <-stopper.ShouldStop():
				return
			case request = <-w.in:
			}
			if log.V(6) {
				log.Infof("writeTask got request %#v", *request)
			}
			response := &writeResponse{make(map[roachpb.RangeID]*groupWriteResponse)}

			for groupID, groupReq := range request.groups {
				group, err := w.storage.GroupStorage(groupID, groupReq.replicaID)
				if err == ErrGroupDeleted {
					if log.V(4) {
						log.Infof("dropping write to deleted group %v", groupID)
					}
					continue
				} else if err != nil {
					log.Fatalf("GroupStorage(group %s, replica %s) failed: %s", groupID,
						groupReq.replicaID, err)
				}
				groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
				response.groups[groupID] = groupResp
				if !raft.IsEmptyHardState(groupReq.state) {
					err := group.SetHardState(groupReq.state)
					if err != nil {
						panic(err) // TODO(bdarnell): mark this node dead on storage errors
					}
					groupResp.state = groupReq.state
				}
				if !raft.IsEmptySnap(groupReq.snapshot) {
					err := group.ApplySnapshot(groupReq.snapshot)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
				if len(groupReq.entries) > 0 {
					err := group.Append(groupReq.entries)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
			}
			w.out <- response
		}
	})
}
// ListenAndServe creates a listener and serves handler on it, closing
// the listener when signalled by the stopper.
func ListenAndServe(stopper *stop.Stopper, handler http.Handler, addr net.Addr, tlsConfig *tls.Config) (net.Listener, error) {
	ln, err := Listen(addr, tlsConfig)
	if err != nil {
		return nil, err
	}

	var mu sync.Mutex
	activeConns := make(map[net.Conn]struct{})

	httpServer := http.Server{
		TLSConfig: tlsConfig,
		Handler:   handler,
		ConnState: func(conn net.Conn, state http.ConnState) {
			mu.Lock()
			switch state {
			case http.StateNew:
				activeConns[conn] = struct{}{}
			case http.StateClosed:
				delete(activeConns, conn)
			}
			mu.Unlock()
		},
	}
	if err := http2.ConfigureServer(&httpServer, nil); err != nil {
		return nil, err
	}

	stopper.RunWorker(func() {
		if err := httpServer.Serve(ln); err != nil && !IsClosedConnection(err) {
			log.Fatal(err)
		}

		<-stopper.ShouldStop()

		mu.Lock()
		for conn := range activeConns {
			conn.Close()
		}
		mu.Unlock()
	})

	stopper.RunWorker(func() {
		<-stopper.ShouldDrain()
		// Some unit tests manually close `ln`, so it may already be closed
		// when we get here.
		if err := ln.Close(); err != nil && !IsClosedConnection(err) {
			log.Fatal(err)
		}
	})

	return ln, nil
}
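// Both ListenAndServe variants shut the HTTP server down by closing the
// listener, which unblocks Serve, rather than by cancelling handler goroutines
// directly; tracked connections are then closed explicitly. A minimal
// stdlib-only sketch of that shutdown path (assumed imports: "net",
// "net/http"; names are illustrative, not from the source):
func exampleServeUntilStopped(handler http.Handler, stop <-chan struct{}) error {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return err
	}

	srv := &http.Server{Handler: handler}
	errCh := make(chan error, 1)
	go func() {
		// Serve blocks until the listener is closed.
		errCh <- srv.Serve(ln)
	}()

	<-stop
	// Closing the listener unblocks Serve; the real code additionally tracks
	// active connections via ConnState and closes them afterwards.
	if err := ln.Close(); err != nil {
		return err
	}
	return <-errCh // a "use of closed network connection" style error is expected
}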
// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		ticker := time.NewTicker(gossipInterval)
		defer ticker.Stop()
		n.gossipStores() // one-off run before going to sleep
		for {
			select {
			case <-ticker.C:
				n.gossipStores()
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding maxToleratedHops()
// threshold. If the number of outgoing clients doesn't exceed
// MaxPeers, a new gossip client is connected to a randomly selected
// peer beyond maxToleratedHops threshold. Otherwise, the least useful
// peer node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled conditional variable.
func (g *Gossip) manage(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Loop until closed and there are no remaining outgoing connections.
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(stopper, c)
			case <-time.After(g.jitteredGossipInterval()):
				g.doCheckTimeout(stopper)
			}
		}
	})
}
// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed on response, the client addr
// is modified and the method returns for forwarding by the caller.
func (c *client) gossip(
	ctx context.Context,
	g *Gossip,
	stream Gossip_GossipClient,
	stopper *stop.Stopper,
	wg *sync.WaitGroup,
) error {
	sendGossipChan := make(chan struct{}, 1)

	// Register a callback for gossip updates.
	updateCallback := func(_ string, _ roachpb.Value) {
		select {
		case sendGossipChan <- struct{}{}:
		default:
		}
	}
	// Defer calling "undoer" callback returned from registration.
	defer g.RegisterCallback(".*", updateCallback)()

	errCh := make(chan error, 1)
	// This wait group is used to allow the caller to wait until gossip
	// processing is terminated.
	wg.Add(1)
	stopper.RunWorker(func() {
		defer wg.Done()

		errCh <- func() error {
			for {
				reply, err := stream.Recv()
				if err != nil {
					return err
				}
				if err := c.handleResponse(g, reply); err != nil {
					return err
				}
			}
		}()
	})

	for {
		select {
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		case err := <-errCh:
			return err
		case <-sendGossipChan:
			if err := c.sendGossip(g, stream); err != nil {
				return err
			}
		}
	}
}
// processEventsUntil reads and acknowledges messages from the given channel
// until either the given conditional returns true, the channel is closed, or
// the stopper signals shutdown.
func processEventsUntil(ch <-chan *interceptMessage, stopper *stop.Stopper, f func(*RaftMessageRequest) bool) {
	for {
		select {
		case e, ok := <-ch:
			if !ok {
				return
			}
			e.ack <- struct{}{}
			if f(e.args.(*RaftMessageRequest)) {
				return
			}
		case <-stopper.ShouldStop():
			return
		}
	}
}
// runHeartbeat sends periodic heartbeats to client. Closes the
// connection on error. Heartbeats are sent in an infinite loop until
// an error is encountered.
func (c *Client) runHeartbeat(stopper *stop.Stopper) {
	if log.V(2) {
		log.Infof("client %s starting heartbeat", c.Addr())
	}
	for {
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(heartbeatInterval):
			if err := c.heartbeat(); err != nil {
				log.Infof("client %s heartbeat failed: %v; recycling...", c.Addr(), err)
				return
			}
		}
	}
}
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding the MaxHops
// threshold. If the number of outgoing clients doesn't exceed
// maxPeers(), a new gossip client is connected to a randomly selected
// peer beyond MaxHops threshold. Otherwise, the least useful peer
// node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled conditional variable.
func (g *Gossip) manage(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Loop until closed and there are no remaining outgoing connections.
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(stopper, c)
			case <-time.After(g.jitteredInterval(checkInterval)):
				g.doCheckNetwork(stopper)
			case <-time.After(g.jitteredInterval(stallInterval)):
				g.maybeSignalStalledLocked()
			}
		}
	})
}
// startComputePeriodicMetrics starts a loop which periodically instructs each
// store to compute the value of metrics which cannot be incrementally
// maintained.
func (n *Node) startComputePeriodicMetrics(stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// Publish status at the same frequency as metrics are collected.
		ticker := time.NewTicker(publishStatusInterval)
		defer ticker.Stop()
		for tick := 0; ; tick++ {
			select {
			case <-ticker.C:
				if err := n.computePeriodicMetrics(tick); err != nil {
					log.Errorf(n.Ctx(), "failed computing periodic metrics: %s", err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into replica processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *stop.Stopper) {
	stopper.RunWorker(func() {
		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		immediately := make(chan time.Time)
		close(immediately)

		for {
			select {
			// Incoming signal sets the next time to process if there were previously
			// no replicas in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When a replica is added, wake up immediately. This is mainly
					// to facilitate testing without unnecessary sleeps.
					nextTime = immediately

					// In case we're in a test, still block on the impl.
					bq.impl.timer()
				}

			// Process replicas as the timer expires.
			case <-nextTime:
				stopper.RunTask(func() {
					bq.processOne(clock)
				})
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}

			// Exit on stopper.
			case <-stopper.ShouldStop():
				bq.Lock()
				bq.replicas = map[proto.RangeID]*replicaItem{}
				bq.priorityQ = nil
				bq.Unlock()
				return
			}
		}
	})
}