func (e *eventDemux) start(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case event := <-e.events:
				switch event := event.(type) {
				case *EventLeaderElection:
					e.LeaderElection <- event

				case *EventCommandCommitted:
					e.CommandCommitted <- event

				case *EventMembershipChangeCommitted:
					e.MembershipChangeCommitted <- event

				default:
					panic(fmt.Sprintf("got unknown event type %T", event))
				}
			case <-stopper.ShouldStop():
				close(e.CommandCommitted)
				close(e.MembershipChangeCommitted)
				close(e.LeaderElection)
				return
			}
		}
	})
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There is no sure way to
// determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected and whether the node itself is a bootstrap host, yet
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before the first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			Tag:         "check cluster initialization",
			Backoff:     5 * time.Second,  // first backoff at 5s
			MaxBackoff:  60 * time.Second, // max backoff is 60s
			Constant:    2,                // doubles
			MaxAttempts: 0,                // indefinite retries
			Stopper:     stopper,          // stop no matter what on stopper
		}
		// Will never error because of the indefinite retries.
		_ = retry.WithBackoff(retryOptions, func() (retry.Status, error) {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				return retry.Break, nil
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if g.triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
			return retry.Continue, nil
		})
	})
}
// bootstrap connects the node to the gossip network. Bootstrapping
// commences when there are no connected clients or the sentinel
// gossip info is unavailable. After a successful bootstrap
// connection, this method blocks on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			g.mu.Lock()
			if g.closed {
				g.mu.Unlock()
				return
			}
			// Check whether or not we need to bootstrap.
			haveClients := g.outgoing.len() > 0
			haveSentinel := g.is.getInfo(KeySentinel) != nil
			if !haveClients || !haveSentinel {
				// Try to get another bootstrap address from the resolvers.
				if addr := g.getNextBootstrapAddress(); addr != nil {
					g.startClient(addr, g.bsRPCContext, stopper)
				}
			}
			g.mu.Unlock()

			// Block until we need bootstrapping again.
			select {
			case <-g.stalled:
				// continue
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There is no sure way to
// determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected and whether the node itself is a bootstrap host, yet
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before the first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,  // first backoff at 5s
			MaxBackoff:     60 * time.Second, // max backoff is 60s
			Multiplier:     2,                // doubles
			Stopper:        stopper,          // stop no matter what on stopper
		}
		// Retry indefinitely until we see the sentinel or the stopper fires.
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if g.triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
		}
	})
}
// waitAndProcess waits for the pace interval and processes the range
// if rng is not nil. The method returns true when the scanner needs
// to be stopped. The method also removes a range from queues when it
// is signaled via the removed channel.
func (rs *rangeScanner) waitAndProcess(start time.Time, clock *hlc.Clock,
	stopper *util.Stopper, rng *Range) bool {
	waitInterval := rs.paceInterval(start, time.Now())
	nextTime := time.After(waitInterval)
	if log.V(6) {
		log.Infof("Wait time interval set to %s", waitInterval)
	}
	for {
		select {
		case <-nextTime:
			if rng == nil {
				return false
			}
			if !stopper.StartTask() {
				return true
			}
			// Try adding range to all queues.
			for _, q := range rs.queues {
				q.MaybeAdd(rng, clock.Now())
			}
			stopper.FinishTask()
			return false

		case rng := <-rs.removed:
			// Remove range from all queues as applicable.
			for _, q := range rs.queues {
				q.MaybeRemove(rng)
			}
			if log.V(6) {
				log.Infof("removed range %s", rng)
			}

		case <-stopper.ShouldStop():
			return true
		}
	}
}
// scanLoop loops endlessly, scanning through ranges available via
// the range iterator, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *rangeScanner) scanLoop(clock *hlc.Clock, stopper *util.Stopper) {
	start := time.Now()
	stats := &storeStats{}

	for {
		elapsed := time.Now().Sub(start)
		remainingNanos := rs.interval.Nanoseconds() - elapsed.Nanoseconds()
		if remainingNanos < 0 {
			remainingNanos = 0
		}
		nextIteration := time.Duration(remainingNanos)
		if count := rs.iter.EstimatedCount(); count > 0 {
			nextIteration = time.Duration(remainingNanos / int64(count))
		}
		log.V(6).Infof("next range scan iteration in %s", nextIteration)
		select {
		case <-time.After(nextIteration):
			rng := rs.iter.Next()
			if rng != nil {
				// Try adding range to all queues.
				for _, q := range rs.queues {
					q.MaybeAdd(rng, clock.Now())
				}
				stats.RangeCount++
				stats.MVCC.Accumulate(rng.stats.GetMVCC())
			} else {
				// Otherwise, we're done with the iteration. Reset iteration and start time.
				rs.iter.Reset()
				start = time.Now()
				// Increment iteration counter.
				atomic.AddInt64(&rs.count, 1)
				// Store the most recent scan results in the scanner's stats.
				atomic.StorePointer(&rs.stats, unsafe.Pointer(stats))
				stats = &storeStats{}
				log.V(6).Infof("reset range scan iteration")
			}

		case rng := <-rs.removed:
			// Remove range from all queues as applicable.
			for _, q := range rs.queues {
				q.MaybeRemove(rng)
			}
			log.V(6).Infof("removed range %s", rng)

		case <-stopper.ShouldStop():
			// Exit the loop.
			stopper.SetStopped()
			return
		}
	}
}
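// The pacing arithmetic in scanLoop above divides the time remaining in the
// scan window evenly across the ranges the iterator still expects to return.
// The following is a minimal, hypothetical sketch of just that computation;
// paceNext and the example numbers are illustrative and not part of the
// original code.
package main

import (
	"fmt"
	"time"
)

// paceNext returns how long to pause before processing the next range so
// that the scan finishes at roughly start+interval. remainingCount is the
// estimated number of ranges still to visit; if it is zero, the whole
// remaining window is returned.
func paceNext(start time.Time, interval time.Duration, remainingCount int64) time.Duration {
	remaining := interval - time.Since(start)
	if remaining < 0 {
		remaining = 0
	}
	if remainingCount > 0 {
		return time.Duration(remaining.Nanoseconds() / remainingCount)
	}
	return remaining
}

func main() {
	// Pretend 2s of a 10s scan interval have elapsed and 4 ranges remain:
	// each range then gets a pause of roughly 2s.
	start := time.Now().Add(-2 * time.Second)
	fmt.Println(paceNext(start, 10*time.Second, 4))
}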
// start runs the storage loop in a goroutine.
func (w *writeTask) start(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			var request *writeRequest
			select {
			case <-w.ready:
				continue
			case <-stopper.ShouldStop():
				return
			case request = <-w.in:
			}
			if log.V(6) {
				log.Infof("writeTask got request %#v", *request)
			}
			response := &writeResponse{make(map[proto.RaftID]*groupWriteResponse)}

			for groupID, groupReq := range request.groups {
				group := w.storage.GroupStorage(groupID)
				if group == nil {
					if log.V(4) {
						log.Infof("dropping write to group %v", groupID)
					}
					continue
				}
				groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
				response.groups[groupID] = groupResp
				if !raft.IsEmptyHardState(groupReq.state) {
					err := group.SetHardState(groupReq.state)
					if err != nil {
						panic(err) // TODO(bdarnell): mark this node dead on storage errors
					}
					groupResp.state = groupReq.state
				}
				if !raft.IsEmptySnap(groupReq.snapshot) {
					err := group.ApplySnapshot(groupReq.snapshot)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
				if len(groupReq.entries) > 0 {
					err := group.Append(groupReq.entries)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
			}
			w.out <- response
		}
	})
}
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with a hop count exceeding the maxToleratedHops()
// threshold. If the number of outgoing clients doesn't exceed
// MaxPeers, a new gossip client is connected to a randomly selected
// peer beyond the maxToleratedHops threshold. Otherwise, the least
// useful peer node is cut off to make room for a replacement.
// Disconnected clients are processed via the disconnected channel and
// taken out of the outgoing address set. If there are no longer any
// outgoing connections or the sentinel gossip is unavailable, the
// bootstrapper is notified via the stalled conditional variable.
func (g *Gossip) manage(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Loop until closed and there are no remaining outgoing connections.
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(stopper, c)
			case <-time.After(g.jitteredGossipInterval()):
				g.doCheckTimeout(stopper)
			}
		}
	})
}
// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		ticker := time.NewTicker(gossipInterval)
		defer ticker.Stop()
		n.gossipCapacities() // one-off run before going to sleep
		for {
			select {
			case <-ticker.C:
				n.gossipCapacities()
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
// processEventsUntil reads and acknowledges messages from the given
// channel until either the given conditional returns true, the channel
// is closed, or the stopper signals shutdown.
func processEventsUntil(ch <-chan *interceptMessage, stopper *util.Stopper,
	f func(*RaftMessageRequest) bool) {
	for {
		select {
		case e, ok := <-ch:
			if !ok {
				return
			}
			e.ack <- struct{}{}
			if f(e.args.(*RaftMessageRequest)) {
				return
			}
		case <-stopper.ShouldStop():
			return
		}
	}
}
// runHeartbeat sends periodic heartbeats to the client. It closes the
// connection on error. Heartbeats are sent in an infinite loop until
// an error is encountered or the stopper signals shutdown.
func (c *Client) runHeartbeat(stopper *util.Stopper) {
	if log.V(2) {
		log.Infof("client %s starting heartbeat", c.Addr())
	}
	for {
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(heartbeatInterval):
			if err := c.heartbeat(); err != nil {
				log.Infof("client %s heartbeat failed: %v; recycling...", c.Addr(), err)
				return
			}
		}
	}
}
// startPublishStatuses starts a loop which periodically instructs each store to
// publish its current status to the event feed.
func (n *Node) startPublishStatuses(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Publish status at the same frequency as metrics are collected.
		ticker := time.NewTicker(publishStatusInterval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				err := n.publishStoreStatuses()
				if err != nil {
					log.Error(err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
func (tq *testQueue) Start(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case <-time.After(1 * time.Millisecond):
				tq.Lock()
				if !tq.disabled && len(tq.ranges) > 0 {
					tq.ranges = tq.ranges[1:]
					tq.processed++
				}
				tq.Unlock()
			case <-stopper.ShouldStop():
				tq.Lock()
				tq.done = true
				tq.Unlock()
				return
			}
		}
	})
}
// start initializes the infostore with the rpc server address and
// then begins processing connecting clients in an infinite select
// loop in a goroutine. Periodically, clients connected and awaiting
// the next round of gossip are awoken via the conditional variable.
func (s *server) start(rpcServer *rpc.Server, stopper *util.Stopper) {
	s.is.NodeAddr = rpcServer.Addr()
	if err := rpcServer.RegisterName("Gossip", s); err != nil {
		log.Fatalf("unable to register gossip service with RPC server: %s", err)
	}
	rpcServer.AddCloseCallback(s.onClose)

	stopper.RunWorker(func() {
		// Periodically wake up blocked client gossip requests.
		for {
			select {
			case <-time.After(s.jitteredGossipInterval()):
				// Wake up all blocked gossip requests.
				s.ready.Broadcast()
			case <-stopper.ShouldStop():
				s.stop()
				return
			}
		}
	})
}
// MonitorRemoteOffsets periodically checks that the offset of this server's
// clock from the true cluster time is within MaxOffset. If the offset exceeds
// MaxOffset, then this method will trigger a fatal error, causing the node to
// suicide.
func (r *RemoteClockMonitor) MonitorRemoteOffsets(stopper *util.Stopper) {
	if log.V(1) {
		log.Infof("monitoring cluster offset")
	}
	for {
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(monitorInterval):
			offsetInterval, err := r.findOffsetInterval()
			// By the contract of the hlc, if the value is 0, then safety checking
			// of the max offset is disabled. However we may still want to
			// propagate the information to a status node.
			// TODO(embark): once there is a framework for collecting timeseries
			// data about the db, propagate the offset status to that.
			// Don't forget to protect r.offsets through the Mutex if those
			// Fatalf's below ever turn into something less destructive.
			if r.lClock.MaxOffset() != 0 {
				if err != nil {
					log.Fatalf("clock offset from the cluster time "+
						"for remote clocks %v could not be determined: %s",
						r.offsets, err)
				}

				if !isHealthyOffsetInterval(offsetInterval, r.lClock.MaxOffset()) {
					log.Fatalf("clock offset from the cluster time "+
						"for remote clocks: %v is in interval: %s, which "+
						"indicates that the true offset is greater than %s",
						r.offsets, offsetInterval, time.Duration(r.lClock.MaxOffset()))
				}
				if log.V(1) {
					log.Infof("healthy cluster offset: %s", offsetInterval)
				}
			}
			r.mu.Lock()
			r.lastMonitoredAt = r.lClock.PhysicalNow()
			r.mu.Unlock()
		}
	}
}
func (tq *testQueue) Start(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.Add(1)
	go func() {
		for {
			select {
			case <-time.After(1 * time.Millisecond):
				tq.Lock()
				if len(tq.ranges) > 0 {
					tq.ranges = tq.ranges[1:]
					tq.processed++
				}
				tq.Unlock()
			case <-stopper.ShouldStop():
				tq.Lock()
				tq.done = true
				tq.Unlock()
				stopper.SetStopped()
				return
			}
		}
	}()
}
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into range processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time
		for {
			select {
			// Incoming signal sets the next time to process if there were previously
			// no ranges in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When the first range is added, wake up immediately. This is
					// mainly to facilitate testing without unnecessary sleeps.
					nextTime = time.After(0 * time.Millisecond)
				}
			// Process ranges as the timer expires.
			case <-nextTime:
				bq.processOne(clock, stopper)
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}
			// Exit on stopper.
			case <-stopper.ShouldStop():
				bq.Lock()
				bq.ranges = map[proto.RaftID]*rangeItem{}
				bq.priorityQ = nil
				bq.Unlock()
				return
			}
		}
	})
}
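// processLoop above relies on a small Go idiom: receiving from a nil channel
// blocks forever, so a select case guarded by a nil channel is effectively
// disabled until the channel is assigned. The standalone sketch below
// demonstrates only that idiom; the channels and timings are illustrative
// and not part of the original code.
package main

import (
	"fmt"
	"time"
)

func main() {
	incoming := make(chan string)
	done := make(chan struct{})

	go func() {
		incoming <- "first item"
		time.Sleep(50 * time.Millisecond)
		close(done)
	}()

	var timer <-chan time.Time // nil: the <-timer case below cannot fire yet
	for {
		select {
		case item := <-incoming:
			fmt.Println("received:", item)
			if timer == nil {
				timer = time.After(10 * time.Millisecond) // arm the timer
			}
		case <-timer:
			fmt.Println("timer fired; a real queue would process entries here")
			timer = nil // disarm again until more work arrives
		case <-done:
			return
		}
	}
}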
// startStoresScanner will walk through all the stores in the node every
// ctx.ScanInterval and store the status in the db.
func (n *Node) startStoresScanner(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Pick the smaller of the two intervals.
		var minScanInterval time.Duration
		if n.ctx.ScanInterval <= n.ctx.ScanMaxIdleTime || n.ctx.ScanMaxIdleTime == 0 {
			minScanInterval = n.ctx.ScanInterval
		} else {
			minScanInterval = n.ctx.ScanMaxIdleTime
		}

		// TODO(bram): The number of stores is small. The node status should be
		// updated whenever a store status is updated.
		for interval := time.Duration(0); true; interval = minScanInterval {
			select {
			case <-time.After(interval):
				if !stopper.StartTask() {
					continue
				}
				// Walk through all the stores on this node.
				var rangeCount, leaderRangeCount, replicatedRangeCount, availableRangeCount int32
				stats := &engine.MVCCStats{}
				accessedStoreIDs := []proto.StoreID{}
				// will never error because `return nil` below
				_ = n.lSender.VisitStores(func(store *storage.Store) error {
					storeStatus, err := store.GetStatus()
					if err != nil {
						log.Error(err)
						return nil
					}
					if storeStatus == nil {
						// The store scanner hasn't run on this node yet.
						return nil
					}
					accessedStoreIDs = append(accessedStoreIDs, store.Ident.StoreID)
					rangeCount += storeStatus.RangeCount
					leaderRangeCount += storeStatus.LeaderRangeCount
					replicatedRangeCount += storeStatus.ReplicatedRangeCount
					availableRangeCount += storeStatus.AvailableRangeCount
					stats.Add(&storeStatus.Stats)
					return nil
				})

				// Store the combined stats in the db.
				now := n.ctx.Clock.Now().WallTime
				status := &NodeStatus{
					Desc:                 n.Descriptor,
					StoreIDs:             accessedStoreIDs,
					UpdatedAt:            now,
					StartedAt:            n.startedAt,
					RangeCount:           rangeCount,
					Stats:                *stats,
					LeaderRangeCount:     leaderRangeCount,
					ReplicatedRangeCount: replicatedRangeCount,
					AvailableRangeCount:  availableRangeCount,
				}
				key := keys.NodeStatusKey(int32(n.Descriptor.NodeID))
				if err := n.ctx.DB.Put(key, status); err != nil {
					log.Error(err)
				}
				// Increment iteration count.
				n.completedScan.L.Lock()
				n.scanCount++
				n.completedScan.Broadcast()
				n.completedScan.L.Unlock()
				if log.V(6) {
					log.Infof("store scan iteration completed")
				}
				stopper.FinishTask()

			case <-stopper.ShouldStop():
				// Exit the loop.
				return
			}
		}
	})
}
// gossip loops, sending deltas of the infostore and receiving deltas
// in turn. If an alternate is proposed in the response, the client
// addr is modified and the method returns so the caller can forward
// to the alternate.
func (c *client) gossip(g *Gossip, stopper *util.Stopper) error {
	localMaxSeq := int64(0)
	remoteMaxSeq := int64(-1)
	for {
		// Compute the delta of the local node's infostore to send with the request.
		g.mu.Lock()
		delta := g.is.delta(c.peerID, localMaxSeq)
		nodeID := g.is.NodeID // needs to be accessed with the lock held
		g.mu.Unlock()
		var deltaBytes []byte
		if delta != nil {
			localMaxSeq = delta.MaxSeq
			var buf bytes.Buffer
			if err := gob.NewEncoder(&buf).Encode(delta); err != nil {
				return util.Errorf("infostore could not be encoded: %s", err)
			}
			deltaBytes = buf.Bytes()
		}

		// Send gossip with timeout.
		args := &proto.GossipRequest{
			NodeID: nodeID,
			Addr:   *proto.FromNetAddr(g.is.NodeAddr),
			LAddr:  *proto.FromNetAddr(c.rpcClient.LocalAddr()),
			MaxSeq: remoteMaxSeq,
			Delta:  deltaBytes,
		}
		reply := &proto.GossipResponse{}
		gossipCall := c.rpcClient.Go("Gossip.Gossip", args, reply, nil)
		select {
		case <-gossipCall.Done:
			if gossipCall.Error != nil {
				return gossipCall.Error
			}
		case <-c.rpcClient.Closed:
			return util.Error("client closed")
		case <-c.closer:
			return nil
		case <-stopper.ShouldStop():
			return nil
		case <-time.After(g.interval * 10):
			return util.Errorf("timeout after: %s", g.interval*10)
		}

		// Handle remote forwarding.
		if reply.Alternate != nil {
			var err error
			if c.forwardAddr, err = reply.Alternate.NetAddr(); err != nil {
				return util.Errorf("unable to resolve alternate address: %s: %s", reply.Alternate, err)
			}
			return util.Errorf("received forward from %s to %s", c.addr, reply.Alternate)
		}

		// Combine the remote node's infostore delta with ours.
		now := time.Now().UnixNano()
		if reply.Delta != nil {
			delta := &infoStore{}
			if err := gob.NewDecoder(bytes.NewBuffer(reply.Delta)).Decode(delta); err != nil {
				return util.Errorf("infostore could not be decoded: %s", err)
			}
			if delta.infoCount() > 0 {
				if log.V(1) {
					log.Infof("gossip: received %s", delta)
				} else {
					log.Infof("gossip: received %d info(s) from %s", delta.infoCount(), c.addr)
				}
			}
			g.mu.Lock()
			c.peerID = delta.NodeID
			g.outgoing.addNode(c.peerID)
			freshCount := g.is.combine(delta)
			if freshCount > 0 {
				c.lastFresh = now
			}
			remoteMaxSeq = delta.MaxSeq

			// If we have the sentinel gossip, we're considered connected.
			g.checkHasConnected()
			g.mu.Unlock()
		}

		// Check whether this outgoing client is duplicating work already
		// being done by an incoming client. To avoid mutual shutdown, we
		// only shut down our client if our node ID is less than the peer's.
		if g.hasIncoming(c.peerID) && nodeID < c.peerID {
			return util.Errorf("stopping outgoing client %d @ %s; already have incoming", c.peerID, c.addr)
		}

		// Check whether the peer node is too boring--disconnect if yes.
		if nodeID != c.peerID && (now-c.lastFresh) > int64(maxWaitForNewGossip) {
			return util.Errorf("peer is too boring")
		}
	}
}
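// client.gossip above serializes the infostore delta with the standard
// library's encoding/gob package. The sketch below shows only that
// encode/decode round trip; the delta struct here is a stand-in for the
// real infoStore type and is not part of the original code.
package main

import (
	"bytes"
	"encoding/gob"
	"fmt"
	"log"
)

// delta is an illustrative payload with exported fields so gob can encode it.
type delta struct {
	NodeID int32
	MaxSeq int64
	Infos  map[string]string
}

func main() {
	out := delta{NodeID: 1, MaxSeq: 42, Infos: map[string]string{"sentinel": "ok"}}

	// Encode, as the client does before placing the bytes into the request.
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(out); err != nil {
		log.Fatalf("delta could not be encoded: %s", err)
	}

	// Decode the received bytes back into a struct, mirroring the reply path.
	var in delta
	if err := gob.NewDecoder(bytes.NewBuffer(buf.Bytes())).Decode(&in); err != nil {
		log.Fatalf("delta could not be decoded: %s", err)
	}
	fmt.Printf("round-tripped %d info(s) up to seq %d\n", len(in.Infos), in.MaxSeq)
}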