Ejemplo n.º 1
0
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			Tag:         "check cluster initialization",
			Backoff:     5 * time.Second,  // first backoff at 5s
			MaxBackoff:  60 * time.Second, // max backoff is 60s
			Constant:    2,                // doubles
			MaxAttempts: 0,                // indefinite retries
			Stopper:     stopper,          // stop no matter what on stopper
		}
		// will never error because infinite retries
		_ = retry.WithBackoff(retryOptions, func() (retry.Status, error) {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				return retry.Break, nil
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if g.triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
			return retry.Continue, nil
		})
	})
}
Ejemplo n.º 2
0
// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			g.mu.Lock()
			if g.closed {
				g.mu.Unlock()
				return
			}
			// Check whether or not we need bootstrap.
			haveClients := g.outgoing.len() > 0
			haveSentinel := g.is.getInfo(KeySentinel) != nil
			if !haveClients || !haveSentinel {
				// Try to get another bootstrap address from the resolvers.
				if addr := g.getNextBootstrapAddress(); addr != nil {
					g.startClient(addr, g.bsRPCContext, stopper)
				}
			}
			g.mu.Unlock()

			// Block until we need bootstrapping again.
			select {
			case <-g.stalled:
				// continue
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Ejemplo n.º 3
0
func (e *eventDemux) start(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case event := <-e.events:
				switch event := event.(type) {
				case *EventLeaderElection:
					e.LeaderElection <- event

				case *EventCommandCommitted:
					e.CommandCommitted <- event

				case *EventMembershipChangeCommitted:
					e.MembershipChangeCommitted <- event

				default:
					panic(fmt.Sprintf("got unknown event type %T", event))
				}

			case <-stopper.ShouldStop():
				close(e.CommandCommitted)
				close(e.MembershipChangeCommitted)
				close(e.LeaderElection)
				return
			}
		}
	})
}
Ejemplo n.º 4
0
// maybeWarnAboutInit looks for signs indicating a cluster which
// hasn't been initialized and warns. There's no absolutely sure way
// to determine whether the current node is simply waiting to be
// bootstrapped to an existing cluster vs. the operator having failed
// to initialize the cluster via the "cockroach init" command, so
// we can only warn.
//
// This method checks whether all gossip bootstrap hosts are
// connected, and whether the node itself is a bootstrap host, but
// there is still no sentinel gossip.
func (g *Gossip) maybeWarnAboutInit(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Wait 5s before first check.
		select {
		case <-stopper.ShouldStop():
			return
		case <-time.After(5 * time.Second):
		}
		retryOptions := retry.Options{
			InitialBackoff: 5 * time.Second,  // first backoff at 5s
			MaxBackoff:     60 * time.Second, // max backoff is 60s
			Multiplier:     2,                // doubles
			Stopper:        stopper,          // stop no matter what on stopper
		}
		// will never error because infinite retries
		for r := retry.Start(retryOptions); r.Next(); {
			g.mu.Lock()
			hasSentinel := g.is.getInfo(KeySentinel) != nil
			g.mu.Unlock()
			// If we have the sentinel, exit the retry loop.
			if hasSentinel {
				break
			}
			// Otherwise, if all bootstrap hosts are connected, warn.
			if g.triedAll {
				log.Warningf("connected to gossip but missing sentinel. Has the cluster been initialized? " +
					"Use \"cockroach init\" to initialize.")
			}
		}
	})
}
Ejemplo n.º 5
0
// scanLoop loops endlessly, scanning through ranges available via
// the range set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *rangeScanner) scanLoop(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		start := time.Now()
		stats := &storeStats{}

		for {
			if rs.ranges.EstimatedCount() == 0 {
				// Just wait without processing any range.
				if rs.waitAndProcess(start, clock, stopper, stats, nil) {
					break
				}
			} else {
				shouldStop := true
				rs.ranges.Visit(func(rng *Range) bool {
					shouldStop = rs.waitAndProcess(start, clock, stopper, stats, rng)
					return !shouldStop
				})
				if shouldStop {
					break
				}
			}

			if !stopper.StartTask() {
				// Exit the loop.
				break
			}

			// We're done with the iteration.
			// Store the most recent scan results in the scanner's stats.
			atomic.StorePointer(&rs.stats, unsafe.Pointer(stats))
			stats = &storeStats{}
			if rs.scanFn != nil {
				rs.scanFn()
			}
			// Increment iteration count.
			rs.completedScan.L.Lock()
			rs.count++
			rs.total += time.Now().Sub(start)
			rs.completedScan.Broadcast()
			rs.completedScan.L.Unlock()
			if log.V(6) {
				log.Infof("reset range scan iteration")
			}

			// Reset iteration and start time.
			start = time.Now()
			stopper.FinishTask()
		}
	})
}
Ejemplo n.º 6
0
// start runs the storage loop in a goroutine.
func (w *writeTask) start(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			var request *writeRequest
			select {
			case <-w.ready:
				continue
			case <-stopper.ShouldStop():
				return
			case request = <-w.in:
			}
			if log.V(6) {
				log.Infof("writeTask got request %#v", *request)
			}
			response := &writeResponse{make(map[proto.RaftID]*groupWriteResponse)}

			for groupID, groupReq := range request.groups {
				group := w.storage.GroupStorage(groupID)
				if group == nil {
					if log.V(4) {
						log.Infof("dropping write to group %v", groupID)
					}
					continue
				}
				groupResp := &groupWriteResponse{raftpb.HardState{}, -1, -1, groupReq.entries}
				response.groups[groupID] = groupResp
				if !raft.IsEmptyHardState(groupReq.state) {
					err := group.SetHardState(groupReq.state)
					if err != nil {
						panic(err) // TODO(bdarnell): mark this node dead on storage errors
					}
					groupResp.state = groupReq.state
				}
				if !raft.IsEmptySnap(groupReq.snapshot) {
					err := group.ApplySnapshot(groupReq.snapshot)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
				if len(groupReq.entries) > 0 {
					err := group.Append(groupReq.entries)
					if err != nil {
						panic(err) // TODO(bdarnell)
					}
				}
			}
			w.out <- response
		}
	})
}
Ejemplo n.º 7
0
// startGossip loops on a periodic ticker to gossip node-related
// information. Starts a goroutine to loop until the node is closed.
func (n *Node) startGossip(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		ticker := time.NewTicker(gossipInterval)
		defer ticker.Stop()
		n.gossipCapacities() // one-off run before going to sleep
		for {
			select {
			case <-ticker.C:
				n.gossipCapacities()
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Ejemplo n.º 8
0
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding maxToleratedHops()
// threshold. If the number of outgoing clients doesn't exceed
// MaxPeers, a new gossip client is connected to a randomly selected
// peer beyond maxToleratedHops threshold. Otherwise, the least useful
// peer node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled conditional variable.
func (g *Gossip) manage(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Loop until closed and there are no remaining outgoing connections.
		for {
			select {
			case <-stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(stopper, c)
			case <-time.After(g.jitteredGossipInterval()):
				g.doCheckTimeout(stopper)
			}
		}
	})
}
Ejemplo n.º 9
0
// startPublishStatuses starts a loop which periodically instructs each store to
// publish its current status to the event feed.
func (n *Node) startPublishStatuses(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Publish status at the same frequency as metrics are collected.
		ticker := time.NewTicker(publishStatusInterval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				err := n.publishStoreStatuses()
				if err != nil {
					log.Error(err)
				}
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
Ejemplo n.º 10
0
// scanLoop loops endlessly, scanning through ranges available via
// the range set, or until the scanner is stopped. The iteration
// is paced to complete a full scan in approximately the scan interval.
func (rs *rangeScanner) scanLoop(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		start := time.Now()

		for {
			if rs.ranges.EstimatedCount() == 0 {
				// Just wait without processing any range.
				if rs.waitAndProcess(start, clock, stopper, nil) {
					break
				}
			} else {
				shouldStop := true
				rs.ranges.Visit(func(rng *Range) bool {
					shouldStop = rs.waitAndProcess(start, clock, stopper, rng)
					return !shouldStop
				})
				if shouldStop {
					break
				}
			}

			if !stopper.StartTask() {
				// Exit the loop.
				break
			}

			// Increment iteration count.
			rs.completedScan.L.Lock()
			rs.count++
			rs.total += time.Now().Sub(start)
			rs.completedScan.Broadcast()
			rs.completedScan.L.Unlock()
			if log.V(6) {
				log.Infof("reset range scan iteration")
			}

			// Reset iteration and start time.
			start = time.Now()
			stopper.FinishTask()
		}
	})
}
Ejemplo n.º 11
0
func (tq *testQueue) Start(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		for {
			select {
			case <-time.After(1 * time.Millisecond):
				tq.Lock()
				if !tq.disabled && len(tq.ranges) > 0 {
					tq.ranges = tq.ranges[1:]
					tq.processed++
				}
				tq.Unlock()
			case <-stopper.ShouldStop():
				tq.Lock()
				tq.done = true
				tq.Unlock()
				return
			}
		}
	})
}
Ejemplo n.º 12
0
// start dials the remote addr and commences gossip once connected.
// Upon exit, signals client is done by pushing it onto the done
// channel. If the client experienced an error, its err field will
// be set. This method starts client processing in a goroutine and
// returns immediately.
func (c *client) start(g *Gossip, done chan *client, context *rpc.Context, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		c.rpcClient = rpc.NewClient(c.addr, nil, context)
		select {
		case <-c.rpcClient.Ready:
			// Success!
		case <-c.rpcClient.Closed:
			c.err = util.Errorf("gossip client failed to connect")
			done <- c
			return
		}

		// Start gossipping and wait for disconnect or error.
		c.lastFresh = time.Now().UnixNano()
		c.err = c.gossip(g, stopper)
		if c.err != nil {
			c.rpcClient.Close()
		}
		done <- c
	})
}
Ejemplo n.º 13
0
// start initializes the infostore with the rpc server address and
// then begins processing connecting clients in an infinite select
// loop via goroutine. Periodically, clients connected and awaiting
// the next round of gossip are awoken via the conditional variable.
func (s *server) start(rpcServer *rpc.Server, stopper *util.Stopper) {
	s.is.NodeAddr = rpcServer.Addr()
	if err := rpcServer.RegisterName("Gossip", s); err != nil {
		log.Fatalf("unable to register gossip service with RPC server: %s", err)
	}
	rpcServer.AddCloseCallback(s.onClose)

	stopper.RunWorker(func() {
		// Periodically wakeup blocked client gossip requests.
		for {
			select {
			case <-time.After(s.jitteredGossipInterval()):
				// Wakeup all blocked gossip requests.
				s.ready.Broadcast()
			case <-stopper.ShouldStop():
				s.stop()
				return
			}
		}
	})
}
Ejemplo n.º 14
0
// processLoop processes the entries in the queue until the provided
// stopper signals exit.
//
// TODO(spencer): current load should factor into range processing timer.
func (bq *baseQueue) processLoop(clock *hlc.Clock, stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// nextTime is initially nil; we don't start any timers until the queue
		// becomes non-empty.
		var nextTime <-chan time.Time

		for {
			select {
			// Incoming signal sets the next time to process if there were previously
			// no ranges in the queue.
			case <-bq.incoming:
				if nextTime == nil {
					// When the first range is added, wake up immediately. This is
					// mainly to facilitate testing without unnecessary sleeps.
					nextTime = time.After(0 * time.Millisecond)
				}
			// Process ranges as the timer expires.
			case <-nextTime:
				bq.processOne(clock, stopper)
				if bq.Length() == 0 {
					nextTime = nil
				} else {
					nextTime = time.After(bq.impl.timer())
				}

			// Exit on stopper.
			case <-stopper.ShouldStop():
				bq.Lock()
				bq.ranges = map[proto.RaftID]*rangeItem{}
				bq.priorityQ = nil
				bq.Unlock()
				return
			}
		}
	})
}
Ejemplo n.º 15
0
// startStoresScanner will walk through all the stores in the node every
// ctx.ScanInterval and store the status in the db.
func (n *Node) startStoresScanner(stopper *util.Stopper) {
	stopper.RunWorker(func() {
		// Pick the smaller of the two intervals.
		var minScanInterval time.Duration
		if n.ctx.ScanInterval <= n.ctx.ScanMaxIdleTime || n.ctx.ScanMaxIdleTime == 0 {
			minScanInterval = n.ctx.ScanInterval
		} else {
			minScanInterval = n.ctx.ScanMaxIdleTime
		}

		// TODO(bram): The number of stores is small. The node status should be
		// updated whenever a store status is updated.
		for interval := time.Duration(0); true; interval = minScanInterval {
			select {
			case <-time.After(interval):
				if !stopper.StartTask() {
					continue
				}
				// Walk through all the stores on this node.
				var rangeCount, leaderRangeCount, replicatedRangeCount, availableRangeCount int32
				stats := &engine.MVCCStats{}
				accessedStoreIDs := []proto.StoreID{}
				// will never error because `return nil` below
				_ = n.lSender.VisitStores(func(store *storage.Store) error {
					storeStatus, err := store.GetStatus()
					if err != nil {
						log.Error(err)
						return nil
					}
					if storeStatus == nil {
						// The store scanner hasn't run on this node yet.
						return nil
					}
					accessedStoreIDs = append(accessedStoreIDs, store.Ident.StoreID)
					rangeCount += storeStatus.RangeCount
					leaderRangeCount += storeStatus.LeaderRangeCount
					replicatedRangeCount += storeStatus.ReplicatedRangeCount
					availableRangeCount += storeStatus.AvailableRangeCount
					stats.Add(&storeStatus.Stats)
					return nil
				})

				// Store the combined stats in the db.
				now := n.ctx.Clock.Now().WallTime
				status := &NodeStatus{
					Desc:                 n.Descriptor,
					StoreIDs:             accessedStoreIDs,
					UpdatedAt:            now,
					StartedAt:            n.startedAt,
					RangeCount:           rangeCount,
					Stats:                *stats,
					LeaderRangeCount:     leaderRangeCount,
					ReplicatedRangeCount: replicatedRangeCount,
					AvailableRangeCount:  availableRangeCount,
				}
				key := keys.NodeStatusKey(int32(n.Descriptor.NodeID))
				if err := n.ctx.DB.Put(key, status); err != nil {
					log.Error(err)
				}
				// Increment iteration count.
				n.completedScan.L.Lock()
				n.scanCount++
				n.completedScan.Broadcast()
				n.completedScan.L.Unlock()
				if log.V(6) {
					log.Infof("store scan iteration completed")
				}
				stopper.FinishTask()
			case <-stopper.ShouldStop():
				// Exit the loop.
				return
			}
		}
	})
}
Ejemplo n.º 16
0
// NewServer creates a Server from a server.Context.
func NewServer(ctx *Context, stopper *util.Stopper) (*Server, error) {
	if ctx == nil {
		return nil, util.Error("ctx must not be null")
	}

	addr := ctx.Addr
	_, err := net.ResolveTCPAddr("tcp", addr)
	if err != nil {
		return nil, util.Errorf("unable to resolve RPC address %q: %v", addr, err)
	}

	if ctx.Insecure {
		log.Warning("running in insecure mode, this is strongly discouraged. See --insecure and --certs.")
	}
	// Try loading the TLS configs before anything else.
	if _, err := ctx.GetServerTLSConfig(); err != nil {
		return nil, err
	}
	if _, err := ctx.GetClientTLSConfig(); err != nil {
		return nil, err
	}

	s := &Server{
		ctx:     ctx,
		mux:     http.NewServeMux(),
		clock:   hlc.NewClock(hlc.UnixNano),
		stopper: stopper,
	}
	s.clock.SetMaxOffset(ctx.MaxOffset)

	rpcContext := rpc.NewContext(&ctx.Context, s.clock, stopper)
	stopper.RunWorker(func() {
		rpcContext.RemoteClocks.MonitorRemoteOffsets(stopper)
	})

	s.rpc = rpc.NewServer(util.MakeUnresolvedAddr("tcp", addr), rpcContext)
	s.stopper.AddCloser(s.rpc)
	s.gossip = gossip.New(rpcContext, s.ctx.GossipInterval, s.ctx.GossipBootstrapResolvers)

	ds := kv.NewDistSender(&kv.DistSenderContext{Clock: s.clock}, s.gossip)
	sender := kv.NewTxnCoordSender(ds, s.clock, ctx.Linearizable, s.stopper)
	if s.db, err = client.Open("//root@", client.SenderOpt(sender)); err != nil {
		return nil, err
	}

	s.raftTransport, err = newRPCTransport(s.gossip, s.rpc, rpcContext)
	if err != nil {
		return nil, err
	}
	s.stopper.AddCloser(s.raftTransport)

	s.kvDB = kv.NewDBServer(&s.ctx.Context, sender)
	if s.ctx.ExperimentalRPCServer {
		if err = s.kvDB.RegisterRPC(s.rpc); err != nil {
			return nil, err
		}
	}

	//
	s.sqlServer = sqlserver.NewServer(s.db)

	// TODO(bdarnell): make StoreConfig configurable.
	nCtx := storage.StoreContext{
		Clock:           s.clock,
		DB:              s.db,
		Gossip:          s.gossip,
		Transport:       s.raftTransport,
		ScanInterval:    s.ctx.ScanInterval,
		ScanMaxIdleTime: s.ctx.ScanMaxIdleTime,
		EventFeed:       &util.Feed{},
	}
	s.node = NewNode(nCtx)
	s.admin = newAdminServer(s.db, s.stopper)
	s.status = newStatusServer(s.db, s.gossip)
	s.tsDB = ts.NewDB(s.db)
	s.tsServer = ts.NewServer(s.tsDB)
	s.stopper.AddCloser(nCtx.EventFeed)

	return s, nil
}