Example #1
// maybeSignalStatusChangeLocked checks whether gossip should transition its
// internal state from connected to stalled or vice versa.
func (g *Gossip) maybeSignalStatusChangeLocked() {
	orphaned := g.outgoing.len()+g.mu.incoming.len() == 0
	stalled := orphaned || g.mu.is.getInfo(KeySentinel) == nil
	if stalled {
		// We employ the stalled boolean to avoid filling logs with warnings.
		if !g.stalled {
			log.Eventf(g.ctx, "now stalled")
			if orphaned {
				if len(g.resolvers) == 0 {
					log.Warningf(g.ctx, "no resolvers found; use --join to specify a connected node")
				} else {
					log.Warningf(g.ctx, "no incoming or outgoing connections")
				}
			} else if len(g.resolversTried) == len(g.resolvers) {
				log.Warningf(g.ctx, "first range unavailable; resolvers exhausted")
			} else {
				log.Warningf(g.ctx, "first range unavailable; trying remaining resolvers")
			}
		}
		if len(g.resolvers) > 0 {
			g.signalStalledLocked()
		}
	} else {
		if g.stalled {
			log.Eventf(g.ctx, "connected")
			log.Infof(g.ctx, "node has connected to cluster via gossip")
			g.signalConnectedLocked()
		}
		g.maybeCleanupBootstrapAddressesLocked()
	}
	g.stalled = stalled
}
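signalStalledLocked and signalConnectedLocked are not part of this excerpt. Since bootstrap() in Example #3 blocks on g.stalledCh, a plausible minimal sketch of the stall signal is a non-blocking send on that channel, mirroring the bq.incoming pattern in Example #2; this is an assumption about the helper, not its actual implementation.

// signalStalledLocked wakes the bootstrap loop. The non-blocking send
// assumes stalledCh is a buffered channel, e.g. make(chan struct{}, 1);
// if a wakeup is already pending, there is no need to signal again.
func (g *Gossip) signalStalledLocked() {
	select {
	case g.stalledCh <- struct{}{}:
	default:
	}
}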
Example #2
// addInternal adds the replica to the queue with the specified priority.
// If the replica is already queued, the existing priority is updated
// instead. The caller must hold the queue lock.
func (bq *baseQueue) addInternal(repl *Replica, should bool, priority float64) (bool, error) {
	if bq.mu.stopped {
		return false, errQueueStopped
	}

	if bq.mu.disabled {
		log.Event(bq.ctx, "queue disabled")
		return false, errQueueDisabled
	}

	if !repl.IsInitialized() {
		// We checked this above in MaybeAdd(), but we need to check it
		// again for Add().
		return false, errors.New("replica not initialized")
	}

	// If the replica is currently in purgatory, don't re-add it.
	if _, ok := bq.mu.purgatory[repl.RangeID]; ok {
		return false, nil
	}

	item, ok := bq.mu.replicas[repl.RangeID]
	if !should {
		if ok {
			log.Eventf(bq.ctx, "%s: removing", item.value)
			bq.remove(item)
		}
		return false, errReplicaNotAddable
	} else if ok {
		if item.priority != priority {
			log.Eventf(bq.ctx, "%s: updating priority: %0.3f -> %0.3f",
				repl, item.priority, priority)
		}
		// Replica has already been added; update priority.
		bq.mu.priorityQ.update(item, priority)
		return false, nil
	}

	log.VEventf(3, bq.ctx, "%s: adding: priority=%0.3f", repl, priority)
	item = &replicaItem{value: repl.RangeID, priority: priority}
	heap.Push(&bq.mu.priorityQ, item)
	bq.mu.replicas[repl.RangeID] = item

	// If adding this replica has pushed the queue past its maximum size,
	// remove the lowest priority element.
	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
		bq.remove(bq.mu.priorityQ[pqLen-1])
	}
	// Signal the processLoop that a replica has been added.
	select {
	case bq.incoming <- struct{}{}:
	default:
		// No need to signal again.
	}
	return true, nil
}
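The priorityQ.update call above implies a heap-backed queue whose items remember their position, so a priority change can be repaired in place with heap.Fix. The concrete priorityQueue type is not part of this excerpt; the following is a minimal sketch of that pattern using container/heap, in which the index field and the method bodies are assumptions rather than the actual implementation.

// replicaItem is a queue entry; index is maintained by the heap methods.
type replicaItem struct {
	value    roachpb.RangeID
	priority float64
	index    int // position in the heap slice
}

// priorityQueue implements heap.Interface, ordered by descending priority.
type priorityQueue []*replicaItem

func (pq priorityQueue) Len() int           { return len(pq) }
func (pq priorityQueue) Less(i, j int) bool { return pq[i].priority > pq[j].priority }
func (pq priorityQueue) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].index, pq[j].index = i, j
}

func (pq *priorityQueue) Push(x interface{}) {
	item := x.(*replicaItem)
	item.index = len(*pq)
	*pq = append(*pq, item)
}

func (pq *priorityQueue) Pop() interface{} {
	old := *pq
	n := len(old)
	item := old[n-1]
	*pq = old[:n-1]
	return item
}

// update changes an item's priority and restores the heap invariant.
func (pq *priorityQueue) update(item *replicaItem, priority float64) {
	item.priority = priority
	heap.Fix(pq, item.index)
}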
Example #3
// bootstrap connects the node to the gossip network. Bootstrapping
// commences whenever there are no connected clients or the sentinel
// gossip info is unavailable. After a successful bootstrap
// connection, this method blocks on the stalled condvar, which is
// signaled whenever gossip network connectivity has been lost and
// re-bootstrapping is required.
func (g *Gossip) bootstrap() {
	g.server.stopper.RunWorker(func() {
		ctx := log.WithLogTag(g.ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(func() {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddress(); addr != nil {
						g.startClient(addr, g.mu.is.NodeID)
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// break
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}
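The bootstrapTimer.Read = true line above is specific to CockroachDB's timeutil.Timer wrapper, which apparently needs to know whether its channel was received from so that a later Reset is safe. With only the standard library, the simplest equivalent of that pause step is a hypothetical helper that allocates a fresh timer per wait:

// waitOrStop sleeps for interval, returning false if stopCh fires first.
// Sketch only; the name and shape of this helper are illustrative and
// not taken from the gossip package.
func waitOrStop(interval time.Duration, stopCh <-chan struct{}) bool {
	timer := time.NewTimer(interval)
	defer timer.Stop()
	select {
	case <-timer.C:
		return true
	case <-stopCh:
		return false
	}
}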
Example #4
// maybeAddBootstrapAddress adds the specified address to the list of
// bootstrap addresses if not already present. Returns whether a new
// bootstrap address was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddBootstrapAddress(addr util.UnresolvedAddr) bool {
	if _, ok := g.bootstrapAddrs[addr]; !ok {
		g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr)
		g.bootstrapAddrs[addr] = struct{}{}
		log.Eventf(g.ctx, "add bootstrap %s", addr)
		return true
	}
	return false
}
Example #5
// removeClient removes the specified client. Called when a client
// disconnects.
func (g *Gossip) removeClient(target *client) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	for i, candidate := range g.clientsMu.clients {
		if candidate == target {
			log.Eventf(g.ctx, "client %s disconnected", candidate.addr)
			g.clientsMu.clients = append(g.clientsMu.clients[:i], g.clientsMu.clients[i+1:]...)
			delete(g.bootstrapping, candidate.addr.String())
			g.outgoing.removeNode(candidate.peerID)
			break
		}
	}
}
Example #6
// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with a hop count exceeding the MaxHops
// threshold. If the number of outgoing clients doesn't exceed
// maxPeers(), a new gossip client is connected to a randomly selected
// peer beyond the MaxHops threshold. Otherwise, the least useful peer
// node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled condition variable.
func (g *Gossip) manage() {
	g.server.stopper.RunWorker(func() {
		cullTicker := time.NewTicker(g.jitteredInterval(g.cullInterval))
		stallTicker := time.NewTicker(g.jitteredInterval(g.stallInterval))
		defer cullTicker.Stop()
		defer stallTicker.Stop()
		for {
			select {
			case <-g.server.stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(c)
			case nodeID := <-g.tighten:
				g.tightenNetwork(nodeID)
			case <-cullTicker.C:
				func() {
					g.mu.Lock()
					if !g.outgoing.hasSpace() {
						leastUsefulID := g.mu.is.leastUseful(g.outgoing)

						if c := g.findClient(func(c *client) bool {
							return c.peerID == leastUsefulID
						}); c != nil {
							if log.V(1) {
								log.Infof(g.ctx, "closing least useful client %+v to tighten network graph", c)
							}
							log.Eventf(g.ctx, "culling %s", c.addr)
							c.close()

							// After releasing the lock, block until the client disconnects.
							defer func() {
								g.doDisconnected(<-g.disconnected)
							}()
						} else {
							if log.V(1) {
								g.clientsMu.Lock()
								log.Infof(g.ctx, "couldn't find least useful client among %+v", g.clientsMu.clients)
								g.clientsMu.Unlock()
							}
						}
					}
					g.mu.Unlock()
				}()
			case <-stallTicker.C:
				g.mu.Lock()
				g.maybeSignalStatusChangeLocked()
				g.mu.Unlock()
			}
		}
	})
}
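jitteredInterval is not included in this excerpt; its purpose is to spread the cull and stall ticker periods so that nodes do not fire in lockstep. A plausible sketch using math/rand, where the 0.75–1.25 factor range is an assumption:

// jitteredInterval returns a duration of roughly [0.75, 1.25) times the
// given interval. Sketch only; the exact jitter range is assumed.
func (g *Gossip) jitteredInterval(interval time.Duration) time.Duration {
	return time.Duration(float64(interval) * (0.75 + 0.5*rand.Float64()))
}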
Example #7
// maybeAddResolver creates and adds a resolver for the specified
// address if one does not already exist. Returns whether a new
// resolver was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddResolver(addr util.UnresolvedAddr) bool {
	if _, ok := g.resolverAddrs[addr]; !ok {
		r, err := resolver.NewResolverFromUnresolvedAddr(addr)
		if err != nil {
			log.Warningf(g.ctx, "bad address %s: %s", addr, err)
			return false
		}
		g.resolvers = append(g.resolvers, r)
		g.resolverAddrs[addr] = r
		log.Eventf(g.ctx, "add resolver %s", r)
		return true
	}
	return false
}
Example #8
// startClient launches a new client connected to the remote address.
// The client is added to the outgoing address set and launched in
// a goroutine.
func (g *Gossip) startClient(addr net.Addr, nodeID roachpb.NodeID) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	breaker, ok := g.clientsMu.breakers[addr.String()]
	if !ok {
		breaker = g.rpcContext.NewBreaker()
		g.clientsMu.breakers[addr.String()] = breaker
	}

	log.Eventf(g.ctx, "starting new client to %s", addr)
	c := newClient(g.ctx, addr, g.serverMetrics)
	g.clientsMu.clients = append(g.clientsMu.clients, c)
	c.start(g, g.disconnected, g.rpcContext, g.server.stopper, nodeID, breaker)
}
Example #9
// tightenNetwork "tightens" the network by starting a new gossip
// client to the most distant node as measured in required gossip hops
// to propagate info from the distant node to this node.
func (g *Gossip) tightenNetwork(distantNodeID roachpb.NodeID) {
	g.mu.Lock()
	defer g.mu.Unlock()
	if g.outgoing.hasSpace() {
		if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil {
			log.Errorf(g.ctx, "node %d: unable to get address for node %d: %s",
				g.mu.is.NodeID, distantNodeID, err)
		} else {
			log.Infof(g.ctx, "node %d: starting client to distant node %d to tighten network graph",
				g.mu.is.NodeID, distantNodeID)
			log.Eventf(g.ctx, "tightening network with new client to %s", nodeAddr)
			g.startClient(nodeAddr, g.mu.is.NodeID)
		}
	}
}