Example #1
// antiEntropy is a long-running method used to perform anti-entropy
// between local and remote state.
func (l *localState) antiEntropy(shutdownCh chan struct{}) {
SYNC:
	// Sync our state with the servers
	for {
		err := l.setSyncState()
		if err == nil {
			break
		}
		l.logger.Printf("[ERR] agent: failed to sync remote state: %v", err)
		select {
		case <-l.consulCh:
			// Stagger the retry on leader election to avoid a thundering herd
			select {
			case <-time.After(lib.RandomStagger(aeScale(syncStaggerIntv, len(l.iface.LANMembers())))):
			case <-shutdownCh:
				return
			}
		case <-time.After(syncRetryIntv + lib.RandomStagger(aeScale(syncRetryIntv, len(l.iface.LANMembers())))):
		case <-shutdownCh:
			return
		}
	}

	// Force-trigger AE to pick up any changes
	l.changeMade()

	// Schedule the next full sync, with a random stagger
	aeIntv := aeScale(l.config.AEInterval, len(l.iface.LANMembers()))
	aeIntv = aeIntv + lib.RandomStagger(aeIntv)
	aeTimer := time.After(aeIntv)

	// Wait for sync events
	for {
		select {
		case <-aeTimer:
			goto SYNC
		case <-l.triggerCh:
			// Skip the sync if we are paused
			if l.isPaused() {
				continue
			}
			if err := l.syncChanges(); err != nil {
				l.logger.Printf("[ERR] agent: failed to sync changes: %v", err)
			}
		case <-shutdownCh:
			return
		}
	}
}
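
Both loops above lean on two helpers that are defined elsewhere in the codebase. A minimal sketch of what they likely do, consistent with their use in this example (the exact implementations, and the 128-node threshold in aeScale, are assumptions):

package lib

import (
	"math"
	"math/rand"
	"time"
)

// RandomStagger returns a uniformly random duration in [0, intv).
// Adding it to a base interval keeps many nodes from firing their
// periodic work at the same instant.
func RandomStagger(intv time.Duration) time.Duration {
	if intv == 0 {
		return 0
	}
	return time.Duration(uint64(rand.Int63()) % uint64(intv))
}

// aeScaleThreshold is the cluster size at which anti-entropy
// intervals begin to stretch (assumed value).
const aeScaleThreshold = 128

// aeScale stretches interval logarithmically with the cluster size n
// so the aggregate anti-entropy load on the servers stays roughly flat.
func aeScale(interval time.Duration, n int) time.Duration {
	if n <= aeScaleThreshold {
		return interval
	}
	mult := math.Ceil(math.Log2(float64(n))-math.Log2(aeScaleThreshold)) + 1.0
	return time.Duration(mult) * interval
}
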
Example #2
// registerAndHeartbeat is a long-lived goroutine used to register the client
// and then start heartbeating to the server.
func (c *Client) registerAndHeartbeat() {
	// Register the node
	c.retryRegisterNode()

	// Start watching for node changes
	go c.watchNodeUpdates()

	// Set up the heartbeat timer. For the initial registration we want to
	// do this quickly, and extra quickly in development mode.
	var heartbeat <-chan time.Time
	if c.config.DevMode {
		heartbeat = time.After(0)
	} else {
		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
	}

	for {
		select {
		case <-heartbeat:
			if err := c.updateNodeStatus(); err != nil {
				heartbeat = time.After(c.retryIntv(registerRetryIntv))
			} else {
				c.heartbeatLock.Lock()
				heartbeat = time.After(c.heartbeatTTL)
				c.heartbeatLock.Unlock()
			}

		case <-c.shutdownCh:
			return
		}
	}
}
Example #3
// Run triggers periodic syncing of services and checks with Consul. This is
// a long-lived goroutine which is stopped during shutdown.
func (c *Syncer) Run() {
	sync := time.NewTimer(0)
	for {
		select {
		case <-sync.C:
			d := syncInterval - lib.RandomStagger(syncInterval/syncJitter)
			sync.Reset(d)

			if err := c.SyncServices(); err != nil {
				if c.consulAvailable {
					c.logger.Printf("[DEBUG] consul.syncer: error in syncing: %v", err)
				}
				c.consulAvailable = false
			} else {
				if !c.consulAvailable {
					c.logger.Printf("[DEBUG] consul.syncer: syncs succesful")
				}
				c.consulAvailable = true
			}
		case <-c.notifySyncCh:
			sync.Reset(syncInterval)
		case <-c.shutdownCh:
			c.Shutdown()
		case <-c.notifyShutdownCh:
			sync.Stop()
			c.logger.Printf("[INFO] consul.syncer: shutting down syncer ")
			return
		}
	}
}
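
Note that the jitter in this example is subtractive: each tick lands in (syncInterval - syncInterval/syncJitter, syncInterval], so syncs only ever run slightly early, never late. A toy illustration with assumed constants (the real values are defined elsewhere in the syncer package):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Assumed values for illustration only.
const (
	syncInterval = 30 * time.Second
	syncJitter   = 8 // jitter spans 1/8th of the interval
)

func main() {
	stagger := time.Duration(rand.Int63n(int64(syncInterval / syncJitter)))
	d := syncInterval - stagger
	fmt.Printf("next sync in %v (window %v to %v)\n",
		d, syncInterval-syncInterval/syncJitter, syncInterval)
}
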
Example #4
// blockingRPC is used for queries that need to wait for a
// minimum index. This is used to block and wait for changes.
func (s *Server) blockingRPC(opts *blockingOptions) error {
	var timeout *time.Timer
	var notifyCh chan struct{}
	var state *state.StateStore

	// Fast path non-blocking
	if opts.queryOpts.MinQueryIndex == 0 {
		goto RUN_QUERY
	}

	// Restrict the max query time, and ensure there is always one
	if opts.queryOpts.MaxQueryTime > maxQueryTime {
		opts.queryOpts.MaxQueryTime = maxQueryTime
	} else if opts.queryOpts.MaxQueryTime <= 0 {
		opts.queryOpts.MaxQueryTime = defaultQueryTime
	}

	// Apply a small amount of jitter to the request
	opts.queryOpts.MaxQueryTime += lib.RandomStagger(opts.queryOpts.MaxQueryTime / jitterFraction)

	// Setup a query timeout
	timeout = time.NewTimer(opts.queryOpts.MaxQueryTime)

	// Setup the notify channel
	notifyCh = make(chan struct{}, 1)

	// Ensure we tear down any watchers on return
	state = s.fsm.State()
	defer func() {
		timeout.Stop()
		state.StopWatch(opts.watch, notifyCh)
	}()

REGISTER_NOTIFY:
	// Register the notification channel. This may be done
	// multiple times if we have not reached the target wait index.
	state.Watch(opts.watch, notifyCh)

RUN_QUERY:
	// Update the query metadata
	s.setQueryMeta(opts.queryMeta)

	// Run the query function
	metrics.IncrCounter([]string{"nomad", "rpc", "query"}, 1)
	err := opts.run()

	// Check for minimum query time
	if err == nil && opts.queryOpts.MinQueryIndex > 0 && opts.queryMeta.Index <= opts.queryOpts.MinQueryIndex {
		select {
		case <-notifyCh:
			goto REGISTER_NOTIFY
		case <-timeout.C:
		}
	}
	return err
}
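
The jitter added to MaxQueryTime means clients whose blocking queries were issued at the same moment time out at slightly different moments, so they do not re-query the servers in lockstep. A worked example with assumed constants (the real jitterFraction and defaultQueryTime live in the server package):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Assumed values for illustration only.
const (
	defaultQueryTime = 300 * time.Second
	jitterFraction   = 16
)

func main() {
	wait := defaultQueryTime
	wait += time.Duration(rand.Int63n(int64(wait / jitterFraction)))
	fmt.Printf("this blocking query times out after %v\n", wait) // up to ~318.75s
}
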
Example #5
// registerAndHeartbeat is a long-lived goroutine used to register the client
// and then start heartbeating to the server.
func (c *Client) registerAndHeartbeat() {
	// Register the node
	c.retryRegisterNode()

	// Start watching for node changes
	go c.watchNodeUpdates()

	// Set up the heartbeat timer. For the initial registration we want to
	// do this quickly, and extra quickly in development mode.
	var heartbeat <-chan time.Time
	if c.config.DevMode {
		heartbeat = time.After(0)
	} else {
		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
	}

	for {
		select {
		case <-heartbeat:
			if err := c.updateNodeStatus(); err != nil {
				// The servers have changed such that this node has not been
				// registered before
				if strings.Contains(err.Error(), "node not found") {
					// Re-register the node
					c.logger.Printf("[INFO] client: re-registering node")
					c.retryRegisterNode()
					heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
				} else {
					c.logger.Printf("[ERR] client: heartbeating failed: %v", err)
					heartbeat = time.After(c.retryIntv(registerRetryIntv))
				}
			} else {
				c.heartbeatLock.Lock()
				heartbeat = time.After(c.heartbeatTTL)
				c.heartbeatLock.Unlock()
			}

		case <-c.shutdownCh:
			return
		}
	}
}
Example #6
// UpdateCheck is used to update the status of a check
func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) {
	l.Lock()
	defer l.Unlock()

	check, ok := l.checks[checkID]
	if !ok {
		return
	}

	// Update the critical time tracking (this doesn't cause a server update,
	// so we can always keep it up to date).
	if status == structs.HealthCritical {
		_, wasCritical := l.checkCriticalTime[checkID]
		if !wasCritical {
			l.checkCriticalTime[checkID] = time.Now()
		}
	} else {
		delete(l.checkCriticalTime, checkID)
	}

	// Do nothing if update is idempotent
	if check.Status == status && check.Output == output {
		return
	}

	// Defer a sync if the output has changed. This is an optimization around
	// frequent updates of output. Instead, we update the output internally,
	// and periodically do a write-back to the servers. If there is a status
	// change we do the write immediately.
	if l.config.CheckUpdateInterval > 0 && check.Status == status {
		check.Output = output
		if _, ok := l.deferCheck[checkID]; !ok {
			intv := time.Duration(uint64(l.config.CheckUpdateInterval)/2) + lib.RandomStagger(l.config.CheckUpdateInterval)
			deferSync := time.AfterFunc(intv, func() {
				l.Lock()
				if _, ok := l.checkStatus[checkID]; ok {
					l.checkStatus[checkID] = syncStatus{inSync: false}
					l.changeMade()
				}
				delete(l.deferCheck, checkID)
				l.Unlock()
			})
			l.deferCheck[checkID] = deferSync
		}
		return
	}

	// Update status and mark out of sync
	check.Status = status
	check.Output = output
	l.checkStatus[checkID] = syncStatus{inSync: false}
	l.changeMade()
}
Example #7
// resetHeartbeatTimer is used to reset the TTL of a heartbeat.
// This can be used for new heartbeats and existing ones.
func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) {
	s.heartbeatTimersLock.Lock()
	defer s.heartbeatTimersLock.Unlock()

	// Compute the target TTL value
	n := len(s.heartbeatTimers)
	ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n)
	ttl += lib.RandomStagger(ttl)

	// Reset the TTL
	s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace)
	return ttl, nil
}
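
lib.RateScaledInterval, used here and again in Examples #10, #14, and #21, picks a per-node interval that caps the cluster-wide rate of some action. A sketch consistent with its use here (the real implementation may differ in detail):

package lib

import "time"

// RateScaledInterval picks an interval for a periodic action such
// that n independent actors produce at most rate actions per second
// in aggregate, while never acting more often than min permits.
func RateScaledInterval(rate float64, min time.Duration, n int) time.Duration {
	interval := time.Duration(float64(time.Second) * float64(n) / rate)
	if interval < min {
		return min
	}
	return interval
}

With n heartbeats tracked, each TTL is then at least n/MaxHeartbeatsPerSecond seconds, so the servers never see more than the target aggregate rate; the RandomStagger added on top smears the resets within that window.
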
Example #8
// forward is used to forward an RPC call to a remote region or to the local
// leader. Returns a bool indicating whether forwarding was performed, as well
// as any error.
func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) {
	var firstCheck time.Time

	region := info.RequestRegion()
	if region == "" {
		return true, fmt.Errorf("missing target RPC")
	}

	// Handle region forwarding
	if region != s.config.Region {
		err := s.forwardRegion(region, method, args, reply)
		return true, err
	}

	// Check if we can allow a stale read
	if info.IsRead() && info.AllowStaleRead() {
		return false, nil
	}

CHECK_LEADER:
	// Find the leader
	isLeader, remoteServer := s.getLeader()

	// Handle the case we are the leader
	if isLeader {
		return false, nil
	}

	// Handle the case of a known leader
	if remoteServer != nil {
		err := s.forwardLeader(remoteServer, method, args, reply)
		return true, err
	}

	// Gate the request until there is a leader
	if firstCheck.IsZero() {
		firstCheck = time.Now()
	}
	if time.Since(firstCheck) < s.config.RPCHoldTimeout {
		jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
		select {
		case <-time.After(jitter):
			goto CHECK_LEADER
		case <-s.shutdownCh:
		}
	}

	// No leader found and hold time exceeded
	return true, structs.ErrNoLeader
}
Example #9
// setupAgent is used to start the agent and various interfaces
func (c *Command) setupAgent(config *Config, logOutput io.Writer) error {
	c.Ui.Output("Starting Nomad agent...")
	agent, err := NewAgent(config, logOutput)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error starting agent: %s", err))
		return err
	}
	c.agent = agent

	// Enable the SCADA integration
	if err := c.setupSCADA(config); err != nil {
		agent.Shutdown()
		c.Ui.Error(fmt.Sprintf("Error starting SCADA: %s", err))
		return err
	}

	// Setup the HTTP server
	http, err := NewHTTPServer(agent, config, logOutput)
	if err != nil {
		agent.Shutdown()
		c.Ui.Error(fmt.Sprintf("Error starting http server: %s", err))
		return err
	}
	c.httpServer = http

	// Setup update checking
	if !config.DisableUpdateCheck {
		version := config.Version
		if config.VersionPrerelease != "" {
			version += fmt.Sprintf("-%s", config.VersionPrerelease)
		}
		updateParams := &checkpoint.CheckParams{
			Product: "nomad",
			Version: version,
		}
		if !config.DisableAnonymousSignature {
			updateParams.SignatureFile = filepath.Join(config.DataDir, "checkpoint-signature")
		}

		// Schedule a periodic check with expected interval of 24 hours
		checkpoint.CheckInterval(updateParams, 24*time.Hour, c.checkpointResults)

		// Do an immediate check within the next 30 seconds
		go func() {
			time.Sleep(lib.RandomStagger(30 * time.Second))
			c.checkpointResults(checkpoint.Check(updateParams))
		}()
	}
	return nil
}
Example #10
// refreshServerRebalanceTimer is called only after m.rebalanceTimer expires.
func (m *Manager) refreshServerRebalanceTimer() time.Duration {
	l := m.getServerList()
	numConsulServers := len(l.servers)
	// Limit this connection's life based on the size (and health) of the
	// cluster.  Never rebalance a connection more frequently than
	// connReuseLowWatermarkDuration, and make sure we never exceed
	// clusterWideRebalanceConnsPerSec operations/s across numLANMembers.
	clusterWideRebalanceConnsPerSec := float64(numConsulServers * newRebalanceConnsPerSecPerServer)
	connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction)
	numLANMembers := m.clusterInfo.NumNodes()
	connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers)

	m.rebalanceTimer.Reset(connRebalanceTimeout)
	return connRebalanceTimeout
}
Example #11
// run is invoked in a dedicated goroutine and runs until Stop() is called
func (c *CheckDocker) run() {
	// Get the randomized initial pause time
	initialPauseTime := lib.RandomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s -c %s in container %s", initialPauseTime, c.Shell, c.Script, c.DockerContainerID)
	next := time.After(initialPauseTime)
	for {
		select {
		case <-next:
			c.check()
			next = time.After(c.Interval)
		case <-c.stopCh:
			return
		}
	}
}
Example #12
// run is invoked in a dedicated goroutine and runs until Stop() is called
func (c *CheckTCP) run() {
	// Get the randomized initial pause time
	initialPauseTime := lib.RandomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
	next := time.After(initialPauseTime)
	for {
		select {
		case <-next:
			c.check()
			next = time.After(c.Interval)
		case <-c.stopCh:
			return
		}
	}
}
Example #13
// run is invoked in a dedicated goroutine and runs until Stop() is called
func (r *CheckRunner) run() {
	// Get the randomized initial pause time
	initialPauseTime := lib.RandomStagger(r.check.Interval())
	r.logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, r.check.ID())
	next := time.NewTimer(initialPauseTime)
	for {
		select {
		case <-next.C:
			r.runCheck(r.check)
			next.Reset(r.check.Interval())
		case <-r.stopCh:
			next.Stop()
			return
		}
	}
}
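
Examples #11 through #13 are one pattern: sleep a random fraction of the interval before the first run, so checks registered at the same time do not all fire at the same time, then tick at the fixed interval. Distilled into a self-contained form (names and structure here are illustrative, not taken from any of the codebases above):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// runStaggered runs fn every interval, after an initial random pause
// in [0, interval) that spreads concurrent loops apart. It stops when
// stopCh is closed.
func runStaggered(interval time.Duration, fn func(), stopCh <-chan struct{}) {
	next := time.NewTimer(time.Duration(rand.Int63n(int64(interval))))
	defer next.Stop()
	for {
		select {
		case <-next.C:
			fn()
			next.Reset(interval)
		case <-stopCh:
			return
		}
	}
}

func main() {
	stop := make(chan struct{})
	go runStaggered(time.Second, func() { fmt.Println("check") }, stop)
	time.Sleep(3 * time.Second)
	close(stop)
}
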
Example #14
// sendCoordinate is a long-running loop that periodically sends our coordinate
// to the server. Closing the agent's shutdownCh will cause this to exit.
func (a *Agent) sendCoordinate() {
	for {
		rate := a.config.SyncCoordinateRateTarget
		min := a.config.SyncCoordinateIntervalMin
		intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
		intv = intv + lib.RandomStagger(intv)

		select {
		case <-time.After(intv):
			members := a.LANMembers()
			grok, err := consul.CanServersUnderstandProtocol(members, 3)
			if err != nil {
				a.logger.Printf("[ERR] agent: failed to check servers: %s", err)
				continue
			}
			if !grok {
				a.logger.Printf("[DEBUG] agent: skipping coordinate updates until servers are upgraded")
				continue
			}

			c, err := a.GetCoordinate()
			if err != nil {
				a.logger.Printf("[ERR] agent: failed to get coordinate: %s", err)
				continue
			}

			// TODO - Consider adding a distance check so we don't send
			// an update if the position hasn't changed by more than a
			// threshold.
			req := structs.CoordinateUpdateRequest{
				Datacenter:   a.config.Datacenter,
				Node:         a.config.NodeName,
				Coord:        c,
				WriteRequest: structs.WriteRequest{Token: a.config.ACLToken},
			}
			var reply struct{}
			if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
				a.logger.Printf("[ERR] agent: coordinate update error: %s", err)
				continue
			}
		case <-a.shutdownCh:
			return
		}
	}
}
Example #15
func (c *ConsulBackend) runEventDemuxer(shutdownCh ShutdownChannel, advertiseAddr string, activeFunc activeFunction, sealedFunc sealedFunction) {
	// Fire the reconcileTimer immediately upon starting the event demuxer
	reconcileTimer := time.NewTimer(0)
	defer reconcileTimer.Stop()

	// Schedule the first check. Consul TTL checks are passing by
	// default, so checkTimer does not need to fire immediately.
	checkTimer := time.NewTimer(c.checkDuration())
	defer checkTimer.Stop()

	// Use a reactor pattern to handle and dispatch events to singleton
	// goroutine handlers for execution.  It is not acceptable to drop
	// inbound events from Notify*().
	//
	// goroutines are dispatched if the demuxer can acquire a lock (via
	// an atomic CAS incr) on the handler.  Handlers are responsible for
	// deregistering themselves (atomic CAS decr).  Handlers and the
	// demuxer share a lock to synchronize information at the beginning
	// and end of a handler's life (or after a handler wakes up from
	// sleeping during a back-off/retry).
	var shutdown bool
	var checkLock int64
	var registeredServiceID string
	var serviceRegLock int64
shutdown:
	for {
		select {
		case <-c.notifyActiveCh:
			// Run reconcile immediately upon active state change notification
			reconcileTimer.Reset(0)
		case <-c.notifySealedCh:
			// Run check timer immediately upon a seal state change notification
			checkTimer.Reset(0)
		case <-reconcileTimer.C:
			// Unconditionally rearm the reconcileTimer
			reconcileTimer.Reset(reconcileTimeout - lib.RandomStagger(reconcileTimeout/checkJitterFactor))

			// Abort if service discovery is disabled or a
			// reconcile handler is already active
			if !c.disableRegistration && atomic.CompareAndSwapInt64(&serviceRegLock, 0, 1) {
				// Enter handler with serviceRegLock held
				go func() {
					defer atomic.CompareAndSwapInt64(&serviceRegLock, 1, 0)
					for !shutdown {
						serviceID, err := c.reconcileConsul(registeredServiceID, activeFunc, sealedFunc)
						if err != nil {
							c.logger.Printf("[WARN]: consul: reconcile unable to talk with Consul backend: %v", err)
							time.Sleep(consulRetryInterval)
							continue
						}

						c.serviceLock.Lock()
						defer c.serviceLock.Unlock()

						registeredServiceID = serviceID
						return
					}
				}()
			}
		case <-checkTimer.C:
			checkTimer.Reset(c.checkDuration())
			// Abort if service discovery is disabled or a
			// reconcile handler is active
			if !c.disableRegistration && atomic.CompareAndSwapInt64(&checkLock, 0, 1) {
				// Enter handler with checkLock held
				go func() {
					defer atomic.CompareAndSwapInt64(&checkLock, 1, 0)
					for !shutdown {
						sealed := sealedFunc()
						if err := c.runCheck(sealed); err != nil {
							c.logger.Printf("[WARN]: consul: check unable to talk with Consul backend: %v", err)
							time.Sleep(consulRetryInterval)
							continue
						}
						return
					}
				}()
			}
		case <-shutdownCh:
			c.logger.Printf("[INFO]: consul: Shutting down consul backend")
			shutdown = true
			break shutdown
		}
	}

	c.serviceLock.RLock()
	defer c.serviceLock.RUnlock()
	if err := c.client.Agent().ServiceDeregister(registeredServiceID); err != nil {
		c.logger.Printf("[WARN]: consul: service deregistration failed: %v", err)
	}
}
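
The CAS-guarded dispatch above is worth isolating: a timer can fire again while the previous handler is still sleeping in its retry loop, and the atomic compare-and-swap ensures at most one live handler goroutine per concern, with extra ticks simply dropped. A distilled, self-contained sketch (names are illustrative):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// dispatchOnce starts handler in a new goroutine only if no handler
// is currently running, using lock as an atomic guard (0 = free,
// 1 = held). The handler releases the guard when it returns.
func dispatchOnce(lock *int64, handler func()) bool {
	if !atomic.CompareAndSwapInt64(lock, 0, 1) {
		return false // a handler is still running; drop this tick
	}
	go func() {
		defer atomic.CompareAndSwapInt64(lock, 1, 0)
		handler()
	}()
	return true
}

func main() {
	var lock int64
	for i := 0; i < 3; i++ {
		started := dispatchOnce(&lock, func() { time.Sleep(time.Second) })
		fmt.Println("tick", i, "started handler:", started)
		time.Sleep(100 * time.Millisecond)
	}
	time.Sleep(time.Second)
}
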
Example #16
// setupAgent is used to start the agent and various interfaces
func (c *Command) setupAgent(config *Config, logOutput io.Writer, logWriter *logWriter) error {
	c.Ui.Output("Starting Consul agent...")
	agent, err := Create(config, logOutput)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error starting agent: %s", err))
		return err
	}
	c.agent = agent

	// Setup the RPC listener
	rpcAddr, err := config.ClientListener(config.Addresses.RPC, config.Ports.RPC)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Invalid RPC bind address: %s", err))
		return err
	}

	// Clear the domain socket file if it exists
	socketPath, isSocket := unixSocketAddr(config.Addresses.RPC)
	if isSocket {
		if _, err := os.Stat(socketPath); !os.IsNotExist(err) {
			agent.logger.Printf("[WARN] agent: Replacing socket %q", socketPath)
		}
		if err := os.Remove(socketPath); err != nil && !os.IsNotExist(err) {
			c.Ui.Output(fmt.Sprintf("Error removing socket file: %s", err))
			return err
		}
	}

	rpcListener, err := net.Listen(rpcAddr.Network(), rpcAddr.String())
	if err != nil {
		agent.Shutdown()
		c.Ui.Error(fmt.Sprintf("Error starting RPC listener: %s", err))
		return err
	}

	// Set up ownership/permission bits on the socket file
	if isSocket {
		if err := setFilePermissions(socketPath, config.UnixSockets); err != nil {
			agent.Shutdown()
			c.Ui.Error(fmt.Sprintf("Error setting up socket: %s", err))
			return err
		}
	}

	// Start the IPC layer
	c.Ui.Output("Starting Consul agent RPC...")
	c.rpcServer = NewAgentRPC(agent, rpcListener, logOutput, logWriter)

	// Enable the SCADA integration
	if err := c.setupScadaConn(config); err != nil {
		agent.Shutdown()
		c.Ui.Error(fmt.Sprintf("Error starting SCADA connection: %s", err))
		return err
	}

	if config.Ports.HTTP > 0 || config.Ports.HTTPS > 0 {
		servers, err := NewHTTPServers(agent, config, logOutput)
		if err != nil {
			agent.Shutdown()
			c.Ui.Error(fmt.Sprintf("Error starting http servers: %s", err))
			return err
		}
		c.httpServers = servers
	}

	if config.Ports.DNS > 0 {
		dnsAddr, err := config.ClientListener(config.Addresses.DNS, config.Ports.DNS)
		if err != nil {
			agent.Shutdown()
			c.Ui.Error(fmt.Sprintf("Invalid DNS bind address: %s", err))
			return err
		}

		server, err := NewDNSServer(agent, &config.DNSConfig, logOutput,
			config.Domain, dnsAddr.String(), config.DNSRecursors)
		if err != nil {
			agent.Shutdown()
			c.Ui.Error(fmt.Sprintf("Error starting dns server: %s", err))
			return err
		}
		c.dnsServer = server
	}

	// Setup update checking
	if !config.DisableUpdateCheck {
		version := config.Version
		if config.VersionPrerelease != "" {
			version += fmt.Sprintf("-%s", config.VersionPrerelease)
		}
		updateParams := &checkpoint.CheckParams{
			Product: "consul",
			Version: version,
		}
		if !config.DisableAnonymousSignature {
			updateParams.SignatureFile = filepath.Join(config.DataDir, "checkpoint-signature")
		}

		// Schedule a periodic check with expected interval of 24 hours
		checkpoint.CheckInterval(updateParams, 24*time.Hour, c.checkpointResults)

		// Do an immediate check within the next 30 seconds
		go func() {
			time.Sleep(lib.RandomStagger(30 * time.Second))
			c.checkpointResults(checkpoint.Check(updateParams))
		}()
	}
	return nil
}
Example #17
// runACLReplication is a long-running goroutine that will attempt to replicate
// ACLs while the server is the leader, until the shutdown channel closes.
func (s *Server) runACLReplication() {
	var status structs.ACLReplicationStatus
	status.Enabled = true
	status.SourceDatacenter = s.config.ACLDatacenter
	s.updateACLReplicationStatus(status)

	// Show that it's not running on the way out.
	defer func() {
		status.Running = false
		s.updateACLReplicationStatus(status)
	}()

	// Give each server's replicator a random initial phase for good
	// measure.
	select {
	case <-s.shutdownCh:
		return

	case <-time.After(lib.RandomStagger(s.config.ACLReplicationInterval)):
	}

	// We are fairly conservative with the lastRemoteIndex so that after a
	// leadership change or an error we re-sync everything (we also don't
	// want to block the first time after one of these events so we can
	// show a successful sync in the status endpoint).
	var lastRemoteIndex uint64
	replicate := func() {
		if !status.Running {
			lastRemoteIndex = 0 // Re-sync everything.
			status.Running = true
			s.updateACLReplicationStatus(status)
			s.logger.Printf("[INFO] consul: ACL replication started")
		}

		index, err := s.replicateACLs(lastRemoteIndex)
		if err != nil {
			lastRemoteIndex = 0 // Re-sync everything.
			status.LastError = time.Now()
			s.updateACLReplicationStatus(status)
			s.logger.Printf("[WARN] consul: ACL replication error (will retry if still leader): %v", err)
		} else {
			lastRemoteIndex = index
			status.ReplicatedIndex = index
			status.LastSuccess = time.Now()
			s.updateACLReplicationStatus(status)
			s.logger.Printf("[DEBUG] consul: ACL replication completed through remote index %d", index)
		}
	}
	pause := func() {
		if status.Running {
			lastRemoteIndex = 0 // Re-sync everything.
			status.Running = false
			s.updateACLReplicationStatus(status)
			s.logger.Printf("[INFO] consul: ACL replication stopped (no longer leader)")
		}
	}

	// This will slowly poll to see if replication should be active. Once it
	// is and we've caught up, the replicate() call will begin to block and
	// only wake up when the query timer expires or there are new ACLs to
	// replicate. We've chosen this design so that the ACLReplicationInterval
	// is the lower bound for how quickly we will replicate, no matter how
	// much ACL churn is happening on the remote side.
	//
	// The blocking query inside replicate() respects the shutdown channel,
	// so we won't get stuck in here as things are torn down.
	for {
		select {
		case <-s.shutdownCh:
			return

		case <-time.After(s.config.ACLReplicationInterval):
			if s.IsLeader() {
				replicate()
			} else {
				pause()
			}
		}
	}
}
Example #18
// retryIntv calculates a retry interval from the given base duration
func (c *Client) retryIntv(base time.Duration) time.Duration {
	if c.config.DevMode {
		return devModeRetryIntv
	}
	return base + lib.RandomStagger(base)
}
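
base + lib.RandomStagger(base) is uniform over [base, 2*base): the retry always waits at least base, and the spread keeps a fleet of clients from retrying in unison. A quick check (randomStagger mirrors the RandomStagger sketch after Example #1):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func randomStagger(intv time.Duration) time.Duration {
	return time.Duration(rand.Int63n(int64(intv)))
}

func main() {
	base := 15 * time.Second
	for i := 0; i < 3; i++ {
		fmt.Println(base + randomStagger(base)) // always in [15s, 30s)
	}
}
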
Example #19
// blockingRPC is used for queries that need to wait for a minimum index. This
// is used to block and wait for changes.
func (s *Server) blockingRPC(queryOpts *structs.QueryOptions, queryMeta *structs.QueryMeta,
	watch state.Watch, run func() error) error {
	var timeout *time.Timer
	var notifyCh chan struct{}

	// Fast path right to the non-blocking query.
	if queryOpts.MinQueryIndex == 0 {
		goto RUN_QUERY
	}

	// Make sure a watch was given if we were asked to block.
	if watch == nil {
		panic("no watch given for blocking query")
	}

	// Restrict the max query time, and ensure there is always one.
	if queryOpts.MaxQueryTime > maxQueryTime {
		queryOpts.MaxQueryTime = maxQueryTime
	} else if queryOpts.MaxQueryTime <= 0 {
		queryOpts.MaxQueryTime = defaultQueryTime
	}

	// Apply a small amount of jitter to the request.
	queryOpts.MaxQueryTime += lib.RandomStagger(queryOpts.MaxQueryTime / jitterFraction)

	// Setup a query timeout.
	timeout = time.NewTimer(queryOpts.MaxQueryTime)

	// Setup the notify channel.
	notifyCh = make(chan struct{}, 1)

	// Ensure we tear down any watches on return.
	defer func() {
		timeout.Stop()
		watch.Clear(notifyCh)
	}()

REGISTER_NOTIFY:
	// Register the notification channel. This may be done multiple times if
	// we haven't reached the target wait index.
	watch.Wait(notifyCh)

RUN_QUERY:
	// Update the query metadata.
	s.setQueryMeta(queryMeta)

	// If the read must be consistent we verify that we are still the leader.
	if queryOpts.RequireConsistent {
		if err := s.consistentRead(); err != nil {
			return err
		}
	}

	// Run the query.
	metrics.IncrCounter([]string{"consul", "rpc", "query"}, 1)
	err := run()

	// Check for minimum query time.
	if err == nil && queryMeta.Index > 0 && queryMeta.Index <= queryOpts.MinQueryIndex {
		select {
		case <-notifyCh:
			goto REGISTER_NOTIFY
		case <-timeout.C:
		}
	}
	return err
}
Example #20
// setupBootstrapHandler() creates the closure necessary to support a Consul
// fallback handler.
func (s *Server) setupBootstrapHandler() error {
	// peersTimeout is used to indicate to the Consul Syncer that the
	// current Nomad Server has a stale peer set.  peersTimeout will time
	// out if the Consul Syncer bootstrapFn has not observed a Raft
	// leader in maxStaleLeadership.  If peersTimeout has been triggered,
	// the Consul Syncer will begin querying Consul for other Nomad
	// Servers.
	//
	// NOTE: time.Timer is used instead of time.Time in order to handle
	// clock drift, because time.Timer is based on a monotonic clock.
	var peersTimeout *time.Timer = time.NewTimer(0)

	// consulQueryCount is the number of times the bootstrapFn has been
	// called, regardless of success.
	var consulQueryCount uint64

	// leadershipTimedOut is a helper method that returns true if the
	// peersTimeout timer has expired.
	leadershipTimedOut := func() bool {
		select {
		case <-peersTimeout.C:
			return true
		default:
			return false
		}
	}

	// The bootstrapFn callback handler is used to periodically poll
	// Consul to look up the Nomad Servers in Consul.  In the event the
	// server has been brought up without a `retry-join` configuration
	// and this Server is partitioned from the rest of the cluster,
	// periodically poll Consul to reattach this Server to other servers
	// in the same region and automatically reform a quorum (assuming the
	// correct number of servers required for quorum are present).
	bootstrapFn := func() error {
		// If there is a raft leader, do nothing
		if s.raft.Leader() != "" {
			peersTimeout.Reset(maxStaleLeadership)
			return nil
		}

		// (ab)use serf.go's behavior of setting BootstrapExpect to
		// zero once we have bootstrapped: a zero value means this
		// server has already bootstrapped.
		bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
		if bootstrapExpect == 0 {
			// This Nomad Server has been bootstrapped.  Rely on
			// the peersTimeout firing as a guard to prevent
			// aggressive querying of Consul.
			if !leadershipTimedOut() {
				return nil
			}
		} else {
			if consulQueryCount > 0 && !leadershipTimedOut() {
				return nil
			}

			// This Nomad Server has not been bootstrapped, reach
			// out to Consul if our peer list is less than
			// `bootstrap_expect`.
			raftPeers, err := s.raftPeers.Peers()
			if err != nil {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}

			// The necessary number of Nomad Servers required for
			// quorum has been reached, we do not need to poll
			// Consul.  Let the normal timeout-based strategy
			// take over.
			if len(raftPeers) >= int(bootstrapExpect) {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}
		}
		consulQueryCount++

		s.logger.Printf("[DEBUG] server.consul: lost contact with Nomad quorum, falling back to Consul for server list")

		consulCatalog := s.consulSyncer.ConsulClient().Catalog()
		dcs, err := consulCatalog.Datacenters()
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("server.consul: unable to query Consul datacenters: %v", err)
		}
		if len(dcs) > 2 {
			// Query the local DC first, then shuffle the
			// remaining DCs.  If additional calls to bootstrapFn
			// are necessary, this Nomad Server will eventually
			// walk all datacenters until it finds enough hosts to
			// form a quorum.
			shuffleStrings(dcs[1:])
			dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
		}

		nomadServerServiceName := s.config.ConsulConfig.ServerServiceName
		var mErr multierror.Error
		const defaultMaxNumNomadServers = 8
		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
		localNode := s.serf.Memberlist().LocalNode()
		for _, dc := range dcs {
			consulOpts := &consulapi.QueryOptions{
				AllowStale: true,
				Datacenter: dc,
				Near:       "_agent",
				WaitTime:   consul.DefaultQueryWaitDuration,
			}
			consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
			if err != nil {
				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
				s.logger.Printf("[WARN] server.consul: %v", err)
				mErr.Errors = append(mErr.Errors, err)
				continue
			}

			for _, cs := range consulServices {
				port := strconv.FormatInt(int64(cs.ServicePort), 10)
				addr := cs.ServiceAddress
				if addr == "" {
					addr = cs.Address
				}
				if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort {
					continue
				}
				serverAddr := net.JoinHostPort(addr, port)
				nomadServerServices = append(nomadServerServices, serverAddr)
			}
		}

		if len(nomadServerServices) == 0 {
			if len(mErr.Errors) > 0 {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return mErr.ErrorOrNil()
			}

			// Log the error and return nil so future handlers
			// can attempt to register the `nomad` service.
			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
			s.logger.Printf("[TRACE] server.consul: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
			peersTimeout.Reset(pollInterval)
			return nil
		}

		numServersContacted, err := s.Join(nomadServerServices)
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err)
		}

		peersTimeout.Reset(maxStaleLeadership)
		s.logger.Printf("[INFO] server.consul: successfully contacted %d Nomad Servers", numServersContacted)

		return nil
	}

	s.consulSyncer.AddPeriodicHandler("Nomad Server Fallback Server Handler", bootstrapFn)
	return nil
}
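
bootstrapFn computes peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor) in five separate places. An illustrative helper would keep that jitter policy in one spot (not part of the original; the constants below are assumed):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Assumed values; Nomad defines the real constants elsewhere.
const (
	peersPollInterval     = 45 * time.Second
	peersPollJitterFactor = 2
)

func randomStagger(intv time.Duration) time.Duration {
	return time.Duration(rand.Int63n(int64(intv)))
}

// jitteredPollInterval collects the repeated reset computation from
// bootstrapFn above (illustrative refactor).
func jitteredPollInterval() time.Duration {
	return peersPollInterval + randomStagger(peersPollInterval/peersPollJitterFactor)
}

func main() {
	fmt.Println(jitteredPollInterval()) // in [45s, 67.5s)
}
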
Example #21
// RPC is used to forward an RPC call to a consul server, or fail if there
// are no servers
func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
	// Check to make sure we haven't spent too much time querying a
	// single server
	now := time.Now()
	if !c.connRebalanceTime.IsZero() && now.After(c.connRebalanceTime) {
		c.logger.Printf("[DEBUG] consul: connection time to server %s exceeded, rotating server connection", c.lastServer.Addr)
		c.lastServer = nil
	}

	// Allocate these vars on the stack before the goto
	var numConsulServers int
	var clusterWideRebalanceConnsPerSec float64
	var connReuseLowWaterMark time.Duration
	var numLANMembers int

	// Check the last RPC time: keep reusing the cached connection until
	// the rebalance deadline above, unless it has sat idle for longer
	// than clientRPCConnMaxIdle.
	lastRPCTime := now.Sub(c.lastRPCTime)
	var server *serverParts
	if c.lastServer != nil && lastRPCTime < clientRPCConnMaxIdle {
		server = c.lastServer
		goto TRY_RPC
	}

	// Bail if we can't find any servers
	c.consulLock.RLock()
	numConsulServers = len(c.consuls)
	if numConsulServers == 0 {
		c.consulLock.RUnlock()
		return structs.ErrNoServers
	}

	// Select a random addr
	server = c.consuls[rand.Int31n(int32(numConsulServers))]
	c.consulLock.RUnlock()

	// Limit this connection's life based on the size (and health) of the
	// cluster.  Never rebalance a connection more frequently than
	// connReuseLowWaterMark, and make sure we never exceed
	// clusterWideRebalanceConnsPerSec operations/s across numLANMembers.
	clusterWideRebalanceConnsPerSec = float64(numConsulServers * newRebalanceConnsPerSecPerServer)
	connReuseLowWaterMark = clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction)
	numLANMembers = len(c.LANMembers())
	c.connRebalanceTime = now.Add(lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWaterMark, numLANMembers))
	c.logger.Printf("[DEBUG] consul: connection to server %s will expire at %v", server.Addr, c.connRebalanceTime)

	// Forward to remote Consul
TRY_RPC:
	if err := c.connPool.RPC(c.config.Datacenter, server.Addr, server.Version, method, args, reply); err != nil {
		c.connRebalanceTime = time.Time{}
		c.lastRPCTime = time.Time{}
		c.lastServer = nil
		return err
	}

	// Cache the last server
	c.lastServer = server
	c.lastRPCTime = now
	return nil
}