Example #1
func TestMathMinInt(t *testing.T) {
	tests := [][3]int{{1, 2, 1}, {-1, 1, -1}, {2, 0, 0}}
	for _, test := range tests {
		expected := test[2]
		actual := lib.MinInt(test[0], test[1])
		if expected != actual {
			t.Fatalf("expected %d, got %d", expected, actual)
		}
	}
}
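
The test above, and every example that follows, assumes a lib.MinInt helper that returns the smaller of two ints. A minimal sketch of such a helper (the real library implementation may differ in detail) looks like this:

// MinInt returns the smaller of a and b. Sketch of the helper the
// examples assume; not necessarily the library's exact code.
func MinInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}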
Example #2
// trimUDPAnswers makes sure a UDP response is not longer than allowed by RFC
// 1035.  Enforce an arbitrary limit that can be further ratcheted down by
// config, and then make sure the response doesn't exceed 512 bytes.
func trimUDPAnswers(config *DNSConfig, resp *dns.Msg) (trimmed bool) {
	numAnswers := len(resp.Answer)

	// This cuts UDP responses to a useful but limited number of responses.
	maxAnswers := lib.MinInt(maxUDPAnswerLimit, config.UDPAnswerLimit)
	if numAnswers > maxAnswers {
		resp.Answer = resp.Answer[:maxAnswers]
	}

	// This enforces the hard limit of 512 bytes per the RFC.
	for len(resp.Answer) > 0 && resp.Len() > 512 {
		resp.Answer = resp.Answer[:len(resp.Answer)-1]
	}

	return len(resp.Answer) < numAnswers
}
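
The second loop above is the part that enforces the RFC 1035 limit: it drops answers from the tail until the encoded message fits in 512 bytes. A standalone sketch of that loop using the github.com/miekg/dns package (the record name and counts are made up for illustration):

package main

import (
	"fmt"
	"net"

	"github.com/miekg/dns"
)

func main() {
	// Build a reply with more A records than will fit in 512 bytes.
	req := new(dns.Msg)
	req.SetQuestion("web.service.consul.", dns.TypeA)
	resp := new(dns.Msg)
	resp.SetReply(req)
	for i := 0; i < 64; i++ {
		resp.Answer = append(resp.Answer, &dns.A{
			Hdr: dns.RR_Header{Name: "web.service.consul.", Rrtype: dns.TypeA, Class: dns.ClassINET},
			A:   net.ParseIP(fmt.Sprintf("10.0.0.%d", i+1)),
		})
	}

	// Same trimming idea as trimUDPAnswers: drop answers from the end
	// until the message length is within the UDP limit.
	for len(resp.Answer) > 0 && resp.Len() > 512 {
		resp.Answer = resp.Answer[:len(resp.Answer)-1]
	}
	fmt.Printf("kept %d answers, %d bytes\n", len(resp.Answer), resp.Len())
}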
Example #3
// trimUDPResponse makes sure a UDP response is not longer than allowed by RFC
// 1035. Enforce an arbitrary limit that can be further ratcheted down by
// config, and then make sure the response doesn't exceed 512 bytes. Any extra
// records will be trimmed along with answers.
func trimUDPResponse(config *DNSConfig, resp *dns.Msg) (trimmed bool) {
	numAnswers := len(resp.Answer)
	hasExtra := len(resp.Extra) > 0

	// We avoid some function calls and allocations by only handling the
	// extra data when necessary.
	var index map[string]dns.RR
	if hasExtra {
		index = make(map[string]dns.RR, len(resp.Extra))
		indexRRs(resp.Extra, index)
	}

	// This cuts UDP responses to a useful but limited number of responses.
	maxAnswers := lib.MinInt(maxUDPAnswerLimit, config.UDPAnswerLimit)
	if numAnswers > maxAnswers {
		resp.Answer = resp.Answer[:maxAnswers]
		if hasExtra {
			syncExtra(index, resp)
		}
	}

	// This enforces the hard limit of 512 bytes per the RFC. Note that we
	// temporarily switch to uncompressed so that we limit to a response
	// that will not exceed 512 bytes uncompressed, which is more
	// conservative and will allow our responses to be compliant even if
	// some downstream server uncompresses them.
	compress := resp.Compress
	resp.Compress = false
	for len(resp.Answer) > 0 && resp.Len() > 512 {
		resp.Answer = resp.Answer[:len(resp.Answer)-1]
		if hasExtra {
			syncExtra(index, resp)
		}
	}
	resp.Compress = compress

	return len(resp.Answer) < numAnswers
}
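
Compared with trimUDPAnswers, this version also keeps resp.Extra in sync (via the indexRRs and syncExtra helpers, which are not shown here) and measures the message uncompressed. The Compress toggle matters because dns.Msg.Len reports the wire size for the current compression setting; a rough illustration of the difference, with made-up SRV records:

package main

import (
	"fmt"

	"github.com/miekg/dns"
)

func main() {
	resp := new(dns.Msg)
	resp.SetQuestion("web.service.consul.", dns.TypeSRV)
	for i := 0; i < 20; i++ {
		resp.Answer = append(resp.Answer, &dns.SRV{
			Hdr:    dns.RR_Header{Name: "web.service.consul.", Rrtype: dns.TypeSRV, Class: dns.ClassINET},
			Port:   8080,
			Target: fmt.Sprintf("node%d.node.dc1.consul.", i),
		})
	}

	resp.Compress = true
	fmt.Println("compressed length:  ", resp.Len())
	resp.Compress = false
	fmt.Println("uncompressed length:", resp.Len()) // larger; this is the size the trim loop guards
}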
Example #4
// setupConsulSyncer creates Client-mode consul.Syncer which periodically
// executes callbacks on a fixed interval.
//
// TODO(sean@): this could eventually be moved to a priority queue and give
// each task an interval, but that is not necessary at this time.
func (c *Client) setupConsulSyncer() error {
	// The bootstrapFn callback handler is used to periodically poll
	// Consul to look up the Nomad Servers in Consul.  In the event the
	// heartbeat deadline has been exceeded and this Client is orphaned
	// from its servers, periodically poll Consul to reattach this Client
	// to its cluster and automatically recover from a detached state.
	bootstrapFn := func() error {
		now := time.Now()
		c.heartbeatLock.Lock()

		// If the last heartbeat didn't contain a leader, give the
		// Nomad server this Agent is talking to one more attempt at
		// providing a heartbeat that does contain a leader.
		if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) {
			c.heartbeatLock.Unlock()
			return nil
		}
		c.heartbeatLock.Unlock()

		consulCatalog := c.consulSyncer.ConsulClient().Catalog()
		dcs, err := consulCatalog.Datacenters()
		if err != nil {
			return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
		}
		if len(dcs) > 2 {
			// Query the local DC first, then shuffle the
			// remaining DCs.  Future heartbeats will cause Nomad
			// Clients to fixate on their local datacenter so
			// it's okay to talk with remote DCs.  If no
			// Nomad servers are available within
			// datacenterQueryLimit, the next heartbeat will pick
			// a new set of servers so it's okay.
			nearestDC := dcs[0]
			otherDCs := dcs[1:lib.MinInt(len(dcs), datacenterQueryLimit)]
			shuffleStrings(otherDCs)

			dcs = append([]string{nearestDC}, otherDCs...)
		}

		// Forward RPCs to our region
		nomadRPCArgs := structs.GenericRequest{
			QueryOptions: structs.QueryOptions{
				Region: c.Region(),
			},
		}

		nomadServerServiceName := c.config.ConsulConfig.ServerServiceName
		var mErr multierror.Error
		const defaultMaxNumNomadServers = 8
		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
		c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs)
		for _, dc := range dcs {
			consulOpts := &consulapi.QueryOptions{
				AllowStale: true,
				Datacenter: dc,
				Near:       "_agent",
				WaitTime:   consul.DefaultQueryWaitDuration,
			}
			consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRPC, consulOpts)
			if err != nil {
				mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", nomadServerServiceName, dc, err))
				continue
			}

			for _, s := range consulServices {
				port := strconv.FormatInt(int64(s.ServicePort), 10)
				addr := s.ServiceAddress
				if addr == "" {
					addr = s.Address
				}
				serverAddr := net.JoinHostPort(addr, port)
				serverEndpoint, err := rpcproxy.NewServerEndpoint(serverAddr)
				if err != nil {
					mErr.Errors = append(mErr.Errors, err)
					continue
				}
				var peers []string
				if err := c.connPool.RPC(c.Region(), serverEndpoint.Addr, c.RPCMajorVersion(), "Status.Peers", nomadRPCArgs, &peers); err != nil {
					mErr.Errors = append(mErr.Errors, err)
					continue
				}
				// Successfully received the Server peers list of the correct
				// region
				if len(peers) != 0 {
					nomadServerServices = append(nomadServerServices, peers...)
					break
				}
			}
			// Break if at least one Nomad Server was successfully pinged
			if len(nomadServerServices) > 0 {
				break
			}
		}
		if len(nomadServerServices) == 0 {
			if len(mErr.Errors) > 0 {
				return mErr.ErrorOrNil()
			}

			return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %q", nomadServerServiceName, dcs)
		}

		// Log the servers we are adding
		c.logger.Printf("[DEBUG] client.consul: bootstrap adding following Servers: %q", nomadServerServices)

		c.heartbeatLock.Lock()
		if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) {
			c.heartbeatLock.Unlock()
			// Common, healthy path
			if err := c.rpcProxy.SetBackupServers(nomadServerServices); err != nil {
				return fmt.Errorf("client.consul: unable to set backup servers: %v", err)
			}
		} else {
			c.heartbeatLock.Unlock()
			// If this Client is talking with a Server that
			// doesn't have a leader, and we have exceeded the
			// consulPullHeartbeatDeadline, change the call from
			// SetBackupServers() to calling AddPrimaryServer()
			// in order to allow the Clients to randomly begin
			// considering all known Nomad servers and
			// eventually, hopefully, find their way to a Nomad
			// Server that has quorum (assuming Consul has a
			// server list that is in the majority).
			for _, s := range nomadServerServices {
				c.rpcProxy.AddPrimaryServer(s)
			}
		}

		return nil
	}
	if c.config.ConsulConfig.ClientAutoJoin {
		c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn)
	}

	consulServicesReaperFn := func() error {
		const estInitialExecutorDomains = 8

		// Create the domains to keep and add the server and client
		domains := make([]consul.ServiceDomain, 2, estInitialExecutorDomains)
		domains[0] = consul.ServerDomain
		domains[1] = consul.ClientDomain

		for allocID, ar := range c.getAllocRunners() {
			ar.taskStatusLock.RLock()
			taskStates := copyTaskStates(ar.taskStates)
			ar.taskStatusLock.RUnlock()
			for taskName, taskState := range taskStates {
				// Only keep running tasks
				if taskState.State == structs.TaskStateRunning {
					d := consul.NewExecutorDomain(allocID, taskName)
					domains = append(domains, d)
				}
			}
		}

		return c.consulSyncer.ReapUnmatched(domains)
	}
	if c.config.ConsulConfig.AutoAdvertise {
		c.consulSyncer.AddPeriodicHandler("Nomad Client Services Sync Handler", consulServicesReaperFn)
	}

	return nil
}
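
The datacenter handling in bootstrapFn (local DC first, the rest shuffled and capped at datacenterQueryLimit, as Example #5 also does) can be seen in isolation in the sketch below. capDatacenters is an illustrative name, not a Nomad helper:

package main

import (
	"fmt"
	"math/rand"
)

// capDatacenters keeps the nearest (first) datacenter, shuffles the rest,
// and limits the total number queried. Illustrative only; the Nomad code
// uses its own shuffleStrings helper together with lib.MinInt.
func capDatacenters(dcs []string, limit int) []string {
	if len(dcs) <= 2 {
		return dcs
	}
	if limit > len(dcs) {
		limit = len(dcs)
	}
	others := append([]string(nil), dcs[1:]...)
	rand.Shuffle(len(others), func(i, j int) {
		others[i], others[j] = others[j], others[i]
	})
	return append([]string{dcs[0]}, others[:limit-1]...)
}

func main() {
	fmt.Println(capDatacenters([]string{"dc1", "dc2", "dc3", "dc4", "dc5"}, 3))
}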
Example #5
// setupBootstrapHandler() creates the closure necessary to support a Consul
// fallback handler.
func (s *Server) setupBootstrapHandler() error {
	// peersTimeout is used to indicate to the Consul Syncer that the
	// current Nomad Server has a stale peer set.  peersTimeout will time
	// out if the Consul Syncer bootstrapFn has not observed a Raft
	// leader in maxStaleLeadership.  If peersTimeout has been triggered,
	// the Consul Syncer will begin querying Consul for other Nomad
	// Servers.
	//
	// NOTE: time.Timer is used vs time.Time in order to handle clock
	// drift because time.Timer is implemented as a monotonic clock.
	peersTimeout := time.NewTimer(0)

	// consulQueryCount is the number of times the bootstrapFn has been
	// called, regardless of success.
	var consulQueryCount uint64

	// leadershipTimedOut is a helper method that returns true if the
	// peersTimeout timer has expired.
	leadershipTimedOut := func() bool {
		select {
		case <-peersTimeout.C:
			return true
		default:
			return false
		}
	}

	// The bootstrapFn callback handler is used to periodically poll
	// Consul to look up the Nomad Servers in Consul.  In the event the
	// server has been brought up without a `retry-join` configuration
	// and this Server is partitioned from the rest of the cluster,
	// periodically poll Consul to reattach this Server to other servers
	// in the same region and automatically reform a quorum (assuming the
	// correct number of servers required for quorum are present).
	bootstrapFn := func() error {
		// If there is a raft leader, do nothing
		if s.raft.Leader() != "" {
			peersTimeout.Reset(maxStaleLeadership)
			return nil
		}

		// (ab)use serf.go's behavior of setting BootstrapExpect to
		// zero once we have bootstrapped.
		bootstrapExpect := atomic.LoadInt32(&s.config.BootstrapExpect)
		if bootstrapExpect == 0 {
			// This Nomad Server has been bootstrapped.  Rely on
			// the peersTimeout firing as a guard to prevent
			// aggressive querying of Consul.
			if !leadershipTimedOut() {
				return nil
			}
		} else {
			if consulQueryCount > 0 && !leadershipTimedOut() {
				return nil
			}

			// This Nomad Server has not been bootstrapped, reach
			// out to Consul if our peer list is less than
			// `bootstrap_expect`.
			raftPeers, err := s.raftPeers.Peers()
			if err != nil {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}

			// The necessary number of Nomad Servers required for
			// quorum has been reached, we do not need to poll
			// Consul.  Let the normal timeout-based strategy
			// take over.
			if len(raftPeers) >= int(bootstrapExpect) {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return nil
			}
		}
		consulQueryCount++

		s.logger.Printf("[DEBUG] server.consul: lost contact with Nomad quorum, falling back to Consul for server list")

		consulCatalog := s.consulSyncer.ConsulClient().Catalog()
		dcs, err := consulCatalog.Datacenters()
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("server.consul: unable to query Consul datacenters: %v", err)
		}
		if len(dcs) > 2 {
			// Query the local DC first, then shuffle the
			// remaining DCs.  If additional calls to bootstrapFn
			// are necessary, this Nomad Server will eventually
			// walk all datacenters until it finds enough hosts to
			// form a quorum.
			shuffleStrings(dcs[1:])
			dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
		}

		nomadServerServiceName := s.config.ConsulConfig.ServerServiceName
		var mErr multierror.Error
		const defaultMaxNumNomadServers = 8
		nomadServerServices := make([]string, 0, defaultMaxNumNomadServers)
		localNode := s.serf.Memberlist().LocalNode()
		for _, dc := range dcs {
			consulOpts := &consulapi.QueryOptions{
				AllowStale: true,
				Datacenter: dc,
				Near:       "_agent",
				WaitTime:   consul.DefaultQueryWaitDuration,
			}
			consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts)
			if err != nil {
				err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err)
				s.logger.Printf("[WARN] server.consul: %v", err)
				mErr.Errors = append(mErr.Errors, err)
				continue
			}

			for _, cs := range consulServices {
				port := strconv.FormatInt(int64(cs.ServicePort), 10)
				addr := cs.ServiceAddress
				if addr == "" {
					addr = cs.Address
				}
				if localNode.Addr.String() == addr && int(localNode.Port) == cs.ServicePort {
					continue
				}
				serverAddr := net.JoinHostPort(addr, port)
				nomadServerServices = append(nomadServerServices, serverAddr)
			}
		}

		if len(nomadServerServices) == 0 {
			if len(mErr.Errors) > 0 {
				peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
				return mErr.ErrorOrNil()
			}

			// Log the error and return nil so future handlers
			// can attempt to register the `nomad` service.
			pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)
			s.logger.Printf("[TRACE] server.consul: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval)
			peersTimeout.Reset(pollInterval)
			return nil
		}

		numServersContacted, err := s.Join(nomadServerServices)
		if err != nil {
			peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor))
			return fmt.Errorf("contacted %d Nomad Servers: %v", numServersContacted, err)
		}

		peersTimeout.Reset(maxStaleLeadership)
		s.logger.Printf("[INFO] server.consul: successfully contacted %d Nomad Servers", numServersContacted)

		return nil
	}

	s.consulSyncer.AddPeriodicHandler("Nomad Server Fallback Server Handler", bootstrapFn)
	return nil
}
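
Nearly every early return in bootstrapFn resets peersTimeout to peersPollInterval plus a random stagger so that multiple servers do not poll Consul in lockstep. Assuming lib.RandomStagger returns a random duration between zero and its argument, the same jitter idea looks like this (the constants below are illustrative, not Nomad's actual values):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// randomStagger mirrors the assumed behavior of lib.RandomStagger: a
// random duration in [0, intv), used to spread periodic work out.
func randomStagger(intv time.Duration) time.Duration {
	return time.Duration(rand.Int63n(int64(intv)))
}

func main() {
	const (
		peersPollInterval     = 45 * time.Second // illustrative value
		peersPollJitterFactor = 2                // illustrative value
	)
	for i := 0; i < 3; i++ {
		wait := peersPollInterval + randomStagger(peersPollInterval/peersPollJitterFactor)
		fmt.Println("next Consul poll in", wait)
	}
}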