Example #1
func (kab *KeepAliveBuffer) send() {
	kab.lastSent = time.Now()
	// Send but don't block the worker (to ensure fast servicing of the channel).
	go func() {
		consul := config.GetConsulClient()
		if kab.isSession {
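			// Session TTL.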
			session := consul.Session()
			_, _, err := session.Renew(kab.ID, nil)
			if err != nil {
				discoveryLogger.WithFields(
					"err", err,
					"sessionID", kab.ID,
				).Error("Error trying to send keep alive")
				kab.Close()
				return
			}
		} else {
			// Service TTL.
			agent := consul.Agent()
			err := agent.PassTTL("service:"+kab.ID, "")
			if err != nil {
				discoveryLogger.WithFields(
					"err", err,
					"serviceID", kab.ID,
				).Error("Error trying to send keep alive")
				kab.Close()
			}
		}
	}()
}
Example #2
func (ska *SelfKeepAlive) send() {
	consul := config.GetConsulClient()
	if ska.isSession {
		// Session TTL.
		session := consul.Session()
		_, _, err := session.Renew(ska.ID, nil)
		if err != nil {
			discoveryLogger.WithFields(
				"err", err,
				"sessionID", ska.ID,
			).Error("Error trying to send keep alive")
			ska.Stop()
		}
	} else {
		// Service TTL.
		agent := consul.Agent()
		err := agent.PassTTL("service:"+ska.ID, "")
		if err != nil {
			discoveryLogger.WithFields(
				"err", err,
				"serviceID", ska.ID,
			).Error("Error trying to send keep alive")
			ska.Stop()
		}
	}
}
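Neither example above shows where the session ID being renewed comes from. Below is a minimal, hypothetical sketch of creating such a TTL session with the same Consul client; the session name, the 15s TTL, and the delete behavior are illustrative assumptions, not values taken from the examples.

// createKeepAliveSession is a hypothetical helper sketching how a TTL session
// could be created before its ID is handed to a keep-alive sender like send().
func createKeepAliveSession() (string, error) {
	consul := config.GetConsulClient()
	sessionID, _, err := consul.Session().Create(&consulapi.SessionEntry{
		Name:     "keep-alive-example",
		TTL:      "15s",
		Behavior: consulapi.SessionBehaviorDelete,
	}, nil)
	return sessionID, err
}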
Example #3
// WaitResource monitors a resource and blocks until that resource is
// released or an error occurs.
func WaitResource(service string, resource string) error {
	service = url.QueryEscape(service)
	resource = url.QueryEscape(resource)

	consul := config.GetConsulClient()
	kv := consul.KV()

	lastIndex := uint64(0)
	for {
		pair, qm, err := kv.Get(
			consulResourcePrefix+service+"/"+resource,
			&consulapi.QueryOptions{
				WaitIndex:         lastIndex,
				RequireConsistent: true,
			})
		if err != nil {
			if !consulapi.IsServerError(err) {
				return err
			}
			// Consul unresponsive. Wait a bit and try again.
			time.Sleep(3 * time.Second)
			continue
		}
		if pair == nil {
			return nil
		}
		lastIndex = qm.LastIndex
	}
}
Example #4
// RegisterServiceLocal registers the service as running on the current node.
func RegisterServiceLocal(
	service string, instanceID string, target string,
	ttl time.Duration) (err error) {
	consul := config.GetConsulClient()
	agent := consul.Agent()

	// Parse target into address + port.
	addressSplit := strings.Split(target, ":")
	if len(addressSplit) != 2 {
		return fmt.Errorf("Invalid address")
	}
	port, err := strconv.Atoi(addressSplit[1])
	if err != nil {
		return fmt.Errorf("Invalid address")
	}
	address := addressSplit[0]
	return agent.ServiceRegister(&consulapi.AgentServiceRegistration{
		Name:    service,
		ID:      instanceID,
		Address: address,
		Port:    port,
		Check: &consulapi.AgentServiceCheck{
			TTL:    ttl.String(),
			Status: "passing",
		},
	})
}
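The manual split on ":" above rejects IPv6 literals such as "[::1]:8080". Below is a hypothetical variant of just the parsing step, using the standard library's net.SplitHostPort; the helper name and error messages are assumptions for illustration.

// parseTarget is a hypothetical alternative to the manual split above; it
// relies on net.SplitHostPort, which also accepts bracketed IPv6 addresses.
func parseTarget(target string) (address string, port int, err error) {
	address, portStr, err := net.SplitHostPort(target)
	if err != nil {
		return "", 0, fmt.Errorf("invalid address: %v", err)
	}
	port, err = strconv.Atoi(portStr)
	if err != nil {
		return "", 0, fmt.Errorf("invalid port: %v", err)
	}
	return address, port, nil
}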
Example #5
func (tracker *FleetTracker) scaleDown(delta int, rpcEvent *RPCEvent) error {
	logger.WithFields(
		"leverEnv", rpcEvent.Environment,
		"leverService", rpcEvent.Service,
		"codeVersion", rpcEvent.CodeVersion,
		"servingID", rpcEvent.ServingID,
		"deltaInstances", delta,
	).Info("Scaling down")

	consulHealth := config.GetConsulClient().Health()
	entries, _, err := consulHealth.Service(
		rpcEvent.ServingID, "", true, &consulapi.QueryOptions{
			RequireConsistent: true,
		})
	if err != nil {
		logger.WithFields(
			"err", err,
			"servingID", rpcEvent.ServingID,
		).Error("Error trying to ask Consul for instances")
		return err
	}
	if len(entries) < delta {
		delta = len(entries)
	}

	tmpRand := leverutil.GetRand()
	permutation := tmpRand.Perm(len(entries))
	leverutil.PutRand(tmpRand)
	shuffled := make([]*consulapi.ServiceEntry, len(entries))
	for from, to := range permutation {
		shuffled[to] = entries[from]
	}
	toRemove := shuffled[:delta]

	hadErrors := false
	for _, entry := range toRemove {
		err = hostman.StopInstance(
			tracker.grpcPool,
			&hostman.InstanceKey{
				Environment: rpcEvent.Environment,
				Service:     rpcEvent.Service,
				InstanceID:  entry.Service.ID,
				ServingID:   rpcEvent.ServingID,
			}, entry.Node.Node)
		if err != nil {
			logger.WithFields("err", err).Error(
				"Error trying to stop instance remotely")
			hadErrors = true
		}
	}
	if hadErrors {
		return fmt.Errorf("There were errors during scale down")
	}
	return nil
}
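For reference, the permutation-based shuffle above is equivalent to the standard library's (*rand.Rand).Shuffle (Go 1.10+). A minimal sketch, assuming leverutil.GetRand returns a *math/rand.Rand; the helper is hypothetical and not part of the package above.

// shuffleEntries shows the same random selection with (*rand.Rand).Shuffle
// instead of building a permutation by hand.
func shuffleEntries(
	r *rand.Rand, entries []*consulapi.ServiceEntry,
	delta int) []*consulapi.ServiceEntry {
	// Copy first so the caller's slice is not reordered in place.
	shuffled := append([]*consulapi.ServiceEntry(nil), entries...)
	r.Shuffle(len(shuffled), func(i, j int) {
		shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
	})
	return shuffled[:delta]
}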
Example #6
func newResouce(
	service string, resource string, sessionID string) (res *Resource) {
	service = url.QueryEscape(service)
	resource = url.QueryEscape(resource)
	return &Resource{
		key:       consulResourcePrefix + service + "/" + resource,
		consul:    config.GetConsulClient(),
		service:   service,
		resource:  resource,
		sessionID: sessionID,
	}
}
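The examples do not show how the constructed Resource is actually acquired. Below is a hypothetical method sketch (not the project's actual implementation) using Consul's session-based KV acquire on the same key.

// tryAcquire attempts to take ownership of the resource key under the
// session, returning whether the lock was won.
func (res *Resource) tryAcquire() (bool, error) {
	acquired, _, err := res.consul.KV().Acquire(&consulapi.KVPair{
		Key:     res.key,
		Session: res.sessionID,
	}, nil)
	return acquired, err
}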
Example #7
func (tracker *LoadTracker) tick(
	avgRPCNanos, rpcNanosVariance, avgRate,
	rateVariance float64) (deltaInstances int) {
	// TODO: This model works decently well for real-time, non-streaming RPCs.
	//       It doesn't do so well with streaming or RPCs taking a long time
	//       because it is imprecise and reacts very late (after the
	//       RPC / stream has finished). Need another strategy for those cases.

	// Assume RPC rate and time are the worst possible (max in confidence
	// interval). This gives us a theoretical 97.5% certainty (in reality
	// we are much less sure, due to many other factors not modeled here).
	RPCNanosCI := 1.96 * math.Sqrt(rpcNanosVariance)
	maxRPCNanos := avgRPCNanos + RPCNanosCI
	rateCI := 1.96 * math.Sqrt(rateVariance)
	maxRate := avgRate + rateCI
	totalLoad := maxRate * (maxRPCNanos / float64(1000000000))

	// Work out how many instances are theoretically necessary right now, given
	// the current load.
	requiredNumInstances := int(math.Ceil(totalLoad / tracker.maxInstanceLoad))
	requiredNumInstances = max(requiredNumInstances, tracker.minInstances)

	// Work out how many instances are expected to be healthy in the near
	// future.
	if time.Now().After(tracker.nextInstancesQuery) {
		// Refresh number of healthy instances from Consul.
		consulHealth := config.GetConsulClient().Health()
		entries, _, err := consulHealth.Service(
			tracker.servingID, "", true, &consulapi.QueryOptions{
				AllowStale:        true,
				RequireConsistent: false,
			})
		if err != nil {
			trackerLogger.WithFields(
				"err", err,
				"servingID", tracker.servingID,
			).Error("Error trying to ask Consul for instances")
		} else {
			tracker.queriedNumInstances = len(entries)
		}
		// We don't need this number to be fresh. Don't do this very often.
		tracker.nextInstancesQuery =
			time.Now().Add(ChangesExpectedAfterFlag.Get())
	}
	tracker.maybeExpireDeltas()
	var assumedNumInstances int
	if tracker.queriedNumInstances ==
		tracker.numInstances+tracker.totalDeltaInstances ||
		tracker.numInstances == -1 ||
		tracker.totalDeltaInstances == 0 {
		// Either all expected changes have been applied, or the load is
		// oscillating, or we made no changes but the number of instances
		// changed for reasons external to this algorithm (e.g. instances
		// expired or died on their own), or the tracker is completely new.
		// TODO: What if there is a widespread crash across instances? We might
		//       want to react quickly to bring up replacements. Maybe.
		//       Maybe not. We would need to avoid bringing up replacement(s)
		//       forever in situations where they e.g. just crash on startup.
		// Just reset deltas.
		tracker.numInstances = tracker.queriedNumInstances
		tracker.totalDeltaInstances = 0
		tracker.deltas = nil
		assumedNumInstances = tracker.queriedNumInstances
	} else {
		// Not all changes propagated. Use the number of instances we expect.
		assumedNumInstances = tracker.numInstances + tracker.totalDeltaInstances
	}

	delta := requiredNumInstances - assumedNumInstances
	trackerLogger.WithFields(
		"avgRate", avgRate,
		"rateCI", rateCI,
	).Debug("RATE")
	trackerLogger.WithFields(
		"avgRPCNanos", avgRPCNanos,
		"RPCNanosCI", RPCNanosCI,
	).Debug("NANOS")
	trackerLogger.WithFields(
		"servingID", tracker.servingID,
		"avgRPCNanos", avgRPCNanos,
		"RPCNanosCI", RPCNanosCI,
		"avgRate", avgRate,
		"rateCI", rateCI,
		"totalLoad", totalLoad,
		"requiredNumInstances", requiredNumInstances,
		"assumedNumInstances", assumedNumInstances,
		"delta", delta,
	).Debug("Tick")
	if delta == 0 {
		// In balance - nothing to do.
		tracker.decreaseTriggered = false
		return 0
	}

	if delta > 0 {
		// Add instances.
		tracker.decreaseTriggered = false
		maxDelta := MaxDeltaFlag.Get()
		if delta > maxDelta {
			delta = maxDelta
		}
		tracker.applyDelta(delta)
		return delta
	}

	// Maybe remove instances (if the load was low for a while).
	if tracker.decreaseTriggered && time.Now().After(tracker.decreaseTime) {
		// Time for a decrease.
		tracker.applyDelta(delta)
		tracker.decreaseTriggered = false
		return delta
	} else if !tracker.decreaseTriggered {
		// Just noticed decrease is necessary. If after ScaleDownAfterFlag
		// we still need to decrease, do it then.
		tracker.decreaseTriggered = true
		tracker.decreaseTime = time.Now().Add(ScaleDownAfterFlag.Get())
		return 0
	} else {
		// Just wait patiently until tracker.decreaseTime.
		return 0
	}

	// Note: It is possible that a future tick may not take place at all
	//       and a decrease does not take effect. In that case, the
	//       instance expiry timer within each instance would take care of
	//       clearing unnecessary instances.
}
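A brief numeric illustration of the confidence-interval load estimate used in tick(), with made-up inputs (not measured values): 50 req/s average rate with variance 25, 200ms average RPC time with variance 1e15 ns², and an assumed maxInstanceLoad of 5.

// loadMathExample walks through the same arithmetic as tick() on the
// illustrative numbers above and returns the required instance count.
func loadMathExample() int {
	rateCI := 1.96 * math.Sqrt(25.0)           // ≈ 9.8 req/s
	maxRate := 50.0 + rateCI                   // ≈ 59.8 req/s
	rpcNanosCI := 1.96 * math.Sqrt(1e15)       // ≈ 6.2e7 ns
	maxRPCNanos := 2e8 + rpcNanosCI            // ≈ 2.62e8 ns (~262ms)
	totalLoad := maxRate * (maxRPCNanos / 1e9) // ≈ 15.7 concurrent RPCs
	return int(math.Ceil(totalLoad / 5.0))     // 4 instances required
}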
Example #8
// ServiceKeepAlive maintains the TTL for a service.
func ServiceKeepAlive(instanceID string) error {
	consul := config.GetConsulClient()
	agent := consul.Agent()
	return agent.PassTTL("service:"+instanceID, "")
}
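A hypothetical caller sketch pairing ServiceKeepAlive with the TTL check registered by RegisterServiceLocal; the ttl/3 refresh interval and the stop channel are assumptions, chosen only to keep refreshes comfortably below the TTL.

// keepAliveLoop periodically refreshes the service TTL until stop is closed
// or a refresh fails (hypothetical helper, not part of the package above).
func keepAliveLoop(instanceID string, ttl time.Duration, stop <-chan struct{}) {
	ticker := time.NewTicker(ttl / 3)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := ServiceKeepAlive(instanceID); err != nil {
				// Consul unreachable or instance already deregistered.
				return
			}
		case <-stop:
			return
		}
	}
}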
Example #9
// DeregisterService deregisters a service from Consul.
func DeregisterService(instanceID string) error {
	consul := config.GetConsulClient()
	agent := consul.Agent()
	return agent.ServiceDeregister(instanceID)
}
Example #10
// GetOwnNodeName returns the node name of the current node.
func GetOwnNodeName() (string, error) {
	consul := config.GetConsulClient()
	return consul.Agent().NodeName()
}