// processHealthResult processes the given health results, adding their stats to the CacheAvailableStatus. Note this is NOT threadsafe, because it non-atomically gets CacheAvailableStatuses, Events, LastHealthDurations and later updates them. This MUST NOT be called from multiple threads.
func processHealthResult(cacheHealthChan <-chan cache.Result, toData todata.TODataThreadsafe, localStates peer.CRStatesThreadsafe, lastHealthDurationsThreadsafe DurationMapThreadsafe, statHistory StatHistoryThreadsafe, monitorConfig TrafficMonitorConfigMapThreadsafe, peerStates peer.CRStatesPeersThreadsafe, combinedStates peer.CRStatesThreadsafe, fetchCount UintThreadsafe, errorCount UintThreadsafe, events EventsThreadsafe, localCacheStatusThreadsafe CacheAvailableStatusThreadsafe, lastHealthEndTimes map[enum.CacheName]time.Time, healthHistory map[enum.CacheName][]cache.Result, results []cache.Result, cfg config.Config) {
	if len(results) == 0 {
		return
	}
	toDataCopy := toData.Get() // create a copy, so the same data used for all processing of this cache health result
	localCacheStatus := localCacheStatusThreadsafe.Get().Copy()
	monitorConfigCopy := monitorConfig.Get()
	for _, healthResult := range results {
		log.Debugf("poll %v %v healthresultman start\n", healthResult.PollID, time.Now())
		fetchCount.Inc()
		var prevResult cache.Result
		healthResultHistory := healthHistory[enum.CacheName(healthResult.Id)]
		// healthResultHistory := healthHistory.Get(enum.CacheName(healthResult.Id))
		if len(healthResultHistory) != 0 {
			prevResult = healthResultHistory[len(healthResultHistory)-1]
		}

		health.GetVitals(&healthResult, &prevResult, &monitorConfigCopy)
		// healthHistory.Set(enum.CacheName(healthResult.Id), pruneHistory(append(healthHistory.Get(enum.CacheName(healthResult.Id)), healthResult), defaultMaxHistory))
		healthHistory[enum.CacheName(healthResult.Id)] = pruneHistory(append(healthHistory[enum.CacheName(healthResult.Id)], healthResult), cfg.MaxHealthHistory)
		isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfigCopy)
		if localStates.Get().Caches[healthResult.Id].IsAvailable != isAvailable {
			log.Infof("Changing state for %s was: %t now: %t because %s errors: %v", healthResult.Id, prevResult.Available, isAvailable, whyAvailable, healthResult.Errors)
			events.Add(Event{Time: time.Now().Unix(), Description: whyAvailable, Name: healthResult.Id, Hostname: healthResult.Id, Type: toDataCopy.ServerTypes[healthResult.Id].String(), Available: isAvailable})
		}

		localCacheStatus[healthResult.Id] = CacheAvailableStatus{Available: isAvailable, Status: monitorConfigCopy.TrafficServer[string(healthResult.Id)].Status} // TODO move within localStates?
		localStates.SetCache(healthResult.Id, peer.IsAvailable{IsAvailable: isAvailable})
		log.Debugf("poll %v %v calculateDeliveryServiceState start\n", healthResult.PollID, time.Now())
		calculateDeliveryServiceState(toDataCopy.DeliveryServiceServers, localStates)
		log.Debugf("poll %v %v calculateDeliveryServiceState end\n", healthResult.PollID, time.Now())
	}
	localCacheStatusThreadsafe.Set(localCacheStatus)
	// TODO determine if we should combineCrStates() here

	lastHealthDurations := lastHealthDurationsThreadsafe.Get().Copy()
	for _, healthResult := range results {
		if lastHealthStart, ok := lastHealthEndTimes[enum.CacheName(healthResult.Id)]; ok {
			d := time.Since(lastHealthStart)
			lastHealthDurations[enum.CacheName(healthResult.Id)] = d
		}
		lastHealthEndTimes[enum.CacheName(healthResult.Id)] = time.Now()

		log.Debugf("poll %v %v finish\n", healthResult.PollID, time.Now())
		healthResult.PollFinished <- healthResult.PollID
	}
	lastHealthDurationsThreadsafe.Set(lastHealthDurations)
}
Пример #2
0
//
// Kicks off the pollers and handlers
//
func Start(opsConfigFile string, staticAppData StaticAppData) {
	var toSession *traffic_ops.Session

	fetchSuccessCounter := gmx.NewCounter("fetchSuccess")
	fetchFailCounter := gmx.NewCounter("fetchFail")
	fetchPendingGauge := gmx.NewGauge("fetchPending")

	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}

	sharedClient := http.Client{
		Timeout:   defaultHttpTimeout,
		Transport: tr,
	}

	cacheHealthConfigChannel := make(chan poller.HttpPollerConfig)
	cacheHealthChannel := make(chan cache.Result)
	cacheHealthTick := make(chan uint64)
	cacheHealthPoller := poller.HttpPoller{
		TickChan:      cacheHealthTick,
		ConfigChannel: cacheHealthConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheHealthPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheHealthChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	cacheStatConfigChannel := make(chan poller.HttpPollerConfig)
	cacheStatChannel := make(chan cache.Result)
	cacheStatPoller := poller.HttpPoller{
		ConfigChannel: cacheStatConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheStatPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheStatChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	sessionChannel := make(chan *traffic_ops.Session)
	monitorConfigChannel := make(chan traffic_ops.TrafficMonitorConfigMap)
	monitorOpsConfigChannel := make(chan handler.OpsConfig)
	monitorConfigPoller := poller.MonitorConfigPoller{
		Interval:         defaultMonitorConfigPollingInterval,
		SessionChannel:   sessionChannel,
		ConfigChannel:    monitorConfigChannel,
		OpsConfigChannel: monitorOpsConfigChannel,
	}

	opsConfigFileChannel := make(chan interface{})
	opsConfigFilePoller := poller.FilePoller{
		File:          opsConfigFile,
		ResultChannel: opsConfigFileChannel,
	}

	opsConfigChannel := make(chan handler.OpsConfig)
	opsConfigFileHandler := handler.OpsConfigFileHandler{
		ResultChannel:    opsConfigFilePoller.ResultChannel,
		OpsConfigChannel: opsConfigChannel,
	}

	peerConfigChannel := make(chan poller.HttpPollerConfig)
	peerChannel := make(chan peer.Result)
	peerPoller := poller.HttpPoller{
		ConfigChannel: peerConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultPeerPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: peer.Handler{ResultChannel: peerChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	go opsConfigFileHandler.Listen()
	go opsConfigFilePoller.Poll()
	go monitorConfigPoller.Poll()
	go cacheHealthPoller.Poll()
	go cacheStatPoller.Poll()
	go peerPoller.Poll()

	dr := make(chan http_server.DataRequest)

	healthHistory := make(map[string][]interface{})
	statHistory := make(map[string][]interface{})

	var opsConfig handler.OpsConfig
	var monitorConfig traffic_ops.TrafficMonitorConfigMap
	localStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)}    // this is the local state as discoverer by this traffic_monitor
	peerStates := make(map[string]peer.Crstates)                                                                                       // each peer's last state is saved in this map
	combinedStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)} // this is the result of combining the localStates and all the peerStates using the var ??

	deliveryServiceServers := map[string][]string{}
	serverTypes := map[string]string{}

	// TODO put stat data in a struct, for brevity
	lastHealthEndTimes := map[string]time.Time{}
	lastHealthDurations := map[string]time.Duration{}
	fetchCount := uint64(0) // note this is the number of individual caches fetched from, not the number of times all the caches were polled.
	healthIteration := uint64(0)
	errorCount := uint64(0)
	events := []Event{}
	eventIndex := uint64(0)
	for {
		select {
		case req := <-dr:
			defer close(req.C)

			var body []byte
			var err error

			switch req.T {
			case http_server.TR_CONFIG:
				if toSession != nil && opsConfig.CdnName != "" {
					body, err = toSession.CRConfigRaw(opsConfig.CdnName)
				}
			case http_server.TR_STATE_DERIVED:
				body, err = peer.CrStatesMarshall(combinedStates)
			case http_server.TR_STATE_SELF:
				body, err = peer.CrStatesMarshall(localStates)
			case http_server.CACHE_STATS:
				// TODO: add support for ?hc=N query param, stats=, wildcard, individual caches
				// add pp and date to the json:
				/*
					pp: "0=[my-ats-edge-cache-1], hc=[1]",
					date: "Thu Oct 09 20:28:36 UTC 2014"
				*/
				params := req.Parameters
				hc := 1
				if _, exists := params["hc"]; exists {
					v, err := strconv.Atoi(params["hc"][0])
					if err == nil {
						hc = v
					}
				}
				body, err = cache.StatsMarshall(statHistory, hc)
			case http_server.DS_STATS:
				body = []byte("TODO implement")
			case http_server.EVENT_LOG:
				body, err = json.Marshal(JSONEvents{Events: events})
			case http_server.PEER_STATES:
				body = []byte("TODO implement")
			case http_server.STAT_SUMMARY:
				body = []byte("TODO implement")
			case http_server.STATS:
				body, err = getStats(staticAppData, cacheHealthPoller.Config.Interval, lastHealthDurations, fetchCount, healthIteration, errorCount)
				if err != nil {
					// TODO send error to client
					errorCount++
					log.Printf("ERROR getting stats %v\n", err)
					continue
				}
			case http_server.CONFIG_DOC:
				opsConfigCopy := opsConfig
				// if the password is blank, leave it blank, so callers can see it's missing.
				if opsConfigCopy.Password != "" {
					opsConfigCopy.Password = "******"
				}
				body, err = json.Marshal(opsConfigCopy)
			default:
				body = []byte("TODO error message")
			}
			req.C <- body
		case oc := <-opsConfigFileHandler.OpsConfigChannel:
			var err error
			opsConfig = oc

			listenAddress := ":80" // default

			if opsConfig.HttpListener != "" {
				listenAddress = opsConfig.HttpListener
			}

			err = http_server.Run(dr, listenAddress)
			if err != nil {
				errorCount++
				log.Printf("MonitorConfigPoller: error creating HTTP server: %s\n", err)
				continue
			}

			toSession, err = traffic_ops.Login(opsConfig.Url, opsConfig.Username, opsConfig.Password, opsConfig.Insecure)
			if err != nil {
				errorCount++
				log.Printf("MonitorConfigPoller: error instantiating Session with traffic_ops: %s\n", err)
				continue
			}

			deliveryServiceServers, err = getDeliveryServiceServers(toSession, opsConfig.CdnName)
			if err != nil {
				errorCount++
				log.Printf("Error getting delivery service servers from Traffic Ops: %v\n", err)
				continue
			}

			serverTypes, err = getServerTypes(toSession, opsConfig.CdnName)
			if err != nil {
				errorCount++
				log.Printf("Error getting server types from Traffic Ops: %v\n", err)
				continue
			}

			// This must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.
			go func() {
				monitorConfigPoller.OpsConfigChannel <- opsConfig // this is needed for cdnName
				monitorConfigPoller.SessionChannel <- toSession
			}()
		case monitorConfig = <-monitorConfigPoller.ConfigChannel:
			healthUrls := map[string]string{}
			statUrls := map[string]string{}
			peerUrls := map[string]string{}
			caches := map[string]string{}

			for _, srv := range monitorConfig.TrafficServer {
				caches[srv.HostName] = srv.Status

				if srv.Status == "ONLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: true}
					continue
				}
				if srv.Status == "OFFLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
					continue
				}
				// seed states with available = false until our polling cycle picks up a result
				if _, exists := localStates.Caches[srv.HostName]; !exists {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
				}

				url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL
				r := strings.NewReplacer(
					"${hostname}", srv.FQDN,
					"${interface_name}", srv.InterfaceName,
					"application=system", "application=plugin.remap",
					"application=", "application=plugin.remap",
				)
				url = r.Replace(url)
				healthUrls[srv.HostName] = url
				r = strings.NewReplacer("application=plugin.remap", "application=")
				url = r.Replace(url)
				statUrls[srv.HostName] = url
			}

			for _, srv := range monitorConfig.TrafficMonitor {
				if srv.Status != "ONLINE" {
					continue
				}
				// TODO: the URL should be config driven. -jse
				url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port)
				peerUrls[srv.HostName] = url
			}

			cacheStatPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: statUrls, Interval: defaultCacheStatPollingInterval}
			cacheHealthPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: healthUrls, Interval: defaultCacheHealthPollingInterval}
			peerPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: peerUrls, Interval: defaultPeerPollingInterval}

			for k := range localStates.Caches {
				_, exists := monitorConfig.TrafficServer[k]

				if !exists {
					fmt.Printf("Warning: removing %s from localStates", k)
					delete(localStates.Caches, k)
				}
			}

			addStateDeliveryServices(monitorConfig, localStates.Deliveryservice)
		case i := <-cacheHealthTick:
			healthIteration = i
		case healthResult := <-cacheHealthChannel:
			fetchCount++
			var prevResult cache.Result
			if len(healthHistory[healthResult.Id]) != 0 {
				prevResult = healthHistory[healthResult.Id][len(healthHistory[healthResult.Id])-1].(cache.Result)
			}
			health.GetVitals(&healthResult, &prevResult, &monitorConfig)
			healthHistory[healthResult.Id] = pruneHistory(append(healthHistory[healthResult.Id], healthResult), defaultMaxHistory)
			isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfig)
			if localStates.Caches[healthResult.Id].IsAvailable != isAvailable {
				fmt.Println("Changing state for", healthResult.Id, " was:", prevResult.Available, " is now:", isAvailable, " because:", whyAvailable, " errors:", healthResult.Errors)
				e := Event{
					Index:       eventIndex,
					Time:        time.Now().Unix(),
					Description: whyAvailable,
					Name:        healthResult.Id,
					Hostname:    healthResult.Id,
					Type:        serverTypes[healthResult.Id],
					Available:   isAvailable,
				}
				events = append([]Event{e}, events...)
				if len(events) > maxEvents {
					events = events[:maxEvents-1]
				}
				eventIndex++
			}
			localStates.Caches[healthResult.Id] = peer.IsAvailable{IsAvailable: isAvailable}
			calculateDeliveryServiceState(deliveryServiceServers, localStates.Caches, localStates.Deliveryservice)

			if lastHealthStart, ok := lastHealthEndTimes[healthResult.Id]; ok {
				lastHealthDurations[healthResult.Id] = time.Since(lastHealthStart)
			}
			lastHealthEndTimes[healthResult.Id] = time.Now()

			// if _, ok := queryIntervalStart[pollI]; !ok {
			// 	log.Printf("ERROR poll start index not found")
			// 	continue
			// }
			// lastQueryIntervalTime = time.Since(queryIntervalStart[pollI])
		case stats := <-cacheStatChannel:
			statHistory[stats.Id] = pruneHistory(append(statHistory[stats.Id], stats), defaultMaxHistory)
		case crStatesResult := <-peerChannel:
			peerStates[crStatesResult.Id] = crStatesResult.PeerStats
			combinedStates = combineCrStates(peerStates, localStates)
		}
	}
}