func (handler Handler) Handle(id string, r io.Reader, err error, pollId uint64, pollFinished chan<- uint64) { log.Debugf("poll %v %v handle start\n", pollId, time.Now()) result := Result{ Id: enum.CacheName(id), Available: false, Errors: []error{}, Time: time.Now(), // TODO change this to be computed the instant we get the result back, to minimise inaccuracy PollID: pollId, PollFinished: pollFinished, } if err != nil { log.Errorf("%v handler given error '%v'\n", id, err) // error here, in case the thing that called Handle didn't error result.Errors = append(result.Errors, err) handler.ResultChannel <- result return } if r == nil { log.Errorf("%v handle reader nil\n", id) result.Errors = append(result.Errors, fmt.Errorf("handler got nil reader")) handler.ResultChannel <- result return } result.PrecomputedData.Reporting = true if err := json.NewDecoder(r).Decode(&result.Astats); err != nil { log.Errorf("%s procnetdev decode error '%v'\n", id, err) result.Errors = append(result.Errors, err) handler.ResultChannel <- result return } if result.Astats.System.ProcNetDev == "" { log.Warnf("addkbps %s procnetdev empty\n", id) } if result.Astats.System.InfSpeed == 0 { log.Warnf("addkbps %s inf.speed empty\n", id) } log.Debugf("poll %v %v handle decode end\n", pollId, time.Now()) if err != nil { result.Errors = append(result.Errors, err) log.Errorf("addkbps handle %s error '%v'\n", id, err) } else { result.Available = true } if handler.Precompute() { log.Debugf("poll %v %v handle precompute start\n", pollId, time.Now()) result = handler.precompute(result) log.Debugf("poll %v %v handle precompute end\n", pollId, time.Now()) } log.Debugf("poll %v %v handle write start\n", pollId, time.Now()) handler.ResultChannel <- result log.Debugf("poll %v %v handle end\n", pollId, time.Now()) }
func CreateStats(statHistory map[enum.CacheName][]cache.Result, toData todata.TOData, crStates peer.Crstates, lastStats LastStats, now time.Time) (Stats, LastStats, error) { start := time.Now() dsStats := NewStats() for deliveryService, _ := range toData.DeliveryServiceServers { if deliveryService == "" { log.Errorf("EMPTY CreateStats deliveryService") continue } dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] = *dsdata.NewStat() } dsStats = setStaticData(dsStats, toData.DeliveryServiceServers) var err error dsStats, err = addAvailableData(dsStats, crStates, toData.ServerCachegroups, toData.ServerDeliveryServices, toData.ServerTypes, statHistory) // TODO move after stat summarisation if err != nil { return dsStats, lastStats, fmt.Errorf("Error getting Cache availability data: %v", err) } for server, history := range statHistory { if len(history) < 1 { continue // TODO warn? } cachegroup, ok := toData.ServerCachegroups[server] if !ok { log.Warnf("server %s has no cachegroup, skipping\n", server) continue } serverType, ok := toData.ServerTypes[enum.CacheName(server)] if !ok { log.Warnf("server %s not in CRConfig, skipping\n", server) continue } result := history[len(history)-1] // TODO check result.PrecomputedData.Errors for ds, resultStat := range result.PrecomputedData.DeliveryServiceStats { if ds == "" { log.Errorf("EMPTY precomputed delivery service") continue } if _, ok := dsStats.DeliveryService[ds]; !ok { dsStats.DeliveryService[ds] = resultStat continue } httpDsStat := dsStats.DeliveryService[ds] httpDsStat.TotalStats = httpDsStat.TotalStats.Sum(resultStat.TotalStats) httpDsStat.CacheGroups[cachegroup] = httpDsStat.CacheGroups[cachegroup].Sum(resultStat.CacheGroups[cachegroup]) httpDsStat.Types[serverType] = httpDsStat.Types[serverType].Sum(resultStat.Types[serverType]) httpDsStat.Caches[server] = httpDsStat.Caches[server].Sum(resultStat.Caches[server]) httpDsStat.CachesTimeReceived[server] = resultStat.CachesTimeReceived[server] httpDsStat.CommonStats = dsStats.DeliveryService[ds].CommonStats dsStats.DeliveryService[ds] = httpDsStat // TODO determine if necessary } } perSecStats, lastStats := addPerSecStats(statHistory, dsStats, lastStats, now, toData.ServerCachegroups, toData.ServerTypes) log.Infof("CreateStats took %v\n", time.Since(start)) return perSecStats, lastStats, nil }
// StartStatHistoryManager fetches the full statistics data from ATS Astats. This includes everything needed for all calculations, such as Delivery Services. This is expensive, though, and may be hard on ATS, so it should poll less often. // For a fast 'is it alive' poll, use the Health Result Manager poll. // Returns the stat history, the duration between the stat poll for each cache, the last Kbps data, and the calculated Delivery Service stats. func StartStatHistoryManager(cacheStatChan <-chan cache.Result, combinedStates peer.CRStatesThreadsafe, toData todata.TODataThreadsafe, errorCount UintThreadsafe, cfg config.Config) (StatHistoryThreadsafe, DurationMapThreadsafe, LastStatsThreadsafe, DSStatsThreadsafe) { statHistory := NewStatHistoryThreadsafe(cfg.MaxStatHistory) lastStatDurations := NewDurationMapThreadsafe() lastStatEndTimes := map[enum.CacheName]time.Time{} lastStats := NewLastStatsThreadsafe() dsStats := NewDSStatsThreadsafe() tickInterval := cfg.StatFlushInterval go func() { for { var results []cache.Result results = append(results, <-cacheStatChan) tick := time.Tick(tickInterval) innerLoop: for { select { case <-tick: log.Warnf("StatHistoryManager flushing queued results\n") processStatResults(results, statHistory, combinedStates.Get(), lastStats, toData.Get(), errorCount, dsStats, lastStatEndTimes, lastStatDurations) break innerLoop default: select { case r := <-cacheStatChan: results = append(results, r) default: processStatResults(results, statHistory, combinedStates.Get(), lastStats, toData.Get(), errorCount, dsStats, lastStatEndTimes, lastStatDurations) break innerLoop } } } } }() return statHistory, lastStatDurations, lastStats, dsStats }
func healthResultManagerListen(cacheHealthChan <-chan cache.Result, toData todata.TODataThreadsafe, localStates peer.CRStatesThreadsafe, lastHealthDurations DurationMapThreadsafe, statHistory StatHistoryThreadsafe, monitorConfig TrafficMonitorConfigMapThreadsafe, peerStates peer.CRStatesPeersThreadsafe, combinedStates peer.CRStatesThreadsafe, fetchCount UintThreadsafe, errorCount UintThreadsafe, events EventsThreadsafe, localCacheStatus CacheAvailableStatusThreadsafe, cfg config.Config) { lastHealthEndTimes := map[enum.CacheName]time.Time{} healthHistory := map[enum.CacheName][]cache.Result{} // This reads at least 1 value from the cacheHealthChan. Then, we loop, and try to read from the channel some more. If there's nothing to read, we hit `default` and process. If there is stuff to read, we read it, then inner-loop trying to read more. If we're continuously reading and the channel is never empty, and we hit the tick time, process anyway even though the channel isn't empty, to prevent never processing (starvation). for { var results []cache.Result results = append(results, <-cacheHealthChan) tick := time.Tick(cfg.HealthFlushInterval) innerLoop: for { select { case <-tick: log.Warnf("Health Result Manager flushing queued results\n") processHealthResult(cacheHealthChan, toData, localStates, lastHealthDurations, statHistory, monitorConfig, peerStates, combinedStates, fetchCount, errorCount, events, localCacheStatus, lastHealthEndTimes, healthHistory, results, cfg) break innerLoop default: select { case r := <-cacheHealthChan: results = append(results, r) default: processHealthResult(cacheHealthChan, toData, localStates, lastHealthDurations, statHistory, monitorConfig, peerStates, combinedStates, fetchCount, errorCount, events, localCacheStatus, lastHealthEndTimes, healthHistory, results, cfg) break innerLoop } } } } }
// TODO JvD: add deliveryservice stuff func combineCrStates(peerStates map[enum.TrafficMonitorName]peer.Crstates, localStates peer.Crstates) peer.Crstates { combinedStates := peer.NewCrstates() for cacheName, localCacheState := range localStates.Caches { // localStates gets pruned when servers are disabled, it's the source of truth downVotes := 0 // TODO JvD: change to use parameter when deciding to be optimistic or pessimistic. if localCacheState.IsAvailable { // log.Infof(cacheName, " is available locally - setting to IsAvailable: true") combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic } else { downVotes++ // localStates says it's not happy for _, peerCrStates := range peerStates { if peerCrStates.Caches[cacheName].IsAvailable { // log.Infoln(cacheName, "- locally we think it's down, but", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "trusting the peer.") combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic break // one peer that thinks we're good is all we need. } else { // log.Infoln(cacheName, "- locally we think it's down, and", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "down voting") downVotes++ // peerStates for this peer doesn't like it } } } if downVotes > len(peerStates) { // log.Infoln(cacheName, "-", downVotes, "down votes, setting to IsAvailable: false") combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: false} } } for deliveryServiceName, localDeliveryService := range localStates.Deliveryservice { deliveryService := peer.Deliveryservice{IsAvailable: false, DisabledLocations: []enum.CacheName{}} // important to initialize DisabledLocations, so JSON is `[]` not `null` if localDeliveryService.IsAvailable { deliveryService.IsAvailable = true } deliveryService.DisabledLocations = localDeliveryService.DisabledLocations for peerName, iPeerStates := range peerStates { peerDeliveryService, ok := iPeerStates.Deliveryservice[deliveryServiceName] if !ok { log.Warnf("local delivery service %s not found in peer %s\n", deliveryServiceName, peerName) continue } if peerDeliveryService.IsAvailable { deliveryService.IsAvailable = true } deliveryService.DisabledLocations = intersection(deliveryService.DisabledLocations, peerDeliveryService.DisabledLocations) } combinedStates.Deliveryservice[deliveryServiceName] = deliveryService } return combinedStates }
// StartStatHistoryManager fetches the full statistics data from ATS Astats. This includes everything needed for all calculations, such as Delivery Services. This is expensive, though, and may be hard on ATS, so it should poll less often. // For a fast 'is it alive' poll, use the Health Result Manager poll. // Returns the stat history, the duration between the stat poll for each cache, the last Kbps data, the calculated Delivery Service stats, and the unpolled caches list. func StartStatHistoryManager( cacheStatChan <-chan cache.Result, localStates peer.CRStatesThreadsafe, combinedStates peer.CRStatesThreadsafe, toData todata.TODataThreadsafe, cachesChanged <-chan struct{}, errorCount UintThreadsafe, cfg config.Config, monitorConfig TrafficMonitorConfigMapThreadsafe, ) (StatHistoryThreadsafe, DurationMapThreadsafe, LastStatsThreadsafe, DSStatsThreadsafe, UnpolledCachesThreadsafe) { statHistory := NewStatHistoryThreadsafe(cfg.MaxStatHistory) lastStatDurations := NewDurationMapThreadsafe() lastStatEndTimes := map[enum.CacheName]time.Time{} lastStats := NewLastStatsThreadsafe() dsStats := NewDSStatsThreadsafe() unpolledCaches := NewUnpolledCachesThreadsafe() tickInterval := cfg.StatFlushInterval go func() { <-cachesChanged // wait for the signal that localStates have been set unpolledCaches.SetNewCaches(getNewCaches(localStates, monitorConfig)) for { var results []cache.Result results = append(results, <-cacheStatChan) tick := time.Tick(tickInterval) innerLoop: for { select { case <-cachesChanged: unpolledCaches.SetNewCaches(getNewCaches(localStates, monitorConfig)) case <-tick: log.Warnf("StatHistoryManager flushing queued results\n") processStatResults(results, statHistory, combinedStates.Get(), lastStats, toData.Get(), errorCount, dsStats, lastStatEndTimes, lastStatDurations, unpolledCaches) break innerLoop default: select { case r := <-cacheStatChan: results = append(results, r) default: processStatResults(results, statHistory, combinedStates.Get(), lastStats, toData.Get(), errorCount, dsStats, lastStatEndTimes, lastStatDurations, unpolledCaches) break innerLoop } } } } }() return statHistory, lastStatDurations, lastStats, dsStats, unpolledCaches }
// addCachePerSecStats calculates the cache per-second stats, adds them to LastStats, and returns the augmented object. func addCachePerSecStats(cacheName enum.CacheName, results []cache.Result, lastStats LastStats) LastStats { outBytes, outBytesTime, err := latestBytes(results) // it's ok if `latestBytes` returns 0s with an error, `addLastStat` will refrain from setting it (unless the previous calculation was nonzero, in which case it will error appropriately). if err != nil { log.Warnf("while computing delivery service data for cache %v: %v\n", cacheName, err) } lastStat := lastStats.Caches[cacheName] // if lastStats.Caches[cacheName] doesn't exist, it will be zero-constructed, and `addLastStat` will refrain from setting the PerSec for zero LastStats lastStat.Bytes, err = addLastStat(lastStat.Bytes, outBytes, outBytesTime) if err != nil { log.Errorf("while computing delivery service data for cache %v: %v\n", cacheName, err) return lastStats } lastStats.Caches[cacheName] = lastStat return lastStats }
func addAvailableData(dsStats Stats, crStates peer.Crstates, serverCachegroups map[enum.CacheName]enum.CacheGroupName, serverDs map[enum.CacheName][]enum.DeliveryServiceName, serverTypes map[enum.CacheName]enum.CacheType, statHistory map[enum.CacheName][]cache.Result) (Stats, error) { for cache, available := range crStates.Caches { cacheGroup, ok := serverCachegroups[cache] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Cachegroups\n", cache) continue } deliveryServices, ok := serverDs[cache] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in DeliveryServices\n", cache) continue } cacheType, ok := serverTypes[enum.CacheName(cache)] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Server Types\n", cache) continue } for _, deliveryService := range deliveryServices { if deliveryService == "" { log.Errorf("EMPTY addAvailableData DS") // various bugs in other functions can cause this - this will help identify and debug them. continue } stat, ok := dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Stats\n", cache) continue // TODO log warning? Error? } if available.IsAvailable { // c.IsAvailable.Value stat.CommonStats.IsAvailable.Value = true stat.CommonStats.CachesAvailableNum.Value++ cacheGroupStats := stat.CacheGroups[enum.CacheGroupName(cacheGroup)] cacheGroupStats.IsAvailable.Value = true stat.CacheGroups[enum.CacheGroupName(cacheGroup)] = cacheGroupStats stat.TotalStats.IsAvailable.Value = true typeStats := stat.Types[cacheType] typeStats.IsAvailable.Value = true stat.Types[cacheType] = typeStats } // TODO fix nested ifs if results, ok := statHistory[enum.CacheName(cache)]; ok { if len(results) < 1 { log.Warnf("no results %v %v\n", cache, deliveryService) } else { result := results[0] if result.PrecomputedData.Reporting { stat.CommonStats.CachesReporting[enum.CacheName(cache)] = true } else { log.Debugf("no reporting %v %v\n", cache, deliveryService) } } } else { log.Debugf("no result for %v %v\n", cache, deliveryService) } dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] = stat // TODO Necessary? Remove? } } return dsStats, nil }
func createCacheStatuses( cacheTypes map[enum.CacheName]enum.CacheType, statHistory map[enum.CacheName][]cache.Result, lastHealthDurations map[enum.CacheName]time.Duration, cacheStates map[enum.CacheName]peer.IsAvailable, lastStats ds.LastStats, localCacheStatusThreadsafe CacheAvailableStatusThreadsafe, ) map[enum.CacheName]CacheStatus { conns := createCacheConnections(statHistory) statii := map[enum.CacheName]CacheStatus{} localCacheStatus := localCacheStatusThreadsafe.Get() for cacheName, cacheType := range cacheTypes { cacheStatHistory, ok := statHistory[cacheName] if !ok { log.Warnf("createCacheStatuses stat history missing cache %s\n", cacheName) continue } if len(cacheStatHistory) < 1 { log.Warnf("createCacheStatuses stat history empty for cache %s\n", cacheName) continue } log.Debugf("createCacheStatuses NOT empty for cache %s\n", cacheName) var loadAverage *float64 procLoadAvg := cacheStatHistory[0].Astats.System.ProcLoadavg if procLoadAvg != "" { firstSpace := strings.IndexRune(procLoadAvg, ' ') if firstSpace == -1 { log.Warnf("WARNING unexpected proc.loadavg '%s' for cache %s\n", procLoadAvg, cacheName) } else { loadAverageVal, err := strconv.ParseFloat(procLoadAvg[:firstSpace], 64) if err != nil { log.Warnf("proc.loadavg doesn't contain a float prefix '%s' for cache %s\n", procLoadAvg, cacheName) } else { loadAverage = &loadAverageVal } } } var queryTime *int64 queryTimeVal, ok := lastHealthDurations[cacheName] if !ok { log.Warnf("cache not in last health durations cache %s\n", cacheName) } else { queryTimeInt := int64(queryTimeVal / time.Millisecond) queryTime = &queryTimeInt } var kbps *float64 lastStat, ok := lastStats.Caches[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in last kbps cache %s\n", cacheName) } else { kbpsVal := lastStat.Bytes.PerSec / float64(ds.BytesPerKilobit) kbps = &kbpsVal } var connections *int64 connectionsVal, ok := conns[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in connections %s\n", cacheName) } else { connections = &connectionsVal } var status *string statusVal, ok := localCacheStatus[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in statuses %s\n", cacheName) } else { statusString := statusVal.Status + " - " if localCacheStatus[enum.CacheName(cacheName)].Available { statusString += "available" } else { statusString += "unavailable" } status = &statusString } cacheTypeStr := string(cacheType) statii[enum.CacheName(cacheName)] = CacheStatus{Type: &cacheTypeStr, LoadAverage: loadAverage, QueryTimeMilliseconds: queryTime, BandwidthKbps: kbps, ConnectionCount: connections, Status: status} } return statii }
// TODO timing, and determine if the case, or its internal `for`, should be put in a goroutine // TODO determine if subscribers take action on change, and change to mutexed objects if not. func monitorConfigListen(monitorConfigTS TrafficMonitorConfigMapThreadsafe, monitorConfigPollChan <-chan to.TrafficMonitorConfigMap, localStates peer.CRStatesThreadsafe, statUrlSubscriber chan<- poller.HttpPollerConfig, healthUrlSubscriber chan<- poller.HttpPollerConfig, peerUrlSubscriber chan<- poller.HttpPollerConfig, cfg config.Config, staticAppData StaticAppData) { for { select { case monitorConfig := <-monitorConfigPollChan: monitorConfigTS.Set(monitorConfig) healthUrls := map[string]string{} statUrls := map[string]string{} peerUrls := map[string]string{} caches := map[string]string{} for _, srv := range monitorConfig.TrafficServer { caches[srv.HostName] = srv.Status cacheName := enum.CacheName(srv.HostName) if srv.Status == "ONLINE" { localStates.SetCache(cacheName, peer.IsAvailable{IsAvailable: true}) continue } if srv.Status == "OFFLINE" { localStates.SetCache(cacheName, peer.IsAvailable{IsAvailable: false}) continue } // seed states with available = false until our polling cycle picks up a result if _, exists := localStates.Get().Caches[cacheName]; !exists { localStates.SetCache(cacheName, peer.IsAvailable{IsAvailable: false}) } url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL r := strings.NewReplacer( "${hostname}", srv.FQDN, "${interface_name}", srv.InterfaceName, "application=system", "application=plugin.remap", "application=", "application=plugin.remap", ) url = r.Replace(url) healthUrls[srv.HostName] = url r = strings.NewReplacer("application=plugin.remap", "application=") url = r.Replace(url) statUrls[srv.HostName] = url } for _, srv := range monitorConfig.TrafficMonitor { if srv.HostName == staticAppData.Hostname { continue } if srv.Status != "ONLINE" { continue } // TODO: the URL should be config driven. -jse url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port) peerUrls[srv.HostName] = url } statUrlSubscriber <- poller.HttpPollerConfig{Urls: statUrls, Interval: cfg.CacheStatPollingInterval} healthUrlSubscriber <- poller.HttpPollerConfig{Urls: healthUrls, Interval: cfg.CacheHealthPollingInterval} peerUrlSubscriber <- poller.HttpPollerConfig{Urls: peerUrls, Interval: cfg.PeerPollingInterval} for cacheName := range localStates.GetCaches() { if _, exists := monitorConfig.TrafficServer[string(cacheName)]; !exists { log.Warnf("Removing %s from localStates", cacheName) localStates.DeleteCache(cacheName) } } // TODO because there are multiple writers to localStates.DeliveryService, there is a race condition, where MonitorConfig (this func) and HealthResultManager could write at the same time, and the HealthResultManager could overwrite a delivery service addition or deletion here. Probably the simplest and most performant fix would be a lock-free algorithm using atomic compare-and-swaps. for _, ds := range monitorConfig.DeliveryService { // since caches default to unavailable, also default DS false if _, exists := localStates.Get().Deliveryservice[enum.DeliveryServiceName(ds.XMLID)]; !exists { localStates.SetDeliveryService(enum.DeliveryServiceName(ds.XMLID), peer.Deliveryservice{IsAvailable: false, DisabledLocations: []enum.CacheName{}}) // important to initialize DisabledLocations, so JSON is `[]` not `null` } } for ds, _ := range localStates.Get().Deliveryservice { if _, exists := monitorConfig.DeliveryService[string(ds)]; !exists { localStates.DeleteDeliveryService(ds) } } } } }