func CreateStats(statHistory map[enum.CacheName][]cache.Result, toData todata.TOData, crStates peer.Crstates, lastKbpsStats StatsLastKbps, now time.Time) (Stats, StatsLastKbps, error) { start := time.Now() dsStats := NewStats() for deliveryService, _ := range toData.DeliveryServiceServers { if deliveryService == "" { log.Errorf("EMPTY CreateStats deliveryService") continue } dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] = *dsdata.NewStat() } dsStats = setStaticData(dsStats, toData.DeliveryServiceServers) var err error dsStats, err = addAvailableData(dsStats, crStates, toData.ServerCachegroups, toData.ServerDeliveryServices, toData.ServerTypes, statHistory) // TODO move after stat summarisation if err != nil { return dsStats, lastKbpsStats, fmt.Errorf("Error getting Cache availability data: %v", err) } for server, history := range statHistory { if len(history) < 1 { continue // TODO warn? } cachegroup, ok := toData.ServerCachegroups[server] if !ok { log.Warnf("server %s has no cachegroup, skipping\n", server) continue } serverType, ok := toData.ServerTypes[enum.CacheName(server)] if !ok { log.Warnf("server %s not in CRConfig, skipping\n", server) continue } result := history[len(history)-1] // TODO check result.PrecomputedData.Errors for ds, resultStat := range result.PrecomputedData.DeliveryServiceStats { if ds == "" { log.Errorf("EMPTY precomputed delivery service") continue } if _, ok := dsStats.DeliveryService[ds]; !ok { dsStats.DeliveryService[ds] = resultStat continue } httpDsStat := dsStats.DeliveryService[ds] httpDsStat.Total = httpDsStat.Total.Sum(resultStat.Total) httpDsStat.CacheGroups[cachegroup] = httpDsStat.CacheGroups[cachegroup].Sum(resultStat.CacheGroups[cachegroup]) httpDsStat.Type[serverType] = httpDsStat.Type[serverType].Sum(resultStat.Type[serverType]) dsStats.DeliveryService[ds] = httpDsStat // TODO determine if necessary } } kbpsStats, kbpsStatsLastKbps, kbpsErr := addKbps(statHistory, dsStats, lastKbpsStats, now) 
log.Infof("CreateStats took %v\n", time.Since(start)) return kbpsStats, kbpsStatsLastKbps, kbpsErr }
func healthResultManagerListen(cacheHealthChan <-chan cache.Result, toData todata.TODataThreadsafe, localStates peer.CRStatesThreadsafe, lastHealthDurations DurationMapThreadsafe, statHistory StatHistoryThreadsafe, monitorConfig TrafficMonitorConfigMapThreadsafe, peerStates peer.CRStatesPeersThreadsafe, combinedStates peer.CRStatesThreadsafe, fetchCount UintThreadsafe, errorCount UintThreadsafe, events EventsThreadsafe, localCacheStatus CacheAvailableStatusThreadsafe) { lastHealthEndTimes := map[enum.CacheName]time.Time{} healthHistory := map[enum.CacheName][]cache.Result{} // This reads at least 1 value from the cacheHealthChan. Then, we loop, and try to read from the channel some more. If there's nothing to read, we hit `default` and process. If there is stuff to read, we read it, then inner-loop trying to read more. If we're continuously reading and the channel is never empty, and we hit the tick time, process anyway even though the channel isn't empty, to prevent never processing (starvation). for { var results []cache.Result results = append(results, <-cacheHealthChan) tickInterval := time.Millisecond * 200 // TODO make config setting tick := time.Tick(tickInterval) innerLoop: for { select { case <-tick: log.Warnf("Health Result Manager flushing queued results\n") processHealthResult(cacheHealthChan, toData, localStates, lastHealthDurations, statHistory, monitorConfig, peerStates, combinedStates, fetchCount, errorCount, events, localCacheStatus, lastHealthEndTimes, healthHistory, results) break innerLoop default: select { case r := <-cacheHealthChan: results = append(results, r) default: processHealthResult(cacheHealthChan, toData, localStates, lastHealthDurations, statHistory, monitorConfig, peerStates, combinedStates, fetchCount, errorCount, events, localCacheStatus, lastHealthEndTimes, healthHistory, results) break innerLoop } } } } }
func (handler Handler) Handle(id string, r io.Reader, err error, pollId uint64, pollFinished chan<- uint64) { log.Debugf("poll %v %v handle start\n", pollId, time.Now()) result := Result{ Id: id, Available: false, Errors: []error{}, Time: time.Now(), // TODO change this to be computed the instant we get the result back, to minimise inaccuracy PollID: pollId, PollFinished: pollFinished, } if err != nil { log.Errorf("%v handler given error '%v'\n", id, err) // error here, in case the thing that called Handle didn't error result.Errors = append(result.Errors, err) handler.ResultChannel <- result return } if r == nil { log.Errorf("%v handle reader nil\n", id) result.Errors = append(result.Errors, fmt.Errorf("handler got nil reader")) handler.ResultChannel <- result return } result.PrecomputedData.Reporting = true if err := json.NewDecoder(r).Decode(&result.Astats); err != nil { log.Errorf("%s procnetdev decode error '%v'\n", id, err) result.Errors = append(result.Errors, err) handler.ResultChannel <- result return } if result.Astats.System.ProcNetDev == "" { log.Warnf("addkbps %s procnetdev empty\n", id) } log.Debugf("poll %v %v handle decode end\n", pollId, time.Now()) if err != nil { result.Errors = append(result.Errors, err) log.Errorf("addkbps handle %s error '%v'\n", id, err) } else { result.Available = true } if handler.Precompute() { log.Debugf("poll %v %v handle precompute start\n", pollId, time.Now()) result = handler.precompute(result) log.Debugf("poll %v %v handle precompute end\n", pollId, time.Now()) } log.Debugf("poll %v %v handle write start\n", pollId, time.Now()) handler.ResultChannel <- result log.Debugf("poll %v %v handle end\n", pollId, time.Now()) }
// TODO JvD: add deliveryservice stuff

// combineCrStates merges the local monitor's CRStates with the states reported by
// peer monitors into a single combined view.
//
// Caches: if the local monitor thinks a cache is available, it's marked available
// (optimistic; peers are ignored). If the local monitor thinks it's down, any single
// peer reporting it available overrides (still optimistic). Only when the local
// monitor AND every peer agree it's down is it marked unavailable.
//
// Delivery services: a DS is available if the local monitor or any peer says so;
// DisabledLocations becomes the intersection of all reporters' disabled locations.
func combineCrStates(peerStates map[string]peer.Crstates, localStates peer.Crstates) peer.Crstates {
	combinedStates := peer.NewCrstates()
	for cacheName, localCacheState := range localStates.Caches { // localStates gets pruned when servers are disabled, it's the source of truth
		// One vote per reporter that considers the cache down: the local monitor
		// plus each peer. Unavailability requires unanimity (see check below).
		downVotes := 0 // TODO JvD: change to use parameter when deciding to be optimistic or pessimistic.
		if localCacheState.IsAvailable {
			// log.Infof(cacheName, " is available locally - setting to IsAvailable: true")
			combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic
		} else {
			downVotes++ // localStates says it's not happy
			for _, peerCrStates := range peerStates {
				if peerCrStates.Caches[cacheName].IsAvailable {
					// log.Infoln(cacheName, "- locally we think it's down, but", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "trusting the peer.")
					combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic
					break // one peer that thinks we're good is all we need.
				} else {
					// log.Infoln(cacheName, "- locally we think it's down, and", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "down voting")
					downVotes++ // peerStates for this peer doesn't like it
				}
			}
		}
		// downVotes > len(peerStates) only when the local monitor voted down AND no
		// peer broke out of the loop above, i.e. every reporter agrees it's down.
		if downVotes > len(peerStates) {
			// log.Infoln(cacheName, "-", downVotes, "down votes, setting to IsAvailable: false")
			combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: false}
		}
	}

	for deliveryServiceName, localDeliveryService := range localStates.Deliveryservice {
		deliveryService := peer.Deliveryservice{}
		if localDeliveryService.IsAvailable {
			deliveryService.IsAvailable = true
		}
		deliveryService.DisabledLocations = localDeliveryService.DisabledLocations
		for peerName, iPeerStates := range peerStates {
			peerDeliveryService, ok := iPeerStates.Deliveryservice[deliveryServiceName]
			if !ok {
				log.Warnf("local delivery service %s not found in peer %s\n", deliveryServiceName, peerName)
				continue
			}
			if peerDeliveryService.IsAvailable {
				deliveryService.IsAvailable = true
			}
			// Keep only the locations every reporter has disabled.
			deliveryService.DisabledLocations = intersection(deliveryService.DisabledLocations, peerDeliveryService.DisabledLocations)
		}
		combinedStates.Deliveryservice[deliveryServiceName] = deliveryService
	}
	return combinedStates
}
func createCacheStatuses(cacheTypes map[enum.CacheName]enum.CacheType, statHistory map[enum.CacheName][]cache.Result, lastHealthDurations map[enum.CacheName]time.Duration, cacheStates map[string]peer.IsAvailable, lastKbpsStats ds.StatsLastKbps, localCacheStatusThreadsafe CacheAvailableStatusThreadsafe) map[enum.CacheName]CacheStatus { conns := createCacheConnections(statHistory) statii := map[enum.CacheName]CacheStatus{} localCacheStatus := localCacheStatusThreadsafe.Get() for cacheName, cacheType := range cacheTypes { cacheStatHistory, ok := statHistory[cacheName] if !ok { log.Warnf("createCacheStatuses stat history missing cache %s\n", cacheName) continue } if len(cacheStatHistory) < 1 { log.Warnf("createCacheStatuses stat history empty for cache %s\n", cacheName) continue } log.Debugf("createCacheStatuses NOT empty for cache %s\n", cacheName) var loadAverage *float64 procLoadAvg := cacheStatHistory[0].Astats.System.ProcLoadavg if procLoadAvg != "" { firstSpace := strings.IndexRune(procLoadAvg, ' ') if firstSpace == -1 { log.Warnf("WARNING unexpected proc.loadavg '%s' for cache %s\n", procLoadAvg, cacheName) } else { loadAverageVal, err := strconv.ParseFloat(procLoadAvg[:firstSpace], 64) if err != nil { log.Warnf("proc.loadavg doesn't contain a float prefix '%s' for cache %s\n", procLoadAvg, cacheName) } else { loadAverage = &loadAverageVal } } } var queryTime *int64 queryTimeVal, ok := lastHealthDurations[cacheName] if !ok { log.Warnf("cache not in last health durations cache %s\n", cacheName) } else { queryTimeInt := int64(queryTimeVal / time.Millisecond) queryTime = &queryTimeInt } var kbps *float64 kbpsVal, ok := lastKbpsStats.Caches[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in last kbps cache %s\n", cacheName) } else { kbps = &kbpsVal.Kbps } var connections *int64 connectionsVal, ok := conns[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in connections %s\n", cacheName) } else { connections = &connectionsVal } var status *string 
statusVal, ok := localCacheStatus[enum.CacheName(cacheName)] if !ok { log.Warnf("cache not in statuses %s\n", cacheName) } else { statusString := statusVal.Status + " - " if localCacheStatus[enum.CacheName(cacheName)].Available { statusString += "available" } else { statusString += "unavailable" } status = &statusString } cacheTypeStr := string(cacheType) statii[enum.CacheName(cacheName)] = CacheStatus{Type: &cacheTypeStr, LoadAverage: loadAverage, QueryTimeMilliseconds: queryTime, BandwidthKbps: kbps, ConnectionCount: connections, Status: status} } return statii }
func addAvailableData(dsStats Stats, crStates peer.Crstates, serverCachegroups map[enum.CacheName]enum.CacheGroupName, serverDs map[string][]string, serverTypes map[enum.CacheName]enum.CacheType, statHistory map[enum.CacheName][]cache.Result) (Stats, error) { for cache, available := range crStates.Caches { cacheGroup, ok := serverCachegroups[enum.CacheName(cache)] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Cachegroups\n", cache) continue } deliveryServices, ok := serverDs[cache] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in DeliveryServices\n", cache) continue } cacheType, ok := serverTypes[enum.CacheName(cache)] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Server Types\n", cache) continue } for _, deliveryService := range deliveryServices { if deliveryService == "" { log.Errorf("EMPTY addAvailableData DS") // various bugs in other functions can cause this - this will help identify and debug them. continue } stat, ok := dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] if !ok { log.Warnf("CreateStats not adding availability data for '%s': not found in Stats\n", cache) continue // TODO log warning? Error? 
} if available.IsAvailable { // c.IsAvailable.Value stat.Common.IsAvailable.Value = true stat.Common.CachesAvailable.Value++ cacheGroupStats := stat.CacheGroups[enum.CacheGroupName(cacheGroup)] cacheGroupStats.IsAvailable.Value = true stat.CacheGroups[enum.CacheGroupName(cacheGroup)] = cacheGroupStats stat.Total.IsAvailable.Value = true typeStats := stat.Type[cacheType] typeStats.IsAvailable.Value = true stat.Type[cacheType] = typeStats } // TODO fix nested ifs if results, ok := statHistory[enum.CacheName(cache)]; ok { if len(results) < 1 { log.Warnf("no results %v %v\n", cache, deliveryService) } else { result := results[0] if result.PrecomputedData.Reporting { stat.Common.CachesReporting[enum.CacheName(cache)] = true } else { log.Debugf("no reporting %v %v\n", cache, deliveryService) } } } else { log.Debugf("no result for %v %v\n", cache, deliveryService) } dsStats.DeliveryService[enum.DeliveryServiceName(deliveryService)] = stat // TODO Necessary? Remove? } } return dsStats, nil }
// addKbps adds Kbps fields to the NewStats, based on the previous out_bytes in the oldStats, and the time difference. // // Traffic Server only updates its data every N seconds. So, often we get a new Stats with the same OutBytes as the previous one, // So, we must record the last changed value, and the time it changed. Then, if the new OutBytes is different from the previous, // we set the (new - old) / lastChangedTime as the KBPS, and update the recorded LastChangedTime and LastChangedValue // // This specifically returns the given dsStats and lastKbpsStats on error, so it's safe to do persistentStats, persistentLastKbpsStats, err = addKbps(...) // TODO handle ATS byte rolling (when the `out_bytes` overflows back to 0) func addKbps(statHistory map[enum.CacheName][]cache.Result, dsStats Stats, lastKbpsStats StatsLastKbps, dsStatsTime time.Time) (Stats, StatsLastKbps, error) { for dsName, stat := range dsStats.DeliveryService { lastKbpsStat, lastKbpsStatExists := lastKbpsStats.DeliveryServices[dsName] if !lastKbpsStatExists { lastKbpsStat = newStatLastKbps() } for cgName, cacheStats := range stat.CacheGroups { lastKbpsData, _ := lastKbpsStat.CacheGroups[cgName] if cacheStats.OutBytes.Value == lastKbpsData.Bytes { cacheStats.Kbps.Value = lastKbpsData.Kbps stat.CacheGroups[cgName] = cacheStats continue } if lastKbpsStatExists && lastKbpsData.Bytes != 0 { cacheStats.Kbps.Value = float64(cacheStats.OutBytes.Value-lastKbpsData.Bytes) / dsStatsTime.Sub(lastKbpsData.Time).Seconds() } if cacheStats.Kbps.Value < 0 { cacheStats.Kbps.Value = 0 log.Errorf("addkbps negative cachegroup cacheStats.Kbps.Value: '%v' '%v' %v - %v / %v\n", dsName, cgName, cacheStats.OutBytes.Value, lastKbpsData.Bytes, dsStatsTime.Sub(lastKbpsData.Time).Seconds()) } lastKbpsStat.CacheGroups[cgName] = LastKbpsData{Time: dsStatsTime, Bytes: cacheStats.OutBytes.Value, Kbps: cacheStats.Kbps.Value} stat.CacheGroups[cgName] = cacheStats } for cacheType, cacheStats := range stat.Type { lastKbpsData, _ := 
lastKbpsStat.Type[cacheType] if cacheStats.OutBytes.Value == lastKbpsData.Bytes { if cacheStats.OutBytes.Value == lastKbpsData.Bytes { if lastKbpsData.Kbps < 0 { log.Errorf("addkbps negative cachetype cacheStats.Kbps.Value!\n") lastKbpsData.Kbps = 0 } cacheStats.Kbps.Value = lastKbpsData.Kbps stat.Type[cacheType] = cacheStats continue } if lastKbpsStatExists && lastKbpsData.Bytes != 0 { cacheStats.Kbps.Value = float64(cacheStats.OutBytes.Value-lastKbpsData.Bytes) / dsStatsTime.Sub(lastKbpsData.Time).Seconds() } if cacheStats.Kbps.Value < 0 { log.Errorf("addkbps negative cachetype cacheStats.Kbps.Value.\n") cacheStats.Kbps.Value = 0 } lastKbpsStat.Type[cacheType] = LastKbpsData{Time: dsStatsTime, Bytes: cacheStats.OutBytes.Value, Kbps: cacheStats.Kbps.Value} stat.Type[cacheType] = cacheStats } } totalChanged := lastKbpsStat.Total.Bytes != stat.Total.OutBytes.Value if lastKbpsStatExists && lastKbpsStat.Total.Bytes != 0 && totalChanged { stat.Total.Kbps.Value = float64(stat.Total.OutBytes.Value-lastKbpsStat.Total.Bytes) / dsStatsTime.Sub(lastKbpsStat.Total.Time).Seconds() / BytesPerKbps if stat.Total.Kbps.Value < 0 { stat.Total.Kbps.Value = 0 log.Errorf("addkbps negative stat.Total.Kbps.Value! Deliveryservice '%v' %v - %v / %v\n", dsName, stat.Total.OutBytes.Value, lastKbpsStat.Total.Bytes, dsStatsTime.Sub(lastKbpsStat.Total.Time).Seconds()) } } else { stat.Total.Kbps.Value = lastKbpsStat.Total.Kbps } if totalChanged { lastKbpsStat.Total = LastKbpsData{Time: dsStatsTime, Bytes: stat.Total.OutBytes.Value, Kbps: stat.Total.Kbps.Value} } lastKbpsStats.DeliveryServices[dsName] = lastKbpsStat dsStats.DeliveryService[dsName] = stat } for cacheName, results := range statHistory { var result *cache.Result for _, r := range results { // result.Errors can include stat errors where OutBytes was set correctly, so we look for the first non-zero OutBytes rather than the first errorless result // TODO add error classes to PrecomputedData, to distinguish stat errors from HTTP errors? 
if r.PrecomputedData.OutBytes == 0 { continue } result = &r break } if result == nil { log.Warnf("addkbps cache %v has no results\n", cacheName) continue } outBytes := result.PrecomputedData.OutBytes lastCacheKbpsData, ok := lastKbpsStats.Caches[cacheName] if !ok { // this means this is the first result for this cache - this is a normal condition lastKbpsStats.Caches[cacheName] = LastKbpsData{Time: dsStatsTime, Bytes: outBytes, Kbps: 0} continue } if lastCacheKbpsData.Bytes == outBytes { // this means this ATS hasn't updated its byte count yet - this is a normal condition continue // don't try to kbps, and importantly don't change the time of the last change, if Traffic Server hasn't updated } if outBytes == 0 { log.Errorf("addkbps %v outbytes zero\n", cacheName) continue } kbps := float64(outBytes-lastCacheKbpsData.Bytes) / result.Time.Sub(lastCacheKbpsData.Time).Seconds() / BytesPerKbps if lastCacheKbpsData.Bytes == 0 { kbps = 0 log.Errorf("addkbps cache %v lastCacheKbpsData.Bytes zero\n", cacheName) } if kbps < 0 { log.Errorf("addkbps negative cache kbps: cache %v kbps %v outBytes %v lastCacheKbpsData.Bytes %v dsStatsTime %v lastCacheKbpsData.Time %v\n", cacheName, kbps, outBytes, lastCacheKbpsData.Bytes, dsStatsTime, lastCacheKbpsData.Time) // this is almost certainly a code bug. The only case this would ever be a data issue, would be if Traffic Server returned fewer bytes than previously. kbps = 0 } lastKbpsStats.Caches[cacheName] = LastKbpsData{Time: result.Time, Bytes: outBytes, Kbps: kbps} } return dsStats, lastKbpsStats, nil }
// TODO timing, and determine if the case, or its internal `for`, should be put in a goroutine // TODO determine if subscribers take action on change, and change to mutexed objects if not. func monitorConfigListen(monitorConfigTS TrafficMonitorConfigMapThreadsafe, monitorConfigPollChan <-chan to.TrafficMonitorConfigMap, localStates peer.CRStatesThreadsafe, statUrlSubscriber chan<- poller.HttpPollerConfig, healthUrlSubscriber chan<- poller.HttpPollerConfig, peerUrlSubscriber chan<- poller.HttpPollerConfig) { for { select { case monitorConfig := <-monitorConfigPollChan: monitorConfigTS.Set(monitorConfig) healthUrls := map[string]string{} statUrls := map[string]string{} peerUrls := map[string]string{} caches := map[string]string{} for _, srv := range monitorConfig.TrafficServer { caches[srv.HostName] = srv.Status if srv.Status == "ONLINE" { localStates.SetCache(srv.HostName, peer.IsAvailable{IsAvailable: true}) continue } if srv.Status == "OFFLINE" { localStates.SetCache(srv.HostName, peer.IsAvailable{IsAvailable: false}) continue } // seed states with available = false until our polling cycle picks up a result if _, exists := localStates.Get().Caches[srv.HostName]; !exists { localStates.SetCache(srv.HostName, peer.IsAvailable{IsAvailable: false}) } url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL r := strings.NewReplacer( "${hostname}", srv.FQDN, "${interface_name}", srv.InterfaceName, "application=system", "application=plugin.remap", "application=", "application=plugin.remap", ) url = r.Replace(url) healthUrls[srv.HostName] = url r = strings.NewReplacer("application=plugin.remap", "application=") url = r.Replace(url) statUrls[srv.HostName] = url } for _, srv := range monitorConfig.TrafficMonitor { if srv.Status != "ONLINE" { continue } // TODO: the URL should be config driven. 
-jse url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port) peerUrls[srv.HostName] = url } statUrlSubscriber <- poller.HttpPollerConfig{Urls: statUrls, Interval: defaultCacheStatPollingInterval} healthUrlSubscriber <- poller.HttpPollerConfig{Urls: healthUrls, Interval: defaultCacheHealthPollingInterval} peerUrlSubscriber <- poller.HttpPollerConfig{Urls: peerUrls, Interval: defaultPeerPollingInterval} for k := range localStates.GetCaches() { if _, exists := monitorConfig.TrafficServer[k]; !exists { log.Warnf("Removing %s from localStates", k) localStates.DeleteCache(k) } } addStateDeliveryServices(monitorConfig, localStates.Get().Deliveryservice) } } }