// processHealthResult processes the given health results, adding their stats to the CacheAvailableStatus. Note this is NOT threadsafe, because it non-atomically gets CacheAvailableStatuses, Events, LastHealthDurations and later updates them. This MUST NOT be called from multiple threads. func processHealthResult(cacheHealthChan <-chan cache.Result, toData todata.TODataThreadsafe, localStates peer.CRStatesThreadsafe, lastHealthDurationsThreadsafe DurationMapThreadsafe, statHistory StatHistoryThreadsafe, monitorConfig TrafficMonitorConfigMapThreadsafe, peerStates peer.CRStatesPeersThreadsafe, combinedStates peer.CRStatesThreadsafe, fetchCount UintThreadsafe, errorCount UintThreadsafe, events EventsThreadsafe, localCacheStatusThreadsafe CacheAvailableStatusThreadsafe, lastHealthEndTimes map[enum.CacheName]time.Time, healthHistory map[enum.CacheName][]cache.Result, results []cache.Result, cfg config.Config) { if len(results) == 0 { return } toDataCopy := toData.Get() // create a copy, so the same data used for all processing of this cache health result localCacheStatus := localCacheStatusThreadsafe.Get().Copy() monitorConfigCopy := monitorConfig.Get() for _, healthResult := range results { log.Debugf("poll %v %v healthresultman start\n", healthResult.PollID, time.Now()) fetchCount.Inc() var prevResult cache.Result healthResultHistory := healthHistory[enum.CacheName(healthResult.Id)] // healthResultHistory := healthHistory.Get(enum.CacheName(healthResult.Id)) if len(healthResultHistory) != 0 { prevResult = healthResultHistory[len(healthResultHistory)-1] } health.GetVitals(&healthResult, &prevResult, &monitorConfigCopy) // healthHistory.Set(enum.CacheName(healthResult.Id), pruneHistory(append(healthHistory.Get(enum.CacheName(healthResult.Id)), healthResult), defaultMaxHistory)) healthHistory[enum.CacheName(healthResult.Id)] = pruneHistory(append(healthHistory[enum.CacheName(healthResult.Id)], healthResult), cfg.MaxHealthHistory) isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfigCopy) if localStates.Get().Caches[healthResult.Id].IsAvailable != isAvailable { log.Infof("Changing state for %s was: %t now: %t because %s errors: %v", healthResult.Id, prevResult.Available, isAvailable, whyAvailable, healthResult.Errors) events.Add(Event{Time: time.Now().Unix(), Description: whyAvailable, Name: healthResult.Id, Hostname: healthResult.Id, Type: toDataCopy.ServerTypes[healthResult.Id].String(), Available: isAvailable}) } localCacheStatus[healthResult.Id] = CacheAvailableStatus{Available: isAvailable, Status: monitorConfigCopy.TrafficServer[string(healthResult.Id)].Status} // TODO move within localStates? localStates.SetCache(healthResult.Id, peer.IsAvailable{IsAvailable: isAvailable}) log.Debugf("poll %v %v calculateDeliveryServiceState start\n", healthResult.PollID, time.Now()) calculateDeliveryServiceState(toDataCopy.DeliveryServiceServers, localStates) log.Debugf("poll %v %v calculateDeliveryServiceState end\n", healthResult.PollID, time.Now()) } localCacheStatusThreadsafe.Set(localCacheStatus) // TODO determine if we should combineCrStates() here lastHealthDurations := lastHealthDurationsThreadsafe.Get().Copy() for _, healthResult := range results { if lastHealthStart, ok := lastHealthEndTimes[enum.CacheName(healthResult.Id)]; ok { d := time.Since(lastHealthStart) lastHealthDurations[enum.CacheName(healthResult.Id)] = d } lastHealthEndTimes[enum.CacheName(healthResult.Id)] = time.Now() log.Debugf("poll %v %v finish\n", healthResult.PollID, time.Now()) healthResult.PollFinished <- healthResult.PollID } lastHealthDurationsThreadsafe.Set(lastHealthDurations) }
// // Kicks off the pollers and handlers // func Start(opsConfigFile string, staticAppData StaticAppData) { var toSession *traffic_ops.Session fetchSuccessCounter := gmx.NewCounter("fetchSuccess") fetchFailCounter := gmx.NewCounter("fetchFail") fetchPendingGauge := gmx.NewGauge("fetchPending") tr := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, } sharedClient := http.Client{ Timeout: defaultHttpTimeout, Transport: tr, } cacheHealthConfigChannel := make(chan poller.HttpPollerConfig) cacheHealthChannel := make(chan cache.Result) cacheHealthTick := make(chan uint64) cacheHealthPoller := poller.HttpPoller{ TickChan: cacheHealthTick, ConfigChannel: cacheHealthConfigChannel, Config: poller.HttpPollerConfig{ Interval: defaultCacheHealthPollingInterval, }, Fetcher: fetcher.HttpFetcher{ Handler: cache.Handler{ResultChannel: cacheHealthChannel}, Client: sharedClient, Success: fetchSuccessCounter, Fail: fetchFailCounter, Pending: fetchPendingGauge, }, } cacheStatConfigChannel := make(chan poller.HttpPollerConfig) cacheStatChannel := make(chan cache.Result) cacheStatPoller := poller.HttpPoller{ ConfigChannel: cacheStatConfigChannel, Config: poller.HttpPollerConfig{ Interval: defaultCacheStatPollingInterval, }, Fetcher: fetcher.HttpFetcher{ Handler: cache.Handler{ResultChannel: cacheStatChannel}, Client: sharedClient, Success: fetchSuccessCounter, Fail: fetchFailCounter, Pending: fetchPendingGauge, }, } sessionChannel := make(chan *traffic_ops.Session) monitorConfigChannel := make(chan traffic_ops.TrafficMonitorConfigMap) monitorOpsConfigChannel := make(chan handler.OpsConfig) monitorConfigPoller := poller.MonitorConfigPoller{ Interval: defaultMonitorConfigPollingInterval, SessionChannel: sessionChannel, ConfigChannel: monitorConfigChannel, OpsConfigChannel: monitorOpsConfigChannel, } opsConfigFileChannel := make(chan interface{}) opsConfigFilePoller := poller.FilePoller{ File: opsConfigFile, ResultChannel: opsConfigFileChannel, } opsConfigChannel := make(chan handler.OpsConfig) opsConfigFileHandler := handler.OpsConfigFileHandler{ ResultChannel: opsConfigFilePoller.ResultChannel, OpsConfigChannel: opsConfigChannel, } peerConfigChannel := make(chan poller.HttpPollerConfig) peerChannel := make(chan peer.Result) peerPoller := poller.HttpPoller{ ConfigChannel: peerConfigChannel, Config: poller.HttpPollerConfig{ Interval: defaultPeerPollingInterval, }, Fetcher: fetcher.HttpFetcher{ Handler: peer.Handler{ResultChannel: peerChannel}, Client: sharedClient, Success: fetchSuccessCounter, Fail: fetchFailCounter, Pending: fetchPendingGauge, }, } go opsConfigFileHandler.Listen() go opsConfigFilePoller.Poll() go monitorConfigPoller.Poll() go cacheHealthPoller.Poll() go cacheStatPoller.Poll() go peerPoller.Poll() dr := make(chan http_server.DataRequest) healthHistory := make(map[string][]interface{}) statHistory := make(map[string][]interface{}) var opsConfig handler.OpsConfig var monitorConfig traffic_ops.TrafficMonitorConfigMap localStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)} // this is the local state as discoverer by this traffic_monitor peerStates := make(map[string]peer.Crstates) // each peer's last state is saved in this map combinedStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)} // this is the result of combining the localStates and all the peerStates using the var ?? deliveryServiceServers := map[string][]string{} serverTypes := map[string]string{} // TODO put stat data in a struct, for brevity lastHealthEndTimes := map[string]time.Time{} lastHealthDurations := map[string]time.Duration{} fetchCount := uint64(0) // note this is the number of individual caches fetched from, not the number of times all the caches were polled. healthIteration := uint64(0) errorCount := uint64(0) events := []Event{} eventIndex := uint64(0) for { select { case req := <-dr: defer close(req.C) var body []byte var err error switch req.T { case http_server.TR_CONFIG: if toSession != nil && opsConfig.CdnName != "" { body, err = toSession.CRConfigRaw(opsConfig.CdnName) } case http_server.TR_STATE_DERIVED: body, err = peer.CrStatesMarshall(combinedStates) case http_server.TR_STATE_SELF: body, err = peer.CrStatesMarshall(localStates) case http_server.CACHE_STATS: // TODO: add support for ?hc=N query param, stats=, wildcard, individual caches // add pp and date to the json: /* pp: "0=[my-ats-edge-cache-1], hc=[1]", date: "Thu Oct 09 20:28:36 UTC 2014" */ params := req.Parameters hc := 1 if _, exists := params["hc"]; exists { v, err := strconv.Atoi(params["hc"][0]) if err == nil { hc = v } } body, err = cache.StatsMarshall(statHistory, hc) case http_server.DS_STATS: body = []byte("TODO implement") case http_server.EVENT_LOG: body, err = json.Marshal(JSONEvents{Events: events}) case http_server.PEER_STATES: body = []byte("TODO implement") case http_server.STAT_SUMMARY: body = []byte("TODO implement") case http_server.STATS: body, err = getStats(staticAppData, cacheHealthPoller.Config.Interval, lastHealthDurations, fetchCount, healthIteration, errorCount) if err != nil { // TODO send error to client errorCount++ log.Printf("ERROR getting stats %v\n", err) continue } case http_server.CONFIG_DOC: opsConfigCopy := opsConfig // if the password is blank, leave it blank, so callers can see it's missing. if opsConfigCopy.Password != "" { opsConfigCopy.Password = "******" } body, err = json.Marshal(opsConfigCopy) default: body = []byte("TODO error message") } req.C <- body case oc := <-opsConfigFileHandler.OpsConfigChannel: var err error opsConfig = oc listenAddress := ":80" // default if opsConfig.HttpListener != "" { listenAddress = opsConfig.HttpListener } err = http_server.Run(dr, listenAddress) if err != nil { errorCount++ log.Printf("MonitorConfigPoller: error creating HTTP server: %s\n", err) continue } toSession, err = traffic_ops.Login(opsConfig.Url, opsConfig.Username, opsConfig.Password, opsConfig.Insecure) if err != nil { errorCount++ log.Printf("MonitorConfigPoller: error instantiating Session with traffic_ops: %s\n", err) continue } deliveryServiceServers, err = getDeliveryServiceServers(toSession, opsConfig.CdnName) if err != nil { errorCount++ log.Printf("Error getting delivery service servers from Traffic Ops: %v\n", err) continue } serverTypes, err = getServerTypes(toSession, opsConfig.CdnName) if err != nil { errorCount++ log.Printf("Error getting server types from Traffic Ops: %v\n", err) continue } // This must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition. go func() { monitorConfigPoller.OpsConfigChannel <- opsConfig // this is needed for cdnName monitorConfigPoller.SessionChannel <- toSession }() case monitorConfig = <-monitorConfigPoller.ConfigChannel: healthUrls := map[string]string{} statUrls := map[string]string{} peerUrls := map[string]string{} caches := map[string]string{} for _, srv := range monitorConfig.TrafficServer { caches[srv.HostName] = srv.Status if srv.Status == "ONLINE" { localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: true} continue } if srv.Status == "OFFLINE" { localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false} continue } // seed states with available = false until our polling cycle picks up a result if _, exists := localStates.Caches[srv.HostName]; !exists { localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false} } url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL r := strings.NewReplacer( "${hostname}", srv.FQDN, "${interface_name}", srv.InterfaceName, "application=system", "application=plugin.remap", "application=", "application=plugin.remap", ) url = r.Replace(url) healthUrls[srv.HostName] = url r = strings.NewReplacer("application=plugin.remap", "application=") url = r.Replace(url) statUrls[srv.HostName] = url } for _, srv := range monitorConfig.TrafficMonitor { if srv.Status != "ONLINE" { continue } // TODO: the URL should be config driven. -jse url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port) peerUrls[srv.HostName] = url } cacheStatPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: statUrls, Interval: defaultCacheStatPollingInterval} cacheHealthPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: healthUrls, Interval: defaultCacheHealthPollingInterval} peerPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: peerUrls, Interval: defaultPeerPollingInterval} for k := range localStates.Caches { _, exists := monitorConfig.TrafficServer[k] if !exists { fmt.Printf("Warning: removing %s from localStates", k) delete(localStates.Caches, k) } } addStateDeliveryServices(monitorConfig, localStates.Deliveryservice) case i := <-cacheHealthTick: healthIteration = i case healthResult := <-cacheHealthChannel: fetchCount++ var prevResult cache.Result if len(healthHistory[healthResult.Id]) != 0 { prevResult = healthHistory[healthResult.Id][len(healthHistory[healthResult.Id])-1].(cache.Result) } health.GetVitals(&healthResult, &prevResult, &monitorConfig) healthHistory[healthResult.Id] = pruneHistory(append(healthHistory[healthResult.Id], healthResult), defaultMaxHistory) isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfig) if localStates.Caches[healthResult.Id].IsAvailable != isAvailable { fmt.Println("Changing state for", healthResult.Id, " was:", prevResult.Available, " is now:", isAvailable, " because:", whyAvailable, " errors:", healthResult.Errors) e := Event{ Index: eventIndex, Time: time.Now().Unix(), Description: whyAvailable, Name: healthResult.Id, Hostname: healthResult.Id, Type: serverTypes[healthResult.Id], Available: isAvailable, } events = append([]Event{e}, events...) if len(events) > maxEvents { events = events[:maxEvents-1] } eventIndex++ } localStates.Caches[healthResult.Id] = peer.IsAvailable{IsAvailable: isAvailable} calculateDeliveryServiceState(deliveryServiceServers, localStates.Caches, localStates.Deliveryservice) if lastHealthStart, ok := lastHealthEndTimes[healthResult.Id]; ok { lastHealthDurations[healthResult.Id] = time.Since(lastHealthStart) } lastHealthEndTimes[healthResult.Id] = time.Now() // if _, ok := queryIntervalStart[pollI]; !ok { // log.Printf("ERROR poll start index not found") // continue // } // lastQueryIntervalTime = time.Since(queryIntervalStart[pollI]) case stats := <-cacheStatChannel: statHistory[stats.Id] = pruneHistory(append(statHistory[stats.Id], stats), defaultMaxHistory) case crStatesResult := <-peerChannel: peerStates[crStatesResult.Id] = crStatesResult.PeerStats combinedStates = combineCrStates(peerStates, localStates) } } }