//
// Kicks off the pollers and handlers
//
func Start(opsConfigFile string, cfg config.Config, staticAppData StaticAppData) {
	toSession := towrap.ITrafficOpsSession(towrap.NewTrafficOpsSessionThreadsafe(nil))

	counters := fetcher.Counters{
		Success: gmx.NewCounter("fetchSuccess"),
		Fail:    gmx.NewCounter("fetchFail"),
		Pending: gmx.NewGauge("fetchPending"),
	}

	// TODO investigate whether a unique client per cache to be polled is faster
	sharedClient := &http.Client{
		Timeout:   cfg.HttpTimeout,
		Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
	}

	localStates := peer.NewCRStatesThreadsafe()     // the local state as discovered by this traffic_monitor
	peerStates := peer.NewCRStatesPeersThreadsafe() // each peer's last state is saved in this map

	fetchCount := NewUintThreadsafe() // note this is the number of individual caches fetched from, not the number of times all the caches were polled.
	healthIteration := NewUintThreadsafe()
	errorCount := NewUintThreadsafe()

	toData := todata.NewThreadsafe()

	cacheHealthHandler := cache.NewHandler()
	cacheHealthPoller := poller.NewHTTP(cfg.CacheHealthPollingInterval, true, sharedClient, counters, cacheHealthHandler)
	cacheStatHandler := cache.NewPrecomputeHandler(toData, peerStates) // TODO figure out if this is necessary, with the CacheHealthPoller
	cacheStatPoller := poller.NewHTTP(cfg.CacheStatPollingInterval, false, sharedClient, counters, cacheStatHandler)
	monitorConfigPoller := poller.NewMonitorConfig(cfg.MonitorConfigPollingInterval)
	peerHandler := peer.NewHandler()
	peerPoller := poller.NewHTTP(cfg.PeerPollingInterval, false, sharedClient, counters, peerHandler)

	go monitorConfigPoller.Poll()
	go cacheHealthPoller.Poll()
	go cacheStatPoller.Poll()
	go peerPoller.Poll()

	cachesChanged := make(chan struct{})

	monitorConfig := StartMonitorConfigManager(
		monitorConfigPoller.ConfigChannel,
		localStates,
		cacheStatPoller.ConfigChannel,
		cacheHealthPoller.ConfigChannel,
		peerPoller.ConfigChannel,
		cachesChanged,
		cfg,
		staticAppData,
	)

	combinedStates := StartPeerManager(
		peerHandler.ResultChannel,
		localStates,
		peerStates,
	)

	statHistory, _, lastKbpsStats, dsStats, unpolledCaches := StartStatHistoryManager(
		cacheStatHandler.ResultChannel,
		localStates,
		combinedStates,
		toData,
		cachesChanged,
		errorCount,
		cfg,
		monitorConfig,
	)

	lastHealthDurations, events, localCacheStatus := StartHealthResultManager(
		cacheHealthHandler.ResultChannel,
		toData,
		localStates,
		statHistory,
		monitorConfig,
		peerStates,
		combinedStates,
		fetchCount,
		errorCount,
		cfg,
	)

	StartOpsConfigManager(
		opsConfigFile,
		toSession,
		toData,
		[]chan<- handler.OpsConfig{monitorConfigPoller.OpsConfigChannel},
		[]chan<- towrap.ITrafficOpsSession{monitorConfigPoller.SessionChannel},
		localStates,
		peerStates,
		combinedStates,
		statHistory,
		lastKbpsStats,
		dsStats,
		events,
		staticAppData,
		cacheHealthPoller.Config.Interval,
		lastHealthDurations,
		fetchCount,
		healthIteration,
		errorCount,
		localCacheStatus,
		unpolledCaches,
	)

	healthTickListener(cacheHealthPoller.TickChan, healthIteration)
}
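// NewUintThreadsafe is used above (fetchCount, healthIteration, errorCount)
// but is not defined in this excerpt. The following is a minimal sketch only,
// assuming such a counter needs nothing more than atomic get/set/increment
// semantics; the real type in the repository may differ. Requires the
// "sync/atomic" import.
type UintThreadsafe struct {
	val *uint64
}

// NewUintThreadsafe returns a new threadsafe uint64 counter, initialized to zero.
func NewUintThreadsafe() UintThreadsafe {
	v := uint64(0)
	return UintThreadsafe{val: &v}
}

// Get atomically reads the current value.
func (u UintThreadsafe) Get() uint64 { return atomic.LoadUint64(u.val) }

// Set atomically replaces the current value.
func (u UintThreadsafe) Set(v uint64) { atomic.StoreUint64(u.val, v) }

// Inc atomically increments the value by one.
func (u UintThreadsafe) Inc() { atomic.AddUint64(u.val, 1) }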
//
// Kicks off the pollers and handlers
//
func Start(opsConfigFile string, staticAppData StaticAppData) {
	var toSession *traffic_ops.Session

	fetchSuccessCounter := gmx.NewCounter("fetchSuccess")
	fetchFailCounter := gmx.NewCounter("fetchFail")
	fetchPendingGauge := gmx.NewGauge("fetchPending")

	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	sharedClient := http.Client{
		Timeout:   defaultHttpTimeout,
		Transport: tr,
	}

	cacheHealthConfigChannel := make(chan poller.HttpPollerConfig)
	cacheHealthChannel := make(chan cache.Result)
	cacheHealthTick := make(chan uint64)
	cacheHealthPoller := poller.HttpPoller{
		TickChan:      cacheHealthTick,
		ConfigChannel: cacheHealthConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheHealthPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheHealthChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	cacheStatConfigChannel := make(chan poller.HttpPollerConfig)
	cacheStatChannel := make(chan cache.Result)
	cacheStatPoller := poller.HttpPoller{
		ConfigChannel: cacheStatConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheStatPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheStatChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	sessionChannel := make(chan *traffic_ops.Session)
	monitorConfigChannel := make(chan traffic_ops.TrafficMonitorConfigMap)
	monitorOpsConfigChannel := make(chan handler.OpsConfig)
	monitorConfigPoller := poller.MonitorConfigPoller{
		Interval:         defaultMonitorConfigPollingInterval,
		SessionChannel:   sessionChannel,
		ConfigChannel:    monitorConfigChannel,
		OpsConfigChannel: monitorOpsConfigChannel,
	}

	opsConfigFileChannel := make(chan interface{})
	opsConfigFilePoller := poller.FilePoller{
		File:          opsConfigFile,
		ResultChannel: opsConfigFileChannel,
	}

	opsConfigChannel := make(chan handler.OpsConfig)
	opsConfigFileHandler := handler.OpsConfigFileHandler{
		ResultChannel:    opsConfigFilePoller.ResultChannel,
		OpsConfigChannel: opsConfigChannel,
	}

	peerConfigChannel := make(chan poller.HttpPollerConfig)
	peerChannel := make(chan peer.Result)
	peerPoller := poller.HttpPoller{
		ConfigChannel: peerConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultPeerPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: peer.Handler{ResultChannel: peerChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	go opsConfigFileHandler.Listen()
	go opsConfigFilePoller.Poll()
	go monitorConfigPoller.Poll()
	go cacheHealthPoller.Poll()
	go cacheStatPoller.Poll()
	go peerPoller.Poll()

	dr := make(chan http_server.DataRequest)

	healthHistory := make(map[string][]interface{})
	statHistory := make(map[string][]interface{})

	var opsConfig handler.OpsConfig
	var monitorConfig traffic_ops.TrafficMonitorConfigMap

	localStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)} // the local state as discovered by this traffic_monitor
	peerStates := make(map[string]peer.Crstates)                                                                                    // each peer's last state is saved in this map
	combinedStates := peer.Crstates{Caches: make(map[string]peer.IsAvailable), Deliveryservice: make(map[string]peer.Deliveryservice)} // the result of combining localStates and all the peerStates
	deliveryServiceServers := map[string][]string{}
	serverTypes := map[string]string{}

	// TODO put stat data in a struct, for brevity
	lastHealthEndTimes := map[string]time.Time{}
	lastHealthDurations := map[string]time.Duration{}
	fetchCount := uint64(0) // note this is the number of individual caches fetched from, not the number of times all the caches were polled.
	healthIteration := uint64(0)
	errorCount := uint64(0)
	events := []Event{}
	eventIndex := uint64(0)

	for {
		select {
		case req := <-dr:
			defer close(req.C)
			var body []byte
			var err error

			switch req.T {
			case http_server.TR_CONFIG:
				if toSession != nil && opsConfig.CdnName != "" {
					body, err = toSession.CRConfigRaw(opsConfig.CdnName)
				}
			case http_server.TR_STATE_DERIVED:
				body, err = peer.CrStatesMarshall(combinedStates)
			case http_server.TR_STATE_SELF:
				body, err = peer.CrStatesMarshall(localStates)
			case http_server.CACHE_STATS:
				// TODO: add support for ?hc=N query param, stats=, wildcard, individual caches
				// add pp and date to the json:
				/*
					pp: "0=[my-ats-edge-cache-1], hc=[1]",
					date: "Thu Oct 09 20:28:36 UTC 2014"
				*/
				params := req.Parameters
				hc := 1
				if _, exists := params["hc"]; exists {
					v, err := strconv.Atoi(params["hc"][0])
					if err == nil {
						hc = v
					}
				}
				body, err = cache.StatsMarshall(statHistory, hc)
			case http_server.DS_STATS:
				body = []byte("TODO implement")
			case http_server.EVENT_LOG:
				body, err = json.Marshal(JSONEvents{Events: events})
			case http_server.PEER_STATES:
				body = []byte("TODO implement")
			case http_server.STAT_SUMMARY:
				body = []byte("TODO implement")
			case http_server.STATS:
				body, err = getStats(staticAppData, cacheHealthPoller.Config.Interval, lastHealthDurations, fetchCount, healthIteration, errorCount)
				if err != nil {
					// TODO send error to client
					errorCount++
					log.Printf("ERROR getting stats %v\n", err)
					continue
				}
			case http_server.CONFIG_DOC:
				opsConfigCopy := opsConfig
				// if the password is blank, leave it blank, so callers can see it's missing.
				if opsConfigCopy.Password != "" {
					opsConfigCopy.Password = "******"
				}
				body, err = json.Marshal(opsConfigCopy)
			default:
				body = []byte("TODO error message")
			}
			req.C <- body
		case oc := <-opsConfigFileHandler.OpsConfigChannel:
			var err error
			opsConfig = oc

			listenAddress := ":80" // default
			if opsConfig.HttpListener != "" {
				listenAddress = opsConfig.HttpListener
			}

			err = http_server.Run(dr, listenAddress)
			if err != nil {
				errorCount++
				log.Printf("MonitorConfigPoller: error creating HTTP server: %s\n", err)
				continue
			}

			toSession, err = traffic_ops.Login(opsConfig.Url, opsConfig.Username, opsConfig.Password, opsConfig.Insecure)
			if err != nil {
				errorCount++
				log.Printf("MonitorConfigPoller: error instantiating Session with traffic_ops: %s\n", err)
				continue
			}

			deliveryServiceServers, err = getDeliveryServiceServers(toSession, opsConfig.CdnName)
			if err != nil {
				errorCount++
				log.Printf("Error getting delivery service servers from Traffic Ops: %v\n", err)
				continue
			}

			serverTypes, err = getServerTypes(toSession, opsConfig.CdnName)
			if err != nil {
				errorCount++
				log.Printf("Error getting server types from Traffic Ops: %v\n", err)
				continue
			}

			// This must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for.
			// Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.
			go func() {
				monitorConfigPoller.OpsConfigChannel <- opsConfig // this is needed for cdnName
				monitorConfigPoller.SessionChannel <- toSession
			}()
		case monitorConfig = <-monitorConfigPoller.ConfigChannel:
			healthUrls := map[string]string{}
			statUrls := map[string]string{}
			peerUrls := map[string]string{}
			caches := map[string]string{}

			for _, srv := range monitorConfig.TrafficServer {
				caches[srv.HostName] = srv.Status
				if srv.Status == "ONLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: true}
					continue
				}
				if srv.Status == "OFFLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
					continue
				}
				// seed states with available = false until our polling cycle picks up a result
				if _, exists := localStates.Caches[srv.HostName]; !exists {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
				}

				url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL
				r := strings.NewReplacer(
					"${hostname}", srv.FQDN,
					"${interface_name}", srv.InterfaceName,
					"application=system", "application=plugin.remap",
					"application=", "application=plugin.remap",
				)
				url = r.Replace(url)
				healthUrls[srv.HostName] = url

				r = strings.NewReplacer("application=plugin.remap", "application=")
				url = r.Replace(url)
				statUrls[srv.HostName] = url
			}

			for _, srv := range monitorConfig.TrafficMonitor {
				if srv.Status != "ONLINE" {
					continue
				}
				// TODO: the URL should be config driven. -jse
				url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port)
				peerUrls[srv.HostName] = url
			}

			cacheStatPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: statUrls, Interval: defaultCacheStatPollingInterval}
			cacheHealthPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: healthUrls, Interval: defaultCacheHealthPollingInterval}
			peerPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: peerUrls, Interval: defaultPeerPollingInterval}

			for k := range localStates.Caches {
				_, exists := monitorConfig.TrafficServer[k]
				if !exists {
					fmt.Printf("Warning: removing %s from localStates\n", k)
					delete(localStates.Caches, k)
				}
			}

			addStateDeliveryServices(monitorConfig, localStates.Deliveryservice)
		case i := <-cacheHealthTick:
			healthIteration = i
		case healthResult := <-cacheHealthChannel:
			fetchCount++
			var prevResult cache.Result
			if len(healthHistory[healthResult.Id]) != 0 {
				prevResult = healthHistory[healthResult.Id][len(healthHistory[healthResult.Id])-1].(cache.Result)
			}
			health.GetVitals(&healthResult, &prevResult, &monitorConfig)
			healthHistory[healthResult.Id] = pruneHistory(append(healthHistory[healthResult.Id], healthResult), defaultMaxHistory)
			isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfig)
			if localStates.Caches[healthResult.Id].IsAvailable != isAvailable {
				fmt.Println("Changing state for", healthResult.Id, " was:", prevResult.Available, " is now:", isAvailable, " because:", whyAvailable, " errors:", healthResult.Errors)
				e := Event{
					Index:       eventIndex,
					Time:        time.Now().Unix(),
					Description: whyAvailable,
					Name:        healthResult.Id,
					Hostname:    healthResult.Id,
					Type:        serverTypes[healthResult.Id],
					Available:   isAvailable,
				}
				events = append([]Event{e}, events...)
				if len(events) > maxEvents {
					events = events[:maxEvents-1]
				}
				eventIndex++
			}

			localStates.Caches[healthResult.Id] = peer.IsAvailable{IsAvailable: isAvailable}
			calculateDeliveryServiceState(deliveryServiceServers, localStates.Caches, localStates.Deliveryservice)

			if lastHealthStart, ok := lastHealthEndTimes[healthResult.Id]; ok {
				lastHealthDurations[healthResult.Id] = time.Since(lastHealthStart)
			}
			lastHealthEndTimes[healthResult.Id] = time.Now()

			// if _, ok := queryIntervalStart[pollI]; !ok {
			// 	log.Printf("ERROR poll start index not found")
			// 	continue
			// }
			// lastQueryIntervalTime = time.Since(queryIntervalStart[pollI])
		case stats := <-cacheStatChannel:
			statHistory[stats.Id] = pruneHistory(append(statHistory[stats.Id], stats), defaultMaxHistory)
		case crStatesResult := <-peerChannel:
			peerStates[crStatesResult.Id] = crStatesResult.PeerStats
			combinedStates = combineCrStates(peerStates, localStates)
		}
	}
}
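// pruneHistory is called above when appending health and stat results but is
// not shown in this excerpt. A minimal sketch, under the assumption that it
// simply caps a history slice at the given limit by dropping the oldest
// entries; the real implementation and its exact signature may differ.
func pruneHistory(history []interface{}, limit uint64) []interface{} {
	if uint64(len(history)) > limit {
		// keep only the most recent `limit` results
		history = history[uint64(len(history))-limit:]
	}
	return history
}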