Ejemplo n.º 1
0
// TODO JvD: add deliveryservice stuff
func combineCrStates(peerStates map[string]peer.Crstates, localStates peer.Crstates) peer.Crstates {
	combinedStates := peer.NewCRStates()
	for cacheName, localCacheState := range localStates.Caches { // localStates gets pruned when servers are disabled, it's the source of truth
		downVotes := 0 // TODO JvD: change to use parameter when deciding to be optimistic or pessimistic.
		if localCacheState.IsAvailable {
			// fmt.Println(cacheName, " is available locally - setting to IsAvailable: true")
			combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic
		} else {
			downVotes++ // localStates says it's not happy
			for _, peerCrStates := range peerStates {
				if peerCrStates.Caches[cacheName].IsAvailable {
					// fmt.Println(cacheName, "- locally we think it's down, but", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "trusting the peer.")
					combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: true} // we don't care about the peers, we got a "good one", and we're optimistic
					break                                                                  // one peer that thinks we're good is all we need.
				} else {
					// fmt.Println(cacheName, "- locally we think it's down, and", peerName, "says IsAvailable: ", peerCrStates.Caches[cacheName].IsAvailable, "down voting")
					downVotes++ // peerStates for this peer doesn't like it
				}
			}
		}
		if downVotes > len(peerStates) {
			// fmt.Println(cacheName, "-", downVotes, "down votes, setting to IsAvailable: false")
			combinedStates.Caches[cacheName] = peer.IsAvailable{IsAvailable: false}
		}
	}

	for deliveryServiceName, localDeliveryService := range localStates.Deliveryservice {
		deliveryService := peer.Deliveryservice{}
		if localDeliveryService.IsAvailable {
			deliveryService.IsAvailable = true
		}
		deliveryService.DisabledLocations = localDeliveryService.DisabledLocations

		for peerName, iPeerStates := range peerStates {
			peerDeliveryService, ok := iPeerStates.Deliveryservice[deliveryServiceName]
			if !ok {
				log.Printf("WARN local delivery service %s not found in peer %s\n", deliveryServiceName, peerName)
				continue
			}
			if peerDeliveryService.IsAvailable {
				deliveryService.IsAvailable = true
			}
			deliveryService.DisabledLocations = intersection(deliveryService.DisabledLocations, peerDeliveryService.DisabledLocations)
		}
		combinedStates.Deliveryservice[deliveryServiceName] = deliveryService
	}

	return combinedStates
}
Ejemplo n.º 2
0
//
// Kicks off the pollers and handlers
//
func Start(opsConfigFile string, staticAppData StaticAppData) {
	var toSession *traffic_ops.Session

	fetchSuccessCounter := gmx.NewCounter("fetchSuccess")
	fetchFailCounter := gmx.NewCounter("fetchFail")
	fetchPendingGauge := gmx.NewGauge("fetchPending")

	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}

	sharedClient := http.Client{
		Timeout:   defaultHttpTimeout,
		Transport: tr,
	}

	cacheHealthConfigChannel := make(chan poller.HttpPollerConfig)
	cacheHealthChannel := make(chan cache.Result)
	cacheHealthTick := make(chan uint64)
	cacheHealthPoller := poller.HttpPoller{
		TickChan:      cacheHealthTick,
		ConfigChannel: cacheHealthConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheHealthPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheHealthChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	cacheStatConfigChannel := make(chan poller.HttpPollerConfig)
	cacheStatChannel := make(chan cache.Result)
	cacheStatPoller := poller.HttpPoller{
		ConfigChannel: cacheStatConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultCacheStatPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: cache.Handler{ResultChannel: cacheStatChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	sessionChannel := make(chan *traffic_ops.Session)
	monitorConfigChannel := make(chan traffic_ops.TrafficMonitorConfigMap)
	monitorOpsConfigChannel := make(chan handler.OpsConfig)
	monitorConfigPoller := poller.MonitorConfigPoller{
		Interval:         defaultMonitorConfigPollingInterval,
		SessionChannel:   sessionChannel,
		ConfigChannel:    monitorConfigChannel,
		OpsConfigChannel: monitorOpsConfigChannel,
	}

	opsConfigFileChannel := make(chan interface{})
	opsConfigFilePoller := poller.FilePoller{
		File:          opsConfigFile,
		ResultChannel: opsConfigFileChannel,
	}

	opsConfigChannel := make(chan handler.OpsConfig)
	opsConfigFileHandler := handler.OpsConfigFileHandler{
		ResultChannel:    opsConfigFilePoller.ResultChannel,
		OpsConfigChannel: opsConfigChannel,
	}

	peerConfigChannel := make(chan poller.HttpPollerConfig)
	peerChannel := make(chan peer.Result)
	peerPoller := poller.HttpPoller{
		ConfigChannel: peerConfigChannel,
		Config: poller.HttpPollerConfig{
			Interval: defaultPeerPollingInterval,
		},
		Fetcher: fetcher.HttpFetcher{
			Handler: peer.Handler{ResultChannel: peerChannel},
			Client:  sharedClient,
			Success: fetchSuccessCounter,
			Fail:    fetchFailCounter,
			Pending: fetchPendingGauge,
		},
	}

	go opsConfigFileHandler.Listen()
	go opsConfigFilePoller.Poll()
	go monitorConfigPoller.Poll()
	go cacheHealthPoller.Poll()
	go cacheStatPoller.Poll()
	go peerPoller.Poll()

	dr := make(chan http_server.DataRequest)

	healthHistory := map[string][]interface{}{}
	statHistory := map[string][]interface{}{}

	var opsConfig handler.OpsConfig
	var monitorConfig traffic_ops.TrafficMonitorConfigMap
	localStates := peer.NewCRStates()        // this is the local state as discoverer by this traffic_monitor
	peerStates := map[string]peer.Crstates{} // each peer's last state is saved in this map
	combinedStates := peer.NewCRStates()     // this is the result of combining the localStates and all the peerStates using the var ??

	deliveryServiceServers := map[string][]string{}
	serverDeliveryServices := map[string]string{}
	serverTypes := map[string]ds.StatCacheType{}
	deliveryServiceTypes := map[string]ds.StatType{}
	deliveryServiceRegexes := map[string][]string{}
	serverCachegroups := map[string]string{}

	// TODO put stat data in a struct, for brevity
	lastHealthEndTimes := map[string]time.Time{}
	lastHealthDurations := map[string]time.Duration{}
	fetchCount := uint64(0) // note this is the number of individual caches fetched from, not the number of times all the caches were polled.
	healthIteration := uint64(0)
	errorCount := uint64(0)
	events := []Event{}
	eventIndex := uint64(0)
	dsStats := ds.NewStats()
	lastKbpsStats := ds.NewStatsLastKbps()
	localCacheStatus := map[ds.CacheName]CacheAvailableStatus{}

	for {
		select {
		case req := <-dr:
			defer close(req.Response)

			var body []byte
			var err error

			switch req.Type {
			case http_server.TRConfig:
				if toSession == nil {
					err = fmt.Errorf("Unable to connect to Traffic Ops")
				} else if opsConfig.CdnName == "" {
					err = fmt.Errorf("No CDN Configured")
				} else {
					body, err = toSession.CRConfigRaw(opsConfig.CdnName)
				}
				if err != nil {
					err = fmt.Errorf("TR Config: %v", err)
				}
			case http_server.TRStateDerived:
				body, err = peer.CrStatesMarshall(combinedStates)
				if err != nil {
					err = fmt.Errorf("TR State (derived): %v", err)
				}
			case http_server.TRStateSelf:
				body, err = peer.CrStatesMarshall(localStates)
				if err != nil {
					err = fmt.Errorf("TR State (self): %v", err)
				}
			case http_server.CacheStats:
				// TODO: add support for ?hc=N query param, stats=, wildcard, individual caches
				// add pp and date to the json:
				/*
					pp: "0=[my-ats-edge-cache-1], hc=[1]",
					date: "Thu Oct 09 20:28:36 UTC 2014"
				*/
				params := req.Parameters
				hc := 1
				if _, exists := params["hc"]; exists {
					v, err := strconv.Atoi(params["hc"][0])
					if err == nil {
						hc = v
					}
				}
				body, err = cache.StatsMarshall(statHistory, hc)
				if err != nil {
					err = fmt.Errorf("CacheStats: %v", err)
				}
			case http_server.DSStats:
				body, err = json.Marshal(ds.StatsJSON(dsStats)) // TODO marshall beforehand, for performance? (test to see how often requests are made)
				if err != nil {
					err = fmt.Errorf("DsStats: %v", err)
				}
			case http_server.EventLog:
				body, err = json.Marshal(JSONEvents{Events: events})
				if err != nil {
					err = fmt.Errorf("EventLog: %v", err)
				}
			case http_server.PeerStates:
				body, err = json.Marshal(createApiPeerStates(peerStates))
			case http_server.StatSummary:
				body = []byte("TODO implement")
			case http_server.Stats:
				body, err = getStats(staticAppData, cacheHealthPoller.Config.Interval, lastHealthDurations, fetchCount, healthIteration, errorCount)
				if err != nil {
					err = fmt.Errorf("Stats: %v", err)
				}
			case http_server.ConfigDoc:
				opsConfigCopy := opsConfig
				// if the password is blank, leave it blank, so callers can see it's missing.
				if opsConfigCopy.Password != "" {
					opsConfigCopy.Password = "******"
				}
				body, err = json.Marshal(opsConfigCopy)
				if err != nil {
					err = fmt.Errorf("Config Doc: %v", err)
				}
			case http_server.APICacheCount: // TODO determine if this should use peerStates
				body = []byte(strconv.Itoa(len(localStates.Caches)))
			case http_server.APICacheAvailableCount:
				body = []byte(strconv.Itoa(cacheAvailableCount(localStates.Caches)))
			case http_server.APICacheDownCount:
				body = []byte(strconv.Itoa(cacheDownCount(localStates.Caches)))
			case http_server.APIVersion:
				s := "traffic_monitor-" + staticAppData.Version + "."
				if len(staticAppData.GitRevision) > 6 {
					s += staticAppData.GitRevision[:6]
				} else {
					s += staticAppData.GitRevision
				}
				body = []byte(s)
			case http_server.APITrafficOpsURI:
				body = []byte(opsConfig.Url)
			case http_server.APICacheStates:
				body, err = json.Marshal(createCacheStatuses(serverTypes, statHistory, lastHealthDurations, localStates.Caches, lastKbpsStats, localCacheStatus))
			default:
				err = fmt.Errorf("Unknown Request Type: %v", req.Type)
			}

			if err != nil {
				errorCount++
				log.Printf("ERROR Request Error: %v\n", err)
			} else {
				req.Response <- body
			}
		case oc := <-opsConfigFileHandler.OpsConfigChannel:
			var err error
			opsConfig = oc

			listenAddress := ":80" // default

			if opsConfig.HttpListener != "" {
				listenAddress = opsConfig.HttpListener
			}

			handleErr := func(err error) {
				errorCount++
				log.Printf("%v\n", err)
			}

			err = http_server.Run(dr, listenAddress)
			if err != nil {
				handleErr(fmt.Errorf("MonitorConfigPoller: error creating HTTP server: %s\n", err))
				continue
			}

			toSession, err = traffic_ops.Login(opsConfig.Url, opsConfig.Username, opsConfig.Password, opsConfig.Insecure)
			if err != nil {
				handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops: %s\n", err))
				continue
			}

			deliveryServiceServers, serverDeliveryServices, err = getDeliveryServiceServers(toSession, opsConfig.CdnName)
			if err != nil {
				handleErr(fmt.Errorf("Error getting delivery service servers from Traffic Ops: %v\n", err))
				continue
			}

			deliveryServiceTypes, err = getDeliveryServiceTypes(toSession, opsConfig.CdnName)
			if err != nil {
				handleErr(fmt.Errorf("Error getting delivery service types from Traffic Ops: %v\n", err))
				continue
			}

			deliveryServiceRegexes, err = getDeliveryServiceRegexes(toSession, opsConfig.CdnName)
			if err != nil {
				handleErr(fmt.Errorf("Error getting delivery service regexes from Traffic Ops: %v\n", err))
				continue
			}

			serverCachegroups, err = getServerCachegroups(toSession, opsConfig.CdnName)
			if err != nil {
				handleErr(fmt.Errorf("Error getting server cachegroups from Traffic Ops: %v\n", err))
				continue
			}

			serverTypes, err = getServerTypes(toSession, opsConfig.CdnName)
			if err != nil {
				handleErr(fmt.Errorf("Error getting server types from Traffic Ops: %v\n", err))
				continue
			}

			// This must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.
			go func() {
				monitorConfigPoller.OpsConfigChannel <- opsConfig // this is needed for cdnName
				monitorConfigPoller.SessionChannel <- toSession
			}()
		case monitorConfig = <-monitorConfigPoller.ConfigChannel:
			healthUrls := map[string]string{}
			statUrls := map[string]string{}
			peerUrls := map[string]string{}
			caches := map[string]string{}

			for _, srv := range monitorConfig.TrafficServer {
				caches[srv.HostName] = srv.Status

				if srv.Status == "ONLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: true}
					continue
				}
				if srv.Status == "OFFLINE" {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
					continue
				}
				// seed states with available = false until our polling cycle picks up a result
				if _, exists := localStates.Caches[srv.HostName]; !exists {
					localStates.Caches[srv.HostName] = peer.IsAvailable{IsAvailable: false}
				}

				url := monitorConfig.Profile[srv.Profile].Parameters.HealthPollingURL
				r := strings.NewReplacer(
					"${hostname}", srv.FQDN,
					"${interface_name}", srv.InterfaceName,
					"application=system", "application=plugin.remap",
					"application=", "application=plugin.remap",
				)
				url = r.Replace(url)
				healthUrls[srv.HostName] = url
				r = strings.NewReplacer("application=plugin.remap", "application=")
				url = r.Replace(url)
				statUrls[srv.HostName] = url
			}

			for _, srv := range monitorConfig.TrafficMonitor {
				if srv.Status != "ONLINE" {
					continue
				}
				// TODO: the URL should be config driven. -jse
				url := fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port)
				peerUrls[srv.HostName] = url
			}

			cacheStatPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: statUrls, Interval: defaultCacheStatPollingInterval}
			cacheHealthPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: healthUrls, Interval: defaultCacheHealthPollingInterval}
			peerPoller.ConfigChannel <- poller.HttpPollerConfig{Urls: peerUrls, Interval: defaultPeerPollingInterval}

			for k := range localStates.Caches {
				_, exists := monitorConfig.TrafficServer[k]

				if !exists {
					fmt.Printf("Warning: removing %s from localStates", k)
					delete(localStates.Caches, k)
				}
			}

			addStateDeliveryServices(monitorConfig, localStates.Deliveryservice)
		case i := <-cacheHealthTick:
			healthIteration = i
		case healthResult := <-cacheHealthChannel:
			fetchCount++
			var prevResult cache.Result
			if len(healthHistory[healthResult.Id]) != 0 {
				prevResult = healthHistory[healthResult.Id][len(healthHistory[healthResult.Id])-1].(cache.Result)
			}
			health.GetVitals(&healthResult, &prevResult, &monitorConfig)
			healthHistory[healthResult.Id] = pruneHistory(append(healthHistory[healthResult.Id], healthResult), defaultMaxHistory)
			isAvailable, whyAvailable := health.EvalCache(healthResult, &monitorConfig)
			if localStates.Caches[healthResult.Id].IsAvailable != isAvailable {
				fmt.Println("Changing state for", healthResult.Id, " was:", prevResult.Available, " is now:", isAvailable, " because:", whyAvailable, " errors:", healthResult.Errors)
				e := Event{Index: eventIndex, Time: time.Now().Unix(), Description: whyAvailable, Name: healthResult.Id, Hostname: healthResult.Id, Type: serverTypes[healthResult.Id].String(), Available: isAvailable}
				events = append([]Event{e}, events...)
				if len(events) > maxEvents {
					events = events[:maxEvents-1]
				}
				eventIndex++
			}

			localCacheStatus[ds.CacheName(healthResult.Id)] = CacheAvailableStatus{Available: isAvailable, Status: monitorConfig.TrafficServer[healthResult.Id].Status} // TODO move within localStates
			localStates.Caches[healthResult.Id] = peer.IsAvailable{IsAvailable: isAvailable}
			calculateDeliveryServiceState(deliveryServiceServers, localStates.Caches, localStates.Deliveryservice)

			// TODO determine if we should combineCrStates() here

			now := time.Now()

			var err error
			dsStats, lastKbpsStats, err = ds.CreateStats(statHistory, deliveryServiceServers, serverDeliveryServices, deliveryServiceTypes, deliveryServiceRegexes, serverCachegroups, serverTypes, combinedStates, lastKbpsStats, now)
			if err != nil {
				errorCount++
				log.Printf("ERROR getting deliveryservice: %v\n", err)
			}

			if lastHealthStart, ok := lastHealthEndTimes[healthResult.Id]; ok {
				lastHealthDurations[healthResult.Id] = time.Since(lastHealthStart)
			}
			lastHealthEndTimes[healthResult.Id] = now
			fmt.Printf("DEBUG health duration for %s: %v\n", healthResult.Id, lastHealthDurations[healthResult.Id])

			// if _, ok := queryIntervalStart[pollI]; !ok {
			// 	log.Printf("ERROR poll start index not found")
			// 	continue
			// }
			// lastQueryIntervalTime = time.Since(queryIntervalStart[pollI])
		case stats := <-cacheStatChannel:
			statHistory[stats.Id] = pruneHistory(append(statHistory[stats.Id], stats), defaultMaxHistory)
		case crStatesResult := <-peerChannel:
			peerStates[crStatesResult.Id] = crStatesResult.PeerStats
			combinedStates = combineCrStates(peerStates, localStates)
		}
	}
}