func runCheckSlave(ctx *cli.Context) int { fmt.Println("Discoving running applications and associated tasks...") marathon, err := loadMarathon(ctx.GlobalString("marathon-host")) if err != nil { fmt.Println(err) return 1 } slave, err := loadMesos(ctx.GlobalString("marathon-host")) if err != nil { fmt.Println(err) return 1 } slaveFrameworks := slave.Framework("marathon") dockerClient := lib.NewDockerClient() marathonApps, err := buildMesosMarathonMatrix(slave.Slave.Id, slave.Slave.HostName, slaveFrameworks, marathon, dockerClient, ctx.GlobalBool("ignore-deploys")) if err != nil { fmt.Println(err) return 1 } discrepancy, containerAccount, output, err := verifyApplications(marathonApps) if err != nil { fmt.Println(lib.PrintRed("An error occoured while verifying applications!")) fmt.Println(err) return 1 } orphanedContainers := make(boolmap) ignoredImages := make([]string, 0) chronosHost := ctx.GlobalString("chronos-host") if chronosHost != "" { config := chronos.Config{ URL: chronosHost, } client, err := chronos.NewClient(config) jobs, err := client.Jobs() if err == nil { for _, job := range *jobs { if job.Container != nil { ignoredImages = append(ignoredImages, job.Container.Image) } } } } containers, err := lib.ListRunningContainers(dockerClient, ignoredImages) if err != nil { fmt.Println(lib.PrintRed("An error occoured while determining if docker container is running!")) fmt.Println(err) return 1 } for _, container := range containers { if !containerAccount[container] { if ctx.Bool("kill-stragglers") { if err := lib.StopContainer(container, 300, dockerClient); err != nil { fmt.Printf("An error occoured while trying to stop container (%s): %s\n", container, err) orphanedContainers[container] = true } } else { orphanedContainers[container] = true } } } if discrepancy || len(orphanedContainers) > 0 { if discrepancy { fmt.Println(lib.PrintYellow("Discrepency in task state found!")) result := columnize.SimpleFormat(output) fmt.Println(result) } if len(orphanedContainers) > 0 { fmt.Println(lib.PrintYellow("Orphaned docker containers found!")) tmp_output := []string{ "Orphaned Docker Containers | ", } for c := range orphanedContainers { tmp_output = append(tmp_output, fmt.Sprintf(" | %s", lib.PrintRed(c))) } result := columnize.SimpleFormat(tmp_output) fmt.Println(result) } return 2 } fmt.Println(lib.PrintGreen("Mesos and Marathon agree about running tasks!")) return 0 }
func runFailedTasks(ctx *cli.Context) int { fmt.Println("Discovering completed mesos tasks") mesos := &lib.Mesos{ Host: ctx.GlobalString("mesos-host"), } mesosClient := mesos.Client() if err := mesos.LoadClusterInfo(mesosClient); err != nil { fmt.Println(err) return 1 } if ctx.GlobalBool("only-leader") { if err := mesos.ErrIfNotLeader(); err != nil { fmt.Println(err) return 0 } } frameworks := mesos.Framework("marathon") timeWindow := ctx.Int("time-window") failureLimit := ctx.Int("failure-limit") // map of all containers to number of failures in the last 30 min failingMap := make(map[string]int) now := float64(time.Now().Unix()) if len(frameworks) > 0 { for _, f := range frameworks { for _, t := range f.CompletedTasks { failure := false for _, s := range t.Statuses { if s.State == "TASK_FAILED" { age := now - s.Timestamp failure = age < float64(60*timeWindow) // minutes } } if failure { failingMap[t.Name]++ } } } } if len(failingMap) == 0 { fmt.Println(lib.PrintGreen(fmt.Sprintf("No failures found in the last %d minutes", timeWindow))) return 0 } returnError := false output := make([]string, 1) output[0] = "Application | Num Failures" for app, numFailures := range failingMap { line := fmt.Sprintf("%s | %d", app, numFailures) output = append(output, line) if numFailures > failureLimit { returnError = true } } result := columnize.SimpleFormat(output) fmt.Println(result) if returnError { fmt.Println(lib.PrintRed(fmt.Sprintf("Found errors above the failure limit of %d", failureLimit))) return 2 } fmt.Println(lib.PrintYellow(fmt.Sprintf("Errors found, but not above the failure limit of %d", failureLimit))) return 0 }