Beispiel #1
0
// runCheckSlave cross-checks the tasks Marathon reports for this Mesos slave
// against the Docker containers running on it and prints the result.
//
// Containers returned by lib.ListRunningContainers that are not accounted for
// by verifyApplications are treated as orphaned. When the "kill-stragglers"
// flag is set they are stopped instead; a container is then only reported as
// orphaned if stopping it fails. When "chronos-host" is set, the images of
// Chronos jobs are passed to lib.ListRunningContainers as images to ignore.
//
// Return value (used as the process exit status by the caller, presumably):
//   - 0: no task discrepancy and no orphaned containers
//   - 1: Marathon, Mesos or Docker state could not be loaded or verified
//   - 2: a task discrepancy and/or orphaned containers were found
func runCheckSlave(ctx *cli.Context) int {
	fmt.Println("Discoving running applications and associated tasks...")

	marathon, err := loadMarathon(ctx.GlobalString("marathon-host"))
	if err != nil {
		fmt.Println(err)
		return 1
	}

	// NOTE(review): loadMesos is given the "marathon-host" flag, not a mesos
	// one — confirm this is intentional and not a copy/paste slip.
	slave, err := loadMesos(ctx.GlobalString("marathon-host"))
	if err != nil {
		fmt.Println(err)
		return 1
	}

	slaveFrameworks := slave.Framework("marathon")

	dockerClient := lib.NewDockerClient()
	marathonApps, err := buildMesosMarathonMatrix(slave.Slave.Id, slave.Slave.HostName, slaveFrameworks, marathon, dockerClient, ctx.GlobalBool("ignore-deploys"))
	if err != nil {
		fmt.Println(err)
		return 1
	}

	discrepancy, containerAccount, output, err := verifyApplications(marathonApps)
	if err != nil {
		fmt.Println(lib.PrintRed("An error occoured while verifying applications!"))
		fmt.Println(err)
		return 1
	}

	// Collect the images of Chronos jobs so their containers are ignored when
	// listing running containers. This lookup is best effort: a Chronos
	// failure is reported but does not abort the check.
	// NOTE(review): when Chronos is unreachable its containers are not
	// ignored, so with "kill-stragglers" they may be stopped — confirm that
	// this is acceptable.
	ignoredImages := make([]string, 0)
	if chronosHost := ctx.GlobalString("chronos-host"); chronosHost != "" {
		client, err := chronos.NewClient(chronos.Config{URL: chronosHost})
		if err != nil {
			// This error used to be overwritten by the Jobs() call below,
			// which would then run against an unusable client.
			fmt.Printf("Unable to create chronos client for %s: %s\n", chronosHost, err)
		} else if jobs, err := client.Jobs(); err != nil {
			fmt.Printf("Unable to list chronos jobs from %s: %s\n", chronosHost, err)
		} else if jobs != nil {
			for _, job := range *jobs {
				if job.Container != nil {
					ignoredImages = append(ignoredImages, job.Container.Image)
				}
			}
		}
	}

	containers, err := lib.ListRunningContainers(dockerClient, ignoredImages)
	if err != nil {
		fmt.Println(lib.PrintRed("An error occoured while determining if docker container is running!"))
		fmt.Println(err)
		return 1
	}

	orphanedContainers := make(boolmap)
	killStragglers := ctx.Bool("kill-stragglers")
	for _, container := range containers {
		if containerAccount[container] {
			continue
		}
		if !killStragglers {
			orphanedContainers[container] = true
			continue
		}
		// 300 is presumably the stop timeout in seconds — TODO confirm
		// against lib.StopContainer.
		if err := lib.StopContainer(container, 300, dockerClient); err != nil {
			fmt.Printf("An error occoured while trying to stop container (%s): %s\n", container, err)
			orphanedContainers[container] = true
		}
	}

	if !discrepancy && len(orphanedContainers) == 0 {
		fmt.Println(lib.PrintGreen("Mesos and Marathon agree about running tasks!"))
		return 0
	}

	if discrepancy {
		fmt.Println(lib.PrintYellow("Discrepency in task state found!"))
		fmt.Println(columnize.SimpleFormat(output))
	}
	if len(orphanedContainers) > 0 {
		fmt.Println(lib.PrintYellow("Orphaned docker containers found!"))
		orphanLines := []string{
			"Orphaned Docker Containers | ",
		}
		// Map iteration order is unspecified, so the listing order may vary
		// between runs.
		for c := range orphanedContainers {
			orphanLines = append(orphanLines, fmt.Sprintf(" | %s", lib.PrintRed(c)))
		}
		fmt.Println(columnize.SimpleFormat(orphanLines))
	}
	return 2
}
Beispiel #2
0
// runFailedTasks counts recently failed completed tasks per task name across
// the "marathon" frameworks known to the Mesos host given by "mesos-host",
// and prints a table of the counts.
//
// A completed task counts as a failure when its last TASK_FAILED status is
// younger than "time-window" minutes. The table lists every task name with at
// least one such failure.
//
// Return value:
//   - 0: no failures in the window, or none above "failure-limit"; also
//     returned when "only-leader" is set and ErrIfNotLeader reports an error
//   - 1: cluster information could not be loaded
//   - 2: at least one task name has more failures than "failure-limit"
func runFailedTasks(ctx *cli.Context) int {
	fmt.Println("Discovering completed mesos tasks")

	cluster := &lib.Mesos{Host: ctx.GlobalString("mesos-host")}
	if err := cluster.LoadClusterInfo(cluster.Client()); err != nil {
		fmt.Println(err)
		return 1
	}

	if ctx.GlobalBool("only-leader") {
		if err := cluster.ErrIfNotLeader(); err != nil {
			// The error is printed, but the check still returns 0.
			fmt.Println(err)
			return 0
		}
	}

	marathonFrameworks := cluster.Framework("marathon")

	timeWindow := ctx.Int("time-window")
	failureLimit := ctx.Int("failure-limit")

	// Task name -> number of completed tasks that failed inside the window.
	failures := make(map[string]int)
	now := float64(time.Now().Unix())
	windowSecs := float64(60 * timeWindow) // time-window is given in minutes
	for _, fw := range marathonFrameworks {
		for _, task := range fw.CompletedTasks {
			recentFailure := false
			for _, status := range task.Statuses {
				if status.State != "TASK_FAILED" {
					continue
				}
				// When a task has several TASK_FAILED statuses, the last
				// one in the list decides.
				recentFailure = now-status.Timestamp < windowSecs
			}
			if recentFailure {
				failures[task.Name]++
			}
		}
	}

	if len(failures) == 0 {
		fmt.Println(lib.PrintGreen(fmt.Sprintf("No failures found in the last %d minutes", timeWindow)))
		return 0
	}

	overLimit := false
	rows := []string{"Application | Num Failures"}
	for name, count := range failures {
		rows = append(rows, fmt.Sprintf("%s | %d", name, count))
		if count > failureLimit {
			overLimit = true
		}
	}

	fmt.Println(columnize.SimpleFormat(rows))

	if !overLimit {
		fmt.Println(lib.PrintYellow(fmt.Sprintf("Errors found, but not above the failure limit of %d", failureLimit)))
		return 0
	}

	fmt.Println(lib.PrintRed(fmt.Sprintf("Found errors above the failure limit of %d", failureLimit)))
	return 2
}