Example #1
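jobList collects the jobs from every host in the cluster and returns them sorted in reverse order; unless all is true, only jobs that are starting or running are kept.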
func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.Hosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return nil, errors.New("no hosts found")
	}

	var jobs []host.ActiveJob
	for _, h := range hosts {
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", h.ID(), err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
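jobList assumes a sortJobs type implementing sort.Interface. A minimal sketch, assuming host.ActiveJob carries a CreatedAt timestamp to order by:

// sortJobs is the sort.Interface implementation jobList assumes; this
// sketch orders jobs by creation time. CreatedAt is an assumption
// about host.ActiveJob's fields.
type sortJobs []host.ActiveJob

func (s sortJobs) Len() int      { return len(s) }
func (s sortJobs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s sortJobs) Less(i, j int) bool {
	return s[i].CreatedAt.Before(s[j].CreatedAt)
}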
Example #2
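runVolumeList prints a minimal table of cluster volumes: one row per volume, with its ID and the host it lives on.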
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"HOST",
	)

	for _, volume := range volumes {
		listRec(w,
			volume.Volume.ID,
			volume.Host.ID(),
		)
	}
	return nil
}
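The volume commands all call a clusterVolumes helper that is not shown. A plausible sketch, matching the v.Host / v.Volume accesses in these examples; the ListVolumes method and the volume.Info type are assumptions:

// hostVolume pairs a volume with the host it lives on.
type hostVolume struct {
	Host   *cluster.Host
	Volume *volume.Info
}

// clusterVolumes gathers the volumes from every host; ListVolumes is
// an assumed method on the host client.
func clusterVolumes(hosts []*cluster.Host) ([]hostVolume, error) {
	var volumes []hostVolume
	for _, h := range hosts {
		hostVolumes, err := h.ListVolumes()
		if err != nil {
			return nil, fmt.Errorf("could not get volumes for host %s: %s", h.ID(), err)
		}
		for _, v := range hostVolumes {
			volumes = append(volumes, hostVolume{Host: h, Volume: v})
		}
	}
	return volumes, nil
}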
Example #3
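runVolumeGarbageCollection builds the set of volume IDs referenced by jobs, then destroys every volume outside that set, collecting failures rather than aborting on the first one.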
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	attached := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			for _, vb := range j.Job.Config.Volumes {
				attached[vb.VolumeID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the attached list
	success := true
	for _, v := range volumes {
		if _, ok := attached[v.Volume.ID]; ok {
			// volume is attached, continue to next volume
			continue
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete volume %s: %s\n", v.Volume.ID, err)
			continue
		}
		fmt.Println(v.Volume.ID, "deleted")
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}

	return nil
}
Example #4
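A fuller variant of runVolumeList that also reports each volume's type, age, and metadata.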
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"TYPE",
		"HOST",
		"CREATED",
		"META",
	)

	for _, volume := range volumes {
		meta := make([]string, 0, len(volume.Volume.Meta))
		for k, v := range volume.Volume.Meta {
			meta = append(meta, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w,
			volume.Volume.ID,
			volume.Volume.Type,
			volume.Host.ID(),
			units.HumanDuration(time.Now().UTC().Sub(volume.Volume.CreatedAt))+" ago",
			strings.Join(meta, " "),
		)
	}
	return nil
}
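The tables in these examples are emitted through a listRec helper. A minimal sketch consistent with its use here: print each argument separated by tabs and end the record with a newline, so the tabwriter can align the columns.

// listRec writes one tab-separated record to w.
func listRec(w io.Writer, fields ...interface{}) {
	for i, f := range fields {
		fmt.Fprint(w, f)
		if i == len(fields)-1 {
			fmt.Fprint(w, "\n")
		} else {
			fmt.Fprint(w, "\t")
		}
	}
}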
Example #5
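runTags dispatches to the set and del subcommands when requested; otherwise it prints each host's tags as key=value pairs.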
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	hosts, err := client.Hosts()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, h := range hosts {
		tags := make([]string, 0, len(h.Tags()))
		for k, v := range h.Tags() {
			tags = append(tags, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w, h.ID(), strings.Join(tags, " "))
	}
	return nil
}
Example #6
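runVolumeDelete destroys the volumes named on the command line, looking each ID up in the cluster-wide volume list and reporting any volume it cannot find or delete.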
func runVolumeDelete(args *docopt.Args, client *cluster.Client) error {
	success := true
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

outer:
	for _, id := range args.All["ID"].([]string) {
		// find this volume in the list
		for _, v := range volumes {
			if v.Volume.ID == id {
				if err := v.Host.DestroyVolume(id); err != nil {
					success = false
					fmt.Printf("could not delete volume %s: %s\n", id, err)
					continue outer
				}
				// delete the volume
				fmt.Println(id, "deleted")
				continue outer
			}
		}
		success = false
		fmt.Printf("could not delete volume %s: volume not found\n", id)
	}
	if !success {
		return errors.New("could not delete all volumes")
	}
	return nil
}
Example #7
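ClusterFixer.Run drives a full cluster repair: it locates hosts (falling back to a --peer-ips list when discoverd is unreachable), fixes discoverd and flannel, waits for discoverd to respond, then repairs the controller and any sirenia databases (postgres, mariadb, mongodb).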
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error {
	f.c = c
	f.l = log15.New()
	var err error

	minHosts, err := strconv.Atoi(args.String["--min-hosts"])
	if err != nil || minHosts < 1 {
		return fmt.Errorf("invalid or missing --min-hosts value")
	}

	f.hosts, err = c.Hosts()
	if err != nil {
		f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err)
		var ips []string
		if ipList := args.String["--peer-ips"]; ipList != "" {
			ips = strings.Split(ipList, ",")
		}
		if len(ips) == 0 {
			return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err)
		}
		if len(ips) < minHosts {
			return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts)
		}

		f.hosts = make([]*cluster.Host, len(ips))
		for i, ip := range ips {
			url := fmt.Sprintf("http://%s:1113", ip)
			status, err := cluster.NewHost("", url, nil, nil).GetStatus()
			if err != nil {
				return fmt.Errorf("error connecting to %s: %s", ip, err)
			}
			f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil)
		}
	}
	// check expected number of hosts
	if len(f.hosts) < minHosts {
		// TODO(titanous): be smarter about this
		return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts))
	}
	f.l.Info("found expected hosts", "n", len(f.hosts))

	if err := f.FixDiscoverd(); err != nil {
		return err
	}
	if err := f.FixFlannel(); err != nil {
		return err
	}

	f.l.Info("waiting for discoverd to be available")
	timeout := time.After(time.Minute)
	for {
		var err error
		if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil {
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for discoverd, last error: %s", err)
		default:
		}
	}

	f.l.Info("checking for running controller API")
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) > 0 {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
		if err := f.FixController(controllerInstances, false); err != nil {
			f.l.Error("error fixing controller", "err", err)
			// if unable to write correct formations, we need to kill the scheduler so that the rest of this works
			if err := f.KillSchedulers(); err != nil {
				return err
			}
		}
	}

	f.l.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb", "mongodb"} {
		f.l.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				f.l.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			f.l.Error("error checking database state", "db", db)
			return err
		}
		if err := f.FixSirenia(db); err != nil {
			return err
		}
	}

	f.l.Info("checking for running controller API")
	controllerInstances, _ = controllerService.Instances()
	if len(controllerInstances) == 0 {
		// kill schedulers to prevent interference
		if err := f.KillSchedulers(); err != nil {
			return err
		}
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	} else {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
	}

	if err := f.FixController(controllerInstances, true); err != nil {
		f.l.Error("error fixing controller", "err", err)
		return err
	}

	f.l.Info("cluster fix complete")

	return nil
}
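Run reads --min-hosts and --peer-ips from the parsed docopt arguments. A hypothetical usage string consistent with those reads (the real command definition is not shown in these examples):

// Hypothetical usage string; only the flag names are taken from Run.
const fixUsage = `
usage: flynn-host fix [--min-hosts=MIN] [--peer-ips=IPS]

options:
  --min-hosts=MIN  minimum number of hosts expected in the cluster
  --peer-ips=IPS   comma-separated host IPs to use if discoverd is unreachable
`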
Example #8
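A more careful garbage collector: for starting and running jobs it keeps the tmpfs (which shares the job's ID), the data volumes, and the mounted layers, and it never deletes system images.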
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	keep := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning && j.Status != host.StatusStarting {
				continue
			}

			// keep the tmpfs (it has the same ID as the job)
			keep[j.Job.ID] = struct{}{}

			// keep the data volumes
			for _, vb := range j.Job.Config.Volumes {
				keep[vb.VolumeID] = struct{}{}
			}

			// keep the mounted layers
			for _, m := range j.Job.Mountspecs {
				keep[m.ID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the keep list
	success := true
	for _, v := range volumes {
		if _, ok := keep[v.Volume.ID]; ok {
			continue
		}
		// don't delete system images
		if v.Volume.Meta["flynn.system-image"] == "true" {
			continue
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete %s volume %s: %s\n", v.Volume.Type, v.Volume.ID, err)
			continue
		}
		fmt.Println("Deleted", v.Volume.Type, "volume", v.Volume.ID)
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}

	return nil
}