func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.Hosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return nil, errors.New("no hosts found")
	}
	var jobs []host.ActiveJob
	for _, h := range hosts {
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", h.ID(), err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
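// NOTE: sortJobs is referenced above but not defined in this section. The
// following is a minimal sketch, assuming a slice of host.ActiveJob ordered by
// a StartedAt timestamp so that sort.Reverse lists the newest jobs first; the
// StartedAt field name is an assumption, not confirmed by this section.
type sortJobs []host.ActiveJob

func (s sortJobs) Len() int           { return len(s) }
func (s sortJobs) Less(i, j int) bool { return s[i].StartedAt.Before(s[j].StartedAt) }
func (s sortJobs) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }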
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"HOST",
	)
	for _, volume := range volumes {
		listRec(w,
			volume.Volume.ID,
			volume.Host.ID(),
		)
	}
	return nil
}
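// NOTE: listRec is used by the listing commands but not defined in this
// section. A minimal sketch, assuming it writes one tab-separated record per
// call so the deferred tabwriter Flush aligns the columns.
func listRec(w io.Writer, fields ...interface{}) {
	for i, f := range fields {
		fmt.Fprint(w, f)
		if i == len(fields)-1 {
			fmt.Fprint(w, "\n")
		} else {
			fmt.Fprint(w, "\t")
		}
	}
}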
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	attached := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			for _, vb := range j.Job.Config.Volumes {
				attached[vb.VolumeID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the attached list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := attached[v.Volume.ID]; ok {
			// volume is attached, continue to next volume
			continue outer
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete volume %s: %s\n", v.Volume.ID, err)
			continue outer
		}
		fmt.Println(v.Volume.ID, "deleted")
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}
	return nil
}
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"TYPE",
		"HOST",
		"CREATED",
		"META",
	)
	for _, volume := range volumes {
		meta := make([]string, 0, len(volume.Volume.Meta))
		for k, v := range volume.Volume.Meta {
			meta = append(meta, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w,
			volume.Volume.ID,
			volume.Volume.Type,
			volume.Host.ID(),
			units.HumanDuration(time.Now().UTC().Sub(volume.Volume.CreatedAt))+" ago",
			strings.Join(meta, " "),
		)
	}
	return nil
}
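// NOTE: clusterVolumes and the Host/Volume pair it returns are not defined in
// this section. The sketch below is an assumption: it presumes *cluster.Host
// exposes ListVolumes() returning []*volume.Info, and simply pairs every
// volume with the host that owns it so callers can both list and destroy
// volumes.
type hostVolume struct {
	Host   *cluster.Host
	Volume *volume.Info
}

func clusterVolumes(hosts []*cluster.Host) ([]hostVolume, error) {
	var volumes []hostVolume
	for _, h := range hosts {
		hostVolumes, err := h.ListVolumes()
		if err != nil {
			return nil, fmt.Errorf("could not get volumes for host %s: %s", h.ID(), err)
		}
		for _, v := range hostVolumes {
			volumes = append(volumes, hostVolume{Host: h, Volume: v})
		}
	}
	return volumes, nil
}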
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	hosts, err := client.Hosts()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, host := range hosts {
		tags := make([]string, 0, len(host.Tags()))
		for k, v := range host.Tags() {
			tags = append(tags, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w, host.ID(), strings.Join(tags, " "))
	}
	return nil
}
func runVolumeDelete(args *docopt.Args, client *cluster.Client) error {
	success := true
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

outer:
	for _, id := range args.All["ID"].([]string) {
		// find this volume in the list
		for _, v := range volumes {
			if v.Volume.ID == id {
				// delete the volume
				if err := v.Host.DestroyVolume(id); err != nil {
					success = false
					fmt.Printf("could not delete volume %s: %s\n", id, err)
					continue outer
				}
				fmt.Println(id, "deleted")
				continue outer
			}
		}
		success = false
		fmt.Printf("could not delete volume %s: volume not found\n", id)
	}
	if !success {
		return errors.New("could not delete all volumes")
	}
	return nil
}
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error {
	f.c = c
	f.l = log15.New()
	var err error

	minHosts, err := strconv.Atoi(args.String["--min-hosts"])
	if err != nil || minHosts < 1 {
		return fmt.Errorf("invalid or missing --min-hosts value")
	}

	f.hosts, err = c.Hosts()
	if err != nil {
		f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err)
		var ips []string
		if ipList := args.String["--peer-ips"]; ipList != "" {
			ips = strings.Split(ipList, ",")
			if minHosts == 0 {
				minHosts = len(ips)
			}
		}
		if len(ips) == 0 {
			return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err)
		}
		if len(ips) < minHosts {
			return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts)
		}
		f.hosts = make([]*cluster.Host, len(ips))
		for i, ip := range ips {
			url := fmt.Sprintf("http://%s:1113", ip)
			status, err := cluster.NewHost("", url, nil, nil).GetStatus()
			if err != nil {
				return fmt.Errorf("error connecting to %s: %s", ip, err)
			}
			f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil)
		}
	}

	// check expected number of hosts
	if len(f.hosts) < minHosts {
		// TODO(titanous): be smarter about this
		return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts))
	}
	f.l.Info("found expected hosts", "n", len(f.hosts))

	if err := f.FixDiscoverd(); err != nil {
		return err
	}
	if err := f.FixFlannel(); err != nil {
		return err
	}

	f.l.Info("waiting for discoverd to be available")
	timeout := time.After(time.Minute)
	for {
		var err error
		if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil {
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for discoverd, last error: %s", err)
		default:
			// not timed out yet, retry
		}
	}

	f.l.Info("checking for running controller API")
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) > 0 {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
		if err := f.FixController(controllerInstances, false); err != nil {
			f.l.Error("error fixing controller", "err", err)
			// if unable to write correct formations, we need to kill the scheduler so that the rest of this works
			if err := f.KillSchedulers(); err != nil {
				return err
			}
		}
	}

	f.l.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb", "mongodb"} {
		f.l.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				f.l.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			f.l.Error("error checking database state", "db", db)
			return err
		}
		if err := f.FixSirenia(db); err != nil {
			return err
		}
	}

	f.l.Info("checking for running controller API")
	controllerInstances, _ = controllerService.Instances()
	if len(controllerInstances) == 0 {
		// kill schedulers to prevent interference
		if err := f.KillSchedulers(); err != nil {
			return err
		}
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	} else {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
	}

	if err := f.FixController(controllerInstances, true); err != nil {
		f.l.Error("error fixing controller", "err", err)
		return err
	}

	f.l.Info("cluster fix complete")

	return nil
}
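// NOTE: the --min-hosts and --peer-ips values read in Run come from docopt
// parsing. A hypothetical usage string is shown below for context only; the
// real one lives with the command registration and may differ.
const fixUsage = `
usage: flynn-host fix [--min-hosts=MIN] [--peer-ips=IPS]

Options:
  --min-hosts=MIN  minimum expected number of hosts in the cluster
  --peer-ips=IPS   comma-separated list of host IPs to contact directly
                   when discoverd is unavailable
`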
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	keep := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning && j.Status != host.StatusStarting {
				continue
			}
			// keep the tmpfs (it has the same ID as the job)
			keep[j.Job.ID] = struct{}{}
			// keep the data volumes
			for _, vb := range j.Job.Config.Volumes {
				keep[vb.VolumeID] = struct{}{}
			}
			// keep the mounted layers
			for _, m := range j.Job.Mountspecs {
				keep[m.ID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the keep list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := keep[v.Volume.ID]; ok {
			continue outer
		}
		// don't delete system images
		if v.Volume.Meta["flynn.system-image"] == "true" {
			continue
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete %s volume %s: %s\n", v.Volume.Type, v.Volume.ID, err)
			continue outer
		}
		fmt.Println("Deleted", v.Volume.Type, "volume", v.Volume.ID)
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}
	return nil
}