func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.Hosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return nil, errors.New("no hosts found")
	}

	var jobs []host.ActiveJob
	for _, h := range hosts {
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", h.ID(), err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
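// Note: sortJobs is used by jobList above but is not defined in this listing.
// The following is a minimal sketch of what it could look like, assuming
// host.ActiveJob carries a StartedAt time.Time field; it is a hypothetical
// reconstruction, not the canonical definition.
type sortJobs []host.ActiveJob

func (s sortJobs) Len() int { return len(s) }

// Less orders jobs by start time so that sort.Reverse in jobList lists the
// most recently started jobs first (StartedAt field is an assumption).
func (s sortJobs) Less(i, j int) bool { return s[i].StartedAt.Before(s[j].StartedAt) }

func (s sortJobs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }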
func getLog(hostID, jobID string, client *cluster.Client, follow, init bool, stdout, stderr io.Writer) error {
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	attachReq := &host.AttachReq{
		JobID: jobID,
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if follow {
		attachReq.Flags |= host.AttachFlagStream
	}
	if init {
		attachReq.Flags |= host.AttachFlagInitLog
	}
	attachClient, err := hostClient.Attach(attachReq, false)
	if err != nil {
		switch err {
		case host.ErrJobNotRunning:
			return nil
		case cluster.ErrWouldWait:
			return errors.New("no such job")
		}
		return err
	}
	defer attachClient.Close()
	_, err = attachClient.Receive(stdout, stderr)
	return err
}
func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.ListHosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}

	var jobs []host.ActiveJob
	for id := range hosts {
		h, err := client.DialHost(id)
		if err != nil {
			return nil, fmt.Errorf("could not dial host %s: %s", id, err)
		}
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", id, err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"HOST",
	)
	for _, volume := range volumes {
		listRec(w,
			volume.Volume.ID,
			volume.Host.ID(),
		)
	}
	return nil
}
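// Note: clusterVolumes and the hostVolume pairing it returns are used by the
// volume commands in this listing but are not defined here. This is a minimal
// sketch under the assumption that *cluster.Host exposes a
// ListVolumes() ([]*volume.Info, error) method and that volume.Info has the
// ID, Type, Meta and CreatedAt fields referenced above; treat it as a
// hypothetical reconstruction rather than the canonical helper.
type hostVolume struct {
	Host   *cluster.Host
	Volume *volume.Info
}

func clusterVolumes(hosts []*cluster.Host) ([]hostVolume, error) {
	var volumes []hostVolume
	for _, h := range hosts {
		// ListVolumes signature is assumed, see note above
		hostVolumes, err := h.ListVolumes()
		if err != nil {
			return nil, fmt.Errorf("could not get volumes for host %s: %s", h.ID(), err)
		}
		for _, v := range hostVolumes {
			volumes = append(volumes, hostVolume{Host: h, Volume: v})
		}
	}
	return volumes, nil
}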
func runStop(args *docopt.Args, client *cluster.Client) error {
	success := true
	clients := make(map[string]cluster.Host)
	for _, id := range args.All["ID"].([]string) {
		hostID, jobID, err := cluster.ParseJobID(id)
		if err != nil {
			fmt.Printf("could not parse %s: %s\n", id, err)
			success = false
			continue
		}
		hostClient, ok := clients[hostID]
		if !ok {
			var err error
			hostClient, err = client.DialHost(hostID)
			if err != nil {
				fmt.Printf("could not connect to host %s: %s\n", hostID, err)
				success = false
				continue
			}
			clients[hostID] = hostClient
		}
		if err := hostClient.StopJob(jobID); err != nil {
			fmt.Printf("could not stop job %s: %s\n", jobID, err)
			success = false
			continue
		}
		fmt.Println(jobID, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
func randomHost(cc *cluster.Client) (string, error) {
	hosts, err := cc.ListHosts()
	if err != nil {
		return "", err
	}
	if len(hosts) == 0 {
		return "", cluster.ErrNoServers
	}
	return schedutil.PickHost(hosts).ID, nil
}
func randomHost(cc *cluster.Client) (string, error) {
	hosts, err := cc.ListHosts()
	if err != nil {
		return "", err
	}
	// map iteration order is not deterministic, so returning the first
	// entry picks an arbitrary host
	for _, host := range hosts {
		return host.ID, nil
	}
	return "", cluster.ErrNoServers
}
func runTagsDel(args *docopt.Args, client *cluster.Client) error {
	host, err := client.Host(args.String["<hostid>"])
	if err != nil {
		return err
	}
	vars := args.All["<var>"].([]string)
	tags := make(map[string]string, len(vars))
	for _, v := range vars {
		// empty tags get deleted on the host
		tags[v] = ""
	}
	return host.UpdateTags(tags)
}
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	attached := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			for _, vb := range j.Job.Config.Volumes {
				attached[vb.VolumeID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the attached list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := attached[v.Volume.ID]; ok {
			// volume is attached, continue to next volume
			continue outer
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete volume %s: %s\n", v.Volume.ID, err)
			continue outer
		}
		fmt.Println(v.Volume.ID, "deleted")
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}
	return nil
}
func runTagsSet(args *docopt.Args, client *cluster.Client) error {
	host, err := client.Host(args.String["<hostid>"])
	if err != nil {
		return err
	}
	pairs := args.All["<var>=<val>"].([]string)
	tags := make(map[string]string, len(pairs))
	for _, s := range pairs {
		keyVal := strings.SplitN(s, "=", 2)
		if len(keyVal) == 1 && keyVal[0] != "" {
			// a bare var with no value is treated as a boolean tag
			tags[keyVal[0]] = "true"
		} else if len(keyVal) == 2 {
			tags[keyVal[0]] = keyVal[1]
		}
	}
	return host.UpdateTags(tags)
}
func runInspect(args *docopt.Args, client *cluster.Client) error {
	hostID, jobID, err := cluster.ParseJobID(args.String["ID"])
	if err != nil {
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	job, err := hostClient.GetJob(jobID)
	if err != nil {
		return fmt.Errorf("no such job")
	}
	printJobDesc(job, os.Stdout, !args.Bool["--omit-env"])
	return nil
}
func runVolumeCreate(args *docopt.Args, client *cluster.Client) error {
	hostID := args.String["<host>"]
	hostClient, err := client.Host(hostID)
	if err != nil {
		// return early rather than continuing with a nil host client
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	provider := "default"
	if args.String["--provider"] != "" {
		provider = args.String["--provider"]
	}
	v, err := hostClient.CreateVolume(provider)
	if err != nil {
		fmt.Printf("could not create volume: %s\n", err)
		return err
	}
	fmt.Printf("created volume %s on %s\n", v.ID, hostID)
	return nil
}
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"TYPE",
		"HOST",
		"CREATED",
		"META",
	)
	for _, volume := range volumes {
		meta := make([]string, 0, len(volume.Volume.Meta))
		for k, v := range volume.Volume.Meta {
			meta = append(meta, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w,
			volume.Volume.ID,
			volume.Volume.Type,
			volume.Host.ID(),
			units.HumanDuration(time.Now().UTC().Sub(volume.Volume.CreatedAt))+" ago",
			strings.Join(meta, " "),
		)
	}
	return nil
}
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	hosts, err := client.Hosts()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, host := range hosts {
		tags := make([]string, 0, len(host.Tags()))
		for k, v := range host.Tags() {
			tags = append(tags, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w, host.ID(), strings.Join(tags, " "))
	}
	return nil
}
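// Note: listRec is the tabwriter row helper used by the listing commands above
// but is not defined in this listing. A minimal sketch, assuming it simply
// writes its arguments as one tab-separated row (hypothetical reconstruction).
func listRec(w io.Writer, a ...interface{}) {
	for i, x := range a {
		fmt.Fprint(w, x)
		if i+1 == len(a) {
			fmt.Fprint(w, "\n")
		} else {
			fmt.Fprint(w, "\t")
		}
	}
}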
func runVolumeDelete(args *docopt.Args, client *cluster.Client) error {
	success := true
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

outer:
	for _, id := range args.All["ID"].([]string) {
		// find this volume in the list
		for _, v := range volumes {
			if v.Volume.ID == id {
				// delete the volume
				if err := v.Host.DestroyVolume(id); err != nil {
					success = false
					fmt.Printf("could not delete volume %s: %s\n", id, err)
					continue outer
				}
				fmt.Println(id, "deleted")
				continue outer
			}
		}
		success = false
		fmt.Printf("could not delete volume %s: volume not found\n", id)
	}
	if !success {
		return errors.New("could not delete all volumes")
	}
	return nil
}
func getLog(hostID, jobID string, client *cluster.Client, follow bool, stdout, stderr io.Writer) error {
	hostClient, err := client.DialHost(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	defer hostClient.Close()
	attachReq := &host.AttachReq{
		JobID: jobID,
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if follow {
		attachReq.Flags |= host.AttachFlagStream
	}
	attachClient, err := hostClient.Attach(attachReq, false)
	if err != nil {
		if err == cluster.ErrWouldWait {
			return errors.New("no such job")
		}
		return err
	}
	defer attachClient.Close()
	attachClient.Receive(stdout, stderr)
	return nil
}
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	instances, err := client.HostInstances()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, inst := range instances {
		tags := make([]string, 0, len(inst.Meta))
		for k, v := range inst.Meta {
			if strings.HasPrefix(k, host.TagPrefix) {
				tags = append(tags, fmt.Sprintf("%s=%s", strings.TrimPrefix(k, host.TagPrefix), v))
			}
		}
		listRec(w, inst.Meta["id"], strings.Join(tags, " "))
	}
	return nil
}
func runSignal(args *docopt.Args, client *cluster.Client) error {
	id := args.String["ID"]
	sig, err := strconv.Atoi(args.String["SIGNAL"])
	if err != nil {
		fmt.Println("invalid value for SIGNAL")
		return err
	}
	hostID, err := cluster.ExtractHostID(id)
	if err != nil {
		fmt.Println("could not parse", id)
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		fmt.Println("could not connect to host", hostID)
		return err
	}
	if err := hostClient.SignalJob(id, sig); err != nil {
		fmt.Println("could not signal job", id)
		return err
	}
	fmt.Printf("sent signal %d to %s successfully\n", sig, id)
	return nil
}
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}
	keep := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning && j.Status != host.StatusStarting {
				continue
			}
			// keep the tmpfs (it has the same ID as the job)
			keep[j.Job.ID] = struct{}{}
			// keep the data volumes
			for _, vb := range j.Job.Config.Volumes {
				keep[vb.VolumeID] = struct{}{}
			}
			// keep the mounted layers
			for _, m := range j.Job.Mountspecs {
				keep[m.ID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the keep list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := keep[v.Volume.ID]; ok {
			continue outer
		}
		// don't delete system images
		if v.Volume.Meta["flynn.system-image"] == "true" {
			continue
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete %s volume %s: %s\n", v.Volume.Type, v.Volume.ID, err)
			continue outer
		}
		fmt.Println("Deleted", v.Volume.Type, "volume", v.Volume.ID)
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}
	return nil
}
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error {
	f.c = c
	f.l = log15.New()
	var err error
	minHosts, err := strconv.Atoi(args.String["--min-hosts"])
	if err != nil || minHosts < 1 {
		return fmt.Errorf("invalid or missing --min-hosts value")
	}

	f.hosts, err = c.Hosts()
	if err != nil {
		f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err)
		var ips []string
		if ipList := args.String["--peer-ips"]; ipList != "" {
			ips = strings.Split(ipList, ",")
			if minHosts == 0 {
				minHosts = len(ips)
			}
		}
		if len(ips) == 0 {
			return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err)
		}
		if len(ips) < minHosts {
			return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts)
		}
		f.hosts = make([]*cluster.Host, len(ips))
		for i, ip := range ips {
			url := fmt.Sprintf("http://%s:1113", ip)
			status, err := cluster.NewHost("", url, nil, nil).GetStatus()
			if err != nil {
				return fmt.Errorf("error connecting to %s: %s", ip, err)
			}
			f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil)
		}
	}

	// check expected number of hosts
	if len(f.hosts) < minHosts {
		// TODO(titanous): be smarter about this
		return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts))
	}
	f.l.Info("found expected hosts", "n", len(f.hosts))

	if err := f.FixDiscoverd(); err != nil {
		return err
	}
	if err := f.FixFlannel(); err != nil {
		return err
	}

	f.l.Info("waiting for discoverd to be available")
	timeout := time.After(time.Minute)
	for {
		var err error
		if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil {
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for discoverd, last error: %s", err)
		default:
			// keep retrying until the timeout fires
		}
	}

	f.l.Info("checking for running controller API")
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) > 0 {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
		if err := f.FixController(controllerInstances, false); err != nil {
			f.l.Error("error fixing controller", "err", err)
			// if unable to write correct formations, we need to kill the scheduler so that the rest of this works
			if err := f.KillSchedulers(); err != nil {
				return err
			}
		}
	}

	f.l.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb", "mongodb"} {
		f.l.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				f.l.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			f.l.Error("error checking database state", "db", db)
			return err
		}
		if err := f.FixSirenia(db); err != nil {
			return err
		}
	}

	f.l.Info("checking for running controller API")
	controllerInstances, _ = controllerService.Instances()
	if len(controllerInstances) == 0 {
		// kill schedulers to prevent interference
		if err := f.KillSchedulers(); err != nil {
			return err
		}
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	} else {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
	}
	if err := f.FixController(controllerInstances, true); err != nil {
		f.l.Error("error fixing controller", "err", err)
		return err
	}

	f.l.Info("cluster fix complete")
	return nil
}