Example #1
File: ps.go Project: devick/flynn
func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.Hosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return nil, errors.New("no hosts found")
	}

	var jobs []host.ActiveJob
	for _, h := range hosts {
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", h.ID(), err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
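The sortJobs type returned above is not part of this listing. As a rough sketch of the shape it presumably has (an assumption, not the verbatim Flynn source), it is a slice of host.ActiveJob implementing sort.Interface ordered by creation time, so the sort.Reverse call lists the newest jobs first:

// Hypothetical sketch of the sortJobs helper assumed by the examples above.
// It assumes host.ActiveJob carries a CreatedAt timestamp.
type sortJobs []host.ActiveJob

func (s sortJobs) Len() int           { return len(s) }
func (s sortJobs) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s sortJobs) Less(i, j int) bool { return s[i].CreatedAt.Before(s[j].CreatedAt) }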
Example #2
func getLog(hostID, jobID string, client *cluster.Client, follow, init bool, stdout, stderr io.Writer) error {
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	attachReq := &host.AttachReq{
		JobID: jobID,
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if follow {
		attachReq.Flags |= host.AttachFlagStream
	}
	if init {
		attachReq.Flags |= host.AttachFlagInitLog
	}
	attachClient, err := hostClient.Attach(attachReq, false)
	if err != nil {
		switch err {
		case host.ErrJobNotRunning:
			return nil
		case cluster.ErrWouldWait:
			return errors.New("no such job")
		}
		return err
	}
	defer attachClient.Close()
	_, err = attachClient.Receive(stdout, stderr)
	return err
}
Example #3
func jobList(client *cluster.Client, all bool) (sortJobs, error) {
	hosts, err := client.ListHosts()
	if err != nil {
		return nil, fmt.Errorf("could not list hosts: %s", err)
	}

	var jobs []host.ActiveJob
	for id := range hosts {
		h, err := client.DialHost(id)
		if err != nil {
			return nil, fmt.Errorf("could not dial host %s: %s", id, err)
		}
		hostJobs, err := h.ListJobs()
		if err != nil {
			return nil, fmt.Errorf("could not get jobs for host %s: %s", id, err)
		}
		for _, job := range hostJobs {
			jobs = append(jobs, job)
		}
	}

	sorted := make(sortJobs, 0, len(jobs))
	for _, job := range jobs {
		if !all && job.Status != host.StatusStarting && job.Status != host.StatusRunning {
			continue
		}
		sorted = append(sorted, job)
	}
	sort.Sort(sort.Reverse(sorted))
	return sorted, nil
}
Example #4
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"HOST",
	)

	for _, volume := range volumes {
		listRec(w,
			volume.Volume.ID,
			volume.Host.ID(),
		)
	}
	return nil
}
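The clusterVolumes helper shared by the volume commands is also not shown in this listing. A minimal sketch of what it likely looks like, pairing every volume with the host that owns it (the hostVolume struct, the volume.Info type, and the ListVolumes method name are assumptions for illustration):

// Hypothetical sketch of the clusterVolumes helper used by the volume commands.
// It assumes the host client exposes a ListVolumes method and that volumes are
// described by a volume.Info with the ID, Type, Meta and CreatedAt fields used above.
type hostVolume struct {
	Host   *cluster.Host
	Volume *volume.Info
}

func clusterVolumes(hosts []*cluster.Host) ([]hostVolume, error) {
	var volumes []hostVolume
	for _, h := range hosts {
		hostVolumes, err := h.ListVolumes()
		if err != nil {
			return nil, fmt.Errorf("could not get volumes for host %s: %s", h.ID(), err)
		}
		for _, v := range hostVolumes {
			volumes = append(volumes, hostVolume{Host: h, Volume: v})
		}
	}
	return volumes, nil
}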
Example #5
func runStop(args *docopt.Args, client *cluster.Client) error {
	success := true
	clients := make(map[string]cluster.Host)
	for _, id := range args.All["ID"].([]string) {
		hostID, jobID, err := cluster.ParseJobID(id)
		if err != nil {
			fmt.Printf("could not parse %s: %s", id, err)
			success = false
			continue
		}
		hostClient, ok := clients[hostID]
		if !ok {
			var err error
			hostClient, err = client.DialHost(hostID)
			if err != nil {
				fmt.Printf("could not connect to host %s: %s\n", hostID, err)
				success = false
				continue
			}
			clients[hostID] = hostClient
		}
		if err := hostClient.StopJob(jobID); err != nil {
			fmt.Printf("could not stop job %s: %s\n", jobID, err)
			success = false
			continue
		}
		fmt.Println(jobID, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
Example #6
func randomHost(cc *cluster.Client) (string, error) {
	hosts, err := cc.ListHosts()
	if err != nil {
		return "", err
	}
	if len(hosts) == 0 {
		return "", cluster.ErrNoServers
	}
	return schedutil.PickHost(hosts).ID, nil
}
Example #7
func randomHost(cc *cluster.Client) (string, error) {
	hosts, err := cc.ListHosts()
	if err != nil {
		return "", err
	}

	for _, host := range hosts {
		return host.ID, nil
	}
	return "", cluster.ErrNoServers
}
Example #8
func runTagsDel(args *docopt.Args, client *cluster.Client) error {
	host, err := client.Host(args.String["<hostid>"])
	if err != nil {
		return err
	}
	vars := args.All["<var>"].([]string)
	tags := make(map[string]string, len(vars))
	for _, v := range vars {
		// empty tags get deleted on the host
		tags[v] = ""
	}
	return host.UpdateTags(tags)
}
Example #9
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	attached := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			for _, vb := range j.Job.Config.Volumes {
				attached[vb.VolumeID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the attached list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := attached[v.Volume.ID]; ok {
			// volume is attached, continue to next volume
			continue outer
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete volume %s: %s\n", v.Volume.ID, err)
			continue outer
		}
		fmt.Println(v.Volume.ID, "deleted")
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}

	return nil
}
Example #10
func runTagsSet(args *docopt.Args, client *cluster.Client) error {
	host, err := client.Host(args.String["<hostid>"])
	if err != nil {
		return err
	}
	pairs := args.All["<var>=<val>"].([]string)
	tags := make(map[string]string, len(pairs))
	for _, s := range pairs {
		keyVal := strings.SplitN(s, "=", 2)
		if len(keyVal) == 1 && keyVal[0] != "" {
			tags[keyVal[0]] = "true"
		} else if len(keyVal) == 2 {
			tags[keyVal[0]] = keyVal[1]
		}
	}
	return host.UpdateTags(tags)
}
Example #11
func runInspect(args *docopt.Args, client *cluster.Client) error {
	hostID, jobID, err := cluster.ParseJobID(args.String["ID"])
	if err != nil {
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	job, err := hostClient.GetJob(jobID)
	if err != nil {
		return fmt.Errorf("no such job")
	}

	printJobDesc(job, os.Stdout, !args.Bool["--omit-env"])
	return nil
}
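printJobDesc is another helper outside this listing. A heavily hedged sketch of what it could look like, writing a job's basic fields and optionally its environment (the CreatedAt, Config.Env, and listRec usages are assumptions; only the ID and Status fields appear elsewhere in these examples):

// Hypothetical sketch of printJobDesc: writes a job's basic fields, plus its
// environment unless the caller asked to omit it. Field names beyond ID and
// Status are assumed for illustration.
func printJobDesc(job *host.ActiveJob, out io.Writer, env bool) {
	w := tabwriter.NewWriter(out, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "ID", job.Job.ID)
	listRec(w, "Status", job.Status)
	listRec(w, "CreatedAt", job.CreatedAt)
	if env {
		for k, v := range job.Job.Config.Env {
			listRec(w, "Env", fmt.Sprintf("%s=%s", k, v))
		}
	}
}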
Example #12
func runVolumeCreate(args *docopt.Args, client *cluster.Client) error {
	hostId := args.String["<host>"]
	hostClient, err := client.Host(hostId)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostId, err)
	}
	provider := "default"
	if args.String["--provider"] != "" {
		provider = args.String["--provider"]
	}
	v, err := hostClient.CreateVolume(provider)
	if err != nil {
		fmt.Printf("could not create volume: %s\n", err)
		return err
	}
	fmt.Printf("created volume %s on %s\n", v.ID, hostId)
	return nil
}
Example #13
func runVolumeList(args *docopt.Args, client *cluster.Client) error {
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w,
		"ID",
		"TYPE",
		"HOST",
		"CREATED",
		"META",
	)

	for _, volume := range volumes {
		meta := make([]string, 0, len(volume.Volume.Meta))
		for k, v := range volume.Volume.Meta {
			meta = append(meta, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w,
			volume.Volume.ID,
			volume.Volume.Type,
			volume.Host.ID(),
			units.HumanDuration(time.Now().UTC().Sub(volume.Volume.CreatedAt))+" ago",
			strings.Join(meta, " "),
		)
	}
	return nil
}
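The listRec helper used by these listing commands simply writes one tab-separated record so the tabwriter can align the columns on Flush. A minimal sketch of such a helper (the exact Flynn implementation is not shown in this listing):

// Hypothetical sketch of listRec: prints its arguments as a single
// tab-separated row terminated by a newline.
func listRec(w io.Writer, args ...interface{}) {
	for i, a := range args {
		fmt.Fprint(w, a)
		if i == len(args)-1 {
			fmt.Fprint(w, "\n")
		} else {
			fmt.Fprint(w, "\t")
		}
	}
}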
Example #14
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	hosts, err := client.Hosts()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, host := range hosts {
		tags := make([]string, 0, len(host.Tags()))
		for k, v := range host.Tags() {
			tags = append(tags, fmt.Sprintf("%s=%s", k, v))
		}
		listRec(w, host.ID(), strings.Join(tags, " "))
	}
	return nil
}
Example #15
func runVolumeDelete(args *docopt.Args, client *cluster.Client) error {
	success := true
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

outer:
	for _, id := range args.All["ID"].([]string) {
		// find this volume in the list
		for _, v := range volumes {
			if v.Volume.ID == id {
				if err := v.Host.DestroyVolume(id); err != nil {
					success = false
					fmt.Printf("could not delete volume %s: %s\n", id, err)
					continue outer
				}
				// delete the volume
				fmt.Println(id, "deleted")
				continue outer
			}
		}
		success = false
		fmt.Printf("could not delete volume %s: volume not found\n", id)
	}
	if !success {
		return errors.New("could not delete all volumes")
	}
	return nil
}
Example #16
func getLog(hostID, jobID string, client *cluster.Client, follow bool, stdout, stderr io.Writer) error {
	hostClient, err := client.DialHost(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	defer hostClient.Close()
	attachReq := &host.AttachReq{
		JobID: jobID,
		Flags: host.AttachFlagStdout | host.AttachFlagStderr | host.AttachFlagLogs,
	}
	if follow {
		attachReq.Flags |= host.AttachFlagStream
	}
	attachClient, err := hostClient.Attach(attachReq, false)
	if err != nil {
		if err == cluster.ErrWouldWait {
			return errors.New("no such job")
		}
		return err
	}
	defer attachClient.Close()
	_, err = attachClient.Receive(stdout, stderr)
	return err
}
Example #17
File: tags.go Project: QY-Y/flynn
func runTags(args *docopt.Args, client *cluster.Client) error {
	if args.Bool["set"] {
		return runTagsSet(args, client)
	} else if args.Bool["del"] {
		return runTagsDel(args, client)
	}
	instances, err := client.HostInstances()
	if err != nil {
		return err
	}
	w := tabwriter.NewWriter(os.Stdout, 1, 2, 2, ' ', 0)
	defer w.Flush()
	listRec(w, "HOST", "TAGS")
	for _, inst := range instances {
		tags := make([]string, 0, len(inst.Meta))
		for k, v := range inst.Meta {
			if strings.HasPrefix(k, host.TagPrefix) {
				tags = append(tags, fmt.Sprintf("%s=%s", strings.TrimPrefix(k, host.TagPrefix), v))
			}
		}
		listRec(w, inst.Meta["id"], strings.Join(tags, " "))
	}
	return nil
}
Example #18
func runSignal(args *docopt.Args, client *cluster.Client) error {
	id := args.String["ID"]
	sig, err := strconv.Atoi(args.String["SIGNAL"])
	if err != nil {
		fmt.Println("invalid value for SIGNAL")
		return err
	}
	hostID, err := cluster.ExtractHostID(id)
	if err != nil {
		fmt.Println("could not parse", id)
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		fmt.Println("could not connect to host", hostID)
		return err
	}
	if err := hostClient.SignalJob(id, sig); err != nil {
		fmt.Println("could not signal job", id)
		return err
	}
	fmt.Printf("sent signal %d to %s successfully\n", sig, id)
	return nil
}
Example #19
func runVolumeGarbageCollection(args *docopt.Args, client *cluster.Client) error {
	// collect list of all volume ids currently attached to jobs
	hosts, err := client.Hosts()
	if err != nil {
		return fmt.Errorf("could not list hosts: %s", err)
	}
	if len(hosts) == 0 {
		return errors.New("no hosts found")
	}

	keep := make(map[string]struct{})
	for _, h := range hosts {
		jobs, err := h.ListJobs()
		if err != nil {
			fmt.Printf("error listing jobs on host %s: %s\n", h.ID(), err)
			continue
		}
		for _, j := range jobs {
			if j.Status != host.StatusRunning && j.Status != host.StatusStarting {
				continue
			}

			// keep the tmpfs (it has the same ID as the job)
			keep[j.Job.ID] = struct{}{}

			// keep the data volumes
			for _, vb := range j.Job.Config.Volumes {
				keep[vb.VolumeID] = struct{}{}
			}

			// keep the mounted layers
			for _, m := range j.Job.Mountspecs {
				keep[m.ID] = struct{}{}
			}
		}
	}

	volumes, err := clusterVolumes(hosts)
	if err != nil {
		return err
	}

	// iterate over list of all volumes, deleting any not found in the keep list
	success := true
outer:
	for _, v := range volumes {
		if _, ok := keep[v.Volume.ID]; ok {
			continue outer
		}
		// don't delete system images
		if v.Volume.Meta["flynn.system-image"] == "true" {
			continue
		}
		if err := v.Host.DestroyVolume(v.Volume.ID); err != nil {
			success = false
			fmt.Printf("could not delete %s volume %s: %s\n", v.Volume.Type, v.Volume.ID, err)
			continue outer
		}
		fmt.Println("Deleted", v.Volume.Type, "volume", v.Volume.ID)
	}
	if !success {
		return errors.New("could not garbage collect all volumes")
	}

	return nil
}
Example #20
func (f *ClusterFixer) Run(args *docopt.Args, c *cluster.Client) error {
	f.c = c
	f.l = log15.New()
	var err error

	minHosts, err := strconv.Atoi(args.String["--min-hosts"])
	if err != nil || minHosts < 1 {
		return fmt.Errorf("invalid or missing --min-hosts value")
	}

	f.hosts, err = c.Hosts()
	if err != nil {
		f.l.Error("unable to list hosts from discoverd, falling back to peer IP list", "error", err)
		var ips []string
		if ipList := args.String["--peer-ips"]; ipList != "" {
			ips = strings.Split(ipList, ",")
			if minHosts == 0 {
				minHosts = len(ips)
			}
		}
		if len(ips) == 0 {
			return fmt.Errorf("error connecting to discoverd, use --peer-ips: %s", err)
		}
		if len(ips) < minHosts {
			return fmt.Errorf("number of peer IPs provided (%d) is less than --min-hosts (%d)", len(ips), minHosts)
		}

		f.hosts = make([]*cluster.Host, len(ips))
		for i, ip := range ips {
			url := fmt.Sprintf("http://%s:1113", ip)
			status, err := cluster.NewHost("", url, nil, nil).GetStatus()
			if err != nil {
				return fmt.Errorf("error connecting to %s: %s", ip, err)
			}
			f.hosts[i] = cluster.NewHost(status.ID, url, nil, nil)
		}
	}
	// check expected number of hosts
	if len(f.hosts) < minHosts {
		// TODO(titanous): be smarter about this
		return fmt.Errorf("expected at least %d hosts, but %d found", minHosts, len(f.hosts))
	}
	f.l.Info("found expected hosts", "n", len(f.hosts))

	if err := f.FixDiscoverd(); err != nil {
		return err
	}
	if err := f.FixFlannel(); err != nil {
		return err
	}

	f.l.Info("waiting for discoverd to be available")
	timeout := time.After(time.Minute)
	for {
		var err error
		if _, err = discoverd.GetInstances("discoverd", 30*time.Second); err != nil {
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
		select {
		case <-timeout:
			return fmt.Errorf("timed out waiting for discoverd, last error: %s", err)
		default:
		}
	}

	f.l.Info("checking for running controller API")
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) > 0 {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
		if err := f.FixController(controllerInstances, false); err != nil {
			f.l.Error("error fixing controller", "err", err)
			// if unable to write correct formations, we need to kill the scheduler so that the rest of this works
			if err := f.KillSchedulers(); err != nil {
				return err
			}
		}
	}

	f.l.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb", "mongodb"} {
		f.l.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				f.l.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			f.l.Error("error checking database state", "db", db)
			return err
		}
		if err := f.FixSirenia(db); err != nil {
			return err
		}
	}

	f.l.Info("checking for running controller API")
	controllerInstances, _ = controllerService.Instances()
	if len(controllerInstances) == 0 {
		// kill schedulers to prevent interference
		if err := f.KillSchedulers(); err != nil {
			return err
		}
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	} else {
		f.l.Info("found running controller API instances", "n", len(controllerInstances))
	}

	if err := f.FixController(controllerInstances, true); err != nil {
		f.l.Error("error fixing controller", "err", err)
		return err
	}

	f.l.Info("cluster fix complete")

	return nil
}