Example #1
func runStop(args *docopt.Args, client *cluster.Client) error {
	success := true
	clients := make(map[string]*cluster.Host)
	for _, id := range args.All["ID"].([]string) {
		hostID, err := cluster.ExtractHostID(id)
		if err != nil {
			fmt.Printf("could not parse %s: %s", id, err)
			success = false
			continue
		}
		hostClient, ok := clients[hostID]
		if !ok {
			var err error
			hostClient, err = client.Host(hostID)
			if err != nil {
				fmt.Printf("could not connect to host %s: %s\n", hostID, err)
				success = false
				continue
			}
			clients[hostID] = hostClient
		}
		if err := hostClient.StopJob(id); err != nil {
			fmt.Printf("could not stop job %s: %s\n", id, err)
			success = false
			continue
		}
		fmt.Println(id, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
Example #2
func (c *controllerAPI) connectHost(ctx context.Context) (utils.HostClient, string, error) {
	params, _ := ctxhelper.ParamsFromContext(ctx)
	jobID := params.ByName("jobs_id")
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		log.Printf("Unable to parse hostID from %q", jobID)
		return nil, jobID, err
	}

	host, err := c.clusterClient.Host(hostID)
	return host, jobID, err
}
Example #3
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == "up" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start another controller and wait for it to come up
	watcher, err := s.controllerClient(t).WatchJobEvents("controller", release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"up": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"down": 1, "up": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"down": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
Example #4
func (f *ClusterFixer) GetJob(jobID string) (*host.Job, *cluster.Host, error) {
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return nil, nil, fmt.Errorf("error parsing host ID from %q", jobID)
	}
	host, err := f.c.Host(hostID)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to get host for job lookup: %s", err)
	}
	job, err := host.GetJob(jobID)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to get job from host: %s", err)
	}
	return job.Job, host, nil
}
Example #5
func runLog(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}
	return getLog(
		hostID,
		jobID,
		client,
		args.Bool["-f"] || args.Bool["--follow"],
		args.Bool["--init"],
		os.Stdout,
		os.Stderr,
	)
}
Example #6
func runInspect(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	job, err := hostClient.GetJob(jobID)
	if err != nil {
		return fmt.Errorf("no such job")
	}

	printJobDesc(job, os.Stdout, !args.Bool["--omit-env"])
	return nil
}
Example #7
func runLog(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}

	lines := 0
	if args.String["--lines"] != "" {
		lines, err = strconv.Atoi(args.String["--lines"])
		if err != nil {
			return err
		}
	}

	stderr := os.Stdout
	if args.Bool["--split-stderr"] {
		stderr = os.Stderr
	}

	if lines > 0 {
		stdoutR, stdoutW := io.Pipe()
		stderrR, stderrW := io.Pipe()

		go func() {
			getLog(hostID, jobID, client, false, args.Bool["--init"], stdoutW, stderrW)
			stdoutW.Close()
			stderrW.Close()
		}()
		tailLogs(stdoutR, stderrR, lines, os.Stdout, stderr)
		return nil
	}
	return getLog(
		hostID,
		jobID,
		client,
		args.Bool["-f"] || args.Bool["--follow"],
		args.Bool["--init"],
		os.Stdout,
		stderr,
	)
}
Example #8
File: main.go Project: kgrz/flynn
// syncJobStates reconciles the controller's job records with the scheduler's
// in-memory state: any job recorded as "up" that the scheduler is no longer
// tracking on its host is marked "down".
func (c *context) syncJobStates() error {
	g := grohl.NewContext(grohl.Data{"fn": "syncJobStates"})
	g.Log(grohl.Data{"at": "appList"})
	apps, err := c.AppList()
	if err != nil {
		g.Log(grohl.Data{"at": "appList", "status": "error", "err": err})
		return err
	}
	for _, app := range apps {
		g.Log(grohl.Data{"at": "jobList", "app.id": app.ID})
		jobs, err := c.JobList(app.ID)
		if err != nil {
			g.Log(grohl.Data{"at": "jobList", "app.id": app.ID, "status": "error", "err": err})
			continue
		}
		for _, job := range jobs {
			gg := g.New(grohl.Data{"job.id": job.ID, "app.id": app.ID, "state": job.State})
			gg.Log(grohl.Data{"at": "checkState"})
			if job.State != "up" {
				continue
			}
			hostID, err := cluster.ExtractHostID(job.ID)
			if err != nil {
				gg.Log(grohl.Data{"at": "jobHostID", "status": "error", "err": err})
				continue
			}
			if j := c.jobs.Get(hostID, job.ID); j != nil {
				continue
			}
			job.State = "down"
			gg.Log(grohl.Data{"at": "putJob", "state": "down"})
			go c.PutJob(job)
		}
	}
	return nil
}
Example #9
func runSignal(args *docopt.Args, client *cluster.Client) error {
	id := args.String["ID"]
	sig, err := strconv.Atoi(args.String["SIGNAL"])
	if err != nil {
		fmt.Println("invalid value for SIGNAL")
		return err
	}
	hostID, err := cluster.ExtractHostID(id)
	if err != nil {
		fmt.Println("could not parse", id)
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		fmt.Println("could not connect to host", hostID)
		return err
	}
	if err := hostClient.SignalJob(id, sig); err != nil {
		fmt.Println("could not signal job", id)
		return err
	}
	fmt.Printf("sent signal %d to %s successfully\n", sig, id)
	return nil
}
Example #10
func (h *Helper) stopJob(t *c.C, id string) {
	debugf(t, "stopping job %s", id)
	hostID, _ := cluster.ExtractHostID(id)
	hc := h.hostClient(t, hostID)
	t.Assert(hc.StopJob(id), c.IsNil)
}
Example #11
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == ct.JobStateUp {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// subscribe to service events, wait for current event
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("controller").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	type serviceEvents map[discoverd.EventKind]int
	wait := func(expected serviceEvents) {
		actual := make(serviceEvents)
	outer:
		for {
			select {
			case event := <-events:
				actual[event.Kind]++
				for kind, count := range expected {
					if actual[kind] != count {
						continue outer
					}
				}
				return
			case <-time.After(scaleTimeout):
				t.Fatal("timed out waiting for controller service event")
			}
		}
	}
	wait(serviceEvents{discoverd.EventKindCurrent: 1})

	// start another controller and wait for it to come up
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1})

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1, discoverd.EventKindDown: 1})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindDown: 1})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
Example #12
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}
	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
				continue
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}
	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}
		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}