func runStop(args *docopt.Args, client *cluster.Client) error {
	success := true
	clients := make(map[string]*cluster.Host)
	for _, id := range args.All["ID"].([]string) {
		hostID, err := cluster.ExtractHostID(id)
		if err != nil {
			fmt.Printf("could not parse %s: %s\n", id, err)
			success = false
			continue
		}
		hostClient, ok := clients[hostID]
		if !ok {
			var err error
			hostClient, err = client.Host(hostID)
			if err != nil {
				fmt.Printf("could not connect to host %s: %s\n", hostID, err)
				success = false
				continue
			}
			clients[hostID] = hostClient
		}
		if err := hostClient.StopJob(id); err != nil {
			fmt.Printf("could not stop job %s: %s\n", id, err)
			success = false
			continue
		}
		fmt.Println(id, "stopped")
	}
	if !success {
		return errors.New("could not stop all jobs")
	}
	return nil
}
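A minimal standalone sketch (not part of the original listing) of the convention these functions rely on: job IDs carry the owning host's ID as a prefix, so cluster.ExtractHostID can resolve the host without a cluster lookup. The exact ID format shown in the comments is an assumption for illustration.

// Hypothetical illustration, assuming the "<hostID>-<uuid>" job ID convention
// implied by cluster.GenerateJobID / cluster.ExtractHostID in the listings here.
package main

import (
	"fmt"

	"github.com/flynn/flynn/pkg/cluster"
)

func main() {
	id := cluster.GenerateJobID("host1", "") // e.g. "host1-<random-uuid>"
	hostID, err := cluster.ExtractHostID(id)
	if err != nil {
		panic(err)
	}
	fmt.Println(hostID) // "host1"
}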
func (c *controllerAPI) connectHost(ctx context.Context) (utils.HostClient, string, error) {
	params, _ := ctxhelper.ParamsFromContext(ctx)
	jobID := params.ByName("jobs_id")
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		log.Printf("Unable to parse hostID from %q", jobID)
		return nil, jobID, err
	}

	host, err := c.clusterClient.Host(hostID)
	return host, jobID, err
}
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == "up" {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// start another controller and wait for it to come up
	watcher, err := s.controllerClient(t).WatchJobEvents("controller", release.ID)
	t.Assert(err, c.IsNil)
	defer watcher.Close()
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"up": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"down": 1, "up": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	err = watcher.WaitFor(ct.JobEvents{"web": {"down": 1}}, scaleTimeout, nil)
	t.Assert(err, c.IsNil)

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
func (f *ClusterFixer) GetJob(jobID string) (*host.Job, *cluster.Host, error) {
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return nil, nil, fmt.Errorf("error parsing host ID from %q", jobID)
	}
	host, err := f.c.Host(hostID)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to get host for job lookup: %s", err)
	}
	job, err := host.GetJob(jobID)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to get job from host: %s", err)
	}
	return job.Job, host, nil
}
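A hypothetical composition of GetJob with the host client calls used elsewhere in this listing; RestartJob is an assumed name, not from the source.

// Hypothetical helper (assumed name), sketching how GetJob composes with
// StopJob/AddJob and cluster.GenerateJobID from the surrounding listings.
func (f *ClusterFixer) RestartJob(jobID string) error {
	job, h, err := f.GetJob(jobID)
	if err != nil {
		return err
	}
	if err := h.StopJob(jobID); err != nil {
		return fmt.Errorf("unable to stop job %s: %s", jobID, err)
	}
	// generate a fresh ID that keeps the owning host's ID as its prefix
	job.ID = cluster.GenerateJobID(h.ID(), "")
	return h.AddJob(job)
}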
func runLog(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}
	return getLog(
		hostID,
		jobID,
		client,
		args.Bool["-f"] || args.Bool["--follow"],
		args.Bool["--init"],
		os.Stdout,
		os.Stderr,
	)
}
func runInspect(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		return fmt.Errorf("could not connect to host %s: %s", hostID, err)
	}
	job, err := hostClient.GetJob(jobID)
	if err != nil {
		return fmt.Errorf("no such job")
	}
	printJobDesc(job, os.Stdout, !args.Bool["--omit-env"])
	return nil
}
func runLog(args *docopt.Args, client *cluster.Client) error {
	jobID := args.String["ID"]
	hostID, err := cluster.ExtractHostID(jobID)
	if err != nil {
		return err
	}
	lines := 0
	if args.String["--lines"] != "" {
		lines, err = strconv.Atoi(args.String["--lines"])
		if err != nil {
			return err
		}
	}
	stderr := os.Stdout
	if args.Bool["--split-stderr"] {
		stderr = os.Stderr
	}
	if lines > 0 {
		stdoutR, stdoutW := io.Pipe()
		stderrR, stderrW := io.Pipe()
		go func() {
			getLog(hostID, jobID, client, false, args.Bool["--init"], stdoutW, stderrW)
			stdoutW.Close()
			stderrW.Close()
		}()
		tailLogs(stdoutR, stderrR, lines, os.Stdout, stderr)
		return nil
	}
	return getLog(
		hostID,
		jobID,
		client,
		args.Bool["-f"] || args.Bool["--follow"],
		args.Bool["--init"],
		os.Stdout,
		stderr,
	)
}
func (c *context) syncJobStates() error {
	g := grohl.NewContext(grohl.Data{"fn": "syncJobStates"})
	g.Log(grohl.Data{"at": "appList"})

	apps, err := c.AppList()
	if err != nil {
		g.Log(grohl.Data{"at": "appList", "status": "error", "err": err})
		return err
	}
	for _, app := range apps {
		g.Log(grohl.Data{"at": "jobList", "app.id": app.ID})
		jobs, err := c.JobList(app.ID)
		if err != nil {
			g.Log(grohl.Data{"at": "jobList", "app.id": app.ID, "status": "error", "err": err})
			continue
		}
		for _, job := range jobs {
			gg := g.New(grohl.Data{"job.id": job.ID, "app.id": app.ID, "state": job.State})
			gg.Log(grohl.Data{"at": "checkState"})
			if job.State != "up" {
				continue
			}
			hostID, err := cluster.ExtractHostID(job.ID)
			if err != nil {
				gg.Log(grohl.Data{"at": "jobHostID", "status": "error", "err": err})
				continue
			}
			if j := c.jobs.Get(hostID, job.ID); j != nil {
				continue
			}
			job.State = "down"
			gg.Log(grohl.Data{"at": "putJob", "state": "down"})
			go c.PutJob(job)
		}
	}
	return nil
}
func runSignal(args *docopt.Args, client *cluster.Client) error {
	id := args.String["ID"]
	sig, err := strconv.Atoi(args.String["SIGNAL"])
	if err != nil {
		fmt.Println("invalid value for SIGNAL")
		return err
	}
	hostID, err := cluster.ExtractHostID(id)
	if err != nil {
		fmt.Println("could not parse", id)
		return err
	}
	hostClient, err := client.Host(hostID)
	if err != nil {
		fmt.Println("could not connect to host", hostID)
		return err
	}
	if err := hostClient.SignalJob(id, sig); err != nil {
		fmt.Println("could not signal job", id)
		return err
	}
	fmt.Printf("sent signal %d to %s successfully\n", sig, id)
	return nil
}
func (h *Helper) stopJob(t *c.C, id string) {
	debugf(t, "stopping job %s", id)
	hostID, _ := cluster.ExtractHostID(id)
	hc := h.hostClient(t, hostID)
	t.Assert(hc.StopJob(id), c.IsNil)
}
func (s *SchedulerSuite) TestControllerRestart(t *c.C) {
	// get the current controller details
	app, err := s.controllerClient(t).GetApp("controller")
	t.Assert(err, c.IsNil)
	release, err := s.controllerClient(t).GetAppRelease("controller")
	t.Assert(err, c.IsNil)
	formation, err := s.controllerClient(t).GetFormation(app.ID, release.ID)
	t.Assert(err, c.IsNil)
	list, err := s.controllerClient(t).JobList("controller")
	t.Assert(err, c.IsNil)
	var jobs []*ct.Job
	for _, job := range list {
		if job.Type == "web" && job.State == ct.JobStateUp {
			jobs = append(jobs, job)
		}
	}
	t.Assert(jobs, c.HasLen, formation.Processes["web"])
	jobID := jobs[0].ID
	hostID, _ := cluster.ExtractHostID(jobID)
	t.Assert(hostID, c.Not(c.Equals), "")
	debugf(t, "current controller app[%s] host[%s] job[%s]", app.ID, hostID, jobID)

	// subscribe to service events, wait for current event
	events := make(chan *discoverd.Event)
	stream, err := s.discoverdClient(t).Service("controller").Watch(events)
	t.Assert(err, c.IsNil)
	defer stream.Close()
	type serviceEvents map[discoverd.EventKind]int
	wait := func(expected serviceEvents) {
		actual := make(serviceEvents)
	outer:
		for {
			select {
			case event := <-events:
				actual[event.Kind]++
				for kind, count := range expected {
					if actual[kind] != count {
						continue outer
					}
				}
				return
			case <-time.After(scaleTimeout):
				t.Fatal("timed out waiting for controller service event")
			}
		}
	}
	wait(serviceEvents{discoverd.EventKindCurrent: 1})

	// start another controller and wait for it to come up
	debug(t, "scaling the controller up")
	formation.Processes["web"]++
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1})

	// kill the first controller and check the scheduler brings it back online
	cc := cluster.NewClientWithServices(s.discoverdClient(t).Service)
	hc, err := cc.Host(hostID)
	t.Assert(err, c.IsNil)
	debug(t, "stopping job ", jobID)
	t.Assert(hc.StopJob(jobID), c.IsNil)
	wait(serviceEvents{discoverd.EventKindUp: 1, discoverd.EventKindDown: 1})

	// scale back down
	debug(t, "scaling the controller down")
	formation.Processes["web"]--
	t.Assert(s.controllerClient(t).PutFormation(formation), c.IsNil)
	wait(serviceEvents{discoverd.EventKindDown: 1})

	// unset the suite's client so other tests use a new client
	s.controller = nil
}
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)

	service := discoverd.NewService(svc)
	instances, _ := service.Instances()
	leader, _ := service.Leader()

	log.Info("getting service metadata")
	meta, err := service.GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}

	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		log.Error("unable to get primary job info")
	}

	var syncJob *host.Job
	var syncHost *cluster.Host
	if state.Sync != nil {
		log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
		syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
		if err != nil {
			log.Error("unable to get sync job info")
		}
	}

	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	log.Info("terminating unassigned sirenia instances")
outer:
	for _, i := range instances {
		if i.Addr == state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) {
			continue
		}
		for _, a := range state.Async {
			if i.Addr == a.Addr {
				continue outer
			}
		}
		// job not assigned in state, attempt to terminate it
		if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok {
			hostID, err := cluster.ExtractHostID(jobID)
			if err != nil {
				log.Error("error extracting host id from jobID", "jobID", jobID, "err", err)
			}
			h := f.Host(hostID)
			if h != nil {
				if err := h.StopJob(jobID); err != nil {
					log.Error("error stopping unassigned sirenia job", "jobID", jobID)
				}
			} else {
				log.Error("host not found", "hostID", hostID)
			}
		}
	}

	isRunning := func(addr string) bool {
		for _, i := range instances {
			if i.Addr == addr {
				return true
			}
		}
		return false
	}

	// if the leader isn't currently running then start it using primaryJob/primaryHost
	var wait func() (string, error)
	if !isRunning(state.Primary.Addr) {
		// if we don't have info about the primary job attempt to promote the sync
		if primaryJob == nil {
			if syncJob != nil {
				// set primary job to sync
				primaryJob = syncJob
				primaryHost = syncHost

				// nil out sync job now so we can re-allocate it.
				syncJob = nil
				syncHost = nil
			} else {
				return fmt.Errorf("neither primary or sync job info available")
			}
		}

		primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "")
		f.FixJobEnv(primaryJob)
		log.Info("starting primary job", "job.id", primaryJob.ID)
		wait, err = waitForInstance(primaryJob.ID)
		if err != nil {
			return err
		}
		if err := primaryHost.AddJob(primaryJob); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err)
		}
	}

	if !state.Singleton && !isRunning(state.Sync.Addr) {
		if syncHost == nil {
			for _, h := range f.hosts {
				if h.ID() != primaryHost.ID() {
					syncHost = h
					break
				}
			}
			if syncHost == nil {
				// if there are no other hosts, use the same one we put the primary on
				syncHost = primaryHost
			}
		}

		// if we don't have a sync job then copy the primary job
		// and provision a new volume
		if syncJob == nil {
			syncJob = primaryJob
			vol := &ct.VolumeReq{Path: "/data"}
			if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil {
				return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err)
			}
		}
		syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "")
		f.FixJobEnv(syncJob)
		log.Info("starting sync job", "job.id", syncJob.ID)
		if wait == nil {
			wait, err = waitForInstance(syncJob.ID)
			if err != nil {
				return err
			}
		}
		if err := syncHost.AddJob(syncJob); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		if leader != nil && leader.Addr != "" {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write", "addr", addr)
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}

	return nil
}
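A hedged sketch of a possible call site for FixSirenia; the method name and the list of service names below are assumptions, not taken from the listing.

// Hypothetical driver (assumed name and service names) that runs the sirenia
// fixer over each stateful service and reports how many could not be fixed.
func (f *ClusterFixer) FixAllSirenia() error {
	var failed int
	for _, svc := range []string{"postgres", "mariadb", "mongodb"} {
		if err := f.FixSirenia(svc); err != nil {
			f.l.Error("error fixing sirenia service", "service", svc, "err", err)
			failed++
		}
	}
	if failed > 0 {
		return fmt.Errorf("failed to fix %d sirenia services", failed)
	}
	return nil
}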