func (a *API) ping(w http.ResponseWriter, req *http.Request, _ httprouter.Params) { logger := a.logger().New("fn", "ping") logger.Info("checking status", "host", serviceHost) if status, err := sirenia.NewClient(serviceHost + ":3306").Status(); err == nil && status.Database != nil && status.Database.ReadWrite { logger.Info("database is up, skipping scale check") } else { scaled, err := scale.CheckScale(app, controllerKey, "mongodb", a.logger()) if err != nil { httphelper.Error(w, err) return } // Cluster has yet to be scaled, return healthy if !scaled { w.WriteHeader(200) return } } session, err := mgo.DialWithInfo(&mgo.DialInfo{ Addrs: []string{net.JoinHostPort(serviceHost, "27017")}, Username: "******", Password: os.Getenv("MONGO_PWD"), Database: "admin", }) if err != nil { httphelper.Error(w, err) return } defer session.Close() w.WriteHeader(200) }
func (f *ClusterFixer) CheckSirenia(svc string) error { log := f.l.New("fn", "CheckSirenia", "service", svc) log.Info("checking sirenia cluster status") service := discoverd.NewService(svc) leader, _ := service.Leader() if leader == nil || leader.Addr == "" { log.Info("no running leader") leader = nil } else { log.Info("found running leader") } instances, _ := service.Instances() log.Info("found running instances", "count", len(instances)) log.Info("getting sirenia status") var status *sirenia.Status if leader != nil && leader.Addr != "" { client := sirenia.NewClient(leader.Addr) var err error status, err = client.Status() if err != nil { log.Error("error getting status from leader", "error", err) } } if status != nil && status.Database != nil && status.Database.ReadWrite { log.Info("cluster claims to be read-write") return nil } return fmt.Errorf("cluster isn't read-write") }
func (a *API) ping(ctx context.Context, w http.ResponseWriter, req *http.Request) { logger := a.logger().New("fn", "ping") logger.Info("checking status", "host", serviceHost) if status, err := sirenia.NewClient(serviceHost + ":3306").Status(); err == nil && status.Database != nil && status.Database.ReadWrite { logger.Info("database is up, skipping scale check") } else { scaled, err := scale.CheckScale(app, controllerKey, "mariadb", a.logger()) if err != nil { httphelper.Error(w, err) return } // Cluster has yet to be scaled, return healthy if !scaled { w.WriteHeader(200) return } } db, err := a.connect() if err != nil { httphelper.Error(w, err) return } defer db.Close() if _, err := db.Exec("SELECT 1"); err != nil { httphelper.Error(w, err) return } w.WriteHeader(200) }
// testSireniaDeploy exercises a full deploy of a sirenia-managed database
// app: it creates an app and formation from the default database release,
// waits for the cluster to converge (primary/sync/asyncs plus web jobs),
// checks writes succeed, then deploys a new release and asserts the cluster
// passes through the expected sequence of states before becoming writeable
// again.
func testSireniaDeploy(client controller.Client, disc *discoverd.Client, t *c.C, d *sireniaDeploy) {
	// create app
	app := &ct.App{Name: d.name, Strategy: "sirenia"}
	t.Assert(client.CreateApp(app), c.IsNil)

	// copy release from default app
	release, err := client.GetAppRelease(d.db.appName)
	t.Assert(err, c.IsNil)
	release.ID = ""
	release.Env[d.db.hostKey] = fmt.Sprintf("leader.%s.discoverd", d.name)
	release.Env[d.db.serviceKey] = d.name
	procName := release.Env["SIRENIA_PROCESS"]
	proc := release.Processes[procName]
	// SINGLETON is removed so the cluster runs in multi-node mode.
	delete(proc.Env, "SINGLETON")
	proc.Service = d.name
	release.Processes[procName] = proc
	t.Assert(client.CreateRelease(release), c.IsNil)
	t.Assert(client.SetAppRelease(app.ID, release.ID), c.IsNil)
	oldRelease := release.ID

	// create formation
	discEvents := make(chan *discoverd.Event)
	discService := disc.Service(d.name)
	discStream, err := discService.Watch(discEvents)
	t.Assert(err, c.IsNil)
	defer discStream.Close()
	jobEvents := make(chan *ct.Job)
	jobStream, err := client.StreamJobEvents(d.name, jobEvents)
	t.Assert(err, c.IsNil)
	defer jobStream.Close()
	t.Assert(client.PutFormation(&ct.Formation{
		AppID:     app.ID,
		ReleaseID: release.ID,
		Processes: map[string]int{procName: d.sireniaJobs, "web": d.webJobs},
	}), c.IsNil)

	// watch cluster state changes
	type stateChange struct {
		state *state.State
		err   error
	}
	stateCh := make(chan stateChange)
	// Forward every service-meta update (the serialized cluster state) from
	// discoverd onto stateCh; decode errors terminate the goroutine.
	go func() {
		for event := range discEvents {
			if event.Kind != discoverd.EventKindServiceMeta {
				continue
			}
			var state state.State
			if err := json.Unmarshal(event.ServiceMeta.Data, &state); err != nil {
				stateCh <- stateChange{err: err}
				return
			}
			primary := ""
			if state.Primary != nil {
				primary = state.Primary.Addr
			}
			sync := ""
			if state.Sync != nil {
				sync = state.Sync.Addr
			}
			var async []string
			for _, a := range state.Async {
				async = append(async, a.Addr)
			}
			debugf(t, "got cluster state: index=%d primary=%s sync=%s async=%s", event.ServiceMeta.Index, primary, sync, strings.Join(async, ","))
			stateCh <- stateChange{state: &state}
		}
	}()

	// wait for correct cluster state and number of web processes
	var sireniaState state.State
	var webJobs int
	// ready reports whether the cluster has converged: all web jobs up, a
	// primary elected, and (depending on the requested job count) a sync and
	// the expected number of asyncs present.
	ready := func() bool {
		if webJobs != d.webJobs {
			return false
		}
		if sireniaState.Primary == nil {
			return false
		}
		if d.sireniaJobs > 1 && sireniaState.Sync == nil {
			return false
		}
		if d.sireniaJobs > 2 && len(sireniaState.Async) != d.sireniaJobs-2 {
			return false
		}
		return true
	}
	for {
		if ready() {
			break
		}
		select {
		case s := <-stateCh:
			t.Assert(s.err, c.IsNil)
			sireniaState = *s.state
		case e, ok := <-jobEvents:
			if !ok {
				t.Fatalf("job event stream closed: %s", jobStream.Err())
			}
			debugf(t, "got job event: %s %s %s", e.Type, e.ID, e.State)
			if e.Type == "web" && e.State == ct.JobStateUp {
				webJobs++
			}
		case <-time.After(30 * time.Second):
			t.Fatal("timed out waiting for formation")
		}
	}

	// wait for the primary to indicate downstream replication sync
	debug(t, "waiting for primary to indicate downstream replication sync")
	sireniaClient := sc.NewClient(sireniaState.Primary.Addr)
	t.Assert(sireniaClient.WaitForReplSync(sireniaState.Sync, 1*time.Minute), c.IsNil)

	// connect to the db so we can test writes
	d.db.initDb(t, release, d)

	// check currently writeable
	d.db.assertWriteable(t, release, d)

	// check a deploy completes with expected cluster state changes
	release.ID = ""
	t.Assert(client.CreateRelease(release), c.IsNil)
	newRelease := release.ID
	deployment, err := client.CreateDeployment(app.ID, newRelease)
	t.Assert(err, c.IsNil)
	deployEvents := make(chan *ct.DeploymentEvent)
	deployStream, err := client.StreamDeployment(deployment, deployEvents)
	t.Assert(err, c.IsNil)
	defer deployStream.Close()

	// assertNextState checks that the next state received is in the remaining states
	// that were expected, so handles the fact that some states don't happen, but the
	// states that do happen are expected and in-order.
	// It returns the index (within remaining) of the matched state, i.e. how
	// many expected states were skipped; it fails the test on no match.
	assertNextState := func(remaining []expectedSireniaState) int {
		var state state.State
	loop:
		for {
			select {
			case s := <-stateCh:
				t.Assert(s.err, c.IsNil)
				if len(s.state.Async) < d.expectedAsyncs() {
					// we shouldn't usually receive states with less asyncs than
					// expected, but they can occur as an intermediate state between
					// two expected states (e.g. when a sync does a takeover at the
					// same time as a new async is started) so just ignore them.
					debug(t, "ignoring state with too few asyncs")
					continue
				}
				state = *s.state
				break loop
			case <-time.After(60 * time.Second):
				t.Fatal("timed out waiting for cluster state")
			}
		}
		if state.Primary == nil {
			t.Fatal("no primary configured")
		}
		log := func(format string, v ...interface{}) {
			debugf(t, "skipping expected state: %s", fmt.Sprintf(format, v...))
		}
	outer:
		for i, expected := range remaining {
			if state.Primary.Meta["FLYNN_RELEASE_ID"] != expected.Primary {
				log("primary has incorrect release")
				continue
			}
			if state.Sync == nil {
				if expected.Sync == "" {
					return i
				}
				log("state has no sync node")
				continue
			}
			if state.Sync.Meta["FLYNN_RELEASE_ID"] != expected.Sync {
				log("sync has incorrect release")
				continue
			}
			if state.Async == nil {
				if expected.Async == nil {
					return i
				}
				log("state has no async nodes")
				continue
			}
			if len(state.Async) != len(expected.Async) {
				log("expected %d asyncs, got %d", len(expected.Async), len(state.Async))
				continue
			}
			for i, release := range expected.Async {
				if state.Async[i].Meta["FLYNN_RELEASE_ID"] != release {
					log("async[%d] has incorrect release", i)
					continue outer
				}
			}
			return i
		}
		t.Fatal("unexpected state")
		return -1
	}

	// Drive the deployment to completion, checking each database-process job
	// event against the expected cluster-state sequence and counting new web
	// jobs as they come up.
	expected := d.expected(oldRelease, newRelease)
	var expectedIndex, newWebJobs int
loop:
	for {
		select {
		case e, ok := <-deployEvents:
			if !ok {
				t.Fatal("unexpected close of deployment event stream")
			}
			switch e.Status {
			case "complete":
				break loop
			case "failed":
				t.Fatalf("deployment failed: %s", e.Error)
			}
			debugf(t, "got deployment event: %s %s", e.JobType, e.JobState)
			if e.JobState != ct.JobStateUp && e.JobState != ct.JobStateDown {
				continue
			}
			switch e.JobType {
			case procName:
				// move on if we have seen all the expected events
				if expectedIndex >= len(expected) {
					continue
				}
				skipped := assertNextState(expected[expectedIndex:])
				expectedIndex += 1 + skipped
			case "web":
				if e.JobState == ct.JobStateUp && e.ReleaseID == newRelease {
					newWebJobs++
				}
			}
		case <-time.After(2 * time.Minute):
			t.Fatal("timed out waiting for deployment")
		}
	}

	// check we have the correct number of new web jobs
	t.Assert(newWebJobs, c.Equals, d.webJobs)

	// check writeable now deploy is complete
	d.db.assertWriteable(t, release, d)
}
// ScaleUp scales up a dormant Sirenia cluster.
//
// It is a no-op (returns nil) when the cluster already reports read-write or
// the formation already runs procName processes. Otherwise it bumps the
// formation to 1 process (singleton == "true") or 3 processes and waits up
// to five minutes for the cluster to become read-write.
//
// Parameters: app is the controller app name/ID, controllerKey the
// controller auth key, serviceAddr the host:port of the sirenia status API,
// procName the database process type, singleton the string "true" for a
// single-node cluster.
func ScaleUp(app, controllerKey, serviceAddr, procName, singleton string, logger log15.Logger) error {
	logger = logger.New("fn", "ScaleUp")
	sc := sirenia.NewClient(serviceAddr)

	logger.Info("checking status", "host", serviceAddr)
	if status, err := sc.Status(); err == nil && status.Database != nil && status.Database.ReadWrite {
		logger.Info("database is up, skipping scale")
		// Skip the rest, the database is already available
		return nil
	} else if err != nil {
		// Status errors are logged but not fatal: we fall through and
		// attempt the scale-up anyway.
		logger.Info("error checking status", "err", err)
	} else {
		logger.Info("got status, but database is not read-write")
	}

	// Connect to controller.
	logger.Info("connecting to controller")
	client, err := controller.NewClient("", controllerKey)
	if err != nil {
		logger.Error("controller client error", "err", err)
		return err
	}

	// Retrieve the app release.
	logger.Info("retrieving app release", "app", app)
	release, err := client.GetAppRelease(app)
	if err == controller.ErrNotFound {
		logger.Error("release not found", "app", app)
		return errors.New("release not found")
	} else if err != nil {
		logger.Error("get release error", "app", app, "err", err)
		return err
	}

	// Retrieve current formation.
	logger.Info("retrieving formation", "app", app, "release_id", release.ID)
	formation, err := client.GetFormation(app, release.ID)
	if err == controller.ErrNotFound {
		logger.Error("formation not found", "app", app, "release_id", release.ID)
		return errors.New("formation not found")
	} else if err != nil {
		logger.Error("formation error", "app", app, "release_id", release.ID, "err", err)
		return err
	}

	// If database is running then exit.
	if formation.Processes[procName] > 0 {
		logger.Info("database is running, scaling not necessary")
		return nil
	}

	// Copy processes and increase database processes.
	processes := make(map[string]int, len(formation.Processes))
	for k, v := range formation.Processes {
		processes[k] = v
	}
	if singleton == "true" {
		processes[procName] = 1
	} else {
		processes[procName] = 3
	}

	// Update formation.
	logger.Info("updating formation", "app", app, "release_id", release.ID)
	formation.Processes = processes
	if err := client.PutFormation(formation); err != nil {
		logger.Error("put formation error", "app", app, "release_id", release.ID, "err", err)
		return err
	}

	// Block until the new cluster accepts writes (or time out).
	if err := sc.WaitForReadWrite(5 * time.Minute); err != nil {
		logger.Error("wait for read write", "err", err)
		return errors.New("timed out while starting sirenia cluster")
	}

	logger.Info("scaling complete")
	return nil
}
func (f *ClusterFixer) FixSirenia(svc string) error { log := f.l.New("fn", "FixSirenia", "service", svc) service := discoverd.NewService(svc) instances, _ := service.Instances() leader, _ := service.Leader() log.Info("getting service metadata") meta, err := service.GetMeta() if err != nil { return fmt.Errorf("error getting sirenia state from discoverd: %s", err) } var state state.State if err := json.Unmarshal(meta.Data, &state); err != nil { return fmt.Errorf("error decoding state: %s", err) } if state.Primary == nil { return fmt.Errorf("no primary in sirenia state") } log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"]) primaryJob, primaryHost, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get primary job info") } var syncJob *host.Job var syncHost *cluster.Host if state.Sync != nil { log.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"]) syncJob, syncHost, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"]) if err != nil { log.Error("unable to get sync job info") } } waitForInstance := func(jobID string) (func() (string, error), error) { watchCh := make(chan *discoverd.Event) upCh := make(chan string) stream, err := service.Watch(watchCh) if err != nil { return nil, fmt.Errorf("error watching discoverd service: %s", err) } go func() { var current bool for event := range watchCh { if event.Kind == discoverd.EventKindCurrent { current = true continue } if !current || event.Kind != discoverd.EventKindUp { continue } if event.Instance.Meta["FLYNN_JOB_ID"] == jobID { upCh <- event.Instance.Addr } } }() return func() (string, error) { log.Info("waiting for instance to start", "job.id", jobID) defer stream.Close() select { case addr := <-upCh: return addr, nil case <-time.After(time.Minute): return "", fmt.Errorf("timed out waiting for sirenia instance to come up") } }, nil } log.Info("terminating unassigned sirenia instances") outer: for _, i := range instances { if i.Addr == 
state.Primary.Addr || (state.Sync != nil && i.Addr == state.Sync.Addr) { continue } for _, a := range state.Async { if i.Addr == a.Addr { continue outer } } // job not assigned in state, attempt to terminate it if jobID, ok := i.Meta["FLYNN_JOB_ID"]; ok { hostID, err := cluster.ExtractHostID(jobID) if err != nil { log.Error("error extracting host id from jobID", "jobID", jobID, "err", err) } h := f.Host(hostID) if h != nil { if err := h.StopJob(jobID); err != nil { log.Error("error stopping unassigned sirenia job", "jobID", jobID) } } else { log.Error("host not found", "hostID", hostID) } } } isRunning := func(addr string) bool { for _, i := range instances { if i.Addr == addr { return true } } return false } // if the leader isn't currently running then start it using primaryJob/primaryHost var wait func() (string, error) if !isRunning(state.Primary.Addr) { // if we don't have info about the primary job attempt to promote the sync if primaryJob == nil { if syncJob != nil { // set primary job to sync primaryJob = syncJob primaryHost = syncHost // nil out sync job now so we can re-allocate it. 
syncJob = nil syncHost = nil } else { return fmt.Errorf("neither primary or sync job info available") } } primaryJob.ID = cluster.GenerateJobID(primaryHost.ID(), "") f.FixJobEnv(primaryJob) log.Info("starting primary job", "job.id", primaryJob.ID) wait, err = waitForInstance(primaryJob.ID) if err != nil { return err } if err := primaryHost.AddJob(primaryJob); err != nil { return fmt.Errorf("error starting primary job on %s: %s", primaryHost.ID(), err) } } if !state.Singleton && !isRunning(state.Sync.Addr) { if syncHost == nil { for _, h := range f.hosts { if h.ID() != primaryHost.ID() { syncHost = h break } } if syncHost == nil { // if there are no other hosts, use the same one we put the primary on syncHost = primaryHost } } // if we don't have a sync job then copy the primary job // and provision a new volume if syncJob == nil { syncJob = primaryJob vol := &ct.VolumeReq{Path: "/data"} if _, err := utils.ProvisionVolume(vol, syncHost, syncJob); err != nil { return fmt.Errorf("error creating volume on %s: %s", syncHost.ID(), err) } } syncJob.ID = cluster.GenerateJobID(syncHost.ID(), "") f.FixJobEnv(syncJob) log.Info("starting sync job", "job.id", syncJob.ID) if wait == nil { wait, err = waitForInstance(syncJob.ID) if err != nil { return err } } if err := syncHost.AddJob(syncJob); err != nil { return fmt.Errorf("error starting additional job on %s: %s", syncHost.ID(), err) } } if wait != nil { addr, err := wait() if err != nil { return err } if leader != nil && leader.Addr != "" { addr = leader.Addr } log.Info("waiting for cluster to come up read-write", "addr", addr) return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute) } return nil }
// FixSirenia attempts to bring the sirenia cluster behind svc back to a
// read-write state. It first checks whether the current leader already
// reports read-write (in which case there is nothing to do), then restarts
// missing jobs — the primary (falling back to the sync's job info when the
// primary's is unavailable) and, for non-singleton clusters, a second
// instance — and finally waits for the cluster to come up read-write.
func (f *ClusterFixer) FixSirenia(svc string) error {
	log := f.l.New("fn", "FixSirenia", "service", svc)
	log.Info("checking sirenia cluster status")

	service := discoverd.NewService(svc)
	// Normalise "no leader" / "leader with empty address" to leader == nil.
	leader, _ := service.Leader()
	if leader == nil || leader.Addr == "" {
		log.Info("no running leader")
		leader = nil
	} else {
		log.Info("found running leader")
	}
	instances, _ := service.Instances()
	log.Info("found running instances", "count", len(instances))

	log.Info("getting sirenia status")
	var status *sirenia.Status
	if leader != nil && leader.Addr != "" {
		client := sirenia.NewClient(leader.Addr)
		var err error
		status, err = client.Status()
		if err != nil {
			log.Error("error getting status from leader", "error", err)
		}
	}
	// Already read-write: nothing to fix.
	if status != nil && status.Database != nil && status.Database.ReadWrite {
		log.Info("cluster claims to be read-write")
		return nil
	}

	// The authoritative cluster state is stored as discoverd service metadata.
	log.Info("getting service metadata")
	meta, err := discoverd.NewService(svc).GetMeta()
	if err != nil {
		return fmt.Errorf("error getting sirenia state from discoverd: %s", err)
	}
	var state state.State
	if err := json.Unmarshal(meta.Data, &state); err != nil {
		return fmt.Errorf("error decoding state: %s", err)
	}
	if state.Primary == nil {
		return fmt.Errorf("no primary in sirenia state")
	}

	// Resolve the job/host to restart: prefer the primary's job info, fall
	// back to the sync's when the primary's cannot be fetched.
	log.Info("getting primary job info", "job.id", state.Primary.Meta["FLYNN_JOB_ID"])
	job, host, err := f.GetJob(state.Primary.Meta["FLYNN_JOB_ID"])
	if err != nil {
		if state.Sync != nil {
			f.l.Error("unable to get primary job info", "error", err)
			f.l.Info("getting sync job info", "job.id", state.Sync.Meta["FLYNN_JOB_ID"])
			job, host, err = f.GetJob(state.Sync.Meta["FLYNN_JOB_ID"])
			if err != nil {
				return fmt.Errorf("unable to get primary or sync job details: %s", err)
			}
		} else {
			return fmt.Errorf("unable to get primary job details: %s", err)
		}
	}
	// NOTE(review): a leader running in singleton mode but not read-write
	// cannot be fixed by starting more jobs, so it is rejected here.
	if leader != nil && state.Singleton {
		return fmt.Errorf("sirenia leader is running in singleton mode, unable to fix")
	}

	// waitForInstance watches the service and returns a wait function that
	// blocks (up to one minute) until an instance with the given job ID
	// registers, yielding its address.
	waitForInstance := func(jobID string) (func() (string, error), error) {
		watchCh := make(chan *discoverd.Event)
		upCh := make(chan string)
		stream, err := service.Watch(watchCh)
		if err != nil {
			return nil, fmt.Errorf("error watching discoverd service: %s", err)
		}
		go func() {
			// Skip events replayed before EventKindCurrent so we only react
			// to instances that come up after the watch started.
			var current bool
			for event := range watchCh {
				if event.Kind == discoverd.EventKindCurrent {
					current = true
					continue
				}
				if !current || event.Kind != discoverd.EventKindUp {
					continue
				}
				if event.Instance.Meta["FLYNN_JOB_ID"] == jobID {
					upCh <- event.Instance.Addr
				}
			}
		}()
		return func() (string, error) {
			log.Info("waiting for instance to start", "job.id", jobID)
			defer stream.Close()
			select {
			case addr := <-upCh:
				return addr, nil
			case <-time.After(time.Minute):
				return "", fmt.Errorf("timed out waiting for sirenia instance to come up")
			}
		}, nil
	}

	// Decide how many instances are missing: singleton clusters want one,
	// otherwise two.
	var wait func() (string, error)
	have := len(instances)
	want := 2
	if state.Singleton {
		want = 1
	}
	if have >= want {
		return fmt.Errorf("already have enough instances, unable to fix")
	}
	log.Info("attempting to start missing jobs", "want", want, "have", have)

	// No leader registered: restart the primary job (with a fresh job ID).
	if leader == nil {
		job.ID = cluster.GenerateJobID(host.ID(), "")
		f.FixJobEnv(job)
		log.Info("starting primary job", "job.id", job.ID)
		wait, err = waitForInstance(job.ID)
		if err != nil {
			return err
		}
		if err := host.AddJob(job); err != nil {
			return fmt.Errorf("error starting primary job on %s: %s", host.ID(), err)
		}
		have++
	}

	if want > have {
		// if not enough postgres instances, start another
		var secondHost *cluster.Host
		for _, h := range f.hosts {
			if h.ID() != host.ID() {
				secondHost = h
				break
			}
		}
		if secondHost == nil {
			// if there are no other hosts, use the same one we put the primary on
			secondHost = host
		}
		// NOTE(review): this reuses the same *host.Job value as the primary
		// (with a new ID and volume) — presumably safe because the primary's
		// job was already submitted; confirm against f.GetJob's contract.
		job.ID = cluster.GenerateJobID(secondHost.ID(), "")
		f.FixJobEnv(job)
		log.Info("starting second job", "job.id", job.ID)
		if wait == nil {
			wait, err = waitForInstance(job.ID)
			if err != nil {
				return err
			}
		}
		if err := utils.ProvisionVolume(secondHost, job); err != nil {
			return fmt.Errorf("error creating volume on %s: %s", secondHost.ID(), err)
		}
		if err := secondHost.AddJob(job); err != nil {
			return fmt.Errorf("error starting additional job on %s: %s", secondHost.ID(), err)
		}
	}

	if wait != nil {
		addr, err := wait()
		if err != nil {
			return err
		}
		// Prefer the already-registered leader address when one exists.
		if leader != nil {
			addr = leader.Addr
		}
		log.Info("waiting for cluster to come up read-write")
		return sirenia.NewClient(addr).WaitForReadWrite(5 * time.Minute)
	}
	return nil
}
// scaleUp scales up a dormant MariaDB sirenia cluster via the controller.
//
// It is idempotent and serialized by a.mtx: once a scale-up has succeeded
// (or the database is found read-write, or mariadb processes are already
// running) a.scaledUp short-circuits subsequent calls.
//
// NOTE(review): this duplicates the generic ScaleUp flow with "mariadb"
// hard-coded; consider delegating to the shared helper.
func (a *API) scaleUp() error {
	a.mtx.Lock()
	defer a.mtx.Unlock()

	// Ignore if already scaled up.
	if a.scaledUp {
		return nil
	}

	app := os.Getenv("FLYNN_APP_ID")
	logger := a.logger().New("fn", "scaleUp")
	sc := sirenia.NewClient(serviceHost + ":3306")

	logger.Info("checking status", "host", serviceHost)
	if status, err := sc.Status(); err == nil && status.Database != nil && status.Database.ReadWrite {
		logger.Info("database is up, skipping scale")
		// Skip the rest, the database is already available
		a.scaledUp = true
		return nil
	} else if err != nil {
		// Status errors are logged but not fatal: fall through and attempt
		// the scale-up anyway.
		logger.Info("error checking status", "err", err)
	} else {
		logger.Info("got status, but database is not read-write")
	}

	// Connect to controller.
	logger.Info("connecting to controller")
	client, err := controller.NewClient("", os.Getenv("CONTROLLER_KEY"))
	if err != nil {
		logger.Error("controller client error", "err", err)
		return err
	}

	// Retrieve mariadb release.
	logger.Info("retrieving app release", "app", app)
	release, err := client.GetAppRelease(app)
	if err == controller.ErrNotFound {
		logger.Error("release not found", "app", app)
		return errors.New("mariadb release not found")
	} else if err != nil {
		logger.Error("get release error", "app", app, "err", err)
		return err
	}

	// Retrieve current formation.
	logger.Info("retrieving formation", "app", app, "release_id", release.ID)
	formation, err := client.GetFormation(app, release.ID)
	if err == controller.ErrNotFound {
		logger.Error("formation not found", "app", app, "release_id", release.ID)
		return errors.New("mariadb formation not found")
	} else if err != nil {
		logger.Error("formation error", "app", app, "release_id", release.ID, "err", err)
		return err
	}

	// If mariadb is running then exit.
	if formation.Processes["mariadb"] > 0 {
		logger.Info("database is running, scaling not necessary")
		return nil
	}

	// Copy processes and increase database processes.
	processes := make(map[string]int, len(formation.Processes))
	for k, v := range formation.Processes {
		processes[k] = v
	}
	if os.Getenv("SINGLETON") == "true" {
		processes["mariadb"] = 1
	} else {
		processes["mariadb"] = 3
	}

	// Update formation.
	logger.Info("updating formation", "app", app, "release_id", release.ID)
	formation.Processes = processes
	if err := client.PutFormation(formation); err != nil {
		logger.Error("put formation error", "app", app, "release_id", release.ID, "err", err)
		return err
	}

	// Block until the new cluster accepts writes (or time out).
	if err := sc.WaitForReadWrite(5 * time.Minute); err != nil {
		logger.Error("wait for read write", "err", err)
		return errors.New("timed out while starting mariadb cluster")
	}

	logger.Info("scaling complete")

	// Mark as successfully scaled up.
	a.scaledUp = true

	return nil
}
// ping is the health-check handler for the MariaDB appliance.
//
// It responds 200 when the sirenia cluster is read-write, when the formation
// runs no mariadb processes (nothing to check yet), or when a direct
// "SELECT 1" against the database succeeds; controller and connection errors
// are reported via httphelper.Error.
func (a *API) ping(ctx context.Context, w http.ResponseWriter, req *http.Request) {
	app := os.Getenv("FLYNN_APP_ID")
	logger := a.logger().New("fn", "ping")

	logger.Info("checking status", "host", serviceHost)
	if status, err := sirenia.NewClient(serviceHost + ":3306").Status(); err == nil && status.Database != nil && status.Database.ReadWrite {
		logger.Info("database is up, skipping scale check")
	} else {
		// Not read-write: consult the controller to see whether mariadb has
		// been scaled up at all.
		// Connect to controller.
		logger.Info("connecting to controller")
		client, err := controller.NewClient("", os.Getenv("CONTROLLER_KEY"))
		if err != nil {
			logger.Error("controller client error", "err", err)
			httphelper.Error(w, err)
			return
		}

		// Retrieve mariadb release.
		logger.Info("retrieving app release", "app", app)
		release, err := client.GetAppRelease(app)
		if err == controller.ErrNotFound {
			logger.Error("release not found", "app", app)
			httphelper.Error(w, err)
			return
		} else if err != nil {
			logger.Error("get release error", "app", app, "err", err)
			httphelper.Error(w, err)
			return
		}

		// Retrieve current formation.
		logger.Info("retrieving formation", "app", app, "release_id", release.ID)
		formation, err := client.GetFormation(app, release.ID)
		if err == controller.ErrNotFound {
			logger.Error("formation not found", "app", app, "release_id", release.ID)
			httphelper.Error(w, err)
			return
		} else if err != nil {
			logger.Error("formation error", "app", app, "release_id", release.ID, "err", err)
			httphelper.Error(w, err)
			return
		}

		// MariaDB isn't running, just return healthy
		if formation.Processes["mariadb"] == 0 {
			w.WriteHeader(200)
			return
		}
	}

	// Verify actual connectivity with a trivial query.
	db, err := a.connect()
	if err != nil {
		httphelper.Error(w, err)
		return
	}
	defer db.Close()

	if _, err := db.Exec("SELECT 1"); err != nil {
		httphelper.Error(w, err)
		return
	}
	w.WriteHeader(200)
}