示例#1
0
// repairCluster runs the cluster repair sequence: it lists the cluster hosts,
// builds a cluster fixer from them, kills the schedulers, runs the postgres
// fix, makes sure at least one controller instance is available (starting a
// controller "web" job when discoverd reports none) and finally runs the
// controller fix. On success the monitor's deadline is reset to the zero
// time. The first error returned by a checked step is passed back unchanged.
func (m *Monitor) repairCluster() error {
	logger := monitorLogger.New("fn", "repairCluster")
	logger.Info("initiating cluster repair")

	clusterHosts, err := m.c.Hosts()
	if err != nil {
		return err
	}

	clusterFixer := fixer.NewClusterFixer(clusterHosts, m.c, logger)

	// Stop the schedulers first so they cannot interfere with the repair.
	clusterFixer.KillSchedulers()

	// Make sure postgres is working before touching the controller.
	clusterFixer.FixPostgres()

	// Make sure the controller API is working. Any lookup error is discarded
	// here; an empty instance list is what triggers starting a new job.
	instances, _ := discoverd.NewService("controller").Instances()
	if len(instances) == 0 {
		if instances, err = clusterFixer.StartAppJob("controller", "web", "controller"); err != nil {
			return err
		}
	}

	// Fix any formations and start the scheduler again.
	if fixErr := clusterFixer.FixController(instances, true); fixErr != nil {
		return fixErr
	}

	// Repair finished: zero out the deadline timer.
	m.deadline = time.Time{}
	return nil
}
示例#2
0
文件: monitor.go 项目: imjorge/flynn
// repairCluster runs the cluster repair sequence:
//
//  1. list the cluster hosts and build a cluster fixer from them;
//  2. kill the schedulers so they do not interfere with the repair;
//  3. for each sirenia database ("postgres", then "mariadb") that has state
//     in discoverd, check it and attempt a fix if the check fails;
//  4. make sure at least one controller instance is available, starting a
//     controller "web" job when discoverd reports none;
//  5. run the controller fix.
//
// A failed postgres recovery aborts the repair and its error is returned. A
// failed recovery of any other database is only logged (including the
// underlying error) and the repair continues. On success the monitor's
// deadline is reset to the zero time.
func (m *Monitor) repairCluster() error {
	log := monitorLogger.New("fn", "repairCluster")
	log.Info("initiating cluster repair")
	hosts, err := m.c.Hosts()
	if err != nil {
		return err
	}
	f := fixer.NewClusterFixer(hosts, m.c, log)
	// killing the schedulers to prevent interference
	f.KillSchedulers()

	log.Info("checking status of sirenia databases")
	for _, db := range []string{"postgres", "mariadb"} {
		log.Info("checking for database state", "db", db)
		if _, err := discoverd.NewService(db).GetMeta(); err != nil {
			if discoverd.IsNotFound(err) {
				// No metadata in discoverd means there is nothing to recover.
				log.Info("skipping recovery of db, no state in discoverd", "db", db)
				continue
			}
			log.Error("error checking database state", "db", db, "err", err)
			return err
		}
		if err := f.CheckSirenia(db); err != nil {
			// The check failed, so attempt a recovery of this database.
			if err := f.FixSirenia(db); err != nil {
				if db == "postgres" {
					return err
				}
				// Other databases are best-effort: record why the recovery
				// failed and carry on with the rest of the repair.
				log.Error("failed database recovery", "db", db, "err", err)
			}
		}
	}

	// ensure controller api is working; any lookup error is discarded here,
	// an empty instance list is what triggers starting a new job
	controllerService := discoverd.NewService("controller")
	controllerInstances, _ := controllerService.Instances()
	if len(controllerInstances) == 0 {
		controllerInstances, err = f.StartAppJob("controller", "web", "controller")
		if err != nil {
			return err
		}
	}
	// fix any formations and start the scheduler again
	if err := f.FixController(controllerInstances, true); err != nil {
		return err
	}
	// zero out the deadline timer
	m.deadline = time.Time{}
	return nil
}