Beispiel #1
0
// serveH2r dials r.Host over TCP, streams the request body to that
// connection, and relays the raw bytes read back from it to the client while
// parsing them as a single HTTP response.
//
// Status handling:
//   - a recovered panic is answered with 500 and logged;
//   - a dial failure is answered with 501;
//   - on every other path the status is left to net/http, which sends 200
//     implicitly on the first Write or when the handler returns without
//     writing anything.
func (s *Server) serveH2r(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("REVERSE failed", zap.Object("err", err))
		}
		// Deliberately no WriteHeader(http.StatusOK) on the normal path: by
		// the time this runs the header has either been written explicitly
		// (501 below) or implicitly by the TeeReader's first Write, so a
		// second call would only be a superfluous WriteHeader.
	}()

	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dail failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()

	// Client -> remote. The copy runs until r.Body is exhausted or a write to
	// remote fails (e.g. after the deferred Close above).
	go io.Copy(remote, r.Body)
	//	go io.Copy(remote, io.TeeReader(r.Body, os.Stdout))

	// Remote -> client. Everything ReadResponse consumes from remote is
	// mirrored verbatim into w by the TeeReader.
	resr := io.TeeReader(remote, w)
	//	resr = io.TeeReader(resr, os.Stdout)
	res, err := http.ReadResponse(bufio.NewReader(resr), nil)
	if err != nil {
		return
	}
	if res.Body != nil {
		defer res.Body.Close()
		// Drain the body so that all of it passes through the TeeReader.
		io.Copy(ioutil.Discard, res.Body)
	}
}
Beispiel #2
0
// SyncFromFollowedPGRewind runs pg_rewind against the instance described by
// followedConnParams, using p.dataDir as the target data directory.
//
// The password is handed to pg_rewind through a temporary pgpass file
// referenced by PGPASSFILE; the file is removed before returning.
// It returns an error if the pgpass file cannot be created or written, or if
// pg_rewind fails (the error then carries the combined command output).
//
// NOTE(review): Set is called directly on followedConnParams; if ConnParams
// is a reference type the added "options" entry is visible to the caller.
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := followedConnParams.Get("host")
	port := followedConnParams.Get("port")
	user := followedConnParams.Get("user")
	// A failed write would leave pg_rewind with an empty or truncated
	// password file, so stop here instead of running the command.
	if _, err = fmt.Fprintf(pgpass, "%s:%s:*:%s:%s\n", host, port, user, password); err != nil {
		return fmt.Errorf("error writing pgpass file: %v", err)
	}

	// Disable synchronous commits. pg_rewind needs to create a
	// temporary table on the master but if synchronous replication is
	// enabled and there're no active standbys it will hang.
	followedConnParams.Set("options", "-c synchronous_commit=off")
	followedConnString := followedConnParams.ConnString()

	log.Info("running pg_rewind")
	name := filepath.Join(p.pgBinPath, "pg_rewind")
	cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)
	// cmd.Env starts out nil, so the child environment contains only PGPASSFILE.
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	out, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	log.Debug("cmd out", zap.String("out", string(out)))
	return nil
}
Beispiel #3
0
// SyncFromFollowed runs pg_basebackup against the instance described by
// followedConnParams, writing the backup into p.dataDir.
//
// The password is handed to pg_basebackup through a temporary pgpass file
// referenced by PGPASSFILE; the file is removed before returning.
// It returns an error if the pgpass file cannot be created or written, or if
// pg_basebackup fails (the error then carries the combined command output).
func (p *Manager) SyncFromFollowed(followedConnParams ConnParams) error {
	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := followedConnParams.Get("host")
	port := followedConnParams.Get("port")
	user := followedConnParams.Get("user")
	password := followedConnParams.Get("password")
	// A failed write would leave pg_basebackup with an empty or truncated
	// password file, so stop here instead of running the command.
	if _, err = fmt.Fprintf(pgpass, "%s:%s:*:%s:%s\n", host, port, user, password); err != nil {
		return fmt.Errorf("error writing pgpass file: %v", err)
	}

	log.Info("running pg_basebackup")
	name := filepath.Join(p.pgBinPath, "pg_basebackup")
	cmd := exec.Command(name, "-R", "-D", p.dataDir, "--host="+host, "--port="+port, "-U", user)
	// cmd.Env starts out nil, so the child environment contains only PGPASSFILE.
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	return nil
}
Beispiel #4
0
// newBot builds an MMJira from the "config.yaml" file in the working
// directory: it creates the logger and metrics registry, loads and parses the
// configuration, creates the mmcontroller and registers the HTTP routes.
//
// It panics if the configuration cannot be read or parsed, or if the
// controller cannot be created. Unless the configuration enables Debug, the
// logger level is raised to Error.
func newBot() (b MMJira) {
	b = MMJira{l: zap.NewJSON(zap.DebugLevel), reg: metrics.NewRegistry()}

	data, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		b.l.Panic("not able to read the file", zap.Error(err))
	}
	var config InstanceConfig
	if err = yaml.Unmarshal(data, &config); err != nil {
		// The failing operation is Unmarshal; the previous message said "marshal".
		b.l.Panic("not able to unmarshal the file", zap.Error(err))
	}
	b.c = &config
	if !b.c.Debug {
		b.l.SetLevel(zap.ErrorLevel)
	}

	mmpost, err := mmcontroller.NewController(b.c.MMicon, b.c.MMuser, b.c.Hooks, b.c.Debug, metrics.NewPrefixedChildRegistry(b.reg, "mmc."))
	if err != nil {
		panic(err)
	}
	b.m = mmpost
	b.l.Debug("outputting config", zap.Object("config", b.c))

	// HTTP routes.
	b.r = mux.NewRouter()
	b.r.HandleFunc("/", b.homeHandler)
	b.r.HandleFunc("/hooks/", b.getHandler).Methods("GET")
	b.r.HandleFunc("/hooks/{hookid}", b.postHandler).Methods("POST")
	b.r.Handle("/metrics", exp.ExpHandler(b.reg))
	b.r.HandleFunc("/config/", b.configGetHandler).Methods("GET")

	return b
}
Beispiel #5
0
// reader waits for one 6-byte response frame on conn.conn, publishes it on
// responses, and then requests a reconnect on conn.shouldReconnect.
//
// Frame layout as decoded below: byte 0 is the command (8 is expected; any
// other value is only logged), byte 1 is the status, bytes 2-5 are the
// big-endian identifier.
//
// If the connection fails before all 6 bytes have arrived, the error is
// logged, a reconnect is requested and nothing is sent on responses.
func (conn *Connection) reader(responses chan<- Response, logger zap.Logger) {
	buffer := make([]byte, 6)

	// A single Read may legally return fewer than len(buffer) bytes with a
	// nil error; io.ReadFull keeps reading until the frame is complete, so a
	// partially filled buffer is never decoded.
	n, err := io.ReadFull(conn.conn, buffer)
	if err != nil {
		logger.Info("APNS: Connection error before reading complete response",
			zap.Int("connectionId", conn.id),
			zap.Int("n", n),
			zap.Error(err),
		)
		conn.shouldReconnect <- true
		return
	}

	command := uint8(buffer[0])
	if command != 8 {
		logger.Info("APNS: Something went wrong in a connection - Command should have been 8 but it had other value instead",
			zap.Int("connectionId", conn.id),
			zap.Object("commandValue", command),
		)
	}

	resp := newResponse()
	resp.Identifier = binary.BigEndian.Uint32(buffer[2:6])
	resp.Status = uint8(buffer[1])
	responses <- resp
	conn.shouldReconnect <- true
}
Beispiel #6
0
// serveH2c dials r.Host over TCP and pipes data in both directions: the
// request body is copied to the remote connection in a background goroutine,
// and the remote connection (wrapped in a TryReader with the settings below)
// is copied back to the client through a flushWriter.
//
// A dial failure is answered with 501; otherwise a 200 header is flushed
// before any data is relayed. A recovered panic is logged and answered
// with 500.
func (s *Server) serveH2c(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if rec := recover(); rec != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("CONNECT failed", zap.Object("err", rec))
		}
	}()

	upstream, dialErr := net.DialTimeout("tcp", r.Host, 10*time.Second)
	if dialErr != nil {
		Log.Error("dail failed", zap.Error(dialErr), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer upstream.Close()

	client := &flushWriter{w}
	client.FlushHeader(http.StatusOK)

	// client -> upstream
	go io.Copy(upstream, r.Body)

	// upstream -> client
	io.Copy(client, &TryReader{
		c:        upstream,
		ignore:   3,
		maxRetry: 2,
		tryDur:   600 * time.Millisecond,
		timeout:  15 * time.Second,
	})
}
Beispiel #7
0
// handleInbox receives incoming chat messages from b.in in an endless loop.
//
// A nil value on the channel is treated as "channel closed" and reported via
// log.Fatal. Values that are not *bot.Message are ignored, as are messages
// dated before startedAt and messages not sent by adminID. Private chats are
// only logged; in any other chat the "/leave" command (optionally suffixed
// with "@"+botName) is dispatched to b.cmdLeave.
func (b *satpamBot) handleInbox() {
	for {
		incoming := <-b.in
		if incoming == nil {
			log.Fatal("handleInbox input channel is closed")
		}

		msg, isMessage := incoming.(*bot.Message)
		if !isMessage {
			continue
		}

		if msg.Date.Before(startedAt) {
			// ignore message that is received before the process started
			log.Debug("message before started at", zap.Object("msg", msg), zap.String("startedAt", startedAt.String()), zap.String("date", msg.Date.String()))
			continue
		}
		log.Debug("handleInbox got message", zap.Object("msg", msg))

		// Only the admin may talk to the bot.
		if msg.From.ID != adminID {
			continue
		}

		if msg.Chat.Type == bot.Private {
			log.Debug("Got private message", zap.Object("msg", msg))
			// TODO: handle private messages from the admin
			continue
		}

		// ## Handle Commands ##
		switch msg.Text {
		case "/leave", "/leave@" + botName:
			// The result is irrelevant here: this is the last step of the
			// iteration either way.
			b.cmdLeave(msg)
		}
	}
}
Beispiel #8
0
// start launches the database with "pg_ctl start -w" on p.dataDir, passing
// "-c unix_socket_directories=/tmp" via -o and appending any extra args.
// It returns a wrapped error when pg_ctl exits unsuccessfully.
func (p *Manager) start(args ...string) error {
	log.Info("starting database")

	pgCtl := filepath.Join(p.pgBinPath, "pg_ctl")
	pgCtlArgs := []string{"start", "-w", "-D", p.dataDir, "-o", "-c unix_socket_directories=/tmp"}
	pgCtlArgs = append(pgCtlArgs, args...)

	cmd := exec.Command(pgCtl, pgCtlArgs...)
	log.Debug("execing cmd", zap.Object("cmd", cmd))

	// TODO(sgotti) attaching a pipe to stdout/stderr makes the postgres
	// process executed by pg_ctl inherit its file descriptors, so cmd.Wait()
	// would block until they are closed (which only happens when postgres is
	// stopped) and this function would never return. To avoid this no output
	// is captured. If the output is needed, a way to get it without blocking
	// has to be found.
	runErr := cmd.Run()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("error: %v", runErr)
}
Beispiel #9
0
// serveWs upgrades the request through s.upgrader and, on success, wraps the
// resulting connection with NewWs (buffer size s.WsBufSize) and hands it to
// s.globalWsChan. A failed upgrade or a recovered panic is logged and
// answered with 500.
func (s *Server) serveWs(w http.ResponseWriter, r *http.Request) {
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		w.WriteHeader(http.StatusInternalServerError)
		Log.Error("serveWs error", zap.Object("err", rec))
	}()

	Log.Debug("websocket start")
	conn, upgradeErr := s.upgrader.Upgrade(w, r, nil)
	if upgradeErr == nil {
		Log.Debug("websocket ok")
		s.globalWsChan <- NewWs(conn, s.WsBufSize)
		return
	}

	w.WriteHeader(http.StatusInternalServerError)
	Log.Error("websocket failed", zap.Error(upgradeErr))
}
Beispiel #10
0
// Analyse receives a single Response from in and records it in the metrics
// registry: the "analyse.response.total" counter is incremented up front,
// then a per-project counter ending in ".ok" (status 200) or ".error" (any
// other status) is incremented, and the response is logged.
func (c *Controller) Analyse(in <-chan Response) {
	metrics.GetOrRegisterCounter("analyse.response.total", c.reg).Inc(1)

	resp := <-in

	outcome := ".ok"
	if resp.StatusCode != 200 {
		outcome = ".error"
	}
	metrics.GetOrRegisterCounter("analyse.response."+resp.Project+outcome, c.reg).Inc(1)

	c.l.Info("response received", zap.Object("response", resp))
}
Beispiel #11
0
// SyncFromFollowed runs pg_basebackup against the instance described by
// followedConnParams, writing the backup into p.dataDir.
//
// It works on a copy of followedConnParams. The password is removed from the
// connection string and handed to pg_basebackup through a temporary pgpass
// file referenced by PGPASSFILE; the file is removed before returning.
// It returns an error if the pgpass file cannot be created or written, or if
// pg_basebackup fails (the error then carries the combined command output).
func (p *Manager) SyncFromFollowed(followedConnParams ConnParams) error {
	fcp := followedConnParams.Copy()

	// ioutil.TempFile already creates files with 0600 permissions
	pgpass, err := ioutil.TempFile("", "pgpass")
	if err != nil {
		return err
	}
	defer os.Remove(pgpass.Name())
	defer pgpass.Close()

	host := fcp.Get("host")
	port := fcp.Get("port")
	user := fcp.Get("user")
	password := fcp.Get("password")
	// A failed write would leave pg_basebackup with an empty or truncated
	// password file, so stop here instead of running the command.
	if _, err = fmt.Fprintf(pgpass, "%s:%s:*:%s:%s\n", host, port, user, password); err != nil {
		return fmt.Errorf("error writing pgpass file: %v", err)
	}

	// Remove password from the params passed to pg_basebackup
	fcp.Del("password")

	// Disable synchronous commits. pg_basebackup calls
	// pg_start_backup()/pg_stop_backup() on the master but if synchronous
	// replication is enabled and there're no active standbys they will hang.
	fcp.Set("options", "-c synchronous_commit=off")
	followedConnString := fcp.ConnString()

	log.Info("running pg_basebackup")
	name := filepath.Join(p.pgBinPath, "pg_basebackup")
	cmd := exec.Command(name, "-R", "-D", p.dataDir, "-d", followedConnString)
	// cmd.Env starts out nil, so the child environment contains only PGPASSFILE.
	cmd.Env = append(cmd.Env, fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
	log.Debug("execing cmd", zap.Object("cmd", cmd))
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("error: %v, output: %s", err, string(out))
	}
	return nil
}
Beispiel #12
0
// postHandler handles POST /hooks/{hookid}.
//
// The lower-cased hookid must have a non-empty entry in b.c.Hooks, otherwise
// the request is counted as unknown and rejected with 400. For a known hook
// the per-hook counter is incremented, the request is optionally dumped to
// b.c.DumpDir (Debug only), an issue is created from the request body, and
// the issue is passed to b.m.Inform; the resulting channel is analysed in a
// background goroutine. A failure to create the issue is answered with 400.
func (b MMJira) postHandler(w http.ResponseWriter, r *http.Request) {
	vars := mux.Vars(r)
	hookid := strings.ToLower(vars["hookid"])
	b.l.Info("project", zap.String("hook", hookid))
	if b.c.Hooks[hookid] == "" {
		c := metrics.GetOrRegisterCounter("hooks.post.unknown.project", b.reg)
		c.Inc(1)
		http.Error(w, "unknwon project", http.StatusBadRequest)
		return
	}
	b.l.Debug("received a request")
	c := metrics.GetOrRegisterCounter("hooks.received."+hookid, b.reg)
	c.Inc(1)
	if b.c.Debug {
		// Dumping is best effort: a failure is logged (with its cause) and
		// the request is still processed.
		if err := utils.DumpRequest(r, b.c.DumpDir); err != nil {
			b.l.Info("unable to dump the request in the directory", zap.String("Directory", b.c.DumpDir), zap.Error(err))
		}
	}

	issue, err := b.m.Create(r.Body)
	if err != nil {
		http.Error(w, fmt.Sprint(err), http.StatusBadRequest)
		return
	}

	b.l.Debug("sending", zap.Object("issue", issue))

	ch := b.m.Inform(issue)
	go b.m.Analyse(ch)
}
Beispiel #13
0
// postgresKeeperSM runs one pass of the keeper state machine.
//
// In order, it:
//  1. fetches the cluster data through p.e.GetClusterData and validates its
//     format version;
//  2. looks up this keeper (by p.keeperLocalState.UID) and the db assigned to
//     it, and pushes the db's postgres parameters to pgm;
//  3. cleans up the data dir if the previous init/resync was left in the
//     Initializing state;
//  4. when the local db UID differs from the cluster data db UID, acts
//     according to db.Spec.InitMode (new, PITR, existing, none);
//  5. converges the instance to db.Spec.Role: as master it starts/promotes
//     and reconciles replication slots with db.Spec.Followers; as standby it
//     writes recovery.conf, starts, and resyncs from the followed db when the
//     system ID or timeline branch differs;
//  6. reloads postgres if the parameters changed and saves db.Generation in
//     the db local state.
//
// Nothing is returned: every failure is logged and, except where noted,
// aborts the pass.
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}
	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create postgres parameteres
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	// NOTE(review): all local state updates below are written through dbls;
	// they only reach p.dbLocalState if that field is a pointer - confirm.
	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here this means that the db initialization or
		// resync as failed so we have to clean up stale data
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log error getting instance state but go ahead.
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}

	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					// NOTE(review): the failing call is GetConfigFilePGParameters,
					// but the message talks about renaming postgresql.conf. The
					// same message is reused in the PITR and existing branches.
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}

			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}

			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true

			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	// pgParameters may have been replaced above by the values read back with
	// GetConfigFilePGParameters, so hand them to pgm again.
	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving systemd ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}

		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		if err != nil {
			log.Error("err", zap.Error(err))
			return
		}
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// There can be the possibility that this
				// database is on the same branch of the
				// current followed instance.
				// So we try to put it in recovery and then
				// check if it's on the same branch or force a
				// resync
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
				}

				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db as been synced or resync again with pg_rewind disabled. Will need to report this upstream.

				// Check timeline history
				// We need to update our pgState to avoid dealing with
				// an old pgState not reflecting the real state
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}

				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true

					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("err", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with followed instance

			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}
			needsResync := false
			tryPgrewind := false
			// If the db has a different systemdID then a resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}
			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db as been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true

				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams

			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed. Reconfiguring.", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		// A failed reload is only logged: the pass still goes on to save the
		// generation below.
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here, then all went well and we can update the db generation and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("err", zap.Error(err))
		return
	}
}
Beispiel #14
0
// GetPGState builds a snapshot of the local postgres instance state.
//
// The snapshot always carries the db UID and generation from the local state
// plus the configured listen address and port. When the instance is
// initialized it is additionally filled, step by step, with the unmanaged
// config-file parameters, the system data (system ID, timeline ID, xlog
// position) and the timeline history, and is finally marked Healthy.
//
// Only a failure of IsInitialized is returned as an error. A failure in any
// later step is logged and the snapshot gathered so far is returned with a
// nil error and Healthy left false.
func (p *PostgresKeeper) GetPGState(pctx context.Context) (*cluster.PostgresState, error) {
	// Just get one pgstate at a time to avoid exhausting available connections
	p.getPGStateMutex.Lock()
	defer p.getPGStateMutex.Unlock()

	state := &cluster.PostgresState{}

	p.localStateMutex.Lock()
	state.UID = p.dbLocalState.UID
	state.Generation = p.dbLocalState.Generation
	p.localStateMutex.Unlock()

	state.ListenAddress = p.pgListenAddress
	state.Port = p.pgPort

	isInit, err := p.pgm.IsInitialized()
	if err != nil {
		return nil, err
	}
	if !isInit {
		return state, nil
	}

	// Parameters from the config file, minus the ones in managedPGParameters.
	cfgParams, err := p.pgm.GetConfigFilePGParameters()
	if err != nil {
		log.Error("cannot get configured pg parameters", zap.Error(err))
		return state, nil
	}
	log.Debug("got configured pg parameters", zap.Object("pgParameters", cfgParams))
	unmanaged := common.Parameters{}
	for name, value := range cfgParams {
		if util.StringInSlice(managedPGParameters, name) {
			continue
		}
		unmanaged[name] = value
	}
	log.Debug("filtered out managed pg parameters", zap.Object("filteredPGParameters", unmanaged))
	state.PGParameters = unmanaged

	sysData, err := p.pgm.GetSystemData()
	if err != nil {
		log.Error("error getting pg state", zap.Error(err))
		return state, nil
	}
	state.SystemID = sysData.SystemID
	state.TimelineID = sysData.TimelineID
	state.XLogPos = sysData.XLogPos

	// if timeline <= 1 then no timeline history file exists.
	state.TimelinesHistory = cluster.PostgresTimelinesHistory{}
	if state.TimelineID > 1 {
		history, err := p.pgm.GetTimelinesHistory(state.TimelineID)
		if err != nil {
			log.Error("error getting timeline history", zap.Error(err))
			return state, nil
		}
		converted := cluster.PostgresTimelinesHistory{}
		for _, entry := range history {
			converted = append(converted, &cluster.PostgresTimelineHistory{
				TimelineID:  entry.TimelineID,
				SwitchPoint: entry.SwitchPoint,
				Reason:      entry.Reason,
			})
		}
		state.TimelinesHistory = converted
	}

	state.Healthy = true
	return state, nil
}
Beispiel #15
0
// updateKeepersStatus folds the received keepers info into a copy of the
// given cluster data and into a copy of the sentinel's keeper info
// histories, returning both copies.
//
// Infos reported for a different cluster UID are ignored. On the first run
// the infos are only recorded in the histories (with Seen set to false) and
// the cluster data copy is returned without further changes. On later runs
// only the infos considered updated are used to create missing keepers, to
// set/clean keeper and db errors and to refresh the db statuses; finally the
// healthy state of every keeper and db is recomputed.
func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo cluster.KeepersInfo, firstRun bool) (*cluster.ClusterData, KeeperInfoHistories) {
	// Operate on copies of the cluster data and of the histories.
	cd = cd.DeepCopy()
	histories := s.keeperInfoHistories.DeepCopy()

	// Discard the infos coming from keepers of another cluster.
	sameCluster := keepersInfo.DeepCopy()
	for _, info := range keepersInfo {
		if info.ClusterUID == cd.Cluster.UID {
			continue
		}
		delete(sameCluster, info.UID)
	}
	keepersInfo = sameCluster

	// First run: only record what was received, marked as not seen, and
	// leave the keepers' state untouched.
	if firstRun {
		for uid, info := range keepersInfo {
			histories[uid] = &KeeperInfoHistory{KeeperInfo: info, Seen: false}
		}
		return cd, histories
	}

	// Keep only the infos that were updated since they were last recorded.
	updated := keepersInfo.DeepCopy()
	for uid, info := range keepersInfo {
		prev, known := histories[uid]
		if !known {
			// Never recorded before: start tracking it.
			histories[uid] = &KeeperInfoHistory{KeeperInfo: info, Seen: true, Timer: timer.Now()}
			continue
		}
		log.Debug("kih", zap.Object("kih", prev))

		if prev.KeeperInfo.InfoUID != info.InfoUID {
			// A new info arrived: replace the recorded one.
			histories[uid] = &KeeperInfoHistory{KeeperInfo: info, Seen: true, Timer: timer.Now()}
			continue
		}

		// Same info as the recorded one: drop it when it was only
		// recorded on the first run, or when it has been unchanged for
		// longer than the sleep interval.
		if !prev.Seen || timer.Since(prev.Timer) > s.sleepInterval {
			delete(updated, info.UID)
		}
	}
	keepersInfo = updated

	// Register the keepers that reported an info but are not yet part of
	// the cluster data.
	for uid, info := range keepersInfo {
		if _, exists := cd.Keepers[uid]; exists {
			continue
		}
		keeper := cluster.NewKeeperFromKeeperInfo(info)
		cd.Keepers[keeper.UID] = keeper
	}

	// A keeper left without an info (after the cleanup above) is in error,
	// the others are not.
	for uid := range cd.Keepers {
		if _, reported := keepersInfo[uid]; reported {
			s.CleanKeeperError(uid)
		} else {
			s.SetKeeperError(uid)
		}
	}

	// Recompute the healthy state of every keeper.
	for _, keeper := range cd.Keepers {
		keeper.Status.Healthy = s.isKeeperHealthy(cd, keeper)
	}

	// Refresh the status of every db from the state reported by its keeper.
	for _, db := range cd.DBs {
		info, found := keepersInfo[db.Spec.KeeperUID]
		if !found {
			log.Error("no keeper info available", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
			s.SetDBError(db.UID)
			continue
		}

		reported := info.PostgresState
		if reported == nil {
			log.Error("no db state available", zap.String("db", db.UID))
			s.SetDBError(db.UID)
			continue
		}

		if reported.UID != db.UID {
			log.Warn("received db state for unexpected db uid", zap.String("receivedDB", reported.UID), zap.String("db", db.UID))
			s.SetDBError(db.UID)
			continue
		}

		log.Debug("received db state", zap.String("db", db.UID))
		db.Status.ListenAddress = reported.ListenAddress
		db.Status.Port = reported.Port
		db.Status.CurrentGeneration = reported.Generation

		if !reported.Healthy {
			s.SetDBError(db.UID)
			continue
		}

		// The remaining fields are copied only from a healthy state.
		s.CleanDBError(db.UID)
		db.Status.SystemID = reported.SystemID
		db.Status.TimelineID = reported.TimelineID
		db.Status.XLogPos = reported.XLogPos
		db.Status.TimelinesHistory = reported.TimelinesHistory
		db.Status.PGParameters = cluster.PGParameters(reported.PGParameters)
	}

	// Recompute the healthy state of every db.
	for _, db := range cd.DBs {
		db.Status.Healthy = s.isDBHealthy(cd, db)
	}

	return cd, histories
}
Beispiel #16
0
// updateCluster computes the next wanted cluster data starting from cd.
//
// All the changes are applied to a deep copy of cd, which is returned on
// success.
//
// In the initializing phase, depending on the cluster init mode (new,
// existing or PITR), it either chooses a keeper and creates the initial
// master db, or checks whether the already chosen master db has converged;
// when it did and it is healthy the cluster phase is switched to normal.
//
// In the normal phase it checks the current master: if it is unhealthy or
// failed to converge a new master is elected among the best candidates
// (restricted to the synchronous standbys when synchronous replication is
// enabled). When the master did not change and it is healthy and converged,
// the proxy is pointed to it, old masters and invalid dbs are removed,
// synchronous standbys are recomputed, standbys are added or removed
// according to MaxStandbysPerSender and the master followers are updated.
//
// Finally the cluster spec is propagated to the db specs and the Generation
// and ChangeTime of every db whose spec differs from the one in cd are
// updated.
//
// An error is returned when an initial keeper cannot be chosen, when the
// wanted keeper or the current master db are not available, and for an
// unknown init mode or cluster phase. It panics if, during initialization,
// the db referenced as master does not exist.
func (s *Sentinel) updateCluster(cd *cluster.ClusterData) (*cluster.ClusterData, error) {
	newcd := cd.DeepCopy()
	switch cd.Cluster.Status.Phase {
	case cluster.ClusterPhaseInitializing:
		switch *cd.Cluster.DefSpec().InitMode {
		case cluster.ClusterInitModeNew:
			// Is there already a keeper chosen to be the new master?
			if cd.Cluster.Status.Master == "" {
				log.Info("trying to find initial master")
				k, err := s.findInitialKeeper(cd)
				if err != nil {
					return nil, fmt.Errorf("cannot choose initial master: %v", err)
				}
				log.Info("initializing cluster", zap.String("keeper", k.UID))
				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModeNew,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				// Use the db from the copy: the spec changes below must end
				// up in the returned cluster data and must not touch the
				// caller's cd.
				db, ok := newcd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exists. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen for being the master has correctly initialized
				switch s.dbConvergenceState(db, cd.Cluster.DefSpec().InitTimeout.Duration) {
				case Converged:
					if db.Status.Healthy {
						log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
						// Set db initMode to none, not needed but just a security measure
						db.Spec.InitMode = cluster.DBInitModeNone
						// Don't include previous config anymore
						db.Spec.IncludeConfig = false
						// Replace reported pg parameters in cluster spec
						if *cd.Cluster.DefSpec().MergePgParameters {
							newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
						}
						// Cluster initialized, switch to Normal state
						newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
					}
				case Converging:
					log.Info("waiting for db", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
				case ConvergenceFailed:
					log.Info("db failed to initialize", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Empty DBs
					newcd.DBs = cluster.DBs{}
					// Unset master so another keeper can be chosen
					newcd.Cluster.Status.Master = ""
				}
			}
		case cluster.ClusterInitModeExisting:
			if cd.Cluster.Status.Master == "" {
				wantedKeeper := cd.Cluster.DefSpec().ExistingConfig.KeeperUID
				log.Info("trying to use keeper as initial master", zap.String("keeper", wantedKeeper))

				k, ok := cd.Keepers[wantedKeeper]
				if !ok {
					return nil, fmt.Errorf("keeper %q state not available", wantedKeeper)
				}

				log.Info("initializing cluster using selected keeper as master db owner", zap.String("keeper", k.UID))

				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModeExisting,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				db, ok := newcd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exists. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen for being the master has correctly initialized
				if db.Status.Healthy && s.dbConvergenceState(db, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == Converged {
					log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Don't include previous config anymore
					db.Spec.IncludeConfig = false
					// Replace reported pg parameters in cluster spec
					if *cd.Cluster.DefSpec().MergePgParameters {
						newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
					}
					// Cluster initialized, switch to Normal state
					newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
				}
			}
		case cluster.ClusterInitModePITR:
			// Is there already a keeper chosen to be the new master?
			if cd.Cluster.Status.Master == "" {
				log.Info("trying to find initial master")
				k, err := s.findInitialKeeper(cd)
				if err != nil {
					return nil, fmt.Errorf("cannot choose initial master: %v", err)
				}
				log.Info("initializing cluster using selected keeper as master db owner", zap.String("keeper", k.UID))
				db := &cluster.DB{
					UID:        s.UIDFn(),
					Generation: cluster.InitialGeneration,
					ChangeTime: time.Now(),
					Spec: &cluster.DBSpec{
						KeeperUID:     k.UID,
						InitMode:      cluster.DBInitModePITR,
						PITRConfig:    cd.Cluster.DefSpec().PITRConfig,
						Role:          common.RoleMaster,
						Followers:     []string{},
						IncludeConfig: *cd.Cluster.DefSpec().MergePgParameters,
					},
				}
				newcd.DBs[db.UID] = db
				newcd.Cluster.Status.Master = db.UID
				log.Debug("newcd dump", zap.String("newcd", spew.Sdump(newcd)))
			} else {
				// Use the db from the copy: the spec changes below must end
				// up in the returned cluster data and must not touch the
				// caller's cd.
				db, ok := newcd.DBs[cd.Cluster.Status.Master]
				if !ok {
					panic(fmt.Errorf("db %q object doesn't exists. This shouldn't happen", cd.Cluster.Status.Master))
				}
				// Check that the db chosen for being the master has correctly initialized
				// TODO(sgotti) set a timeout (the max time for a restore operation)
				switch s.dbConvergenceState(db, 0) {
				case Converged:
					if db.Status.Healthy {
						log.Info("db initialized", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
						// Set db initMode to none, not needed but just a security measure
						db.Spec.InitMode = cluster.DBInitModeNone
						// Don't include previous config anymore
						db.Spec.IncludeConfig = false
						// Replace reported pg parameters in cluster spec
						if *cd.Cluster.DefSpec().MergePgParameters {
							newcd.Cluster.Spec.PGParameters = db.Status.PGParameters
						}
						// Cluster initialized, switch to Normal state
						newcd.Cluster.Status.Phase = cluster.ClusterPhaseNormal
					}
				case Converging:
					log.Info("waiting for db to converge", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
				case ConvergenceFailed:
					log.Info("db failed to initialize", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					// Empty DBs
					newcd.DBs = cluster.DBs{}
					// Unset master so another keeper can be chosen
					newcd.Cluster.Status.Master = ""
				}
			}
		default:
			// InitMode is a pointer (it's dereferenced in the switch above):
			// dereference it here too so the mode, not an address, is reported.
			return nil, fmt.Errorf("unknown init mode %q", *cd.Cluster.DefSpec().InitMode)
		}
	case cluster.ClusterPhaseNormal:
		// TODO(sgotti) When keeper removal is implemented, remove DBs for unexistent keepers

		// Calculate current master status
		curMasterDBUID := cd.Cluster.Status.Master
		wantedMasterDBUID := curMasterDBUID

		masterOK := true
		curMasterDB := cd.DBs[curMasterDBUID]
		if curMasterDB == nil {
			return nil, fmt.Errorf("db for keeper %q not available. This shouldn't happen!", curMasterDBUID)
		}
		log.Debug("db dump", zap.String("db", spew.Sdump(curMasterDB)))

		if !curMasterDB.Status.Healthy {
			log.Info("master db is failed", zap.String("db", curMasterDB.UID), zap.String("keeper", curMasterDB.Spec.KeeperUID))
			masterOK = false
		}

		// Check that the wanted master is in master state (i.e. check that promotion from standby to master happened)
		if s.dbConvergenceState(curMasterDB, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == ConvergenceFailed {
			log.Info("db not converged", zap.String("db", curMasterDB.UID), zap.String("keeper", curMasterDB.Spec.KeeperUID))
			masterOK = false
		}

		if !masterOK {
			log.Info("trying to find a new master to replace failed master")
			bestNewMasters := s.findBestNewMasters(cd, curMasterDB)
			if len(bestNewMasters) == 0 {
				log.Error("no eligible masters")
			} else {
				// if synchronous replication is enabled, only choose new master in the synchronous replication standbys.
				var bestNewMasterDB *cluster.DB
				if *cd.Cluster.DefSpec().SynchronousReplication {
					onlyFake := true
					// if only fake synchronous standbys are defined we cannot choose any standby
					for _, dbUID := range curMasterDB.Spec.SynchronousStandbys {
						if dbUID != fakeStandbyName {
							onlyFake = false
						}
					}
					if !onlyFake {
						if !util.CompareStringSlice(curMasterDB.Status.SynchronousStandbys, curMasterDB.Spec.SynchronousStandbys) {
							log.Warn("cannot choose synchronous standby since the latest master reported synchronous standbys are different from the db spec ones", zap.Object("reported", spew.Sdump(curMasterDB.Status.SynchronousStandbys)), zap.Object("spec", spew.Sdump(curMasterDB.Spec.SynchronousStandbys)))
						} else {
							for _, nm := range bestNewMasters {
								if util.StringInSlice(curMasterDB.Spec.SynchronousStandbys, nm.UID) {
									bestNewMasterDB = nm
									break
								}
							}
						}
					}
				} else {
					bestNewMasterDB = bestNewMasters[0]
				}
				if bestNewMasterDB != nil {
					log.Info("electing db as the new master", zap.String("db", bestNewMasterDB.UID), zap.String("keeper", bestNewMasterDB.Spec.KeeperUID))
					wantedMasterDBUID = bestNewMasterDB.UID
				} else {
					log.Error("no eligible masters")
				}
			}
		}

		// New master elected
		if curMasterDBUID != wantedMasterDBUID {
			// maintain the current role, remove followers
			oldMasterdb := newcd.DBs[curMasterDBUID]
			oldMasterdb.Spec.Followers = []string{}

			newcd.Cluster.Status.Master = wantedMasterDBUID
			newMasterDB := newcd.DBs[wantedMasterDBUID]
			newMasterDB.Spec.Role = common.RoleMaster
			newMasterDB.Spec.FollowConfig = nil

			// Tell proxy that there's currently no active master
			newcd.Proxy.Spec.MasterDBUID = ""
			newcd.Proxy.ChangeTime = time.Now()

			// Setup synchronous standbys to the one of the previous master (replacing ourself with the previous master)
			if *cd.Cluster.DefSpec().SynchronousReplication {
				// Reset the list once, before the loop: resetting it at every
				// iteration would keep only the last synchronous standby.
				newMasterDB.Spec.SynchronousStandbys = []string{}
				for _, dbUID := range oldMasterdb.Spec.SynchronousStandbys {
					if dbUID != newMasterDB.UID {
						newMasterDB.Spec.SynchronousStandbys = append(newMasterDB.Spec.SynchronousStandbys, dbUID)
					} else {
						newMasterDB.Spec.SynchronousStandbys = append(newMasterDB.Spec.SynchronousStandbys, oldMasterdb.UID)
					}
				}
				if len(newMasterDB.Spec.SynchronousStandbys) == 0 {
					newMasterDB.Spec.SynchronousStandbys = []string{fakeStandbyName}
				}
			}
		}

		// TODO(sgotti) Wait for the proxies being converged (closed connections to old master)?

		// Setup standbys, do this only when there's no master change
		if curMasterDBUID == wantedMasterDBUID {
			masterDB := newcd.DBs[curMasterDBUID]
			// Set standbys to follow master only if it's healthy and converged
			if masterDB.Status.Healthy && s.dbConvergenceState(masterDB, cd.Cluster.DefSpec().ConvergenceTimeout.Duration) == Converged {
				// Tell proxy that there's a new active master
				newcd.Proxy.Spec.MasterDBUID = wantedMasterDBUID
				newcd.Proxy.ChangeTime = time.Now()

				// Remove old masters
				toRemove := []*cluster.DB{}
				for _, db := range newcd.DBs {
					if db.UID == wantedMasterDBUID {
						continue
					}
					if s.dbType(newcd, db.UID) != dbTypeMaster {
						continue
					}
					log.Info("removing old master db", zap.String("db", db.UID))
					toRemove = append(toRemove, db)
				}
				for _, db := range toRemove {
					delete(newcd.DBs, db.UID)
				}

				// Remove invalid dbs
				toRemove = []*cluster.DB{}
				for _, db := range newcd.DBs {
					if db.UID == wantedMasterDBUID {
						continue
					}
					if s.dbValidity(newcd, db.UID) != dbValidityInvalid {
						continue
					}
					log.Info("removing invalid db", zap.String("db", db.UID))
					toRemove = append(toRemove, db)
				}
				for _, db := range toRemove {
					delete(newcd.DBs, db.UID)
				}

				goodStandbys, failedStandbys, convergingStandbys := s.validStandbysByStatus(newcd)
				goodStandbysCount := len(goodStandbys)
				failedStandbysCount := len(failedStandbys)
				convergingStandbysCount := len(convergingStandbys)
				log.Debug("standbys states", zap.Int("good", goodStandbysCount), zap.Int("failed", failedStandbysCount), zap.Int("converging", convergingStandbysCount))

				// Setup synchronous standbys
				if *cd.Cluster.DefSpec().SynchronousReplication {
					// make a map of synchronous standbys starting from the current ones
					synchronousStandbys := map[string]struct{}{}
					for _, dbUID := range masterDB.Spec.SynchronousStandbys {
						// filter out fake standby
						if dbUID == fakeStandbyName {
							continue
						}
						synchronousStandbys[dbUID] = struct{}{}
					}

					// Check if the current synchronous standbys are healthy or remove them
					toRemove := map[string]struct{}{}
					for dbUID := range synchronousStandbys {
						if _, ok := goodStandbys[dbUID]; !ok {
							log.Info("removing failed synchronous standby", zap.String("masterDB", masterDB.UID), zap.String("db", dbUID))
							toRemove[dbUID] = struct{}{}
						}
					}
					for dbUID := range toRemove {
						delete(synchronousStandbys, dbUID)
					}

					// Remove synchronous standbys in excess
					if uint16(len(synchronousStandbys)) > *cd.Cluster.DefSpec().MaxSynchronousStandbys {
						rc := len(synchronousStandbys) - int(*cd.Cluster.DefSpec().MaxSynchronousStandbys)
						removedCount := 0
						toRemove = map[string]struct{}{}
						for dbUID := range synchronousStandbys {
							if removedCount >= rc {
								break
							}
							log.Info("removing synchronous standby in excess", zap.String("masterDB", masterDB.UID), zap.String("db", dbUID))
							toRemove[dbUID] = struct{}{}
							removedCount++
						}
						for dbUID := range toRemove {
							delete(synchronousStandbys, dbUID)
						}
					}

					// try to add missing standbys up to *cd.Cluster.DefSpec().MaxSynchronousStandbys
					bestStandbys := s.findBestStandbys(newcd, curMasterDB)
					ac := int(*cd.Cluster.DefSpec().MaxSynchronousStandbys) - len(synchronousStandbys)
					addedCount := 0
					for _, bestStandby := range bestStandbys {
						if addedCount >= ac {
							break
						}
						if _, ok := synchronousStandbys[bestStandby.UID]; ok {
							continue
						}
						log.Info("adding synchronous standby", zap.String("masterDB", masterDB.UID), zap.String("synchronousStandbyDB", bestStandby.UID))
						synchronousStandbys[bestStandby.UID] = struct{}{}
						addedCount++
					}

					// If there're not enough real synchronous standbys add a fake synchronous standby because we have to be strict and make the master block transactions until MinSynchronousStandbys real standbys are available
					if len(synchronousStandbys) < int(*cd.Cluster.DefSpec().MinSynchronousStandbys) {
						log.Info("using a fake synchronous standby since there are not enough real standbys available", zap.String("masterDB", masterDB.UID), zap.Int("required", int(*cd.Cluster.DefSpec().MinSynchronousStandbys)))
						synchronousStandbys[fakeStandbyName] = struct{}{}
					}

					masterDB.Spec.SynchronousStandbys = []string{}
					for dbUID := range synchronousStandbys {
						masterDB.Spec.SynchronousStandbys = append(masterDB.Spec.SynchronousStandbys, dbUID)
					}

					// Sort synchronousStandbys so we can compare the slice regardless of its order
					sort.Sort(sort.StringSlice(masterDB.Spec.SynchronousStandbys))
				}

				// NotFailed != Good since there can be some dbs that are converging
				// it's the total number of standbys - the failed standbys
				// or the sum of good + converging standbys
				notFailedStandbysCount := goodStandbysCount + convergingStandbysCount

				// Remove dbs in excess if we have a good number >= MaxStandbysPerSender
				if uint16(goodStandbysCount) >= *cd.Cluster.DefSpec().MaxStandbysPerSender {
					toRemove := []*cluster.DB{}
					// Remove all non good standbys
					for _, db := range newcd.DBs {
						if s.dbType(newcd, db.UID) != dbTypeStandby {
							continue
						}
						if _, ok := goodStandbys[db.UID]; !ok {
							log.Info("removing non good standby", zap.String("db", db.UID))
							toRemove = append(toRemove, db)
						}
					}
					// Remove good standbys in excess
					nr := int(uint16(goodStandbysCount) - *cd.Cluster.DefSpec().MaxStandbysPerSender)
					i := 0
					for _, db := range goodStandbys {
						if i >= nr {
							break
						}
						// Don't remove standbys marked as synchronous standbys
						if util.StringInSlice(masterDB.Spec.SynchronousStandbys, db.UID) {
							continue
						}
						log.Info("removing good standby in excess", zap.String("db", db.UID))
						toRemove = append(toRemove, db)
						i++
					}
					for _, db := range toRemove {
						delete(newcd.DBs, db.UID)
					}

				} else {
					// Add new dbs to substitute failed dbs. we
					// don't remove failed db until the number of
					// good db is >= MaxStandbysPerSender since they can come back

					// define, if there're available keepers, new dbs
					// nc can be negative if MaxStandbysPerSender has been lowered.
					// The subtraction is done on ints: doing it on uint16 values
					// would wrap around to a big positive number instead of
					// going negative.
					nc := int(*cd.Cluster.DefSpec().MaxStandbysPerSender) - notFailedStandbysCount
					// Add missing DBs until MaxStandbysPerSender
					freeKeepers := s.freeKeepers(newcd)
					nf := len(freeKeepers)
					for i := 0; i < nc && i < nf; i++ {
						freeKeeper := freeKeepers[i]
						db := &cluster.DB{
							UID:        s.UIDFn(),
							Generation: cluster.InitialGeneration,
							ChangeTime: time.Now(),
							Spec: &cluster.DBSpec{
								KeeperUID:    freeKeeper.UID,
								InitMode:     cluster.DBInitModeResync,
								Role:         common.RoleStandby,
								Followers:    []string{},
								FollowConfig: &cluster.FollowConfig{Type: cluster.FollowTypeInternal, DBUID: wantedMasterDBUID},
							},
						}
						newcd.DBs[db.UID] = db
						log.Info("added new standby db", zap.String("db", db.UID), zap.String("keeper", db.Spec.KeeperUID))
					}
				}

				// Reconfigure all standbys as followers of the current master
				for _, db := range newcd.DBs {
					if s.dbType(newcd, db.UID) != dbTypeStandby {
						continue
					}

					db.Spec.Role = common.RoleStandby
					// Remove followers
					db.Spec.Followers = []string{}
					db.Spec.FollowConfig = &cluster.FollowConfig{Type: cluster.FollowTypeInternal, DBUID: wantedMasterDBUID}
				}

				// Set followers for master DB
				masterDB.Spec.Followers = []string{}
				for _, db := range newcd.DBs {
					if masterDB.UID == db.UID {
						continue
					}
					fc := db.Spec.FollowConfig
					if fc != nil {
						if fc.Type == cluster.FollowTypeInternal && fc.DBUID == wantedMasterDBUID {
							masterDB.Spec.Followers = append(masterDB.Spec.Followers, db.UID)
						}
					}
				}
				// Sort followers so the slice won't be considered changed due to different order of the same entries.
				sort.Strings(masterDB.Spec.Followers)
			}
		}

	default:
		return nil, fmt.Errorf("unknown cluster phase %s", cd.Cluster.Status.Phase)
	}

	// Copy the clusterSpec parameters to the dbSpec
	s.setDBSpecFromClusterSpec(newcd)

	// Update generation on DBs if they have changed
	for dbUID, db := range newcd.DBs {
		prevDB, ok := cd.DBs[dbUID]
		if !ok {
			// db created in this run: it keeps its initial generation
			continue
		}
		if !reflect.DeepEqual(db.Spec, prevDB.Spec) {
			log.Debug("db spec changed, updating generation", zap.String("prevDB", spew.Sdump(prevDB.Spec)), zap.String("db", spew.Sdump(db.Spec)))
			db.Generation++
			db.ChangeTime = time.Now()
		}
	}

	return newcd, nil
}