Example #1
func newBot() (b MMJira) {

	b = MMJira{l: zap.NewJSON(zap.DebugLevel), reg: metrics.NewRegistry()}
	data, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		b.l.Panic("not able to read the file", zap.Error(err))
	}
	var config InstanceConfig
	if err = yaml.Unmarshal(data, &config); err != nil {
		b.l.Panic("not able to marshal the file", zap.Error(err))
	}
	b.c = &config
	if !b.c.Debug {
		b.l.SetLevel(zap.ErrorLevel)
	}
	mmpost, err := mmcontroller.NewController(b.c.MMicon, b.c.MMuser, b.c.Hooks, b.c.Debug, metrics.NewPrefixedChildRegistry(b.reg, "mmc."))
	if err != nil {
		panic(err)
	}

	b.m = mmpost
	b.l.Debug("outputting config", zap.Object("config", b.c))
	b.r = mux.NewRouter()
	b.r.HandleFunc("/", b.homeHandler)
	b.r.HandleFunc("/hooks/", b.getHandler).Methods("GET")
	b.r.HandleFunc("/hooks/{hookid}", b.postHandler).Methods("POST")
	b.r.Handle("/metrics", exp.ExpHandler(b.reg))
	b.r.HandleFunc("/config/", b.configGetHandler).Methods("GET")

	return b

}
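The constructor above targets the pre-1.0 zap API: zap.NewJSON builds a JSON logger, levels are mutable via SetLevel, and loggers are passed by value. As a minimal sketch of just that logger setup, reusing only calls that already appear in these examples and assuming the old github.com/uber-go/zap import path:

package main

import "github.com/uber-go/zap"

func main() {
	// Build a JSON logger at debug level, as newBot does.
	l := zap.NewJSON(zap.DebugLevel)
	l.Debug("reading config", zap.String("path", "config.yaml"))

	// Quiet the logger once verbose output is no longer wanted.
	l.SetLevel(zap.ErrorLevel)
	l.Error("only errors are emitted now")
}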
Example #2
func (conn *Connection) reader(responses chan<- Response, logger zap.Logger) {
	buffer := make([]byte, 6)
	for {
		n, err := conn.conn.Read(buffer)
		if err != nil && n < 6 {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Int("n", n),
				zap.Error(err),
			)
			conn.shouldReconnect <- true
			return
		} else if err != nil {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Error(err),
			)
		}
		command := buffer[0]
		if command != 8 {
			logger.Info("APNS: Something went wrong in a connection - command should have been 8 but had another value instead",
				zap.Int("connectionId", conn.id),
				zap.Object("commandValue", command),
			)
		}
		}
		resp := newResponse()
		resp.Identifier = binary.BigEndian.Uint32(buffer[2:6])
		resp.Status = uint8(buffer[1])
		responses <- resp
		conn.shouldReconnect <- true
		return
	}
}
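The 6-byte buffer read above is the APNS error-response frame: one command byte (always 8), one status byte, and a 4-byte big-endian notification identifier. A self-contained sketch of that decoding against a hard-coded sample frame:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// command 8, status 7, identifier 42
	buffer := []byte{8, 7, 0, 0, 0, 42}
	command := buffer[0]
	status := buffer[1]
	identifier := binary.BigEndian.Uint32(buffer[2:6])
	fmt.Println(command, status, identifier) // 8 7 42
}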
Example #3
func (s *Server) Serve() error {
	s.globalWsChan = make(chan *Ws)
	if s.H2SleepToRunSecond == 0 {
		s.H2SleepToRunSecond = 2
	}
	if s.H2BufSize == 0 {
		s.H2BufSize = 64 << 10
	}
	if s.WsBufSize == 0 {
		s.WsBufSize = 65 << 10
	}
	if s.PingSecond == 0 {
		s.PingSecond = 45
	}
	s.upgrader.ReadBufferSize = s.WsBufSize
	s.upgrader.WriteBufferSize = s.WsBufSize

	if s.H2RetryMaxSecond == 0 {
		s.H2RetryMaxSecond = 30
	}

	info, err := json.Marshal(map[string]interface{}{
		"PingSecond": s.PingSecond,
	})
	if err != nil {
		Log.Error("compute server info", zap.Error(err))
		return err
	}

	if s.TCP == 0 {
		s.dbox, err = newDropbox(s.DropboxAccessToken, s.DropboxDomainKey)
		if err != nil {
			Log.Error("create dropbox client", zap.Error(err))
			return err
		}
	}

	if _, err = s.loadPac(); err != nil {
		return err
	}

	s.info = info

	if s.TCP == 0 {
		s.challengeProvider = new(wrapperChallengeProvider)
		s.httpServer = s.newHttpServer()

		go s.listenAndServeH2All()
		return s.httpServer.ListenAndServe()
	}

	s.listenAndServeH2All()
	return errors.New("TCP server failed")
}
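Serve leans on the "zero value means use the default" configuration idiom: every field still at its zero value is filled in before first use. The pattern in isolation, with a hypothetical two-field Server:

package main

import "fmt"

type Server struct {
	PingSecond int
	WsBufSize  int
}

func (s *Server) applyDefaults() {
	if s.PingSecond == 0 {
		s.PingSecond = 45
	}
	if s.WsBufSize == 0 {
		s.WsBufSize = 65 << 10
	}
}

func main() {
	s := &Server{WsBufSize: 32 << 10}
	s.applyDefaults()
	fmt.Println(s.PingSecond, s.WsBufSize) // 45 32768
}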
Example #4
func (conn *Connection) connect(logger zap.Logger) error {
	if conn.conn != nil {
		conn.conn.Close()
	}
	if conn.connAux != nil {
		conn.connAux.Close()
	}

	var cert tls.Certificate
	var err error
	if len(conn.CertificateBase64) == 0 && len(conn.KeyBase64) == 0 {
		// The user did not specify raw block contents, so check the filesystem.
		cert, err = tls.LoadX509KeyPair(conn.CertificateFile, conn.KeyFile)
	} else {
		// The user provided the raw block contents, so use that.
		cert, err = tls.X509KeyPair([]byte(conn.CertificateBase64), []byte(conn.KeyBase64))
	}

	if err != nil {
		logger.Fatal("APNS: Failed to obtain certificate",
			zap.Error(err),
		)
		return err
	}

	conf := &tls.Config{
		Certificates: []tls.Certificate{cert},
		ServerName:   strings.Split(conn.Gateway, ":")[0],
	}

	connAux, err := net.Dial("tcp", conn.Gateway)
	if err != nil {
		logger.Fatal("APNS: Failed while dialing gateway",
			zap.String("gateway", conn.Gateway),
			zap.Error(err),
		)
		return err
	}
	tlsConn := tls.Client(connAux, conf)
	err = tlsConn.Handshake()
	if err != nil {
		logger.Fatal("APNS: Failed while handshaking",
			zap.Error(err),
		)
		_ = tlsConn.Close()
		return err
	}
	conn.conn = tlsConn
	conn.connAux = connAux
	//Start reader goroutine
	go conn.reader(conn.responses, logger)
	return nil
}
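The connect sequence is plain crypto/tls: load a key pair, dial TCP, wrap the connection with tls.Client, and handshake explicitly so failures surface immediately instead of on first write. A compile-ready sketch with placeholder gateway and file names (running it needs real certificates and a reachable host):

package main

import (
	"crypto/tls"
	"log"
	"net"
	"strings"
)

func main() {
	gateway := "gateway.push.example.com:2195" // placeholder address
	cert, err := tls.LoadX509KeyPair("cert.pem", "key.pem")
	if err != nil {
		log.Fatal(err)
	}
	conf := &tls.Config{
		Certificates: []tls.Certificate{cert},
		ServerName:   strings.Split(gateway, ":")[0], // SNI must match the host
	}
	raw, err := net.Dial("tcp", gateway)
	if err != nil {
		log.Fatal(err)
	}
	tlsConn := tls.Client(raw, conf)
	if err := tlsConn.Handshake(); err != nil {
		log.Fatal(err)
	}
	defer tlsConn.Close()
	log.Println("handshake complete")
}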
Example #5
func (s *Server) newH2TlsConfig() (*tls.Config, error) {
	if s.TCP != 0 {
		// 1. LoadServerCert
		cert, err := tls.LoadX509KeyPair("server.crt", "server.key")
		if err != nil {
			Log.Error("loading server certificate", zap.Error(err))
			return nil, err
		}

		// 2. LoadCACert
		caCert, err := ioutil.ReadFile("chain.pem")
		if err != nil {
			Log.Error("loading CA certificate", zap.Error(err))
			return nil, err
		}
		caPool := x509.NewCertPool()
		caPool.AppendCertsFromPEM(caCert)

		config := tls.Config{
			Certificates: []tls.Certificate{cert},
			ClientCAs:    caPool,
			ClientAuth:   tls.RequireAndVerifyClientCert,
			MinVersion:   tls.VersionTLS12,
			NextProtos:   []string{http2.NextProtoTLS},
		}
		return &config, nil
	}

	w, err := acmewrapper.New(acmewrapper.Config{
		Domains: []string{s.AcmeDomain},

		TLSCertFile: fmt.Sprintf("/%s/%s", s.AcmeDomain, "cert.pem"),
		TLSKeyFile:  fmt.Sprintf("/%s/%s", s.AcmeDomain, "key.pem"),

		RegistrationFile: fmt.Sprintf("/%s/%s", s.AcmeDomain, "user.reg"),
		PrivateKeyFile:   fmt.Sprintf("/%s/%s", s.AcmeDomain, "user.pem"),

		TOSCallback: acmewrapper.TOSAgree,

		HTTP01ChallengeProvider: s.challengeProvider,

		SaveFileCallback: s.dbox.SaveFile,
		LoadFileCallback: s.dbox.LoadFile,
	})
	if err != nil {
		Log.Error("acmewrapper failed", zap.Error(err))
		return nil, err
	}
	return w.TLSConfig(), nil
}
Example #6
func (c *Connection) spinUntilReconnect(logger zap.Logger) {
	var backoff = 100 * time.Millisecond
	for {
		logger.Info("APNS: Connection lost. Reconnecting",
			zap.Int("connectionId", c.id),
		)
		err := c.connect(logger)
		if err != nil {
			//Exponential backoff up to a limit
			logger.Info("APNS: Error connecting to server",
				zap.Int("connectionId", c.id),
				zap.Error(err),
			)
			backoff = backoff * 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
			time.Sleep(backoff)
		} else {
			backoff = 100 * time.Millisecond
			logger.Info("APNS: New connection established",
				zap.Int("connectionId", c.id),
			)
			break
		}
	}
}
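spinUntilReconnect implements capped exponential backoff. A runnable sketch of just the backoff arithmetic, with the cap (the package-level maxBackoff in the original) assumed to be ten seconds here:

package main

import (
	"fmt"
	"time"
)

const maxBackoff = 10 * time.Second // stand-in for the original constant

func main() {
	backoff := 100 * time.Millisecond
	for i := 0; i < 8; i++ {
		fmt.Println(backoff) // 100ms, 200ms, ... capped at 10s
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}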
Example #7
func (s *Server) newH2Server(tlsConfig *tls.Config, laddr string) (*http.Server, func(error)) {
	var mu sync.Mutex
	var h2sleep time.Duration = 1
	h2Server := &http.Server{
		Addr:      laddr,
		Handler:   http.HandlerFunc(s.serveH2),
		TLSConfig: tlsConfig,
		ConnState: func(c net.Conn, s http.ConnState) {
			if s == http.StateNew {
				mu.Lock()
				h2sleep = 1
				mu.Unlock()
			}
		},
	}
	http2.ConfigureServer(h2Server, &http2.Server{
		MaxReadFrameSize: s.H2BufSize,
	})

	afterServeError := func(err error) {
		Log.Error("h2 server failed", zap.Error(err))
		mu.Lock()
		if h2sleep < s.H2RetryMaxSecond {
			h2sleep++
		}
		sec := h2sleep
		mu.Unlock()
		time.Sleep(time.Second * sec)
	}
	return h2Server, afterServeError
}
Example #8
func (s *Server) listenAndServeH2(tlsConfig *tls.Config, tcp bool) {
	var laddr string
	var tlsListener net.Listener
	var err error
	if tcp {
		if s.TCP == 0 {
			return
		}
		laddr = fmt.Sprintf(":%d", s.TCP)
		tlsListener, err = tls.Listen("tcp", laddr, tlsConfig)
		if err != nil {
			Log.Error("tcp tlsListener failed", zap.Error(err))
			return
		}
	} else {
		laddr = ":8444"
		tlsListener = tls.NewListener(newGlobalWsListener(s.globalWsChan), tlsConfig)
	}

	h2Server, afterServeError := s.newH2Server(tlsConfig, laddr)
	for {
		err = h2Server.Serve(tlsListener)
		afterServeError(err)
	}
}
Example #9
func (s *Server) serveH2r(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("REVERSE failed", zap.Object("err", err))
		} else {
			w.WriteHeader(http.StatusOK)
		}
	}()

	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dail failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()

	go io.Copy(remote, r.Body)
	//	go io.Copy(remote, io.TeeReader(r.Body, os.Stdout))
	resr := io.TeeReader(remote, w)
	//	resr = io.TeeReader(resr, os.Stdout)
	res, err := http.ReadResponse(bufio.NewReader(resr), nil)
	if err != nil {
		return
	}
	if res.Body != nil {
		defer res.Body.Close()
		io.Copy(ioutil.Discard, res.Body)
	}
}
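The io.TeeReader is the interesting part: every byte the HTTP parser reads from the remote is simultaneously written to the client, so the proxy relays the raw response without buffering it itself. A self-contained demonstration against an in-memory response:

package main

import (
	"bufio"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"strings"
)

func main() {
	raw := "HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nhi"
	// Everything the parser reads is mirrored to stdout, just as the proxy
	// mirrors the remote's bytes into the ResponseWriter.
	resr := io.TeeReader(strings.NewReader(raw), os.Stdout)
	res, err := http.ReadResponse(bufio.NewReader(resr), nil)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	io.Copy(ioutil.Discard, res.Body) // drain so the tee sees the whole reply
	fmt.Println("\nstatus:", res.Status)
}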
Example #10
func (s *Server) serveH2c(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("CONNECT failed", zap.Object("err", err))
		}
	}()
	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dail failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()

	fw := &flushWriter{w}
	fw.FlushHeader(http.StatusOK)
	go io.Copy(remote, r.Body)
	srcRemote := &TryReader{
		c:        remote,
		ignore:   3,
		maxRetry: 2,
		tryDur:   time.Millisecond * 600,
		timeout:  time.Second * 15,
	}
	io.Copy(fw, srcRemote)
}
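flushWriter and TryReader are package-local types. A plausible shape for flushWriter (a hypothetical reconstruction, not the repository's actual code) wraps the ResponseWriter and flushes after the header and after every write, so tunneled bytes reach the client immediately:

package main

import (
	"fmt"
	"net/http"
)

// hypothetical reconstruction of the package-local flushWriter used above
type flushWriter struct{ w http.ResponseWriter }

func (fw *flushWriter) FlushHeader(code int) {
	fw.w.WriteHeader(code)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush()
	}
}

func (fw *flushWriter) Write(p []byte) (int, error) {
	n, err := fw.w.Write(p)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush() // push tunneled bytes to the client right away
	}
	return n, err
}

func main() {
	http.HandleFunc("/stream", func(w http.ResponseWriter, r *http.Request) {
		fw := &flushWriter{w}
		fw.FlushHeader(http.StatusOK)
		fmt.Fprintln(fw, "streaming...")
	})
	http.ListenAndServe(":8080", nil)
}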
Example #11
func (s *Sentinel) electionLoop() {
	for {
		log.Info("Trying to acquire sentinels leadership")
		electedCh, errCh := s.candidate.RunForElection()
		for {
			select {
			case elected := <-electedCh:
				s.leaderMutex.Lock()
				if elected {
					log.Info("sentinel leadership acquired")
					s.leader = true
					s.leadershipCount++
				} else {
					if s.leader {
						log.Info("sentinel leadership lost")
					}
					s.leader = false
				}
				s.leaderMutex.Unlock()

			case err := <-errCh:
				if err != nil {
					log.Error("election loop error", zap.Error(err))
				}
				goto end
			case <-s.stop:
				log.Debug("stopping election Loop")
				return
			}
		}
	end:
		time.Sleep(10 * time.Second)
	}
}
Example #12
func main() {
	logLevel := zap.LevelFlag("v", zap.InfoLevel, "log level: all, debug, info, warn, error, panic, fatal, none")
	flag.StringVar(&botName, "botname", "satpam_bot", "bot name")
	flag.StringVar(&adminID, "admin", "", "admin id")
	flag.Parse()

	// setup logger
	log.SetLevel(*logLevel)
	bot.SetLogger(log)
	log.Info("STARTED", zap.String("version", VERSION), zap.String("buildtime", BUILDTIME))

	key := os.Getenv("TELEGRAM_KEY")
	if key == "" {
		log.Fatal("TELEGRAM_KEY can not be empty")
	}

	startedAt = time.Now()
	telegram := bot.NewTelegram(key)
	plugin := satpamBot{t: telegram}
	if err := telegram.AddPlugin(&plugin); err != nil {
		log.Fatal("Failed AddPlugin", zap.Error(err))
	}
	plugin.start()
	telegram.Start()

}
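main wires the log level to a -v flag through zap.LevelFlag. A minimal sketch of that wiring, again assuming the pre-1.0 zap API these examples use:

package main

import (
	"flag"

	"github.com/uber-go/zap"
)

func main() {
	logLevel := zap.LevelFlag("v", zap.InfoLevel, "log level: debug, info, warn, error")
	flag.Parse()

	log := zap.NewJSON(*logLevel) // honor the flag from startup
	log.Info("logger configured")
}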
Example #13
func (c *ClusterChecker) Start() error {
	endPollonProxyCh := make(chan error)
	checkCh := make(chan error)
	timerCh := time.NewTimer(0).C

	for {
		select {
		case <-timerCh:
			go func() {
				checkCh <- c.Check()
			}()
		case err := <-checkCh:
			if err != nil {
				log.Debug("check reported error", zap.Error(err))
				return fmt.Errorf("checker fatal error: %v", err)
			}
			timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C
		case err := <-endPollonProxyCh:
			if err != nil {
				return fmt.Errorf("proxy error: %v", err)
			}
		}
	}
}
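Start uses a re-armed timer channel rather than a ticker: the check runs in a goroutine, and the timer is only reset after the result arrives, so checks can never overlap. The pattern reduced to a runnable sketch, with a stub in place of c.Check:

package main

import (
	"fmt"
	"time"
)

func main() {
	checkCh := make(chan error)
	timerCh := time.NewTimer(0).C // fire the first check immediately

	for done := 0; done < 3; {
		select {
		case <-timerCh:
			go func() {
				checkCh <- nil // the real code runs c.Check() here
			}()
		case err := <-checkCh:
			fmt.Println("check finished:", err)
			done++
			// Re-arm only after the check completed, so runs never overlap.
			timerCh = time.NewTimer(10 * time.Millisecond).C
		}
	}
}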
Example #14
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	defer p.pgStateMutex.Unlock()
	pgState, err := p.GetPGState(pctx)
	if err != nil {
		log.Error("failed to get pg state", zap.Error(err))
		return
	}
	p.lastPGState = pgState
}
Example #15
func (cg *ConsumerGroup) topicListConsumer(topics []string, logger zap.Logger) {
	for {
		select {
		case <-cg.stopper:
			return
		default:
		}

		consumers, consumerChanges, err := cg.group.WatchInstances()
		if err != nil {
			logger.Fatal("KAFKA: FAILED to get list of registered consumer instances for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			return
		}

		cg.consumers = consumers
		logger.Info("KAFKA: Got currently registered consumers for replica",
			zap.Int("replicaId", cg.replicaId),
			zap.Int("numRegisteredConsumers", len(cg.consumers)),
		)

		stopper := make(chan struct{})

		for _, topic := range topics {
			cg.wg.Add(1)
			go cg.topicConsumer(topic, cg.messages, cg.errors, stopper, logger)
		}

		select {
		case <-cg.stopper:
			close(stopper)
			return

		case event := <-consumerChanges:
			if event.Err == zk.ErrSessionExpired || event.Err == zk.ErrConnectionClosed {
				logger.Info("KAFKA: Session was expired, reloading consumer for replica",
					zap.Int("replicaId", cg.replicaId),
				)
				go cg.reload(logger)
				<-cg.stopper
				close(stopper)
				return
			} else {
				logger.Info("KAFKA: Triggering rebalance due to consumer list change in replica",
					zap.Int("replicaId", cg.replicaId),
				)
				close(stopper)
				cg.wg.Wait()
			}
		}
	}
}
Example #16
func (s *Server) tryLoadPac() []byte {
	ps, err := s.dbox.LoadPlainFile("/bricks.pac")
	if err != nil {
		Log.Error("load pac from dropbox", zap.Error(err))
		s.muPac.RLock()
		defer s.muPac.RUnlock()
		return s.pac
	}
	s.muPac.Lock()
	defer s.muPac.Unlock()
	s.pac = ps
	return s.pac
}
Example #17
func (s *Server) loadPac() ([]byte, error) {
	ps, err := ioutil.ReadFile("bricks.pac")
	if err != nil {
		ps, err = s.dbox.LoadPlainFile("/bricks.pac")
	}
	if err != nil {
		Log.Error("load pac from dropbox", zap.Error(err))
		return nil, err
	}
	s.muPac.Lock()
	defer s.muPac.Unlock()
	s.pac = ps
	return ps, nil
}
Example #18
func (cg *ConsumerGroup) Close(logger zap.Logger) error {
	shutdownError := AlreadyClosing
	cg.singleShutdown.Do(func() {
		defer cg.kazoo.Close()

		shutdownError = nil
		close(cg.stopper)
		cg.wg.Wait()
		if err := cg.offsetManager.Close(logger); err != nil {
			logger.Error("KAFKA: FAILED closing the offset manager for replica!",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}
		if shutdownError = cg.instance.Deregister(); shutdownError != nil {
			logger.Warn("KAFKA: Replica FAILED deregistering consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		} else {
			logger.Info("KAFKA: Replica deregistered consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.String("instanceId", cg.instance.ID),
			)
		}
		if shutdownError = cg.consumer.Close(); shutdownError != nil {
			logger.Error("Replica FAILED closing the Sarama client",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		}
		close(cg.messages)
		close(cg.errors)
		cg.instance = nil
	})
	return shutdownError
}
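Close leans on sync.Once so concurrent shutdowns are safe: exactly one caller performs the teardown, and every other caller gets AlreadyClosing back. The idiom in miniature:

package main

import (
	"errors"
	"fmt"
	"sync"
)

var AlreadyClosing = errors.New("already closing")

type group struct{ once sync.Once }

func (g *group) Close() error {
	err := AlreadyClosing
	g.once.Do(func() {
		err = nil // the real code closes channels and deregisters here
	})
	return err
}

func main() {
	g := &group{}
	fmt.Println(g.Close()) // <nil>
	fmt.Println(g.Close()) // already closing
}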
Example #19
func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind, started bool) error {
	pgm := p.pgm
	if started {
		if err := pgm.Stop(true); err != nil {
			return fmt.Errorf("failed to stop pg instance: %v", err)
		}
	}
	replConnParams := p.getReplConnParams(db, followedDB)
	standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}

	// TODO(sgotti) Actually we don't check if pg_rewind is installed or if
	// postgresql version is > 9.5 since someone can also use an externally
	// installed pg_rewind for postgres 9.4. If a pg_rewind executable
	// doesn't exist, pgm.SyncFromFollowedPGRewind will return an error and
	// we fall back to pg_basebackup
	if tryPgrewind && p.usePgrewind(db) {
		connParams := p.getSUConnParams(db, followedDB)
		log.Info("syncing using pg_rewind", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID))
		if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
			// log pg_rewind error and fallback to pg_basebackup
			log.Error("error syncing with pg_rewind", zap.Error(err))
		} else {
			if err := pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
				return fmt.Errorf("err: %v", err)
			}
			return nil
		}
	}

	if err := pgm.RemoveAll(); err != nil {
		return fmt.Errorf("failed to remove the postgres data dir: %v", err)
	}
	if log.Level() >= zap.DebugLevel {
		log.Debug("syncing from followed db", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID), zap.String("replConnParams", fmt.Sprintf("%v", replConnParams)))
	} else {
		log.Info("syncing from followed db", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID))
	}
	if err := pgm.SyncFromFollowed(replConnParams); err != nil {
		return fmt.Errorf("sync error: %v", err)
	}
	log.Info("sync succeeded")

	if err := pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
		return fmt.Errorf("err: %v", err)
	}
	return nil
}
Example #20
func (s *Server) serveWs(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("serveWs error", zap.Object("err", err))
		}
	}()
	Log.Debug("websocket start")
	ws, err := s.upgrader.Upgrade(w, r, nil)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		Log.Error("websocket failed", zap.Error(err))
		return
	}
	Log.Debug("websocket ok")
	s.globalWsChan <- NewWs(ws, s.WsBufSize)
}
Example #21
//Start initiates a connection to APNS and asynchronously sends notifications which have been queued.
func (conn *Connection) Start(logger zap.Logger) error {
	//Connect to APNS. Connecting here as well as in sender surfaces any unavoidable errors synchronously, while sender can reconnect after temporary errors (which should work most of the time).
	err := conn.connect(logger)
	if err != nil {
		logger.Fatal("APNS: Failed to connect",
			zap.Int("connectionId", conn.id),
			zap.Error(err),
		)
		return err
	}
	//Start sender goroutine
	sent := make(chan PushNotification, 10000)
	go conn.sender(conn.queue, sent, logger)
	//Start limbo goroutine
	go conn.limbo(sent, conn.responses, conn.errors, conn.queue, logger)
	return nil
}
Example #22
func (cg *ConsumerGroup) reload(logger zap.Logger) error {
	cg.reloadMutex.Lock()
	defer cg.reloadMutex.Unlock()
	cg.singleReload.Do(func() {
		logger.Info("KAFKA: Closing down old connections for replica",
			zap.Int("replicaId", cg.replicaId),
		)
		err := cg.Close(logger)
		if err != nil {
			logger.Error("KAFKA: Failed to close consumergroup for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}
		cg.Load(logger)
	})
	return nil
}
Example #23
func (c *ClusterChecker) Check() error {
	cd, _, err := c.e.GetClusterData()
	if err != nil {
		log.Error("cannot get cluster data", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if c.stopListening {
			c.stopPollonProxy()
		}
		return nil
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))
	if cd == nil {
		log.Info("no clusterdata available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if err = cd.Cluster.Spec.Validate(); err != nil {
		log.Error("clusterdata validation failed", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}

	// Start pollon if not active
	if err = c.startPollonProxy(); err != nil {
		log.Error("failed to start proxy", zap.Error(err))
		return nil
	}

	proxy := cd.Proxy
	if proxy == nil {
		log.Info("no proxy object available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		// Without a proxy object there is no UID or generation to report.
		return nil
	}

	db, ok := cd.DBs[proxy.Spec.MasterDBUID]
	if !ok {
		log.Info("no db object available, closing connections to previous master", zap.String("db", proxy.Spec.MasterDBUID))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
			log.Error("failed to update proxyInfo", zap.Error(err))
		}
		return nil
	}

	addr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%s", db.Status.ListenAddress, db.Status.Port))
	if err != nil {
		log.Error("error", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	log.Info("master address", zap.Stringer("address", addr))
	if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
		log.Error("failed to update proxyInfo", zap.Error(err))
	}

	c.sendPollonConfData(pollon.ConfData{DestAddr: addr})
	return nil
}
Example #24
func (b MMJira) start() {
	http.Handle("/", b.r)
	endpoint := b.c.Host + ":" + strconv.Itoa(b.c.Port)
	b.l.Fatal("error server", zap.Error(http.ListenAndServe(endpoint, b.r)))
}
Example #25
// Consumes a partition
func (cg *ConsumerGroup) partitionConsumer(topic string, partition int32, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, wg *sync.WaitGroup, stopper <-chan struct{}, logger zap.Logger) {
	defer wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	for maxRetries, tries := 3, 0; tries < maxRetries; tries++ {
		if err := cg.instance.ClaimPartition(topic, partition); err == nil {
			break
		} else if err == kazoo.ErrPartitionClaimedByOther && tries+1 < maxRetries {
			time.Sleep(1 * time.Second)
		} else {
			logger.Warn("KAFKA: Replica FAILED to claim partition",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
				zap.Error(err),
			)
			return
		}
	}
	defer cg.instance.ReleasePartition(topic, partition)

	nextOffset, err := cg.offsetManager.InitializePartition(topic, partition)

	if err != nil {
		logger.Error("KAFKA: Replica FAILED to determine initial offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}

	if nextOffset >= 0 {
		logger.Info("KAFKA: Replica partition consumer starting at offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Int64("nextOffset", nextOffset),
		)
	} else {
		nextOffset = cg.config.Offsets.Initial
		if nextOffset == sarama.OffsetOldest {
			logger.Info("KAFKA: Replica partition consumer starting at the oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else if nextOffset == sarama.OffsetNewest {
			logger.Info("KAFKA: Replica partition consumer listening for new messages only",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
	}

	consumer, err := cg.consumer.ConsumePartition(topic, partition, nextOffset)
	if err == sarama.ErrOffsetOutOfRange {
		logger.Warn("KAFKA: Replica partition consumer offset out of Range",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
		)
		// if the offset is out of range, simplistically decide whether to use OffsetNewest or OffsetOldest
		// if the configuration specified offsetOldest, then switch to the oldest available offset, else
		// switch to the newest available offset.
		if cg.config.Offsets.Initial == sarama.OffsetOldest {
			nextOffset = sarama.OffsetOldest
			logger.Info("KAFKA: Replica partition consumer offset reset to oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else {
			nextOffset = sarama.OffsetNewest
			logger.Info("KAFKA: Replica partition consumer offset reset to newest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
		// retry the consumePartition with the adjusted offset
		consumer, err = cg.consumer.ConsumePartition(topic, partition, nextOffset)
	}
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to start partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}
	defer consumer.Close()

	err = nil
	var lastOffset int64 = -1 // aka unknown
partitionConsumerLoop:
	for {
		select {
		case <-stopper:
			break partitionConsumerLoop

		case err := <-consumer.Errors():
			for {
				select {
				case errors <- err:
					continue partitionConsumerLoop

				case <-stopper:
					break partitionConsumerLoop
				}
			}

		case message := <-consumer.Messages():
			for {
				select {
				case <-stopper:
					break partitionConsumerLoop

				case messages <- message:
					lastOffset = message.Offset
					continue partitionConsumerLoop
				}
			}
		}
	}

	logger.Info("KAFKA: Replica is stopping partition consumer at offset",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
		zap.Int64("lastOffset", lastOffset),
	)
	if err = cg.offsetManager.FinalizePartition(topic, partition, lastOffset, cg.config.Offsets.ProcessingTimeout, cg.replicaId, logger); err != nil {
		logger.Fatal("KAFKA: Replica error trying to stop partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
	}
	logger.Info("KAFKA: Replica successfully stoped partition",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
	)
}
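partitionConsumerLoop shows the labeled-loop idiom this codebase uses for shutdown: the inner for/select keeps trying to hand a message downstream while still honoring the stopper, and break with a label exits the outer loop from inside the inner one. A compact runnable sketch:

package main

import "fmt"

func main() {
	messages := make(chan string, 2)
	downstream := make(chan string, 2)
	stopper := make(chan struct{})
	messages <- "m1"
	messages <- "m2"

loop:
	for {
		select {
		case <-stopper:
			break loop
		case m := <-messages:
			// Keep trying to forward, but honor the stopper while blocked.
			for {
				select {
				case <-stopper:
					break loop
				case downstream <- m:
					if len(messages) == 0 {
						close(stopper) // simulate an external shutdown
					}
					continue loop
				}
			}
		}
	}
	fmt.Println("forwarded", len(downstream), "messages") // forwarded 2 messages
}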
Example #26
func (cg *ConsumerGroup) topicConsumer(topic string, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, stopper <-chan struct{}, logger zap.Logger) {
	defer cg.wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	logger.Info("KAFKA: Replica started consumer for topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)

	// Fetch a list of partition IDs
	partitions, err := cg.kazoo.Topic(topic).Partitions()
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get list of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	partitionLeaders, err := retrievePartitionLeaders(partitions)
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get leaders of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	dividedPartitions := dividePartitionsBetweenConsumers(cg.consumers, partitionLeaders)
	myPartitions := dividedPartitions[cg.instance.ID]
	logger.Info("KAFKA: Replica is claiming partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int("claimedPartitions", len(myPartitions)),
		zap.Int("numPartitionLeaders", len(partitionLeaders)),
	)
	// Consume all the assigned partitions
	var wg sync.WaitGroup
	myPartitionsStr := ""
	for _, pid := range myPartitions {
		myPartitionsStr += fmt.Sprintf("%d ", pid.ID)
		wg.Add(1)
		go cg.partitionConsumer(topic, pid.ID, messages, errors, &wg, stopper, logger)
	}
	logger.Info("KAFKA: Retrieved replica's partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("myPartitions", myPartitionsStr),
	)
	wg.Wait()
	logger.Info("KAFKA: Replica stopped consumer of a topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)
}
Example #27
func (cg *ConsumerGroup) Load(logger zap.Logger) error {
	var kz *kazoo.Kazoo
	var err error
	if kz, err = kazoo.NewKazoo(cg.zookeeper, cg.config.Zookeeper); err != nil {
		return err
	}

	logger.Info("KAFKA: Getting broker list for replica",
		zap.Int("replicaId", cg.replicaId),
	)
	brokers, err := kz.BrokerList()
	if err != nil {
		kz.Close()
		return err
	}

	group := kz.Consumergroup(cg.config.ClientID)
	instance := group.NewInstance()

	var consumer sarama.Consumer
	if consumer, err = sarama.NewConsumer(brokers, cg.config.Config); err != nil {
		kz.Close()
		return err
	}

	cg.kazoo = kz
	cg.group = group
	cg.instance = instance
	cg.messages = make(chan *sarama.ConsumerMessage, cg.config.ChannelBufferSize)
	cg.consumer = consumer
	cg.singleShutdown = sync.Once{}
	cg.errors = make(chan *sarama.ConsumerError, cg.config.ChannelBufferSize)
	cg.stopper = make(chan struct{})

	if exists, err := cg.group.Exists(); err != nil {
		logger.Fatal("KAFKA: Replica failed to check existence of consumergroup",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		consumer.Close()
		kz.Close()
		return err
	} else if !exists {
		logger.Info("KAFKA: Consumergroup does not exist, creating it",
			zap.Int("replicaId", cg.replicaId),
			zap.String("consumerGroupName", cg.group.Name),
		)
		if err := cg.group.Create(); err != nil {
			logger.Fatal("KAFKA: Failed to create consumergroup in Zookeeper",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			consumer.Close()
			kz.Close()
			return err
		}
	}

	if err := cg.instance.Register(cg.topics); err != nil {
		logger.Fatal("KAFKA: Failed to create consumer instance",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		return err
	} else {
		logger.Info("KAFKA: Consumer instance registered",
			zap.Int("replicaId", cg.replicaId),
		)
	}

	offsetConfig := OffsetManagerConfig{
		CommitInterval:   cg.config.Offsets.CommitInterval,
		EnableAutoCommit: cg.config.EnableOffsetAutoCommit,
	}
	cg.offsetManager = NewZookeeperOffsetManager(cg, &offsetConfig, logger)
	go cg.topicListConsumer(cg.topics, logger)

	return nil
}
Example #28
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}
	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create postgres parameters
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here this means that the db initialization or
		// resync has failed, so we have to clean up stale data
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log error getting instance state but go ahead.
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}

	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}

			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}

			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true

			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving systemd ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}

		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		if err != nil {
			log.Error("err", zap.Error(err))
			return
		}
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// There can be the possibility that this
				// database is on the same branch of the
				// current followed instance.
				// So we try to put it in recovery and then
				// check if it's on the same branch or force a
				// resync
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
				}

				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.

				// Check timeline history
				// We need to update our pgState to avoid dealing with
				// an old pgState not reflecting the real state
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}

				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true

					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("err", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with followed instance

			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}
			needsResync := false
			tryPgrewind := false
			// If the db has a different systemdID then a resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}
			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
			// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true

				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams

			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed. Reconfiguring.", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here, then all went well and we can update the db generation and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("err", zap.Error(err))
		return
	}
}
Example #29
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endUpdateKeeperInfo := make(chan struct{})

	var err error
	var cd *cluster.ClusterData
	cd, _, err = p.e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
	} else if cd != nil {
		if cd.FormatVersion != cluster.CurrentCDFormatVersion {
			log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		} else if cd.Cluster != nil {
			p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
			p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration
		}
	}

	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	// TODO(sgotti) reconfigure the various configurations options
	// (RequestTimeout) after a changed cluster config
	pgParameters := make(common.Parameters)
	pgm := postgresql.NewManager(p.pgBinPath, p.dataDir, pgParameters, p.getLocalConnParams(), p.getOurReplConnParams(), p.pgSUUsername, p.pgSUPassword, p.pgReplUsername, p.pgReplPassword, p.requestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	updateKeeperInfoTimerCh := time.NewTimer(0).C
	for {
		select {
		case <-p.stop:
			log.Debug("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return

		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()

		case <-endSMCh:
			smTimerCh = time.NewTimer(p.sleepInterval).C

		case <-updatePGStateTimerCh:
			// update the pg state two times faster than the sleep interval
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()

		case <-endPgStatecheckerCh:
			// update the pg state two times faster than the sleep interval
			updatePGStateTimerCh = time.NewTimer(p.sleepInterval / 2).C

		case <-updateKeeperInfoTimerCh:
			go func() {
				if err := p.updateKeeperInfo(); err != nil {
					log.Error("failed to update keeper info", zap.Error(err))
				}
				endUpdateKeeperInfo <- struct{}{}
			}()

		case <-endUpdateKeeperInfo:
			updateKeeperInfoTimerCh = time.NewTimer(p.sleepInterval).C
		}
	}
}
Example #30
func (p *PostgresKeeper) GetPGState(pctx context.Context) (*cluster.PostgresState, error) {
	p.getPGStateMutex.Lock()
	defer p.getPGStateMutex.Unlock()
	// Just get one pgstate at a time to avoid exhausting available connections
	pgState := &cluster.PostgresState{}

	p.localStateMutex.Lock()
	pgState.UID = p.dbLocalState.UID
	pgState.Generation = p.dbLocalState.Generation
	p.localStateMutex.Unlock()

	pgState.ListenAddress = p.pgListenAddress
	pgState.Port = p.pgPort

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		return nil, err
	}
	if initialized {
		pgParameters, err := p.pgm.GetConfigFilePGParameters()
		if err != nil {
			log.Error("cannot get configured pg parameters", zap.Error(err))
			return pgState, nil
		}
		log.Debug("got configured pg parameters", zap.Object("pgParameters", pgParameters))
		filteredPGParameters := common.Parameters{}
		for k, v := range pgParameters {
			if !util.StringInSlice(managedPGParameters, k) {
				filteredPGParameters[k] = v
			}
		}
		log.Debug("filtered out managed pg parameters", zap.Object("filteredPGParameters", filteredPGParameters))
		pgState.PGParameters = filteredPGParameters

		sd, err := p.pgm.GetSystemData()
		if err != nil {
			log.Error("error getting pg state", zap.Error(err))
			return pgState, nil
		}
		pgState.SystemID = sd.SystemID
		pgState.TimelineID = sd.TimelineID
		pgState.XLogPos = sd.XLogPos

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimelinesHistory{}
		if pgState.TimelineID > 1 {
			tlsh, err := p.pgm.GetTimelinesHistory(pgState.TimelineID)
			if err != nil {
				log.Error("error getting timeline history", zap.Error(err))
				return pgState, nil
			}
			ctlsh := cluster.PostgresTimelinesHistory{}

			for _, tlh := range tlsh {
				ctlh := &cluster.PostgresTimelineHistory{
					TimelineID:  tlh.TimelineID,
					SwitchPoint: tlh.SwitchPoint,
					Reason:      tlh.Reason,
				}
				ctlsh = append(ctlsh, ctlh)
			}
			pgState.TimelinesHistory = ctlsh
		}
		pgState.Healthy = true
	}

	return pgState, nil
}
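GetPGState filters out the parameters that stolon itself manages before reporting the rest. The filtering step on its own, with a local stand-in for util.StringInSlice and hypothetical key names:

package main

import "fmt"

func stringInSlice(list []string, s string) bool {
	for _, v := range list {
		if v == s {
			return true
		}
	}
	return false
}

func main() {
	managed := []string{"max_connections", "wal_level"} // hypothetical managed keys
	pgParameters := map[string]string{
		"max_connections": "100",
		"wal_level":       "hot_standby",
		"shared_buffers":  "32MB",
	}
	filtered := map[string]string{}
	for k, v := range pgParameters {
		if !stringInSlice(managed, k) {
			filtered[k] = v
		}
	}
	fmt.Println(filtered) // map[shared_buffers:32MB]
}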