func newBot() (b MMJira) {
	b = MMJira{l: zap.NewJSON(zap.DebugLevel), reg: metrics.NewRegistry()}
	data, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		b.l.Panic("not able to read the file", zap.Error(err))
	}
	var config InstanceConfig
	if err = yaml.Unmarshal(data, &config); err != nil {
		b.l.Panic("not able to unmarshal the file", zap.Error(err))
	}
	b.c = &config
	if !b.c.Debug {
		b.l.SetLevel(zap.ErrorLevel)
	}
	mmpost, err := mmcontroller.NewController(b.c.MMicon, b.c.MMuser, b.c.Hooks, b.c.Debug, metrics.NewPrefixedChildRegistry(b.reg, "mmc."))
	if err != nil {
		panic(err)
	}
	b.m = mmpost
	b.l.Debug("outputting config", zap.Object("config", b.c))
	b.r = mux.NewRouter()
	b.r.HandleFunc("/", b.homeHandler)
	b.r.HandleFunc("/hooks/", b.getHandler).Methods("GET")
	b.r.HandleFunc("/hooks/{hookid}", b.postHandler).Methods("POST")
	b.r.Handle("/metrics", exp.ExpHandler(b.reg))
	b.r.HandleFunc("/config/", b.configGetHandler).Methods("GET")
	return b
}
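A minimal sketch of the InstanceConfig type that newBot unmarshals config.yaml into, inferred from the fields the snippet reads (Debug, MMicon, MMuser, Hooks, Host, Port); the yaml tags and the Hooks value type are assumptions, not taken from the original project.

// InstanceConfig is assumed to look roughly like this; only the field names
// used by newBot and start are grounded in the snippet above.
type InstanceConfig struct {
	Host   string            `yaml:"host"`
	Port   int               `yaml:"port"`
	MMicon string            `yaml:"mmicon"`
	MMuser string            `yaml:"mmuser"`
	Hooks  map[string]string `yaml:"hooks"`
	Debug  bool              `yaml:"debug"`
}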
func (conn *Connection) reader(responses chan<- Response, logger zap.Logger) {
	buffer := make([]byte, 6)
	for {
		n, err := conn.conn.Read(buffer)
		if err != nil && n < 6 {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Int("n", n),
				zap.Error(err),
			)
			conn.shouldReconnect <- true
			return
		} else if err != nil {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Error(err),
			)
		}
		command := uint8(buffer[0])
		if command != 8 {
			logger.Info("APNS: Something went wrong in a connection - Command should have been 8 but it had other value instead",
				zap.Int("connectionId", conn.id),
				zap.Object("commandValue", command),
			)
		}
		resp := newResponse()
		resp.Identifier = binary.BigEndian.Uint32(buffer[2:6])
		resp.Status = uint8(buffer[1])
		responses <- resp
		conn.shouldReconnect <- true
		return
	}
}
func (s *Server) Serve() error {
	s.globalWsChan = make(chan *Ws)
	if s.H2SleepToRunSecond == 0 {
		s.H2SleepToRunSecond = 2
	}
	if s.H2BufSize == 0 {
		s.H2BufSize = 64 << 10
	}
	if s.WsBufSize == 0 {
		s.WsBufSize = 65 << 10
	}
	if s.PingSecond == 0 {
		s.PingSecond = 45
	}
	s.upgrader.ReadBufferSize = s.WsBufSize
	s.upgrader.WriteBufferSize = s.WsBufSize
	if s.H2RetryMaxSecond == 0 {
		s.H2RetryMaxSecond = 30
	}
	info, err := json.Marshal(map[string]interface{}{
		"PingSecond": s.PingSecond,
	})
	if err != nil {
		Log.Error("compute server info", zap.Error(err))
		return err
	}
	if s.TCP == 0 {
		s.dbox, err = newDropbox(s.DropboxAccessToken, s.DropboxDomainKey)
		if err != nil {
			Log.Error("create dropbox client", zap.Error(err))
			return err
		}
	}
	if _, err = s.loadPac(); err != nil {
		return err
	}
	s.info = info
	if s.TCP == 0 {
		s.challengeProvider = new(wrapperChallengeProvider)
		s.httpServer = s.newHttpServer()
		go s.listenAndServeH2All()
		return s.httpServer.ListenAndServe()
	}
	s.listenAndServeH2All()
	return errors.New("TCP server failed")
}
func (conn *Connection) connect(logger zap.Logger) error {
	if conn.conn != nil {
		conn.conn.Close()
	}
	if conn.connAux != nil {
		conn.connAux.Close()
	}
	var cert tls.Certificate
	var err error
	if len(conn.CertificateBase64) == 0 && len(conn.KeyBase64) == 0 {
		// The user did not specify raw block contents, so check the filesystem.
		cert, err = tls.LoadX509KeyPair(conn.CertificateFile, conn.KeyFile)
	} else {
		// The user provided the raw block contents, so use that.
		cert, err = tls.X509KeyPair([]byte(conn.CertificateBase64), []byte(conn.KeyBase64))
	}
	if err != nil {
		logger.Fatal("APNS: Failed to obtain certificate",
			zap.Error(err),
		)
		return err
	}
	conf := &tls.Config{
		Certificates: []tls.Certificate{cert},
		ServerName:   strings.Split(conn.Gateway, ":")[0],
	}
	connAux, err := net.Dial("tcp", conn.Gateway)
	if err != nil {
		logger.Fatal("APNS: Failed while dialing gateway",
			zap.String("gateway", conn.Gateway),
			zap.Error(err),
		)
		return err
	}
	tlsConn := tls.Client(connAux, conf)
	err = tlsConn.Handshake()
	if err != nil {
		logger.Fatal("APNS: Failed while handshaking",
			zap.Error(err),
		)
		_ = tlsConn.Close()
		return err
	}
	conn.conn = tlsConn
	conn.connAux = connAux
	// Start reader goroutine
	go conn.reader(conn.responses, logger)
	return nil
}
func (s *Server) newH2TlsConfig() (*tls.Config, error) {
	if s.TCP != 0 {
		// 1. LoadServerCert
		cert, err := tls.LoadX509KeyPair("server.crt", "server.key")
		if err != nil {
			Log.Error("loading server certificate", zap.Error(err))
			return nil, err
		}
		// 2. LoadCACert
		caCert, err := ioutil.ReadFile("chain.pem")
		if err != nil {
			Log.Error("loading CA certificate", zap.Error(err))
			return nil, err
		}
		caPool := x509.NewCertPool()
		caPool.AppendCertsFromPEM(caCert)
		config := tls.Config{
			Certificates: []tls.Certificate{cert},
			ClientCAs:    caPool,
			ClientAuth:   tls.RequireAndVerifyClientCert,
			MinVersion:   tls.VersionTLS12,
			NextProtos:   []string{http2.NextProtoTLS},
		}
		return &config, nil
	}
	w, err := acmewrapper.New(acmewrapper.Config{
		Domains:                 []string{s.AcmeDomain},
		TLSCertFile:             fmt.Sprintf("/%s/%s", s.AcmeDomain, "cert.pem"),
		TLSKeyFile:              fmt.Sprintf("/%s/%s", s.AcmeDomain, "key.pem"),
		RegistrationFile:        fmt.Sprintf("/%s/%s", s.AcmeDomain, "user.reg"),
		PrivateKeyFile:          fmt.Sprintf("/%s/%s", s.AcmeDomain, "user.pem"),
		TOSCallback:             acmewrapper.TOSAgree,
		HTTP01ChallengeProvider: s.challengeProvider,
		SaveFileCallback:        s.dbox.SaveFile,
		LoadFileCallback:        s.dbox.LoadFile,
	})
	if err != nil {
		Log.Error("acmewrapper failed", zap.Error(err))
		return nil, err
	}
	return w.TLSConfig(), nil
}
func (c *Connection) spinUntilReconnect(logger zap.Logger) {
	var backoff = time.Duration(100)
	for {
		logger.Info("APNS: Connection lost. Reconnecting",
			zap.Int("connectionId", c.id),
		)
		err := c.connect(logger)
		if err != nil {
			// Exponential backoff up to a limit
			logger.Info("APNS: Error connecting to server",
				zap.Int("connectionId", c.id),
				zap.Error(err),
			)
			backoff = backoff * 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
			time.Sleep(backoff)
		} else {
			backoff = 100
			logger.Info("APNS: New connection established",
				zap.Int("connectionId", c.id),
			)
			break
		}
	}
}
func (s *Server) newH2Server(tlsConfig *tls.Config, laddr string) (*http.Server, func(error)) {
	var mu sync.Mutex
	var h2sleep time.Duration = 1
	h2Server := &http.Server{
		Addr:      laddr,
		Handler:   http.HandlerFunc(s.serveH2),
		TLSConfig: tlsConfig,
		ConnState: func(c net.Conn, s http.ConnState) {
			if s == http.StateNew {
				mu.Lock()
				h2sleep = 1
				mu.Unlock()
			}
		},
	}
	http2.ConfigureServer(h2Server, &http2.Server{
		MaxReadFrameSize: s.H2BufSize,
	})
	afterServeError := func(err error) {
		Log.Error("h2 server failed", zap.Error(err))
		mu.Lock()
		if h2sleep < s.H2RetryMaxSecond {
			h2sleep++
		}
		sec := h2sleep
		mu.Unlock()
		time.Sleep(time.Second * sec)
	}
	return h2Server, afterServeError
}
func (s *Server) listenAndServeH2(tlsConfig *tls.Config, tcp bool) {
	var laddr string
	var tlsListener net.Listener
	var err error
	if tcp {
		if s.TCP == 0 {
			return
		}
		laddr = fmt.Sprintf(":%d", s.TCP)
		tlsListener, err = tls.Listen("tcp", laddr, tlsConfig)
		if err != nil {
			Log.Error("tcp tlsListener failed", zap.Error(err))
			return
		}
	} else {
		laddr = ":8444"
		tlsListener = tls.NewListener(newGlobalWsListener(s.globalWsChan), tlsConfig)
	}
	h2Server, afterServeError := s.newH2Server(tlsConfig, laddr)
	for {
		err = h2Server.Serve(tlsListener)
		afterServeError(err)
	}
}
func (s *Server) serveH2r(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("REVERSE failed", zap.Object("err", err))
		} else {
			w.WriteHeader(http.StatusOK)
		}
	}()
	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dial failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()
	go io.Copy(remote, r.Body)
	// go io.Copy(remote, io.TeeReader(r.Body, os.Stdout))
	resr := io.TeeReader(remote, w)
	// resr = io.TeeReader(resr, os.Stdout)
	res, err := http.ReadResponse(bufio.NewReader(resr), nil)
	if err != nil {
		return
	}
	if res.Body != nil {
		defer res.Body.Close()
		io.Copy(ioutil.Discard, res.Body)
	}
}
func (s *Server) serveH2c(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("CONNECT failed", zap.Object("err", err))
		}
	}()
	remote, err := net.DialTimeout("tcp", r.Host, time.Second*10)
	if err != nil {
		Log.Error("dial failed", zap.Error(err), zap.String("host", r.Host))
		w.WriteHeader(http.StatusNotImplemented)
		return
	}
	defer remote.Close()
	fw := &flushWriter{w}
	fw.FlushHeader(http.StatusOK)
	go io.Copy(remote, r.Body)
	srcRemote := &TryReader{
		c:        remote,
		ignore:   3,
		maxRetry: 2,
		tryDur:   time.Millisecond * 600,
		timeout:  time.Second * 15,
	}
	io.Copy(fw, srcRemote)
}
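flushWriter is not defined in this snippet. A minimal sketch of what it is assumed to look like, based on how serveH2c uses it (&flushWriter{w}, FlushHeader, and io.Copy): a writer that flushes the http.ResponseWriter after every write so tunneled CONNECT data reaches the client immediately instead of sitting in the response buffer.

// flushWriter wraps an http.ResponseWriter and flushes after each write.
// This is an assumed shape, not the original project's implementation.
type flushWriter struct {
	w http.ResponseWriter
}

// Write forwards to the underlying ResponseWriter and flushes when possible.
func (fw *flushWriter) Write(p []byte) (int, error) {
	n, err := fw.w.Write(p)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush()
	}
	return n, err
}

// FlushHeader writes the status code and pushes it out right away.
func (fw *flushWriter) FlushHeader(status int) {
	fw.w.WriteHeader(status)
	if f, ok := fw.w.(http.Flusher); ok {
		f.Flush()
	}
}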
func (s *Sentinel) electionLoop() {
	for {
		log.Info("Trying to acquire sentinels leadership")
		electedCh, errCh := s.candidate.RunForElection()
		for {
			select {
			case elected := <-electedCh:
				s.leaderMutex.Lock()
				if elected {
					log.Info("sentinel leadership acquired")
					s.leader = true
					s.leadershipCount++
				} else {
					if s.leader {
						log.Info("sentinel leadership lost")
					}
					s.leader = false
				}
				s.leaderMutex.Unlock()
			case err := <-errCh:
				if err != nil {
					log.Error("election loop error", zap.Error(err))
				}
				goto end
			case <-s.stop:
				log.Debug("stopping election Loop")
				return
			}
		}
	end:
		time.Sleep(10 * time.Second)
	}
}
func main() {
	logLevel := zap.LevelFlag("v", zap.InfoLevel, "log level: all, debug, info, warn, error, panic, fatal, none")
	flag.StringVar(&botName, "botname", "satpam_bot", "bot name")
	flag.StringVar(&adminID, "admin", "", "admin id")
	flag.Parse()

	// setup logger
	log.SetLevel(*logLevel)
	bot.SetLogger(log)
	log.Info("STARTED", zap.String("version", VERSION), zap.String("buildtime", BUILDTIME))

	key := os.Getenv("TELEGRAM_KEY")
	if key == "" {
		log.Fatal("TELEGRAM_KEY can not be empty")
	}
	startedAt = time.Now()
	telegram := bot.NewTelegram(key)
	plugin := satpamBot{t: telegram}
	if err := telegram.AddPlugin(&plugin); err != nil {
		log.Fatal("Failed AddPlugin", zap.Error(err))
	}
	plugin.start()
	telegram.Start()
}
func (c *ClusterChecker) Start() error {
	endPollonProxyCh := make(chan error)
	checkCh := make(chan error)
	timerCh := time.NewTimer(0).C

	for true {
		select {
		case <-timerCh:
			go func() {
				checkCh <- c.Check()
			}()
		case err := <-checkCh:
			if err != nil {
				log.Debug("check reported error", zap.Error(err))
			}
			if err != nil {
				return fmt.Errorf("checker fatal error: %v", err)
			}
			timerCh = time.NewTimer(cluster.DefaultProxyCheckInterval).C
		case err := <-endPollonProxyCh:
			if err != nil {
				return fmt.Errorf("proxy error: %v", err)
			}
		}
	}
	return nil
}
func (p *PostgresKeeper) updatePGState(pctx context.Context) {
	p.pgStateMutex.Lock()
	defer p.pgStateMutex.Unlock()
	pgState, err := p.GetPGState(pctx)
	if err != nil {
		log.Error("failed to get pg state", zap.Error(err))
		return
	}
	p.lastPGState = pgState
}
func (cg *ConsumerGroup) topicListConsumer(topics []string, logger zap.Logger) {
	for {
		select {
		case <-cg.stopper:
			return
		default:
		}

		consumers, consumerChanges, err := cg.group.WatchInstances()
		if err != nil {
			logger.Fatal("KAFKA: FAILED to get list of registered consumer instances for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			return
		}

		cg.consumers = consumers
		logger.Info("KAFKA: Got currently registered consumers for replica",
			zap.Int("replicaId", cg.replicaId),
			zap.Int("numRegisteredConsumers", len(cg.consumers)),
		)

		stopper := make(chan struct{})

		for _, topic := range topics {
			cg.wg.Add(1)
			go cg.topicConsumer(topic, cg.messages, cg.errors, stopper, logger)
		}

		select {
		case <-cg.stopper:
			close(stopper)
			return
		case event := <-consumerChanges:
			if event.Err == zk.ErrSessionExpired || event.Err == zk.ErrConnectionClosed {
				logger.Info("KAFKA: Session was expired, reloading consumer for replica",
					zap.Int("replicaId", cg.replicaId),
				)
				go cg.reload(logger)
				<-cg.stopper
				close(stopper)
				return
			} else {
				logger.Info("KAFKA: Triggering rebalance due to consumer list change in replica",
					zap.Int("replicaId", cg.replicaId),
				)
				close(stopper)
				cg.wg.Wait()
			}
		}
	}
}
func (s *Server) tryLoadPac() []byte {
	ps, err := s.dbox.LoadPlainFile("/bricks.pac")
	if err != nil {
		Log.Error("load pac from dropbox", zap.Error(err))
		s.muPac.RLock()
		defer s.muPac.RUnlock()
		return s.pac
	}
	s.muPac.Lock()
	defer s.muPac.Unlock()
	s.pac = ps
	return s.pac
}
func (s *Server) loadPac() ([]byte, error) {
	ps, err := ioutil.ReadFile("bricks.pac")
	if err != nil {
		ps, err = s.dbox.LoadPlainFile("/bricks.pac")
	}
	if err != nil {
		Log.Error("load pac from dropbox", zap.Error(err))
		return nil, err
	}
	s.muPac.Lock()
	defer s.muPac.Unlock()
	s.pac = ps
	return ps, nil
}
func (cg *ConsumerGroup) Close(logger zap.Logger) error {
	shutdownError := AlreadyClosing
	cg.singleShutdown.Do(func() {
		defer cg.kazoo.Close()
		shutdownError = nil

		close(cg.stopper)
		cg.wg.Wait()

		if err := cg.offsetManager.Close(logger); err != nil {
			logger.Error("KAFKA: FAILED closing the offset manager for replica!",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}

		if shutdownError = cg.instance.Deregister(); shutdownError != nil {
			logger.Warn("KAFKA: Replica FAILED deregistering consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		} else {
			logger.Info("KAFKA: Replica deregistered consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.String("instanceId", cg.instance.ID),
			)
		}

		if shutdownError = cg.consumer.Close(); shutdownError != nil {
			logger.Error("Replica FAILED closing the Sarama client",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		}

		close(cg.messages)
		close(cg.errors)
		cg.instance = nil
	})
	return shutdownError
}
func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind, started bool) error {
	pgm := p.pgm
	if started {
		if err := pgm.Stop(true); err != nil {
			return fmt.Errorf("failed to stop pg instance: %v", err)
		}
	}

	replConnParams := p.getReplConnParams(db, followedDB)
	standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}

	// TODO(sgotti) Actually we don't check if pg_rewind is installed or if
	// postgresql version is > 9.5 since someone can also use an externally
	// installed pg_rewind for postgres 9.4. If a pg_rewind executable
	// doesn't exist, pgm.SyncFromFollowedPGRewind will return an error and
	// we'll fall back to pg_basebackup
	if tryPgrewind && p.usePgrewind(db) {
		connParams := p.getSUConnParams(db, followedDB)
		log.Info("syncing using pg_rewind", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID))
		if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
			// log pg_rewind error and fallback to pg_basebackup
			log.Error("error syncing with pg_rewind", zap.Error(err))
		} else {
			if err := pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
				return fmt.Errorf("err: %v", err)
			}
			return nil
		}
	}

	if err := pgm.RemoveAll(); err != nil {
		return fmt.Errorf("failed to remove the postgres data dir: %v", err)
	}
	if log.Level() >= zap.DebugLevel {
		log.Debug("syncing from followed db", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID), zap.String("replConnParams", fmt.Sprintf("%v", replConnParams)))
	} else {
		log.Info("syncing from followed db", zap.String("followedDB", followedDB.UID), zap.String("keeper", followedDB.Spec.KeeperUID))
	}
	if err := pgm.SyncFromFollowed(replConnParams); err != nil {
		return fmt.Errorf("sync error: %v", err)
	}
	log.Info("sync succeeded")

	if err := pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
		return fmt.Errorf("err: %v", err)
	}
	return nil
}
func (s *Server) serveWs(w http.ResponseWriter, r *http.Request) {
	defer func() {
		if err := recover(); err != nil {
			w.WriteHeader(http.StatusInternalServerError)
			Log.Error("serveWs error", zap.Object("err", err))
		}
	}()
	Log.Debug("websocket start")
	ws, err := s.upgrader.Upgrade(w, r, nil)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		Log.Error("websocket failed", zap.Error(err))
		return
	}
	Log.Debug("websocket ok")
	s.globalWsChan <- NewWs(ws, s.WsBufSize)
}
// Start initiates a connection to APNS and asynchronously sends notifications which have been queued.
func (conn *Connection) Start(logger zap.Logger) error {
	// Connect to APNS. The reason this is here as well as in sender is that this probably catches any unavoidable errors in a synchronous fashion, while in sender it can reconnect after temporary errors (which should work most of the time.)
	err := conn.connect(logger)
	if err != nil {
		logger.Fatal("APNS: Failed to connect",
			zap.Int("connectionId", conn.id),
			zap.Error(err),
		)
		return err
	}
	// Start sender goroutine
	sent := make(chan PushNotification, 10000)
	go conn.sender(conn.queue, sent, logger)
	// Start limbo goroutine
	go conn.limbo(sent, conn.responses, conn.errors, conn.queue, logger)
	return nil
}
func (cg *ConsumerGroup) reload(logger zap.Logger) error {
	cg.reloadMutex.Lock()
	defer cg.reloadMutex.Unlock()
	cg.singleReload.Do(func() {
		logger.Info("KAFKA: Closing down old connections for replica",
			zap.Int("replicaId", cg.replicaId),
		)
		err := cg.Close(logger)
		if err != nil {
			logger.Error("KAFKA: Failed to close consumergroup for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}
		cg.Load(logger)
	})
	return nil
}
func (c *ClusterChecker) Check() error {
	cd, _, err := c.e.GetClusterData()
	if err != nil {
		log.Error("cannot get cluster data", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if c.stopListening {
			c.stopPollonProxy()
		}
		return nil
	}
	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))
	if cd == nil {
		log.Info("no clusterdata available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	if err = cd.Cluster.Spec.Validate(); err != nil {
		log.Error("clusterdata validation failed", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}

	// Start pollon if not active
	if err = c.startPollonProxy(); err != nil {
		log.Error("failed to start proxy", zap.Error(err))
		return nil
	}

	proxy := cd.Proxy
	if proxy == nil {
		log.Info("no proxy object available, closing connections to previous master")
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}

	db, ok := cd.DBs[proxy.Spec.MasterDBUID]
	if !ok {
		log.Info("no db object available, closing connections to previous master", zap.String("db", proxy.Spec.MasterDBUID))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
			log.Error("failed to update proxyInfo", zap.Error(err))
		}
		return nil
	}

	addr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("%s:%s", db.Status.ListenAddress, db.Status.Port))
	if err != nil {
		log.Error("error", zap.Error(err))
		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
		return nil
	}
	log.Info("master address", zap.Stringer("address", addr))
	if err = c.SetProxyInfo(c.e, proxy.UID, proxy.Generation, 2*cluster.DefaultProxyCheckInterval); err != nil {
		log.Error("failed to update proxyInfo", zap.Error(err))
	}

	c.sendPollonConfData(pollon.ConfData{DestAddr: addr})
	return nil
}
func (b MMJira) start() {
	http.Handle("/", b.r)
	endpoint := b.c.Host + ":" + strconv.Itoa(b.c.Port)
	b.l.Fatal("error server", zap.Error(http.ListenAndServe(endpoint, b.r)))
}
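A minimal wiring sketch, assuming newBot and start are meant to be used together as the program entry point; this main function is not part of the original snippets.

// main builds the bot from config.yaml and serves it until the HTTP server exits.
func main() {
	b := newBot()
	b.start()
}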
// Consumes a partition
func (cg *ConsumerGroup) partitionConsumer(topic string, partition int32, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, wg *sync.WaitGroup, stopper <-chan struct{}, logger zap.Logger) {
	defer wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	for maxRetries, tries := 3, 0; tries < maxRetries; tries++ {
		if err := cg.instance.ClaimPartition(topic, partition); err == nil {
			break
		} else if err == kazoo.ErrPartitionClaimedByOther && tries+1 < maxRetries {
			time.Sleep(1 * time.Second)
		} else {
			logger.Warn("KAFKA: Replica FAILED to claim partition",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
				zap.Error(err),
			)
			return
		}
	}
	defer cg.instance.ReleasePartition(topic, partition)

	nextOffset, err := cg.offsetManager.InitializePartition(topic, partition)
	if err != nil {
		logger.Error("KAFKA: Replica FAILED to determine initial offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}

	if nextOffset >= 0 {
		logger.Info("KAFKA: Replica partition consumer starting at offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Int64("nextOffset", nextOffset),
		)
	} else {
		nextOffset = cg.config.Offsets.Initial
		if nextOffset == sarama.OffsetOldest {
			logger.Info("KAFKA: Replica partition consumer starting at the oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else if nextOffset == sarama.OffsetNewest {
			logger.Info("KAFKA: Replica partition consumer listening for new messages only",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
	}

	consumer, err := cg.consumer.ConsumePartition(topic, partition, nextOffset)
	if err == sarama.ErrOffsetOutOfRange {
		logger.Warn("KAFKA: Replica partition consumer offset out of range",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
		)
		// if the offset is out of range, simplistically decide whether to use OffsetNewest or OffsetOldest
		// if the configuration specified offsetOldest, then switch to the oldest available offset, else
		// switch to the newest available offset.
		if cg.config.Offsets.Initial == sarama.OffsetOldest {
			nextOffset = sarama.OffsetOldest
			logger.Info("KAFKA: Replica partition consumer offset reset to oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else {
			nextOffset = sarama.OffsetNewest
			logger.Info("KAFKA: Replica partition consumer offset reset to newest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
		// retry the consumePartition with the adjusted offset
		consumer, err = cg.consumer.ConsumePartition(topic, partition, nextOffset)
	}
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to start partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}
	defer consumer.Close()

	err = nil
	var lastOffset int64 = -1 // aka unknown

partitionConsumerLoop:
	for {
		select {
		case <-stopper:
			break partitionConsumerLoop

		case err := <-consumer.Errors():
			for {
				select {
				case errors <- err:
					continue partitionConsumerLoop
				case <-stopper:
					break partitionConsumerLoop
				}
			}

		case message := <-consumer.Messages():
			for {
				select {
				case <-stopper:
					break partitionConsumerLoop
				case messages <- message:
					lastOffset = message.Offset
					continue partitionConsumerLoop
				}
			}
		}
	}

	logger.Info("KAFKA: Replica is stopping partition consumer at offset",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
		zap.Int64("lastOffset", lastOffset),
	)
	if err = cg.offsetManager.FinalizePartition(topic, partition, lastOffset, cg.config.Offsets.ProcessingTimeout, cg.replicaId, logger); err != nil {
		logger.Fatal("KAFKA: Replica error trying to stop partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
	}
	logger.Info("KAFKA: Replica successfully stopped partition",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
	)
}
func (cg *ConsumerGroup) topicConsumer(topic string, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, stopper <-chan struct{}, logger zap.Logger) {
	defer cg.wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	logger.Info("KAFKA: Replica started consumer for topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)

	// Fetch a list of partition IDs
	partitions, err := cg.kazoo.Topic(topic).Partitions()
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get list of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	partitionLeaders, err := retrievePartitionLeaders(partitions)
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get leaders of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	dividedPartitions := dividePartitionsBetweenConsumers(cg.consumers, partitionLeaders)
	myPartitions := dividedPartitions[cg.instance.ID]
	logger.Info("KAFKA: Replica is claiming partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int("claimedPartitions", len(myPartitions)),
		zap.Int("numPartitionLeaders", len(partitionLeaders)),
	)

	// Consume all the assigned partitions
	var wg sync.WaitGroup
	myPartitionsStr := ""
	for _, pid := range myPartitions {
		myPartitionsStr += fmt.Sprintf("%d ", pid.ID)
		wg.Add(1)
		go cg.partitionConsumer(topic, pid.ID, messages, errors, &wg, stopper, logger)
	}
	logger.Info("KAFKA: Retrieved replica's partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("myPartitions", myPartitionsStr),
	)
	wg.Wait()
	logger.Info("KAFKA: Replica stopped consumer of a topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)
}
func (cg *ConsumerGroup) Load(logger zap.Logger) error {
	var kz *kazoo.Kazoo
	var err error
	if kz, err = kazoo.NewKazoo(cg.zookeeper, cg.config.Zookeeper); err != nil {
		return err
	}

	logger.Info("KAFKA: Getting broker list for replica",
		zap.Int("replicaId", cg.replicaId),
	)
	brokers, err := kz.BrokerList()
	if err != nil {
		kz.Close()
		return err
	}

	group := kz.Consumergroup(cg.config.ClientID)
	instance := group.NewInstance()

	var consumer sarama.Consumer
	if consumer, err = sarama.NewConsumer(brokers, cg.config.Config); err != nil {
		kz.Close()
		return err
	}

	cg.kazoo = kz
	cg.group = group
	cg.instance = instance
	cg.messages = make(chan *sarama.ConsumerMessage, cg.config.ChannelBufferSize)
	cg.consumer = consumer
	cg.singleShutdown = sync.Once{}
	cg.errors = make(chan *sarama.ConsumerError, cg.config.ChannelBufferSize)
	cg.stopper = make(chan struct{})

	if exists, err := cg.group.Exists(); err != nil {
		logger.Fatal("KAFKA: Replica failed to check existence of consumergroup",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		consumer.Close()
		kz.Close()
		return err
	} else if !exists {
		logger.Info("KAFKA: Consumergroup does not exist, creating it",
			zap.Int("replicaId", cg.replicaId),
			zap.String("consumerGroupName", cg.group.Name),
		)
		if err := cg.group.Create(); err != nil {
			logger.Fatal("KAFKA: Failed to create consumergroup in Zookeeper",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			consumer.Close()
			kz.Close()
			return err
		}
	}

	if err := cg.instance.Register(cg.topics); err != nil {
		logger.Fatal("KAFKA: Failed to create consumer instance",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		return err
	} else {
		logger.Info("KAFKA: Consumer instance registered",
			zap.Int("replicaId", cg.replicaId),
		)
	}

	offsetConfig := OffsetManagerConfig{
		CommitInterval:   cg.config.Offsets.CommitInterval,
		EnableAutoCommit: cg.config.EnableOffsetAutoCommit,
	}
	cg.offsetManager = NewZookeeperOffsetManager(cg, &offsetConfig, logger)

	go cg.topicListConsumer(cg.topics, logger)

	return nil
}
func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
	e := p.e
	pgm := p.pgm

	cd, _, err := e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
		return
	}

	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	if cd == nil {
		log.Info("no cluster data available, waiting for it to appear")
		return
	}
	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
		log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		return
	}
	if cd.Cluster != nil {
		p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
		p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration

		if p.keeperLocalState.ClusterUID != cd.Cluster.UID {
			p.keeperLocalState.ClusterUID = cd.Cluster.UID
			if err = p.saveKeeperLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
		}
	}

	k, ok := cd.Keepers[p.keeperLocalState.UID]
	if !ok {
		log.Info("our keeper data is not available, waiting for it to appear")
		return
	}
	// TODO(sgotti) Check that the Keeper.Status address:port has been updated

	db := cd.FindDB(k)
	if db == nil {
		log.Info("no db assigned")
		return
	}
	// TODO(sgotti) Check that the DB.Status address:port has been updated

	followersUIDs := db.Spec.Followers

	prevPGParameters := pgm.GetParameters()
	// create postgres parameters
	pgParameters := p.createPGParameters(db)
	// update pgm postgres parameters
	pgm.SetParameters(pgParameters)

	dbls := p.dbLocalState
	if dbls.Initializing {
		// If we are here this means that the db initialization or
		// resync has failed so we have to clean up stale data
		log.Error("db failed to initialize or resync")
		// Clean up cluster db datadir
		if err = pgm.RemoveAll(); err != nil {
			log.Error("failed to remove the postgres data dir", zap.Error(err))
			return
		}
		// Reset current db local state since it's not valid anymore
		p.localStateMutex.Lock()
		dbls.UID = ""
		dbls.Generation = cluster.NoGeneration
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	initialized, err := pgm.IsInitialized()
	if err != nil {
		log.Error("failed to detect if instance is initialized", zap.Error(err))
		return
	}

	started := false
	if initialized {
		started, err = pgm.IsStarted()
		if err != nil {
			// log error getting instance state but go ahead.
			log.Info("failed to retrieve instance status", zap.Error(err))
		}
	}

	log.Debug("db status", zap.Bool("started", started))

	// if the db is initialized but there isn't a db local state then generate a new one
	if initialized && dbls.UID == "" {
		p.localStateMutex.Lock()
		dbls.UID = common.UID()
		dbls.Generation = cluster.NoGeneration
		dbls.InitPGParameters = nil
		dbls.Initializing = false
		p.localStateMutex.Unlock()
		if err = p.saveDBLocalState(); err != nil {
			log.Error("error", zap.Error(err))
			return
		}
	}

	if dbls.UID != db.UID {
		log.Info("current db UID different than cluster data db UID", zap.String("db", dbls.UID), zap.String("cdDB", db.UID))
		switch db.Spec.InitMode {
		case cluster.DBInitModeNew:
			log.Info("initializing the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Init(); err != nil {
				log.Error("failed to initialize postgres database cluster", zap.Error(err))
				return
			}
			initialized = true

			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("setting roles")
			if err = pgm.SetupRoles(); err != nil {
				log.Error("failed to setup roles", zap.Error(err))
				return
			}
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModePITR:
			log.Info("restoring the database cluster")
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			dbls.Initializing = true
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if err = pgm.RemoveAll(); err != nil {
				log.Error("failed to remove the postgres data dir", zap.Error(err))
				return
			}
			if err = pgm.Restore(db.Spec.PITRConfig.DataRestoreCommand); err != nil {
				log.Error("failed to restore postgres database cluster", zap.Error(err))
				return
			}
			if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(nil, db.Spec.PITRConfig.ArchiveRecoverySettings)); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			initialized = true
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeExisting:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if started {
				if err = pgm.Stop(true); err != nil {
					log.Error("failed to stop pg instance", zap.Error(err))
					return
				}
				started = false
			}
			if db.Spec.IncludeConfig {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
				pgParameters, err = pgm.GetConfigFilePGParameters()
				if err != nil {
					log.Error("failed to rename previous postgresql.conf", zap.Error(err))
					return
				}
				p.localStateMutex.Lock()
				dbls.InitPGParameters = pgParameters
				p.localStateMutex.Unlock()
			} else {
				if err = pgm.StartTmpMerged(); err != nil {
					log.Error("failed to start instance", zap.Error(err))
					return
				}
			}
			log.Info("updating our db UID with the cluster data provided db UID")
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.InitPGParameters = pgParameters
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			if err = pgm.Stop(true); err != nil {
				log.Error("failed to stop pg instance", zap.Error(err))
				return
			}
		case cluster.DBInitModeNone:
			// replace our current db uid with the required one.
			p.localStateMutex.Lock()
			dbls.UID = db.UID
			// Set a no generation since we aren't already converged.
			dbls.Generation = cluster.NoGeneration
			dbls.InitPGParameters = nil
			p.localStateMutex.Unlock()
			if err = p.saveDBLocalState(); err != nil {
				log.Error("error", zap.Error(err))
				return
			}
			return
		default:
			log.Error("unknown db init mode", zap.String("initMode", string(db.Spec.InitMode)))
			return
		}
	}

	pgm.SetParameters(pgParameters)

	var localRole common.Role
	var systemID string
	if !initialized {
		log.Info("database cluster not initialized")
		localRole = common.RoleUndefined
	} else {
		localRole, err = pgm.GetRole()
		if err != nil {
			log.Error("error retrieving current pg role", zap.Error(err))
			return
		}
		systemID, err = p.pgm.GetSystemdID()
		if err != nil {
			log.Error("error retrieving systemd ID", zap.Error(err))
			return
		}
	}

	targetRole := db.Spec.Role
	log.Debug("target role", zap.String("targetRole", string(targetRole)))

	switch targetRole {
	case common.RoleMaster:
		// We are the elected master
		log.Info("our db requested role is master")
		if localRole == common.RoleUndefined {
			log.Error("database cluster not initialized but requested role is master. This shouldn't happen!")
			return
		}
		if !started {
			if err = pgm.Start(); err != nil {
				log.Error("failed to start postgres", zap.Error(err))
				return
			}
			started = true
		}

		if localRole == common.RoleStandby {
			log.Info("promoting to master")
			if err = pgm.Promote(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
		} else {
			log.Info("already master")
		}

		var replSlots []string
		replSlots, err = pgm.GetReplicatinSlots()
		log.Debug("replication slots", zap.Object("replSlots", replSlots))
		if err != nil {
			log.Error("err", zap.Error(err))
			return
		}
		// Drop replication slots
		for _, slotName := range replSlots {
			if !common.IsStolonName(slotName) {
				continue
			}
			if !util.StringInSlice(followersUIDs, common.NameFromStolonName(slotName)) {
				log.Info("dropping replication slot since db not marked as follower", zap.String("slot", slotName), zap.String("db", common.NameFromStolonName(slotName)))
				if err = pgm.DropReplicationSlot(slotName); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
		// Create replication slots
		for _, followerUID := range followersUIDs {
			if followerUID == dbls.UID {
				continue
			}
			replSlot := common.StolonName(followerUID)
			if !util.StringInSlice(replSlots, replSlot) {
				log.Info("creating replication slot", zap.String("slot", replSlot), zap.String("db", followerUID))
				if err = pgm.CreateReplicationSlot(replSlot); err != nil {
					log.Error("err", zap.Error(err))
				}
			}
		}
	case common.RoleStandby:
		// We are a standby
		followedUID := db.Spec.FollowConfig.DBUID
		log.Info("our db requested role is standby", zap.String("followedDB", followedUID))
		followedDB, ok := cd.DBs[followedUID]
		if !ok {
			log.Error("no db data available for followed db", zap.String("followedDB", followedUID))
			return
		}
		switch localRole {
		case common.RoleMaster:
			if systemID == followedDB.Status.SystemID {
				// There can be the possibility that this
				// database is on the same branch of the
				// current followed instance.
				// So we try to put it in recovery and then
				// check if it's on the same branch or force a
				// resync
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if !started {
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				} else {
					if err = pgm.Restart(true); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
				}

				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.

				// Check timeline history
				// We need to update our pgState to avoid dealing with
				// an old pgState not reflecting the real state
				var pgState *cluster.PostgresState
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, true, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
					// Check again if it was really synced
					pgState, err = p.GetPGState(pctx)
					if err != nil {
						log.Error("cannot get current pgstate", zap.Error(err))
						return
					}
					if p.isDifferentTimelineBranch(followedDB, pgState) {
						if err = p.resync(db, followedDB, false, started); err != nil {
							log.Error("failed to resync from followed instance", zap.Error(err))
							return
						}
						if err = pgm.Start(); err != nil {
							log.Error("err", zap.Error(err))
							return
						}
						started = true
					}
				}
			} else {
				if err = p.resync(db, followedDB, false, started); err != nil {
					log.Error("failed to resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
			}
		case common.RoleStandby:
			log.Info("already standby")
			if !started {
				replConnParams := p.getReplConnParams(db, followedDB)
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: replConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("failed to start postgres", zap.Error(err))
					return
				}
				started = true
			}

			// Check that we can sync with followed instance

			// We need to update our pgState to avoid dealing with
			// an old pgState not reflecting the real state
			var pgState *cluster.PostgresState
			pgState, err = p.GetPGState(pctx)
			if err != nil {
				log.Error("cannot get current pgstate", zap.Error(err))
				return
			}
			needsResync := false
			tryPgrewind := false
			// If the db has a different systemdID then a resync is needed
			if systemID != followedDB.Status.SystemID {
				needsResync = true
				// Check timeline history
			} else if p.isDifferentTimelineBranch(followedDB, pgState) {
				needsResync = true
				tryPgrewind = true
			}
			if needsResync {
				// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and doesn't check if they diverged at different position in previous timelines.
				// So check that the db has been synced or resync again with pg_rewind disabled. Will need to report this upstream.
				if err = p.resync(db, followedDB, tryPgrewind, started); err != nil {
					log.Error("failed to full resync from followed instance", zap.Error(err))
					return
				}
				if err = pgm.Start(); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				started = true
				// Check again if it was really synced
				pgState, err = p.GetPGState(pctx)
				if err != nil {
					log.Error("cannot get current pgstate", zap.Error(err))
					return
				}
				if p.isDifferentTimelineBranch(followedDB, pgState) {
					if err = p.resync(db, followedDB, false, started); err != nil {
						log.Error("failed to resync from followed instance", zap.Error(err))
						return
					}
					if err = pgm.Start(); err != nil {
						log.Error("err", zap.Error(err))
						return
					}
					started = true
				}
			}

			// TODO(sgotti) Check that the followed instance has all the needed WAL segments

			// Update our primary_conninfo if replConnString changed
			var curReplConnParams postgresql.ConnParams
			curReplConnParams, err = pgm.GetPrimaryConninfo()
			if err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			log.Debug("curReplConnParams", zap.Object("curReplConnParams", curReplConnParams))

			newReplConnParams := p.getReplConnParams(db, followedDB)
			log.Debug("newReplConnParams", zap.Object("newReplConnParams", newReplConnParams))

			if !curReplConnParams.Equals(newReplConnParams) {
				log.Info("connection parameters changed. Reconfiguring.", zap.String("followedDB", followedUID), zap.Object("replConnParams", newReplConnParams))
				standbySettings := &cluster.StandbySettings{PrimaryConninfo: newReplConnParams.ConnString(), PrimarySlotName: common.StolonName(db.UID)}
				if err = pgm.WriteRecoveryConf(p.createRecoveryParameters(standbySettings, nil)); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
				if err = pgm.Restart(true); err != nil {
					log.Error("err", zap.Error(err))
					return
				}
			}
		case common.RoleUndefined:
			if err = p.resync(db, followedDB, false, started); err != nil {
				log.Error("failed to full resync from followed instance", zap.Error(err))
				return
			}
			if err = pgm.Start(); err != nil {
				log.Error("err", zap.Error(err))
				return
			}
			started = true
		}
	case common.RoleUndefined:
		log.Info("our db requested role is none")
		return
	}

	// update pg parameters
	pgParameters = p.createPGParameters(db)

	// Log synchronous replication changes
	prevSyncStandbyNames := prevPGParameters["synchronous_standby_names"]
	syncStandbyNames := pgParameters["synchronous_standby_names"]
	if db.Spec.SynchronousReplication {
		if prevSyncStandbyNames != syncStandbyNames {
			log.Info("needed synchronous_standby_names changed", zap.String("prevSyncStandbyNames", prevSyncStandbyNames), zap.String("syncStandbyNames", syncStandbyNames))
		}
	} else {
		if prevSyncStandbyNames != "" {
			log.Info("sync replication disabled, removing current synchronous_standby_names", zap.String("syncStandbyNames", prevSyncStandbyNames))
		}
	}

	if !pgParameters.Equals(prevPGParameters) {
		log.Info("postgres parameters changed, reloading postgres instance")
		pgm.SetParameters(pgParameters)
		if err := pgm.Reload(); err != nil {
			log.Error("failed to reload postgres instance", zap.Error(err))
		}
	} else {
		// for tests
		log.Info("postgres parameters not changed")
	}

	// If we are here, then all went well and we can update the db generation and save it locally
	p.localStateMutex.Lock()
	dbls.Generation = db.Generation
	dbls.Initializing = false
	p.localStateMutex.Unlock()
	if err := p.saveDBLocalState(); err != nil {
		log.Error("err", zap.Error(err))
		return
	}
}
func (p *PostgresKeeper) Start() {
	endSMCh := make(chan struct{})
	endPgStatecheckerCh := make(chan struct{})
	endUpdateKeeperInfo := make(chan struct{})

	var err error
	var cd *cluster.ClusterData
	cd, _, err = p.e.GetClusterData()
	if err != nil {
		log.Error("error retrieving cluster data", zap.Error(err))
	} else if cd != nil {
		if cd.FormatVersion != cluster.CurrentCDFormatVersion {
			log.Error("unsupported clusterdata format version", zap.Uint64("version", cd.FormatVersion))
		} else if cd.Cluster != nil {
			p.sleepInterval = cd.Cluster.Spec.SleepInterval.Duration
			p.requestTimeout = cd.Cluster.Spec.RequestTimeout.Duration
		}
	}

	log.Debug("cd dump", zap.String("cd", spew.Sdump(cd)))

	// TODO(sgotti) reconfigure the various configurations options
	// (RequestTimeout) after a changed cluster config
	pgParameters := make(common.Parameters)
	pgm := postgresql.NewManager(p.pgBinPath, p.dataDir, pgParameters, p.getLocalConnParams(), p.getOurReplConnParams(), p.pgSUUsername, p.pgSUPassword, p.pgReplUsername, p.pgReplPassword, p.requestTimeout)
	p.pgm = pgm

	p.pgm.Stop(true)

	ctx, cancel := context.WithCancel(context.Background())
	smTimerCh := time.NewTimer(0).C
	updatePGStateTimerCh := time.NewTimer(0).C
	updateKeeperInfoTimerCh := time.NewTimer(0).C
	for true {
		select {
		case <-p.stop:
			log.Debug("stopping stolon keeper")
			cancel()
			p.pgm.Stop(true)
			p.end <- nil
			return

		case <-smTimerCh:
			go func() {
				p.postgresKeeperSM(ctx)
				endSMCh <- struct{}{}
			}()
		case <-endSMCh:
			smTimerCh = time.NewTimer(p.sleepInterval).C

		case <-updatePGStateTimerCh:
			// updateKeeperInfo two times faster than the sleep interval
			go func() {
				p.updatePGState(ctx)
				endPgStatecheckerCh <- struct{}{}
			}()
		case <-endPgStatecheckerCh:
			// updateKeeperInfo two times faster than the sleep interval
			updatePGStateTimerCh = time.NewTimer(p.sleepInterval / 2).C

		case <-updateKeeperInfoTimerCh:
			go func() {
				if err := p.updateKeeperInfo(); err != nil {
					log.Error("failed to update keeper info", zap.Error(err))
				}
				endUpdateKeeperInfo <- struct{}{}
			}()
		case <-endUpdateKeeperInfo:
			updateKeeperInfoTimerCh = time.NewTimer(p.sleepInterval).C
		}
	}
}
func (p *PostgresKeeper) GetPGState(pctx context.Context) (*cluster.PostgresState, error) {
	p.getPGStateMutex.Lock()
	defer p.getPGStateMutex.Unlock()
	// Just get one pgstate at a time to avoid exhausting available connections
	pgState := &cluster.PostgresState{}

	p.localStateMutex.Lock()
	pgState.UID = p.dbLocalState.UID
	pgState.Generation = p.dbLocalState.Generation
	p.localStateMutex.Unlock()

	pgState.ListenAddress = p.pgListenAddress
	pgState.Port = p.pgPort

	initialized, err := p.pgm.IsInitialized()
	if err != nil {
		return nil, err
	}
	if initialized {
		pgParameters, err := p.pgm.GetConfigFilePGParameters()
		if err != nil {
			log.Error("cannot get configured pg parameters", zap.Error(err))
			return pgState, nil
		}
		log.Debug("got configured pg parameters", zap.Object("pgParameters", pgParameters))
		filteredPGParameters := common.Parameters{}
		for k, v := range pgParameters {
			if !util.StringInSlice(managedPGParameters, k) {
				filteredPGParameters[k] = v
			}
		}
		log.Debug("filtered out managed pg parameters", zap.Object("filteredPGParameters", filteredPGParameters))
		pgState.PGParameters = filteredPGParameters

		sd, err := p.pgm.GetSystemData()
		if err != nil {
			log.Error("error getting pg state", zap.Error(err))
			return pgState, nil
		}
		pgState.SystemID = sd.SystemID
		pgState.TimelineID = sd.TimelineID
		pgState.XLogPos = sd.XLogPos

		// if timeline <= 1 then no timeline history file exists.
		pgState.TimelinesHistory = cluster.PostgresTimelinesHistory{}
		if pgState.TimelineID > 1 {
			tlsh, err := p.pgm.GetTimelinesHistory(pgState.TimelineID)
			if err != nil {
				log.Error("error getting timeline history", zap.Error(err))
				return pgState, nil
			}
			ctlsh := cluster.PostgresTimelinesHistory{}
			for _, tlh := range tlsh {
				ctlh := &cluster.PostgresTimelineHistory{
					TimelineID:  tlh.TimelineID,
					SwitchPoint: tlh.SwitchPoint,
					Reason:      tlh.Reason,
				}
				ctlsh = append(ctlsh, ctlh)
			}
			pgState.TimelinesHistory = ctlsh
		}
		pgState.Healthy = true
	}

	return pgState, nil
}