// topicListConsumer watches the set of registered consumer instances and runs one
// topicConsumer per topic, rebalancing whenever the instance list changes and
// reloading the consumer when the Zookeeper session expires.
func (cg *ConsumerGroup) topicListConsumer(topics []string, logger zap.Logger) {
	for {
		select {
		case <-cg.stopper:
			return
		default:
		}

		consumers, consumerChanges, err := cg.group.WatchInstances()
		if err != nil {
			logger.Fatal("KAFKA: FAILED to get list of registered consumer instances for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			return
		}

		cg.consumers = consumers
		logger.Info("KAFKA: Got currently registered consumers for replica",
			zap.Int("replicaId", cg.replicaId),
			zap.Int("numRegisteredConsumers", len(cg.consumers)),
		)

		stopper := make(chan struct{})

		for _, topic := range topics {
			cg.wg.Add(1)
			go cg.topicConsumer(topic, cg.messages, cg.errors, stopper, logger)
		}

		select {
		case <-cg.stopper:
			close(stopper)
			return

		case event := <-consumerChanges:
			if event.Err == zk.ErrSessionExpired || event.Err == zk.ErrConnectionClosed {
				logger.Info("KAFKA: Session was expired, reloading consumer for replica",
					zap.Int("replicaId", cg.replicaId),
				)
				go cg.reload(logger)
				<-cg.stopper
				close(stopper)
				return
			} else {
				logger.Info("KAFKA: Triggering rebalance due to consumer list change in replica",
					zap.Int("replicaId", cg.replicaId),
				)
				close(stopper)
				cg.wg.Wait()
			}
		}
	}
}
// Start initiates a connection to APNS and asynchronously sends notifications which have been queued.
func (conn *Connection) Start(logger zap.Logger) error {
	// Connect to APNS. The reason this is here as well as in sender is that this catches any
	// unavoidable errors in a synchronous fashion, while in sender it can reconnect after
	// temporary errors (which should work most of the time).
	err := conn.connect(logger)
	if err != nil {
		logger.Fatal("APNS: Failed to connect",
			zap.Int("connectionId", conn.id),
			zap.Error(err),
		)
		return err
	}

	// Start sender goroutine
	sent := make(chan PushNotification, 10000)
	go conn.sender(conn.queue, sent, logger)

	// Start limbo goroutine
	go conn.limbo(sent, conn.responses, conn.errors, conn.queue, logger)

	return nil
}
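// Usage sketch (illustration only): how a caller is expected to drive Start. It assumes a
// hypothetical NewConnection constructor that fills in the gateway, certificate settings and
// internal channels; this file does not show how Connection values are built.
func exampleStartAPNSConnection(logger zap.Logger) error {
	conn := NewConnection() // hypothetical constructor, not defined in this file
	if err := conn.Start(logger); err != nil {
		// connect failed synchronously; nothing has been queued or sent
		return err
	}
	// From here on, the sender goroutine drains the connection's queue and the limbo
	// goroutine matches APNS responses to in-flight notifications.
	return nil
}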
func (conn *Connection) connect(logger zap.Logger) error {
	if conn.conn != nil {
		conn.conn.Close()
	}
	if conn.connAux != nil {
		conn.connAux.Close()
	}

	var cert tls.Certificate
	var err error
	if len(conn.CertificateBase64) == 0 && len(conn.KeyBase64) == 0 {
		// The user did not specify raw block contents, so check the filesystem.
		cert, err = tls.LoadX509KeyPair(conn.CertificateFile, conn.KeyFile)
	} else {
		// The user provided the raw block contents, so use that.
		cert, err = tls.X509KeyPair([]byte(conn.CertificateBase64), []byte(conn.KeyBase64))
	}
	if err != nil {
		logger.Fatal("APNS: Failed to obtain certificate",
			zap.Error(err),
		)
		return err
	}

	conf := &tls.Config{
		Certificates: []tls.Certificate{cert},
		ServerName:   strings.Split(conn.Gateway, ":")[0],
	}

	connAux, err := net.Dial("tcp", conn.Gateway)
	if err != nil {
		logger.Fatal("APNS: Failed while dialing gateway",
			zap.String("gateway", conn.Gateway),
			zap.Error(err),
		)
		return err
	}

	tlsConn := tls.Client(connAux, conf)
	err = tlsConn.Handshake()
	if err != nil {
		logger.Fatal("APNS: Failed while handshaking",
			zap.Error(err),
		)
		_ = tlsConn.Close()
		return err
	}

	conn.conn = tlsConn
	conn.connAux = connAux

	// Start reader goroutine
	go conn.reader(conn.responses, logger)

	return nil
}
// partitionConsumer claims and consumes a single partition of a topic, forwarding
// messages and errors until the stopper channel is closed.
func (cg *ConsumerGroup) partitionConsumer(topic string, partition int32, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, wg *sync.WaitGroup, stopper <-chan struct{}, logger zap.Logger) {
	defer wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	for maxRetries, tries := 3, 0; tries < maxRetries; tries++ {
		if err := cg.instance.ClaimPartition(topic, partition); err == nil {
			break
		} else if err == kazoo.ErrPartitionClaimedByOther && tries+1 < maxRetries {
			time.Sleep(1 * time.Second)
		} else {
			logger.Warn("KAFKA: Replica FAILED to claim partition",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
				zap.Error(err),
			)
			return
		}
	}
	defer cg.instance.ReleasePartition(topic, partition)

	nextOffset, err := cg.offsetManager.InitializePartition(topic, partition)
	if err != nil {
		logger.Error("KAFKA: Replica FAILED to determine initial offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}

	if nextOffset >= 0 {
		logger.Info("KAFKA: Replica partition consumer starting at offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Int64("nextOffset", nextOffset),
		)
	} else {
		nextOffset = cg.config.Offsets.Initial
		if nextOffset == sarama.OffsetOldest {
			logger.Info("KAFKA: Replica partition consumer starting at the oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else if nextOffset == sarama.OffsetNewest {
			logger.Info("KAFKA: Replica partition consumer listening for new messages only",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
	}

	consumer, err := cg.consumer.ConsumePartition(topic, partition, nextOffset)
	if err == sarama.ErrOffsetOutOfRange {
		logger.Warn("KAFKA: Replica partition consumer offset out of range",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
		)
		// If the offset is out of range, simplistically decide whether to use OffsetNewest or
		// OffsetOldest: if the configuration specified OffsetOldest, switch to the oldest
		// available offset, otherwise switch to the newest available offset.
		if cg.config.Offsets.Initial == sarama.OffsetOldest {
			nextOffset = sarama.OffsetOldest
			logger.Info("KAFKA: Replica partition consumer offset reset to oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else {
			nextOffset = sarama.OffsetNewest
			logger.Info("KAFKA: Replica partition consumer offset reset to newest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
		// Retry ConsumePartition with the adjusted offset.
		consumer, err = cg.consumer.ConsumePartition(topic, partition, nextOffset)
	}
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to start partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}
	defer consumer.Close()

	err = nil
	var lastOffset int64 = -1 // aka unknown

partitionConsumerLoop:
	for {
		select {
		case <-stopper:
			break partitionConsumerLoop

		case err := <-consumer.Errors():
			for {
				select {
				case errors <- err:
					continue partitionConsumerLoop
				case <-stopper:
					break partitionConsumerLoop
				}
			}

		case message := <-consumer.Messages():
			for {
				select {
				case <-stopper:
					break partitionConsumerLoop
				case messages <- message:
					lastOffset = message.Offset
					continue partitionConsumerLoop
				}
			}
		}
	}

	logger.Info("KAFKA: Replica is stopping partition consumer at offset",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
		zap.Int64("lastOffset", lastOffset),
	)

	if err = cg.offsetManager.FinalizePartition(topic, partition, lastOffset, cg.config.Offsets.ProcessingTimeout, cg.replicaId, logger); err != nil {
		logger.Fatal("KAFKA: Replica error trying to stop partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
	}

	logger.Info("KAFKA: Replica successfully stopped partition",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
	)
}
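// The out-of-range fallback above reduces to a small pure decision: reuse the configured
// initial offset if it is OffsetOldest, otherwise fall back to OffsetNewest.
// resolveFallbackOffset is a sketch for illustration only and is not part of the original code.
func resolveFallbackOffset(configuredInitial int64) int64 {
	if configuredInitial == sarama.OffsetOldest {
		return sarama.OffsetOldest
	}
	return sarama.OffsetNewest
}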
// topicConsumer claims this replica's share of a topic's partitions and runs one
// partitionConsumer per claimed partition until the stopper channel is closed.
func (cg *ConsumerGroup) topicConsumer(topic string, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, stopper <-chan struct{}, logger zap.Logger) {
	defer cg.wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	logger.Info("KAFKA: Replica started consumer for topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)

	// Fetch a list of partition IDs
	partitions, err := cg.kazoo.Topic(topic).Partitions()
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get list of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	partitionLeaders, err := retrievePartitionLeaders(partitions)
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get leaders of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	dividedPartitions := dividePartitionsBetweenConsumers(cg.consumers, partitionLeaders)
	myPartitions := dividedPartitions[cg.instance.ID]

	logger.Info("KAFKA: Replica is claiming partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int("claimedPartitions", len(myPartitions)),
		zap.Int("numPartitionLeaders", len(partitionLeaders)),
	)

	// Consume all the assigned partitions
	var wg sync.WaitGroup
	myPartitionsStr := ""
	for _, pid := range myPartitions {
		myPartitionsStr += fmt.Sprintf("%d ", pid.ID)
		wg.Add(1)
		go cg.partitionConsumer(topic, pid.ID, messages, errors, &wg, stopper, logger)
	}
	logger.Info("KAFKA: Retrieved replica's partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("myPartitions", myPartitionsStr),
	)

	wg.Wait()

	logger.Info("KAFKA: Replica stopped consumer for topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)
}
// Load connects to Zookeeper and the Kafka brokers, registers this consumer instance with
// the consumergroup (creating the group if needed), and starts consuming the configured topics.
func (cg *ConsumerGroup) Load(logger zap.Logger) error {
	var kz *kazoo.Kazoo
	var err error
	if kz, err = kazoo.NewKazoo(cg.zookeeper, cg.config.Zookeeper); err != nil {
		return err
	}

	logger.Info("KAFKA: Getting broker list for replica",
		zap.Int("replicaId", cg.replicaId),
	)
	brokers, err := kz.BrokerList()
	if err != nil {
		kz.Close()
		return err
	}

	group := kz.Consumergroup(cg.config.ClientID)
	instance := group.NewInstance()

	var consumer sarama.Consumer
	if consumer, err = sarama.NewConsumer(brokers, cg.config.Config); err != nil {
		kz.Close()
		return err
	}

	cg.kazoo = kz
	cg.group = group
	cg.instance = instance
	cg.messages = make(chan *sarama.ConsumerMessage, cg.config.ChannelBufferSize)
	cg.consumer = consumer
	cg.singleShutdown = sync.Once{}
	cg.errors = make(chan *sarama.ConsumerError, cg.config.ChannelBufferSize)
	cg.stopper = make(chan struct{})

	if exists, err := cg.group.Exists(); err != nil {
		logger.Fatal("KAFKA: Replica failed to check existence of consumergroup",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		consumer.Close()
		kz.Close()
		return err
	} else if !exists {
		logger.Info("KAFKA: Consumergroup does not exist, creating it",
			zap.Int("replicaId", cg.replicaId),
			zap.String("consumerGroupName", cg.group.Name),
		)
		if err := cg.group.Create(); err != nil {
			logger.Fatal("KAFKA: Failed to create consumergroup in Zookeeper",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			consumer.Close()
			kz.Close()
			return err
		}
	}

	if err := cg.instance.Register(cg.topics); err != nil {
		logger.Fatal("KAFKA: Failed to create consumer instance",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		return err
	}
	logger.Info("KAFKA: Consumer instance registered",
		zap.Int("replicaId", cg.replicaId),
	)

	offsetConfig := OffsetManagerConfig{
		CommitInterval:   cg.config.Offsets.CommitInterval,
		EnableAutoCommit: cg.config.EnableOffsetAutoCommit,
	}
	cg.offsetManager = NewZookeeperOffsetManager(cg, &offsetConfig, logger)

	go cg.topicListConsumer(cg.topics, logger)

	return nil
}
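// Usage sketch (illustration only): the expected lifecycle around Load. It assumes the
// ConsumerGroup was constructed elsewhere with its zookeeper nodes, topics and config
// populated (no constructor is shown here), and it reads cg.messages directly, which is
// only possible from inside this package.
func exampleLoadAndConsume(cg *ConsumerGroup, logger zap.Logger) error {
	if err := cg.Load(logger); err != nil {
		return err
	}
	// Messages from all claimed partitions are funneled into cg.messages by the
	// partition consumers started via topicListConsumer.
	for msg := range cg.messages {
		logger.Info("KAFKA: Consumed message",
			zap.String("topic", msg.Topic),
			zap.Int64("partition", int64(msg.Partition)),
			zap.Int64("offset", msg.Offset),
		)
	}
	return nil
}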