func (zom *zookeeperOffsetManager) FinalizePartition(topic string, partition int32, lastOffset int64, timeout time.Duration, replicaId int, logger zap.Logger) error {
	zom.l.RLock()
	tracker := zom.offsets[topic][partition]
	zom.l.RUnlock()

	if lastOffset >= 0 {
		if lastOffset-tracker.highestProcessedOffset > 0 {
			logger.Info("ZOOKEEPER: Finalizing partition. Waiting before processing remaining messages",
				zap.Int("replicaId", replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
				zap.Int64("lastProcessedOffset", tracker.highestProcessedOffset),
				zap.Duration("waitingTimeToProcessMoreMessages", timeout/time.Second),
				zap.Int64("numMessagesToProcess", lastOffset-tracker.highestProcessedOffset),
			)
			if !tracker.waitForOffset(lastOffset, timeout) {
				return fmt.Errorf("REP %d - TIMEOUT waiting for offset %d. Last committed offset: %d", replicaId, lastOffset, tracker.lastCommittedOffset)
			}
		}

		if err := zom.commitOffset(topic, partition, tracker, logger); err != nil {
			return fmt.Errorf("REP %d - FAILED to commit offset %d to Zookeeper. Last committed offset: %d", replicaId, tracker.highestProcessedOffset, tracker.lastCommittedOffset)
		}
	}

	zom.l.Lock()
	delete(zom.offsets[topic], partition)
	zom.l.Unlock()

	return nil
}
// Stop gracefully closes the connection - it waits for the sending queue to clear, and then shuts down.
func (conn *Connection) Stop(logger zap.Logger) chan bool {
	logger.Info("APNS: Shutting down one of the connections",
		zap.Int("connectionId", conn.id),
	)
	conn.stopping <- true
	return conn.stopped
	//Thought: Don't necessarily need a channel here. Could signal finishing by closing errors?
}
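// Example usage (a minimal sketch, not part of the original API): a caller that
// initiates shutdown and blocks until the connection signals it has drained.
// The helper name stopAndWait is hypothetical; it relies only on Stop above.
func stopAndWait(conn *Connection, logger zap.Logger) {
	done := conn.Stop(logger) // Stop returns conn.stopped
	<-done                    // limbo signals this channel once everything has flushed
}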
func (cg *ConsumerGroup) reload(logger zap.Logger) error {
	cg.reloadMutex.Lock()
	defer cg.reloadMutex.Unlock()
	cg.singleReload.Do(func() {
		logger.Info("KAFKA: Closing down old connections for replica",
			zap.Int("replicaId", cg.replicaId),
		)
		err := cg.Close(logger)
		if err != nil {
			logger.Error("KAFKA: Failed to close consumergroup for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}
		cg.Load(logger)
	})
	return nil
}
func (c *Connection) spinUntilReconnect(logger zap.Logger) {
	var backoff = time.Duration(100)
	for {
		logger.Info("APNS: Connection lost. Reconnecting",
			zap.Int("connectionId", c.id),
		)
		err := c.connect(logger)
		if err != nil {
			//Exponential backoff up to a limit
			logger.Info("APNS: Error connecting to server",
				zap.Int("connectionId", c.id),
				zap.Error(err),
			)
			backoff = backoff * 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
			time.Sleep(backoff)
		} else {
			backoff = 100
			logger.Info("APNS: New connection established",
				zap.Int("connectionId", c.id),
			)
			break
		}
	}
}
func (conn *Connection) reader(responses chan<- Response, logger zap.Logger) {
	buffer := make([]byte, 6)
	for {
		n, err := conn.conn.Read(buffer)
		if err != nil && n < 6 {
			logger.Info("APNS: Connection error before reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Int("n", n),
				zap.Error(err),
			)
			conn.shouldReconnect <- true
			return
		} else if err != nil {
			logger.Info("APNS: Connection error after reading complete response",
				zap.Int("connectionId", conn.id),
				zap.Error(err),
			)
		}
		command := uint8(buffer[0])
		if command != 8 {
			logger.Info("APNS: Unexpected command value in response - expected 8",
				zap.Int("connectionId", conn.id),
				zap.Object("commandValue", command),
			)
		}
		resp := newResponse()
		resp.Identifier = binary.BigEndian.Uint32(buffer[2:6])
		resp.Status = uint8(buffer[1])
		responses <- resp
		conn.shouldReconnect <- true
		return
	}
}
func (cg *ConsumerGroup) Close(logger zap.Logger) error {
	shutdownError := AlreadyClosing
	cg.singleShutdown.Do(func() {
		defer cg.kazoo.Close()

		shutdownError = nil

		close(cg.stopper)
		cg.wg.Wait()

		if err := cg.offsetManager.Close(logger); err != nil {
			logger.Error("KAFKA: FAILED closing the offset manager for replica!",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
		}

		if shutdownError = cg.instance.Deregister(); shutdownError != nil {
			logger.Warn("KAFKA: Replica FAILED deregistering consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		} else {
			logger.Info("KAFKA: Replica deregistered consumer instance",
				zap.Int("replicaId", cg.replicaId),
				zap.String("instanceId", cg.instance.ID),
			)
		}

		if shutdownError = cg.consumer.Close(); shutdownError != nil {
			logger.Error("KAFKA: Replica FAILED closing the Sarama client",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(shutdownError),
			)
		}

		close(cg.messages)
		close(cg.errors)
		cg.instance = nil
	})

	return shutdownError
}
func (cg *ConsumerGroup) topicListConsumer(topics []string, logger zap.Logger) {
	for {
		select {
		case <-cg.stopper:
			return
		default:
		}

		consumers, consumerChanges, err := cg.group.WatchInstances()
		if err != nil {
			logger.Fatal("KAFKA: FAILED to get list of registered consumer instances for replica",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			return
		}

		cg.consumers = consumers
		logger.Info("KAFKA: Got currently registered consumers for replica",
			zap.Int("replicaId", cg.replicaId),
			zap.Int("numRegisteredConsumers", len(cg.consumers)),
		)

		stopper := make(chan struct{})

		for _, topic := range topics {
			cg.wg.Add(1)
			go cg.topicConsumer(topic, cg.messages, cg.errors, stopper, logger)
		}

		select {
		case <-cg.stopper:
			close(stopper)
			return

		case event := <-consumerChanges:
			if event.Err == zk.ErrSessionExpired || event.Err == zk.ErrConnectionClosed {
				logger.Info("KAFKA: Session expired, reloading consumer for replica",
					zap.Int("replicaId", cg.replicaId),
				)
				go cg.reload(logger)
				<-cg.stopper
				close(stopper)
				return
			} else {
				logger.Info("KAFKA: Triggering rebalance due to consumer list change in replica",
					zap.Int("replicaId", cg.replicaId),
				)
				close(stopper)
				cg.wg.Wait()
			}
		}
	}
}
// Consumes a partition
func (cg *ConsumerGroup) partitionConsumer(topic string, partition int32, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, wg *sync.WaitGroup, stopper <-chan struct{}, logger zap.Logger) {
	defer wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	for maxRetries, tries := 3, 0; tries < maxRetries; tries++ {
		if err := cg.instance.ClaimPartition(topic, partition); err == nil {
			break
		} else if err == kazoo.ErrPartitionClaimedByOther && tries+1 < maxRetries {
			time.Sleep(1 * time.Second)
		} else {
			logger.Warn("KAFKA: Replica FAILED to claim partition",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
				zap.Error(err),
			)
			return
		}
	}
	defer cg.instance.ReleasePartition(topic, partition)

	nextOffset, err := cg.offsetManager.InitializePartition(topic, partition)
	if err != nil {
		logger.Error("KAFKA: Replica FAILED to determine initial offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}

	if nextOffset >= 0 {
		logger.Info("KAFKA: Replica partition consumer starting at offset",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Int64("nextOffset", nextOffset),
		)
	} else {
		nextOffset = cg.config.Offsets.Initial
		if nextOffset == sarama.OffsetOldest {
			logger.Info("KAFKA: Replica partition consumer starting at the oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else if nextOffset == sarama.OffsetNewest {
			logger.Info("KAFKA: Replica partition consumer listening for new messages only",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
	}

	consumer, err := cg.consumer.ConsumePartition(topic, partition, nextOffset)
	if err == sarama.ErrOffsetOutOfRange {
		logger.Warn("KAFKA: Replica partition consumer offset out of range",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
		)
		// If the offset is out of range, simplistically decide whether to use OffsetNewest or OffsetOldest:
		// if the configuration specified OffsetOldest, switch to the oldest available offset, else
		// switch to the newest available offset.
		if cg.config.Offsets.Initial == sarama.OffsetOldest {
			nextOffset = sarama.OffsetOldest
			logger.Info("KAFKA: Replica partition consumer offset reset to oldest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		} else {
			nextOffset = sarama.OffsetNewest
			logger.Info("KAFKA: Replica partition consumer offset reset to newest available offset",
				zap.Int("replicaId", cg.replicaId),
				zap.String("topic", topic),
				zap.Int64("partition", int64(partition)),
			)
		}
		// Retry ConsumePartition with the adjusted offset.
		consumer, err = cg.consumer.ConsumePartition(topic, partition, nextOffset)
	}
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to start partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
		return
	}
	defer consumer.Close()

	err = nil
	var lastOffset int64 = -1 // aka unknown

partitionConsumerLoop:
	for {
		select {
		case <-stopper:
			break partitionConsumerLoop

		case err := <-consumer.Errors():
			for {
				select {
				case errors <- err:
					continue partitionConsumerLoop
				case <-stopper:
					break partitionConsumerLoop
				}
			}

		case message := <-consumer.Messages():
			for {
				select {
				case <-stopper:
					break partitionConsumerLoop
				case messages <- message:
					lastOffset = message.Offset
					continue partitionConsumerLoop
				}
			}
		}
	}

	logger.Info("KAFKA: Replica is stopping partition consumer at offset",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
		zap.Int64("lastOffset", lastOffset),
	)

	if err = cg.offsetManager.FinalizePartition(topic, partition, lastOffset, cg.config.Offsets.ProcessingTimeout, cg.replicaId, logger); err != nil {
		logger.Fatal("KAFKA: Replica error trying to stop partition consumer",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Int64("partition", int64(partition)),
			zap.Error(err),
		)
	}

	logger.Info("KAFKA: Replica successfully stopped partition",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int64("partition", int64(partition)),
	)
}
func (cg *ConsumerGroup) topicConsumer(topic string, messages chan<- *sarama.ConsumerMessage, errors chan<- *sarama.ConsumerError, stopper <-chan struct{}, logger zap.Logger) {
	defer cg.wg.Done()

	select {
	case <-stopper:
		return
	default:
	}

	logger.Info("KAFKA: Replica started consumer for topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)

	// Fetch a list of partition IDs
	partitions, err := cg.kazoo.Topic(topic).Partitions()
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get list of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	partitionLeaders, err := retrievePartitionLeaders(partitions)
	if err != nil {
		logger.Fatal("KAFKA: Replica FAILED to get leaders of partitions for topic",
			zap.Int("replicaId", cg.replicaId),
			zap.String("topic", topic),
			zap.Error(err),
		)
		cg.errors <- &sarama.ConsumerError{
			Topic:     topic,
			Partition: -1,
			Err:       err,
		}
		return
	}

	dividedPartitions := dividePartitionsBetweenConsumers(cg.consumers, partitionLeaders)
	myPartitions := dividedPartitions[cg.instance.ID]

	logger.Info("KAFKA: Replica is claiming partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
		zap.Int("claimedPartitions", len(myPartitions)),
		zap.Int("numPartitionLeaders", len(partitionLeaders)),
	)

	// Consume all the assigned partitions
	var wg sync.WaitGroup
	myPartitionsStr := ""
	for _, pid := range myPartitions {
		myPartitionsStr += fmt.Sprintf("%d ", pid.ID)
		wg.Add(1)
		go cg.partitionConsumer(topic, pid.ID, messages, errors, &wg, stopper, logger)
	}
	logger.Info("KAFKA: Retrieved replica's partitions",
		zap.Int("replicaId", cg.replicaId),
		zap.String("myPartitions", myPartitionsStr),
	)

	wg.Wait()

	logger.Info("KAFKA: Replica stopped consumer of a topic",
		zap.Int("replicaId", cg.replicaId),
		zap.String("topic", topic),
	)
}
func (cg *ConsumerGroup) Load(logger zap.Logger) error {
	var kz *kazoo.Kazoo
	var err error
	if kz, err = kazoo.NewKazoo(cg.zookeeper, cg.config.Zookeeper); err != nil {
		return err
	}

	logger.Info("KAFKA: Getting broker list for replica",
		zap.Int("replicaId", cg.replicaId),
	)
	brokers, err := kz.BrokerList()
	if err != nil {
		kz.Close()
		return err
	}

	group := kz.Consumergroup(cg.config.ClientID)
	instance := group.NewInstance()

	var consumer sarama.Consumer
	if consumer, err = sarama.NewConsumer(brokers, cg.config.Config); err != nil {
		kz.Close()
		return err
	}

	cg.kazoo = kz
	cg.group = group
	cg.instance = instance
	cg.messages = make(chan *sarama.ConsumerMessage, cg.config.ChannelBufferSize)
	cg.consumer = consumer
	cg.singleShutdown = sync.Once{}
	cg.errors = make(chan *sarama.ConsumerError, cg.config.ChannelBufferSize)
	cg.stopper = make(chan struct{})

	if exists, err := cg.group.Exists(); err != nil {
		logger.Fatal("KAFKA: Replica failed to check existence of consumergroup",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		consumer.Close()
		kz.Close()
		return err
	} else if !exists {
		logger.Info("KAFKA: Consumergroup does not exist, creating it",
			zap.Int("replicaId", cg.replicaId),
			zap.String("consumerGroupName", cg.group.Name),
		)
		if err := cg.group.Create(); err != nil {
			logger.Fatal("KAFKA: Failed to create consumergroup in Zookeeper",
				zap.Int("replicaId", cg.replicaId),
				zap.Error(err),
			)
			consumer.Close()
			kz.Close()
			return err
		}
	}

	if err := cg.instance.Register(cg.topics); err != nil {
		logger.Fatal("KAFKA: Failed to create consumer instance",
			zap.Int("replicaId", cg.replicaId),
			zap.Error(err),
		)
		return err
	} else {
		logger.Info("KAFKA: Consumer instance registered",
			zap.Int("replicaId", cg.replicaId),
		)
	}

	offsetConfig := OffsetManagerConfig{
		CommitInterval:   cg.config.Offsets.CommitInterval,
		EnableAutoCommit: cg.config.EnableOffsetAutoCommit,
	}
	cg.offsetManager = NewZookeeperOffsetManager(cg, &offsetConfig, logger)

	go cg.topicListConsumer(cg.topics, logger)

	return nil
}
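// Example usage (a minimal same-package sketch, not part of the original code):
// after a successful Load, drain the message and error channels created above.
// The helper name consumeLoop is hypothetical; a real consumer would also track
// processed offsets and coordinate shutdown via Close.
func consumeLoop(cg *ConsumerGroup, logger zap.Logger) {
	for {
		select {
		case msg, ok := <-cg.messages:
			if !ok {
				return // channel closed by Close()
			}
			logger.Info("KAFKA: Received message",
				zap.String("topic", msg.Topic),
				zap.Int64("partition", int64(msg.Partition)),
				zap.Int64("offset", msg.Offset),
			)
		case consumerErr, ok := <-cg.errors:
			if !ok {
				return // channel closed by Close()
			}
			logger.Warn("KAFKA: Consumer error",
				zap.String("topic", consumerErr.Topic),
				zap.Error(consumerErr.Err),
			)
		}
	}
}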
func (conn *Connection) limbo(sent <-chan PushNotification, responses chan Response, errors chan BadPushNotification, queue chan PushNotification, logger zap.Logger) {
	stopping := false
	limbo := make([]timedPushNotification, 0, SentBufferSize)
	ticker := time.NewTicker(1 * time.Second)
	for {
		select {
		case pn, ok := <-sent:
			if !ok {
				logger.Info("APNS: Connection limbo - sent is closed, so sender is done. So am I, then",
					zap.Int("connectionId", conn.id),
				)
				close(errors)
				conn.stopped <- true
				return
			}
			limbo = append(limbo, pn.timed())
			stopping = false
		case <-conn.senderFinished:
			//senderFinished means the sender thinks it's done.
			//However, sender might not be - limbo could resend some, if there are any left here.
			//So we just take note of this until limbo is empty too.
			stopping = true
		case resp, ok := <-responses:
			if !ok {
				//If the responses channel is closed,
				//that means we're shutting down the connection.
			}
			for i, pn := range limbo {
				if pn.Identifier == resp.Identifier {
					if resp.Status != 10 {
						//It was an error, we should report this on the error channel
						bad := BadPushNotification{PushNotification: pn.PushNotification, Status: resp.Status}
						errors <- bad
					}
					if len(limbo) > i {
						toRequeue := len(limbo) - (i + 1)
						if toRequeue > 0 {
							conn.requeue(limbo[i+1:])
							//We resent some notifications: that means we should wait for sender to tell us it's done, again.
							stopping = false
						}
					}
				}
			}
			limbo = make([]timedPushNotification, 0, SentBufferSize)
		case <-ticker.C:
			flushed := false
			for i := range limbo {
				if limbo[i].After(time.Now().Add(-TimeoutSeconds * time.Second)) {
					//Found the first notification still within the timeout window:
					//drop any expired prefix and keep the rest.
					if i > 0 {
						newLimbo := make([]timedPushNotification, len(limbo[i:]), SentBufferSize)
						copy(newLimbo, limbo[i:])
						limbo = newLimbo
					}
					flushed = true
					break
				}
			}
			if !flushed {
				limbo = make([]timedPushNotification, 0, SentBufferSize)
			}
			if stopping && len(limbo) == 0 {
				//sender() is finished and so is limbo - so the connection is done.
				logger.Info("APNS: Connection limbo - I've flushed all my notifications. Tell sender I'm done",
					zap.Int("connectionId", conn.id),
				)
				conn.ackFinished <- true
			}
		}
	}
}
func (conn *Connection) sender(queue <-chan PushNotification, sent chan PushNotification, logger zap.Logger) {
	i := 0
	stopping := false
	defer conn.conn.Close()
	defer conn.connAux.Close()
	logger.Info("APNS: Starting sender for connection",
		zap.Int("connectionId", conn.id),
	)
	for {
		select {
		case pn, ok := <-conn.queue:
			if !ok {
				logger.Info("APNS: Connection not okay; queue closed",
					zap.Int("connectionId", conn.id),
				)
				//That means the Connection is stopped
				//close sent?
				return
			}
			//This means we saw a response; connection is over.
			select {
			case <-conn.shouldReconnect:
				conn.conn.Close()
				conn.conn = nil
				conn.connAux.Close()
				conn.connAux = nil
				conn.spinUntilReconnect(logger)
			default:
			}
			//Then send the push notification
			pn.Priority = 10
			payload, err := pn.ToBytes()
			if err != nil {
				logger.Info("APNS: Connection Error",
					zap.Int("connectionId", conn.id),
					zap.Error(err),
				)
				//Should report this on the bad notifications channel probably
			} else {
				if conn.conn == nil {
					conn.spinUntilReconnect(logger)
				}
				_, err = conn.conn.Write(payload)
				if err != nil {
					logger.Info("APNS: Error writing payload",
						zap.Error(err),
					)
					go func() {
						conn.shouldReconnect <- true
					}()
					//Disconnect?
				} else {
					i++
					sent <- pn
					if stopping && len(queue) == 0 {
						conn.senderFinished <- true
					}
				}
			}
		case <-conn.stopping:
			logger.Info("APNS: Connection sender - Got a stop message",
				zap.Int("connectionId", conn.id),
			)
			stopping = true
			if len(queue) == 0 {
				logger.Info("APNS: Connection sender - Stopping because ran out of things to send. Let's see if limbo is empty",
					zap.Int("connectionId", conn.id),
				)
				conn.senderFinished <- true
			}
		case <-conn.ackFinished:
			logger.Info("APNS: Connection sender - limbo is empty",
				zap.Int("connectionId", conn.id),
			)
			if len(queue) == 0 {
				logger.Info("APNS: Connection sender - limbo is empty and so am I",
					zap.Int("connectionId", conn.id),
				)
				close(sent)
				return
			}
		}
	}
}
func (conn_pool *ConnectionPool) Start(logger zap.Logger) {
	logger.Info("APNS: Starting pool of connections")
	go conn_pool.sendLoop()
}