func ExampleBroker_Consumer() {
	broker := NewBroker()
	msg := &proto.Message{Value: []byte("first")}

	// mock server actions, pushing data through the consumer
	go func() {
		consumer, _ := broker.Consumer(kafka.NewConsumerConf("my-topic", 0))
		c := consumer.(*Consumer)
		// it is possible to send messages through the consumer...
		c.Messages <- msg

		// every consumer fetch call blocks until there is either a message
		// or an error ready to return; this way we can test slow consumers
		time.Sleep(time.Millisecond * 20)

		// ...as well as push errors to mock failure
		c.Errors <- errors.New("expected error is expected")
	}()

	// the test broker never fails creating a consumer
	consumer, _ := broker.Consumer(kafka.NewConsumerConf("my-topic", 0))

	m, err := consumer.Consume()
	if err == nil {
		fmt.Printf("Value: %q\n", m.Value)
	}
	if _, err = consumer.Consume(); err != nil {
		fmt.Printf("Error: %s\n", err)
	}

	// output:
	//
	// Value: "first"
	// Error: expected error is expected
}
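// Because Consume blocks until the mock has a message or an error queued, a
// test that needs to bound the wait can wrap the call itself. A minimal,
// hypothetical helper sketch -- consumeWithTimeout is not part of the package;
// it assumes only the Consume() (*proto.Message, error) shape used above and
// the "errors"/"time" imports already present in this file.
func consumeWithTimeout(c kafka.Consumer, d time.Duration) (*proto.Message, error) {
	type result struct {
		msg *proto.Message
		err error
	}
	done := make(chan result, 1)
	// run the blocking call in a goroutine so we can race it against a timer
	go func() {
		m, err := c.Consume()
		done <- result{m, err}
	}()
	select {
	case r := <-done:
		return r.msg, r.err
	case <-time.After(d):
		// the goroutine above is leaked on timeout, which is usually
		// acceptable in a short-lived test process
		return nil, errors.New("consume timed out")
	}
}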
// setup is the initial worker that initializes the claim structure. Until this is done,
// our internal state is inconsistent.
func (c *claim) setup() {
	c.lock.Lock()
	defer c.lock.Unlock()

	// If the current offset has fallen behind the earliest available offset (the log
	// has been truncated past our position), we must fast-forward to the earliest known
	if c.offsets.Current < c.offsets.Earliest {
		log.Warningf("[%s:%d] consumer fast-forwarding from %d to %d",
			c.topic, c.partID, c.offsets.Current, c.offsets.Earliest)
		c.offsets.Current = c.offsets.Earliest
	}

	// Since it's claimed, we now want to heartbeat with the last seen offset
	err := c.marshal.Heartbeat(c.topic, c.partID, c.offsets.Current)
	if err != nil {
		log.Errorf("[%s:%d] consumer failed to heartbeat: %s", c.topic, c.partID, err)
		go c.Release()
		return
	}
	c.lastHeartbeat = time.Now().Unix()

	// Set up the Kafka consumer
	consumerConf := kafka.NewConsumerConf(c.topic, int32(c.partID))
	consumerConf.StartOffset = c.offsets.Current
	consumerConf.MaxFetchSize = c.marshal.cluster.options.MaxMessageSize
	consumerConf.RequestTimeout = c.marshal.cluster.options.ConsumeRequestTimeout
	// Do not retry. If we get back no data, we'll do our own retries.
	consumerConf.RetryLimit = 0

	kafkaConsumer, err := c.marshal.cluster.broker.Consumer(consumerConf)
	if err != nil {
		log.Errorf("[%s:%d] consumer failed to create Kafka Consumer: %s",
			c.topic, c.partID, err)
		go c.Release()
		return
	}
	c.kafkaConsumer = kafkaConsumer

	// Start the maintenance goroutine that keeps this system healthy
	go c.messagePump()

	// Totally done, let the world know and move on
	log.Infof("[%s:%d] consumer %s claimed at offset %d (is %d behind)",
		c.topic, c.partID, c.marshal.clientID, c.offsets.Current,
		c.offsets.Latest-c.offsets.Current)
}
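// The fast-forward branch above is setup's core invariant: a committed offset
// that has fallen out of Kafka's retention window must snap forward to the
// earliest retained offset. A standalone sketch of just that rule (the
// function name is hypothetical, not part of the package):
//
//	resolveStartOffset(100, 250) == 250 // log truncated past us, fast-forward
//	resolveStartOffset(300, 250) == 300 // committed offset still valid
func resolveStartOffset(current, earliest int64) int64 {
	if current < earliest {
		// the log has been truncated past our position; resume at the
		// earliest offset Kafka still retains
		return earliest
	}
	// our committed offset is still retained; resume exactly where we left off
	return current
}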
func TestProducerBrokenConnection(t *testing.T) {
	IntegrationTest(t)

	topics := []string{"Topic3", "Topic4"}

	cluster := NewKafkaCluster("kafka-docker/", 4)
	if err := cluster.Start(); err != nil {
		t.Fatalf("cannot start kafka cluster: %s", err)
	}
	defer func() {
		_ = cluster.Stop()
	}()

	bconf := kafka.NewBrokerConf("producer-broken-connection")
	addrs, err := cluster.KafkaAddrs()
	if err != nil {
		t.Fatalf("cannot get kafka address: %s", err)
	}
	broker, err := kafka.Dial(addrs, bconf)
	if err != nil {
		t.Fatalf("cannot connect to cluster (%q): %s", addrs, err)
	}
	defer broker.Close()

	// produce a big message to force a TCP buffer flush
	m := proto.Message{
		Value: []byte(strings.Repeat("producer broken connection message ", 1000)),
	}

	pconf := kafka.NewProducerConf()
	producer := broker.Producer(pconf)

	// send a message to every topic to make sure it's working
	for _, name := range topics {
		if _, err := producer.Produce(name, 0, &m); err != nil {
			t.Fatalf("cannot produce to %q: %s", name, err)
		}
	}

	// kill two kafka nodes and publish to both topics - sends to partitions
	// that lost their leader should fail and be retried
	//
	// the requests should not succeed until the nodes are back - bring them
	// back after a small delay and make sure producing was successful
	containers, err := cluster.Containers()
	if err != nil {
		t.Fatalf("cannot get containers: %s", err)
	}
	var stopped []*Container
	for _, container := range containers {
		if container.RunningKafka() {
			if err := container.Kill(); err != nil {
				t.Fatalf("cannot kill %q kafka container: %s", container.ID, err)
			}
			stopped = append(stopped, container)
		}
		if len(stopped) == 2 {
			break
		}
	}

	// bring stopped containers back
	errc := make(chan error)
	go func() {
		time.Sleep(500 * time.Millisecond)
		for _, container := range stopped {
			if err := container.Start(); err != nil {
				errc <- err
			}
		}
		close(errc)
	}()

	// send a message to every topic to make sure it's working
	for _, name := range topics {
		if _, err := producer.Produce(name, 0, &m); err != nil {
			t.Errorf("cannot produce to %q: %s", name, err)
		}
	}
	for err := range errc {
		t.Errorf("cannot start container: %s", err)
	}

	// make sure the data was persisted
	for _, name := range topics {
		consumer, err := broker.Consumer(kafka.NewConsumerConf(name, 0))
		if err != nil {
			t.Errorf("cannot create consumer for %q: %s", name, err)
			continue
		}
		for i := 0; i < 2; i++ {
			if _, err := consumer.Consume(); err != nil {
				t.Errorf("cannot consume %d message from %q: %s", i, name, err)
			}
		}
	}
}
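// IntegrationTest gates this Docker-backed suite so it only runs when
// explicitly requested. A plausible sketch of such a gate -- the environment
// variable name is an assumption for illustration, not necessarily the repo's
// actual flag (assumed imports: "os", "testing"):
func IntegrationTest(t *testing.T) {
	t.Helper()
	// opt in via an environment variable so a plain `go test ./...` stays fast
	if os.Getenv("KAFKA_INTEGRATION") == "" {
		t.Skip("skipping integration test; set KAFKA_INTEGRATION to run")
	}
}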
// consumeFromKafka will start consuming messages from Kafka and writing them to the given
// channel forever. It is important that this method closes the "out" channel when it's done,
// as that instructs the downstream goroutine to exit.
func (c *KafkaCluster) consumeFromKafka(partID int, out chan message, startOldest bool) {
	var err error
	var alive bool
	var offsetFirst, offsetNext int64

	// Exit logic -- make sure downstream knows we exited. If we hand the channel off to a
	// replacement goroutine (the rewind path below), that goroutine takes over ownership
	// and we must not close the channel here, or it would end up closed twice.
	handedOff := false
	defer func() {
		if handedOff {
			return
		}
		log.Debugf("[%s] rationalize[%d]: terminating.", c.name, partID)
		close(out)
	}()

	// Try to connect to Kafka. This might sleep a bit and retry since the broker could
	// be down for a while.
	retry := &backoff.Backoff{Min: 500 * time.Millisecond, Jitter: true}
	for ; true; time.Sleep(retry.Duration()) {
		// Figure out how many messages are in this topic. This can fail if the broker
		// handling this partition is down, so we will loop.
		offsetFirst, err = c.broker.OffsetEarliest(MarshalTopic, int32(partID))
		if err != nil {
			log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err)
			continue
		}

		offsetNext, err = c.broker.OffsetLatest(MarshalTopic, int32(partID))
		if err != nil {
			log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err)
			continue
		}
		log.Debugf("[%s] rationalize[%d]: offsets %d to %d", c.name, partID, offsetFirst, offsetNext)

		// TODO: Is there a case where the latest offset is X>0 but there is no data in
		// the partition? Does the offset reset to 0?
		if offsetNext == 0 || offsetFirst == offsetNext {
			alive = true
			c.rationalizers.Done()
		}
		break
	}
	retry.Reset()

	// Assume we're starting at the oldest offset for consumption
	consumerConf := kafka.NewConsumerConf(MarshalTopic, int32(partID))
	consumerConf.RetryErrLimit = 1 // Do not retry
	consumerConf.StartOffset = kafka.StartOffsetOldest
	consumerConf.RequestTimeout = c.options.MarshalRequestTimeout
	consumerConf.RetryWait = c.options.MarshalRequestRetryWait

	// Get the offsets of this partition; we're going to arbitrarily pick a start point
	// ~100,000 from the end if there's more than that. This is only done if startOldest
	// is false, i.e., we didn't run into a "message too new" situation.
	checkMessageTs := false
	if !startOldest && offsetNext-offsetFirst > 100000 {
		checkMessageTs = true
		consumerConf.StartOffset = offsetNext - 100000
		log.Infof("[%s] rationalize[%d]: fast forwarding to offset %d.",
			c.name, partID, consumerConf.StartOffset)
	}

	consumer, err := c.broker.Consumer(consumerConf)
	if err != nil {
		// Unfortunately this is a termination error, as without being able to consume this
		// partition we can't effectively rationalize.
		log.Errorf("[%s] rationalize[%d]: failed to create consumer: %s", c.name, partID, err)
		c.Terminate()
		return
	}

	// Consume messages forever, or until told to quit.
	for !c.Terminated() {
		msgb, err := consumer.Consume()
		if err != nil {
			// The internal consumer will do a number of retries. If we get an error here,
			// we're probably in the middle of a partition handoff. We should pause so we
			// don't hammer the cluster, but otherwise continue.
			log.Warningf("[%s] rationalize[%d]: failed to consume: %s", c.name, partID, err)
			time.Sleep(retry.Duration())
			continue
		}
		retry.Reset()

		msg, err := decode(msgb.Value)
		if err != nil {
			// Invalid message in the stream. This should never happen, but if it does, just
			// continue on.
			// TODO: We should probably think about this. If we end up in a situation where
			// one version of this software has a bug that writes invalid messages, it could
			// be doing things we don't anticipate. Of course, crashing all consumers
			// reading that partition is also bad.
			log.Errorf("[%s] rationalize[%d]: %s", c.name, partID, err)

			// In the case where the first message is an invalid message, we still need
			// to notify that we're alive now
			if !alive {
				alive = true
				c.rationalizers.Done()
			}

			continue
		}

		// If we are on our first message, and we started at a non-zero offset, we need
		// to check that the timestamp is older than a given threshold. If it's too new,
		// that indicates our ~100,000 fast-forward didn't go back far enough, so let's
		// start over from the oldest offset.
		// TODO: This could be a binary search or something.
		if checkMessageTs {
			if int64(msg.Timestamp()) > time.Now().Unix()-HeartbeatInterval*2 {
				log.Warningf("[%s] rationalize[%d]: rewinding, fast-forwarded message was too new",
					c.name, partID)
				handedOff = true
				go c.consumeFromKafka(partID, out, true)
				return // terminate self; the new goroutine owns "out" now
			}
			checkMessageTs = false
		}

		log.Debugf("[%s] rationalize[%d]: @%d: [%s]", c.name, partID, msgb.Offset, msg.Encode())
		out <- msg

		// This is a one-time thing that fires the first time the rationalizer comes up
		// and makes sure we actually process all of the messages.
		if !alive && msgb.Offset >= offsetNext-1 {
			for len(out) > 0 {
				time.Sleep(100 * time.Millisecond)
			}
			log.Infof("[%s] rationalize[%d]: reached offset %d, now alive",
				c.name, partID, msgb.Offset)
			alive = true
			c.rationalizers.Done()
		}
	}
}
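// The loops above lean on jittered, growing backoff so a downed broker or a
// mid-handoff partition isn't hammered with requests, and Reset() after each
// success keeps one bad fetch from slowing the steady state. Judging by the
// &backoff.Backoff{Min: ..., Jitter: true} literal, this is the
// github.com/jpillora/backoff package; below is a minimal standalone sketch of
// the same retry pattern (flakyFetch is a made-up stand-in for consumer.Consume):
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/jpillora/backoff"
)

// flakyFetch fails twice and then succeeds, to exercise the retry loop.
func flakyFetch(calls *int) (string, error) {
	*calls++
	if *calls < 3 {
		return "", errors.New("leader not available")
	}
	return "message", nil
}

func main() {
	retry := &backoff.Backoff{Min: 500 * time.Millisecond, Max: 10 * time.Second, Jitter: true}
	var calls int
	for {
		msg, err := flakyFetch(&calls)
		if err != nil {
			// the delay grows (with jitter) on each consecutive failure
			time.Sleep(retry.Duration())
			continue
		}
		retry.Reset() // success: the next failure starts again from Min
		fmt.Println("got:", msg)
		break
	}
}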