func TestProducerBrokenConnection(t *testing.T) {
	IntegrationTest(t)

	topics := []string{"Topic3", "Topic4"}

	cluster := NewKafkaCluster("kafka-docker/", 4)
	if err := cluster.Start(); err != nil {
		t.Fatalf("cannot start kafka cluster: %s", err)
	}
	defer func() {
		_ = cluster.Stop()
	}()

	bconf := kafka.NewBrokerConf("producer-broken-connection")
	addrs, err := cluster.KafkaAddrs()
	if err != nil {
		t.Fatalf("cannot get kafka address: %s", err)
	}
	broker, err := kafka.Dial(addrs, bconf)
	if err != nil {
		t.Fatalf("cannot connect to cluster (%q): %s", addrs, err)
	}
	defer broker.Close()

	// produce a big message to force a TCP buffer flush
	m := proto.Message{
		Value: []byte(strings.Repeat("producer broken connection message ", 1000)),
	}

	pconf := kafka.NewProducerConf()
	producer := broker.Producer(pconf)

	// send a message to every topic to make sure producing works
	for _, name := range topics {
		if _, err := producer.Produce(name, 0, &m); err != nil {
			t.Fatalf("cannot produce to %q: %s", name, err)
		}
	}

	// Stop two kafka broker containers and publish to both topics - requests
	// for partitions that lost their leader should retry sending instead of
	// failing.
	//
	// The requests should not succeed until the nodes are back - bring them
	// back after a small delay and make sure producing was successful.
	containers, err := cluster.Containers()
	if err != nil {
		t.Fatalf("cannot get containers: %s", err)
	}
	var stopped []*Container
	for _, container := range containers {
		if container.RunningKafka() {
			if err := container.Kill(); err != nil {
				t.Fatalf("cannot kill %q kafka container: %s", container.ID, err)
			}
			stopped = append(stopped, container)
		}
		if len(stopped) == 2 {
			break
		}
	}

	// bring the stopped containers back after a short delay
	errc := make(chan error)
	go func() {
		time.Sleep(500 * time.Millisecond)
		for _, container := range stopped {
			if err := container.Start(); err != nil {
				errc <- err
			}
		}
		close(errc)
	}()

	// send a message to every topic again - these calls should block and
	// retry until the brokers are back and leaders are elected
	for _, name := range topics {
		if _, err := producer.Produce(name, 0, &m); err != nil {
			t.Errorf("cannot produce to %q: %s", name, err)
		}
	}

	for err := range errc {
		t.Errorf("cannot start container: %s", err)
	}

	// make sure the data was persisted
	for _, name := range topics {
		consumer, err := broker.Consumer(kafka.NewConsumerConf(name, 0))
		if err != nil {
			t.Errorf("cannot create consumer for %q: %s", name, err)
			continue
		}
		for i := 0; i < 2; i++ {
			if _, err := consumer.Consume(); err != nil {
				t.Errorf("cannot consume message %d from %q: %s", i, name, err)
			}
		}
	}
}
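// The test above relies on the producer retrying while the killed brokers have
// no replacement leader. Below is a minimal sketch of how that retry window
// could be widened. The BrokerConf/ProducerConf field names other than
// LeaderRetryLimit (LeaderRetryWait, RetryLimit, RetryWait, RequiredAcks) and
// the import paths are assumptions based on the optiopay/kafka-style client
// used in the test; verify them against the client version actually in use.
package kafkaexample

import (
	"time"

	"github.com/optiopay/kafka"       // import path assumed
	"github.com/optiopay/kafka/proto" // import path assumed
)

// newResilientProducer dials the given brokers with settings intended to keep
// produce calls retrying long enough to survive a short leader election.
func newResilientProducer(addrs []string) (*kafka.Broker, kafka.Producer, error) {
	bconf := kafka.NewBrokerConf("resilient-producer")
	bconf.LeaderRetryLimit = 10                    // re-resolve the partition leader up to 10 times
	bconf.LeaderRetryWait = 500 * time.Millisecond // wait between leader lookups (field name assumed)

	broker, err := kafka.Dial(addrs, bconf)
	if err != nil {
		return nil, nil, err
	}

	pconf := kafka.NewProducerConf()
	pconf.RetryLimit = 10                      // retry failed produce requests (field name assumed)
	pconf.RetryWait = 500 * time.Millisecond   // wait between produce retries (field name assumed)
	pconf.RequiredAcks = proto.RequiredAcksAll // require all in-sync replicas to ack

	return broker, broker.Producer(pconf), nil
}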
// Dial returns a new cluster object which can be used to instantiate a number
// of Marshalers that all use the same cluster. Configuration is passed through
// the options argument; see MarshalOptions for the available settings.
func Dial(name string, brokers []string, options MarshalOptions) (*KafkaCluster, error) {
	// Connect to Kafka
	brokerConf := kafka.NewBrokerConf("PortalMarshal")
	brokerConf.MetadataRefreshFrequency = time.Hour
	brokerConf.ConnectionLimit = options.BrokerConnectionLimit
	brokerConf.LeaderRetryLimit = 1 // Do not retry
	broker, err := kafka.Dial(brokers, brokerConf)
	if err != nil {
		return nil, err
	}

	c := &KafkaCluster{
		quit:          new(int32),
		rsteps:        new(int32),
		name:          name,
		options:       options,
		lock:          &sync.RWMutex{},
		rationalizers: &sync.WaitGroup{},
		broker:        broker,
		producer:      broker.Producer(kafka.NewProducerConf()),
		topics:        make(map[string]int),
		groups:        make(map[string]map[string]*topicState),
		pausedGroups:  make(map[string]time.Time),
		jitters:       make(chan time.Duration, 100),
	}

	// Do an initial metadata fetch, this will block a bit
	err = c.refreshMetadata()
	if err != nil {
		return nil, fmt.Errorf("Failed to get metadata: %s", err)
	}

	// If there is no marshal topic, then we can't run. The admins must go
	// create the topic before they can use this library. Please see the README.
	c.partitions = c.getTopicPartitions(MarshalTopic)
	if c.partitions == 0 {
		return nil, errors.New("Marshalling topic not found. Please see the documentation.")
	}

	// Now we start a goroutine to start consuming each of the partitions in
	// the marshal topic. Note that this doesn't handle increasing the
	// partition count on that topic without stopping all consumers.
	c.rationalizers.Add(c.partitions)
	for id := 0; id < c.partitions; id++ {
		go c.rationalize(id, c.kafkaConsumerChannel(id))
	}

	// A jitter calculator, just fills a channel with random numbers so that
	// other people don't have to build their own random generator. It is
	// important that these values be somewhat less than the HeartbeatInterval
	// as we use this for jittering our heartbeats.
	go func() {
		rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
		for {
			jitter := rnd.Intn(HeartbeatInterval/2) + (HeartbeatInterval / 4)
			c.jitters <- time.Duration(jitter) * time.Second
		}
	}()

	// Now start the metadata refreshing goroutine
	go func() {
		for !c.Terminated() {
			time.Sleep(<-c.jitters)
			log.Infof("[%s] Refreshing topic metadata.", c.name)
			c.refreshMetadata()

			// See if the number of partitions in the marshal topic changed.
			// This is bad if it happens, since it means we can no longer
			// coordinate correctly.
			if c.getTopicPartitions(MarshalTopic) != c.partitions {
				log.Errorf("[%s] Marshal topic partition count changed. Terminating!", c.name)
				c.Terminate()
			}
		}
	}()

	// Wait for all rationalizers to come alive
	log.Infof("[%s] Waiting for all rationalizers to come alive.", c.name)
	c.rationalizers.Wait()
	log.Infof("[%s] All rationalizers alive, KafkaCluster now alive.", c.name)

	return c, nil
}
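// For context, a caller-side sketch of the Dial function above, assuming the
// package is imported as marshal (import path assumed) and that the marshal
// topic already exists in the cluster, as the doc comment requires. The broker
// addresses, cluster name, and connection limit are placeholders; fields of
// MarshalOptions other than BrokerConnectionLimit are left at their zero
// values here, which may not be sensible defaults.
package main

import (
	"log"

	"github.com/zorkian/marshal/marshal" // import path assumed
)

func main() {
	opts := marshal.MarshalOptions{
		BrokerConnectionLimit: 30, // the only option Dial reads directly above
	}

	// Dial blocks until every rationalizer has caught up on its partition of
	// the marshal topic, so expect it to take a moment on a busy cluster.
	cluster, err := marshal.Dial("my-cluster", []string{"kafka1:9092", "kafka2:9092"}, opts)
	if err != nil {
		log.Fatalf("cannot dial kafka cluster: %s", err)
	}
	defer cluster.Terminate()

	// From here the cluster hands out Marshalers for individual consumer
	// groups; see the package documentation for the constructor to use.
}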