// NewKafkaProducer builds an IndeedKafkaProducer backed by a sarama async
// producer connected to eatonconfig.KafkaServers. Messages are hash-
// partitioned by key, acknowledged by the local broker only, and sent
// uncompressed. Two background goroutines drain the producer's delivery
// report channels, logging each outcome through eatonevents; they exit
// once the producer is closed and its channels are drained.
func NewKafkaProducer() (*IndeedKafkaProducer, error) {
	cfg := sarama.NewConfig()
	cfg.ClientID = ipresolver.GetLocalAddr()
	cfg.Producer.RequiredAcks = sarama.WaitForLocal
	cfg.Producer.Compression = sarama.CompressionNone
	cfg.Producer.Partitioner = sarama.NewHashPartitioner
	// Both report channels must be consumed below, or the producer stalls.
	cfg.Producer.Return.Successes = true
	cfg.Producer.Return.Errors = true

	producer, err := sarama.NewAsyncProducer(eatonconfig.KafkaServers, cfg)
	if err != nil {
		return nil, err
	}

	// Drain delivery reports in the background so sends never block.
	go func() {
		for msg := range producer.Successes() {
			eatonevents.Info(fmt.Sprintf("Successfully sent message to topic %s with key %s", msg.Topic, msg.Key))
		}
	}()
	go func() {
		for sendErr := range producer.Errors() {
			eatonevents.Error("Failed to send message due to error: ", sendErr)
		}
	}()

	return &IndeedKafkaProducer{producer: producer}, nil
}
// DoWork persists a single indeed job result: the raw API payload is written
// to S3 under the job key, then the job's metadata is recorded as an item in
// DynamoDB. Each AWS call is retried up to retryCount times; the last error
// is returned if all attempts fail. On success, an informational event is
// published and its error (if any) is returned.
func (a *AWSWork) DoWork() error {
	// withRetry invokes op at least once and up to retryCount times,
	// returning nil on the first success or the final error.
	// NOTE(review): the original loops ran `for i := 1; i < retryCount; i++`,
	// which performed only retryCount-1 attempts and — when retryCount <= 1 —
	// performed none at all while still falling through as if successful.
	withRetry := func(op func() error) error {
		err := op()
		for i := 1; i < retryCount && err != nil; i++ {
			err = op()
		}
		return err
	}

	// Store the raw message body in S3 first so the DynamoDB item below
	// never points at a missing object.
	err := withRetry(func() error {
		_, e := s3Svc.PutObject(&s3.PutObjectInput{
			Bucket:      aws.String(S3BucketName),
			Key:         aws.String(a.jobResult.JobKey),
			Body:        bytes.NewReader(a.msgValue),
			ContentType: aws.String(fmt.Sprintf("application/%s", indeedConstants.IndeedResponseFormat)),
		})
		return e
	})
	if err != nil {
		return err
	}

	attrItem := map[string]*dynamodb.AttributeValue{
		//Minimum required fields as defined by EAT-3
		"DocumentID": {
			S: aws.String(a.jobResult.JobKey),
		},
		"Source": {
			S: aws.String("indeed"),
		},
		"Role": {
			S: aws.String("none"),
		},
		"Type": {
			S: aws.String("job description"),
		},
		// FileType holds the public S3 URL of the object stored above.
		"FileType": {
			S: aws.String(fmt.Sprintf("https://s3-%s.amazonaws.com/%s/%s", AWSRegion, S3BucketName, a.jobResult.JobKey)),
		},
		//extended metadata for the actual result.
		"CreateDate": {
			S: aws.String(a.jobResult.GetDateString()),
		},
		"JobTitle": {
			S: aws.String(a.jobResult.JobTitle),
		},
		"Company": {
			S: aws.String(a.jobResult.Company),
		},
		"City": {
			S: aws.String(a.jobResult.City),
		},
		"State": {
			S: aws.String(a.jobResult.State),
		},
		"Country": {
			S: aws.String(a.jobResult.Country),
		},
		"FormattedLocation": {
			S: aws.String(a.jobResult.FormattedLocation),
		},
		"ResultSource": {
			S: aws.String(a.jobResult.Source),
		},
		"Snippet": {
			S: aws.String(a.jobResult.Snippet),
		},
		"Latitude": {
			N: aws.String(fmt.Sprintf("%f", a.jobResult.Latitude)),
		},
		"Longitude": {
			N: aws.String(fmt.Sprintf("%f", a.jobResult.Longitude)),
		},
		"Sponsored": {
			BOOL: aws.Bool(a.jobResult.Sponsored),
		},
		"Expired": {
			BOOL: aws.Bool(a.jobResult.Expired),
		},
		"FormattedLocationFull": {
			S: aws.String(a.jobResult.FormattedLocationFull),
		},
		"FormattedRelativeTime": {
			S: aws.String(a.jobResult.FormattedRelativeTime),
		},
	}
	// DynamoDB rejects empty string attribute values, so only attach the
	// summary when it is present.
	if a.jobResult.FullJobSummary != "" {
		attrItem["FullJobSummary"] = &dynamodb.AttributeValue{
			S: aws.String(a.jobResult.FullJobSummary),
		}
	}

	putItemInput := &dynamodb.PutItemInput{
		Item:      attrItem,
		TableName: aws.String(DynamoDBTableName),
	}
	if err = withRetry(func() error {
		_, e := dbSvc.PutItem(putItemInput)
		return e
	}); err != nil {
		return err
	}

	msg := fmt.Sprintf("Successfully stored jobkey %s in table %s and in bucket %s", a.jobResult.JobKey, DynamoDBTableName, S3BucketName)
	return eatonevents.Info(msg)
}
Example #3
// main parses flags, redirects logging to a file, then runs the application
// in exactly one of two mutually exclusive modes:
//   - produce: pull from the indeed API, scrape full summaries, and publish
//     the results to Kafka;
//   - consume: read messages from Kafka and store them in S3/DynamoDB.
func main() {
	flag.StringVar(&S3BucketName, "bucket", "eaton-jobdescription-bucket", "the bucket to store retrieved indeed api messages from.")
	flag.StringVar(&DynamoDBTableName, "table", "Documents", "the dynamodb table to store the indeed api messages.")
	flag.StringVar(&logFile, "log", "eaton-feeder.log", "the log file to write results to.")
	flag.StringVar(&AWSRegion, "region", "us-west-2", "the aws region to use when saving content to dynamodb and s3.")
	flag.StringVar(&eatonconfig.OffsetType, "offset", "oldest", "the offset to use. either \"oldest\" or \"newest\" ")
	flag.BoolVar(&doConsume, "consume", false, "set to true to consume messages from KAFKA_SERVERS and send them to S3/DynamoDB")
	flag.BoolVar(&doProduce, "produce", false, "set to true to pull from the indeed api and push messages to KAFKA_SERVERS.")
	flag.IntVar(&interval, "interval", -1, "the time between polls of the indeed api in millis. anything equal to or below 0 disables the interval function.")
	flag.IntVar(&awsWorkPoolSize, "awsWorkPoolSize", 5, "the number of concurrent requests allowed when storing information in S3/DynamoDB")
	flag.Parse()

	log.Println("Using the following: ")
	log.Println("\tbucket:\t", S3BucketName)
	log.Println("\ttable:\t", DynamoDBTableName)
	log.Println("\tlogFile:\t", logFile)
	log.Println("\tregion:\t", AWSRegion)
	log.Println("\toffset:\t", eatonconfig.OffsetType)
	log.Println("\tconsume:\t", doConsume)
	log.Println("\tproduce:\t", doProduce)
	log.Println("\tinterval:\t", interval)
	log.Println("\tawsWorkPoolSize\t", awsWorkPoolSize)

	file, err := os.OpenFile(logFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		log.Fatal("ERROR - failed to create log file: ", err)
	}

	// NOTE: log.Fatal calls below bypass this defer (os.Exit skips defers);
	// acceptable here since the process is terminating anyway.
	defer file.Close()
	log.SetOutput(file)

	if !doProduce && !doConsume {
		flag.PrintDefaults()
		return
	}

	if doProduce && doConsume {
		log.Fatal("ERROR - Cannot both produce and consume in a single application!")
	}

	err = eatonevents.Init()
	if err != nil {
		// Include the underlying error: the original message discarded it,
		// leaving no diagnostic for the failure.
		log.Fatal("ERROR - failed to create event publisher: ", err)
	}

	if doProduce {
		if eatonconfig.IsDebug() {
			log.Println("Creating new IndeedKafkaProducer.")
		}
		indeedClient := new(IndeedClient)
		indeedScraper := new(IndeedScraper)
		kafkaProducer, err := NewKafkaProducer()
		if err != nil {
			eatonevents.Error("failed to create new kafka producer: ", err)
			os.Exit(1)
		}
		// Pipeline: indeed API -> scraper (full summaries) -> Kafka.
		errChannel, jobResultChannel := indeedClient.GetResults()
		scraperErrChannel, scraperJobResultChannel := indeedScraper.GetFullJobSummary(jobResultChannel)
		kafkaErrChannel, kafkaDoneChannel := kafkaProducer.SendMessages(scraperJobResultChannel)

		go func() {
			eatonevents.Debug("Waiting for messages from the indeedClient error channel...")
			for err := range errChannel {
				eatonevents.Error("IndeedClient: ", err)
			}
			eatonevents.Debug("Finished waiting on messages from the indeedClient error channel.")
		}()
		go func() {
			eatonevents.Debug("Waiting on errors from the IndeedKafkaProducer error channel...")
			for err := range kafkaErrChannel {
				eatonevents.Error("IndeedKafkaProducer: ", err)
			}
			eatonevents.Debug("Finished waiting on messages from the IndeedKafkaProducer error channel.")
		}()
		go func() {
			eatonevents.Debug("Waiting on errors from the IndeedScraper error channel...")
			for err := range scraperErrChannel {
				eatonevents.Error("IndeedScraper: ", err)
			}
			eatonevents.Debug("Finished waiting on messages from the indeed scraper error channel.")
		}()
		// Block until the producer signals completion. The original code also
		// closed kafkaConsumer here, but that was dead code: produce and
		// consume are mutually exclusive (fatal above), so the consumer was
		// always nil in this branch.
		for done := range kafkaDoneChannel {
			eatonevents.Debug(fmt.Sprintf("completed sending messages to kafka (signal value: %d)", done))
		}
		return
	}

	if doConsume {
		eatonevents.Debug("Creating new IndeedKafkaConsumer.")
		kafkaConsumer, err := NewKafkaConsumer()
		if err != nil {
			eatonevents.Error("failed to create new kafka consumer: ", err)
			os.Exit(1)
		}
		errChannel := kafkaConsumer.ConsumeMessages()
		for err := range errChannel {
			eatonevents.Error("IndeedKafkaConsumer: ", err)
		}
	}

	eatonevents.Info("Main function has completed.  Exiting program.")
}