Example #1
// ProcessData defers to util.SQLInsertData
func (s *SQLWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	// handle panics a bit more gracefully
	defer func() {
		if err := recover(); err != nil {
			util.KillPipelineIfErr(err.(error), killChan)
		}
	}()

	// First check for SQLWriterData
	var wd SQLWriterData
	err := data.ParseJSONSilent(d, &wd)
	logger.Info("SQLWriter: Writing data...")
	if err == nil && wd.TableName != "" && wd.InsertData != nil {
		logger.Debug("SQLWriter: SQLWriterData scenario")
		dd, err := data.NewJSON(wd.InsertData)
		util.KillPipelineIfErr(err, killChan)
		err = util.SQLInsertData(s.writeDB, dd, wd.TableName, s.OnDupKeyUpdate, s.OnDupKeyFields, s.BatchSize)
		util.KillPipelineIfErr(err, killChan)
	} else {
		logger.Debug("SQLWriter: normal data scenario")
		err = util.SQLInsertData(s.writeDB, d, s.TableName, s.OnDupKeyUpdate, s.OnDupKeyFields, s.BatchSize)
		util.KillPipelineIfErr(err, killChan)
	}
	logger.Info("SQLWriter: Write complete")
}
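The SQLWriterData branch above lets an upstream stage pick the destination table per payload. The sketch below shows roughly what such an upstream stage might marshal and send; the struct here is a hypothetical stand-in for SQLWriterData, and its field names and JSON tags are assumptions rather than the library's actual definition.

package main

import (
	"encoding/json"
	"fmt"
)

// Hypothetical mirror of the SQLWriterData payload that
// SQLWriter.ProcessData checks for; the real struct's field names
// and JSON tags may differ.
type sqlWriterData struct {
	TableName  string      `json:"table_name"`
	InsertData interface{} `json:"insert_data"`
}

func main() {
	payload := sqlWriterData{
		TableName:  "users",
		InsertData: []map[string]interface{}{{"id": 1, "name": "a"}},
	}
	// data.NewJSON wraps json.Marshal in the same way.
	d, err := json.Marshal(payload)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(d))
	// d would then be sent to the SQLWriter's input channel.
}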
Example #2
func insertObjects(db *sql.DB, objects []map[string]interface{}, tableName string, onDupKeyUpdate bool, onDupKeyFields []string) error {
	logger.Info("SQLInsertData: building INSERT for len(objects) =", len(objects))
	insertSQL, vals := buildInsertSQL(objects, tableName, onDupKeyUpdate, onDupKeyFields)

	logger.Debug("SQLInsertData:", insertSQL)
	logger.Debug("SQLInsertData: values", vals)

	stmt, err := db.Prepare(insertSQL)
	if err != nil {
		logger.Debug("SQLInsertData: error preparing SQL")
		return err
	}
	defer stmt.Close()

	res, err := stmt.Exec(vals...)
	if err != nil {
		return err
	}
	lastID, err := res.LastInsertId()
	if err != nil {
		return err
	}
	rowCnt, err := res.RowsAffected()
	if err != nil {
		return err
	}

	logger.Info(fmt.Sprintf("SQLInsertData: rows affected = %d, last insert ID = %d", rowCnt, lastID))
	return nil
}
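buildInsertSQL itself is not shown on this page. The standalone sketch below is a rough guess at its general shape: flatten a slice of row maps into one multi-row INSERT with ? placeholders plus a flat value slice, keeping a fixed column order so placeholders and values stay aligned. Identifier quoting, the ON DUPLICATE KEY UPDATE clause, and driver-specific placeholder styles are deliberately left out.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// buildInsertSQLSketch is a simplified stand-in for a helper like
// buildInsertSQL: one multi-row INSERT plus a flat slice of values.
// It assumes at least one row and that every row has the same keys.
func buildInsertSQLSketch(objects []map[string]interface{}, table string) (string, []interface{}) {
	// Fix a column order from the first row so placeholders and
	// values stay aligned (map iteration order is random in Go).
	cols := make([]string, 0, len(objects[0]))
	for c := range objects[0] {
		cols = append(cols, c)
	}
	sort.Strings(cols)

	rowPlaceholder := "(" + strings.TrimRight(strings.Repeat("?,", len(cols)), ",") + ")"
	rows := make([]string, len(objects))
	vals := make([]interface{}, 0, len(objects)*len(cols))
	for i, o := range objects {
		rows[i] = rowPlaceholder
		for _, c := range cols {
			vals = append(vals, o[c])
		}
	}

	sql := fmt.Sprintf("INSERT INTO %s (%s) VALUES %s",
		table, strings.Join(cols, ","), strings.Join(rows, ","))
	return sql, vals
}

func main() {
	sql, vals := buildInsertSQLSketch([]map[string]interface{}{
		{"id": 1, "name": "a"},
		{"id": 2, "name": "b"},
	}, "users")
	fmt.Println(sql)  // INSERT INTO users (id,name) VALUES (?,?),(?,?)
	fmt.Println(vals) // [1 a 2 b]
}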
Example #3
// ParseJSON is a simple wrapper for json.Unmarshal
func ParseJSON(d JSON, v interface{}) error {
	err := json.Unmarshal(d, v)
	if err != nil {
		logger.Debug(fmt.Sprintf("data: failure to unmarshal JSON into %+v - error is \"%v\"", v, err.Error()))
		logger.Debug(fmt.Sprintf("	Failed Data: %+v", string(d)))
	}
	return err
}
Example #4
// NewJSON is a simple wrapper for json.Marshal.
func NewJSON(v interface{}) (JSON, error) {
	d, err := json.Marshal(v)
	if err != nil {
		logger.Debug(fmt.Sprintf("data: failure to marshal JSON %+v - error is \"%v\"", v, err.Error()))
		logger.Debug(fmt.Sprintf("	Failed val: %+v", v))
	}
	return d, err
}
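NewJSON and ParseJSON are thin wrappers over encoding/json that add debug logging on failure. A minimal round trip using the standard library directly, which is all these helpers delegate to:

package main

import (
	"encoding/json"
	"fmt"
)

type row struct {
	ID   int    `json:"id"`
	Name string `json:"name"`
}

func main() {
	// Marshal (what NewJSON wraps)...
	d, err := json.Marshal(row{ID: 7, Name: "a"})
	if err != nil {
		panic(err)
	}
	// ...and unmarshal it back (what ParseJSON wraps).
	var r row
	if err := json.Unmarshal(d, &r); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", r) // {ID:7 Name:a}
}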
Example #5
// ProcessData runs the SQL statements, deferring to util.ExecuteSQLQuery
func (s *SQLExecutor) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	// handle panics a bit more gracefully
	defer func() {
		if err := recover(); err != nil {
			util.KillPipelineIfErr(err.(error), killChan)
		}
	}()

	sql := ""
	var err error
	if s.query == "" && s.sqlGenerator != nil {
		sql, err = s.sqlGenerator(d)
		util.KillPipelineIfErr(err, killChan)
	} else if s.query != "" {
		sql = s.query
	} else {
		killChan <- errors.New("SQLExecutor: must have either static query or sqlGenerator func")
	}

	logger.Debug("SQLExecutor: Running - ", sql)
	// See sql.go
	err = util.ExecuteSQLQuery(s.readDB, sql)
	util.KillPipelineIfErr(err, killChan)
	logger.Info("SQLExecutor: Query complete")
}
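When no static query is set, SQLExecutor derives the SQL from the incoming payload via its sqlGenerator func. The generator below is a hypothetical example of that idea; its signature is inferred from the call site sql, err = s.sqlGenerator(d), and the table and field names are made up.

package main

import (
	"encoding/json"
	"fmt"
)

// archiveUserSQL is a generator in the style of the sqlGenerator
// func used above: it receives the upstream JSON payload and returns
// the SQL statement to run.
func archiveUserSQL(d []byte) (string, error) {
	var v struct {
		UserID int `json:"user_id"`
	}
	if err := json.Unmarshal(d, &v); err != nil {
		return "", err
	}
	// Real code should prefer parameterized statements; string
	// building is shown only to keep the generator self-contained.
	return fmt.Sprintf("UPDATE users SET archived = 1 WHERE id = %d", v.UserID), nil
}

func main() {
	sql, err := archiveUserSQL([]byte(`{"user_id": 42}`))
	if err != nil {
		panic(err)
	}
	fmt.Println(sql) // UPDATE users SET archived = 1 WHERE id = 42
}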
Example #6
// GetS3Object returns the object output for the given object key
func GetS3Object(client *s3.S3, bucket, objKey string) (*s3.GetObjectOutput, error) {
	logger.Debug("GetS3Object: ", bucket, "-", objKey)
	params := &s3.GetObjectInput{
		Bucket: aws.String(bucket), // Required
		Key:    aws.String(objKey), // Required
		// IfMatch:                    aws.String("IfMatch"),
		// IfModifiedSince:            aws.Time(time.Now()),
		// IfNoneMatch:                aws.String("IfNoneMatch"),
		// IfUnmodifiedSince:          aws.Time(time.Now()),
		// Range:                      aws.String("Range"),
		// RequestPayer:               aws.String("RequestPayer"),
		// ResponseCacheControl:       aws.String("ResponseCacheControl"),
		// ResponseContentDisposition: aws.String("ResponseContentDisposition"),
		// ResponseContentEncoding:    aws.String("ResponseContentEncoding"),
		// ResponseContentLanguage:    aws.String("ResponseContentLanguage"),
		// ResponseContentType:        aws.String("ResponseContentType"),
		// ResponseExpires:            aws.Time(time.Now()),
		// SSECustomerAlgorithm:       aws.String("SSECustomerAlgorithm"),
		// SSECustomerKey:             aws.String("SSECustomerKey"),
		// SSECustomerKeyMD5:          aws.String("SSECustomerKeyMD5"),
		// VersionId:                  aws.String("ObjectVersionId"),
	}

	return client.GetObject(params)
}
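A GetObjectOutput only carries a Body stream; the caller still has to read and close it. A hedged usage sketch using the same AWS SDK (aws-sdk-go v1) types; the region, bucket, and key are placeholders, and credentials are taken from the default provider chain.

package main

import (
	"fmt"
	"io/ioutil"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/s3"
)

func main() {
	sess := session.Must(session.NewSession(&aws.Config{Region: aws.String("us-east-1")}))
	client := s3.New(sess)

	// GetS3Object wraps client.GetObject; either way the result is a
	// *s3.GetObjectOutput whose Body must be read and closed.
	out, err := client.GetObject(&s3.GetObjectInput{
		Bucket: aws.String("my-bucket"),   // placeholder
		Key:    aws.String("path/to/key"), // placeholder
	})
	if err != nil {
		panic(err)
	}
	defer out.Body.Close()

	b, err := ioutil.ReadAll(out.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(b), "bytes read")
}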
Example #7
// ObjectsFromJSON is a helper for parsing JSON into a slice of
// generic maps/objects. The use-case is when a stage is expecting
// to receive either a JSON object or an array of JSON objects, and
// wants to deal with it in a generic fashion.
func ObjectsFromJSON(d JSON) ([]map[string]interface{}, error) {
	var objects []map[string]interface{}

	// return if we have null instead of object(s).
	if bytes.Equal(d, []byte("null")) {
		logger.Debug("ObjectsFromJSON: received null. Expected object or objects. Skipping.")
		return objects, nil
	}

	var v interface{}
	err := ParseJSON(d, &v)
	if err != nil {
		return nil, err
	}

	// check if we have a single object or a slice of objects
	switch vv := v.(type) {
	case []interface{}:
		for _, o := range vv {
			objects = append(objects, o.(map[string]interface{}))
		}
	case map[string]interface{}:
		objects = []map[string]interface{}{vv}
	case []map[string]interface{}:
		objects = vv
	default:
		err = fmt.Errorf("ObjectsFromJSON: unsupported data type: %T", vv)
		return nil, err
	}

	return objects, nil
}
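Both accepted input shapes normalize to the same []map[string]interface{} result. A quick sanity check, written as a Go example test in the same data package so no import path has to be guessed; it assumes JSON is a []byte-style alias, which the json.Unmarshal calls above imply.

package data

import "fmt"

// ExampleObjectsFromJSON shows that a single JSON object and an array
// of objects both come back as a slice of maps.
func ExampleObjectsFromJSON() {
	single := JSON(`{"id": 1}`)
	many := JSON(`[{"id": 1}, {"id": 2}]`)

	a, _ := ObjectsFromJSON(single)
	b, _ := ObjectsFromJSON(many)

	fmt.Println(len(a), len(b))
	// Output: 1 2
}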
Example #8
func (p *Pipeline) runStages(killChan chan error) {
	for n, stage := range p.layout.stages {
		for _, dp := range stage.processors {
			p.wg.Add(1)
			// Each DataProcessor runs in a separate goroutine.
			go func(n int, dp *dataProcessor) {
				// This is where the main DataProcessor interface
				// functions are called.
				logger.Info(p.Name, "- stage", n+1, dp, "waiting to receive data")
				for d := range dp.inputChan {
					logger.Info(p.Name, "- stage", n+1, dp, "received data")
					if p.PrintData {
						logger.Debug(p.Name, "- stage", n+1, dp, "data =", string(d))
					}
					dp.recordDataReceived(d)
					dp.processData(d, killChan)
				}
				logger.Info(p.Name, "- stage", n+1, dp, "input closed, calling Finish")
				dp.Finish(dp.outputChan, killChan)
				if dp.outputChan != nil {
					logger.Info(p.Name, "- stage", n+1, dp, "closing output")
					close(dp.outputChan)
				}
				p.wg.Done()
			}(n, dp)
		}
	}
}
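Stripped of the pipeline bookkeeping, each processor goroutine launched above follows a standard Go shape: receive from the input channel until it is closed, do any finishing work, then close the output channel so the next stage can drain and stop. A minimal standalone version of that shape:

package main

import (
	"fmt"
	"sync"
)

func main() {
	in := make(chan []byte)
	out := make(chan []byte)
	var wg sync.WaitGroup

	wg.Add(1)
	go func() {
		defer wg.Done()
		// Receive until the input channel is closed...
		for d := range in {
			out <- append([]byte("processed: "), d...)
		}
		// ...then close the output so the downstream reader stops.
		close(out)
	}()

	go func() {
		in <- []byte("a")
		in <- []byte("b")
		close(in)
	}()

	for d := range out {
		fmt.Println(string(d))
	}
	wg.Wait()
}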
Example #9
// ForEachQueryData handles generating the SQL (in case of dynamic mode),
// running the query and retrieving the data in data.JSON format, and then
// passing the results back with the function call to forEach.
func (s *SQLReader) ForEachQueryData(d data.JSON, killChan chan error, forEach func(d data.JSON)) {
	sql := ""
	var err error
	if s.query == "" && s.sqlGenerator != nil {
		sql, err = s.sqlGenerator(d)
		util.KillPipelineIfErr(err, killChan)
	} else if s.query != "" {
		sql = s.query
	} else {
		killChan <- errors.New("SQLReader: must have either static query or sqlGenerator func")
	}

	logger.Debug("SQLReader: Running - ", sql)
	// See sql.go
	dataChan, err := util.GetDataFromSQLQuery(s.readDB, sql, s.BatchSize, s.StructDestination)
	util.KillPipelineIfErr(err, killChan)

	for d := range dataChan {
		// First check if an error was returned back from the SQL processing
		// helper, then if not call forEach with the received data.
		var derr dataErr
		if err := data.ParseJSONSilent(d, &derr); err == nil {
			util.KillPipelineIfErr(errors.New(derr.Error), killChan)
		} else {
			forEach(d)
		}
	}
}
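The dataErr check above implies a convention in which the SQL helper, rather than returning a mid-stream error, marshals it as JSON and sends it down the same data channel for the reader to detect. A standalone sketch of both ends of that convention; the exact JSON shape of dataErr is an assumption beyond the Error field visible above.

package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

// Assumed shape of the dataErr payload checked for in ForEachQueryData.
type dataErr struct {
	Error string
}

func main() {
	// Producer side: wrap a mid-stream failure as JSON and send it
	// down the data channel like any other payload.
	payload, _ := json.Marshal(dataErr{Error: "connection reset"})

	// Consumer side: try to parse it back; a populated Error field
	// means the payload carries an error rather than row data.
	var derr dataErr
	if err := json.Unmarshal(payload, &derr); err == nil && derr.Error != "" {
		fmt.Println("pipeline error:", errors.New(derr.Error))
		return
	}
	fmt.Println("row data:", string(payload))
}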
Example #10
// Run finalizes the channel connections between PipelineStages
// and kicks off execution.
// Run will return a killChan that should be waited on so your calling function doesn't
// return prematurely. Any stage of the pipeline can send to the killChan to halt
// execution. Your calling function should check if the sent value is an error or nil to know if
// execution was a failure or a success (nil being the success value).
func (p *Pipeline) Run() (killChan chan error) {
	p.timer = util.StartTimer()
	killChan = make(chan error)

	p.connectStages()
	p.runStages(killChan)

	// Send the StartSignal to the initial stage processors to kick
	// off execution.
	for _, dp := range p.layout.stages[0].processors {
		logger.Debug(p.Name, ": sending", StartSignal, "to", dp)
		dp.inputChan <- data.JSON(StartSignal)
		dp.Finish(dp.outputChan, killChan)
		close(dp.inputChan)
	}

	// Once all the processing goroutines are done, stop the timer and
	// send nil on killChan to signal successful pipeline completion.
	go func() {
		p.wg.Wait()
		p.timer.Stop()
		killChan <- nil
	}()

	handleInterrupt(killChan)

	return killChan
}
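Per the doc comment, the caller blocks on the returned channel: a nil value means the pipeline finished, a non-nil error means some stage killed it. A self-contained sketch of that calling contract; run here is a stand-in for (*Pipeline).Run, not the library's API.

package main

import (
	"errors"
	"fmt"
)

// run stands in for (*Pipeline).Run: it returns a channel the caller
// must wait on; nil signals success, an error signals failure.
func run(fail bool) chan error {
	killChan := make(chan error)
	go func() {
		if fail {
			killChan <- errors.New("stage 2: something broke")
			return
		}
		killChan <- nil
	}()
	return killChan
}

func main() {
	// Block until the pipeline reports completion or failure,
	// exactly as Run's comment asks the caller to do.
	if err := <-run(false); err != nil {
		fmt.Println("pipeline failed:", err)
		return
	}
	fmt.Println("pipeline succeeded")
}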
Example #11
// At this point in pipeline initialization, every dataProcessor has an input
// and output channel, but there is nothing connecting them together. In order
// to support branching and merging between stages (as defined by each
// dataProcessor's outputs), we set up some intermediary channels that will
// manage copying and passing data between stages, as well as properly closing
// channels when all data is received.
func (p *Pipeline) connectStages() {
	logger.Debug(p.Name, ": connecting stages")
	// First, set up the bridging channels and branchers/mergers to aid in
	// managing channel communication between processors.
	for _, stage := range p.layout.stages {
		for _, from := range stage.processors {
			if from.outputs != nil {
				from.branchOutChans = []chan data.JSON{}
				for _, to := range p.dataProcessorOutputs(from) {
					if to.mergeInChans == nil {
						to.mergeInChans = []chan data.JSON{}
					}
					c := p.initDataChan()
					from.branchOutChans = append(from.branchOutChans, c)
					to.mergeInChans = append(to.mergeInChans, c)
				}
			}
		}
	}
	// Loop through again and set up goroutines to handle data management
	// between the branchers and mergers
	for _, stage := range p.layout.stages {
		for _, dp := range stage.processors {
			if dp.branchOutChans != nil {
				dp.branchOut()
			}
			if dp.mergeInChans != nil {
				dp.mergeIn()
			}
		}
	}
}
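branchOut and mergeIn are not shown on this page. Conceptually they are a fan-out that copies each value to every branch channel and a fan-in that merges several channels into one, closing the merged channel only after every input closes. A generic standalone sketch of those two halves (not the library's implementation):

package main

import (
	"fmt"
	"sync"
)

// branchOut copies everything from in to every out channel, then
// closes the out channels once in is closed.
func branchOut(in chan []byte, outs []chan []byte) {
	go func() {
		for d := range in {
			for _, o := range outs {
				o <- d
			}
		}
		for _, o := range outs {
			close(o)
		}
	}()
}

// mergeIn forwards every in channel into out, closing out only after
// all inputs have closed.
func mergeIn(ins []chan []byte, out chan []byte) {
	var wg sync.WaitGroup
	for _, in := range ins {
		wg.Add(1)
		go func(in chan []byte) {
			defer wg.Done()
			for d := range in {
				out <- d
			}
		}(in)
	}
	go func() {
		wg.Wait()
		close(out)
	}()
}

func main() {
	src := make(chan []byte)
	a, b := make(chan []byte), make(chan []byte)
	merged := make(chan []byte)

	branchOut(src, []chan []byte{a, b})
	mergeIn([]chan []byte{a, b}, merged)

	go func() {
		src <- []byte("x")
		close(src)
	}()

	for d := range merged {
		fmt.Println(string(d)) // "x" arrives once per branch
	}
}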
Example #12
// ForEachQueryData handles generating the SQL (in case of dynamic mode),
// running the query and retrieving the data in data.JSON format, and then
// passing the results back with the function call to forEach.
func (r *BigQueryReader) ForEachQueryData(d data.JSON, killChan chan error, forEach func(d data.JSON)) {
	sql := ""
	var err error
	if r.query == "" && r.sqlGenerator != nil {
		sql, err = r.sqlGenerator(d)
		util.KillPipelineIfErr(err, killChan)
	} else if r.query != "" {
		sql = r.query
	} else {
		killChan <- errors.New("BigQueryReader: must have either static query or sqlGenerator func")
	}

	logger.Debug("BigQueryReader: Running -", sql)

	bqDataChan := make(chan bigquery.Data)
	go r.bqClient().AsyncQuery(r.PageSize, r.config.DatasetID, r.config.ProjectID, sql, bqDataChan)
	aggregatedData := bigquery.Data{}

	for bqd := range bqDataChan {
		util.KillPipelineIfErr(bqd.Err, killChan)
		logger.Info("BigQueryReader: received bqData: len(rows) =", len(bqd.Rows))
		// logger.Debug("   %+v", bqd)

		if bqd.Rows != nil && bqd.Headers != nil && len(bqd.Rows) > 0 {
			if r.AggregateResults {
				logger.Debug("BigQueryReader: aggregating results")
				aggregatedData.Headers = bqd.Headers
				aggregatedData.Rows = append(aggregatedData.Rows, bqd.Rows...)
			} else {
				// Send data as soon as we get it back
				logger.Debug("BigQueryReader: sending data without aggregation")
				d, err := data.JSONFromHeaderAndRows(bqd.Headers, bqd.Rows)
				util.KillPipelineIfErr(err, killChan)
				forEach(d) // pass back out via the forEach func
			}
		}
	}
	if r.AggregateResults {
		logger.Info("BigQueryReader: sending aggregated results: len(rows) =", len(aggregatedData.Rows))
		d, err := data.JSONFromHeaderAndRows(aggregatedData.Headers, aggregatedData.Rows)
		util.KillPipelineIfErr(err, killChan)
		forEach(d) // pass back out via the forEach func
	}
}
Example #13
// ProcessData sends the data it receives to the outputChan only if it matches the supplied regex
func (r *RegexpMatcher) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	matches, err := regexp.Match(r.pattern, d)
	util.KillPipelineIfErr(err, killChan)
	if r.DebugLog {
		logger.Debug("RegexpMatcher: checking if", string(d), "matches pattern", r.pattern, ". MATCH=", matches)
	}
	if matches {
		outputChan <- d
	}
}
Example #14
// ProcessData writes the data
func (w *IoWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	var bytesWritten int
	var err error
	if w.AddNewline {
		bytesWritten, err = fmt.Fprintln(w.Writer, string(d))
	} else {
		bytesWritten, err = w.Writer.Write(d)
	}
	util.KillPipelineIfErr(err, killChan)
	logger.Debug("IoWriter:", bytesWritten, "bytes written")
}
Example #15
// ProcessData writes data as is directly to the output file
func (f *FtpWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	logger.Debug("FTPWriter Process data:", string(d))
	if !f.authenticated {
		f.connect(killChan)
	}

	_, e := f.fileWriter.Write([]byte(d))
	if e != nil {
		util.KillPipelineIfErr(e, killChan)
	}
}
Example #16
// sendResults handles sending work that is completed, as well as
// guaranteeing a FIFO order of the resulting data sent over the
// original outputChan.
func (dp *dataProcessor) sendResults() {
	dp.Lock()
	logger.Debug("dataProcessor: sendResults checking for valid data to send")
	e := dp.workList.Front()
	for e != nil && e.Value.(*result).done {
		logger.Debug("dataHandler: sendResults sending data")
		res := dp.workList.Remove(e).(*result)
		for _, d := range res.data {
			res.outputChan <- d
		}
		if !res.open {
			logger.Debug("dataProcessor: sendResults closing outputChan")
			close(res.outputChan)
		}
		e = dp.workList.Front()
	}
	dp.Unlock()

	if dp.inputClosed && dp.workList.Len() == 0 {
		dp.doneChan <- true
	}
}
Example #17
// ProcessData reads an entire directory if a prefix is provided (sending each file in that
// directory to outputChan), or just sends the single file to outputChan if a complete
// file path is provided (not a prefix/directory).
//
// It optionally deletes all processed objects once the contents have been sent to outputChan
func (r *S3Reader) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	if r.prefix != "" {
		logger.Debug("S3Reader: process data for prefix", r.prefix)
		objects, err := util.ListS3Objects(r.client, r.bucket, r.prefix)
		logger.Debug("S3Reader: list =", objects)
		util.KillPipelineIfErr(err, killChan)
		for _, o := range objects {
			obj, err := util.GetS3Object(r.client, r.bucket, o)
			util.KillPipelineIfErr(err, killChan)
			r.processObject(obj, outputChan, killChan)
			r.processedObjectKeys = append(r.processedObjectKeys, o)
		}
	} else {
		logger.Debug("S3Reader: process data for object", r.object)
		obj, err := util.GetS3Object(r.client, r.bucket, r.object)
		util.KillPipelineIfErr(err, killChan)
		r.processObject(obj, outputChan, killChan)
		r.processedObjectKeys = append(r.processedObjectKeys, r.object)
	}
	if r.DeleteObjects {
		_, err := util.DeleteS3Objects(r.client, r.bucket, r.processedObjectKeys)
		util.KillPipelineIfErr(err, killChan)
	}
}
Example #18
func (dp *dataProcessor) processData(d data.JSON, killChan chan error) {
	logger.Debug("dataProcessor: processData", dp, "with concurrency =", dp.concurrency)
	// If no concurrency is needed, simply call stage.ProcessData and return...
	if dp.concurrency <= 1 {
		dp.recordExecution(func() {
			dp.ProcessData(d, dp.outputChan, killChan)
		})
		return
	}
	// ... otherwise process the data in a concurrent queue/pool of goroutines
	logger.Debug("dataProcessor: processData", dp, "waiting for work")
	// wait for room in the queue
	dp.workThrottle <- workSignal{}
	logger.Debug("dataProcessor: processData", dp, "work obtained")
	rc := make(chan data.JSON)
	done := make(chan bool)
	exit := make(chan bool)
	// setup goroutine to handle result
	go func() {
		res := result{outputChan: dp.outputChan, data: []data.JSON{}, open: true}
		dp.Lock()
		dp.workList.PushBack(&res)
		dp.Unlock()
		logger.Debug("dataProcessor: processData", dp, "waiting to receive data on result chan")
		for {
			select {
			case d, open := <-rc:
				logger.Debug("dataProcessor: processData", dp, "received data on result chan")
				res.data = append(res.data, d)
				// outputChan will need to be closed if the rc chan was closed
				res.open = open
			case <-done:
				res.done = true
				logger.Debug("dataProcessor: processData", dp, "done, releasing work")
				<-dp.workThrottle
				dp.sendResults()
				exit <- true
				return
			}
		}
	}()
	// do normal data processing, passing in new result chan
	// instead of the original outputChan
	go dp.recordExecution(func() {
		dp.ProcessData(d, rc, killChan)
		done <- true
	})

	// wait on processing to complete
	<-exit
}
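The workThrottle send/receive pair above is the classic buffered-channel semaphore: sending acquires a slot, receiving releases it, capping how many ProcessData calls run at once (the result-list machinery for FIFO ordering is a separate concern). A standalone sketch of just that throttle:

package main

import (
	"fmt"
	"sync"
	"time"
)

type workSignal struct{}

func main() {
	concurrency := 2
	throttle := make(chan workSignal, concurrency) // at most 2 slots

	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		throttle <- workSignal{} // acquire a slot (blocks when full)
		go func(i int) {
			defer wg.Done()
			defer func() { <-throttle }() // release the slot
			fmt.Println("processing item", i)
			time.Sleep(10 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}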
Example #19
// DeleteS3Objects deletes the objects specified by the given object keys
func DeleteS3Objects(client *s3.S3, bucket string, objKeys []string) (*s3.DeleteObjectsOutput, error) {
	logger.Debug("DeleteS3Objects: ", bucket, "-", objKeys)
	s3Ids := make([]*s3.ObjectIdentifier, len(objKeys))
	for i, key := range objKeys {
		s3Ids[i] = &s3.ObjectIdentifier{Key: aws.String(key)}
	}

	params := &s3.DeleteObjectsInput{
		Bucket: aws.String(bucket), // Required
		Delete: &s3.Delete{ // Required
			Objects: s3Ids,
			Quiet:   aws.Bool(true),
		},
		// MFA:          aws.String("MFA"),
		// RequestPayer: aws.String("RequestPayer"),
	}

	return client.DeleteObjects(params)
}
Example #20
// ListS3Objects returns all object keys matching the given prefix. Note that
// delimiter is set to "/". See http://docs.aws.amazon.com/AmazonS3/latest/dev/ListingKeysHierarchy.html
func ListS3Objects(client *s3.S3, bucket, keyPrefix string) ([]string, error) {
	logger.Debug("ListS3Objects: ", bucket, "-", keyPrefix)
	params := &s3.ListObjectsInput{
		Bucket:    aws.String(bucket), // Required
		Delimiter: aws.String("/"),
		// EncodingType: aws.String("EncodingType"),
		// Marker:       aws.String("Marker"),
		MaxKeys: aws.Int64(1000),
		Prefix:  aws.String(keyPrefix),
	}

	objects := []string{}
	err := client.ListObjectsPages(params, func(page *s3.ListObjectsOutput, lastPage bool) bool {
		for _, o := range page.Contents {
			objects = append(objects, *o.Key)
		}
		// Return true to keep paging; the SDK stops as soon as this
		// callback returns false, so returning lastPage here would
		// quit after the first of several pages.
		return !lastPage
	})
	if err != nil {
		return nil, err
	}

	return objects, nil
}
Example #21
// ProcessData writes data as is directly to the output file
func (w *SftpWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
	logger.Debug("SftpWriter Process data:", string(d))
	w.ensureInitialized(killChan)
	_, e := w.file.Write([]byte(d))
	util.KillPipelineIfErr(e, killChan)
}