// ProcessData defers to util.SQLInsertData
func (s *SQLWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    // handle panics a bit more gracefully
    defer func() {
        if err := recover(); err != nil {
            util.KillPipelineIfErr(err.(error), killChan)
        }
    }()

    // First check for SQLWriterData
    var wd SQLWriterData
    err := data.ParseJSONSilent(d, &wd)
    logger.Info("SQLWriter: Writing data...")
    if err == nil && wd.TableName != "" && wd.InsertData != nil {
        logger.Debug("SQLWriter: SQLWriterData scenario")
        dd, err := data.NewJSON(wd.InsertData)
        util.KillPipelineIfErr(err, killChan)
        err = util.SQLInsertData(s.writeDB, dd, wd.TableName, s.OnDupKeyUpdate, s.OnDupKeyFields, s.BatchSize)
        util.KillPipelineIfErr(err, killChan)
    } else {
        logger.Debug("SQLWriter: normal data scenario")
        err = util.SQLInsertData(s.writeDB, d, s.TableName, s.OnDupKeyUpdate, s.OnDupKeyFields, s.BatchSize)
        util.KillPipelineIfErr(err, killChan)
    }
    logger.Info("SQLWriter: Write complete")
}
// insertObjects prepares and executes a single INSERT statement covering all
// of the given objects against the target table.
func insertObjects(db *sql.DB, objects []map[string]interface{}, tableName string, onDupKeyUpdate bool, onDupKeyFields []string) error {
    logger.Info("SQLInsertData: building INSERT for len(objects) =", len(objects))
    insertSQL, vals := buildInsertSQL(objects, tableName, onDupKeyUpdate, onDupKeyFields)

    logger.Debug("SQLInsertData:", insertSQL)
    logger.Debug("SQLInsertData: values", vals)

    stmt, err := db.Prepare(insertSQL)
    if err != nil {
        logger.Debug("SQLInsertData: error preparing SQL")
        return err
    }
    defer stmt.Close()

    res, err := stmt.Exec(vals...)
    if err != nil {
        return err
    }
    lastID, err := res.LastInsertId()
    if err != nil {
        return err
    }
    rowCnt, err := res.RowsAffected()
    if err != nil {
        return err
    }
    logger.Info(fmt.Sprintf("SQLInsertData: rows affected = %d, last insert ID = %d", rowCnt, lastID))

    return nil
}
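The buildInsertSQL helper is referenced above but not shown. For orientation, here is a rough sketch of one possible implementation, using only the standard library (fmt, sort, strings). The MySQL-style "?" placeholders, the ON DUPLICATE KEY UPDATE clause, and the deterministic column ordering are all assumptions for illustration, not the library's actual code:

// buildInsertSQLSketch is a hypothetical stand-in for the buildInsertSQL helper
// used above. It builds a multi-row INSERT with "?" placeholders and flattens
// the values in matching column order. Assumes at least one object with at
// least one column.
func buildInsertSQLSketch(objects []map[string]interface{}, tableName string, onDupKeyUpdate bool, onDupKeyFields []string) (string, []interface{}) {
    // fix a deterministic column order based on the first object
    cols := make([]string, 0, len(objects[0]))
    for c := range objects[0] {
        cols = append(cols, c)
    }
    sort.Strings(cols)

    rowPlaceholder := "(" + strings.TrimRight(strings.Repeat("?,", len(cols)), ",") + ")"
    rows := make([]string, len(objects))
    vals := make([]interface{}, 0, len(objects)*len(cols))
    for i, obj := range objects {
        rows[i] = rowPlaceholder
        for _, c := range cols {
            vals = append(vals, obj[c])
        }
    }

    insertSQL := fmt.Sprintf("INSERT INTO %s (%s) VALUES %s",
        tableName, strings.Join(cols, ","), strings.Join(rows, ","))

    if onDupKeyUpdate {
        updates := make([]string, len(onDupKeyFields))
        for i, f := range onDupKeyFields {
            updates[i] = fmt.Sprintf("%s=VALUES(%s)", f, f)
        }
        insertSQL += " ON DUPLICATE KEY UPDATE " + strings.Join(updates, ",")
    }
    return insertSQL, vals
}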
// ParseJSON is a simple wrapper for json.Unmarshal
func ParseJSON(d JSON, v interface{}) error {
    err := json.Unmarshal(d, v)
    if err != nil {
        logger.Debug(fmt.Sprintf("data: failure to unmarshal JSON into %+v - error is \"%v\"", v, err.Error()))
        logger.Debug(fmt.Sprintf("  Failed Data: %+v", string(d)))
    }
    return err
}
// NewJSON is a simple wrapper for json.Marshal.
func NewJSON(v interface{}) (JSON, error) {
    d, err := json.Marshal(v)
    if err != nil {
        logger.Debug(fmt.Sprintf("data: failure to marshal JSON %+v - error is \"%v\"", v, err.Error()))
        logger.Debug(fmt.Sprintf("  Failed val: %+v", v))
    }
    return d, err
}
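As a small usage sketch of the two wrappers above (assuming the package is imported as data and that data.JSON behaves like a []byte, as its use with json.Unmarshal implies):

// user is an illustrative struct for the round trip below.
type user struct {
    ID   int    `json:"id"`
    Name string `json:"name"`
}

// roundTrip marshals a value into data.JSON and unmarshals it back.
func roundTrip() error {
    d, err := data.NewJSON(user{ID: 1, Name: "ada"})
    if err != nil {
        return err
    }
    var u user
    return data.ParseJSON(d, &u) // logs the offending payload at debug level on failure
}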
// ProcessData runs the SQL statements, deferring to util.ExecuteSQLQuery
func (s *SQLExecutor) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    // handle panics a bit more gracefully
    defer func() {
        if err := recover(); err != nil {
            util.KillPipelineIfErr(err.(error), killChan)
        }
    }()

    sql := ""
    var err error
    if s.query == "" && s.sqlGenerator != nil {
        sql, err = s.sqlGenerator(d)
        util.KillPipelineIfErr(err, killChan)
    } else if s.query != "" {
        sql = s.query
    } else {
        killChan <- errors.New("SQLExecutor: must have either static query or sqlGenerator func")
    }

    logger.Debug("SQLExecutor: Running - ", sql)

    // See sql.go
    err = util.ExecuteSQLQuery(s.readDB, sql)
    util.KillPipelineIfErr(err, killChan)

    logger.Info("SQLExecutor: Query complete")
}
// GetS3Object returns the object output for the given object key
func GetS3Object(client *s3.S3, bucket, objKey string) (*s3.GetObjectOutput, error) {
    logger.Debug("GetS3Object: ", bucket, "-", objKey)

    params := &s3.GetObjectInput{
        Bucket: aws.String(bucket), // Required
        Key:    aws.String(objKey), // Required
        // IfMatch:                    aws.String("IfMatch"),
        // IfModifiedSince:            aws.Time(time.Now()),
        // IfNoneMatch:                aws.String("IfNoneMatch"),
        // IfUnmodifiedSince:          aws.Time(time.Now()),
        // Range:                      aws.String("Range"),
        // RequestPayer:               aws.String("RequestPayer"),
        // ResponseCacheControl:       aws.String("ResponseCacheControl"),
        // ResponseContentDisposition: aws.String("ResponseContentDisposition"),
        // ResponseContentEncoding:    aws.String("ResponseContentEncoding"),
        // ResponseContentLanguage:    aws.String("ResponseContentLanguage"),
        // ResponseContentType:        aws.String("ResponseContentType"),
        // ResponseExpires:            aws.Time(time.Now()),
        // SSECustomerAlgorithm:       aws.String("SSECustomerAlgorithm"),
        // SSECustomerKey:             aws.String("SSECustomerKey"),
        // SSECustomerKeyMD5:          aws.String("SSECustomerKeyMD5"),
        // VersionId:                  aws.String("ObjectVersionId"),
    }

    return client.GetObject(params)
}
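A usage note: the returned *s3.GetObjectOutput exposes the object contents as an io.ReadCloser in its Body field, which the caller is responsible for closing. A minimal sketch of draining it, assuming io/ioutil and the same aws-sdk-go s3 package used above:

// readS3ObjectBody fetches an object and returns its full contents.
func readS3ObjectBody(client *s3.S3, bucket, key string) ([]byte, error) {
    obj, err := GetS3Object(client, bucket, key)
    if err != nil {
        return nil, err
    }
    defer obj.Body.Close() // Body is an io.ReadCloser owned by the caller
    return ioutil.ReadAll(obj.Body)
}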
// ObjectsFromJSON is a helper for parsing JSON into a slice of
// generic maps/objects. The use-case is when a stage is expecting
// to receive either a JSON object or an array of JSON objects, and
// wants to deal with it in a generic fashion.
func ObjectsFromJSON(d JSON) ([]map[string]interface{}, error) {
    var objects []map[string]interface{}

    // return if we have null instead of object(s).
    if bytes.Equal(d, []byte("null")) {
        logger.Debug("ObjectsFromJSON: received null. Expected object or objects. Skipping.")
        return objects, nil
    }

    var v interface{}
    err := ParseJSON(d, &v)
    if err != nil {
        return nil, err
    }

    // check if we have a single object or a slice of objects
    switch vv := v.(type) {
    case []interface{}:
        for _, o := range vv {
            objects = append(objects, o.(map[string]interface{}))
        }
    case map[string]interface{}:
        objects = []map[string]interface{}{vv}
    case []map[string]interface{}:
        objects = vv
    default:
        err = fmt.Errorf("ObjectsFromJSON: unsupported data type: %T", vv)
        return nil, err
    }

    return objects, nil
}
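A short usage sketch: whether the upstream payload is a single JSON object or an array of objects, the caller gets back a uniform slice. The literals below assume data.JSON converts from a string like a []byte:

// printIDs shows that single objects and arrays come back in the same shape.
func printIDs() {
    single := data.JSON(`{"id": 1}`)
    batch := data.JSON(`[{"id": 1}, {"id": 2}]`)

    for _, d := range []data.JSON{single, batch} {
        objects, err := data.ObjectsFromJSON(d)
        if err != nil {
            log.Fatal(err)
        }
        for _, o := range objects {
            fmt.Println(o["id"]) // each entry is a generic map[string]interface{}
        }
    }
}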
// runStages launches a goroutine for each DataProcessor in every stage,
// feeding it from its input channel until that channel is closed, then
// calling Finish and closing its output channel.
func (p *Pipeline) runStages(killChan chan error) {
    for n, stage := range p.layout.stages {
        for _, dp := range stage.processors {
            p.wg.Add(1)

            // Each DataProcessor runs in a separate goroutine.
            go func(n int, dp *dataProcessor) {
                // This is where the main DataProcessor interface
                // functions are called.
                logger.Info(p.Name, "- stage", n+1, dp, "waiting to receive data")
                for d := range dp.inputChan {
                    logger.Info(p.Name, "- stage", n+1, dp, "received data")
                    if p.PrintData {
                        logger.Debug(p.Name, "- stage", n+1, dp, "data =", string(d))
                    }
                    dp.recordDataReceived(d)
                    dp.processData(d, killChan)
                }
                logger.Info(p.Name, "- stage", n+1, dp, "input closed, calling Finish")
                dp.Finish(dp.outputChan, killChan)
                if dp.outputChan != nil {
                    logger.Info(p.Name, "- stage", n+1, dp, "closing output")
                    close(dp.outputChan)
                }
                p.wg.Done()
            }(n, dp)
        }
    }
}
// ForEachQueryData handles generating the SQL (in case of dynamic mode),
// running the query and retrieving the data in data.JSON format, and then
// passing the results back with the function call to forEach.
func (s *SQLReader) ForEachQueryData(d data.JSON, killChan chan error, forEach func(d data.JSON)) {
    sql := ""
    var err error
    if s.query == "" && s.sqlGenerator != nil {
        sql, err = s.sqlGenerator(d)
        util.KillPipelineIfErr(err, killChan)
    } else if s.query != "" {
        sql = s.query
    } else {
        killChan <- errors.New("SQLReader: must have either static query or sqlGenerator func")
    }

    logger.Debug("SQLReader: Running - ", sql)

    // See sql.go
    dataChan, err := util.GetDataFromSQLQuery(s.readDB, sql, s.BatchSize, s.StructDestination)
    util.KillPipelineIfErr(err, killChan)

    for d := range dataChan {
        // First check if an error was returned back from the SQL processing
        // helper, then if not call forEach with the received data.
        var derr dataErr
        if err := data.ParseJSONSilent(d, &derr); err == nil {
            util.KillPipelineIfErr(errors.New(derr.Error), killChan)
        } else {
            forEach(d)
        }
    }
}
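In dynamic mode the reader calls s.sqlGenerator(d) to build the query from the incoming payload. A hedged sketch of such a generator follows; the func(data.JSON) (string, error) shape is inferred from the call above, and the userEvent type and events table are illustrative only:

// userEvent is a hypothetical shape for the JSON arriving from the upstream stage.
type userEvent struct {
    UserID int `json:"user_id"`
}

// eventsQueryGenerator is an illustrative sqlGenerator: it derives the query
// from the incoming data.JSON rather than using a static query string.
func eventsQueryGenerator(d data.JSON) (string, error) {
    var ev userEvent
    if err := data.ParseJSON(d, &ev); err != nil {
        return "", err
    }
    return fmt.Sprintf("SELECT * FROM events WHERE user_id = %d", ev.UserID), nil
}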
// Run finalizes the channel connections between PipelineStages
// and kicks off execution.
// Run will return a killChan that should be waited on so your calling function doesn't
// return prematurely. Any stage of the pipeline can send to the killChan to halt
// execution. Your calling function should check if the sent value is an error or nil to know if
// execution was a failure or a success (nil being the success value).
func (p *Pipeline) Run() (killChan chan error) {
    p.timer = util.StartTimer()
    killChan = make(chan error)
    p.connectStages()
    p.runStages(killChan)

    // After all the stages are running, send the StartSignal
    // to the initial stage processors to kick off execution, and
    // then wait until all the processing goroutines are done to
    // signal successful pipeline completion.
    for _, dp := range p.layout.stages[0].processors {
        logger.Debug(p.Name, ": sending", StartSignal, "to", dp)
        dp.inputChan <- data.JSON(StartSignal)
        dp.Finish(dp.outputChan, killChan)
        close(dp.inputChan)
    }
    go func() {
        p.wg.Wait()
        p.timer.Stop()
        killChan <- nil
    }()

    handleInterrupt(killChan)

    return killChan
}
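Given that contract, the calling code typically blocks on the returned channel until the pipeline finishes or fails. The sketch below assumes hypothetical constructors (NewPipeline, NewSQLReader, NewSQLWriter); only the killChan handling reflects the doc comment above:

// runETL wires a hypothetical reader stage into a writer stage and waits for completion.
func runETL(readDB, writeDB *sql.DB) {
    p := NewPipeline(
        NewSQLReader(readDB, "SELECT id, name FROM users"),
        NewSQLWriter(writeDB, "users_copy"),
    )

    err := <-p.Run() // block until completion or until a stage sends an error
    if err != nil {
        log.Fatal("pipeline failed: ", err)
    }
    log.Println("pipeline completed successfully")
}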
// At this point in pipeline initialization, every dataProcessor has an input
// and output channel, but there is nothing connecting them together. In order
// to support branching and merging between stages (as defined by each
// dataProcessor's outputs), we set up some intermediary channels that will
// manage copying and passing data between stages, as well as properly closing
// channels when all data is received.
func (p *Pipeline) connectStages() {
    logger.Debug(p.Name, ": connecting stages")

    // First, set up the bridging channels & branchers/mergers to aid in
    // managing channel communication between processors.
    for _, stage := range p.layout.stages {
        for _, from := range stage.processors {
            if from.outputs != nil {
                from.branchOutChans = []chan data.JSON{}
                for _, to := range p.dataProcessorOutputs(from) {
                    if to.mergeInChans == nil {
                        to.mergeInChans = []chan data.JSON{}
                    }
                    c := p.initDataChan()
                    from.branchOutChans = append(from.branchOutChans, c)
                    to.mergeInChans = append(to.mergeInChans, c)
                }
            }
        }
    }

    // Loop through again and set up goroutines to handle data management
    // between the branchers and mergers.
    for _, stage := range p.layout.stages {
        for _, dp := range stage.processors {
            if dp.branchOutChans != nil {
                dp.branchOut()
            }
            if dp.mergeInChans != nil {
                dp.mergeIn()
            }
        }
    }
}
// ForEachQueryData handles generating the SQL (in case of dynamic mode),
// running the query and retrieving the data in data.JSON format, and then
// passing the results back with the function call to forEach.
func (r *BigQueryReader) ForEachQueryData(d data.JSON, killChan chan error, forEach func(d data.JSON)) {
    sql := ""
    var err error
    if r.query == "" && r.sqlGenerator != nil {
        sql, err = r.sqlGenerator(d)
        util.KillPipelineIfErr(err, killChan)
    } else if r.query != "" {
        sql = r.query
    } else {
        killChan <- errors.New("BigQueryReader: must have either static query or sqlGenerator func")
    }

    logger.Debug("BigQueryReader: Running -", sql)

    bqDataChan := make(chan bigquery.Data)
    go r.bqClient().AsyncQuery(r.PageSize, r.config.DatasetID, r.config.ProjectID, sql, bqDataChan)
    aggregatedData := bigquery.Data{}

    for bqd := range bqDataChan {
        util.KillPipelineIfErr(bqd.Err, killChan)
        logger.Info("BigQueryReader: received bqData: len(rows) =", len(bqd.Rows))
        // logger.Debug("  %+v", bqd)

        if bqd.Rows != nil && bqd.Headers != nil && len(bqd.Rows) > 0 {
            if r.AggregateResults {
                logger.Debug("BigQueryReader: aggregating results")
                aggregatedData.Headers = bqd.Headers
                aggregatedData.Rows = append(aggregatedData.Rows, bqd.Rows...)
            } else {
                // Send data as soon as we get it back
                logger.Debug("BigQueryReader: sending data without aggregation")
                d, err := data.JSONFromHeaderAndRows(bqd.Headers, bqd.Rows)
                util.KillPipelineIfErr(err, killChan)
                forEach(d) // pass back out via the forEach func
            }
        }
    }
    if r.AggregateResults {
        logger.Info("BigQueryReader: sending aggregated results: len(rows) =", len(aggregatedData.Rows))
        d, err := data.JSONFromHeaderAndRows(aggregatedData.Headers, aggregatedData.Rows)
        util.KillPipelineIfErr(err, killChan)
        forEach(d) // pass back out via the forEach func
    }
}
// ProcessData sends the data it receives to the outputChan only if it matches the supplied regex
func (r *RegexpMatcher) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    matches, err := regexp.Match(r.pattern, d)
    util.KillPipelineIfErr(err, killChan)
    if r.DebugLog {
        logger.Debug("RegexpMatcher: checking if", string(d), "matches pattern", r.pattern, ". MATCH=", matches)
    }
    if matches {
        outputChan <- d
    }
}
// ProcessData writes the data
func (w *IoWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    var bytesWritten int
    var err error
    if w.AddNewline {
        bytesWritten, err = fmt.Fprintln(w.Writer, string(d))
    } else {
        bytesWritten, err = w.Writer.Write(d)
    }
    util.KillPipelineIfErr(err, killChan)
    logger.Debug("IoWriter:", bytesWritten, "bytes written")
}
// ProcessData writes data as is directly to the output file
func (f *FtpWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    logger.Debug("FTPWriter Process data:", string(d))
    if !f.authenticated {
        f.connect(killChan)
    }
    _, e := f.fileWriter.Write([]byte(d))
    if e != nil {
        util.KillPipelineIfErr(e, killChan)
    }
}
// sendResults handles sending work that is completed, as well as
// guaranteeing a FIFO order of the resulting data sent over the
// original outputChan.
func (dp *dataProcessor) sendResults() {
    dp.Lock()
    logger.Debug("dataProcessor: sendResults checking for valid data to send")
    e := dp.workList.Front()
    for e != nil && e.Value.(*result).done {
        logger.Debug("dataProcessor: sendResults sending data")
        res := dp.workList.Remove(e).(*result)
        for _, d := range res.data {
            res.outputChan <- d
        }
        if !res.open {
            logger.Debug("dataProcessor: sendResults closing outputChan")
            close(res.outputChan)
        }
        e = dp.workList.Front()
    }
    dp.Unlock()

    if dp.inputClosed && dp.workList.Len() == 0 {
        dp.doneChan <- true
    }
}
// ProcessData reads an entire directory if a prefix is provided (sending each file in that
// directory to outputChan), or just sends the single file to outputChan if a complete
// file path is provided (not a prefix/directory).
//
// It optionally deletes all processed objects once the contents have been sent to outputChan.
func (r *S3Reader) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    if r.prefix != "" {
        logger.Debug("S3Reader: process data for prefix", r.prefix)
        objects, err := util.ListS3Objects(r.client, r.bucket, r.prefix)
        logger.Debug("S3Reader: list =", objects)
        util.KillPipelineIfErr(err, killChan)
        for _, o := range objects {
            obj, err := util.GetS3Object(r.client, r.bucket, o)
            util.KillPipelineIfErr(err, killChan)
            r.processObject(obj, outputChan, killChan)
            r.processedObjectKeys = append(r.processedObjectKeys, o)
        }
    } else {
        logger.Debug("S3Reader: process data for object", r.object)
        obj, err := util.GetS3Object(r.client, r.bucket, r.object)
        util.KillPipelineIfErr(err, killChan)
        r.processObject(obj, outputChan, killChan)
        r.processedObjectKeys = append(r.processedObjectKeys, r.object)
    }
    if r.DeleteObjects {
        _, err := util.DeleteS3Objects(r.client, r.bucket, r.processedObjectKeys)
        util.KillPipelineIfErr(err, killChan)
    }
}
// processData invokes the DataProcessor either synchronously (concurrency <= 1)
// or via the bounded worker pool, queuing a result entry so that output order
// is preserved by sendResults.
func (dp *dataProcessor) processData(d data.JSON, killChan chan error) {
    logger.Debug("dataProcessor: processData", dp, "with concurrency =", dp.concurrency)
    // If no concurrency is needed, simply call stage.ProcessData and return...
    if dp.concurrency <= 1 {
        dp.recordExecution(func() {
            dp.ProcessData(d, dp.outputChan, killChan)
        })
        return
    }
    // ...otherwise process the data in a concurrent queue/pool of goroutines
    logger.Debug("dataProcessor: processData", dp, "waiting for work")
    // wait for room in the queue
    dp.workThrottle <- workSignal{}
    logger.Debug("dataProcessor: processData", dp, "work obtained")
    rc := make(chan data.JSON)
    done := make(chan bool)
    exit := make(chan bool)
    // setup goroutine to handle result
    go func() {
        res := result{outputChan: dp.outputChan, data: []data.JSON{}, open: true}
        dp.Lock()
        dp.workList.PushBack(&res)
        dp.Unlock()
        logger.Debug("dataProcessor: processData", dp, "waiting to receive data on result chan")
        for {
            select {
            case d, open := <-rc:
                logger.Debug("dataProcessor: processData", dp, "received data on result chan")
                res.data = append(res.data, d)
                // outputChan will need to be closed if the rc chan was closed
                res.open = open
            case <-done:
                res.done = true
                logger.Debug("dataProcessor: processData", dp, "done, releasing work")
                <-dp.workThrottle
                dp.sendResults()
                exit <- true
                return
            }
        }
    }()
    // do normal data processing, passing in new result chan
    // instead of the original outputChan
    go dp.recordExecution(func() {
        dp.ProcessData(d, rc, killChan)
        done <- true
    })
    // wait on processing to complete
    <-exit
}
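The combination of workThrottle, workList, and sendResults above implements a common pattern: process items concurrently but deliver results in submission order. Below is a small standalone sketch of that idea; the per-item channel approach and names are illustrative, not the library's implementation:

package main

import (
    "fmt"
    "strings"
)

// orderedResults runs process concurrently (bounded by concurrency) but emits
// results in the order the inputs were submitted, mirroring the FIFO guarantee
// of sendResults above.
func orderedResults(inputs []string, concurrency int, process func(string) string) <-chan string {
    out := make(chan string)
    slots := make([]chan string, len(inputs))    // one buffered slot per work item, in submission order
    throttle := make(chan struct{}, concurrency) // bounds in-flight work, like workThrottle

    for i := range inputs {
        slots[i] = make(chan string, 1)
        throttle <- struct{}{} // wait for room in the pool
        go func(i int) {
            defer func() { <-throttle }() // release the slot when done
            slots[i] <- process(inputs[i])
        }(i)
    }

    // drain the slots in FIFO order regardless of completion order
    go func() {
        for _, slot := range slots {
            out <- <-slot
        }
        close(out)
    }()
    return out
}

func main() {
    for r := range orderedResults([]string{"a", "b", "c", "d"}, 2, strings.ToUpper) {
        fmt.Println(r) // always prints A, B, C, D in order
    }
}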
// DeleteS3Objects deletes the objects specified by the given object keys
func DeleteS3Objects(client *s3.S3, bucket string, objKeys []string) (*s3.DeleteObjectsOutput, error) {
    logger.Debug("DeleteS3Objects: ", bucket, "-", objKeys)

    s3Ids := make([]*s3.ObjectIdentifier, len(objKeys))
    for i, key := range objKeys {
        s3Ids[i] = &s3.ObjectIdentifier{Key: aws.String(key)}
    }

    params := &s3.DeleteObjectsInput{
        Bucket: aws.String(bucket), // Required
        Delete: &s3.Delete{ // Required
            Objects: s3Ids,
            Quiet:   aws.Bool(true),
        },
        // MFA:          aws.String("MFA"),
        // RequestPayer: aws.String("RequestPayer"),
    }

    return client.DeleteObjects(params)
}
// ListS3Objects returns all object keys matching the given prefix. Note that
// delimiter is set to "/". See http://docs.aws.amazon.com/AmazonS3/latest/dev/ListingKeysHierarchy.html
func ListS3Objects(client *s3.S3, bucket, keyPrefix string) ([]string, error) {
    logger.Debug("ListS3Objects: ", bucket, "-", keyPrefix)

    params := &s3.ListObjectsInput{
        Bucket:    aws.String(bucket), // Required
        Delimiter: aws.String("/"),
        // EncodingType: aws.String("EncodingType"),
        // Marker:       aws.String("Marker"),
        MaxKeys: aws.Int64(1000),
        Prefix:  aws.String(keyPrefix),
    }

    objects := []string{}
    err := client.ListObjectsPages(params,
        func(page *s3.ListObjectsOutput, lastPage bool) bool {
            for _, o := range page.Contents {
                objects = append(objects, *o.Key)
            }
            // returning true tells the SDK to fetch the next page,
            // so keep going until the last page has been processed
            return !lastPage
        })
    if err != nil {
        return nil, err
    }

    return objects, nil
}
// ProcessData writes data as is directly to the output file
func (w *SftpWriter) ProcessData(d data.JSON, outputChan chan data.JSON, killChan chan error) {
    logger.Debug("SftpWriter Process data:", string(d))
    w.ensureInitialized(killChan)
    _, e := w.file.Write([]byte(d))
    util.KillPipelineIfErr(e, killChan)
}