// filterRejectedRows checks for per-row responses,
// removes rejected rows from the given table, and returns an index slice
// of the rows removed.
//
// Rows are rejected if the BigQuery insert response marked them with any
// error reason other than "stopped" or "timeout". See the following URL
// for further info:
// https://cloud.google.com/bigquery/streaming-data-into-bigquery#troubleshooting
func (b *Streamer) filterRejectedRows(
	responses *bigquery.TableDataInsertAllResponse,
	pID, dID, tID string,
	d map[string]table) (rowsToFilter []int64) {

	// Go through all rows and rows' errors, and remove rejected (bad) rows.
	if responses != nil {
		for _, rowErrors := range responses.InsertErrors {
			// We use a sanity switch to make sure we don't append
			// the same row to be deleted more than once.
			filter := false

			// Each row can have several errors.
			// Go through each of these, and remove the row if any of its
			// error reasons is neither "stopped" nor "timeout".
			// Also log these errors on the fly.
			for _, rowErrorPtr := range rowErrors.Errors {
				rowError := *rowErrorPtr

				// Mark invalid rows to be deleted.
				switch rowError.Reason {
				// Do nothing for these error reasons.
				case "stopped", "timeout":
				// Filter and log everything else.
				default:
					if !filter {
						rowsToFilter = append(rowsToFilter, rowErrors.Index)
						filter = true
					}

					// Update the table schema if necessary.
					if b.shouldUpdateTableSchema(rowError) {
						fmt.Println("bqstreamer: updating schema...")
						schema, err := bq.SchemaFromJSON(d[tID][rowErrors.Index].jsonValue)
						if err == nil {
							_, err := b.updateTableSchema(pID, dID, tID, schema)
							if err == nil {
								// Re-queue this row for the next insert.
								b.QueueRow(pID, dID, tID, d[tID][rowErrors.Index].jsonValue)
								continue
							} else {
								b.Errors <- err
							}
						}
					}

					// Log all errors besides "stopped" and "timeout" ones.
					b.Errors <- fmt.Errorf(
						"%s.%s.%s.row[%d]: %s in %s: %s: %s",
						pID, dID, tID,
						rowErrors.Index,
						rowError.Reason, rowError.Location, rowError.Message,
						d[tID][rowErrors.Index].jsonValue)
				}
			}
		}
	}

	// Remove accumulated rejected rows from the table (if there were any).
	if len(rowsToFilter) > 0 {
		// Replace the table with its filtered version instead of mutating
		// the original, since the filtered slice has a different len().
		//
		// XXX is this ok?
		d[tID] = b.filterRowsFromTable(rowsToFilter, d[tID])
	}

	return
}
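
// The filterRowsFromTable helper referenced above isn't shown in this excerpt.
// The following is a minimal sketch of what it could look like, not the
// library's actual implementation: it assumes the table type is a slice of
// *tableRow (as implied by the append() calls in insertAllToBigQuery below),
// and drops the rows whose indexes were marked as rejected.
func (b *Streamer) filterRowsFromTable(indexes []int64, t table) table {
	// Collect rejected indexes into a set for O(1) lookups.
	rejected := make(map[int64]struct{}, len(indexes))
	for _, index := range indexes {
		rejected[index] = struct{}{}
	}

	// Copy only the rows whose index wasn't marked as rejected.
	filtered := make(table, 0, len(t))
	for i, row := range t {
		if _, ok := rejected[int64(i)]; !ok {
			filtered = append(filtered, row)
		}
	}

	return filtered
}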
// insertAllToBigQuery inserts all rows from all tables to BigQuery.
// Each table is inserted separately, according to BigQuery's requirements.
// Insert errors are reported to the error channel.
func (b *Streamer) insertAllToBigQuery() {
	// Sort rows by project->dataset->table.
	// Necessary because each InsertAll() has to be for a single table.
	ps := map[string]project{}
	for i := 0; i < b.rowIndex; i++ {
		r := b.rows[i]

		p, d, t := r.projectID, r.datasetID, r.tableID

		// Create project, dataset and table if uninitialized.
		createTableIfNotExists(ps, p, d, t)

		// Append row to table, and generate a random 16-character row ID
		// for de-duplication purposes.
		ps[p][d][t] = append(ps[p][d][t], &tableRow{
			rowID:     uniuri.NewLen(16),
			jsonValue: r.data,
		})
	}

	// Stream insert each table to BigQuery.
	for pID, p := range ps {
		for dID, d := range p {
			for tID := range d {
				// Insert to a single table in bulk, and retry the insert
				// on certain errors. Keep on retrying until successful.
				numRetries := 0

				for {
					numRetries++
					if numRetries > b.MaxRetryInsert {
						b.Errors <- fmt.Errorf(
							"Insert table %s retried %d times, dropping insert and moving on",
							tID, b.MaxRetryInsert)
						break
					} else if len(d[tID]) == 0 {
						b.Errors <- fmt.Errorf("All rows from table %s have been filtered, moving on", tID)
						break
					}

					responses, err := b.insertTable(pID, dID, tID, d[tID])

					// Automatically create the table if it doesn't exist yet.
					if b.shouldInsertNewTable(err) {
						row := d[tID][0]
						// TODO: remove bq dependency?
						schema, _ := bq.SchemaFromJSON(row.jsonValue)
						var table *bigquery.Table
						table, err = b.insertNewTable(pID, dID, tID, schema)
						if err == nil {
							fmt.Println("BQ: Created table", table.TableReference.TableId)
						}
					}

					// Retry on certain HTTP errors.
					if b.shouldRetryInsertAfterError(err) {
						// Retryable HTTP errors usually mean the insert should be
						// retried after a certain pause.
						// See the following link for more info:
						// https://cloud.google.com/bigquery/troubleshooting-errors
						time.Sleep(b.SleepBeforeRetry)
						continue
					}

					// Retry if the insert was rejected due to bad rows.
					// Occurrences of bad rows do not count against retries,
					// as they mean we're trying to insert bad data into BigQuery.
					rejectedRows := b.filterRejectedRows(responses, pID, dID, tID, d)
					if len(rejectedRows) > 0 {
						numRetries--
						continue
					}

					// If we reached here, the insert was successful,
					// so retrying isn't necessary.
					// Thus, break from the "retry insert" loop.
					break
				}
			}
		}
	}
}
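
// The shouldRetryInsertAfterError predicate used in the retry loop above is
// also not shown in this excerpt. The following is a minimal sketch of such a
// predicate, under the assumption (based on the BigQuery troubleshooting docs
// linked above) that 5xx responses are the retryable ones; it requires the
// net/http and google.golang.org/api/googleapi imports.
func (b *Streamer) shouldRetryInsertAfterError(err error) bool {
	if err == nil {
		return false
	}

	// googleapi.Error wraps HTTP-level errors returned by Google API clients,
	// exposing the HTTP status code of the failed request.
	if gerr, ok := err.(*googleapi.Error); ok {
		switch gerr.Code {
		case http.StatusInternalServerError, // 500
			http.StatusBadGateway, // 502
			http.StatusServiceUnavailable: // 503
			return true
		}
	}

	return false
}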