Example #1
// filterRejectedRows checks the insert response for per-row errors,
// removes rejected rows from the given table, and returns a slice of
// the indexes of the rows that were removed.
//
// A row is rejected if the BigQuery insert response marked it with any
// reason other than "stopped" or "timeout". See the following URL for
// further info:
// https://cloud.google.com/bigquery/streaming-data-into-bigquery#troubleshooting
func (b *Streamer) filterRejectedRows(
	responses *bigquery.TableDataInsertAllResponse,
	pID, dID, tID string,
	d map[string]table) (rowsToFilter []int64) {

	// Go through all rows and rows' errors, and remove rejected (bad) rows.
	if responses != nil {
		for _, rowErrors := range responses.InsertErrors {
			// A guard flag makes sure we don't append the same row
			// to the filter list more than once.
			filter := false

			// Each row can have several errors.
			// Go through each of them, and remove the row if any error reason
			// is something other than "stopped" or "timeout".
			// Also log all such errors on the fly.
			for _, rowErrorPtr := range rowErrors.Errors {
				rowError := *rowErrorPtr

				// Mark invalid rows to be deleted.
				switch rowError.Reason {
				// Do nothing for these error reasons.
				case "stopped", "timeout":

				// Filter and log everything else.
				default:
					if !filter {
						rowsToFilter = append(rowsToFilter, rowErrors.Index)
						filter = true
					}

					// Update the table schema if necessary.
					if b.shouldUpdateTableSchema(rowError) {
						fmt.Println("bqstreamer: updating schema...")
						schema, err := bq.SchemaFromJSON(d[tID][rowErrors.Index].jsonValue)
						if err == nil {
							_, err := b.updateTableSchema(pID, dID, tID, schema)
							if err == nil {
								// Re-queue this row for the next insert.
								b.QueueRow(pID, dID, tID, d[tID][rowErrors.Index].jsonValue)
								continue
							} else {
								b.Errors <- err
							}
						}
					}

					// Log all errors besides "stopped" and "timeout" ones.
					b.Errors <- fmt.Errorf(
						"%s.%s.%s.row[%d]: %s in %s: %s: %s",
						pID, dID, tID,
						rowErrors.Index,
						rowError.Reason, rowError.Location, rowError.Message,
						d[tID][rowErrors.Index].jsonValue)
				}
			}
		}
	}

	// Remove accumulated rejected rows from table (if there were any).
	if len(rowsToFilter) > 0 {
		// Replace the original table with the modified (filtered) one.
		// This is necessary because the filtered slice has a different len()
		// than the original.
		//
		// XXX is this ok?
		d[tID] = b.filterRowsFromTable(rowsToFilter, d[tID])
	}

	return
}
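
For context, here is a minimal sketch of the row-filtering helper and row types the method above relies on. The tableRow and table declarations are assumptions reconstructed from how they are used in these examples (the exact type of jsonValue is not shown in the snippets), and filterRowsFromTableSketch is only an illustration of what the library's filterRowsFromTable might do, not its actual implementation.

// Assumed type declarations, inferred from usage in the examples.
// The jsonValue field type is a guess; the library may use a
// BigQuery-specific JSON value type instead.
type tableRow struct {
	rowID     string
	jsonValue map[string]interface{}
}

type table []*tableRow

// filterRowsFromTableSketch illustrates one way an index-based filter
// like filterRowsFromTable could work: it returns a new slice containing
// only the rows whose indexes were not rejected.
func filterRowsFromTableSketch(rejected []int64, t table) table {
	// Collect the rejected indexes into a set for O(1) lookups.
	skip := make(map[int64]struct{}, len(rejected))
	for _, i := range rejected {
		skip[i] = struct{}{}
	}

	// Copy the surviving rows into a fresh slice, leaving the
	// original backing array untouched.
	filtered := make(table, 0, len(t))
	for i, row := range t {
		if _, ok := skip[int64(i)]; !ok {
			filtered = append(filtered, row)
		}
	}
	return filtered
}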
Example #2
// insertAllToBigQuery inserts all rows from all tables to BigQuery.
// Each table is inserted separately, according to BigQuery's requirements.
// Insert errors are reported to the error channel.
func (b *Streamer) insertAllToBigQuery() {
	// Group rows by project->dataset->table.
	// This is necessary because each InsertAll() request targets a single table.
	ps := map[string]project{}
	for i := 0; i < b.rowIndex; i++ {
		r := b.rows[i]

		p, d, t := r.projectID, r.datasetID, r.tableID

		// Create the project, dataset and table entries if uninitialized.
		createTableIfNotExists(ps, p, d, t)

		// Append the row to its table,
		// generating a random 16-character row ID for de-duplication purposes.
		ps[p][d][t] = append(ps[p][d][t], &tableRow{
			rowID:     uniuri.NewLen(16),
			jsonValue: r.data,
		})
	}

	// Stream insert each table to BigQuery.
	for pID, p := range ps {
		for dID, d := range p {
			for tID := range d {
				// Insert to a single table in bulk, and retry the insert on certain errors.
				// Keep retrying until the insert succeeds or the retry limit is reached.
				numRetries := 0

				for {
					numRetries++
					if numRetries > b.MaxRetryInsert {
						b.Errors <- fmt.Errorf(
							"Insert table %s retried %d times, dropping insert and moving on",
							tID, numRetries)
						break
					} else if len(d[tID]) == 0 {
						b.Errors <- fmt.Errorf("All rows from table %s have been filtered, moving on", tID)
						break
					}

					responses, err := b.insertTable(pID, dID, tID, d[tID])

					// Automatically create the table if the insert failed
					// because it does not exist yet.
					if b.shouldInsertNewTable(err) {
						// Use the first row in the table to infer the schema.
						row := d[tID][0]
						// TODO: remove bq dependency?
						schema, _ := bq.SchemaFromJSON(row.jsonValue)
						var table *bigquery.Table
						table, err = b.insertNewTable(pID, dID, tID, schema)
						if err == nil {
							fmt.Println("BQ: Created table", table.TableReference.TableId)
						}
					}

					// Retry on certain HTTP errors.
					if b.shouldRetryInsertAfterError(err) {
						// Certain HTTP errors indicate the insert should be
						// retried after a pause.
						// See the following link for more info:
						// https://cloud.google.com/bigquery/troubleshooting-errors
						time.Sleep(b.SleepBeforeRetry)
						continue
					}

					// Retry if the insert was rejected due to bad rows.
					// Rejected rows do not count against the retry limit,
					// since they mean we tried to insert bad data into BigQuery.
					rejectedRows := b.filterRejectedRows(responses, pID, dID, tID, d)
					if len(rejectedRows) > 0 {
						numRetries--
						continue
					}

					// Reaching here means the insert was successful,
					// so no retry is necessary.
					// Break out of the "retry insert" loop.
					break
				}
			}
		}
	}
}
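
To make the grouping step easier to follow, here is a sketch of the nested project->dataset->table structure and the createTableIfNotExists helper, reconstructed from how they are used above (and building on the table type sketched after Example #1). These declarations are assumptions for illustration; the library's actual definitions may differ.

// Assumed nested grouping types, inferred from usage in the example.
type dataset map[string]table
type project map[string]dataset

// createTableIfNotExistsSketch illustrates what createTableIfNotExists
// likely does: initialize the nested maps on first use, so that
// appending to ps[p][d][t] is always safe.
func createTableIfNotExistsSketch(ps map[string]project, p, d, t string) {
	if _, ok := ps[p]; !ok {
		ps[p] = project{}
	}
	if _, ok := ps[p][d]; !ok {
		ps[p][d] = dataset{}
	}
	if _, ok := ps[p][d][t]; !ok {
		ps[p][d][t] = table{}
	}
}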