streamer.go (forked from kikinteractive/go-bqstreamer)
package bqstreamer

import (
"fmt"
"sort"
"strings"
"time"
"github.com/dchest/uniuri"
"github.com/guregu/bq"
bigquery "google.golang.org/api/bigquery/v2"
"google.golang.org/api/googleapi"
"gopkg.in/validator.v2"
)

// A Streamer is a BigQuery stream inserter, queuing rows and streaming
// them to BigQuery in bulk by calling InsertAll().
type Streamer struct {
// BigQuery client connection.
service *bigquery.Service
// Upon invoking Start(), the streamer fetches rows from this channel
// and queues them in an internal rows queue.
rowChannel chan *row
// Internal list to queue rows for stream insert.
rows []*row
// Rows index to queue next row into.
//
// TODO using a row index is probably not the best way.
// Maybe we should instead create a slice with len = 0, cap = 500 and use
// len() instead.
rowIndex int // 0
// Max delay between flushes to BigQuery.
MaxDelay time.Duration `validate:"min=1"`
// Sleep delay before retrying an insert that failed with a retryable error.
SleepBeforeRetry time.Duration `validate:"min=1"`
// Maximum retry insert attempts for non-rejected row insert errors.
// e.g. GoogleAPI HTTP errors, generic HTTP errors, etc.
MaxRetryInsert int `validate:"min=0"`
// Automatically create tables if they don't exist.
CreateTables bool
// Shutdown channel to stop Start() execution.
stopChannel chan bool
// Errors are reported to this channel.
Errors chan error
// The following functions can be overridden for unit testing.
// Start read-queue-stream loop function.
Start func()
// Stop read-queue-stream loop function.
Stop func()
// Flush to BigQuery function.
flush func()
// Insert all tables to BigQuery function.
insertAll func()
// Insert table to BigQuery function.
insertTable func(projectId, datasetId, tableId string, t table) (r *bigquery.TableDataInsertAllResponse, err error)
}
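
// For unit testing from within this package, the overridable function members
// above can be stubbed so no real BigQuery calls are made. A minimal sketch
// (the stub below is hypothetical):
//
//	s, _ := NewStreamer(service, 10, time.Second, time.Second, 3)
//	s.insertTable = func(p, d, t string, tbl table) (*bigquery.TableDataInsertAllResponse, error) {
//		return &bigquery.TableDataInsertAllResponse{}, nil
//	}
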
// NewStreamer returns a new Streamer.
func NewStreamer(
service *bigquery.Service,
maxRows int,
maxDelay time.Duration,
sleepBeforeRetry time.Duration,
maxRetryInsert int) (b *Streamer, err error) {
// TODO add testing for nil bigquery.Service (this will interfere with tests though,
// maybe find a way to mock this type somehow?)
// TODO maybe return error if maxRows > 500?
err = validator.Valid(maxRows, "min=1")
if err != nil {
return
}
b = &Streamer{
service: service,
rowChannel: make(chan *row, maxRows),
rows: make([]*row, maxRows),
rowIndex: 0,
MaxDelay: maxDelay,
SleepBeforeRetry: sleepBeforeRetry,
MaxRetryInsert: maxRetryInsert,
stopChannel: make(chan bool),
Errors: make(chan error, errorBufferSize),
}
err = validator.Validate(b)
if err != nil {
return
}
// Assign function defaults.
b.Start = b.start
b.Stop = b.stop
b.flush = b.flushToBigQuery
b.insertAll = b.insertAllToBigQuery
b.insertTable = b.insertTableToBigQuery
return
}
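// exampleStreamerUsage is a minimal usage sketch: construct a Streamer,
// run its read-queue-flush loop in a goroutine, drain its error channel,
// queue a row, and stop the streamer on shutdown. The project, dataset,
// and table names are hypothetical placeholders, and service is assumed
// to be a pre-authenticated BigQuery client built elsewhere.
func exampleStreamerUsage(service *bigquery.Service) error {
	s, err := NewStreamer(service, 500, 5*time.Second, 10*time.Second, 10)
	if err != nil {
		return err
	}
	go s.Start()
	defer s.Stop()
	// Drain insert errors in the background,
	// so a full Errors channel never blocks the streamer.
	go func() {
		for err := range s.Errors {
			fmt.Println("bqstreamer:", err)
		}
	}()
	// Queue a single row; it is flushed in bulk with other queued rows.
	s.QueueRow("my-project", "my_dataset", "my_table",
		map[string]bigquery.JsonValue{"name": "gopher", "count": 1})
	return nil
}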
// start reads rows from rowChannel in an infinite loop and queues them internally.
// It flushes to BigQuery when the queue fills up (according to maxRows) or the timer expires (according to maxDelay).
//
// This function is assigned to the Streamer.Start member.
// It is overridable so we can test MultiStreamer without actually
// starting the streamers.
//
// Note the read-queue-flush loop never stops on its own, so this function
// should be executed in a goroutine and stopped by calling Stop().
func (b *Streamer) start() {
t := time.NewTimer(b.MaxDelay)
toStop := false
for {
// Flush and reset timer when one of the following signals (channels) fire:
select {
case toStop = <-b.stopChannel:
case <-t.C:
case r := <-b.rowChannel:
// Insert row to queue.
b.rows[b.rowIndex] = r
b.rowIndex++
// Don't flush if rows queue isn't full.
if b.rowIndex < len(b.rows) {
continue
}
}
b.flush()
if !toStop {
t.Reset(b.MaxDelay)
} else {
t.Stop()
return
}
}
}
// stop sends a stop message to the stop channel, causing Start()'s infinite loop to exit.
func (b *Streamer) stop() {
b.stopChannel <- true
}
// flushToBigQuery streams all queued rows to BigQuery and resets the rows
// queue by creating a new one.
//
// This function is assigned to Streamer.flush member.
// It is overridable so we can test Streamer without actually flushing
// to BigQuery.
//
// TODO Consider making this public. If so, we should use a mutex to lock the object,
// otherwise if the object is running in another goroutine it can call this in parallel.
func (b *Streamer) flushToBigQuery() {
b.insertAll()
// Init (reset) a new rows queue - clear old one and re-allocate.
b.rows = make([]*row, len(b.rows))
b.rowIndex = 0
}
// QueueRow sends a single row to the row channel; it will be queued and
// inserted in bulk with other queued rows.
// Note the send blocks if the channel's buffer (sized maxRows) is full.
func (b *Streamer) QueueRow(projectID, datasetID, tableID string, jsonRow map[string]bigquery.JsonValue) {
b.rowChannel <- &row{projectID, datasetID, tableID, jsonRow}
}
// Internal types modeling BigQuery's project -> dataset -> table hierarchy.
type project map[string]dataset
type dataset map[string]table
type table []*tableRow
type tableRow struct {
// rowID is used to distinguish this row if a retry insert is necessary.
// This is needed for row de-duplication.
rowID string
// Row payload.
jsonValue map[string]bigquery.JsonValue
}
// createTableIfNotExists initializes given project, dataset, and table
// in project map if they haven't been initialized yet.
func createTableIfNotExists(ps map[string]project, p, d, t string) {
// Create table's project if non-existent.
if _, ok := ps[p]; !ok {
ps[p] = project{}
}
// Create table's dataset if non-existent.
if _, ok := ps[p][d]; !ok {
ps[p][d] = dataset{}
}
// Create table if non-existent.
if _, ok := ps[p][d][t]; !ok {
ps[p][d][t] = table{}
}
}
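
// For illustration (hypothetical identifiers):
//
//	ps := map[string]project{}
//	createTableIfNotExists(ps, "p1", "d1", "t1")
//	// ps["p1"]["d1"]["t1"] now holds an empty table, ready for appends.
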
// insertAllToBigQuery inserts all rows from all tables to BigQuery.
// Each table is inserted separately, according to BigQuery's requirements.
// Insert errors are reported to error channel.
func (b *Streamer) insertAllToBigQuery() {
// Group rows by project -> dataset -> table.
// Necessary because each InsertAll() request addresses a single table.
ps := map[string]project{}
for i := 0; i < b.rowIndex; i++ {
r := b.rows[i]
p, d, t := r.projectID, r.datasetID, r.tableID
// Create the project, dataset and table if uninitialized.
createTableIfNotExists(ps, p, d, t)
// Append the row to its table,
// generating a random 16-character row ID for de-duplication purposes.
ps[p][d][t] = append(ps[p][d][t], &tableRow{
rowID: uniuri.NewLen(16),
jsonValue: r.data,
})
}
// Stream insert each table to BigQuery.
for pID, p := range ps {
for dID, d := range p {
for tID := range d {
// Insert to a single table in bulk, and retry the insert on certain errors,
// up to MaxRetryInsert attempts.
numRetries := 0
for {
numRetries++
if numRetries > b.MaxRetryInsert {
b.Errors <- fmt.Errorf(
"Insert table %s retried %d times, dropping insert and moving on",
tID, numRetries)
break
} else if len(d[tID]) == 0 {
b.Errors <- fmt.Errorf("All rows from table %s have been filtered, moving on", tID)
break
}
responses, err := b.insertTable(pID, dID, tID, d[tID])
// Automatically create the table if it doesn't exist and CreateTables is enabled.
if b.shouldInsertNewTable(err) {
row := d[tID][0]
// TODO: remove bq dependency?
// Infer a table schema from the first queued row's JSON payload.
schema, schemaErr := bq.SchemaFromJSON(row.jsonValue)
if schemaErr != nil {
// Report schema inference failures instead of silently ignoring them.
b.Errors <- schemaErr
} else {
var newTable *bigquery.Table
newTable, err = b.insertNewTable(pID, dID, tID, schema)
if err == nil {
fmt.Println("BQ: Created table", newTable.TableReference.TableId)
}
}
}
// Retry on certain HTTP errors.
if b.shouldRetryInsertAfterError(err) {
// Retryable HTTP errors usually call for a certain pause before the retry.
// See the following link for more info:
// https://cloud.google.com/bigquery/troubleshooting-errors
time.Sleep(b.SleepBeforeRetry)
continue
}
// Retry if the insert was rejected due to bad rows.
// Rejected rows don't count against the retry limit,
// since they indicate bad data rather than a transient failure.
rejectedRows := b.filterRejectedRows(responses, pID, dID, tID, d)
if len(rejectedRows) > 0 {
numRetries--
continue
}
// If we reached here it means insert was successful,
// so retry isn't necessary.
// Thus, break from the "retry insert" loop.
break
}
}
}
}
}
// insertTableToBigQuery inserts a single table to BigQuery using BigQuery's InsertAll request.
//
// This function is assigned to Streamer.insertTable member.
// It is overridable so we can test Streamer without actually inserting anything to BigQuery.
func (b *Streamer) insertTableToBigQuery(projectID, datasetID, tableID string, t table) (
r *bigquery.TableDataInsertAllResponse, err error) {
// Convert all rows to bigquery table rows.
rows := make([]*bigquery.TableDataInsertAllRequestRows, len(t))
for i, row := range t {
rows[i] = &bigquery.TableDataInsertAllRequestRows{
InsertId: row.rowID,
Json: row.jsonValue}
}
// Build the InsertAll request and send it via the tabledata service.
request := bigquery.TableDataInsertAllRequest{Kind: "bigquery#tableDataInsertAllRequest", Rows: rows}
// TODO might be better to cache table services somehow, instead of re-creating them on every flush.
tableService := bigquery.NewTabledataService(b.service)
r, err = tableService.InsertAll(projectID, datasetID, tableID, &request).Do()
return
}
// insertNewTable creates a new BigQuery table with the given schema.
func (b *Streamer) insertNewTable(projectID, datasetID, tableID string, schema *bigquery.TableSchema) (*bigquery.Table, error) {
tables := bigquery.NewTablesService(b.service)
table := &bigquery.Table{
Schema: schema,
TableReference: &bigquery.TableReference{
ProjectId: projectID,
DatasetId: datasetID,
TableId: tableID,
},
}
return tables.Insert(projectID, datasetID, table).Do()
}
// updateTableSchema updates a pre-existing table's schema.
func (b *Streamer) updateTableSchema(projectID, datasetID, tableID string, schema *bigquery.TableSchema) (*bigquery.Table, error) {
tables := bigquery.NewTablesService(b.service)
table := &bigquery.Table{
Schema: schema,
TableReference: &bigquery.TableReference{
ProjectId: projectID,
DatasetId: datasetID,
TableId: tableID,
},
}
return tables.Update(projectID, datasetID, tableID, table).Do()
}
// shouldRetryInsertAfterError checks for insert HTTP response errors,
// and returns true if insert should be retried.
// See the following url for more info:
// https://cloud.google.com/bigquery/troubleshooting-errors
func (b *Streamer) shouldRetryInsertAfterError(err error) (shouldRetry bool) {
shouldRetry = false
if err != nil {
// Retry on GoogleAPI HTTP server error (500, 503).
if gerr, ok := err.(*googleapi.Error); ok {
switch gerr.Code {
case 500, 503:
shouldRetry = true
}
}
// Report every non-nil error to the Errors channel, including retryable
// ones; any other response code, or a non-Google API error, is not retried.
b.Errors <- err
}
return
}
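
// For illustration (the error values below are hypothetical):
//
//	b.shouldRetryInsertAfterError(nil)                         // false: nothing to retry or report
//	b.shouldRetryInsertAfterError(&googleapi.Error{Code: 503}) // true: retried after SleepBeforeRetry
//	b.shouldRetryInsertAfterError(&googleapi.Error{Code: 400}) // false: only reported via Errors
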
// shouldInsertNewTable reports whether a missing table should be created,
// i.e. when CreateTables is enabled and err is a "table not found" error.
func (b *Streamer) shouldInsertNewTable(err error) (shouldCreate bool) {
if !b.CreateTables {
return false
}
// Return true only when a "table not found" error occurs.
if gerr, ok := err.(*googleapi.Error); ok {
if gerr.Code == 404 && strings.Contains(gerr.Message, "Not found: Table") {
return true
}
}
return false
}
// shouldUpdateTableSchema reports whether a table's schema should be updated,
// i.e. when CreateTables is enabled and the row error indicates a missing field.
func (b *Streamer) shouldUpdateTableSchema(err bigquery.ErrorProto) (shouldUpdate bool) {
if !b.CreateTables {
return false
}
// TODO: support errors besides missing fields?
return err.Reason == "invalid" && strings.Contains(err.Message, "no such field")
}
// filterRejectedRows inspects the per-row insert responses,
// removes rejected rows from the given table, and returns the indexes of the removed rows.
//
// Rows are rejected if the BigQuery insert response marked them with any
// error reason other than "stopped" or "timeout". See the following url for further info:
// https://cloud.google.com/bigquery/streaming-data-into-bigquery#troubleshooting
func (b *Streamer) filterRejectedRows(
responses *bigquery.TableDataInsertAllResponse,
pID, dID, tID string,
d map[string]table) (rowsToFilter []int64) {
// Go through all rows and rows' errors, and remove rejected (bad) rows.
if responses != nil {
for _, rowErrors := range responses.InsertErrors {
// Use a boolean flag to make sure we don't append
// the same row for deletion more than once.
filter := false
// Each row can have several errors.
// Go through each of them, and remove the row if any error reason is
// neither "stopped" nor "timeout". Also log all such errors on the fly.
for _, rowErrorPtr := range rowErrors.Errors {
rowError := *rowErrorPtr
// Mark invalid rows to be deleted.
switch rowError.Reason {
// Do nothing for these types of error reason.
case "stopped", "timeout":
// Filter and log everything else.
default:
if !filter {
rowsToFilter = append(rowsToFilter, rowErrors.Index)
filter = true
}
// Update the table schema if the row was rejected due to a missing field.
if b.shouldUpdateTableSchema(rowError) {
fmt.Println("bqstreamer: updating schema...")
schema, err := bq.SchemaFromJSON(d[tID][rowErrors.Index].jsonValue)
if err == nil {
_, err := b.updateTableSchema(pID, dID, tID, schema)
if err == nil {
// Re-queue the row so it is inserted again with the updated schema.
b.QueueRow(pID, dID, tID, d[tID][rowErrors.Index].jsonValue)
continue
} else {
b.Errors <- err
}
}
}
// Log all errors besides "stopped" and "timeout" ones.
b.Errors <- fmt.Errorf(
"%s.%s.%s.row[%d]: %s in %s: %s: %s",
pID, dID, tID,
rowErrors.Index,
rowError.Reason, rowError.Location, rowError.Message,
d[tID][rowErrors.Index].jsonValue)
}
}
}
}
// Remove accumulated rejected rows from table (if there were any).
if len(rowsToFilter) > 0 {
// Replace modified table instead of original.
// This is necessary because original table's slice has a different len().
//
// XXX is this ok?
d[tID] = b.filterRowsFromTable(rowsToFilter, d[tID])
}
return
}
// "sort"-compliant int64 class. Used for sorting in removeRows() below.
// This is necessary because for some reason package sort doesn't support sorting int64 slices.
type int64Slice []int64
func (a int64Slice) Len() int { return len(a) }
func (a int64Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a int64Slice) Less(i, j int) bool { return a[i] < a[j] }
// filterRowsFromTable removes the rows at the given indexes from the given table,
// and returns the filtered table.
// Filtering is done in place, thus the original table is changed as well.
//
// The table is returned in addition to mutating the given table argument for idiom's sake.
func (b *Streamer) filterRowsFromTable(indexes []int64, t table) table {
// Deletion is done in-place, so we copy & sort the given indexes,
// then delete in reverse order. Reverse order is necessary so earlier
// removals don't shift the positions of rows still pending removal.
//
// Create a copy of the given index slice in order to not modify the caller's slice.
is := append([]int64(nil), indexes...)
sort.Sort(sort.Reverse(int64Slice(is)))
for _, i := range is {
// Swap the row with the last row of the slice and shrink the slice by one;
// the removed row is left for the garbage collector.
t[i], t = t[len(t)-1], t[:len(t)-1]
}
// Return the same table.
return t
}
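
// For illustration (hypothetical rows): removing index 0 from a 3-row table
// moves the last row into slot 0, so row order is not preserved:
//
//	t := table{{rowID: "a"}, {rowID: "b"}, {rowID: "c"}}
//	t = b.filterRowsFromTable([]int64{0}, t)
//	// len(t) == 2, t[0].rowID == "c", t[1].rowID == "b"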