// RunWriteQuery executes a write query on the database for a migration func (migration *Migration) RunWriteQuery(query string, args ...interface{}) error { if args == nil { glog.Infof("mig_id=%d: Running query '%s'.", migration.Id, query) } else { glog.Infof("mig_id=%d: Running query '%s' (args: %v).", migration.Id, query, args) } err := migration.DbClient.QueryInsertUpdate(query, args...) if err != nil { glog.Errorf("mig_id=%d: Query '%s' failed (error: %s).", migration.Id, query, err) return ErrQueryFailed } return nil }
// RunReadQuery executes a read query on the database for a migration func (migration *Migration) RunReadQuery(query string, args ...interface{}) (map[string][]string, error) { if args == nil { glog.Infof("mig_id=%d: Running query '%s'.", migration.Id, query) } else { glog.Infof("mig_id=%d: Running query '%s' (args: %v).", migration.Id, query, args) } response, err := migration.DbClient.QueryReturnColumnDict(query, args...) if err != nil { glog.Errorf("mig_id=%d: Query '%s' failed (error: %s).", migration.Id, query, err) return nil, ErrQueryFailed } glog.Infof("mig_id=%d: Query response was '%v'", migration.Id, response) return response, nil }
// writeToPtOscLog takes log lines it receives from a channel and passes them // to a function that will write them // Also writes to a buffer which is POSTed to append_to_file when a new log line appears // after LogSyncInterval seconds func (runner *runner) writeToPtOscLog(ptOscLogWriter *bufio.Writer, ptOscLogChan chan string, writerFunc writeToWriter, migrationId int, waitGroup *sync.WaitGroup) { defer fileSyncWaitGroup.Done() defer waitGroup.Done() var buffer bytes.Buffer lastPostTime := time.Unix(0, 0) for line := range ptOscLogChan { currentTime := time.Now().Local() formattedLine := "[" + currentTime.Format("2006-01-02 15:04:05") + "] " + line err := writerFunc(ptOscLogWriter, formattedLine, currentTime) if err != nil { glog.Errorf("Error flushing pt-osc log file (error: %s)", err) } _, err = buffer.WriteString(formattedLine + "\n") if err != nil { glog.Errorf("Error writing to log buffer (error: %s)", err) } if int(currentTime.Sub(lastPostTime)/time.Second) >= runner.LogSyncInterval && buffer.Len() > 0 { err = runner.postLogToFileApi(&buffer, migrationId) if err == nil { lastPostTime = currentTime } } } glog.Infof("Stopped syncing log file") runner.postLogToFileApi(&buffer, migrationId) }
// ValidateFinalInsert validates the syntax of the final insert statement, // and starts/rolls back a trxn to verify that the insert won't fail. func (migration *Migration) ValidateFinalInsert() error { finalInsert := migration.FinalInsert validInsertRegex := regexp.MustCompile("^(?i)(INSERT\\s+INTO\\s+)[^;]+$") validInsertSyntax := validInsertRegex.MatchString(finalInsert) if !validInsertSyntax { return ErrInvalidInsert } glog.Infof("mig_id=%d: Validating final insert '%s' in a transaction and rolling it back.", migration.Id, finalInsert) err := migration.DbClient.ValidateInsertStatement(finalInsert) if err != nil { glog.Errorf("mig_id=%d: Final insert '%s' failed (error: %s).", migration.Id, finalInsert, err) return ErrInvalidInsert } glog.Infof("mig_id=%d: Final insert '%s' was successfully validated and rolled back.", migration.Id, finalInsert) return nil }
// CleanUp cleans up after a migration by dropping triggers if they exist, // and moving the shadow table into the _pending_drops database func (migration *Migration) CleanUp() error { glog.Infof("mig_id=%d: cleaning up triggers.", migration.Id) err := DropTriggers(migration, migration.Table) if err != nil { return err } glog.Infof("mig_id=%d: cleaning up shadow table.", migration.Id) migTable, err := GetMigTable(migration) if err != nil { return err } pdTable := TimestampedTable(migTable) err = MoveToPendingDrops(migration, migTable, pdTable) if err != nil { return err } return nil }
// killPtOscProcess sends a SIGKILL to the pt-osc process associated // with the migrationId, does not lock runningMigMutex func killPtOscProcessById(migrationId int) error { pid := runningMigrations[migrationId] glog.Infof("mig_id=%d: killing (pid = %d).", migrationId, pid) err := syscall.Kill(pid, ptOscKillSignal) if err != nil { glog.Errorf("mig_id=%d: error killing pt-osc (error: %s)", migrationId, err) return ErrPtOscKill } delete(runningMigrations, migrationId) return nil }
// Kills the ptosc job and sends migration through api to be picked up by other hosts func (runner *runner) killAndOfferMigrations() { if len(runningMigrations) > 0 { killedMigrations := killAllPtOscProcesses() glog.Infof("Done killing, waiting for file sync gorountines to finish") fileSyncWaitGroup.Wait() // wait for all file syncing goroutines to finish glog.Infof("All file syncing gorountines are finished, proceeding...") for _, migrationId := range killedMigrations { glog.Infof("Offering migration mig_id=%v", migrationId) urlParams := map[string]string{"id": strconv.Itoa(migrationId)} _, err := runner.RestClient.Offer(urlParams) if err != nil { glog.Error(err) } } glog.Infof("Offered all killed migrations") } }
// failMigration reports a failed migration to the shift database/ui. func (runner *runner) failMigration(migration *migration.Migration, errorMsg string) { glog.Errorf("mig_id=%d: %s.", migration.Id, errorMsg) urlParams := map[string]string{ "id": strconv.Itoa(migration.Id), "error_message": errorMsg, } _, err := runner.RestClient.Fail(urlParams) if err != nil { glog.Errorf("mig_id=%d: %s.", migration.Id, err) return } glog.Infof("mig_id=%d: Successfully failed migration.", migration.Id) return }
// makes a query to the database, retrying if the query returns an error. // returns array of column names and arrays of data stored as string // string equivalent to []byte. data stored as 2d array with each subarray // containing a single column's data func (database *mysqlDB) runQuery(query string, args ...interface{}) ([]string, [][]string, error) { var err error var rows *sql.Rows if args == nil { args = make([]interface{}, 0) } for tries := 0; tries <= MAX_QUERY_READ_RETRIES; tries++ { rows, err = database.Db.Query(query, args...) if (err == nil) || (err.Error() != lockWaitTimeoutError) { if tries > 0 { glog.Infof("Lock wait timeout (%ds) exceeded %d times "+ "(query: %s)", LOCK_WAIT_TIMEOUT, tries, query) } break } time.Sleep(1000 * time.Millisecond) } if err != nil { return nil, nil, err } column_names, err := rows.Columns() if err != nil { return nil, nil, err } columns := len(column_names) values := make([][]string, columns) tmp_values := make([]sql.RawBytes, columns) scanArgs := make([]interface{}, len(values)) for i := range values { scanArgs[i] = &tmp_values[i] } for rows.Next() { err = rows.Scan(scanArgs...) if err != nil { return nil, nil, err } for i, col := range tmp_values { str := string(col) values[i] = append(values[i], str) } } err = rows.Err() return column_names, values, nil }
// watchForStopFile watches for the existence of a stop file. Will kill and offer up migrations if found // loop every 1 seconds func (runner *runner) watchForStopFile() { for { if _, err := os.Stat(runner.StopFilePath); err == nil { glog.Infof("Stop file found (%s), killing...", runner.StopFilePath) unstageMigrationMutex.Lock() canPickupMigrations = false unstagedMigrationsWaitGroup.Wait() unstageMigrationMutex.Unlock() runner.killAndOfferMigrations() } else { canPickupMigrations = true } time.Sleep(1 * time.Second) } }
// killPtOscProcess sends a SIGKILL to pt-osc if it is running func killPtOscProcess(currentMigration *migration.Migration) error { runningMigMutex.Lock() defer runningMigMutex.Unlock() // kill pt-osc if it's running on this host if _, exists := runningMigrations[currentMigration.Id]; exists { err := killPtOscProcessById(currentMigration.Id) if err != nil { return err } delete(runningMigrations, currentMigration.Id) } else { glog.Infof("mig_id=%d: can't kill because it's not running.", currentMigration.Id) } return nil }
// postLogToFileApi POSTs the lines in buffer to append_to_file as a log file then clears the buffer func (runner *runner) postLogToFileApi(buffer *bytes.Buffer, migrationId int) error { if buffer.Len() == 0 { return nil } urlParams := make(map[string]string) urlParams["migration_id"] = strconv.Itoa(migrationId) urlParams["file_type"] = LOG_FILE_TYPE urlParams["contents"] = buffer.String() _, err := runner.RestClient.AppendToFile(urlParams) if err != nil { glog.Errorf("Error POSTing to append_to_file endpoint (error: %s)", err) return err } buffer.Reset() glog.Infof("Sent log lines to append_to_file endpoint") return nil }
// WatchMigrationCopyStderr scans stderr of a migration on the copy step, // and parses each line to get the % copied. It also checks for unexpected // output, and it logs each line to a file func (migration *Migration) WatchMigrationCopyStderr(stderrPipe io.Reader, copyPercentChan chan int, errChan chan error, ptOscLogChan chan string) { scanner := bufio.NewScanner(stderrPipe) var line string var wasError bool for scanner.Scan() { line = scanner.Text() wasError = true ptOscLogChan <- fmt.Sprintf("stderr: %s", line) copyPercentageMatch := copyPercentageRegex.MatchString(line) if copyPercentageMatch { if len(strings.Fields(line)) < 3 { glog.Errorf("mig_id=%d: couldn't get copy percentage from '%s'. Continuing anyway", migration.Id, line) } copyPercentageS := strings.TrimSuffix(strings.Fields(line)[2], "%") copyPercentage, err := strconv.Atoi(copyPercentageS) if err != nil { glog.Errorf("mig_id=%d: couldn't get int percentage from '%s'. Continuing anyway", migration.Id, line) } glog.Infof("mig_id=%d: updating migration with copy percentage of %d", migration.Id, copyPercentage) copyPercentChan <- copyPercentage } } if err := scanner.Err(); err != nil { glog.Errorf("mig_id=%d: error getting stderr of pt-osc (error: %s)", migration.Id, err) errChan <- ErrPtOscStderr return } // 0 lines of stderr is okay, because that just means the migration went super quick. if there // are lines of stderr, though, and the last line isn't one that we expect, there was a problem if wasError { copyPercentageMatch := copyPercentageRegex.MatchString(line) waitingMatch := waitingRegex.MatchString(line) pausingMatch := pausingRegex.MatchString(line) if !copyPercentageMatch && !waitingMatch && !pausingMatch { glog.Errorf("mig_id=%d: last line of stderror was not what we expect (was: %s). Something went wrong", migration.Id, line) errChan <- ErrPtOscUnexpectedStderr return } } errChan <- nil }
func (runner *runner) postStateFile(migration *migration.Migration) { data, err := migration.ReadStateFile() if err != nil { glog.Warningf("Could not read state file (%s)", err) } else { urlParams := make(map[string]string) urlParams["migration_id"] = strconv.Itoa(migration.Id) urlParams["file_type"] = STATE_FILE_TYPE urlParams["contents"] = string(data[:]) _, err = runner.RestClient.WriteFile(urlParams) if err != nil { glog.Errorf("Error POSTing to write_file endpoint (error: %s)", err) } else { glog.Infof("Sent state file to write_file endpoint") } } }
// dbClient creates a mysql client that connects to a database host. func (migration *Migration) SetupDbClient(user, password, cert, key, rootCA string, port int) error { tlsConfig := &dbclient.TlsConfig{} if (cert != "") && (key != "") && (rootCA != "") { tlsConfig.UseTls = true tlsConfig.RootCA = rootCA tlsConfig.ClientCert = cert tlsConfig.ClientKey = key } host := "tcp(" + migration.Host + ":" + strconv.Itoa(port) + ")" db, err := newDbClient(user, password, host, migration.Database, "", tlsConfig) if err != nil { glog.Errorf("Failed to connect to the database (error: %s).", err) return ErrDbConnect } else { migration.DbClient = db } glog.Infof("Successfully connected to the database (host = %s, user = %s, database = %s, root ca = %s, cert = %s, key = %s).", host, user, migration.Database, rootCA, cert, key) return nil }
// makes a query to the database, retrying if the query returns an error. // should be used for inserts and updates. only returns whether or not // there was an error func (database *mysqlDB) QueryInsertUpdate(query string, args ...interface{}) error { var err error if args == nil { args = make([]interface{}, 0) } for tries := 0; tries <= MAX_QUERY_WRITE_RETRIES; tries++ { _, err = database.Db.Exec(query, args...) if (err == nil) || (err.Error() != lockWaitTimeoutError) { if tries > 0 { glog.Infof("Lock wait timeout (%ds) exceeded %d times "+ "(query: %s)", LOCK_WAIT_TIMEOUT, tries, query) } break } time.Sleep(1000 * time.Millisecond) } return err }
// syncStateFile periodically reads a migration's statefile and sends it to the api func (runner *runner) syncStateFile(migration *migration.Migration, quit chan string, waitGroup *sync.WaitGroup) { defer fileSyncWaitGroup.Done() defer waitGroup.Done() lastPostTime := time.Unix(0, 0) for { currentTime := time.Now().Local() select { case <-quit: glog.Infof("Stopped syncing state file") runner.postStateFile(migration) return default: if int(currentTime.Sub(lastPostTime)/time.Second) >= runner.StateSyncInterval { lastPostTime = currentTime runner.postStateFile(migration) } time.Sleep(1 * time.Second) } } }
// processMigration takes migrations from the job channel and decides // what to do with them (ex: run them, kill them, etc.). func (runner *runner) processMigration(currentMigration *migration.Migration) { glog.Infof("mig_id=%d: Picked up migration from the job channel. Processing.", currentMigration.Id) if currentMigration.Status != migration.RunMigrationStatus { // excludes RunMigrationStatus because we want to .Done those after // we start PtOsc defer unstagedMigrationsWaitGroup.Done() } // setup a database client for the migration err := SetupDbClient(currentMigration, runner.MysqlUser, runner.MysqlPassword, maybeReplaceHostname(runner.MysqlCert), maybeReplaceHostname(runner.MysqlKey), maybeReplaceHostname(runner.MysqlRootCA), currentMigration.Port) if err != nil { failMigration(runner, currentMigration, err.Error()) return } if currentMigration.DbClient != nil { defer currentMigration.DbClient.Close() } switch currentMigration.Status { case migration.PrepMigrationStatus: err = prepMigrationStep(runner, currentMigration) case migration.RunMigrationStatus: err = runMigrationStep(runner, currentMigration) case migration.RenameTablesStatus: err = renameTablesStep(runner, currentMigration) case migration.PauseStatus: err = pauseMigrationStep(runner, currentMigration) case migration.CancelStatus: err = killMigration(runner, currentMigration) default: err = ErrUnkownStatus } if err != nil { failMigration(runner, currentMigration, err.Error()) } }
// killMigration kills pt-osc and cleans up after a migration func (runner *runner) killMigration(currentMigration *migration.Migration) error { err := killPtOsc(currentMigration) if err != nil { return err } err = runner.createAndRetrieveFiles(currentMigration) if err != nil { return err } // we want to clean up the migration (drop triggers, shadow table, etc.) regardless of // whether or not the migration is running on this host (or running at all) glog.Infof("mig_id=%d: cleaning up.", currentMigration.Id) err = CleanUp(currentMigration) if err != nil { glog.Errorf("mig_id=%d: error cleaning up (error: %s)", currentMigration.Id, err) return ErrPtOscCleanUp } return nil }
// execPtOsc shells out and uses pt-osc to actually run a migration. func (runner *runner) execPtOsc(currentMigration *migration.Migration, ptOscOptionGenerator commandOptionGenerator, copyPercentChan chan int, unstageDone bool) (bool, error) { // unstageDone = whether or not unstagedMigrationsWaitGroup has already had .Done() called on it defer func() { if !unstageDone { unstagedMigrationsWaitGroup.Done() } }() canceled := false var fileRoutineWaitGroup sync.WaitGroup err := runner.createAndRetrieveFiles(currentMigration) if err != nil { return canceled, err } ptOscLogFile, ptOscLogWriter, err := setupLogWriter(currentMigration.LogFile) if err != nil { glog.Errorf("mig_id=%d: error creating pt-osc log file '%s' (error: %s)", currentMigration.Id, currentMigration.LogFile, err) return canceled, ErrPtOscExec } defer ptOscLogFile.Close() // generate the pt-osc command to run commandOptions := ptOscOptionGenerator(currentMigration) glog.Infof("mig_id=%d: Running %s %v", currentMigration.Id, runner.PtOscPath, strings.Join(commandOptions, " ")) cmd := exec.Command(runner.PtOscPath, commandOptions...) // capture stdout and stderr of the command stdout, err := cmd.StdoutPipe() if err != nil { glog.Errorf("mig_id=%d: error getting stdout pipe for pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } stderr, err := cmd.StderrPipe() if err != nil { glog.Errorf("mig_id=%d: error getting stderr pipe for pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } // start the pt-osc comand if err := cmd.Start(); err != nil { glog.Errorf("mig_id=%d: error starting pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } // setup a channel and goroutine for logging output of stdout/stderr ptOscLogChan := make(chan string) fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.writeToPtOscLog(ptOscLogWriter, ptOscLogChan, writeLineToPtOscLog, currentMigration.Id, &fileRoutineWaitGroup) // setup a goroutine for sending statefiles to the api ptOscStateFileChan := make(chan string) fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.syncStateFile(currentMigration, ptOscStateFileChan, &fileRoutineWaitGroup) // setup goroutines for watching stdout/stderr of the command stdoutErrChan := make(chan error) stderrErrChan := make(chan error) go currentMigration.WatchMigrationStdout(stdout, stdoutErrChan, ptOscLogChan) if currentMigration.Status == migration.RunMigrationStatus { // setup a goroutine to continually update the % copied of the migration fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.updateMigrationCopyPercentage(currentMigration, copyPercentChan, &fileRoutineWaitGroup) go currentMigration.WatchMigrationCopyStderr(stderr, copyPercentChan, stderrErrChan, ptOscLogChan) } else { go currentMigration.WatchMigrationStderr(stderr, stderrErrChan, ptOscLogChan) } // save the pid of the pt-osc process currentMigration.Pid = cmd.Process.Pid glog.Infof("mig_id=%d: pt-osc pid for status %d is %d.", currentMigration.Id, currentMigration.Status, currentMigration.Pid) // add the migration id and pid to the running migration map runningMigMutex.Lock() runningMigrations[currentMigration.Id] = currentMigration.Pid runningMigMutex.Unlock() if !unstageDone { unstageDone = true unstagedMigrationsWaitGroup.Done() } // wait for both stdout and stderr error channels to receive a signal stdoutErr := <-stdoutErrChan stderrErr := <-stderrErrChan close(ptOscLogChan) close(ptOscStateFileChan) // remove the migration id from the running migration map runningMigMutex.Lock() delete(runningMigrations, currentMigration.Id) runningMigMutex.Unlock() // get the exit status of the command. if it was sent a SIGKILL (most likely // by another goroutine) we want to know because we will treat it differently failed := false if err := cmd.Wait(); err != nil { if exiterr, ok := err.(*exec.ExitError); ok { if status, ok := exiterr.Sys().(syscall.WaitStatus); ok { exitSignal := status.Signal().String() glog.Infof("mig_id=%d: exit signal was %s.", currentMigration.Id, exitSignal) if (exitSignal == SIGKILLSignal) && (currentMigration.Status == migration.RunMigrationStatus) { // was killed glog.Infof("mig_id=%d: migration must have been canceled", currentMigration.Id) canceled = true } else if currentMigration.Status == migration.RunMigrationStatus { // died for an unexpected reason glog.Infof("mig_id=%d: migration died for an unexpected reason", currentMigration.Id) failed = true } } } } else { if (stderrErr == nil) && (currentMigration.Status == migration.RunMigrationStatus) { // wasn't killed. copy completed 100% glog.Infof("mig_id=%d: updating migration with copy percentage of 100", currentMigration.Id) copyPercentChan <- 100 } } if copyPercentChan != nil { glog.Infof("Closing copy percent channel") close(copyPercentChan) } fileRoutineWaitGroup.Wait() // wait for go routines to finish // favor returning error from unexpected failure, then error from stderr, // and lastly error from stdout if failed { return canceled, ErrUnexpectedExit } else if stderrErr != nil { return canceled, stderrErr } else if stdoutErr != nil { return canceled, stdoutErr } return canceled, nil }
// writeLineToPtOscLog writes a single line to a log file func writeLineToPtOscLog(ptOscLogWriter *bufio.Writer, logMsg string, currentTime time.Time) error { glog.Infof("pt-osc output: %s", logMsg) fmt.Fprintln(ptOscLogWriter, logMsg) return ptOscLogWriter.Flush() }
// unstageRunnableMigration tries to unstage ("claim") migrations that are runnable func (runner *runner) unstageRunnableMigration(currentMigration rest.RestResponseItem) (*migration.Migration, error) { migrationStatusField, ok := currentMigration["status"].(float64) if ok == false { glog.Errorf("Failed to get migration status for the following migration: %v", currentMigration) return nil, ErrInvalidMigration } migrationStatus := int(migrationStatusField) // Manually define the statuses to run if intInArray(migrationStatus, statusesToRun) { glog.Infof("Status = %d is in the list of statuses to run, so continuing.", migrationStatus) migrationIdFieldFloat, ok := currentMigration["id"].(float64) if ok == false { glog.Errorf("Failed to get migration id for the "+ "following migration: %v", currentMigration) return nil, ErrInvalidMigration } migrationIdField := int(migrationIdFieldFloat) // check if the migration is already pinned to a particular host pinnedHostname := currentMigration["run_host"] if pinnedHostname != nil { pinnedHostname = pinnedHostname.(string) if pinnedHostname != runner.Hostname { glog.Errorf("mig_id=%d: migration already pinned to %s. Skipping.", migrationIdField, pinnedHostname) return nil, ErrNotRunning } } customOptions := make(map[string]string) if val, ok := currentMigration["custom_options"]; ok && val != nil { json.Unmarshal([]byte(currentMigration["custom_options"].(string)), &customOptions) } // log and statefiles are stored in log directory. ex for mig with id 7: /path/to/logs/statefile-id-7.txt filesDir := runner.LogDir + "id-" + strconv.Itoa(int(migrationIdField)) + "/" stateFile := filesDir + "statefile.txt" logFile := filesDir + "ptosc-output.log" mig := &migration.Migration{ Id: migrationIdField, Status: migrationStatus, Host: currentMigration["host"].(string), Port: int(currentMigration["port"].(float64)), Database: currentMigration["database"].(string), Table: currentMigration["table"].(string), DdlStatement: currentMigration["ddl_statement"].(string), FinalInsert: currentMigration["final_insert"].(string), FilesDir: filesDir, StateFile: stateFile, LogFile: logFile, PendingDropsDb: runner.PendingDropsDb, CustomOptions: customOptions, } // some extra fields when we're not killing a migration if (migrationStatus != migration.CancelStatus) && (migrationStatus != migration.PauseStatus) { mig.RunType = int(currentMigration["runtype"].(float64)) mig.Mode = int(currentMigration["mode"].(float64)) mig.Action = int(currentMigration["action"].(float64)) } // allow setting overrides for the host and db from config file. // this is useful for safe testing in a staging environment if runner.HostOverride != "" { mig.Host = runner.HostOverride } if runner.PortOverride != 0 { mig.Port = runner.PortOverride } if runner.DatabaseOverride != "" { mig.Database = runner.DatabaseOverride } // if pending_drops database isn't defined, set it to be the same // database that the migration is on. if mig.PendingDropsDb == "" { mig.PendingDropsDb = mig.Database } // only claim the migration if we successfully unstage it urlParams := map[string]string{"id": strconv.Itoa(mig.Id)} _, err := runner.RestClient.Unstage(urlParams) if err != nil { return nil, err } glog.Infof("mig_id=%d: Successfully unstaged.", mig.Id) return mig, nil } glog.Infof("Status = %d is not in the list of statuses to run, so skipping.", migrationStatus) return nil, nil }