// getRunnableMigrations queries the shift api to get new/updated migrations. func (runner *runner) getRunnableMigrations() []*migration.Migration { newMigrations := []*migration.Migration{} // Get all of the staged migrations glog.Info("Getting staged migrations.") stagedMigrations, err := runner.RestClient.Staged() if err != nil { glog.Errorf("Failed to get staged migrations (error: %s)", err.Error()) return newMigrations } if len(stagedMigrations) == 0 { glog.Info("No new staged migrations.") } // For each staged migration, try to unstage it. for i := range stagedMigrations { newMigration, err := unstageRunnableMigration(runner, stagedMigrations[i]) if err != nil { glog.Errorf(err.Error()) continue } if newMigration != nil { newMigrations = append(newMigrations, newMigration) } } return newMigrations }
// createAndRetrieveFiles attempts to create a migration's state file, // if it does not exist, and retrieve the latest version from the api func (runner *runner) createAndRetrieveFiles(currentMigration *migration.Migration) error { if _, err := os.Stat(currentMigration.FilesDir); err != nil { if os.IsNotExist(err) { err := os.Mkdir(currentMigration.FilesDir, 0777) if err != nil { glog.Errorf("mig_id=%d: error creating pt-osc log directory '%s' (error: %s)", currentMigration.Id, currentMigration.FilesDir, err) return ErrFiles } } else { glog.Errorf("mig_id=%d: error stat-ing pt-osc log directory '%s' (error: %s)", currentMigration.Id, currentMigration.FilesDir, err) return ErrFiles } } // try to retrieve a statefile from the api urlParams := map[string]string{ "migration_id": strconv.Itoa(currentMigration.Id), "file_type": STATE_FILE_TYPE, } stateFile, restErr := runner.RestClient.GetFile(urlParams) if restErr == nil && stateFile["contents"] != nil && len(stateFile["contents"].(string)) > 0 { err := currentMigration.WriteStateFile([]byte(stateFile["contents"].(string))) if err != nil { glog.Errorf("mig_id=%d: error writing to statefile '%s' (error: %s)", currentMigration.Id, currentMigration.StateFile, err) return ErrFiles } } return nil }
// writeToPtOscLog takes log lines it receives from a channel and passes them // to a function that will write them // Also writes to a buffer which is POSTed to append_to_file when a new log line appears // after LogSyncInterval seconds func (runner *runner) writeToPtOscLog(ptOscLogWriter *bufio.Writer, ptOscLogChan chan string, writerFunc writeToWriter, migrationId int, waitGroup *sync.WaitGroup) { defer fileSyncWaitGroup.Done() defer waitGroup.Done() var buffer bytes.Buffer lastPostTime := time.Unix(0, 0) for line := range ptOscLogChan { currentTime := time.Now().Local() formattedLine := "[" + currentTime.Format("2006-01-02 15:04:05") + "] " + line err := writerFunc(ptOscLogWriter, formattedLine, currentTime) if err != nil { glog.Errorf("Error flushing pt-osc log file (error: %s)", err) } _, err = buffer.WriteString(formattedLine + "\n") if err != nil { glog.Errorf("Error writing to log buffer (error: %s)", err) } if int(currentTime.Sub(lastPostTime)/time.Second) >= runner.LogSyncInterval && buffer.Len() > 0 { err = runner.postLogToFileApi(&buffer, migrationId) if err == nil { lastPostTime = currentTime } } } glog.Infof("Stopped syncing log file") runner.postLogToFileApi(&buffer, migrationId) }
// failMigration reports a failed migration to the shift database/ui. func (runner *runner) failMigration(migration *migration.Migration, errorMsg string) { glog.Errorf("mig_id=%d: %s.", migration.Id, errorMsg) urlParams := map[string]string{ "id": strconv.Itoa(migration.Id), "error_message": errorMsg, } _, err := runner.RestClient.Fail(urlParams) if err != nil { glog.Errorf("mig_id=%d: %s.", migration.Id, err) return } glog.Infof("mig_id=%d: Successfully failed migration.", migration.Id) return }
// WatchMigrationCopyStderr scans stderr of a migration on the copy step, // and parses each line to get the % copied. It also checks for unexpected // output, and it logs each line to a file func (migration *Migration) WatchMigrationCopyStderr(stderrPipe io.Reader, copyPercentChan chan int, errChan chan error, ptOscLogChan chan string) { scanner := bufio.NewScanner(stderrPipe) var line string var wasError bool for scanner.Scan() { line = scanner.Text() wasError = true ptOscLogChan <- fmt.Sprintf("stderr: %s", line) copyPercentageMatch := copyPercentageRegex.MatchString(line) if copyPercentageMatch { if len(strings.Fields(line)) < 3 { glog.Errorf("mig_id=%d: couldn't get copy percentage from '%s'. Continuing anyway", migration.Id, line) } copyPercentageS := strings.TrimSuffix(strings.Fields(line)[2], "%") copyPercentage, err := strconv.Atoi(copyPercentageS) if err != nil { glog.Errorf("mig_id=%d: couldn't get int percentage from '%s'. Continuing anyway", migration.Id, line) } glog.Infof("mig_id=%d: updating migration with copy percentage of %d", migration.Id, copyPercentage) copyPercentChan <- copyPercentage } } if err := scanner.Err(); err != nil { glog.Errorf("mig_id=%d: error getting stderr of pt-osc (error: %s)", migration.Id, err) errChan <- ErrPtOscStderr return } // 0 lines of stderr is okay, because that just means the migration went super quick. if there // are lines of stderr, though, and the last line isn't one that we expect, there was a problem if wasError { copyPercentageMatch := copyPercentageRegex.MatchString(line) waitingMatch := waitingRegex.MatchString(line) pausingMatch := pausingRegex.MatchString(line) if !copyPercentageMatch && !waitingMatch && !pausingMatch { glog.Errorf("mig_id=%d: last line of stderror was not what we expect (was: %s). Something went wrong", migration.Id, line) errChan <- ErrPtOscUnexpectedStderr return } } errChan <- nil }
// killPtOscProcess sends a SIGKILL to the pt-osc process associated // with the migrationId, does not lock runningMigMutex func killPtOscProcessById(migrationId int) error { pid := runningMigrations[migrationId] glog.Infof("mig_id=%d: killing (pid = %d).", migrationId, pid) err := syscall.Kill(pid, ptOscKillSignal) if err != nil { glog.Errorf("mig_id=%d: error killing pt-osc (error: %s)", migrationId, err) return ErrPtOscKill } delete(runningMigrations, migrationId) return nil }
// RunWriteQuery executes a write query on the database for a migration func (migration *Migration) RunWriteQuery(query string, args ...interface{}) error { if args == nil { glog.Infof("mig_id=%d: Running query '%s'.", migration.Id, query) } else { glog.Infof("mig_id=%d: Running query '%s' (args: %v).", migration.Id, query, args) } err := migration.DbClient.QueryInsertUpdate(query, args...) if err != nil { glog.Errorf("mig_id=%d: Query '%s' failed (error: %s).", migration.Id, query, err) return ErrQueryFailed } return nil }
// WatchMigrationStdout scans stdout of a migration, line by line, and // logs it to a file. func (migration *Migration) WatchMigrationStdout(stdoutPipe io.Reader, errChan chan error, ptOscLogChan chan string) { scanner := bufio.NewScanner(stdoutPipe) for scanner.Scan() { line := scanner.Text() ptOscLogChan <- fmt.Sprintf("stdout: %s", line) } if err := scanner.Err(); err != nil { glog.Errorf("mig_id=%d: error getting stdout of pt-osc (error: %s)", migration.Id, err) errChan <- ErrPtOscStdout return } errChan <- nil }
// RunReadQuery executes a read query on the database for a migration func (migration *Migration) RunReadQuery(query string, args ...interface{}) (map[string][]string, error) { if args == nil { glog.Infof("mig_id=%d: Running query '%s'.", migration.Id, query) } else { glog.Infof("mig_id=%d: Running query '%s' (args: %v).", migration.Id, query, args) } response, err := migration.DbClient.QueryReturnColumnDict(query, args...) if err != nil { glog.Errorf("mig_id=%d: Query '%s' failed (error: %s).", migration.Id, query, err) return nil, ErrQueryFailed } glog.Infof("mig_id=%d: Query response was '%v'", migration.Id, response) return response, nil }
// WatchMigrationStderr scans stderr of a migration, line by line, and // checks for any unexpected output. It also logs each line to a file func (migration *Migration) WatchMigrationStderr(stderrPipe io.Reader, errChan chan error, ptOscLogChan chan string) { scanner := bufio.NewScanner(stderrPipe) var wasError bool for scanner.Scan() { line := scanner.Text() wasError = true ptOscLogChan <- fmt.Sprintf("stderr: %s", line) } if err := scanner.Err(); err != nil { glog.Errorf("mig_id=%d: error getting stderr of pt-osc (error: %s)", migration.Id, err) errChan <- ErrPtOscStderr return } // if there was anything sent to stderr, there was a problem if wasError { glog.Errorf("mig_id=%d: stderr is not empty. Something went wrong", migration.Id) errChan <- ErrPtOscUnexpectedStderr return } errChan <- nil }
// updateMigrationCopyPercentage watches a channel for a running migration and // sends the copy percentage completed (from the channel) to the shift api func (runner *runner) updateMigrationCopyPercentage(currentMigration *migration.Migration, copyPercentChan chan int, waitGroup *sync.WaitGroup) { defer fileSyncWaitGroup.Done() defer waitGroup.Done() migrationId := strconv.Itoa(currentMigration.Id) for copyPercentage := range copyPercentChan { // send the table stats to the api urlParams := map[string]string{ "id": migrationId, "copy_percentage": strconv.Itoa(copyPercentage), } _, err := runner.RestClient.Update(urlParams) if err != nil { glog.Errorf("mig_id=%d: error updating copy percentage (error: %s). Continuing anyway", migrationId, err) } } }
// postLogToFileApi POSTs the lines in buffer to append_to_file as a log file then clears the buffer func (runner *runner) postLogToFileApi(buffer *bytes.Buffer, migrationId int) error { if buffer.Len() == 0 { return nil } urlParams := make(map[string]string) urlParams["migration_id"] = strconv.Itoa(migrationId) urlParams["file_type"] = LOG_FILE_TYPE urlParams["contents"] = buffer.String() _, err := runner.RestClient.AppendToFile(urlParams) if err != nil { glog.Errorf("Error POSTing to append_to_file endpoint (error: %s)", err) return err } buffer.Reset() glog.Infof("Sent log lines to append_to_file endpoint") return nil }
// ValidateFinalInsert validates the syntax of the final insert statement, // and starts/rolls back a trxn to verify that the insert won't fail. func (migration *Migration) ValidateFinalInsert() error { finalInsert := migration.FinalInsert validInsertRegex := regexp.MustCompile("^(?i)(INSERT\\s+INTO\\s+)[^;]+$") validInsertSyntax := validInsertRegex.MatchString(finalInsert) if !validInsertSyntax { return ErrInvalidInsert } glog.Infof("mig_id=%d: Validating final insert '%s' in a transaction and rolling it back.", migration.Id, finalInsert) err := migration.DbClient.ValidateInsertStatement(finalInsert) if err != nil { glog.Errorf("mig_id=%d: Final insert '%s' failed (error: %s).", migration.Id, finalInsert, err) return ErrInvalidInsert } glog.Infof("mig_id=%d: Final insert '%s' was successfully validated and rolled back.", migration.Id, finalInsert) return nil }
func (runner *runner) postStateFile(migration *migration.Migration) { data, err := migration.ReadStateFile() if err != nil { glog.Warningf("Could not read state file (%s)", err) } else { urlParams := make(map[string]string) urlParams["migration_id"] = strconv.Itoa(migration.Id) urlParams["file_type"] = STATE_FILE_TYPE urlParams["contents"] = string(data[:]) _, err = runner.RestClient.WriteFile(urlParams) if err != nil { glog.Errorf("Error POSTing to write_file endpoint (error: %s)", err) } else { glog.Infof("Sent state file to write_file endpoint") } } }
func Start(configFile string) error { migrationRunner, err := newRunner(configFile) if err != nil { glog.Errorf("Error creating migration runner (error: %s).", err) return err } jobChannel := make(chan *migration.Migration) go migrationRunner.watchForStopFile() go migrationRunner.sendRunnableMigrationsToProcessor(jobChannel) for job := range jobChannel { go migrationRunner.processMigration(job) } return nil }
// dbClient creates a mysql client that connects to a database host. func (migration *Migration) SetupDbClient(user, password, cert, key, rootCA string, port int) error { tlsConfig := &dbclient.TlsConfig{} if (cert != "") && (key != "") && (rootCA != "") { tlsConfig.UseTls = true tlsConfig.RootCA = rootCA tlsConfig.ClientCert = cert tlsConfig.ClientKey = key } host := "tcp(" + migration.Host + ":" + strconv.Itoa(port) + ")" db, err := newDbClient(user, password, host, migration.Database, "", tlsConfig) if err != nil { glog.Errorf("Failed to connect to the database (error: %s).", err) return ErrDbConnect } else { migration.DbClient = db } glog.Infof("Successfully connected to the database (host = %s, user = %s, database = %s, root ca = %s, cert = %s, key = %s).", host, user, migration.Database, rootCA, cert, key) return nil }
// DryRunCreatesNew does a dry run for creating a new table/view. First it verifies // that the table/view doesn't already exist, and then it creates/drops it to verify // that the ddl is valid. func (migration *Migration) DryRunCreatesNew() error { // verify the table/view doesn't already exist query := "SELECT COUNT(*) as count FROM information_schema.tables WHERE table_schema=? AND table_name=?" args := []interface{}{migration.Database, migration.Table} response, err := RunReadQuery(migration, query, args...) if err != nil { return err } if response["count"][0] != "0" { glog.Errorf("mig_id=%d: Table/view already exists.", migration.Id) return ErrDryRunCreatesNew } // create and immediately drop the new table/view to verify the validity // of the ddl err = RunWriteQuery(migration, migration.DdlStatement) if err != nil { return err } return DirectDrop(migration) }
// SwapOscTables swaps the original table and migration table for an OSC, and returns // the name of the table that the original table got renamed to func (migration *Migration) SwapOscTables() (string, error) { var oldTable string // get the name of the temporary migration table migTable, err := GetMigTable(migration) if err != nil { return oldTable, err } // prep the "old" table name oldTable = TimestampedTable(migration.Table) // swap the tables // current table -> old table, mig table -> current table err = SwapTables(migration, migration.Table, oldTable, migTable, migration.Table) if err != nil { glog.Errorf("mig_id=%d: Table swap failed.", migration.Id) return "", err } return oldTable, nil }
// killMigration kills pt-osc and cleans up after a migration func (runner *runner) killMigration(currentMigration *migration.Migration) error { err := killPtOsc(currentMigration) if err != nil { return err } err = runner.createAndRetrieveFiles(currentMigration) if err != nil { return err } // we want to clean up the migration (drop triggers, shadow table, etc.) regardless of // whether or not the migration is running on this host (or running at all) glog.Infof("mig_id=%d: cleaning up.", currentMigration.Id) err = CleanUp(currentMigration) if err != nil { glog.Errorf("mig_id=%d: error cleaning up (error: %s)", currentMigration.Id, err) return ErrPtOscCleanUp } return nil }
// execPtOsc shells out and uses pt-osc to actually run a migration. func (runner *runner) execPtOsc(currentMigration *migration.Migration, ptOscOptionGenerator commandOptionGenerator, copyPercentChan chan int, unstageDone bool) (bool, error) { // unstageDone = whether or not unstagedMigrationsWaitGroup has already had .Done() called on it defer func() { if !unstageDone { unstagedMigrationsWaitGroup.Done() } }() canceled := false var fileRoutineWaitGroup sync.WaitGroup err := runner.createAndRetrieveFiles(currentMigration) if err != nil { return canceled, err } ptOscLogFile, ptOscLogWriter, err := setupLogWriter(currentMigration.LogFile) if err != nil { glog.Errorf("mig_id=%d: error creating pt-osc log file '%s' (error: %s)", currentMigration.Id, currentMigration.LogFile, err) return canceled, ErrPtOscExec } defer ptOscLogFile.Close() // generate the pt-osc command to run commandOptions := ptOscOptionGenerator(currentMigration) glog.Infof("mig_id=%d: Running %s %v", currentMigration.Id, runner.PtOscPath, strings.Join(commandOptions, " ")) cmd := exec.Command(runner.PtOscPath, commandOptions...) // capture stdout and stderr of the command stdout, err := cmd.StdoutPipe() if err != nil { glog.Errorf("mig_id=%d: error getting stdout pipe for pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } stderr, err := cmd.StderrPipe() if err != nil { glog.Errorf("mig_id=%d: error getting stderr pipe for pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } // start the pt-osc comand if err := cmd.Start(); err != nil { glog.Errorf("mig_id=%d: error starting pt-osc exec (error: %s)", currentMigration.Id, err) return canceled, ErrPtOscExec } // setup a channel and goroutine for logging output of stdout/stderr ptOscLogChan := make(chan string) fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.writeToPtOscLog(ptOscLogWriter, ptOscLogChan, writeLineToPtOscLog, currentMigration.Id, &fileRoutineWaitGroup) // setup a goroutine for sending statefiles to the api ptOscStateFileChan := make(chan string) fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.syncStateFile(currentMigration, ptOscStateFileChan, &fileRoutineWaitGroup) // setup goroutines for watching stdout/stderr of the command stdoutErrChan := make(chan error) stderrErrChan := make(chan error) go currentMigration.WatchMigrationStdout(stdout, stdoutErrChan, ptOscLogChan) if currentMigration.Status == migration.RunMigrationStatus { // setup a goroutine to continually update the % copied of the migration fileRoutineWaitGroup.Add(1) fileSyncWaitGroup.Add(1) go runner.updateMigrationCopyPercentage(currentMigration, copyPercentChan, &fileRoutineWaitGroup) go currentMigration.WatchMigrationCopyStderr(stderr, copyPercentChan, stderrErrChan, ptOscLogChan) } else { go currentMigration.WatchMigrationStderr(stderr, stderrErrChan, ptOscLogChan) } // save the pid of the pt-osc process currentMigration.Pid = cmd.Process.Pid glog.Infof("mig_id=%d: pt-osc pid for status %d is %d.", currentMigration.Id, currentMigration.Status, currentMigration.Pid) // add the migration id and pid to the running migration map runningMigMutex.Lock() runningMigrations[currentMigration.Id] = currentMigration.Pid runningMigMutex.Unlock() if !unstageDone { unstageDone = true unstagedMigrationsWaitGroup.Done() } // wait for both stdout and stderr error channels to receive a signal stdoutErr := <-stdoutErrChan stderrErr := <-stderrErrChan close(ptOscLogChan) close(ptOscStateFileChan) // remove the migration id from the running migration map runningMigMutex.Lock() delete(runningMigrations, currentMigration.Id) runningMigMutex.Unlock() // get the exit status of the command. if it was sent a SIGKILL (most likely // by another goroutine) we want to know because we will treat it differently failed := false if err := cmd.Wait(); err != nil { if exiterr, ok := err.(*exec.ExitError); ok { if status, ok := exiterr.Sys().(syscall.WaitStatus); ok { exitSignal := status.Signal().String() glog.Infof("mig_id=%d: exit signal was %s.", currentMigration.Id, exitSignal) if (exitSignal == SIGKILLSignal) && (currentMigration.Status == migration.RunMigrationStatus) { // was killed glog.Infof("mig_id=%d: migration must have been canceled", currentMigration.Id) canceled = true } else if currentMigration.Status == migration.RunMigrationStatus { // died for an unexpected reason glog.Infof("mig_id=%d: migration died for an unexpected reason", currentMigration.Id) failed = true } } } } else { if (stderrErr == nil) && (currentMigration.Status == migration.RunMigrationStatus) { // wasn't killed. copy completed 100% glog.Infof("mig_id=%d: updating migration with copy percentage of 100", currentMigration.Id) copyPercentChan <- 100 } } if copyPercentChan != nil { glog.Infof("Closing copy percent channel") close(copyPercentChan) } fileRoutineWaitGroup.Wait() // wait for go routines to finish // favor returning error from unexpected failure, then error from stderr, // and lastly error from stdout if failed { return canceled, ErrUnexpectedExit } else if stderrErr != nil { return canceled, stderrErr } else if stdoutErr != nil { return canceled, stdoutErr } return canceled, nil }
// unstageRunnableMigration tries to unstage ("claim") migrations that are runnable func (runner *runner) unstageRunnableMigration(currentMigration rest.RestResponseItem) (*migration.Migration, error) { migrationStatusField, ok := currentMigration["status"].(float64) if ok == false { glog.Errorf("Failed to get migration status for the following migration: %v", currentMigration) return nil, ErrInvalidMigration } migrationStatus := int(migrationStatusField) // Manually define the statuses to run if intInArray(migrationStatus, statusesToRun) { glog.Infof("Status = %d is in the list of statuses to run, so continuing.", migrationStatus) migrationIdFieldFloat, ok := currentMigration["id"].(float64) if ok == false { glog.Errorf("Failed to get migration id for the "+ "following migration: %v", currentMigration) return nil, ErrInvalidMigration } migrationIdField := int(migrationIdFieldFloat) // check if the migration is already pinned to a particular host pinnedHostname := currentMigration["run_host"] if pinnedHostname != nil { pinnedHostname = pinnedHostname.(string) if pinnedHostname != runner.Hostname { glog.Errorf("mig_id=%d: migration already pinned to %s. Skipping.", migrationIdField, pinnedHostname) return nil, ErrNotRunning } } customOptions := make(map[string]string) if val, ok := currentMigration["custom_options"]; ok && val != nil { json.Unmarshal([]byte(currentMigration["custom_options"].(string)), &customOptions) } // log and statefiles are stored in log directory. ex for mig with id 7: /path/to/logs/statefile-id-7.txt filesDir := runner.LogDir + "id-" + strconv.Itoa(int(migrationIdField)) + "/" stateFile := filesDir + "statefile.txt" logFile := filesDir + "ptosc-output.log" mig := &migration.Migration{ Id: migrationIdField, Status: migrationStatus, Host: currentMigration["host"].(string), Port: int(currentMigration["port"].(float64)), Database: currentMigration["database"].(string), Table: currentMigration["table"].(string), DdlStatement: currentMigration["ddl_statement"].(string), FinalInsert: currentMigration["final_insert"].(string), FilesDir: filesDir, StateFile: stateFile, LogFile: logFile, PendingDropsDb: runner.PendingDropsDb, CustomOptions: customOptions, } // some extra fields when we're not killing a migration if (migrationStatus != migration.CancelStatus) && (migrationStatus != migration.PauseStatus) { mig.RunType = int(currentMigration["runtype"].(float64)) mig.Mode = int(currentMigration["mode"].(float64)) mig.Action = int(currentMigration["action"].(float64)) } // allow setting overrides for the host and db from config file. // this is useful for safe testing in a staging environment if runner.HostOverride != "" { mig.Host = runner.HostOverride } if runner.PortOverride != 0 { mig.Port = runner.PortOverride } if runner.DatabaseOverride != "" { mig.Database = runner.DatabaseOverride } // if pending_drops database isn't defined, set it to be the same // database that the migration is on. if mig.PendingDropsDb == "" { mig.PendingDropsDb = mig.Database } // only claim the migration if we successfully unstage it urlParams := map[string]string{"id": strconv.Itoa(mig.Id)} _, err := runner.RestClient.Unstage(urlParams) if err != nil { return nil, err } glog.Infof("mig_id=%d: Successfully unstaged.", mig.Id) return mig, nil } glog.Infof("Status = %d is not in the list of statuses to run, so skipping.", migrationStatus) return nil, nil }