// clone phase: // - copy the data from source tablets to destination masters (with replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (vscw *VerticalSplitCloneWorker) clone(ctx context.Context) error { vscw.setState(WorkerStateCloneOffline) start := time.Now() defer func() { statsStateDurationsNs.Set(string(WorkerStateCloneOffline), time.Now().Sub(start).Nanoseconds()) }() // get source schema shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sourceSchemaDefinition, err := vscw.wr.GetSchema(shortCtx, vscw.sourceAlias, vscw.tables, nil, true) cancel() if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(vscw.sourceAlias), err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter") } vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) vscw.tableStatusList.initialize(sourceSchemaDefinition) // In parallel, setup the channels to send SQL data chunks to // for each destination tablet. // // mu protects firstError mu := sync.Mutex{} var firstError error ctx, cancelCopy := context.WithCancel(ctx) processError := func(format string, args ...interface{}) { vscw.wr.Logger().Errorf(format, args...) mu.Lock() if firstError == nil { firstError = fmt.Errorf(format, args...) cancelCopy() } mu.Unlock() } destinationWaitGroup := sync.WaitGroup{} // we create one channel for the destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. insertChannel := make(chan string, vscw.destinationWriterCount*2) // Set up the throttler for the destination shard. keyspaceAndShard := topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard) destinationThrottler, err := throttler.NewThrottler( keyspaceAndShard, "transactions", vscw.destinationWriterCount, vscw.maxTPS, throttler.ReplicationLagModuleDisabled) if err != nil { return fmt.Errorf("cannot instantiate throttler: %v", err) } for j := 0; j < vscw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func(threadID int) { defer destinationWaitGroup.Done() defer destinationThrottler.ThreadFinished(threadID) executor := newExecutor(vscw.wr, vscw.tsc, destinationThrottler, vscw.destinationKeyspace, vscw.destinationShard, threadID) if err := executor.fetchLoop(ctx, insertChannel); err != nil { processError("executer.FetchLoop failed: %v", err) } }(j) } // Now for each table, read data chunks and send them to insertChannel sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0) dbName := vscw.destinationDbNames[topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard)] for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == tmutils.TableView { continue } chunks, err := generateChunks(ctx, vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount) if err != nil { return err } vscw.tableStatusList.setThreadCount(tableIndex, len(chunks)-1) for _, c := range chunks { sourceWaitGroup.Add(1) go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() vscw.tableStatusList.threadStarted(tableIndex) // Start streaming from the source tablet. 
rr, err := NewRestartableResultReader(ctx, vscw.wr.Logger(), vscw.wr.TopoServer(), vscw.sourceAlias, td, chunk) if err != nil { processError("NewRestartableResultReader failed: %v", err) return } defer rr.Close() // process the data if err := vscw.processData(ctx, dbName, td, tableIndex, rr, insertChannel, vscw.destinationPackCount); err != nil { processError("ResultReader failed: %v", err) } vscw.tableStatusList.threadDone(tableIndex) }(td, tableIndex, c) } } sourceWaitGroup.Wait() close(insertChannel) destinationWaitGroup.Wait() // Stop Throttler. destinationThrottler.Close() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if vscw.strategy.skipPopulateBlpCheckpoint { vscw.wr.Logger().Infof("Skipping populating the blp_checkpoint table") } else { // get the current position from the source shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := vscw.wr.TabletManagerClient().SlaveStatus(shortCtx, vscw.sourceTablet) cancel() if err != nil { return err } queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if vscw.strategy.dontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, vscw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags)) vscw.wr.Logger().Infof("Making and populating blp_checkpoint table") if err := runSQLCommands(ctx, vscw.wr, vscw.tsc, vscw.destinationKeyspace, vscw.destinationShard, dbName, queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. if vscw.strategy.skipSetSourceShards { vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.") } else { vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := vscw.wr.SetSourceShards(shortCtx, vscw.destinationKeyspace, vscw.destinationShard, []*topodatapb.TabletAlias{vscw.sourceAlias}, vscw.tables) cancel() if err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } err = vscw.findRefreshTargets(ctx) if err != nil { return fmt.Errorf("failed before refreshing state on destination tablets: %v", err) } // And force a state refresh (re-read topo) on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for _, tabletAlias := range vscw.refreshAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := vscw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet) cancel() if err != nil { processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err) } }(vscw.refreshTablets[*tabletAlias]) } destinationWaitGroup.Wait() return firstError }
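// Note: the clone/copy phases in this file all share the same writer pipeline: chunk
// readers produce INSERT statements into a buffered channel and destinationWriterCount
// goroutines drain it into the destination. The standalone sketch below shows that
// fan-out pattern in isolation; writerCount and the printed "INSERT ..." strings are
// illustrative placeholders, not part of the worker code.
package main

import (
	"fmt"
	"sync"
)

func main() {
	const writerCount = 4
	// Sized to writerCount*2, mirroring insertChannel above, so producers
	// rarely block while every writer is busy.
	insertChannel := make(chan string, writerCount*2)

	var writers sync.WaitGroup
	for i := 0; i < writerCount; i++ {
		writers.Add(1)
		go func(threadID int) {
			defer writers.Done()
			// Stand-in for the executor fetch loop: drain statements until
			// the channel is closed.
			for stmt := range insertChannel {
				fmt.Printf("writer %d applying: %s\n", threadID, stmt)
			}
		}(i)
	}

	// Producers (one per table chunk in the real worker) push statements.
	for c := 0; c < 10; c++ {
		insertChannel <- fmt.Sprintf("INSERT /* chunk %d */ ...", c)
	}
	close(insertChannel) // unblocks the writers' range loops
	writers.Wait()
}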
// copy phase: // - copy the data from source tablets to destination masters (with replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (vscw *VerticalSplitCloneWorker) copy(ctx context.Context) error { vscw.setState(WorkerStateCopy) // get source schema shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sourceSchemaDefinition, err := vscw.wr.GetSchema(shortCtx, vscw.sourceAlias, vscw.tables, nil, true) cancel() if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", topo.TabletAliasString(vscw.sourceAlias), err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter") } vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) vscw.Mu.Lock() vscw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i] = &tableStatus{ name: td.Name, rowCount: td.RowCount, } } vscw.startTime = time.Now() vscw.Mu.Unlock() // Count rows for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].mu.Lock() if td.Type == myproto.TableBaseTable { vscw.tableStatus[i].rowCount = td.RowCount } else { vscw.tableStatus[i].isView = true } vscw.tableStatus[i].mu.Unlock() } // In parallel, setup the channels to send SQL data chunks to // for each destination tablet. // // mu protects firstError mu := sync.Mutex{} var firstError error ctx, cancel = context.WithCancel(ctx) processError := func(format string, args ...interface{}) { vscw.wr.Logger().Errorf(format, args...) mu.Lock() if firstError == nil { firstError = fmt.Errorf(format, args...) cancel() } mu.Unlock() } destinationWaitGroup := sync.WaitGroup{} // we create one channel for the destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it.
insertChannel := make(chan string, vscw.destinationWriterCount*2) go func(shardName string, insertChannel chan string) { for j := 0; j < vscw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(ctx, vscw.wr, vscw, shardName, insertChannel); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(vscw.destinationShard, insertChannel) // Now for each table, read data chunks and send them to insertChannel sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TableView { continue } chunks, err := FindChunks(ctx, vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount) if err != nil { return err } vscw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1) for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() vscw.tableStatus[tableIndex].threadStarted() // build the query, and start the streaming selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, topo.TabletAliasString(vscw.sourceAlias)) qrr, err := NewQueryResultReaderForTablet(ctx, vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } defer qrr.Close() // process the data if err := vscw.processData(td, tableIndex, qrr, insertChannel, vscw.destinationPackCount, ctx.Done()); err != nil { processError("QueryResultReader failed: %v", err) } vscw.tableStatus[tableIndex].threadDone() }(td, tableIndex, chunkIndex) } } sourceWaitGroup.Wait() close(insertChannel) destinationWaitGroup.Wait() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if vscw.strategy.PopulateBlpCheckpoint { // get the current position from the source shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := vscw.wr.TabletManagerClient().SlaveStatus(shortCtx, vscw.sourceTablet) cancel() if err != nil { return err } queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if vscw.strategy.DontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags)) destinationWaitGroup.Add(1) go func(shardName string) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Making and populating blp_checkpoint table") if err := runSQLCommands(ctx, vscw.wr, vscw, shardName, queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } }(vscw.destinationShard) destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. 
if vscw.strategy.SkipSetSourceShards { vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.") } else { vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := vscw.wr.SetSourceShards(shortCtx, vscw.destinationKeyspace, vscw.destinationShard, []*pb.TabletAlias{vscw.sourceAlias}, vscw.tables) cancel() if err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } err = vscw.findReloadTargets(ctx) if err != nil { return fmt.Errorf("failed before reloading schema on destination tablets: %v", err) } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for _, tabletAlias := range vscw.reloadAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := vscw.wr.TabletManagerClient().ReloadSchema(shortCtx, ti) cancel() if err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.AliasString(), err) } }(vscw.reloadTablets[*tabletAlias]) } destinationWaitGroup.Wait() return firstError }
// copy phase: // - get schema on the source, filter tables // - create tables on all destinations // - copy the data func (vscw *VerticalSplitCloneWorker) copy() error { vscw.setState(stateVSCCopy) // get source schema sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true) if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter") } vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) vscw.mu.Lock() vscw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].name = td.Name vscw.tableStatus[i].rowCount = td.RowCount } vscw.startTime = time.Now() vscw.mu.Unlock() // Create all the commands to create the destination schema: // - createDbCmds will create the database and the tables // - createViewCmds will create the views // - alterTablesCmds will modify the tables at the end if needed // (all need template substitution for {{.DatabaseName}}) createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1) createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema) createViewCmds := make([]string, 0, 16) alterTablesCmds := make([]string, 0, 16) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].mu.Lock() if td.Type == myproto.TABLE_BASE_TABLE { create, alter, err := mysqlctl.MakeSplitCreateTableSql(vscw.wr.Logger(), td.Schema, "{{.DatabaseName}}", td.Name, vscw.strategy) if err != nil { return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err) } createDbCmds = append(createDbCmds, create) if alter != "" { alterTablesCmds = append(alterTablesCmds, alter) } vscw.tableStatus[i].state = "before table creation" vscw.tableStatus[i].rowCount = td.RowCount } else { createViewCmds = append(createViewCmds, td.Schema) vscw.tableStatus[i].state = "before view creation" vscw.tableStatus[i].rowCount = 0 } vscw.tableStatus[i].mu.Unlock() } // For each destination tablet (in parallel): // - create the schema // - setup the channels to send SQL data chunks // // mu protects the abort channel for closing, and firstError mu := sync.Mutex{} abort := make(chan struct{}) var firstError error processError := func(format string, args ...interface{}) { vscw.wr.Logger().Errorf(format, args...) mu.Lock() if abort != nil { close(abort) abort = nil firstError = fmt.Errorf(format, args...) } mu.Unlock() } insertChannels := make([]chan string, len(vscw.destinationAliases)) destinationWaitGroup := sync.WaitGroup{} for i, tabletAlias := range vscw.destinationAliases { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. 
insertChannels[i] = make(chan string, vscw.destinationWriterCount*2) destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo, insertChannel chan string) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Creating tables on tablet %v", ti.Alias) if err := runSqlCommands(vscw.wr, ti, createDbCmds, abort); err != nil { processError("createDbCmds failed: %v", err) return } if len(createViewCmds) > 0 { vscw.wr.Logger().Infof("Creating views on tablet %v", ti.Alias) if err := runSqlCommands(vscw.wr, ti, createViewCmds, abort); err != nil { processError("createViewCmds failed: %v", err) return } } for j := 0; j < vscw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(vscw.wr, ti, insertChannel, abort); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(vscw.destinationTablets[tabletAlias], insertChannels[i]) } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_VIEW { vscw.tableStatus[tableIndex].setState("view created") continue } vscw.tableStatus[tableIndex].setState("before copy") chunks, err := findChunks(vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount) if err != nil { return err } for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() vscw.tableStatus[tableIndex].setState("started the copy") // build the query, and start the streaming selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, vscw.sourceAlias.String()) qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } // process the data if err := vscw.processData(td, tableIndex, qrr, insertChannels, abort); err != nil { processError("QueryResultReader failed: %v", err) } }(td, tableIndex, chunkIndex) } } sourceWaitGroup.Wait() for _, c := range insertChannels { close(c) } destinationWaitGroup.Wait() if firstError != nil { return firstError } // do the post-copy alters if any if len(alterTablesCmds) > 0 { for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Altering tables on tablet %v", ti.Alias) if err := runSqlCommands(vscw.wr, ti, alterTablesCmds, abort); err != nil { processError("alterTablesCmds failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // then create and populate the blp_checkpoint table if strings.Index(vscw.strategy, "populateBlpCheckpoint") != -1 { // get the current position from the source status, err := vscw.wr.TabletManagerClient().SlaveStatus(vscw.sourceTablet, 30*time.Second) if err != nil { return err } queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) 
flags := "" if strings.Index(vscw.strategy, "dontStartBinlogPlayer") != -1 { flags = binlogplayer.BLP_FLAG_DONT_START } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags)) for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias) if err := runSqlCommands(vscw.wr, ti, queries, abort); err != nil { processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. if strings.Index(vscw.strategy, "skipSetSourceShards") != -1 { vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.") } else { vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard) if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias) if err := vscw.wr.TabletManagerClient().ReloadSchema(ti, 30*time.Second); err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() return firstError }
// copy phase: // - copy the data from source tablets to destination masters (with replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (scw *SplitCloneWorker) copy(ctx context.Context) error { scw.setState(WorkerStateCopy) // get source schema from the first shard // TODO(alainjobart): for now, we assume the schema is compatible // on all source shards. Furthermore, we estimate the number of rows // in each source shard for each table to be about the same // (rowCount is used to estimate an ETA) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, scw.sourceAliases[0], nil, scw.excludeTables, true) cancel() if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(scw.sourceAliases[0]), err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(scw.sourceAliases[0])) } scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) scw.Mu.Lock() scw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { scw.tableStatus[i] = &tableStatus{ name: td.Name, rowCount: td.RowCount * uint64(len(scw.sourceAliases)), } } scw.startTime = time.Now() scw.Mu.Unlock() // Find the column index for the sharding columns in all the databases, and count rows columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions)) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TableBaseTable { // find the column to split on columnIndexes[tableIndex] = -1 for i, name := range td.Columns { if name == scw.keyspaceInfo.ShardingColumnName { columnIndexes[tableIndex] = i break } } if columnIndexes[tableIndex] == -1 { return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName) } scw.tableStatus[tableIndex].mu.Lock() scw.tableStatus[tableIndex].rowCount = td.RowCount scw.tableStatus[tableIndex].mu.Unlock() } else { scw.tableStatus[tableIndex].mu.Lock() scw.tableStatus[tableIndex].isView = true scw.tableStatus[tableIndex].mu.Unlock() } } // In parallel, setup the channels to send SQL data chunks to for each destination tablet: // // mu protects the context for cancelation, and firstError mu := sync.Mutex{} var firstError error ctx, cancelCopy := context.WithCancel(ctx) processError := func(format string, args ...interface{}) { scw.wr.Logger().Errorf(format, args...) mu.Lock() if firstError == nil { firstError = fmt.Errorf(format, args...) cancelCopy() } mu.Unlock() } insertChannels := make([]chan string, len(scw.destinationShards)) destinationWaitGroup := sync.WaitGroup{} for shardIndex, si := range scw.destinationShards { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. 
insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2) go func(shardName string, insertChannel chan string) { for j := 0; j < scw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(ctx, scw.wr, scw, shardName, insertChannel); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(si.ShardName(), insertChannels[shardIndex]) } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} for shardIndex := range scw.sourceShards { sema := sync2.NewSemaphore(scw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TableView { continue } rowSplitter := NewRowSplitter(scw.destinationShards, key.ProtoToKeyspaceIdType(scw.keyspaceInfo.ShardingColumnType), columnIndexes[tableIndex]) chunks, err := FindChunks(ctx, scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount) if err != nil { return err } scw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1) for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() scw.tableStatus[tableIndex].threadStarted() // build the query, and start the streaming selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String()) qrr, err := NewQueryResultReaderForTablet(ctx, scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } defer qrr.Close() // process the data if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, scw.destinationPackCount, ctx.Done()); err != nil { processError("processData failed: %v", err) } scw.tableStatus[tableIndex].threadDone() }(td, tableIndex, chunkIndex) } } } sourceWaitGroup.Wait() for shardIndex := range scw.destinationShards { close(insertChannels[shardIndex]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if scw.strategy.PopulateBlpCheckpoint { queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if scw.strategy.DontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } // get the current position from the sources for shardIndex := range scw.sourceShards { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex]) cancel() if err != nil { return err } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags)) } for _, si := range scw.destinationShards { destinationWaitGroup.Add(1) go func(shardName string) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table") if err := runSQLCommands(ctx, scw.wr, scw, shardName, queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } }(si.ShardName()) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. 
// TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if scw.strategy.SkipSetSourceShards { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.sourceAliases, nil) cancel() if err != nil { return fmt.Errorf("failed to set source shards: %v", err) } } } err = scw.findReloadTargets(ctx) if err != nil { return fmt.Errorf("failed before reloading schema on destination tablets: %v", err) } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.reloadAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().ReloadSchema(shortCtx, ti) cancel() if err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.AliasString(), err) } }(scw.reloadTablets[shardIndex][*tabletAlias]) } } destinationWaitGroup.Wait() return firstError }
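// Note: every variant above reports failures through the same processError
// closure: the first error is recorded under a mutex and the shared context is
// cancelled so remaining goroutines stop early. The self-contained sketch below
// isolates that pattern; the three numbered workers and the failing "worker 1"
// are purely illustrative.
package main

import (
	"context"
	"fmt"
	"sync"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	var mu sync.Mutex
	var firstError error
	processError := func(format string, args ...interface{}) {
		mu.Lock()
		defer mu.Unlock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancel() // ask all other goroutines to wind down
		}
	}

	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			select {
			case <-ctx.Done():
				return // another goroutine already failed
			default:
			}
			if i == 1 {
				processError("worker %d failed", i)
			}
		}(i)
	}
	wg.Wait()
	fmt.Println("first error:", firstError)
}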
// MultiRestore is the main entry point for multi restore. // // We will either: // - read from the network if sourceAddrs != nil // - read from a disk snapshot if fromStoragePaths != nil // // The strategy is used as follows: // - If it contains the string 'writeBinLogs' then we will also write // to the binary logs. // - If it contains the command 'populateBlpCheckpoint' then we will // populate the blp_checkpoint table with master positions to start from func (mysqld *Mysqld) MultiRestore(destinationDbName string, keyRanges []key.KeyRange, sourceAddrs []*url.URL, fromStoragePaths []string, snapshotConcurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) (err error) { writeBinLogs := strings.Contains(strategy, "writeBinLogs") var manifests []*SplitSnapshotManifest if sourceAddrs != nil { // get the manifests from the network manifests = make([]*SplitSnapshotManifest, len(sourceAddrs)) rc := concurrency.NewResourceConstraint(fetchConcurrency) for i, sourceAddr := range sourceAddrs { rc.Add(1) go func(sourceAddr *url.URL, i int) { rc.Acquire() defer rc.ReleaseAndDone() if rc.HasErrors() { return } var sourceDbName string if len(sourceAddr.Path) < 2 { // "" or "/" sourceDbName = destinationDbName } else { sourceDbName = sourceAddr.Path[1:] } ssm, e := fetchSnapshotManifestWithRetry("http://"+sourceAddr.Host, sourceDbName, keyRanges[i], fetchRetryCount) manifests[i] = ssm rc.RecordError(e) }(sourceAddr, i) } if err = rc.Wait(); err != nil { return } } else { // get the manifests from the local snapshots manifests = make([]*SplitSnapshotManifest, len(fromStoragePaths)) for i, fromStoragePath := range fromStoragePaths { var err error manifests[i], err = readSnapshotManifest(fromStoragePath) if err != nil { return err } } } if e := SanityCheckManifests(manifests); e != nil { return e } tempStoragePath := path.Join(mysqld.SnapshotDir, "multirestore", destinationDbName) // Start fresh if err = os.RemoveAll(tempStoragePath); err != nil { return } if err = os.MkdirAll(tempStoragePath, 0775); err != nil { return err } defer func() { if e := os.RemoveAll(tempStoragePath); e != nil { log.Errorf("error removing %v: %v", tempStoragePath, e) } }() // Handle our concurrency: // - fetchConcurrency tasks for network / decompress from disk // - insertTableConcurrency for table inserts from a file // into an innodb table // - snapshotConcurrency tasks for table inserts / modify tables sems := make(map[string]*sync2.Semaphore, len(manifests[0].SchemaDefinition.TableDefinitions)+2) sems["net"] = sync2.NewSemaphore(fetchConcurrency, 0) sems["db"] = sync2.NewSemaphore(snapshotConcurrency, 0) // Store the alter table statements for after restore, // and how many jobs we're running on each table // TODO(alainjobart) the jobCount map is a bit weird. replace it // with a map of WaitGroups, initialized to the number of files // per table. Have extra go routines for the tables with auto_increment // to wait on the waitgroup, and apply the modify_table. postSql := make(map[string]string, len(manifests[0].SchemaDefinition.TableDefinitions)) jobCount := make(map[string]*sync2.AtomicInt32) // Create the database (it's a good check to know if we're running // multirestore a second time too!) 
manifest := manifests[0] // I am assuming they all match createDatabase, e := fillStringTemplate(manifest.SchemaDefinition.DatabaseSchema, map[string]string{"DatabaseName": destinationDbName}) if e != nil { return e } if createDatabase == "" { return fmt.Errorf("Empty create database statement") } createDbCmds := make([]string, 0, len(manifest.SchemaDefinition.TableDefinitions)+2) if !writeBinLogs { createDbCmds = append(createDbCmds, "SET sql_log_bin = OFF") } createDbCmds = append(createDbCmds, createDatabase) createDbCmds = append(createDbCmds, "USE `"+destinationDbName+"`") createViewCmds := make([]string, 0, 16) for _, td := range manifest.SchemaDefinition.TableDefinitions { if td.Type == proto.TABLE_BASE_TABLE { createDbCmd, alterTable, err := makeCreateTableSql(td.Schema, td.Name, strategy) if err != nil { return err } if alterTable != "" { postSql[td.Name] = alterTable } jobCount[td.Name] = new(sync2.AtomicInt32) createDbCmds = append(createDbCmds, createDbCmd) sems["table-"+td.Name] = sync2.NewSemaphore(insertTableConcurrency, 0) } else { // views are just created with the right db name // and no data will ever go in them. We create them // after all tables are created, as they will // probably depend on real tables. createViewCmd, err := fillStringTemplate(td.Schema, map[string]string{"DatabaseName": destinationDbName}) if err != nil { return err } createViewCmds = append(createViewCmds, createViewCmd) } } createDbCmds = append(createDbCmds, createViewCmds...) if err = mysqld.ExecuteSuperQueryList(createDbCmds); err != nil { return } // compute how many jobs we will have for _, manifest := range manifests { for _, file := range manifest.Source.Files { jobCount[file.TableName].Add(1) } } loadDataInfile := `LOAD DATA INFILE '{{.TableInputPath}}' INTO TABLE {{.TableName}} CHARACTER SET binary FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\' LINES TERMINATED BY '\n' ({{.Columns}})` // fetch all the csv files, and apply them one at a time. Note // this might start many go routines, and they'll all be // waiting on the resource semaphores. 
mrc := concurrency.NewMultiResourceConstraint(sems) for manifestIndex, manifest := range manifests { if err = os.Mkdir(path.Join(tempStoragePath, manifest.Source.Addr), 0775); err != nil { return err } for i := range manifest.Source.Files { lsf := localSnapshotFile{manifest: manifest, file: &manifest.Source.Files[i], basePath: tempStoragePath} mrc.Add(1) go func(manifestIndex, i int) { defer mrc.Done() // compute a few things now, so if we can't we // don't take resources: // - get the schema td, ok := manifest.SchemaDefinition.GetTable(lsf.tableName()) if !ok { mrc.RecordError(fmt.Errorf("No table named %v in schema", lsf.tableName())) return } // - get the load data statement queryParams := map[string]string{ "TableInputPath": lsf.filename(), "TableName": lsf.tableName(), "Columns": strings.Join(td.Columns, ", "), } loadStatement, e := fillStringTemplate(loadDataInfile, queryParams) if e != nil { mrc.RecordError(e) return } // get the file, using the 'net' resource mrc.Acquire("net") if mrc.HasErrors() { mrc.Release("net") return } if sourceAddrs == nil { e = uncompressLocalFile(path.Join(fromStoragePaths[manifestIndex], path.Base(lsf.file.Path)), lsf.file.Hash, lsf.filename()) } else { e = fetchFileWithRetry(lsf.url(), lsf.file.Hash, lsf.filename(), fetchRetryCount) } mrc.Release("net") if e != nil { mrc.RecordError(e) return } defer os.Remove(lsf.filename()) // acquire the table lock (we do this first // so we maximize access to db. Otherwise // if 8 threads had gotten the db lock but // were writing to the same table, only one // load would go at once) tableLockName := "table-" + lsf.tableName() mrc.Acquire(tableLockName) defer func() { mrc.Release(tableLockName) }() if mrc.HasErrors() { return } // acquire the db lock mrc.Acquire("db") defer func() { mrc.Release("db") }() if mrc.HasErrors() { return } // load the data in queries := buildQueryList(destinationDbName, loadStatement, writeBinLogs) e = mysqld.ExecuteSuperQueryList(queries) if e != nil { mrc.RecordError(e) return } // if we're running the last insert, // potentially re-add the auto-increments remainingInserts := jobCount[lsf.tableName()].Add(-1) if remainingInserts == 0 && postSql[lsf.tableName()] != "" { queries = buildQueryList(destinationDbName, postSql[lsf.tableName()], writeBinLogs) e = mysqld.ExecuteSuperQueryList(queries) if e != nil { mrc.RecordError(e) return } } }(manifestIndex, i) } } if err = mrc.Wait(); err != nil { return err } // populate blp_checkpoint table if we want to if strings.Index(strategy, "populateBlpCheckpoint") != -1 { queries := make([]string, 0, 4) if !writeBinLogs { queries = append(queries, "SET sql_log_bin = OFF") queries = append(queries, "SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED") } queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) for manifestIndex, manifest := range manifests { queries = append(queries, binlogplayer.PopulateBlpCheckpoint(manifestIndex, manifest.Source.MasterState.ReplicationPosition.MasterLogGroupId, time.Now().Unix())) } if err = mysqld.ExecuteSuperQueryList(queries); err != nil { return err } } return nil }
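// Note: MultiRestore substitutes {{.DatabaseName}}, {{.TableInputPath}} and
// similar placeholders via fillStringTemplate. Its implementation is not shown
// here; the sketch below is an assumed, minimal equivalent built on
// text/template, included only to make the substitution step concrete.
package main

import (
	"bytes"
	"fmt"
	"text/template"
)

// fillStringTemplate renders tmpl with vars; a hypothetical stand-in for the
// helper used above.
func fillStringTemplate(tmpl string, vars interface{}) (string, error) {
	t, err := template.New("").Parse(tmpl)
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, vars); err != nil {
		return "", err
	}
	return buf.String(), nil
}

func main() {
	out, err := fillStringTemplate("USE `{{.DatabaseName}}`",
		map[string]string{"DatabaseName": "vt_destination_db"})
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // USE `vt_destination_db`
}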
// copy phase: // - copy the data from source tablets to destination masters (with replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (scw *LegacySplitCloneWorker) copy(ctx context.Context) error { scw.setState(WorkerStateCloneOffline) start := time.Now() defer func() { statsStateDurationsNs.Set(string(WorkerStateCloneOffline), time.Now().Sub(start).Nanoseconds()) }() // get source schema from the first shard // TODO(alainjobart): for now, we assume the schema is compatible // on all source shards. Furthermore, we estimate the number of rows // in each source shard for each table to be about the same // (rowCount is used to estimate an ETA) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, scw.sourceAliases[0], nil, scw.excludeTables, false /* includeViews */) cancel() if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(scw.sourceAliases[0]), err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(scw.sourceAliases[0])) } for _, td := range sourceSchemaDefinition.TableDefinitions { if len(td.Columns) == 0 { return fmt.Errorf("schema for table %v has no columns", td.Name) } } scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) scw.tableStatusList.initialize(sourceSchemaDefinition) // In parallel, setup the channels to send SQL data chunks to for each destination tablet: // // mu protects the context for cancelation, and firstError mu := sync.Mutex{} var firstError error ctx, cancelCopy := context.WithCancel(ctx) processError := func(format string, args ...interface{}) { scw.wr.Logger().Errorf(format, args...) mu.Lock() if firstError == nil { firstError = fmt.Errorf(format, args...) cancelCopy() } mu.Unlock() } insertChannels := make([]chan string, len(scw.destinationShards)) destinationWaitGroup := sync.WaitGroup{} for shardIndex, si := range scw.destinationShards { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. 
insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2) go func(keyspace, shard string, insertChannel chan string) { for j := 0; j < scw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func(threadID int) { defer destinationWaitGroup.Done() keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard) throttler := scw.destinationThrottlers[keyspaceAndShard] defer throttler.ThreadFinished(threadID) executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID) if err := executor.fetchLoop(ctx, insertChannel); err != nil { processError("executer.FetchLoop failed: %v", err) } }(j) } }(si.Keyspace(), si.ShardName(), insertChannels[shardIndex]) } // read the vschema if needed var keyspaceSchema *vindexes.KeyspaceSchema if *useV3ReshardingMode { kschema, err := scw.wr.TopoServer().GetVSchema(ctx, scw.keyspace) if err != nil { return fmt.Errorf("cannot load VSchema for keyspace %v: %v", scw.keyspace, err) } if kschema == nil { return fmt.Errorf("no VSchema for keyspace %v", scw.keyspace) } keyspaceSchema, err = vindexes.BuildKeyspaceSchema(kschema, scw.keyspace) if err != nil { return fmt.Errorf("cannot build vschema for keyspace %v: %v", scw.keyspace, err) } } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} for shardIndex := range scw.sourceShards { sema := sync2.NewSemaphore(scw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { var keyResolver keyspaceIDResolver if *useV3ReshardingMode { keyResolver, err = newV3ResolverFromTableDefinition(keyspaceSchema, td) if err != nil { return fmt.Errorf("cannot resolve v3 sharding keys for keyspace %v: %v", scw.keyspace, err) } } else { keyResolver, err = newV2Resolver(scw.keyspaceInfo, td) if err != nil { return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.keyspace, err) } } rowSplitter := NewRowSplitter(scw.destinationShards, keyResolver) chunks, err := generateChunks(ctx, scw.wr, scw.sourceTablets[shardIndex], td, scw.sourceReaderCount, defaultMinRowsPerChunk) if err != nil { return err } scw.tableStatusList.setThreadCount(tableIndex, len(chunks)-1) for _, c := range chunks { sourceWaitGroup.Add(1) go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() scw.tableStatusList.threadStarted(tableIndex) // Start streaming from the source tablets. 
tp := newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.sourceAliases[shardIndex]) rr, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, false /* allowMultipleRetries */) if err != nil { processError("NewRestartableResultReader failed: %v", err) return } defer rr.Close(ctx) // process the data dbNames := make([]string, len(scw.destinationShards)) for i, si := range scw.destinationShards { keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) dbNames[i] = scw.destinationDbNames[keyspaceAndShard] } if err := scw.processData(ctx, dbNames, td, tableIndex, rr, rowSplitter, insertChannels, scw.destinationPackCount); err != nil { processError("processData failed: %v", err) } scw.tableStatusList.threadDone(tableIndex) }(td, tableIndex, c) } } } sourceWaitGroup.Wait() for shardIndex := range scw.destinationShards { close(insertChannels[shardIndex]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if scw.strategy.skipPopulateBlpCheckpoint { scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table") } else { queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if scw.strategy.dontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } // get the current position from the sources for shardIndex := range scw.sourceShards { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex]) cancel() if err != nil { return err } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags)) } for _, si := range scw.destinationShards { destinationWaitGroup.Add(1) go func(keyspace, shard string) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table") keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard) if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } }(si.Keyspace(), si.ShardName()) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. // TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if scw.strategy.skipSetSourceShards { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.sourceAliases, nil) cancel() if err != nil { return fmt.Errorf("failed to set source shards: %v", err) } } } err = scw.findRefreshTargets(ctx) if err != nil { return fmt.Errorf("failed before refreshing state on destination tablets: %v", err) } // And force a state refresh (re-read topo) on all destination tablets. // The master tablet will end up starting filtered replication // at this point. 
for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.refreshAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet) cancel() if err != nil { processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err) } }(scw.refreshTablets[shardIndex][*tabletAlias]) } } destinationWaitGroup.Wait() return firstError }
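// Note: every remote call above is wrapped in the same short-timeout pattern:
// derive shortCtx from ctx with *remoteActionsTimeout, make the call, and
// cancel immediately afterwards to release the timer. The sketch below shows
// just that pattern; remoteCall and the 100ms value are placeholders, not the
// real tablet manager RPCs.
package main

import (
	"context"
	"fmt"
	"time"
)

// remoteCall simulates an RPC that honours context cancellation.
func remoteCall(ctx context.Context) error {
	select {
	case <-time.After(50 * time.Millisecond): // pretend the RPC took 50ms
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	remoteActionsTimeout := 100 * time.Millisecond
	shortCtx, cancel := context.WithTimeout(context.Background(), remoteActionsTimeout)
	err := remoteCall(shortCtx)
	cancel() // release resources as soon as the call returns
	fmt.Println("err:", err)
}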
// copy phase: // - copy the data from source tablets to destination masters (with replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (scw *SplitCloneWorker) clone(ctx context.Context, state StatusWorkerState) error { if state != WorkerStateCloneOnline && state != WorkerStateCloneOffline { panic(fmt.Sprintf("invalid state passed to clone(): %v", state)) } scw.setState(state) start := time.Now() defer func() { statsStateDurationsNs.Set(string(state), time.Now().Sub(start).Nanoseconds()) }() var firstSourceTablet *topodatapb.Tablet if state == WorkerStateCloneOffline { // Use the first source tablet which we took offline. firstSourceTablet = scw.sourceTablets[0] } else { // Pick any healthy serving source tablet. si := scw.sourceShards[0] tablets := scw.tsc.GetTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_RDONLY) if len(tablets) == 0 { // We fail fast on this problem and don't retry because at the start all tablets should be healthy. return fmt.Errorf("no healthy RDONLY tablet in source shard (%v) available (required to find out the schema)", topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())) } firstSourceTablet = tablets[0].Tablet } var statsCounters []*stats.Counters var tableStatusList *tableStatusList switch state { case WorkerStateCloneOnline: statsCounters = []*stats.Counters{statsOnlineInsertsCounters, statsOnlineUpdatesCounters, statsOnlineDeletesCounters, statsOnlineEqualRowsCounters} tableStatusList = scw.tableStatusListOnline case WorkerStateCloneOffline: statsCounters = []*stats.Counters{statsOfflineInsertsCounters, statsOfflineUpdatesCounters, statsOfflineDeletesCounters, statsOfflineEqualRowsCounters} tableStatusList = scw.tableStatusListOffline } // The throttlers exist only for the duration of this clone() call. // That means a SplitClone invocation with both online and offline phases // will create throttlers for each phase. if err := scw.createThrottlers(); err != nil { return err } defer scw.closeThrottlers() sourceSchemaDefinition, err := scw.getSourceSchema(ctx, firstSourceTablet) if err != nil { return err } scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) tableStatusList.initialize(sourceSchemaDefinition) // In parallel, setup the channels to send SQL data chunks to for each destination tablet: // // mu protects the context for cancelation, and firstError mu := sync.Mutex{} var firstError error ctx, cancelCopy := context.WithCancel(ctx) processError := func(format string, args ...interface{}) { scw.wr.Logger().Errorf(format, args...) mu.Lock() if firstError == nil { firstError = fmt.Errorf(format, args...) cancelCopy() } mu.Unlock() } insertChannels := make([]chan string, len(scw.destinationShards)) destinationWaitGroup := sync.WaitGroup{} for shardIndex, si := range scw.destinationShards { // We create one channel per destination tablet. It is sized to have a // buffer of a maximum of destinationWriterCount * 2 items, to hopefully // always have data. We then have destinationWriterCount go routines reading // from it. 
insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2) for j := 0; j < scw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func(keyspace, shard string, insertChannel chan string, throttler *throttler.Throttler, threadID int) { defer destinationWaitGroup.Done() defer throttler.ThreadFinished(threadID) executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID) if err := executor.fetchLoop(ctx, insertChannel); err != nil { processError("executor.fetchLoop failed: %v", err) } }(si.Keyspace(), si.ShardName(), insertChannels[shardIndex], scw.getThrottler(si.Keyspace(), si.ShardName()), j) } } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(scw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { td = reorderColumnsPrimaryKeyFirst(td) keyResolver, err := scw.createKeyResolver(td) if err != nil { return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.destinationKeyspace, err) } // TODO(mberlin): We're going to chunk *all* source shards based on the MIN // and MAX values of the *first* source shard. Is this going to be a problem? chunks, err := generateChunks(ctx, scw.wr, firstSourceTablet, td, scw.chunkCount, scw.minRowsPerChunk) if err != nil { return err } tableStatusList.setThreadCount(tableIndex, len(chunks)) for _, c := range chunks { sourceWaitGroup.Add(1) go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) { defer sourceWaitGroup.Done() errPrefix := fmt.Sprintf("table=%v chunk=%v", td.Name, chunk) // We need our own error per goroutine to avoid races. var err error sema.Acquire() defer sema.Release() tableStatusList.threadStarted(tableIndex) if state == WorkerStateCloneOnline { // Wait for enough healthy tablets (they might have become unhealthy // and their replication lag might have increased since we started.) if err := scw.waitForTablets(ctx, scw.sourceShards, *retryDuration); err != nil { processError("%v: No healthy source tablets found (gave up after %v): %v", errPrefix, *retryDuration, err) return } } // Set up readers for the diff. There will be one reader for every // source and destination shard. sourceReaders := make([]ResultReader, len(scw.sourceShards)) destReaders := make([]ResultReader, len(scw.destinationShards)) for shardIndex, si := range scw.sourceShards { var tp tabletProvider allowMultipleRetries := true if state == WorkerStateCloneOffline { tp = newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.offlineSourceAliases[shardIndex]) // allowMultipleRetries is false to avoid that we'll keep retrying // on the same tablet alias for hours. This guards us against the // situation that an offline tablet gets restarted and serves again. // In that case we cannot use it because its replication is no // longer stopped at the same point as we took it offline initially.
allowMultipleRetries = false } else { tp = newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName()) } sourceResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, allowMultipleRetries) if err != nil { processError("%v: NewRestartableResultReader for source %v failed: %v", errPrefix, tp.description(), err) return } defer sourceResultReader.Close() sourceReaders[shardIndex] = sourceResultReader } // Wait for enough healthy tablets (they might have become unhealthy // and their replication lag might have increased due to a previous // chunk pipeline.) if err := scw.waitForTablets(ctx, scw.destinationShards, *retryDuration); err != nil { processError("%v: No healthy destination tablets found (gave up after %v): %v", errPrefix, *retryDuration, err) return } for shardIndex, si := range scw.destinationShards { tp := newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName()) destResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, true /* allowMultipleRetries */) if err != nil { processError("%v: NewRestartableResultReader for destination: %v failed: %v", errPrefix, tp.description(), err) return } defer destResultReader.Close() destReaders[shardIndex] = destResultReader } var sourceReader ResultReader var destReader ResultReader if len(sourceReaders) >= 2 { sourceReader, err = NewResultMerger(sourceReaders, len(td.PrimaryKeyColumns)) if err != nil { processError("%v: NewResultMerger for source tablets failed: %v", errPrefix, err) return } } else { sourceReader = sourceReaders[0] } if len(destReaders) >= 2 { destReader, err = NewResultMerger(destReaders, len(td.PrimaryKeyColumns)) if err != nil { processError("%v: NewResultMerger for destination tablets failed: %v", errPrefix, err) return } } else { destReader = destReaders[0] } dbNames := make([]string, len(scw.destinationShards)) for i, si := range scw.destinationShards { keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()) dbNames[i] = scw.destinationDbNames[keyspaceAndShard] } // Compare the data and reconcile any differences. differ, err := NewRowDiffer2(ctx, sourceReader, destReader, td, tableStatusList, tableIndex, scw.destinationShards, keyResolver, insertChannels, ctx.Done(), dbNames, scw.writeQueryMaxRows, scw.writeQueryMaxSize, scw.writeQueryMaxRowsDelete, statsCounters) if err != nil { processError("%v: NewRowDiffer2 failed: %v", errPrefix, err) return } // Ignore the diff report because all diffs should get reconciled. _ /* DiffReport */, err = differ.Diff() if err != nil { processError("%v: RowDiffer2 failed: %v", errPrefix, err) return } tableStatusList.threadDone(tableIndex) }(td, tableIndex, c) } } sourceWaitGroup.Wait() for shardIndex := range scw.destinationShards { close(insertChannels[shardIndex]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } if state == WorkerStateCloneOffline { // Create and populate the blp_checkpoint table to give filtered replication // a starting point. if scw.strategy.skipPopulateBlpCheckpoint { scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table") } else { queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
flags := "" if scw.strategy.dontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } // get the current position from the sources for shardIndex := range scw.sourceShards { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex]) cancel() if err != nil { return err } // TODO(mberlin): Fill in scw.maxReplicationLag once the adapative // throttler is enabled by default. queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags)) } for _, si := range scw.destinationShards { destinationWaitGroup.Add(1) go func(keyspace, shard string) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table") keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard) if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } }(si.Keyspace(), si.ShardName()) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Configure filtered replication by setting the SourceShard info. // The master tablets won't enable filtered replication (the binlog player) // until they re-read the topology due to a restart or a reload. // TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if scw.strategy.skipSetSourceShards { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v (tables: %v)", si.Keyspace(), si.ShardName(), scw.tables) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.offlineSourceAliases, scw.tables) cancel() if err != nil { return fmt.Errorf("failed to set source shards: %v", err) } } } // Force a state refresh (re-read topo) on all destination tablets. // The master tablet will end up starting filtered replication at this point. // // Find all tablets first, then refresh the state on each in parallel. err = scw.findRefreshTargets(ctx) if err != nil { return fmt.Errorf("failed before refreshing state on destination tablets: %v", err) } for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.refreshAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet) cancel() if err != nil { processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err) } }(scw.refreshTablets[shardIndex][*tabletAlias]) } } } // clonePhase == offline destinationWaitGroup.Wait() return firstError }
// copy phase: // - get schema on the sources, filter tables // - create tables on all destinations // - copy the data func (scw *SplitCloneWorker) copy() error { scw.setState(stateSCCopy) // get source schema from the first shard // TODO(alainjobart): for now, we assume the schema is compatible // on all source shards. Furthermore, we estimate the number of rows // in each source shard for each table to be about the same // (rowCount is used to estimate an ETA) sourceSchemaDefinition, err := scw.wr.GetSchema(scw.sourceAliases[0], nil, scw.excludeTables, true) if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", scw.sourceAliases[0], err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter in tablet %v", scw.sourceAliases[0]) } scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) scw.mu.Lock() scw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { scw.tableStatus[i].name = td.Name scw.tableStatus[i].rowCount = td.RowCount * uint64(len(scw.sourceAliases)) } scw.startTime = time.Now() scw.mu.Unlock() // Create all the commands to create the destination schema: // - createDbCmds will create the database and the tables // - createViewCmds will create the views // - alterTablesCmds will modify the tables at the end if needed // (all need template substitution for {{.DatabaseName}}) createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1) createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema) createViewCmds := make([]string, 0, 16) alterTablesCmds := make([]string, 0, 16) columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions)) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_BASE_TABLE { // build the create and alter statements create, alter, err := mysqlctl.MakeSplitCreateTableSql(scw.wr.Logger(), td.Schema, "{{.DatabaseName}}", td.Name, scw.strategy) if err != nil { return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err) } createDbCmds = append(createDbCmds, create) if alter != "" { alterTablesCmds = append(alterTablesCmds, alter) } // find the column to split on columnIndexes[tableIndex] = -1 for i, name := range td.Columns { if name == scw.keyspaceInfo.ShardingColumnName { columnIndexes[tableIndex] = i break } } if columnIndexes[tableIndex] == -1 { return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName) } scw.tableStatus[tableIndex].mu.Lock() scw.tableStatus[tableIndex].state = "before table creation" scw.tableStatus[tableIndex].rowCount = td.RowCount scw.tableStatus[tableIndex].mu.Unlock() } else { scw.tableStatus[tableIndex].mu.Lock() createViewCmds = append(createViewCmds, td.Schema) scw.tableStatus[tableIndex].state = "before view creation" scw.tableStatus[tableIndex].rowCount = 0 scw.tableStatus[tableIndex].mu.Unlock() } } // For each destination tablet (in parallel): // - create the schema // - setup the channels to send SQL data chunks // // mu protects the abort channel for closing, and firstError mu := sync.Mutex{} abort := make(chan struct{}) var firstError error processError := func(format string, args ...interface{}) { scw.wr.Logger().Errorf(format, args...) mu.Lock() if abort != nil { close(abort) abort = nil firstError = fmt.Errorf(format, args...) 
} mu.Unlock() } insertChannels := make([][]chan string, len(scw.destinationShards)) destinationWaitGroup := sync.WaitGroup{} for shardIndex := range scw.destinationShards { insertChannels[shardIndex] = make([]chan string, len(scw.destinationAliases[shardIndex])) for i, tabletAlias := range scw.destinationAliases[shardIndex] { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. insertChannels[shardIndex][i] = make(chan string, scw.destinationWriterCount*2) destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo, insertChannel chan string) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Creating tables on tablet %v", ti.Alias) if err := runSqlCommands(scw.wr, ti, createDbCmds, abort); err != nil { processError("createDbCmds failed: %v", err) return } if len(createViewCmds) > 0 { scw.wr.Logger().Infof("Creating views on tablet %v", ti.Alias) if err := runSqlCommands(scw.wr, ti, createViewCmds, abort); err != nil { processError("createViewCmds failed: %v", err) return } } for j := 0; j < scw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(scw.wr, ti, insertChannel, abort); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(scw.destinationTablets[shardIndex][tabletAlias], insertChannels[shardIndex][i]) } } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} for shardIndex := range scw.sourceShards { sema := sync2.NewSemaphore(scw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_VIEW { continue } rowSplitter := NewRowSplitter(scw.destinationShards, scw.keyspaceInfo.ShardingColumnType, columnIndexes[tableIndex]) chunks, err := findChunks(scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount) if err != nil { return err } for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) // pass shardIndex explicitly so the goroutine does not capture the loop variable go func(shardIndex int, td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() // build the query, and start the streaming selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String()) qrr, err := NewQueryResultReaderForTablet(scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } // process the data if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, abort); err != nil { processError("processData failed: %v", err) } }(shardIndex, td, tableIndex, chunkIndex) } } } sourceWaitGroup.Wait() for shardIndex := range scw.destinationShards { for _, c := range insertChannels[shardIndex] { close(c) } } destinationWaitGroup.Wait() if firstError != nil { return firstError } // do the post-copy alters if any if len(alterTablesCmds) > 0 { for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.destinationAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Altering tables on tablet %v", ti.Alias) if err := runSqlCommands(scw.wr, ti, alterTablesCmds, abort); err != nil { processError("alterTablesCmds failed on tablet %v: %v",
ti.Alias, err) } }(scw.destinationTablets[shardIndex][tabletAlias]) } } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // then create and populate the blp_checkpoint table if strings.Index(scw.strategy, "populateBlpCheckpoint") != -1 { queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if strings.Index(scw.strategy, "dontStartBinlogPlayer") != -1 { flags = binlogplayer.BLP_FLAG_DONT_START } // get the current position from the sources; each source shard gets its own // checkpoint row, keyed by its shard index for shardIndex := range scw.sourceShards { status, err := scw.wr.TabletManagerClient().SlaveStatus(scw.sourceTablets[shardIndex], 30*time.Second) if err != nil { return err } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, time.Now().Unix(), flags)) } for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.destinationAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias) if err := runSqlCommands(scw.wr, ti, queries, abort); err != nil { processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err) } }(scw.destinationTablets[shardIndex][tabletAlias]) } } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. // TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if strings.Index(scw.strategy, "skipSetSourceShards") != -1 { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName()) if err := scw.wr.SetSourceShards(si.Keyspace(), si.ShardName(), scw.sourceAliases, nil); err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.destinationAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias) if err := scw.wr.TabletManagerClient().ReloadSchema(ti, 30*time.Second); err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err) } }(scw.destinationTablets[shardIndex][tabletAlias]) } } destinationWaitGroup.Wait() return firstError }
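// In the copy above, NewRowSplitter routes every streamed source row to the
// destination shard whose key range covers the row's sharding column value.
// The snippet below is a simplified, hypothetical sketch of that routing decision
// for unsigned integer keyspace IDs and two hard-coded ranges; it is not the actual
// RowSplitter, and the names (shardRange, pickShard) are made up for illustration.
package main

import "fmt"

// shardRange is a stand-in for a destination shard's key range, [start, end).
// An end of 0 means "unbounded", mirroring how an empty upper bound behaves.
type shardRange struct {
	name       string
	start, end uint64
}

// pickShard returns the index of the destination shard covering keyspaceID,
// or -1 if no range matches (which would indicate a misconfigured split).
func pickShard(shards []shardRange, keyspaceID uint64) int {
	for i, s := range shards {
		if keyspaceID >= s.start && (s.end == 0 || keyspaceID < s.end) {
			return i
		}
	}
	return -1
}

func main() {
	// A 1 -> 2 split: shard "-80" covers the lower half of the keyspace ID
	// space, shard "80-" the upper half.
	shards := []shardRange{
		{name: "-80", start: 0, end: 0x8000000000000000},
		{name: "80-", start: 0x8000000000000000, end: 0},
	}
	for _, id := range []uint64{42, 0x8000000000000001} {
		i := pickShard(shards, id)
		fmt.Printf("keyspace id %#x -> shard %s\n", id, shards[i].name)
	}
}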
// copy phase: // - get schema on the source, filter tables // - create tables on all destinations // - copy the data func (vscw *VerticalSplitCloneWorker) copy() error { vscw.setState(stateVSCCopy) // get source schema sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true) if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter") } log.Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) vscw.mu.Lock() vscw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].name = td.Name vscw.tableStatus[i].rowCount = td.RowCount } vscw.startTime = time.Now() vscw.mu.Unlock() // Create all the commands to create the destination schema: // - createDbCmds will create the database and the tables // - createViewCmds will create the views // - alterTablesCmds will modify the tables at the end if needed // (all need template substitution for {{.DatabaseName}}) createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1) createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema) createViewCmds := make([]string, 0, 16) alterTablesCmds := make([]string, 0, 16) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].mu.Lock() if td.Type == myproto.TABLE_BASE_TABLE { create, alter, err := mysqlctl.MakeSplitCreateTableSql(td.Schema, "{{.DatabaseName}}", td.Name, vscw.strategy) if err != nil { return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err) } createDbCmds = append(createDbCmds, create) if alter != "" { alterTablesCmds = append(alterTablesCmds, alter) } vscw.tableStatus[i].state = "before table creation" vscw.tableStatus[i].rowCount = td.RowCount } else { createViewCmds = append(createViewCmds, td.Schema) vscw.tableStatus[i].state = "before view creation" vscw.tableStatus[i].rowCount = 0 } vscw.tableStatus[i].mu.Unlock() } // For each destination tablet (in parallel): // - create the schema // - setup the channels to send SQL data chunks // // mu protects the abort channel for closing, and firstError mu := sync.Mutex{} abort := make(chan struct{}) var firstError error processError := func(format string, args ...interface{}) { log.Errorf(format, args...) mu.Lock() if abort != nil { close(abort) abort = nil firstError = fmt.Errorf(format, args...) } mu.Unlock() } insertChannels := make([]chan string, len(vscw.destinationAliases)) destinationWaitGroup := sync.WaitGroup{} for i, tabletAlias := range vscw.destinationAliases { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. 
insertChannels[i] = make(chan string, vscw.destinationWriterCount*2) destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo, insertChannel chan string) { defer destinationWaitGroup.Done() log.Infof("Creating tables on tablet %v", ti.Alias) if err := vscw.runSqlCommands(ti, createDbCmds, abort); err != nil { processError("createDbCmds failed: %v", err) return } if len(createViewCmds) > 0 { log.Infof("Creating views on tablet %v", ti.Alias) if err := vscw.runSqlCommands(ti, createViewCmds, abort); err != nil { processError("createViewCmds failed: %v", err) return } } for j := 0; j < vscw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() for { select { case cmd, ok := <-insertChannel: if !ok { return } cmd = "INSERT INTO `" + ti.DbName() + "`." + cmd _, err := vscw.wr.ActionInitiator().ExecuteFetch(ti, cmd, 0, false, true, 30*time.Second) if err != nil { processError("ExecuteFetch failed: %v", err) return } case <-abort: return } } }() } }(vscw.destinationTablets[tabletAlias], insertChannels[i]) } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_VIEW { vscw.tableStatus[tableIndex].setState("view created") continue } vscw.tableStatus[tableIndex].setState("before copy") chunks, err := vscw.findChunks(vscw.sourceTablet, td) if err != nil { return err } for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() vscw.tableStatus[tableIndex].setState("started the copy") // build the query, and start the streaming selectSQL := "SELECT " + strings.Join(td.Columns, ", ") + " FROM " + td.Name if chunks[chunkIndex] != "" || chunks[chunkIndex+1] != "" { log.Infof("Starting to stream all data from table %v between '%v' and '%v'", td.Name, chunks[chunkIndex], chunks[chunkIndex+1]) clauses := make([]string, 0, 2) if chunks[chunkIndex] != "" { clauses = append(clauses, td.PrimaryKeyColumns[0]+">="+chunks[chunkIndex]) } if chunks[chunkIndex+1] != "" { clauses = append(clauses, td.PrimaryKeyColumns[0]+"<"+chunks[chunkIndex+1]) } selectSQL += " WHERE " + strings.Join(clauses, " AND ") } else { log.Infof("Starting to stream all data from table %v", td.Name) } if len(td.PrimaryKeyColumns) > 0 { selectSQL += " ORDER BY " + strings.Join(td.PrimaryKeyColumns, ", ") } qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } // process the data baseCmd := td.Name + "(" + strings.Join(td.Columns, ", ") + ") VALUES " loop: for { select { case r, ok := <-qrr.Output: if !ok { if err := qrr.Error(); err != nil { // error case processError("QueryResultReader failed: %v", err) return } // we're done with the data break loop } // send the rows to be inserted vscw.tableStatus[tableIndex].addCopiedRows(len(r.Rows)) cmd := baseCmd + makeValueString(qrr.Fields, r) for _, c := range insertChannels { c <- cmd } case <-abort: return } } }(td, tableIndex, chunkIndex) } } sourceWaitGroup.Wait() for _, c := range insertChannels { close(c) } destinationWaitGroup.Wait() if firstError != nil { return firstError } // do the post-copy alters if any if len(alterTablesCmds) > 0 { for _, tabletAlias 
:= range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() log.Infof("Altering tables on tablet %v", ti.Alias) if err := vscw.runSqlCommands(ti, alterTablesCmds, abort); err != nil { processError("alterTablesCmds failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // then create and populate the blp_checkpoint table if strings.Index(vscw.strategy, "populateBlpCheckpoint") != -1 { // get the current position from the source pos, err := vscw.wr.ActionInitiator().SlavePosition(vscw.sourceTablet, 30*time.Second) if err != nil { return err } queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if strings.Index(vscw.strategy, "dontStartBinlogPlayer") != -1 { flags = binlogplayer.BLP_FLAG_DONT_START } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, pos.MasterLogGTIDField.Value, time.Now().Unix(), flags)) for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() log.Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias) if err := vscw.runSqlCommands(ti, queries, abort); err != nil { processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. log.Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard) if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() log.Infof("Reloading schema on tablet %v", ti.Alias) if err := vscw.wr.ActionInitiator().ReloadSchema(ti, 30*time.Second); err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() return firstError }
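// The vertical copy above streams each table in chunks by turning adjacent
// primary-key boundaries into a WHERE clause on the first primary-key column.
// Below is a small standalone sketch of that query construction under assumed
// names (buildChunkSelect, string chunk boundaries, a single-column key); it
// illustrates the pattern only and is not the worker's actual code path.
package main

import (
	"fmt"
	"strings"
)

// buildChunkSelect builds "SELECT cols FROM table [WHERE pk >= lo AND pk < hi] ORDER BY pk".
// Empty boundary strings mean "unbounded", matching how the first and last chunks behave.
func buildChunkSelect(table string, columns []string, pk, lo, hi string) string {
	sql := "SELECT " + strings.Join(columns, ", ") + " FROM " + table
	var clauses []string
	if lo != "" {
		clauses = append(clauses, pk+" >= "+lo)
	}
	if hi != "" {
		clauses = append(clauses, pk+" < "+hi)
	}
	if len(clauses) > 0 {
		sql += " WHERE " + strings.Join(clauses, " AND ")
	}
	return sql + " ORDER BY " + pk
}

func main() {
	// Four boundaries describe three chunks over the primary-key space.
	boundaries := []string{"", "1000", "2000", ""}
	cols := []string{"id", "msg"}
	for i := 0; i < len(boundaries)-1; i++ {
		fmt.Println(buildChunkSelect("moving_table", cols, "id", boundaries[i], boundaries[i+1]))
	}
}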