// copy phase: // - copy the data from source tablets to destination masters (wtih replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (scw *SplitCloneWorker) copy() error { scw.setState(stateSCCopy) // get source schema from the first shard // TODO(alainjobart): for now, we assume the schema is compatible // on all source shards. Furthermore, we estimate the number of rows // in each source shard for each table to be about the same // (rowCount is used to estimate an ETA) sourceSchemaDefinition, err := scw.wr.GetSchema(scw.sourceAliases[0], nil, scw.excludeTables, true) if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", scw.sourceAliases[0], err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter in tablet %v", scw.sourceAliases[0]) } scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) scw.mu.Lock() scw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { scw.tableStatus[i] = &tableStatus{ name: td.Name, rowCount: td.RowCount * uint64(len(scw.sourceAliases)), } } scw.startTime = time.Now() scw.mu.Unlock() // Find the column index for the sharding columns in all the databases, and count rows columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions)) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_BASE_TABLE { // find the column to split on columnIndexes[tableIndex] = -1 for i, name := range td.Columns { if name == scw.keyspaceInfo.ShardingColumnName { columnIndexes[tableIndex] = i break } } if columnIndexes[tableIndex] == -1 { return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName) } scw.tableStatus[tableIndex].mu.Lock() scw.tableStatus[tableIndex].rowCount = td.RowCount scw.tableStatus[tableIndex].mu.Unlock() } else { scw.tableStatus[tableIndex].mu.Lock() scw.tableStatus[tableIndex].isView = true scw.tableStatus[tableIndex].mu.Unlock() } } // In parallel, setup the channels to send SQL data chunks to for each destination tablet: // // mu protects the abort channel for closing, and firstError mu := sync.Mutex{} abort := make(chan struct{}) var firstError error processError := func(format string, args ...interface{}) { scw.wr.Logger().Errorf(format, args...) mu.Lock() if abort != nil { close(abort) abort = nil firstError = fmt.Errorf(format, args...) } mu.Unlock() } // since we're writing only to masters, we need to enable bin logs so that replication happens disableBinLogs := false insertChannels := make([][]chan string, len(scw.destinationShards)) destinationWaitGroup := sync.WaitGroup{} for shardIndex, _ := range scw.destinationShards { insertChannels[shardIndex] = make([]chan string, len(scw.destinationAliases[shardIndex])) for i, tabletAlias := range scw.destinationAliases[shardIndex] { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. insertChannels[shardIndex][i] = make(chan string, scw.destinationWriterCount*2) go func(ti *topo.TabletInfo, insertChannel chan string) { for j := 0; j < scw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(scw.wr, ti, insertChannel, abort, disableBinLogs); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(scw.destinationTablets[shardIndex][tabletAlias], insertChannels[shardIndex][i]) } } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} for shardIndex, _ := range scw.sourceShards { sema := sync2.NewSemaphore(scw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_VIEW { continue } rowSplitter := NewRowSplitter(scw.destinationShards, scw.keyspaceInfo.ShardingColumnType, columnIndexes[tableIndex]) chunks, err := findChunks(scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount) if err != nil { return err } scw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1) for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() scw.tableStatus[tableIndex].threadStarted() // build the query, and start the streaming selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String()) qrr, err := NewQueryResultReaderForTablet(scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } defer qrr.Close() // process the data if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, scw.destinationPackCount, abort); err != nil { processError("processData failed: %v", err) } scw.tableStatus[tableIndex].threadDone() }(td, tableIndex, chunkIndex) } } } sourceWaitGroup.Wait() for shardIndex, _ := range scw.destinationShards { for _, c := range insertChannels[shardIndex] { close(c) } } destinationWaitGroup.Wait() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if scw.strategy.PopulateBlpCheckpoint { queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if scw.strategy.DontStartBinlogPlayer { flags = binlogplayer.BLP_FLAG_DONT_START } // get the current position from the sources for shardIndex, _ := range scw.sourceShards { ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) status, err := scw.wr.TabletManagerClient().SlaveStatus(ctx, scw.sourceTablets[shardIndex]) if err != nil { return err } cancel() queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags)) } for shardIndex, _ := range scw.destinationShards { for _, tabletAlias := range scw.destinationAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias) if err := runSqlCommands(scw.wr, ti, queries, abort, disableBinLogs); err != nil { processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err) } }(scw.destinationTablets[shardIndex][tabletAlias]) } } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. // TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if scw.strategy.SkipSetSourceShards { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName()) if err := scw.wr.SetSourceShards(si.Keyspace(), si.ShardName(), scw.sourceAliases, nil); err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for shardIndex, _ := range scw.destinationShards { for _, tabletAlias := range scw.reloadAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias) ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) if err := scw.wr.TabletManagerClient().ReloadSchema(ctx, ti); err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err) } cancel() }(scw.reloadTablets[shardIndex][tabletAlias]) } } destinationWaitGroup.Wait() return firstError }
// copy phase: // - copy the data from source tablets to destination masters (wtih replication on) // Assumes that the schema has already been created on each destination tablet // (probably from vtctl's CopySchemaShard) func (vscw *VerticalSplitCloneWorker) copy() error { vscw.setState(stateVSCCopy) // get source schema sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true) if err != nil { return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err) } if len(sourceSchemaDefinition.TableDefinitions) == 0 { return fmt.Errorf("no tables matching the table filter") } vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions)) vscw.mu.Lock() vscw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions)) for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i] = &tableStatus{ name: td.Name, rowCount: td.RowCount, } } vscw.startTime = time.Now() vscw.mu.Unlock() // Count rows for i, td := range sourceSchemaDefinition.TableDefinitions { vscw.tableStatus[i].mu.Lock() if td.Type == myproto.TABLE_BASE_TABLE { vscw.tableStatus[i].rowCount = td.RowCount } else { vscw.tableStatus[i].isView = true } vscw.tableStatus[i].mu.Unlock() } // In parallel, setup the channels to send SQL data chunks to for each destination tablet. // // mu protects the abort channel for closing, and firstError mu := sync.Mutex{} abort := make(chan struct{}) var firstError error processError := func(format string, args ...interface{}) { vscw.wr.Logger().Errorf(format, args...) mu.Lock() if abort != nil { close(abort) abort = nil firstError = fmt.Errorf(format, args...) } mu.Unlock() } // since we're writing only to masters, we need to enable bin logs so that replication happens disableBinLogs := false insertChannels := make([]chan string, len(vscw.destinationAliases)) destinationWaitGroup := sync.WaitGroup{} for i, tabletAlias := range vscw.destinationAliases { // we create one channel per destination tablet. It // is sized to have a buffer of a maximum of // destinationWriterCount * 2 items, to hopefully // always have data. We then have // destinationWriterCount go routines reading from it. insertChannels[i] = make(chan string, vscw.destinationWriterCount*2) go func(ti *topo.TabletInfo, insertChannel chan string) { for j := 0; j < vscw.destinationWriterCount; j++ { destinationWaitGroup.Add(1) go func() { defer destinationWaitGroup.Done() if err := executeFetchLoop(vscw.wr, ti, insertChannel, abort, disableBinLogs); err != nil { processError("executeFetchLoop failed: %v", err) } }() } }(vscw.destinationTablets[tabletAlias], insertChannels[i]) } // Now for each table, read data chunks and send them to all // insertChannels sourceWaitGroup := sync.WaitGroup{} sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0) for tableIndex, td := range sourceSchemaDefinition.TableDefinitions { if td.Type == myproto.TABLE_VIEW { continue } chunks, err := findChunks(vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount) if err != nil { return err } vscw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1) for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ { sourceWaitGroup.Add(1) go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) { defer sourceWaitGroup.Done() sema.Acquire() defer sema.Release() vscw.tableStatus[tableIndex].threadStarted() // build the query, and start the streaming selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, vscw.sourceAlias.String()) qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL) if err != nil { processError("NewQueryResultReaderForTablet failed: %v", err) return } defer qrr.Close() // process the data if err := vscw.processData(td, tableIndex, qrr, insertChannels, vscw.destinationPackCount, abort); err != nil { processError("QueryResultReader failed: %v", err) } vscw.tableStatus[tableIndex].threadDone() }(td, tableIndex, chunkIndex) } } sourceWaitGroup.Wait() for _, c := range insertChannels { close(c) } destinationWaitGroup.Wait() if firstError != nil { return firstError } // then create and populate the blp_checkpoint table if vscw.strategy.PopulateBlpCheckpoint { // get the current position from the source ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) status, err := vscw.wr.TabletManagerClient().SlaveStatus(ctx, vscw.sourceTablet) if err != nil { return err } cancel() queries := make([]string, 0, 4) queries = append(queries, binlogplayer.CreateBlpCheckpoint()...) flags := "" if vscw.strategy.DontStartBinlogPlayer { flags = binlogplayer.BLP_FLAG_DONT_START } queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags)) for _, tabletAlias := range vscw.destinationAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias) if err := runSqlCommands(vscw.wr, ti, queries, abort, disableBinLogs); err != nil { processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err) } }(vscw.destinationTablets[tabletAlias]) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Now we're done with data copy, update the shard's source info. if vscw.strategy.SkipSetSourceShards { vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.") } else { vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard) if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil { return fmt.Errorf("Failed to set source shards: %v", err) } } // And force a schema reload on all destination tablets. // The master tablet will end up starting filtered replication // at this point. for _, tabletAlias := range vscw.reloadAliases { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias) ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) if err := vscw.wr.TabletManagerClient().ReloadSchema(ctx, ti); err != nil { processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err) } cancel() }(vscw.reloadTablets[tabletAlias]) } destinationWaitGroup.Wait() return firstError }