// copy phase:
// - copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *LegacySplitCloneWorker) copy(ctx context.Context) error {
	scw.setState(WorkerStateCloneOffline)
	start := time.Now()
	defer func() {
		statsStateDurationsNs.Set(string(WorkerStateCloneOffline), time.Now().Sub(start).Nanoseconds())
	}()

	// get source schema from the first shard
	// TODO(alainjobart): for now, we assume the schema is compatible
	// on all source shards. Furthermore, we estimate the number of rows
	// in each source shard for each table to be about the same
	// (rowCount is used to estimate an ETA)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, scw.sourceAliases[0], nil, scw.excludeTables, false /* includeViews */)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(scw.sourceAliases[0]), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(scw.sourceAliases[0]))
	}
	for _, td := range sourceSchemaDefinition.TableDefinitions {
		if len(td.Columns) == 0 {
			return fmt.Errorf("schema for table %v has no columns", td.Name)
		}
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	scw.tableStatusList.initialize(sourceSchemaDefinition)

	// In parallel, setup the channels to send SQL data chunks to for each destination tablet:
	//
	// mu protects the context for cancelation, and firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex, si := range scw.destinationShards {
		// we create one channel per destination tablet. It
		// is sized to have a buffer of a maximum of
		// destinationWriterCount * 2 items, to hopefully
		// always have data. We then have
		// destinationWriterCount go routines reading from it.
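		// Each writer drains the channel through an executor and is accounted
		// for as one thread of the per-destination-shard throttler, hence the
		// deferred ThreadFinished call below.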
		insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2)

		go func(keyspace, shard string, insertChannel chan string) {
			for j := 0; j < scw.destinationWriterCount; j++ {
				destinationWaitGroup.Add(1)
				go func(threadID int) {
					defer destinationWaitGroup.Done()

					keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard)
					throttler := scw.destinationThrottlers[keyspaceAndShard]
					defer throttler.ThreadFinished(threadID)

					executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID)
					if err := executor.fetchLoop(ctx, insertChannel); err != nil {
						processError("executor.fetchLoop failed: %v", err)
					}
				}(j)
			}
		}(si.Keyspace(), si.ShardName(), insertChannels[shardIndex])
	}

	// read the vschema if needed
	var keyspaceSchema *vindexes.KeyspaceSchema
	if *useV3ReshardingMode {
		kschema, err := scw.wr.TopoServer().GetVSchema(ctx, scw.keyspace)
		if err != nil {
			return fmt.Errorf("cannot load VSchema for keyspace %v: %v", scw.keyspace, err)
		}
		if kschema == nil {
			return fmt.Errorf("no VSchema for keyspace %v", scw.keyspace)
		}

		keyspaceSchema, err = vindexes.BuildKeyspaceSchema(kschema, scw.keyspace)
		if err != nil {
			return fmt.Errorf("cannot build vschema for keyspace %v: %v", scw.keyspace, err)
		}
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	for shardIndex := range scw.sourceShards {
		sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
		for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
			var keyResolver keyspaceIDResolver
			if *useV3ReshardingMode {
				keyResolver, err = newV3ResolverFromTableDefinition(keyspaceSchema, td)
				if err != nil {
					return fmt.Errorf("cannot resolve v3 sharding keys for keyspace %v: %v", scw.keyspace, err)
				}
			} else {
				keyResolver, err = newV2Resolver(scw.keyspaceInfo, td)
				if err != nil {
					return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.keyspace, err)
				}
			}
			rowSplitter := NewRowSplitter(scw.destinationShards, keyResolver)

			chunks, err := generateChunks(ctx, scw.wr, scw.sourceTablets[shardIndex], td, scw.sourceReaderCount, defaultMinRowsPerChunk)
			if err != nil {
				return err
			}
			scw.tableStatusList.setThreadCount(tableIndex, len(chunks)-1)

			for _, c := range chunks {
				sourceWaitGroup.Add(1)
				go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) {
					defer sourceWaitGroup.Done()

					sema.Acquire()
					defer sema.Release()

					scw.tableStatusList.threadStarted(tableIndex)

					// Start streaming from the source tablets.
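					// The pipeline below is: stream rows for this chunk from the
					// source tablet, split each row by destination shard with
					// rowSplitter, and enqueue the resulting SQL statements on
					// insertChannels, where the destination writers pick them up.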
					tp := newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.sourceAliases[shardIndex])
					rr, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, false /* allowMultipleRetries */)
					if err != nil {
						processError("NewRestartableResultReader failed: %v", err)
						return
					}
					defer rr.Close(ctx)

					// process the data
					dbNames := make([]string, len(scw.destinationShards))
					for i, si := range scw.destinationShards {
						keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())
						dbNames[i] = scw.destinationDbNames[keyspaceAndShard]
					}
					if err := scw.processData(ctx, dbNames, td, tableIndex, rr, rowSplitter, insertChannels, scw.destinationPackCount); err != nil {
						processError("processData failed: %v", err)
					}
					scw.tableStatusList.threadDone(tableIndex)
				}(td, tableIndex, c)
			}
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		close(insertChannels[shardIndex])
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// then create and populate the blp_checkpoint table
	if scw.strategy.skipPopulateBlpCheckpoint {
		scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table")
	} else {
		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if scw.strategy.dontStartBinlogPlayer {
			flags = binlogplayer.BlpFlagDontStart
		}

		// get the current position from the sources
		for shardIndex := range scw.sourceShards {
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex])
			cancel()
			if err != nil {
				return err
			}

			queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags))
		}

		for _, si := range scw.destinationShards {
			destinationWaitGroup.Add(1)
			go func(keyspace, shard string) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Making and populating blp_checkpoint table")
				keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard)
				if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil {
					processError("blp_checkpoint queries failed: %v", err)
				}
			}(si.Keyspace(), si.ShardName())
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	// TODO(alainjobart) this is a superset, some shards may not
	// overlap, have to deal with this better (for N -> M splits
	// where both N>1 and M>1)
	if scw.strategy.skipSetSourceShards {
		scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
	} else {
		for _, si := range scw.destinationShards {
			scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName())
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.sourceAliases, nil)
			cancel()
			if err != nil {
				return fmt.Errorf("failed to set source shards: %v", err)
			}
		}
	}

	err = scw.findRefreshTargets(ctx)
	if err != nil {
		return fmt.Errorf("failed before refreshing state on destination tablets: %v", err)
	}

	// And force a state refresh (re-read topo) on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for shardIndex := range scw.destinationShards {
		for _, tabletAlias := range scw.refreshAliases[shardIndex] {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString())
				shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
				err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet)
				cancel()
				if err != nil {
					processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err)
				}
			}(scw.refreshTablets[shardIndex][*tabletAlias])
		}
	}
	destinationWaitGroup.Wait()
	return firstError
}
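
// clone is shared by the online and the offline phase: it streams rows from
// the source shards, compares them against the destination shards with
// RowDiffer2, and reconciles any differences through the insertChannels.
//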
// copy phase:
// - copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *SplitCloneWorker) clone(ctx context.Context, state StatusWorkerState) error {
	if state != WorkerStateCloneOnline && state != WorkerStateCloneOffline {
		panic(fmt.Sprintf("invalid state passed to clone(): %v", state))
	}

	scw.setState(state)
	start := time.Now()
	defer func() {
		statsStateDurationsNs.Set(string(state), time.Now().Sub(start).Nanoseconds())
	}()

	var firstSourceTablet *topodatapb.Tablet
	if state == WorkerStateCloneOffline {
		// Use the first source tablet which we took offline.
		firstSourceTablet = scw.sourceTablets[0]
	} else {
		// Pick any healthy serving source tablet.
		si := scw.sourceShards[0]
		tablets := scw.tsc.GetTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_RDONLY)
		if len(tablets) == 0 {
			// We fail fast on this problem and don't retry because at the start all tablets should be healthy.
			return fmt.Errorf("no healthy RDONLY tablet in source shard (%v) available (required to find out the schema)", topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()))
		}
		firstSourceTablet = tablets[0].Tablet
	}

	var statsCounters []*stats.Counters
	var tableStatusList *tableStatusList
	switch state {
	case WorkerStateCloneOnline:
		statsCounters = []*stats.Counters{statsOnlineInsertsCounters, statsOnlineUpdatesCounters, statsOnlineDeletesCounters, statsOnlineEqualRowsCounters}
		tableStatusList = scw.tableStatusListOnline
	case WorkerStateCloneOffline:
		statsCounters = []*stats.Counters{statsOfflineInsertsCounters, statsOfflineUpdatesCounters, statsOfflineDeletesCounters, statsOfflineEqualRowsCounters}
		tableStatusList = scw.tableStatusListOffline
	}

	// The throttlers exist only for the duration of this clone() call.
	// That means a SplitClone invocation with both online and offline phases
	// will create throttlers for each phase.
	if err := scw.createThrottlers(); err != nil {
		return err
	}
	defer scw.closeThrottlers()

	sourceSchemaDefinition, err := scw.getSourceSchema(ctx, firstSourceTablet)
	if err != nil {
		return err
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	tableStatusList.initialize(sourceSchemaDefinition)

	// In parallel, setup the channels to send SQL data chunks to for each destination tablet:
	//
	// mu protects the context for cancelation, and firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex, si := range scw.destinationShards {
		// We create one channel per destination tablet. It is sized to have a
		// buffer of a maximum of destinationWriterCount * 2 items, to hopefully
		// always have data. We then have destinationWriterCount go routines reading
		// from it.
		insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2)

		for j := 0; j < scw.destinationWriterCount; j++ {
			destinationWaitGroup.Add(1)
			go func(keyspace, shard string, insertChannel chan string, throttler *throttler.Throttler, threadID int) {
				defer destinationWaitGroup.Done()
				defer throttler.ThreadFinished(threadID)

				executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID)
				if err := executor.fetchLoop(ctx, insertChannel); err != nil {
					processError("executor.fetchLoop failed: %v", err)
				}
			}(si.Keyspace(), si.ShardName(), insertChannels[shardIndex], scw.getThrottler(si.Keyspace(), si.ShardName()), j)
		}
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		td = reorderColumnsPrimaryKeyFirst(td)

		keyResolver, err := scw.createKeyResolver(td)
		if err != nil {
			return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.destinationKeyspace, err)
		}

		// TODO(mberlin): We're going to chunk *all* source shards based on the MIN
		// and MAX values of the *first* source shard. Is this going to be a problem?
		chunks, err := generateChunks(ctx, scw.wr, firstSourceTablet, td, scw.chunkCount, scw.minRowsPerChunk)
		if err != nil {
			return err
		}
		tableStatusList.setThreadCount(tableIndex, len(chunks))

		for _, c := range chunks {
			sourceWaitGroup.Add(1)
			go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) {
				defer sourceWaitGroup.Done()
				errPrefix := fmt.Sprintf("table=%v chunk=%v", td.Name, chunk)

				// We need our own error per Go routine to avoid races.
				var err error

				sema.Acquire()
				defer sema.Release()

				tableStatusList.threadStarted(tableIndex)

				if state == WorkerStateCloneOnline {
					// Wait for enough healthy tablets (they might have become unhealthy
					// and their replication lag might have increased since we started.)
					if err := scw.waitForTablets(ctx, scw.sourceShards, *retryDuration); err != nil {
						processError("%v: No healthy source tablets found (gave up after %v): %v", errPrefix, *retryDuration, err)
						return
					}
				}

				// Set up readers for the diff. There will be one reader for every
				// source and destination shard.
				sourceReaders := make([]ResultReader, len(scw.sourceShards))
				destReaders := make([]ResultReader, len(scw.destinationShards))
				for shardIndex, si := range scw.sourceShards {
					var tp tabletProvider
					allowMultipleRetries := true
					if state == WorkerStateCloneOffline {
						tp = newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.offlineSourceAliases[shardIndex])
						// allowMultipleRetries is false to avoid that we'll keep retrying
						// on the same tablet alias for hours. This guards us against the
						// situation that an offline tablet gets restarted and serves again.
						// In that case we cannot use it because its replication is no
						// longer stopped at the same point as we took it offline initially.
						allowMultipleRetries = false
					} else {
						tp = newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName())
					}
					sourceResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, allowMultipleRetries)
					if err != nil {
						processError("%v: NewRestartableResultReader for source: %v failed: %v", errPrefix, tp.description(), err)
						return
					}
					defer sourceResultReader.Close()
					sourceReaders[shardIndex] = sourceResultReader
				}

				// Wait for enough healthy tablets (they might have become unhealthy
				// and their replication lag might have increased due to a previous
				// chunk pipeline.)
				if err := scw.waitForTablets(ctx, scw.destinationShards, *retryDuration); err != nil {
					processError("%v: No healthy destination tablets found (gave up after %v): %v", errPrefix, *retryDuration, err)
					return
				}

				for shardIndex, si := range scw.destinationShards {
					tp := newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName())
					destResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, true /* allowMultipleRetries */)
					if err != nil {
						processError("%v: NewRestartableResultReader for destination: %v failed: %v", errPrefix, tp.description(), err)
						return
					}
					defer destResultReader.Close()
					destReaders[shardIndex] = destResultReader
				}

				var sourceReader ResultReader
				var destReader ResultReader
				if len(sourceReaders) >= 2 {
					sourceReader, err = NewResultMerger(sourceReaders, len(td.PrimaryKeyColumns))
					if err != nil {
						processError("%v: NewResultMerger for source tablets failed: %v", errPrefix, err)
						return
					}
				} else {
					sourceReader = sourceReaders[0]
				}
				if len(destReaders) >= 2 {
					destReader, err = NewResultMerger(destReaders, len(td.PrimaryKeyColumns))
					if err != nil {
						processError("%v: NewResultMerger for destination tablets failed: %v", errPrefix, err)
						return
					}
				} else {
					destReader = destReaders[0]
				}

				dbNames := make([]string, len(scw.destinationShards))
				for i, si := range scw.destinationShards {
					keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())
					dbNames[i] = scw.destinationDbNames[keyspaceAndShard]
				}

				// Compare the data and reconcile any differences.
				differ, err := NewRowDiffer2(ctx, sourceReader, destReader, td, tableStatusList, tableIndex,
					scw.destinationShards, keyResolver, insertChannels, ctx.Done(), dbNames,
					scw.writeQueryMaxRows, scw.writeQueryMaxSize, scw.writeQueryMaxRowsDelete, statsCounters)
				if err != nil {
					processError("%v: NewRowDiffer2 failed: %v", errPrefix, err)
					return
				}
				// Ignore the diff report because all diffs should get reconciled.
				_ /* DiffReport */, err = differ.Diff()
				if err != nil {
					processError("%v: RowDiffer2 failed: %v", errPrefix, err)
					return
				}

				tableStatusList.threadDone(tableIndex)
			}(td, tableIndex, c)
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		close(insertChannels[shardIndex])
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	if state == WorkerStateCloneOffline {
		// Create and populate the blp_checkpoint table to give filtered replication
		// a starting point.
		if scw.strategy.skipPopulateBlpCheckpoint {
			scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table")
		} else {
			queries := make([]string, 0, 4)
			queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
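			// If the strategy asks us not to start the binlog player, record
			// that in the checkpoint flags so filtered replication stays
			// stopped until it is started explicitly.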
flags := "" if scw.strategy.dontStartBinlogPlayer { flags = binlogplayer.BlpFlagDontStart } // get the current position from the sources for shardIndex := range scw.sourceShards { shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex]) cancel() if err != nil { return err } // TODO(mberlin): Fill in scw.maxReplicationLag once the adapative // throttler is enabled by default. queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags)) } for _, si := range scw.destinationShards { destinationWaitGroup.Add(1) go func(keyspace, shard string) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Making and populating blp_checkpoint table") keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard) if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil { processError("blp_checkpoint queries failed: %v", err) } }(si.Keyspace(), si.ShardName()) } destinationWaitGroup.Wait() if firstError != nil { return firstError } } // Configure filtered replication by setting the SourceShard info. // The master tablets won't enable filtered replication (the binlog player) // until they re-read the topology due to a restart or a reload. // TODO(alainjobart) this is a superset, some shards may not // overlap, have to deal with this better (for N -> M splits // where both N>1 and M>1) if scw.strategy.skipSetSourceShards { scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.") } else { for _, si := range scw.destinationShards { scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v (tables: %v)", si.Keyspace(), si.ShardName(), scw.tables) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.offlineSourceAliases, scw.tables) cancel() if err != nil { return fmt.Errorf("failed to set source shards: %v", err) } } } // Force a state refresh (re-read topo) on all destination tablets. // The master tablet will end up starting filtered replication at this point. // // Find all tablets first, then refresh the state on each in parallel. err = scw.findRefreshTargets(ctx) if err != nil { return fmt.Errorf("failed before refreshing state on destination tablets: %v", err) } for shardIndex := range scw.destinationShards { for _, tabletAlias := range scw.refreshAliases[shardIndex] { destinationWaitGroup.Add(1) go func(ti *topo.TabletInfo) { defer destinationWaitGroup.Done() scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString()) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet) cancel() if err != nil { processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err) } }(scw.refreshTablets[shardIndex][*tabletAlias]) } } } // clonePhase == offline destinationWaitGroup.Wait() return firstError }