Example No. 1
// clone phase:
//	- copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (vscw *VerticalSplitCloneWorker) clone(ctx context.Context) error {
	vscw.setState(WorkerStateCloneOffline)
	start := time.Now()
	defer func() {
		statsStateDurationsNs.Set(string(WorkerStateCloneOffline), time.Since(start).Nanoseconds())
	}()

	// get source schema
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := vscw.wr.GetSchema(shortCtx, vscw.sourceAlias, vscw.tables, nil, true)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(vscw.sourceAlias), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter")
	}
	vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	vscw.tableStatusList.initialize(sourceSchemaDefinition)

	// In parallel, set up the channels to send SQL data chunks to
	// for each destination tablet.
	//
	// mu protects firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		vscw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	destinationWaitGroup := sync.WaitGroup{}

	// we create one channel for the destination tablet.  It
	// is sized to have a buffer of a maximum of
	// destinationWriterCount * 2 items, to hopefully
	// always have data. We then have
	// destinationWriterCount go routines reading from it.
	insertChannel := make(chan string, vscw.destinationWriterCount*2)
	// Set up the throttler for the destination shard.
	keyspaceAndShard := topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard)
	destinationThrottler, err := throttler.NewThrottler(
		keyspaceAndShard, "transactions", vscw.destinationWriterCount, vscw.maxTPS, throttler.ReplicationLagModuleDisabled)
	if err != nil {
		return fmt.Errorf("cannot instantiate throttler: %v", err)
	}
	for j := 0; j < vscw.destinationWriterCount; j++ {
		destinationWaitGroup.Add(1)
		go func(threadID int) {
			defer destinationWaitGroup.Done()
			defer destinationThrottler.ThreadFinished(threadID)

			executor := newExecutor(vscw.wr, vscw.tsc, destinationThrottler, vscw.destinationKeyspace, vscw.destinationShard, threadID)
			if err := executor.fetchLoop(ctx, insertChannel); err != nil {
				processError("executor.fetchLoop failed: %v", err)
			}
		}(j)
	}

	// Now for each table, read data chunks and send them to insertChannel
	sourceWaitGroup := sync.WaitGroup{}
	sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0)
	dbName := vscw.destinationDbNames[topoproto.KeyspaceShardString(vscw.destinationKeyspace, vscw.destinationShard)]
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == tmutils.TableView {
			continue
		}

		chunks, err := generateChunks(ctx, vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount)
		if err != nil {
			return err
		}
		vscw.tableStatusList.setThreadCount(tableIndex, len(chunks)-1)

		for _, c := range chunks {
			sourceWaitGroup.Add(1)
			go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) {
				defer sourceWaitGroup.Done()

				sema.Acquire()
				defer sema.Release()

				vscw.tableStatusList.threadStarted(tableIndex)

				// Start streaming from the source tablet.
				rr, err := NewRestartableResultReader(ctx, vscw.wr.Logger(), vscw.wr.TopoServer(), vscw.sourceAlias, td, chunk)
				if err != nil {
					processError("NewRestartableResultReader failed: %v", err)
					return
				}
				defer rr.Close()

				// process the data
				if err := vscw.processData(ctx, dbName, td, tableIndex, rr, insertChannel, vscw.destinationPackCount); err != nil {
					processError("ResultReader failed: %v", err)
				}
				vscw.tableStatusList.threadDone(tableIndex)
			}(td, tableIndex, c)
		}
	}
	sourceWaitGroup.Wait()

	close(insertChannel)
	destinationWaitGroup.Wait()
	// Stop Throttler.
	destinationThrottler.Close()
	if firstError != nil {
		return firstError
	}

	// then create and populate the blp_checkpoint table
	if vscw.strategy.skipPopulateBlpCheckpoint {
		vscw.wr.Logger().Infof("Skipping populating the blp_checkpoint table")
	} else {
		// get the current position from the source
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		status, err := vscw.wr.TabletManagerClient().SlaveStatus(shortCtx, vscw.sourceTablet)
		cancel()
		if err != nil {
			return err
		}

		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if vscw.strategy.dontStartBinlogPlayer {
			flags = binlogplayer.BlpFlagDontStart
		}
		queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, vscw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags))
		vscw.wr.Logger().Infof("Making and populating blp_checkpoint table")
		if err := runSQLCommands(ctx, vscw.wr, vscw.tsc, vscw.destinationKeyspace, vscw.destinationShard, dbName, queries); err != nil {
			processError("blp_checkpoint queries failed: %v", err)
		}
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	if vscw.strategy.skipSetSourceShards {
		vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.")
	} else {
		vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard)
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		err := vscw.wr.SetSourceShards(shortCtx, vscw.destinationKeyspace, vscw.destinationShard, []*topodatapb.TabletAlias{vscw.sourceAlias}, vscw.tables)
		cancel()
		if err != nil {
			return fmt.Errorf("failed to set source shards: %v", err)
		}
	}

	err = vscw.findRefreshTargets(ctx)
	if err != nil {
		return fmt.Errorf("failed before refreshing state on destination tablets: %v", err)
	}
	// And force a state refresh (re-read topo) on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for _, tabletAlias := range vscw.refreshAliases {
		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo) {
			defer destinationWaitGroup.Done()
			vscw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString())
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			err := vscw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet)
			cancel()
			if err != nil {
				processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err)
			}
		}(vscw.refreshTablets[*tabletAlias])
	}
	destinationWaitGroup.Wait()
	return firstError
}
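
The error handling in the clone phase above relies on a mutex-guarded "first error wins" collector that also cancels the shared context so every other goroutine stops early. A minimal, self-contained sketch of that pattern follows; all names in it (firstErrorRecorder, record, the worker loop) are illustrative and not part of the worker code:

package main

import (
	"context"
	"fmt"
	"sync"
)

// firstErrorRecorder keeps only the first reported error and cancels the
// shared context so the remaining goroutines can stop early.
type firstErrorRecorder struct {
	mu     sync.Mutex
	first  error
	cancel context.CancelFunc
}

func (r *firstErrorRecorder) record(format string, args ...interface{}) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.first == nil {
		r.first = fmt.Errorf(format, args...)
		r.cancel()
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	rec := &firstErrorRecorder{cancel: cancel}

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			select {
			case <-ctx.Done():
				// Another worker already failed; skip the remaining work.
				return
			default:
			}
			if id == 2 {
				rec.record("worker %d failed", id)
			}
		}(i)
	}
	wg.Wait()
	fmt.Println("first error:", rec.first)
}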
Example No. 2
// copy phase:
//	- copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (vscw *VerticalSplitCloneWorker) copy(ctx context.Context) error {
	vscw.setState(WorkerStateCopy)

	// get source schema
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := vscw.wr.GetSchema(shortCtx, vscw.sourceAlias, vscw.tables, nil, true)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", topo.TabletAliasString(vscw.sourceAlias), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter")
	}
	vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	vscw.Mu.Lock()
	vscw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions))
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i] = &tableStatus{
			name:     td.Name,
			rowCount: td.RowCount,
		}
	}
	vscw.startTime = time.Now()
	vscw.Mu.Unlock()

	// Count rows
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i].mu.Lock()
		if td.Type == myproto.TableBaseTable {
			vscw.tableStatus[i].rowCount = td.RowCount
		} else {
			vscw.tableStatus[i].isView = true
		}
		vscw.tableStatus[i].mu.Unlock()
	}

	// In parallel, set up the channels to send SQL data chunks to
	// for each destination tablet.
	//
	// mu protects firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancel = context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		vscw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancel()
		}
		mu.Unlock()
	}

	destinationWaitGroup := sync.WaitGroup{}

	// we create one channel for the destination tablet.  It
	// is sized to have a buffer of a maximum of
	// destinationWriterCount * 2 items, to hopefully
	// always have data. We then have
	// destinationWriterCount go routines reading from it.
	insertChannel := make(chan string, vscw.destinationWriterCount*2)

	go func(shardName string, insertChannel chan string) {
		for j := 0; j < vscw.destinationWriterCount; j++ {
			destinationWaitGroup.Add(1)
			go func() {
				defer destinationWaitGroup.Done()

				if err := executeFetchLoop(ctx, vscw.wr, vscw, shardName, insertChannel); err != nil {
					processError("executeFetchLoop failed: %v", err)
				}
			}()
		}
	}(vscw.destinationShard, insertChannel)

	// Now for each table, read data chunks and send them to insertChannel
	sourceWaitGroup := sync.WaitGroup{}
	sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0)
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == myproto.TableView {
			continue
		}

		chunks, err := FindChunks(ctx, vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount)
		if err != nil {
			return err
		}
		vscw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1)

		for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
			sourceWaitGroup.Add(1)
			go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
				defer sourceWaitGroup.Done()

				sema.Acquire()
				defer sema.Release()

				vscw.tableStatus[tableIndex].threadStarted()

				// build the query, and start the streaming
				selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, topo.TabletAliasString(vscw.sourceAlias))
				qrr, err := NewQueryResultReaderForTablet(ctx, vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL)
				if err != nil {
					processError("NewQueryResultReaderForTablet failed: %v", err)
					return
				}
				defer qrr.Close()

				// process the data
				if err := vscw.processData(td, tableIndex, qrr, insertChannel, vscw.destinationPackCount, ctx.Done()); err != nil {
					processError("QueryResultReader failed: %v", err)
				}
				vscw.tableStatus[tableIndex].threadDone()
			}(td, tableIndex, chunkIndex)
		}
	}
	sourceWaitGroup.Wait()

	close(insertChannel)
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// then create and populate the blp_checkpoint table
	if vscw.strategy.PopulateBlpCheckpoint {
		// get the current position from the source
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		status, err := vscw.wr.TabletManagerClient().SlaveStatus(shortCtx, vscw.sourceTablet)
		cancel()
		if err != nil {
			return err
		}

		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if vscw.strategy.DontStartBinlogPlayer {
			flags = binlogplayer.BlpFlagDontStart
		}
		queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
		destinationWaitGroup.Add(1)
		go func(shardName string) {
			defer destinationWaitGroup.Done()
			vscw.wr.Logger().Infof("Making and populating blp_checkpoint table")
			if err := runSQLCommands(ctx, vscw.wr, vscw, shardName, queries); err != nil {
				processError("blp_checkpoint queries failed: %v", err)
			}
		}(vscw.destinationShard)
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	if vscw.strategy.SkipSetSourceShards {
		vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.")
	} else {
		vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard)
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		err := vscw.wr.SetSourceShards(shortCtx, vscw.destinationKeyspace, vscw.destinationShard, []*pb.TabletAlias{vscw.sourceAlias}, vscw.tables)
		cancel()
		if err != nil {
			return fmt.Errorf("failed to set source shards: %v", err)
		}
	}

	err = vscw.findReloadTargets(ctx)
	if err != nil {
		return fmt.Errorf("failed before reloading schema on destination tablets: %v", err)
	}
	// And force a schema reload on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for _, tabletAlias := range vscw.reloadAliases {
		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo) {
			defer destinationWaitGroup.Done()
			vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.AliasString())
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			err := vscw.wr.TabletManagerClient().ReloadSchema(shortCtx, ti)
			cancel()
			if err != nil {
				processError("ReloadSchema failed on tablet %v: %v", ti.AliasString(), err)
			}
		}(vscw.reloadTablets[*tabletAlias])
	}
	destinationWaitGroup.Wait()
	return firstError
}
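
The copy phase above feeds generated INSERT statements into a single buffered channel and drains it with destinationWriterCount goroutines. A minimal sketch of that producer/consumer layout, assuming a fixed writerCount and using fmt.Printf in place of the real statement execution, could look like this:

package main

import (
	"fmt"
	"sync"
)

func main() {
	const writerCount = 3

	// Buffer of writerCount*2, mirroring the sizing used above, so producers
	// rarely block while the writer goroutines drain the channel.
	insertChannel := make(chan string, writerCount*2)

	var wg sync.WaitGroup
	for j := 0; j < writerCount; j++ {
		wg.Add(1)
		go func(threadID int) {
			defer wg.Done()
			for stmt := range insertChannel {
				// In the real worker this would execute the statement against
				// the destination shard; here we just print it.
				fmt.Printf("writer %d: %s\n", threadID, stmt)
			}
		}(j)
	}

	for i := 0; i < 10; i++ {
		insertChannel <- fmt.Sprintf("INSERT ... /* chunk %d */", i)
	}
	// Closing the channel is what lets the writers' range loops finish,
	// mirroring close(insertChannel) followed by destinationWaitGroup.Wait().
	close(insertChannel)
	wg.Wait()
}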
Example No. 3
// copy phase:
// - get schema on the source, filter tables
// - create tables on all destinations
// - copy the data
func (vscw *VerticalSplitCloneWorker) copy() error {
	vscw.setState(stateVSCCopy)

	// get source schema
	sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true)
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter")
	}
	vscw.wr.Logger().Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	vscw.mu.Lock()
	vscw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions))
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i].name = td.Name
		vscw.tableStatus[i].rowCount = td.RowCount
	}
	vscw.startTime = time.Now()
	vscw.mu.Unlock()

	// Create all the commands to create the destination schema:
	// - createDbCmds will create the database and the tables
	// - createViewCmds will create the views
	// - alterTablesCmds will modify the tables at the end if needed
	// (all need template substitution for {{.DatabaseName}})
	createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1)
	createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema)
	createViewCmds := make([]string, 0, 16)
	alterTablesCmds := make([]string, 0, 16)
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i].mu.Lock()
		if td.Type == myproto.TABLE_BASE_TABLE {
			create, alter, err := mysqlctl.MakeSplitCreateTableSql(vscw.wr.Logger(), td.Schema, "{{.DatabaseName}}", td.Name, vscw.strategy)
			if err != nil {
				return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err)
			}
			createDbCmds = append(createDbCmds, create)
			if alter != "" {
				alterTablesCmds = append(alterTablesCmds, alter)
			}
			vscw.tableStatus[i].state = "before table creation"
			vscw.tableStatus[i].rowCount = td.RowCount
		} else {
			createViewCmds = append(createViewCmds, td.Schema)
			vscw.tableStatus[i].state = "before view creation"
			vscw.tableStatus[i].rowCount = 0
		}
		vscw.tableStatus[i].mu.Unlock()
	}

	// For each destination tablet (in parallel):
	// - create the schema
	// - set up the channels to send SQL data chunks
	//
	// mu protects the abort channel for closing, and firstError
	mu := sync.Mutex{}
	abort := make(chan struct{})
	var firstError error

	processError := func(format string, args ...interface{}) {
		vscw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if abort != nil {
			close(abort)
			abort = nil
			firstError = fmt.Errorf(format, args...)
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(vscw.destinationAliases))
	destinationWaitGroup := sync.WaitGroup{}
	for i, tabletAlias := range vscw.destinationAliases {
		// we create one channel per destination tablet.  It
		// is sized to have a buffer of a maximum of
		// destinationWriterCount * 2 items, to hopefully
		// always have data. We then have
		// destinationWriterCount go routines reading from it.
		insertChannels[i] = make(chan string, vscw.destinationWriterCount*2)

		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo, insertChannel chan string) {
			defer destinationWaitGroup.Done()
			vscw.wr.Logger().Infof("Creating tables on tablet %v", ti.Alias)
			if err := runSqlCommands(vscw.wr, ti, createDbCmds, abort); err != nil {
				processError("createDbCmds failed: %v", err)
				return
			}
			if len(createViewCmds) > 0 {
				vscw.wr.Logger().Infof("Creating views on tablet %v", ti.Alias)
				if err := runSqlCommands(vscw.wr, ti, createViewCmds, abort); err != nil {
					processError("createViewCmds failed: %v", err)
					return
				}
			}
			for j := 0; j < vscw.destinationWriterCount; j++ {
				destinationWaitGroup.Add(1)
				go func() {
					defer destinationWaitGroup.Done()

					if err := executeFetchLoop(vscw.wr, ti, insertChannel, abort); err != nil {
						processError("executeFetchLoop failed: %v", err)
					}
				}()
			}
		}(vscw.destinationTablets[tabletAlias], insertChannels[i])
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0)
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == myproto.TABLE_VIEW {
			vscw.tableStatus[tableIndex].setState("view created")
			continue
		}

		vscw.tableStatus[tableIndex].setState("before copy")
		chunks, err := findChunks(vscw.wr, vscw.sourceTablet, td, vscw.minTableSizeForSplit, vscw.sourceReaderCount)
		if err != nil {
			return err
		}

		for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
			sourceWaitGroup.Add(1)
			go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
				defer sourceWaitGroup.Done()

				sema.Acquire()
				defer sema.Release()

				vscw.tableStatus[tableIndex].setState("started the copy")

				// build the query, and start the streaming
				selectSQL := buildSQLFromChunks(vscw.wr, td, chunks, chunkIndex, vscw.sourceAlias.String())
				qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL)
				if err != nil {
					processError("NewQueryResultReaderForTablet failed: %v", err)
					return
				}

				// process the data
				if err := vscw.processData(td, tableIndex, qrr, insertChannels, abort); err != nil {
					processError("QueryResultReader failed: %v", err)
				}
			}(td, tableIndex, chunkIndex)
		}
	}
	sourceWaitGroup.Wait()

	for _, c := range insertChannels {
		close(c)
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// do the post-copy alters if any
	if len(alterTablesCmds) > 0 {
		for _, tabletAlias := range vscw.destinationAliases {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				vscw.wr.Logger().Infof("Altering tables on tablet %v", ti.Alias)
				if err := runSqlCommands(vscw.wr, ti, alterTablesCmds, abort); err != nil {
					processError("alterTablesCmds failed on tablet %v: %v", ti.Alias, err)
				}
			}(vscw.destinationTablets[tabletAlias])
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// then create and populate the blp_checkpoint table
	if strings.Contains(vscw.strategy, "populateBlpCheckpoint") {
		// get the current position from the source
		status, err := vscw.wr.TabletManagerClient().SlaveStatus(vscw.sourceTablet, 30*time.Second)
		if err != nil {
			return err
		}

		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if strings.Contains(vscw.strategy, "dontStartBinlogPlayer") {
			flags = binlogplayer.BLP_FLAG_DONT_START
		}
		queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
		for _, tabletAlias := range vscw.destinationAliases {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				vscw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias)
				if err := runSqlCommands(vscw.wr, ti, queries, abort); err != nil {
					processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err)
				}
			}(vscw.destinationTablets[tabletAlias])
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	if strings.Contains(vscw.strategy, "skipSetSourceShards") {
		vscw.wr.Logger().Infof("Skipping setting SourceShard on destination shard.")
	} else {
		vscw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard)
		if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil {
			return fmt.Errorf("failed to set source shards: %v", err)
		}
	}

	// And force a schema reload on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for _, tabletAlias := range vscw.destinationAliases {
		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo) {
			defer destinationWaitGroup.Done()
			vscw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias)
			if err := vscw.wr.TabletManagerClient().ReloadSchema(ti, 30*time.Second); err != nil {
				processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err)
			}
		}(vscw.destinationTablets[tabletAlias])
	}
	destinationWaitGroup.Wait()
	return firstError
}
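
This older variant predates context cancellation and signals failure by closing an abort channel exactly once, guarded by a mutex and a nil check. A minimal sketch of that idiom, with purely illustrative names, might be:

package main

import (
	"fmt"
	"sync"
)

func main() {
	// abort is closed exactly once on the first error; the nil check under mu
	// is what prevents a second close (which would panic).
	mu := sync.Mutex{}
	abort := make(chan struct{})
	abortCh := abort // workers keep a stable reference to the channel
	var firstError error

	processError := func(format string, args ...interface{}) {
		mu.Lock()
		if abort != nil {
			close(abort)
			abort = nil
			firstError = fmt.Errorf(format, args...)
		}
		mu.Unlock()
	}

	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			select {
			case <-abortCh:
				return // another worker already reported an error
			default:
			}
			if id == 1 {
				processError("worker %d hit an error", id)
			}
		}(i)
	}
	wg.Wait()
	fmt.Println("first error:", firstError)
}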
Example No. 4
// copy phase:
//	- copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *SplitCloneWorker) copy(ctx context.Context) error {
	scw.setState(WorkerStateCopy)

	// get source schema from the first shard
	// TODO(alainjobart): for now, we assume the schema is compatible
	// on all source shards. Furthermore, we estimate the number of rows
	// in each source shard for each table to be about the same
	// (rowCount is used to estimate an ETA)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, scw.sourceAliases[0], nil, scw.excludeTables, true)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(scw.sourceAliases[0]), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(scw.sourceAliases[0]))
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	scw.Mu.Lock()
	scw.tableStatus = make([]*tableStatus, len(sourceSchemaDefinition.TableDefinitions))
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		scw.tableStatus[i] = &tableStatus{
			name:     td.Name,
			rowCount: td.RowCount * uint64(len(scw.sourceAliases)),
		}
	}
	scw.startTime = time.Now()
	scw.Mu.Unlock()

	// Find the column index for the sharding columns in all the databases, and count rows
	columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions))
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == myproto.TableBaseTable {
			// find the column to split on
			columnIndexes[tableIndex] = -1
			for i, name := range td.Columns {
				if name == scw.keyspaceInfo.ShardingColumnName {
					columnIndexes[tableIndex] = i
					break
				}
			}
			if columnIndexes[tableIndex] == -1 {
				return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName)
			}

			scw.tableStatus[tableIndex].mu.Lock()
			scw.tableStatus[tableIndex].rowCount = td.RowCount
			scw.tableStatus[tableIndex].mu.Unlock()
		} else {
			scw.tableStatus[tableIndex].mu.Lock()
			scw.tableStatus[tableIndex].isView = true
			scw.tableStatus[tableIndex].mu.Unlock()
		}
	}

	// In parallel, set up the channels to send SQL data chunks to for each destination tablet:
	//
	// mu protects the context for cancelation, and firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex, si := range scw.destinationShards {
		// we create one channel per destination tablet.  It
		// is sized to have a buffer of a maximum of
		// destinationWriterCount * 2 items, to hopefully
		// always have data. We then have
		// destinationWriterCount go routines reading from it.
		insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2)

		go func(shardName string, insertChannel chan string) {
			for j := 0; j < scw.destinationWriterCount; j++ {
				destinationWaitGroup.Add(1)
				go func() {
					defer destinationWaitGroup.Done()
					if err := executeFetchLoop(ctx, scw.wr, scw, shardName, insertChannel); err != nil {
						processError("executeFetchLoop failed: %v", err)
					}
				}()
			}
		}(si.ShardName(), insertChannels[shardIndex])
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	for shardIndex := range scw.sourceShards {
		sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
		for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
			if td.Type == myproto.TableView {
				continue
			}

			rowSplitter := NewRowSplitter(scw.destinationShards, key.ProtoToKeyspaceIdType(scw.keyspaceInfo.ShardingColumnType), columnIndexes[tableIndex])

			chunks, err := FindChunks(ctx, scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount)
			if err != nil {
				return err
			}
			scw.tableStatus[tableIndex].setThreadCount(len(chunks) - 1)

			for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
				sourceWaitGroup.Add(1)
				go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
					defer sourceWaitGroup.Done()

					sema.Acquire()
					defer sema.Release()

					scw.tableStatus[tableIndex].threadStarted()

					// build the query, and start the streaming
					selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String())
					qrr, err := NewQueryResultReaderForTablet(ctx, scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL)
					if err != nil {
						processError("NewQueryResultReaderForTablet failed: %v", err)
						return
					}
					defer qrr.Close()

					// process the data
					if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, scw.destinationPackCount, ctx.Done()); err != nil {
						processError("processData failed: %v", err)
					}
					scw.tableStatus[tableIndex].threadDone()
				}(td, tableIndex, chunkIndex)
			}
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		close(insertChannels[shardIndex])
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// then create and populate the blp_checkpoint table
	if scw.strategy.PopulateBlpCheckpoint {
		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if scw.strategy.DontStartBinlogPlayer {
			flags = binlogplayer.BlpFlagDontStart
		}

		// get the current position from the sources
		for shardIndex := range scw.sourceShards {
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex])
			cancel()
			if err != nil {
				return err
			}

			queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
		}

		for _, si := range scw.destinationShards {
			destinationWaitGroup.Add(1)
			go func(shardName string) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Making and populating blp_checkpoint table")
				if err := runSQLCommands(ctx, scw.wr, scw, shardName, queries); err != nil {
					processError("blp_checkpoint queries failed: %v", err)
				}
			}(si.ShardName())
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	// TODO(alainjobart) this is a superset, some shards may not
	// overlap, have to deal with this better (for N -> M splits
	// where both N>1 and M>1)
	if scw.strategy.SkipSetSourceShards {
		scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
	} else {
		for _, si := range scw.destinationShards {
			scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName())
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.sourceAliases, nil)
			cancel()
			if err != nil {
				return fmt.Errorf("failed to set source shards: %v", err)
			}
		}
	}

	err = scw.findReloadTargets(ctx)
	if err != nil {
		return fmt.Errorf("failed before reloading schema on destination tablets: %v", err)
	}
	// And force a schema reload on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for shardIndex := range scw.destinationShards {
		for _, tabletAlias := range scw.reloadAliases[shardIndex] {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.AliasString())
				shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
				err := scw.wr.TabletManagerClient().ReloadSchema(shortCtx, ti)
				cancel()
				if err != nil {
					processError("ReloadSchema failed on tablet %v: %v", ti.AliasString(), err)
				}
			}(scw.reloadTablets[shardIndex][*tabletAlias])
		}
	}
	destinationWaitGroup.Wait()
	return firstError
}
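
Chunk reads above are bounded by a counting semaphore (sync2.NewSemaphore) so that at most sourceReaderCount chunks stream from a source tablet at the same time. A rough equivalent using a buffered channel as the semaphore, assuming hard-coded counts, is sketched below:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const sourceReaderCount = 2 // at most 2 chunks are streamed concurrently
	const chunkCount = 6

	// A buffered channel works as a counting semaphore, standing in for the
	// sync2.Semaphore used by the workers above.
	sema := make(chan struct{}, sourceReaderCount)

	var wg sync.WaitGroup
	for c := 0; c < chunkCount; c++ {
		wg.Add(1)
		go func(chunk int) {
			defer wg.Done()

			sema <- struct{}{}        // Acquire
			defer func() { <-sema }() // Release

			// Simulate streaming one chunk from the source tablet.
			fmt.Printf("copying chunk %d\n", chunk)
			time.Sleep(10 * time.Millisecond)
		}(c)
	}
	wg.Wait()
}

Bounding the readers this way keeps the load on the source tablet roughly constant no matter how many chunks a large table is split into.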
Example No. 5
// MultiRestore is the main entry point for multi restore.
//
// We will either:
// - read from the network if sourceAddrs != nil
// - read from a disk snapshot if fromStoragePaths != nil
//
// The strategy is used as follows:
// - If it contains the string 'writeBinLogs' then we will also write
//   to the binary logs.
// - If it contains the command 'populateBlpCheckpoint' then we will
//   populate the blp_checkpoint table with master positions to start from
func (mysqld *Mysqld) MultiRestore(destinationDbName string, keyRanges []key.KeyRange, sourceAddrs []*url.URL, fromStoragePaths []string, snapshotConcurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) (err error) {
	writeBinLogs := strings.Contains(strategy, "writeBinLogs")

	var manifests []*SplitSnapshotManifest
	if sourceAddrs != nil {
		// get the manifests from the network
		manifests = make([]*SplitSnapshotManifest, len(sourceAddrs))
		rc := concurrency.NewResourceConstraint(fetchConcurrency)
		for i, sourceAddr := range sourceAddrs {
			rc.Add(1)
			go func(sourceAddr *url.URL, i int) {
				rc.Acquire()
				defer rc.ReleaseAndDone()
				if rc.HasErrors() {
					return
				}

				var sourceDbName string
				if len(sourceAddr.Path) < 2 { // "" or "/"
					sourceDbName = destinationDbName
				} else {
					sourceDbName = sourceAddr.Path[1:]
				}
				ssm, e := fetchSnapshotManifestWithRetry("http://"+sourceAddr.Host, sourceDbName, keyRanges[i], fetchRetryCount)
				manifests[i] = ssm
				rc.RecordError(e)
			}(sourceAddr, i)
		}
		if err = rc.Wait(); err != nil {
			return
		}
	} else {
		// get the manifests from the local snapshots
		manifests = make([]*SplitSnapshotManifest, len(fromStoragePaths))
		for i, fromStoragePath := range fromStoragePaths {
			var err error
			manifests[i], err = readSnapshotManifest(fromStoragePath)
			if err != nil {
				return err
			}
		}
	}

	if e := SanityCheckManifests(manifests); e != nil {
		return e
	}

	tempStoragePath := path.Join(mysqld.SnapshotDir, "multirestore", destinationDbName)

	// Start fresh
	if err = os.RemoveAll(tempStoragePath); err != nil {
		return
	}

	if err = os.MkdirAll(tempStoragePath, 0775); err != nil {
		return err
	}

	defer func() {
		if e := os.RemoveAll(tempStoragePath); e != nil {
			log.Errorf("error removing %v: %v", tempStoragePath, e)
		}

	}()

	// Handle our concurrency:
	// - fetchConcurrency tasks for network / decompress from disk
	// - insertTableConcurrency for table inserts from a file
	//   into an innodb table
	// - snapshotConcurrency tasks for table inserts / modify tables
	sems := make(map[string]*sync2.Semaphore, len(manifests[0].SchemaDefinition.TableDefinitions)+2)
	sems["net"] = sync2.NewSemaphore(fetchConcurrency, 0)
	sems["db"] = sync2.NewSemaphore(snapshotConcurrency, 0)

	// Store the alter table statements for after restore,
	// and how many jobs we're running on each table
	// TODO(alainjobart) the jobCount map is a bit weird. replace it
	// with a map of WaitGroups, initialized to the number of files
	// per table. Have extra go routines for the tables with auto_increment
	// to wait on the waitgroup, and apply the modify_table.
	postSql := make(map[string]string, len(manifests[0].SchemaDefinition.TableDefinitions))
	jobCount := make(map[string]*sync2.AtomicInt32)

	// Create the database (it's a good check to know if we're running
	// multirestore a second time too!)
	manifest := manifests[0] // I am assuming they all match
	createDatabase, e := fillStringTemplate(manifest.SchemaDefinition.DatabaseSchema, map[string]string{"DatabaseName": destinationDbName})
	if e != nil {
		return e
	}
	if createDatabase == "" {
		return fmt.Errorf("Empty create database statement")
	}

	createDbCmds := make([]string, 0, len(manifest.SchemaDefinition.TableDefinitions)+2)
	if !writeBinLogs {
		createDbCmds = append(createDbCmds, "SET sql_log_bin = OFF")
	}
	createDbCmds = append(createDbCmds, createDatabase)
	createDbCmds = append(createDbCmds, "USE `"+destinationDbName+"`")
	createViewCmds := make([]string, 0, 16)
	for _, td := range manifest.SchemaDefinition.TableDefinitions {
		if td.Type == proto.TABLE_BASE_TABLE {
			createDbCmd, alterTable, err := makeCreateTableSql(td.Schema, td.Name, strategy)
			if err != nil {
				return err
			}
			if alterTable != "" {
				postSql[td.Name] = alterTable
			}
			jobCount[td.Name] = new(sync2.AtomicInt32)
			createDbCmds = append(createDbCmds, createDbCmd)
			sems["table-"+td.Name] = sync2.NewSemaphore(insertTableConcurrency, 0)
		} else {
			// views are just created with the right db name
			// and no data will ever go in them. We create them
			// after all tables are created, as they will
			// probably depend on real tables.
			createViewCmd, err := fillStringTemplate(td.Schema, map[string]string{"DatabaseName": destinationDbName})
			if err != nil {
				return err
			}
			createViewCmds = append(createViewCmds, createViewCmd)
		}
	}
	createDbCmds = append(createDbCmds, createViewCmds...)
	if err = mysqld.ExecuteSuperQueryList(createDbCmds); err != nil {
		return
	}

	// compute how many jobs we will have
	for _, manifest := range manifests {
		for _, file := range manifest.Source.Files {
			jobCount[file.TableName].Add(1)
		}
	}

	loadDataInfile := `LOAD DATA INFILE '{{.TableInputPath}}' INTO TABLE {{.TableName}} CHARACTER SET binary FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\' LINES TERMINATED BY '\n' ({{.Columns}})`

	// fetch all the csv files, and apply them one at a time. Note
	// this might start many go routines, and they'll all be
	// waiting on the resource semaphores.
	mrc := concurrency.NewMultiResourceConstraint(sems)
	for manifestIndex, manifest := range manifests {
		if err = os.Mkdir(path.Join(tempStoragePath, manifest.Source.Addr), 0775); err != nil {
			return err
		}

		for i := range manifest.Source.Files {
			lsf := localSnapshotFile{manifest: manifest, file: &manifest.Source.Files[i], basePath: tempStoragePath}
			mrc.Add(1)
			go func(manifestIndex, i int) {
				defer mrc.Done()

				// compute a few things now, so if we can't we
				// don't take resources:
				// - get the schema
				td, ok := manifest.SchemaDefinition.GetTable(lsf.tableName())
				if !ok {
					mrc.RecordError(fmt.Errorf("No table named %v in schema", lsf.tableName()))
					return
				}

				// - get the load data statement
				queryParams := map[string]string{
					"TableInputPath": lsf.filename(),
					"TableName":      lsf.tableName(),
					"Columns":        strings.Join(td.Columns, ", "),
				}
				loadStatement, e := fillStringTemplate(loadDataInfile, queryParams)
				if e != nil {
					mrc.RecordError(e)
					return
				}

				// get the file, using the 'net' resource
				mrc.Acquire("net")
				if mrc.HasErrors() {
					mrc.Release("net")
					return
				}
				if sourceAddrs == nil {
					e = uncompressLocalFile(path.Join(fromStoragePaths[manifestIndex], path.Base(lsf.file.Path)), lsf.file.Hash, lsf.filename())
				} else {
					e = fetchFileWithRetry(lsf.url(), lsf.file.Hash, lsf.filename(), fetchRetryCount)
				}
				mrc.Release("net")
				if e != nil {
					mrc.RecordError(e)
					return
				}
				defer os.Remove(lsf.filename())

				// acquire the table lock (we do this first
				// so we maximize access to db. Otherwise
				// if 8 threads had gotten the db lock but
				// were writing to the same table, only one
				// load would go at once)
				tableLockName := "table-" + lsf.tableName()
				mrc.Acquire(tableLockName)
				defer func() {
					mrc.Release(tableLockName)
				}()
				if mrc.HasErrors() {
					return
				}

				// acquire the db lock
				mrc.Acquire("db")
				defer func() {
					mrc.Release("db")
				}()
				if mrc.HasErrors() {
					return
				}

				// load the data in
				queries := buildQueryList(destinationDbName, loadStatement, writeBinLogs)
				e = mysqld.ExecuteSuperQueryList(queries)
				if e != nil {
					mrc.RecordError(e)
					return
				}

				// if we're running the last insert,
				// potentially re-add the auto-increments
				remainingInserts := jobCount[lsf.tableName()].Add(-1)
				if remainingInserts == 0 && postSql[lsf.tableName()] != "" {
					queries = buildQueryList(destinationDbName, postSql[lsf.tableName()], writeBinLogs)
					e = mysqld.ExecuteSuperQueryList(queries)
					if e != nil {
						mrc.RecordError(e)
						return
					}
				}
			}(manifestIndex, i)
		}
	}

	if err = mrc.Wait(); err != nil {
		return err
	}

	// populate blp_checkpoint table if we want to
	if strings.Index(strategy, "populateBlpCheckpoint") != -1 {
		queries := make([]string, 0, 4)
		if !writeBinLogs {
			queries = append(queries, "SET sql_log_bin = OFF")
			queries = append(queries, "SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED")
		}
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		for manifestIndex, manifest := range manifests {
			queries = append(queries, binlogplayer.PopulateBlpCheckpoint(manifestIndex, manifest.Source.MasterState.ReplicationPosition.MasterLogGroupId, time.Now().Unix()))
		}
		if err = mysqld.ExecuteSuperQueryList(queries); err != nil {
			return err
		}
	}
	return nil
}
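
MultiRestore builds each LOAD DATA INFILE statement by filling a text template with the table name, input path, and column list. A plausible stand-in for fillStringTemplate based on text/template, with hypothetical table and path values, could look like this:

package main

import (
	"bytes"
	"fmt"
	"strings"
	"text/template"
)

// fillStringTemplate is a stand-in for the helper used above: it renders a
// text/template against a map of string parameters.
func fillStringTemplate(tmpl string, vars interface{}) (string, error) {
	t, err := template.New("").Parse(tmpl)
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, vars); err != nil {
		return "", err
	}
	return buf.String(), nil
}

func main() {
	loadDataInfile := `LOAD DATA INFILE '{{.TableInputPath}}' INTO TABLE {{.TableName}} CHARACTER SET binary FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\' LINES TERMINATED BY '\n' ({{.Columns}})`

	// Hypothetical parameters for a single CSV file of one table.
	queryParams := map[string]string{
		"TableInputPath": "/vt/snapshot/multirestore/test_table.0.csv",
		"TableName":      "test_table",
		"Columns":        strings.Join([]string{"id", "msg", "keyspace_id"}, ", "),
	}
	loadStatement, err := fillStringTemplate(loadDataInfile, queryParams)
	if err != nil {
		panic(err)
	}
	fmt.Println(loadStatement)
}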
Example No. 6
// copy phase:
//	- copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *LegacySplitCloneWorker) copy(ctx context.Context) error {
	scw.setState(WorkerStateCloneOffline)
	start := time.Now()
	defer func() {
		statsStateDurationsNs.Set(string(WorkerStateCloneOffline), time.Since(start).Nanoseconds())
	}()

	// get source schema from the first shard
	// TODO(alainjobart): for now, we assume the schema is compatible
	// on all source shards. Furthermore, we estimate the number of rows
	// in each source shard for each table to be about the same
	// (rowCount is used to estimate an ETA)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, scw.sourceAliases[0], nil, scw.excludeTables, false /* includeViews */)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(scw.sourceAliases[0]), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(scw.sourceAliases[0]))
	}
	for _, td := range sourceSchemaDefinition.TableDefinitions {
		if len(td.Columns) == 0 {
			return fmt.Errorf("schema for table %v has no columns", td.Name)
		}
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	scw.tableStatusList.initialize(sourceSchemaDefinition)

	// In parallel, set up the channels to send SQL data chunks to for each destination tablet:
	//
	// mu protects the context for cancelation, and firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex, si := range scw.destinationShards {
		// we create one channel per destination tablet.  It
		// is sized to have a buffer of a maximum of
		// destinationWriterCount * 2 items, to hopefully
		// always have data. We then have
		// destinationWriterCount go routines reading from it.
		insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2)

		go func(keyspace, shard string, insertChannel chan string) {
			for j := 0; j < scw.destinationWriterCount; j++ {
				destinationWaitGroup.Add(1)
				go func(threadID int) {
					defer destinationWaitGroup.Done()

					keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard)
					throttler := scw.destinationThrottlers[keyspaceAndShard]
					defer throttler.ThreadFinished(threadID)

					executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID)
					if err := executor.fetchLoop(ctx, insertChannel); err != nil {
						processError("executor.fetchLoop failed: %v", err)
					}
				}(j)
			}
		}(si.Keyspace(), si.ShardName(), insertChannels[shardIndex])
	}

	// read the vschema if needed
	var keyspaceSchema *vindexes.KeyspaceSchema
	if *useV3ReshardingMode {
		kschema, err := scw.wr.TopoServer().GetVSchema(ctx, scw.keyspace)
		if err != nil {
			return fmt.Errorf("cannot load VSchema for keyspace %v: %v", scw.keyspace, err)
		}
		if kschema == nil {
			return fmt.Errorf("no VSchema for keyspace %v", scw.keyspace)
		}

		keyspaceSchema, err = vindexes.BuildKeyspaceSchema(kschema, scw.keyspace)
		if err != nil {
			return fmt.Errorf("cannot build vschema for keyspace %v: %v", scw.keyspace, err)
		}
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	for shardIndex := range scw.sourceShards {
		sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
		for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
			var keyResolver keyspaceIDResolver
			if *useV3ReshardingMode {
				keyResolver, err = newV3ResolverFromTableDefinition(keyspaceSchema, td)
				if err != nil {
					return fmt.Errorf("cannot resolve v3 sharding keys for keyspace %v: %v", scw.keyspace, err)
				}
			} else {
				keyResolver, err = newV2Resolver(scw.keyspaceInfo, td)
				if err != nil {
					return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.keyspace, err)
				}
			}
			rowSplitter := NewRowSplitter(scw.destinationShards, keyResolver)

			chunks, err := generateChunks(ctx, scw.wr, scw.sourceTablets[shardIndex], td, scw.sourceReaderCount, defaultMinRowsPerChunk)
			if err != nil {
				return err
			}
			scw.tableStatusList.setThreadCount(tableIndex, len(chunks)-1)

			for _, c := range chunks {
				sourceWaitGroup.Add(1)
				go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) {
					defer sourceWaitGroup.Done()

					sema.Acquire()
					defer sema.Release()

					scw.tableStatusList.threadStarted(tableIndex)

					// Start streaming from the source tablets.
					tp := newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.sourceAliases[shardIndex])
					rr, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, false /* allowMultipleRetries */)
					if err != nil {
						processError("NewRestartableResultReader failed: %v", err)
						return
					}
					defer rr.Close(ctx)

					// process the data
					dbNames := make([]string, len(scw.destinationShards))
					for i, si := range scw.destinationShards {
						keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())
						dbNames[i] = scw.destinationDbNames[keyspaceAndShard]
					}
					if err := scw.processData(ctx, dbNames, td, tableIndex, rr, rowSplitter, insertChannels, scw.destinationPackCount); err != nil {
						processError("processData failed: %v", err)
					}
					scw.tableStatusList.threadDone(tableIndex)
				}(td, tableIndex, c)
			}
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		close(insertChannels[shardIndex])
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// then create and populate the blp_checkpoint table
	if scw.strategy.skipPopulateBlpCheckpoint {
		scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table")
	} else {
		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if scw.strategy.dontStartBinlogPlayer {
			flags = binlogplayer.BlpFlagDontStart
		}

		// get the current position from the sources
		for shardIndex := range scw.sourceShards {
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex])
			cancel()
			if err != nil {
				return err
			}

			queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags))
		}

		for _, si := range scw.destinationShards {
			destinationWaitGroup.Add(1)
			go func(keyspace, shard string) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Making and populating blp_checkpoint table")
				keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard)
				if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil {
					processError("blp_checkpoint queries failed: %v", err)
				}
			}(si.Keyspace(), si.ShardName())
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	// TODO(alainjobart) this is a superset, some shards may not
	// overlap, have to deal with this better (for N -> M splits
	// where both N>1 and M>1)
	if scw.strategy.skipSetSourceShards {
		scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
	} else {
		for _, si := range scw.destinationShards {
			scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName())
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.sourceAliases, nil)
			cancel()
			if err != nil {
				return fmt.Errorf("failed to set source shards: %v", err)
			}
		}
	}

	err = scw.findRefreshTargets(ctx)
	if err != nil {
		return fmt.Errorf("failed before refreshing state on destination tablets: %v", err)
	}
	// And force a state refresh (re-read topo) on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for shardIndex := range scw.destinationShards {
		for _, tabletAlias := range scw.refreshAliases[shardIndex] {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString())
				shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
				err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet)
				cancel()
				if err != nil {
					processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err)
				}
			}(scw.refreshTablets[shardIndex][*tabletAlias])
		}
	}
	destinationWaitGroup.Wait()
	return firstError
}
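
Every remote action above is wrapped in a short-lived context derived with context.WithTimeout(ctx, *remoteActionsTimeout) and released with cancel() right after the call returns. The following sketch shows that per-RPC timeout pattern with a stand-in slowRemoteCall and an assumed 50ms timeout:

package main

import (
	"context"
	"fmt"
	"time"
)

// slowRemoteCall stands in for a tablet manager RPC such as SlaveStatus or
// RefreshState; the real calls take a context and honor its deadline.
func slowRemoteCall(ctx context.Context) error {
	select {
	case <-time.After(200 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	remoteActionsTimeout := 50 * time.Millisecond

	// Wrap the remote action in a short-lived context and call cancel()
	// immediately after the RPC returns to free its resources.
	shortCtx, cancel := context.WithTimeout(context.Background(), remoteActionsTimeout)
	err := slowRemoteCall(shortCtx)
	cancel()
	if err != nil {
		fmt.Println("remote action failed:", err)
		return
	}
	fmt.Println("remote action succeeded")
}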
Example No. 7
// copy phase:
//	- copy the data from source tablets to destination masters (with replication on)
// Assumes that the schema has already been created on each destination tablet
// (probably from vtctl's CopySchemaShard)
func (scw *SplitCloneWorker) clone(ctx context.Context, state StatusWorkerState) error {
	if state != WorkerStateCloneOnline && state != WorkerStateCloneOffline {
		panic(fmt.Sprintf("invalid state passed to clone(): %v", state))
	}
	scw.setState(state)
	start := time.Now()
	defer func() {
		statsStateDurationsNs.Set(string(state), time.Since(start).Nanoseconds())
	}()

	var firstSourceTablet *topodatapb.Tablet
	if state == WorkerStateCloneOffline {
		// Use the first source tablet which we took offline.
		firstSourceTablet = scw.sourceTablets[0]
	} else {
		// Pick any healthy serving source tablet.
		si := scw.sourceShards[0]
		tablets := scw.tsc.GetTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_RDONLY)
		if len(tablets) == 0 {
			// We fail fast on this problem and don't retry because at the start all tablets should be healthy.
			return fmt.Errorf("no healthy RDONLY tablet in source shard (%v) available (required to find out the schema)", topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName()))
		}
		firstSourceTablet = tablets[0].Tablet
	}
	var statsCounters []*stats.Counters
	var tableStatusList *tableStatusList
	switch state {
	case WorkerStateCloneOnline:
		statsCounters = []*stats.Counters{statsOnlineInsertsCounters, statsOnlineUpdatesCounters, statsOnlineDeletesCounters, statsOnlineEqualRowsCounters}
		tableStatusList = scw.tableStatusListOnline
	case WorkerStateCloneOffline:
		statsCounters = []*stats.Counters{statsOfflineInsertsCounters, statsOfflineUpdatesCounters, statsOfflineDeletesCounters, statsOfflineEqualRowsCounters}
		tableStatusList = scw.tableStatusListOffline
	}

	// The throttlers exist only for the duration of this clone() call.
	// That means a SplitClone invocation with both online and offline phases
	// will create throttlers for each phase.
	if err := scw.createThrottlers(); err != nil {
		return err
	}
	defer scw.closeThrottlers()

	sourceSchemaDefinition, err := scw.getSourceSchema(ctx, firstSourceTablet)
	if err != nil {
		return err
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	tableStatusList.initialize(sourceSchemaDefinition)

	// In parallel, set up the channels to send SQL data chunks to for each destination tablet:
	//
	// mu protects the context for cancelation, and firstError
	mu := sync.Mutex{}
	var firstError error

	ctx, cancelCopy := context.WithCancel(ctx)
	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if firstError == nil {
			firstError = fmt.Errorf(format, args...)
			cancelCopy()
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex, si := range scw.destinationShards {
		// We create one channel per destination tablet. It is sized to have a
		// buffer of a maximum of destinationWriterCount * 2 items, to hopefully
		// always have data. We then have destinationWriterCount go routines reading
		// from it.
		insertChannels[shardIndex] = make(chan string, scw.destinationWriterCount*2)

		for j := 0; j < scw.destinationWriterCount; j++ {
			destinationWaitGroup.Add(1)
			go func(keyspace, shard string, insertChannel chan string, throttler *throttler.Throttler, threadID int) {
				defer destinationWaitGroup.Done()
				defer throttler.ThreadFinished(threadID)

				executor := newExecutor(scw.wr, scw.tsc, throttler, keyspace, shard, threadID)
				if err := executor.fetchLoop(ctx, insertChannel); err != nil {
					processError("executor.fetchLoop failed: %v", err)
				}
			}(si.Keyspace(), si.ShardName(), insertChannels[shardIndex], scw.getThrottler(si.Keyspace(), si.ShardName()), j)
		}
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		td = reorderColumnsPrimaryKeyFirst(td)

		keyResolver, err := scw.createKeyResolver(td)
		if err != nil {
			return fmt.Errorf("cannot resolve sharding keys for keyspace %v: %v", scw.destinationKeyspace, err)
		}

		// TODO(mberlin): We're going to chunk *all* source shards based on the MIN
		// and MAX values of the *first* source shard. Is this going to be a problem?
		chunks, err := generateChunks(ctx, scw.wr, firstSourceTablet, td, scw.chunkCount, scw.minRowsPerChunk)
		if err != nil {
			return err
		}
		tableStatusList.setThreadCount(tableIndex, len(chunks))

		for _, c := range chunks {
			sourceWaitGroup.Add(1)
			go func(td *tabletmanagerdatapb.TableDefinition, tableIndex int, chunk chunk) {
				defer sourceWaitGroup.Done()
				errPrefix := fmt.Sprintf("table=%v chunk=%v", td.Name, chunk)

				// We need our own error per Go routine to avoid races.
				var err error

				sema.Acquire()
				defer sema.Release()

				tableStatusList.threadStarted(tableIndex)

				if state == WorkerStateCloneOnline {
					// Wait for enough healthy tablets (they might have become unhealthy
					// and their replication lag might have increased since we started.)
					if err := scw.waitForTablets(ctx, scw.sourceShards, *retryDuration); err != nil {
						processError("%v: No healthy source tablets found (gave up after %v): %v", errPrefix, *retryDuration, err)
						return
					}
				}

				// Set up readers for the diff. There will be one reader for every
				// source and destination shard.
				sourceReaders := make([]ResultReader, len(scw.sourceShards))
				destReaders := make([]ResultReader, len(scw.destinationShards))
				for shardIndex, si := range scw.sourceShards {
					var tp tabletProvider
					allowMultipleRetries := true
					if state == WorkerStateCloneOffline {
						tp = newSingleTabletProvider(ctx, scw.wr.TopoServer(), scw.offlineSourceAliases[shardIndex])
						// allowMultipleRetries is false to avoid that we'll keep retrying
						// on the same tablet alias for hours. This guards us against the
						// situation that an offline tablet gets restarted and serves again.
						// In that case we cannot use it because its replication is no
						// longer stopped at the same point as we took it offline initially.
						allowMultipleRetries = false
					} else {
						tp = newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName())
					}
					sourceResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, allowMultipleRetries)
					if err != nil {
						processError("%v: NewRestartableResultReader for source: %v failed: %v", errPrefix, tp.description(), err)
						return
					}
					defer sourceResultReader.Close()
					sourceReaders[shardIndex] = sourceResultReader
				}

				// Wait for enough healthy tablets (they might have become unhealthy
				// and their replication lag might have increased due to a previous
				// chunk pipeline.)
				if err := scw.waitForTablets(ctx, scw.destinationShards, *retryDuration); err != nil {
					processError("%v: No healthy destination tablets found (gave up after %v): ", errPrefix, *retryDuration, err)
					return
				}

				for shardIndex, si := range scw.destinationShards {
					tp := newShardTabletProvider(scw.tsc, scw.tabletTracker, si.Keyspace(), si.ShardName())
					destResultReader, err := NewRestartableResultReader(ctx, scw.wr.Logger(), tp, td, chunk, true /* allowMultipleRetries */)
					if err != nil {
						processError("%v: NewRestartableResultReader for destination: %v failed: %v", errPrefix, tp.description(), err)
						return
					}
					defer destResultReader.Close()
					destReaders[shardIndex] = destResultReader
				}

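				// Merge the per-shard readers into a single, primary-key-ordered
				// stream when there is more than one shard on either side.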
				var sourceReader ResultReader
				var destReader ResultReader
				if len(sourceReaders) >= 2 {
					sourceReader, err = NewResultMerger(sourceReaders, len(td.PrimaryKeyColumns))
					if err != nil {
						processError("%v: NewResultMerger for source tablets failed: %v", errPrefix, err)
						return
					}
				} else {
					sourceReader = sourceReaders[0]
				}
				if len(destReaders) >= 2 {
					destReader, err = NewResultMerger(destReaders, len(td.PrimaryKeyColumns))
					if err != nil {
						processError("%v: NewResultMerger for destination tablets failed: %v", errPrefix, err)
						return
					}
				} else {
					destReader = destReaders[0]
				}

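				// Map each destination shard to its database name; RowDiffer2 needs
				// these to qualify the reconciliation queries it generates.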
				dbNames := make([]string, len(scw.destinationShards))
				for i, si := range scw.destinationShards {
					keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())
					dbNames[i] = scw.destinationDbNames[keyspaceAndShard]
				}
				// Compare the data and reconcile any differences.
				differ, err := NewRowDiffer2(ctx, sourceReader, destReader, td, tableStatusList, tableIndex,
					scw.destinationShards, keyResolver,
					insertChannels, ctx.Done(), dbNames, scw.writeQueryMaxRows, scw.writeQueryMaxSize, scw.writeQueryMaxRowsDelete, statsCounters)
				if err != nil {
					processError("%v: NewRowDiffer2 failed: %v", errPrefix, err)
					return
				}
				// Ignore the diff report because all diffs should get reconciled.
				_ /* DiffReport */, err = differ.Diff()
				if err != nil {
					processError("%v: RowDiffer2 failed: %v", errPrefix, err)
					return
				}

				tableStatusList.threadDone(tableIndex)
			}(td, tableIndex, c)
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		close(insertChannels[shardIndex])
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	if state == WorkerStateCloneOffline {
		// Create and populate the blp_checkpoint table to give filtered replication
		// a starting point.
		if scw.strategy.skipPopulateBlpCheckpoint {
			scw.wr.Logger().Infof("Skipping populating the blp_checkpoint table")
		} else {
			queries := make([]string, 0, 4)
			queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
			flags := ""
			if scw.strategy.dontStartBinlogPlayer {
				flags = binlogplayer.BlpFlagDontStart
			}

			// get the current position from the sources
			for shardIndex := range scw.sourceShards {
				shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
				status, err := scw.wr.TabletManagerClient().SlaveStatus(shortCtx, scw.sourceTablets[shardIndex])
				cancel()
				if err != nil {
					return err
				}

				// TODO(mberlin): Fill in scw.maxReplicationLag once the adaptive
				//                throttler is enabled by default.
				queries = append(queries, binlogplayer.PopulateBlpCheckpoint(uint32(shardIndex), status.Position, scw.maxTPS, throttler.ReplicationLagModuleDisabled, time.Now().Unix(), flags))
			}

			for _, si := range scw.destinationShards {
				destinationWaitGroup.Add(1)
				go func(keyspace, shard string) {
					defer destinationWaitGroup.Done()
					scw.wr.Logger().Infof("Making and populating blp_checkpoint table")
					keyspaceAndShard := topoproto.KeyspaceShardString(keyspace, shard)
					if err := runSQLCommands(ctx, scw.wr, scw.tsc, keyspace, shard, scw.destinationDbNames[keyspaceAndShard], queries); err != nil {
						processError("blp_checkpoint queries failed: %v", err)
					}
				}(si.Keyspace(), si.ShardName())
			}
			destinationWaitGroup.Wait()
			if firstError != nil {
				return firstError
			}
		}

		// Configure filtered replication by setting the SourceShard info.
		// The master tablets won't enable filtered replication (the binlog player)
		//  until they re-read the topology due to a restart or a reload.
		// TODO(alainjobart) this is a superset, some shards may not
		// overlap, have to deal with this better (for N -> M splits
		// where both N>1 and M>1)
		if scw.strategy.skipSetSourceShards {
			scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
		} else {
			for _, si := range scw.destinationShards {
				scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v (tables: %v)", si.Keyspace(), si.ShardName(), scw.tables)
				shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
				err := scw.wr.SetSourceShards(shortCtx, si.Keyspace(), si.ShardName(), scw.offlineSourceAliases, scw.tables)
				cancel()
				if err != nil {
					return fmt.Errorf("failed to set source shards: %v", err)
				}
			}
		}

		// Force a state refresh (re-read topo) on all destination tablets.
		// The master tablet will end up starting filtered replication at this point.
		//
		// Find all tablets first, then refresh the state on each in parallel.
		err = scw.findRefreshTargets(ctx)
		if err != nil {
			return fmt.Errorf("failed before refreshing state on destination tablets: %v", err)
		}
		for shardIndex := range scw.destinationShards {
			for _, tabletAlias := range scw.refreshAliases[shardIndex] {
				destinationWaitGroup.Add(1)
				go func(ti *topo.TabletInfo) {
					defer destinationWaitGroup.Done()
					scw.wr.Logger().Infof("Refreshing state on tablet %v", ti.AliasString())
					shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
					err := scw.wr.TabletManagerClient().RefreshState(shortCtx, ti.Tablet)
					cancel()
					if err != nil {
						processError("RefreshState failed on tablet %v: %v", ti.AliasString(), err)
					}
				}(scw.refreshTablets[shardIndex][*tabletAlias])
			}
		}
	} // clonePhase == offline

	destinationWaitGroup.Wait()
	return firstError
}
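
The clone phase above uses a fan-out/fan-in shape that repeats throughout these examples: a buffered channel feeds a fixed pool of writer goroutines, a WaitGroup tracks them, and a mutex-guarded firstError plus context cancellation stop every worker after the first failure. Below is a minimal, self-contained sketch of that shape; the names (runPool, handle) are illustrative only and not part of the Vitess worker API.

package main

import (
	"context"
	"fmt"
	"sync"
)

// runPool fans items out to workerCount goroutines and returns the first
// error any worker hit. It mirrors the insertChannel/processError pattern
// used by the clone workers above.
func runPool(ctx context.Context, items []string, workerCount int, handle func(string) error) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	var (
		mu         sync.Mutex
		firstError error
	)
	processError := func(err error) {
		mu.Lock()
		if firstError == nil {
			firstError = err
			cancel() // tell every worker to stop early
		}
		mu.Unlock()
	}

	// Buffered so producers rarely block while the workers drain the channel.
	work := make(chan string, workerCount*2)
	var wg sync.WaitGroup
	for i := 0; i < workerCount; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range work {
				if ctx.Err() != nil {
					return
				}
				if err := handle(item); err != nil {
					processError(err)
					return
				}
			}
		}()
	}

producer:
	for _, it := range items {
		select {
		case work <- it:
		case <-ctx.Done():
			break producer
		}
	}
	close(work)
	wg.Wait()
	return firstError
}

func main() {
	err := runPool(context.Background(), []string{"a", "b", "c"}, 2, func(s string) error {
		fmt.Println("handled", s)
		return nil
	})
	fmt.Println("firstError:", err)
}
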
Ejemplo n.º 8
0
// copy phase:
// - get schema on the sources, filter tables
// - create tables on all destinations
// - copy the data
func (scw *SplitCloneWorker) copy() error {
	scw.setState(stateSCCopy)

	// get source schema from the first shard
	// TODO(alainjobart): for now, we assume the schema is compatible
	// on all source shards. Furthermore, we estimate the number of rows
	// in each source shard for each table to be about the same
	// (rowCount is used to estimate an ETA)
	sourceSchemaDefinition, err := scw.wr.GetSchema(scw.sourceAliases[0], nil, scw.excludeTables, true)
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", scw.sourceAliases[0], err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter in tablet %v", scw.sourceAliases[0])
	}
	scw.wr.Logger().Infof("Source tablet 0 has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	scw.mu.Lock()
	scw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions))
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		scw.tableStatus[i].name = td.Name
		scw.tableStatus[i].rowCount = td.RowCount * uint64(len(scw.sourceAliases))
	}
	scw.startTime = time.Now()
	scw.mu.Unlock()

	// Create all the commands to create the destination schema:
	// - createDbCmds will create the database and the tables
	// - createViewCmds will create the views
	// - alterTablesCmds will modify the tables at the end if needed
	// (all need template substitution for {{.DatabaseName}})
	createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1)
	createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema)
	createViewCmds := make([]string, 0, 16)
	alterTablesCmds := make([]string, 0, 16)
	columnIndexes := make([]int, len(sourceSchemaDefinition.TableDefinitions))
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == myproto.TABLE_BASE_TABLE {
			// build the create and alter statements
			create, alter, err := mysqlctl.MakeSplitCreateTableSql(scw.wr.Logger(), td.Schema, "{{.DatabaseName}}", td.Name, scw.strategy)
			if err != nil {
				return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err)
			}
			createDbCmds = append(createDbCmds, create)
			if alter != "" {
				alterTablesCmds = append(alterTablesCmds, alter)
			}

			// find the column to split on
			columnIndexes[tableIndex] = -1
			for i, name := range td.Columns {
				if name == scw.keyspaceInfo.ShardingColumnName {
					columnIndexes[tableIndex] = i
					break
				}
			}
			if columnIndexes[tableIndex] == -1 {
				return fmt.Errorf("table %v doesn't have a column named '%v'", td.Name, scw.keyspaceInfo.ShardingColumnName)
			}

			scw.tableStatus[tableIndex].mu.Lock()
			scw.tableStatus[tableIndex].state = "before table creation"
			scw.tableStatus[tableIndex].rowCount = td.RowCount
			scw.tableStatus[tableIndex].mu.Unlock()
		} else {
			scw.tableStatus[tableIndex].mu.Lock()
			createViewCmds = append(createViewCmds, td.Schema)
			scw.tableStatus[tableIndex].state = "before view creation"
			scw.tableStatus[tableIndex].rowCount = 0
			scw.tableStatus[tableIndex].mu.Unlock()
		}
	}

	// For each destination tablet (in parallel):
	// - create the schema
	// - setup the channels to send SQL data chunks
	//
	// mu protects the abort channel for closing, and firstError
	mu := sync.Mutex{}
	abort := make(chan struct{})
	var firstError error

	processError := func(format string, args ...interface{}) {
		scw.wr.Logger().Errorf(format, args...)
		mu.Lock()
		if abort != nil {
			close(abort)
			abort = nil
			firstError = fmt.Errorf(format, args...)
		}
		mu.Unlock()
	}

	insertChannels := make([][]chan string, len(scw.destinationShards))
	destinationWaitGroup := sync.WaitGroup{}
	for shardIndex := range scw.destinationShards {
		insertChannels[shardIndex] = make([]chan string, len(scw.destinationAliases[shardIndex]))
		for i, tabletAlias := range scw.destinationAliases[shardIndex] {
			// we create one channel per destination tablet.  It
			// is sized to have a buffer of a maximum of
			// destinationWriterCount * 2 items, to hopefully
			// always have data. We then have
			// destinationWriterCount goroutines reading from it.
			insertChannels[shardIndex][i] = make(chan string, scw.destinationWriterCount*2)

			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo, insertChannel chan string) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Creating tables on tablet %v", ti.Alias)
				if err := runSqlCommands(scw.wr, ti, createDbCmds, abort); err != nil {
					processError("createDbCmds failed: %v", err)
					return
				}
				if len(createViewCmds) > 0 {
					scw.wr.Logger().Infof("Creating views on tablet %v", ti.Alias)
					if err := runSqlCommands(scw.wr, ti, createViewCmds, abort); err != nil {
						processError("createViewCmds failed: %v", err)
						return
					}
				}
				for j := 0; j < scw.destinationWriterCount; j++ {
					destinationWaitGroup.Add(1)
					go func() {
						defer destinationWaitGroup.Done()
						if err := executeFetchLoop(scw.wr, ti, insertChannel, abort); err != nil {
							processError("executeFetchLoop failed: %v", err)
						}
					}()
				}
			}(scw.destinationTablets[shardIndex][tabletAlias], insertChannels[shardIndex][i])
		}
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
	for shardIndex := range scw.sourceShards {
		sema := sync2.NewSemaphore(scw.sourceReaderCount, 0)
		for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
			if td.Type == myproto.TABLE_VIEW {
				continue
			}

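			// rowSplitter routes each streamed row to the destination shard
			// that owns its sharding key value.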
			rowSplitter := NewRowSplitter(scw.destinationShards, scw.keyspaceInfo.ShardingColumnType, columnIndexes[tableIndex])

			chunks, err := findChunks(scw.wr, scw.sourceTablets[shardIndex], td, scw.minTableSizeForSplit, scw.sourceReaderCount)
			if err != nil {
				return err
			}

			for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
				sourceWaitGroup.Add(1)
				go func(td *myproto.TableDefinition, tableIndex, chunkIndex int) {
					defer sourceWaitGroup.Done()

					sema.Acquire()
					defer sema.Release()

					// build the query, and start the streaming
					selectSQL := buildSQLFromChunks(scw.wr, td, chunks, chunkIndex, scw.sourceAliases[shardIndex].String())
					qrr, err := NewQueryResultReaderForTablet(scw.wr.TopoServer(), scw.sourceAliases[shardIndex], selectSQL)
					if err != nil {
						processError("NewQueryResultReaderForTablet failed: %v", err)
						return
					}

					// process the data
					if err := scw.processData(td, tableIndex, qrr, rowSplitter, insertChannels, abort); err != nil {
						processError("processData failed: %v", err)
					}
				}(td, tableIndex, chunkIndex)
			}
		}
	}
	sourceWaitGroup.Wait()

	for shardIndex := range scw.destinationShards {
		for _, c := range insertChannels[shardIndex] {
			close(c)
		}
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// do the post-copy alters if any
	if len(alterTablesCmds) > 0 {
		for shardIndex := range scw.destinationShards {
			for _, tabletAlias := range scw.destinationAliases[shardIndex] {
				destinationWaitGroup.Add(1)
				go func(ti *topo.TabletInfo) {
					defer destinationWaitGroup.Done()
					scw.wr.Logger().Infof("Altering tables on tablet %v", ti.Alias)
					if err := runSqlCommands(scw.wr, ti, alterTablesCmds, abort); err != nil {
						processError("alterTablesCmds failed on tablet %v: %v", ti.Alias, err)
					}
				}(scw.destinationTablets[shardIndex][tabletAlias])
			}
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// then create and populate the blp_checkpoint table
	if strings.Index(scw.strategy, "populateBlpCheckpoint") != -1 {
		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if strings.Index(scw.strategy, "dontStartBinlogPlayer") != -1 {
			flags = binlogplayer.BLP_FLAG_DONT_START
		}

		// get the current position from the sources
		for shardIndex := range scw.sourceShards {
			status, err := scw.wr.TabletManagerClient().SlaveStatus(scw.sourceTablets[shardIndex], 30*time.Second)
			if err != nil {
				return err
			}

			queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, status.Position, time.Now().Unix(), flags))
		}

		for shardIndex := range scw.destinationShards {
			for _, tabletAlias := range scw.destinationAliases[shardIndex] {
				destinationWaitGroup.Add(1)
				go func(ti *topo.TabletInfo) {
					defer destinationWaitGroup.Done()
					scw.wr.Logger().Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias)
					if err := runSqlCommands(scw.wr, ti, queries, abort); err != nil {
						processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err)
					}
				}(scw.destinationTablets[shardIndex][tabletAlias])
			}
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	// TODO(alainjobart) this is a superset, some shards may not
	// overlap, have to deal with this better (for N -> M splits
	// where both N>1 and M>1)
	if strings.Index(scw.strategy, "skipSetSourceShards") != -1 {
		scw.wr.Logger().Infof("Skipping setting SourceShard on destination shards.")
	} else {
		for _, si := range scw.destinationShards {
			scw.wr.Logger().Infof("Setting SourceShard on shard %v/%v", si.Keyspace(), si.ShardName())
			if err := scw.wr.SetSourceShards(si.Keyspace(), si.ShardName(), scw.sourceAliases, nil); err != nil {
				return fmt.Errorf("Failed to set source shards: %v", err)
			}
		}
	}

	// And force a schema reload on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for shardIndex := range scw.destinationShards {
		for _, tabletAlias := range scw.destinationAliases[shardIndex] {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				scw.wr.Logger().Infof("Reloading schema on tablet %v", ti.Alias)
				if err := scw.wr.TabletManagerClient().ReloadSchema(ti, 30*time.Second); err != nil {
					processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err)
				}
			}(scw.destinationTablets[shardIndex][tabletAlias])
		}
	}
	destinationWaitGroup.Wait()
	return firstError
}
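
Both copy phases also bound how many chunks are streamed from a source at once by taking a counting semaphore (sync2.NewSemaphore) around each chunk pipeline. The same throttling can be sketched with only the standard library, using a buffered channel as the semaphore; the chunk names below are made up for illustration and this is not the worker's actual reader code.

package main

import (
	"fmt"
	"sync"
)

func main() {
	const sourceReaderCount = 3 // at most 3 chunks stream concurrently
	chunks := []string{"c0", "c1", "c2", "c3", "c4", "c5"}

	// A buffered channel acts as a counting semaphore: sending acquires a
	// slot, receiving releases it.
	sema := make(chan struct{}, sourceReaderCount)
	var wg sync.WaitGroup

	for _, c := range chunks {
		wg.Add(1)
		go func(chunk string) {
			defer wg.Done()

			sema <- struct{}{}        // Acquire()
			defer func() { <-sema }() // Release()

			fmt.Println("streaming chunk", chunk)
		}(c)
	}
	wg.Wait()
}
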
Ejemplo n.º 9
0
// copy phase:
// - get schema on the source, filter tables
// - create tables on all destinations
// - copy the data
func (vscw *VerticalSplitCloneWorker) copy() error {
	vscw.setState(stateVSCCopy)

	// get source schema
	sourceSchemaDefinition, err := vscw.wr.GetSchema(vscw.sourceAlias, vscw.tables, nil, true)
	if err != nil {
		return fmt.Errorf("cannot get schema from source %v: %v", vscw.sourceAlias, err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return fmt.Errorf("no tables matching the table filter")
	}
	log.Infof("Source tablet has %v tables to copy", len(sourceSchemaDefinition.TableDefinitions))
	vscw.mu.Lock()
	vscw.tableStatus = make([]tableStatus, len(sourceSchemaDefinition.TableDefinitions))
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i].name = td.Name
		vscw.tableStatus[i].rowCount = td.RowCount
	}
	vscw.startTime = time.Now()
	vscw.mu.Unlock()

	// Create all the commands to create the destination schema:
	// - createDbCmds will create the database and the tables
	// - createViewCmds will create the views
	// - alterTablesCmds will modify the tables at the end if needed
	// (all need template substitution for {{.DatabaseName}})
	createDbCmds := make([]string, 0, len(sourceSchemaDefinition.TableDefinitions)+1)
	createDbCmds = append(createDbCmds, sourceSchemaDefinition.DatabaseSchema)
	createViewCmds := make([]string, 0, 16)
	alterTablesCmds := make([]string, 0, 16)
	for i, td := range sourceSchemaDefinition.TableDefinitions {
		vscw.tableStatus[i].mu.Lock()
		if td.Type == myproto.TABLE_BASE_TABLE {
			create, alter, err := mysqlctl.MakeSplitCreateTableSql(td.Schema, "{{.DatabaseName}}", td.Name, vscw.strategy)
			if err != nil {
				return fmt.Errorf("MakeSplitCreateTableSql(%v) returned: %v", td.Name, err)
			}
			createDbCmds = append(createDbCmds, create)
			if alter != "" {
				alterTablesCmds = append(alterTablesCmds, alter)
			}
			vscw.tableStatus[i].state = "before table creation"
			vscw.tableStatus[i].rowCount = td.RowCount
		} else {
			createViewCmds = append(createViewCmds, td.Schema)
			vscw.tableStatus[i].state = "before view creation"
			vscw.tableStatus[i].rowCount = 0
		}
		vscw.tableStatus[i].mu.Unlock()
	}

	// For each destination tablet (in parallel):
	// - create the schema
	// - setup the channels to send SQL data chunks
	//
	// mu protects the abort channel for closing, and firstError
	mu := sync.Mutex{}
	abort := make(chan struct{})
	var firstError error

	processError := func(format string, args ...interface{}) {
		log.Errorf(format, args...)
		mu.Lock()
		if abort != nil {
			close(abort)
			abort = nil
			firstError = fmt.Errorf(format, args...)
		}
		mu.Unlock()
	}

	insertChannels := make([]chan string, len(vscw.destinationAliases))
	destinationWaitGroup := sync.WaitGroup{}
	for i, tabletAlias := range vscw.destinationAliases {
		// we create one channel per destination tablet.  It
		// is sized to have a buffer of a maximum of
		// destinationWriterCount * 2 items, to hopefully
		// always have data. We then have
		// destinationWriterCount goroutines reading from it.
		insertChannels[i] = make(chan string, vscw.destinationWriterCount*2)

		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo, insertChannel chan string) {
			defer destinationWaitGroup.Done()
			log.Infof("Creating tables on tablet %v", ti.Alias)
			if err := vscw.runSqlCommands(ti, createDbCmds, abort); err != nil {
				processError("createDbCmds failed: %v", err)
				return
			}
			if len(createViewCmds) > 0 {
				log.Infof("Creating views on tablet %v", ti.Alias)
				if err := vscw.runSqlCommands(ti, createViewCmds, abort); err != nil {
					processError("createViewCmds failed: %v", err)
					return
				}
			}
			for j := 0; j < vscw.destinationWriterCount; j++ {
				destinationWaitGroup.Add(1)
				go func() {
					defer destinationWaitGroup.Done()
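					// Drain insertChannel until it is closed, bailing out
					// early if abort is closed or a fetch fails.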
					for {
						select {
						case cmd, ok := <-insertChannel:
							if !ok {
								return
							}
							cmd = "INSERT INTO `" + ti.DbName() + "`." + cmd
							_, err := vscw.wr.ActionInitiator().ExecuteFetch(ti, cmd, 0, false, true, 30*time.Second)
							if err != nil {
								processError("ExecuteFetch failed: %v", err)
								return
							}
						case <-abort:
							return
						}
					}
				}()
			}
		}(vscw.destinationTablets[tabletAlias], insertChannels[i])
	}

	// Now for each table, read data chunks and send them to all
	// insertChannels
	sourceWaitGroup := sync.WaitGroup{}
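	// Limit how many table chunks are streamed from the source concurrently.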
	sema := sync2.NewSemaphore(vscw.sourceReaderCount, 0)
	for tableIndex, td := range sourceSchemaDefinition.TableDefinitions {
		if td.Type == myproto.TABLE_VIEW {
			vscw.tableStatus[tableIndex].setState("view created")
			continue
		}

		vscw.tableStatus[tableIndex].setState("before copy")
		chunks, err := vscw.findChunks(vscw.sourceTablet, td)
		if err != nil {
			return err
		}

		for chunkIndex := 0; chunkIndex < len(chunks)-1; chunkIndex++ {
			sourceWaitGroup.Add(1)
			go func(td myproto.TableDefinition, tableIndex, chunkIndex int) {
				defer sourceWaitGroup.Done()

				sema.Acquire()
				defer sema.Release()

				vscw.tableStatus[tableIndex].setState("started the copy")

				// build the query, and start the streaming
				selectSQL := "SELECT " + strings.Join(td.Columns, ", ") + " FROM " + td.Name
				if chunks[chunkIndex] != "" || chunks[chunkIndex+1] != "" {
					log.Infof("Starting to stream all data from table %v between '%v' and '%v'", td.Name, chunks[chunkIndex], chunks[chunkIndex+1])
					clauses := make([]string, 0, 2)
					if chunks[chunkIndex] != "" {
						clauses = append(clauses, td.PrimaryKeyColumns[0]+">="+chunks[chunkIndex])
					}
					if chunks[chunkIndex+1] != "" {
						clauses = append(clauses, td.PrimaryKeyColumns[0]+"<"+chunks[chunkIndex+1])
					}
					selectSQL += " WHERE " + strings.Join(clauses, " AND ")
				} else {
					log.Infof("Starting to stream all data from table %v", td.Name)
				}
				if len(td.PrimaryKeyColumns) > 0 {
					selectSQL += " ORDER BY " + strings.Join(td.PrimaryKeyColumns, ", ")
				}
				qrr, err := NewQueryResultReaderForTablet(vscw.wr.TopoServer(), vscw.sourceAlias, selectSQL)
				if err != nil {
					processError("NewQueryResultReaderForTablet failed: %v", err)
					return
				}

				// process the data
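				// baseCmd is the table-and-columns prefix; each streamed result
				// batch is appended to it as a VALUES list and sent to every
				// destination's insert channel.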
				baseCmd := td.Name + "(" + strings.Join(td.Columns, ", ") + ") VALUES "
			loop:
				for {
					select {
					case r, ok := <-qrr.Output:
						if !ok {
							if err := qrr.Error(); err != nil {
								// error case
								processError("QueryResultReader failed: %v", err)
								return
							}

							// we're done with the data
							break loop
						}

						// send the rows to be inserted
						vscw.tableStatus[tableIndex].addCopiedRows(len(r.Rows))
						cmd := baseCmd + makeValueString(qrr.Fields, r)
						for _, c := range insertChannels {
							c <- cmd
						}
					case <-abort:
						return
					}
				}
			}(td, tableIndex, chunkIndex)
		}
	}
	sourceWaitGroup.Wait()

	for _, c := range insertChannels {
		close(c)
	}
	destinationWaitGroup.Wait()
	if firstError != nil {
		return firstError
	}

	// do the post-copy alters if any
	if len(alterTablesCmds) > 0 {
		for _, tabletAlias := range vscw.destinationAliases {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				log.Infof("Altering tables on tablet %v", ti.Alias)
				if err := vscw.runSqlCommands(ti, alterTablesCmds, abort); err != nil {
					processError("alterTablesCmds failed on tablet %v: %v", ti.Alias, err)
				}
			}(vscw.destinationTablets[tabletAlias])
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// then create and populate the blp_checkpoint table
	if strings.Index(vscw.strategy, "populateBlpCheckpoint") != -1 {
		// get the current position from the source
		pos, err := vscw.wr.ActionInitiator().SlavePosition(vscw.sourceTablet, 30*time.Second)
		if err != nil {
			return err
		}

		queries := make([]string, 0, 4)
		queries = append(queries, binlogplayer.CreateBlpCheckpoint()...)
		flags := ""
		if strings.Index(vscw.strategy, "dontStartBinlogPlayer") != -1 {
			flags = binlogplayer.BLP_FLAG_DONT_START
		}
		queries = append(queries, binlogplayer.PopulateBlpCheckpoint(0, pos.MasterLogGTIDField.Value, time.Now().Unix(), flags))
		for _, tabletAlias := range vscw.destinationAliases {
			destinationWaitGroup.Add(1)
			go func(ti *topo.TabletInfo) {
				defer destinationWaitGroup.Done()
				log.Infof("Making and populating blp_checkpoint table on tablet %v", ti.Alias)
				if err := vscw.runSqlCommands(ti, queries, abort); err != nil {
					processError("blp_checkpoint queries failed on tablet %v: %v", ti.Alias, err)
				}
			}(vscw.destinationTablets[tabletAlias])
		}
		destinationWaitGroup.Wait()
		if firstError != nil {
			return firstError
		}
	}

	// Now we're done with data copy, update the shard's source info.
	log.Infof("Setting SourceShard on shard %v/%v", vscw.destinationKeyspace, vscw.destinationShard)
	if err := vscw.wr.SetSourceShards(vscw.destinationKeyspace, vscw.destinationShard, []topo.TabletAlias{vscw.sourceAlias}, vscw.tables); err != nil {
		return fmt.Errorf("Failed to set source shards: %v", err)
	}

	// And force a schema reload on all destination tablets.
	// The master tablet will end up starting filtered replication
	// at this point.
	for _, tabletAlias := range vscw.destinationAliases {
		destinationWaitGroup.Add(1)
		go func(ti *topo.TabletInfo) {
			defer destinationWaitGroup.Done()
			log.Infof("Reloading schema on tablet %v", ti.Alias)
			if err := vscw.wr.ActionInitiator().ReloadSchema(ti, 30*time.Second); err != nil {
				processError("ReloadSchema failed on tablet %v: %v", ti.Alias, err)
			}
		}(vscw.destinationTablets[tabletAlias])
	}
	destinationWaitGroup.Wait()
	return firstError
}
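
The vertical clone above assembles its streaming query by hand from chunk boundaries: an empty string marks an open-ended boundary, the lower bound is inclusive, the upper bound exclusive, and rows are ordered by primary key. The standalone helper below reproduces that construction; buildChunkSelect is an illustrative name, not part of the worker.

package main

import (
	"fmt"
	"strings"
)

// buildChunkSelect mirrors the query construction in the copy phase above:
// "" means an open-ended boundary, lower bounds are inclusive, upper bounds
// are exclusive, and rows are ordered by the primary key columns.
func buildChunkSelect(table string, columns, pkColumns []string, low, high string) string {
	sql := "SELECT " + strings.Join(columns, ", ") + " FROM " + table
	clauses := make([]string, 0, 2)
	if low != "" {
		clauses = append(clauses, pkColumns[0]+">="+low)
	}
	if high != "" {
		clauses = append(clauses, pkColumns[0]+"<"+high)
	}
	if len(clauses) > 0 {
		sql += " WHERE " + strings.Join(clauses, " AND ")
	}
	if len(pkColumns) > 0 {
		sql += " ORDER BY " + strings.Join(pkColumns, ", ")
	}
	return sql
}

func main() {
	fmt.Println(buildChunkSelect("user", []string{"id", "name"}, []string{"id"}, "100", "200"))
	// SELECT id, name FROM user WHERE id>=100 AND id<200 ORDER BY id
	fmt.Println(buildChunkSelect("user", []string{"id", "name"}, []string{"id"}, "", ""))
	// SELECT id, name FROM user ORDER BY id
}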