func (wr *Wrangler) shardMultiRestore(keyspace, shard string, sources []topo.TabletAlias, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) error { // read the shard shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // read the source tablets sourceTablets, err := GetTabletMap(wr.TopoServer(), sources) if err != nil { return err } // insert their KeyRange / shard in the SourceShards array shardInfo.SourceShards = make([]topo.SourceShard, 0, len(sourceTablets)) for _, ti := range sourceTablets { overlap, err := key.KeyRangesOverlap(shardInfo.KeyRange, ti.KeyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", ti.KeyRange, shardInfo.KeyRange) } ss := topo.SourceShard{ KeyRange: overlap, } shardInfo.SourceShards = append(shardInfo.SourceShards, ss) } // and write the shard if err = wr.ts.UpdateShard(shardInfo); err != nil { return err } return nil }
func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v", bpc, x) err = fmt.Errorf("panic: %v", x) } }() // Enable any user to set the timestamp. // We do it on every iteration to be sure, in case mysql was // restarted. bpc.DisableSuperToSetTimestamp() // create the db connection, connect it vtClient := mysqlctl.NewDbClient(bpc.dbConfig) if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, err := mysqlctl.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // Find the server list for the source shard in our cell addrs, err := bpc.ts.GetEndPoints(bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topo.TYPE_REPLICA) if err != nil { return fmt.Errorf("can't find any source tablet for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA, err) } if len(addrs.Entries) == 0 { return fmt.Errorf("empty source tablet list for %v %v %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA) } newServerIndex := rand.Intn(len(addrs.Entries)) addr := fmt.Sprintf("%v:%v", addrs.Entries[newServerIndex].Host, addrs.Entries[newServerIndex].NamedPortMap["_vtocc"]) // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, just get them player := mysqlctl.NewBinlogPlayerTables(vtClient, addr, bpc.sourceShard.Tables, startPosition, bpc.stopAtGroupId) return player.ApplyBinlogEvents(bpc.interrupted) } else { // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player := mysqlctl.NewBinlogPlayerKeyRange(vtClient, addr, overlap, startPosition, bpc.stopAtGroupId) return player.ApplyBinlogEvents(bpc.interrupted) } }
func (ta *TabletActor) multiRestore(actionNode *actionnode.ActionNode) (err error) { args := actionNode.Args.(*actionnode.MultiRestoreArgs) // read our current tablet, verify its state // we only support restoring to the master or spare replicas tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_MASTER && tablet.Type != topo.TYPE_SPARE && tablet.Type != topo.TYPE_REPLICA && tablet.Type != topo.TYPE_RDONLY { return fmt.Errorf("expected master, spare replica or rdonly type, not %v: %v", tablet.Type, ta.tabletAlias) } // get source tablets addresses sourceAddrs := make([]*url.URL, len(args.SrcTabletAliases)) keyRanges := make([]key.KeyRange, len(args.SrcTabletAliases)) for i, alias := range args.SrcTabletAliases { t, e := ta.ts.GetTablet(alias) if e != nil { return e } sourceAddrs[i] = &url.URL{Host: t.GetAddr(), Path: "/" + t.DbName()} keyRanges[i], e = key.KeyRangesOverlap(tablet.KeyRange, t.KeyRange) if e != nil { return e } } // change type to restore, no change to replication graph originalType := tablet.Type tablet.Type = topo.TYPE_RESTORE err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // run the action, scrap if it fails if err := ta.mysqld.MultiRestore(tablet.DbName(), keyRanges, sourceAddrs, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy); err != nil { if e := Scrap(ta.ts, ta.tabletAlias, false); e != nil { log.Errorf("Failed to Scrap after failed RestoreFromMultiSnapshot: %v", e) } return err } // restore type back tablet.Type = originalType return topo.UpdateTablet(ta.ts, tablet) }
func (sdw *SplitDiffWorker) diff(ctx context.Context) error { sdw.SetState(WorkerStateDiff) sdw.wr.Logger().Infof("Gathering schema information...") sdw.sourceSchemaDefinitions = make([]*myproto.SchemaDefinition, len(sdw.sourceAliases)) wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} wg.Add(1) go func() { var err error shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sdw.destinationSchemaDefinition, err = sdw.wr.GetSchema( shortCtx, sdw.destinationAlias, nil /* tables */, sdw.excludeTables, false /* includeViews */) cancel() rec.RecordError(err) sdw.wr.Logger().Infof("Got schema from destination %v", sdw.destinationAlias) wg.Done() }() for i, sourceAlias := range sdw.sourceAliases { wg.Add(1) go func(i int, sourceAlias pb.TabletAlias) { var err error shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sdw.sourceSchemaDefinitions[i], err = sdw.wr.GetSchema( shortCtx, &sourceAlias, nil /* tables */, sdw.excludeTables, false /* includeViews */) cancel() rec.RecordError(err) sdw.wr.Logger().Infof("Got schema from source[%v] %v", i, sourceAlias) wg.Done() }(i, *sourceAlias) } wg.Wait() if rec.HasErrors() { return rec.Error() } // TODO(alainjobart) Checking against each source may be // overkill, if all sources have the same schema? sdw.wr.Logger().Infof("Diffing the schema...") rec = concurrency.AllErrorRecorder{} for i, sourceSchemaDefinition := range sdw.sourceSchemaDefinitions { sourceName := fmt.Sprintf("source[%v]", i) myproto.DiffSchema("destination", sdw.destinationSchemaDefinition, sourceName, sourceSchemaDefinition, &rec) } if rec.HasErrors() { sdw.wr.Logger().Warningf("Different schemas: %v", rec.Error().Error()) } else { sdw.wr.Logger().Infof("Schema match, good.") } // run the diffs, 8 at a time sdw.wr.Logger().Infof("Running the diffs...") sem := sync2.NewSemaphore(8, 0) for _, tableDefinition := range sdw.destinationSchemaDefinition.TableDefinitions { wg.Add(1) go func(tableDefinition *myproto.TableDefinition) { defer wg.Done() sem.Acquire() defer sem.Release() sdw.wr.Logger().Infof("Starting the diff on table %v", tableDefinition.Name) if len(sdw.sourceAliases) != 1 { err := fmt.Errorf("Don't support more than one source for table yet: %v", tableDefinition.Name) rec.RecordError(err) sdw.wr.Logger().Errorf(err.Error()) return } overlap, err := key.KeyRangesOverlap(sdw.shardInfo.KeyRange, sdw.shardInfo.SourceShards[0].KeyRange) if err != nil { newErr := fmt.Errorf("Source shard doesn't overlap with destination????: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf(newErr.Error()) return } sourceQueryResultReader, err := TableScanByKeyRange(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.sourceAliases[0], tableDefinition, overlap, key.ProtoToKeyspaceIdType(sdw.keyspaceInfo.ShardingColumnType)) if err != nil { newErr := fmt.Errorf("TableScanByKeyRange(source) failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf(newErr.Error()) return } defer sourceQueryResultReader.Close() destinationQueryResultReader, err := TableScanByKeyRange(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.destinationAlias, tableDefinition, nil, key.ProtoToKeyspaceIdType(sdw.keyspaceInfo.ShardingColumnType)) if err != nil { newErr := fmt.Errorf("TableScanByKeyRange(destination) failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf(newErr.Error()) return } defer destinationQueryResultReader.Close() differ, err := NewRowDiffer(sourceQueryResultReader, destinationQueryResultReader, tableDefinition) if err != nil { newErr := fmt.Errorf("NewRowDiffer() failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf(newErr.Error()) return } report, err := differ.Go(sdw.wr.Logger()) if err != nil { newErr := fmt.Errorf("Differ.Go failed: %v", err.Error()) rec.RecordError(newErr) sdw.wr.Logger().Errorf(newErr.Error()) } else { if report.HasDifferences() { err := fmt.Errorf("Table %v has differences: %v", tableDefinition.Name, report.String()) rec.RecordError(err) sdw.wr.Logger().Warningf(err.Error()) } else { sdw.wr.Logger().Infof("Table %v checks out (%v rows processed, %v qps)", tableDefinition.Name, report.processedRows, report.processingQPS) } } }(tableDefinition) } wg.Wait() return rec.Error() }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4)) err = fmt.Errorf("panic: %v", x) } }() // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := bpc.vtClientFactory() if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // wait for the endpoint set (usefull for the first run at least, fast for next runs) if err := discovery.WaitForEndPoints(bpc.healthCheck, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil { return fmt.Errorf("error waiting for endpoints for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err) } // Find the server list from the health check addrs := bpc.healthCheck.GetEndPointStatsFromTarget(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA) if len(addrs) == 0 { return fmt.Errorf("can't find any source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA) } newServerIndex := rand.Intn(len(addrs)) endPoint := addrs[newServerIndex].EndPoint // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = &topodatapb.TabletAlias{ Cell: bpc.cell, Uid: endPoint.Uid, } bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player, err := binlogplayer.NewBinlogPlayerTables(vtClient, endPoint, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerTables failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, endPoint, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) }
func (ta *TabletActor) multiRestore(actionNode *actionnode.ActionNode) (err error) { args := actionNode.Args.(*actionnode.MultiRestoreArgs) // read our current tablet, verify its state // we only support restoring to the master or active replicas tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_MASTER && !topo.IsSlaveType(tablet.Type) { return fmt.Errorf("expected master, or slave type, not %v: %v", tablet.Type, ta.tabletAlias) } // get source tablets addresses sourceAddrs := make([]*url.URL, len(args.SrcTabletAliases)) keyRanges := make([]key.KeyRange, len(args.SrcTabletAliases)) fromStoragePaths := make([]string, len(args.SrcTabletAliases)) for i, alias := range args.SrcTabletAliases { t, e := ta.ts.GetTablet(alias) if e != nil { return e } sourceAddrs[i] = &url.URL{ Host: t.Addr(), Path: "/" + t.DbName(), } keyRanges[i], e = key.KeyRangesOverlap(tablet.KeyRange, t.KeyRange) if e != nil { return e } fromStoragePaths[i] = path.Join(ta.mysqld.SnapshotDir, "from-storage", fmt.Sprintf("from-%v-%v", keyRanges[i].Start.Hex(), keyRanges[i].End.Hex())) } // change type to restore, no change to replication graph originalType := tablet.Type tablet.Type = topo.TYPE_RESTORE err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // first try to get the data from a remote storage wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for i, alias := range args.SrcTabletAliases { wg.Add(1) go func(i int, alias topo.TabletAlias) { defer wg.Done() h := hook.NewSimpleHook("copy_snapshot_from_storage") h.ExtraEnv = make(map[string]string) for k, v := range ta.hookExtraEnv() { h.ExtraEnv[k] = v } h.ExtraEnv["KEYRANGE"] = fmt.Sprintf("%v-%v", keyRanges[i].Start.Hex(), keyRanges[i].End.Hex()) h.ExtraEnv["SNAPSHOT_PATH"] = fromStoragePaths[i] h.ExtraEnv["SOURCE_TABLET_ALIAS"] = alias.String() hr := h.Execute() if hr.ExitStatus != hook.HOOK_SUCCESS { rec.RecordError(fmt.Errorf("%v hook failed(%v): %v", h.Name, hr.ExitStatus, hr.Stderr)) } }(i, alias) } wg.Wait() // run the action, scrap if it fails if rec.HasErrors() { log.Infof("Got errors trying to get snapshots from storage, trying to get them from original tablets: %v", rec.Error()) err = ta.mysqld.MultiRestore(tablet.DbName(), keyRanges, sourceAddrs, nil, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy) } else { log.Infof("Got snapshots from storage, reading them from disk directly") err = ta.mysqld.MultiRestore(tablet.DbName(), keyRanges, nil, fromStoragePaths, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy) } if err != nil { if e := topotools.Scrap(ta.ts, ta.tabletAlias, false); e != nil { log.Errorf("Failed to Scrap after failed RestoreFromMultiSnapshot: %v", e) } return err } // restore type back tablet.Type = originalType return topo.UpdateTablet(ta.ts, tablet) }
func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: Caught panic: %v", bpc, x) err = fmt.Errorf("panic: %v", x) } }() // Enable any user to set the timestamp. // We do it on every iteration to be sure, in case mysql was // restarted. bpc.DisableSuperToSetTimestamp() // create the db connection, connect it vtClient := mysqlctl.NewDbClient(bpc.dbConfig) if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, err := mysqlctl.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // Find the server list for the source shard in our cell addrs, err := bpc.ts.GetEndPoints(bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topo.TYPE_REPLICA) if err != nil { return fmt.Errorf("can't find any source tablet for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA, err) } if len(addrs.Entries) == 0 { return fmt.Errorf("empty source tablet list for %v %v %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA) } // if the server we were using before is in the list, just keep using it usePreviousServer := false for _, addr := range addrs.Entries { vtAddr := fmt.Sprintf("%v:%v", addr.Host, addr.NamedPortMap["_vtocc"]) if vtAddr == startPosition.Addr { log.Infof("%v: Previous server %s still healthy, using it", bpc, vtAddr) usePreviousServer = true } } // if we can't use the previous server, pick a new one randomly if !usePreviousServer { newServerIndex := rand.Intn(len(addrs.Entries)) startPosition.Addr = fmt.Sprintf("%v:%v", addrs.Entries[newServerIndex].Host, addrs.Entries[newServerIndex].NamedPortMap["_vtocc"]) startPosition.Position.MasterFilename = "" startPosition.Position.MasterPosition = 0 log.Infof("%v: Connecting to different server: %s", bpc, startPosition.Addr) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } // Create the player. player, err := mysqlctl.NewBinlogPlayer(vtClient, overlap, bpc.sourceShard.Uid, startPosition, nil /*tables*/, 1 /*txnBatch*/, 30*time.Second /*maxTxnInterval*/, false /*execDdl*/) if err != nil { return fmt.Errorf("can't create player: %v", err) } // Run player loop until it's done. return player.ApplyBinlogEvents(bpc.interrupted) }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v\n%s", bpc, x, tb.Stack(4)) err = fmt.Errorf("panic: %v", x) } }() // Check if the context is still good. select { case <-bpc.ctx.Done(): if bpc.ctx.Err() == context.Canceled { // We were stopped. Break out of Loop(). return nil } return fmt.Errorf("giving up since the context is done: %v", bpc.ctx.Err()) default: } // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := bpc.vtClientFactory() if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // wait for the tablet set (usefull for the first run at least, fast for next runs) if err := bpc.tabletStatsCache.WaitForTablets(bpc.ctx, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, []topodatapb.TabletType{topodatapb.TabletType_REPLICA}); err != nil { return fmt.Errorf("error waiting for tablets for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA, err) } // Find the server list from the health check. // Note: We cannot use tsc.GetHealthyTabletStats() here because it does // not return non-serving tablets. We must include non-serving tablets because // REPLICA source tablets may not be serving anymore because their traffic was // already migrated to the destination shards. addrs := discovery.RemoveUnhealthyTablets(bpc.tabletStatsCache.GetTabletStats(bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topodatapb.TabletType_REPLICA)) if len(addrs) == 0 { return fmt.Errorf("can't find any healthy source tablet for %v %v %v", bpc.cell, bpc.sourceShard.String(), topodatapb.TabletType_REPLICA) } newServerIndex := rand.Intn(len(addrs)) tablet := addrs[newServerIndex].Tablet // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = tablet.Alias bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player, err := binlogplayer.NewBinlogPlayerTables(vtClient, tablet, tables, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerTables failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player, err := binlogplayer.NewBinlogPlayerKeyRange(vtClient, tablet, overlap, bpc.sourceShard.Uid, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) if err != nil { return fmt.Errorf("NewBinlogPlayerKeyRange failed: %v", err) } return player.ApplyBinlogEvents(bpc.ctx) }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration(ctx context.Context) (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v", bpc, x) err = fmt.Errorf("panic: %v", x) } }() // Apply any special settings necessary for playback of binlogs. // We do it on every iteration to be sure, in case MySQL was restarted. if err := bpc.mysqld.EnableBinlogPlayback(); err != nil { // We failed to apply the required settings, so we shouldn't keep going. return err } // create the db connection, connect it vtClient := binlogplayer.NewDbClient(bpc.dbConfig) if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BlpFlagDontStart) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BlpFlagDontStart) } // Find the server list for the source shard in our cell addrs, _, err := bpc.ts.GetEndPoints(ctx, bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topo.TYPE_REPLICA) if err != nil { return fmt.Errorf("can't find any source tablet for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA, err) } if len(addrs.Entries) == 0 { return fmt.Errorf("empty source tablet list for %v %v %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA) } newServerIndex := rand.Intn(len(addrs.Entries)) port, _ := addrs.Entries[newServerIndex].NamedPortMap["vt"] addr := netutil.JoinHostPort(addrs.Entries[newServerIndex].Host, port) // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = topo.TabletAlias{ Cell: bpc.cell, Uid: addrs.Entries[newServerIndex].Uid, } bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := mysqlctl.ResolveTables(bpc.mysqld, bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player := binlogplayer.NewBinlogPlayerTables(vtClient, addr, tables, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.interrupted) } // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player := binlogplayer.NewBinlogPlayerKeyRange(vtClient, addr, bpc.keyspaceIDType, overlap, startPosition, bpc.stopPosition, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.interrupted) }
func (sdw *SplitDiffWorker) diff(ctx context.Context) error { sdw.SetState(WorkerStateDiff) sdw.wr.Logger().Infof("Gathering schema information...") wg := sync.WaitGroup{} rec := &concurrency.AllErrorRecorder{} wg.Add(1) go func() { var err error shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sdw.destinationSchemaDefinition, err = sdw.wr.GetSchema( shortCtx, sdw.destinationAlias, nil /* tables */, sdw.excludeTables, false /* includeViews */) cancel() rec.RecordError(err) sdw.wr.Logger().Infof("Got schema from destination %v", sdw.destinationAlias) wg.Done() }() wg.Add(1) go func() { var err error shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) sdw.sourceSchemaDefinition, err = sdw.wr.GetSchema( shortCtx, sdw.sourceAlias, nil /* tables */, sdw.excludeTables, false /* includeViews */) cancel() rec.RecordError(err) sdw.wr.Logger().Infof("Got schema from source %v", sdw.sourceAlias) wg.Done() }() wg.Wait() if rec.HasErrors() { return rec.Error() } sdw.wr.Logger().Infof("Diffing the schema...") rec = &concurrency.AllErrorRecorder{} tmutils.DiffSchema("destination", sdw.destinationSchemaDefinition, "source", sdw.sourceSchemaDefinition, rec) if rec.HasErrors() { sdw.wr.Logger().Warningf("Different schemas: %v", rec.Error().Error()) } else { sdw.wr.Logger().Infof("Schema match, good.") } // read the vschema if needed var keyspaceSchema *vindexes.KeyspaceSchema if *useV3ReshardingMode { kschema, err := sdw.wr.TopoServer().GetVSchema(ctx, sdw.keyspace) if err != nil { return fmt.Errorf("cannot load VSchema for keyspace %v: %v", sdw.keyspace, err) } if kschema == nil { return fmt.Errorf("no VSchema for keyspace %v", sdw.keyspace) } keyspaceSchema, err = vindexes.BuildKeyspaceSchema(kschema, sdw.keyspace) if err != nil { return fmt.Errorf("cannot build vschema for keyspace %v: %v", sdw.keyspace, err) } } // Compute the overlap keyrange. Later, we'll compare it with // source or destination keyrange. If it matches either, // we'll just ask for all the data. If the overlap is a subset, // we'll filter. overlap, err := key.KeyRangesOverlap(sdw.shardInfo.KeyRange, sdw.shardInfo.SourceShards[sdw.sourceUID].KeyRange) if err != nil { return fmt.Errorf("Source shard doesn't overlap with destination: %v", err) } // run the diffs, 8 at a time sdw.wr.Logger().Infof("Running the diffs...") // TODO(mberlin): Parameterize the hard coded value 8. sem := sync2.NewSemaphore(8, 0) for _, tableDefinition := range sdw.destinationSchemaDefinition.TableDefinitions { wg.Add(1) go func(tableDefinition *tabletmanagerdatapb.TableDefinition) { defer wg.Done() sem.Acquire() defer sem.Release() sdw.wr.Logger().Infof("Starting the diff on table %v", tableDefinition.Name) // On the source, see if we need a full scan // or a filtered scan. var sourceQueryResultReader *QueryResultReader if key.KeyRangeEqual(overlap, sdw.shardInfo.SourceShards[sdw.sourceUID].KeyRange) { sourceQueryResultReader, err = TableScan(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.sourceAlias, tableDefinition) } else { sourceQueryResultReader, err = TableScanByKeyRange(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.sourceAlias, tableDefinition, overlap, keyspaceSchema, sdw.keyspaceInfo.ShardingColumnName, sdw.keyspaceInfo.ShardingColumnType) } if err != nil { newErr := fmt.Errorf("TableScan(ByKeyRange?)(source) failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf("%v", newErr) return } defer sourceQueryResultReader.Close(ctx) // On the destination, see if we need a full scan // or a filtered scan. var destinationQueryResultReader *QueryResultReader if key.KeyRangeEqual(overlap, sdw.shardInfo.KeyRange) { destinationQueryResultReader, err = TableScan(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.destinationAlias, tableDefinition) } else { destinationQueryResultReader, err = TableScanByKeyRange(ctx, sdw.wr.Logger(), sdw.wr.TopoServer(), sdw.destinationAlias, tableDefinition, overlap, keyspaceSchema, sdw.keyspaceInfo.ShardingColumnName, sdw.keyspaceInfo.ShardingColumnType) } if err != nil { newErr := fmt.Errorf("TableScan(ByKeyRange?)(destination) failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf("%v", newErr) return } defer destinationQueryResultReader.Close(ctx) // Create the row differ. differ, err := NewRowDiffer(sourceQueryResultReader, destinationQueryResultReader, tableDefinition) if err != nil { newErr := fmt.Errorf("NewRowDiffer() failed: %v", err) rec.RecordError(newErr) sdw.wr.Logger().Errorf("%v", newErr) return } // And run the diff. report, err := differ.Go(sdw.wr.Logger()) if err != nil { newErr := fmt.Errorf("Differ.Go failed: %v", err.Error()) rec.RecordError(newErr) sdw.wr.Logger().Errorf("%v", newErr) } else { if report.HasDifferences() { err := fmt.Errorf("Table %v has differences: %v", tableDefinition.Name, report.String()) rec.RecordError(err) sdw.wr.Logger().Warningf(err.Error()) } else { sdw.wr.Logger().Infof("Table %v checks out (%v rows processed, %v qps)", tableDefinition.Name, report.processedRows, report.processingQPS) } } }(tableDefinition) } wg.Wait() return rec.Error() }
// Iteration is a single iteration for the player: get the current status, // try to play, and plays until interrupted, or until an error occurs. func (bpc *BinlogPlayerController) Iteration() (err error) { defer func() { if x := recover(); x != nil { log.Errorf("%v: caught panic: %v", bpc, x) err = fmt.Errorf("panic: %v", x) } }() // Enable any user to set the timestamp. // We do it on every iteration to be sure, in case mysql was // restarted. bpc.DisableSuperToSetTimestamp() // create the db connection, connect it vtClient := binlogplayer.NewDbClient(bpc.dbConfig) if err := vtClient.Connect(); err != nil { return fmt.Errorf("can't connect to database: %v", err) } defer vtClient.Close() // Read the start position startPosition, flags, err := binlogplayer.ReadStartPosition(vtClient, bpc.sourceShard.Uid) if err != nil { return fmt.Errorf("can't read startPosition: %v", err) } // if we shouldn't start, we just error out and try again later if strings.Index(flags, binlogplayer.BLP_FLAG_DONT_START) != -1 { return fmt.Errorf("not starting because flag '%v' is set", binlogplayer.BLP_FLAG_DONT_START) } // Find the server list for the source shard in our cell addrs, err := bpc.ts.GetEndPoints(bpc.cell, bpc.sourceShard.Keyspace, bpc.sourceShard.Shard, topo.TYPE_REPLICA) if err != nil { return fmt.Errorf("can't find any source tablet for %v %v %v: %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA, err) } if len(addrs.Entries) == 0 { return fmt.Errorf("empty source tablet list for %v %v %v", bpc.cell, bpc.sourceShard.String(), topo.TYPE_REPLICA) } newServerIndex := rand.Intn(len(addrs.Entries)) addr := fmt.Sprintf("%v:%v", addrs.Entries[newServerIndex].Host, addrs.Entries[newServerIndex].NamedPortMap["_vtocc"]) // save our current server bpc.playerMutex.Lock() bpc.sourceTablet = topo.TabletAlias{ Cell: bpc.cell, Uid: addrs.Entries[newServerIndex].Uid, } bpc.lastError = nil bpc.playerMutex.Unlock() // check which kind of replication we're doing, tables or keyrange if len(bpc.sourceShard.Tables) > 0 { // tables, first resolve wildcards tables, err := bpc.mysqld.ResolveTables(bpc.dbName, bpc.sourceShard.Tables) if err != nil { return fmt.Errorf("failed to resolve table names: %v", err) } // tables, just get them player := binlogplayer.NewBinlogPlayerTables(vtClient, addr, tables, startPosition, bpc.stopAtGTID, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.interrupted) } else { // the data we have to replicate is the intersection of the // source keyrange and our keyrange overlap, err := key.KeyRangesOverlap(bpc.sourceShard.KeyRange, bpc.keyRange) if err != nil { return fmt.Errorf("Source shard %v doesn't overlap destination shard %v", bpc.sourceShard.KeyRange, bpc.keyRange) } player := binlogplayer.NewBinlogPlayerKeyRange(vtClient, addr, bpc.keyspaceIdType, overlap, startPosition, bpc.stopAtGTID, bpc.binlogPlayerStats) return player.ApplyBinlogEvents(bpc.interrupted) } }