func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error { // If the majority of slaves restarted, move ahead. if majorityRestart { if leaveMasterReadOnly { wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } else { wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias) if err := wr.tmc.SetReadWrite(wr.ctx, masterElect); err != nil { wr.logger.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } } } else { wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } // save the new master in the shard info si.MasterAlias = masterElect.Alias if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { wr.logger.Errorf("Failed to save new master into shard: %v", err) return err } // We rebuild all the cells, as we may have taken tablets in and // out of the graph. wr.logger.Infof("rebuilding shard serving graph data") _, err := topotools.RebuildShard(context.TODO(), wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted) return err }
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration. func (wr *Wrangler) replicaMigrateServedFrom(ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType topo.TabletType, cells []string, reverse bool, tables []string, ev *events.MigrateServedFrom) error { // Save the destination keyspace (its ServedFrom has been changed) event.DispatchUpdate(ev, "updating keyspace") if err := topo.UpdateKeyspace(wr.ts, ki); err != nil { return err } // Save the source shard (its blacklisted tables field has changed) event.DispatchUpdate(ev, "updating source shard") if err := sourceShard.UpdateSourceBlacklistedTables(servedType, cells, reverse, tables); err != nil { return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } if err := topo.UpdateShard(context.TODO(), wr.ts, sourceShard); err != nil { return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } // Now refresh the source servers so they reload their // blacklisted table list event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables") if err := wr.RefreshTablesByShard(sourceShard, servedType, cells); err != nil { return err } return nil }
func (wr *Wrangler) setShardServedTypes(keyspace, shard string, cells []string, servedType topo.TabletType, remove bool) error { si, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } if err := si.UpdateServedTypesMap(servedType, cells, remove); err != nil { return err } return topo.UpdateShard(context.TODO(), wr.ts, si) }
// updateShardCellsAndMaster will update the 'Cells' and possibly // MasterAlias records for the shard, if needed. func (wr *Wrangler) updateShardCellsAndMaster(si *topo.ShardInfo, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) error { // See if we need to update the Shard: // - add the tablet's cell to the shard's Cells if needed // - change the master if needed shardUpdateRequired := false if !si.HasCell(tabletAlias.Cell) { shardUpdateRequired = true } if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias { shardUpdateRequired = true } if !shardUpdateRequired { return nil } actionNode := actionnode.UpdateShard() keyspace := si.Keyspace() shard := si.ShardName() lockPath, err := wr.lockShard(keyspace, shard, actionNode) if err != nil { return err } // re-read the shard with the lock si, err = wr.ts.GetShard(keyspace, shard) if err != nil { return wr.unlockShard(keyspace, shard, actionNode, lockPath, err) } // update it wasUpdated := false if !si.HasCell(tabletAlias.Cell) { si.Cells = append(si.Cells, tabletAlias.Cell) wasUpdated = true } if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias { if !si.MasterAlias.IsZero() && !force { return wr.unlockShard(keyspace, shard, actionNode, lockPath, fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, keyspace, shard)) } si.MasterAlias = tabletAlias wasUpdated = true } if wasUpdated { // write it back if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return wr.unlockShard(keyspace, shard, actionNode, lockPath, err) } } // and unlock return wr.unlockShard(keyspace, shard, actionNode, lockPath, err) }
func (wr *Wrangler) removeShardCell(keyspace, shard, cell string, force bool) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // check the cell is in the list already if !topo.InCellList(cell, shardInfo.Cells) { return fmt.Errorf("cell %v in not in shard info", cell) } // check the master alias is not in the cell if shardInfo.MasterAlias.Cell == cell { return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell) } // get the ShardReplication object in the cell sri, err := wr.ts.GetShardReplication(cell, keyspace, shard) switch err { case nil: if len(sri.ReplicationLinks) > 0 { return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.ReplicationLinks)) } // ShardReplication object is now useless, remove it if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil { return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err) } // we keep going case topo.ErrNoNode: // no ShardReplication object, we keep going default: // we can't get the object, assume topo server is down there, // so we look at force flag if !force { return err } wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell) } // now we can update the shard wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard) newCells := make([]string, 0, len(shardInfo.Cells)-1) for _, c := range shardInfo.Cells { if c != cell { newCells = append(newCells, c) } } shardInfo.Cells = newCells return topo.UpdateShard(context.TODO(), wr.ts, shardInfo) }
func (wr *Wrangler) setShardTabletControl(keyspace, shard string, tabletType topo.TabletType, cells []string, remove, disableQueryService bool, tables []string) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } if len(tables) == 0 && !remove { // we are setting the DisableQueryService flag only if err := shardInfo.UpdateDisableQueryService(tabletType, cells, disableQueryService); err != nil { return fmt.Errorf("UpdateDisableQueryService(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err) } } else { // we are setting / removing the blacklisted tables only if err := shardInfo.UpdateSourceBlacklistedTables(tabletType, cells, remove, tables); err != nil { return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err) } } return topo.UpdateShard(context.TODO(), wr.ts, shardInfo) }
// SetSourceShards is a utility function to override the SourceShards fields // on a Shard. func (wr *Wrangler) SetSourceShards(keyspace, shard string, sources []topo.TabletAlias, tables []string) error { // read the shard shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // If the shard already has sources, maybe it's already been restored, // so let's be safe and abort right here. if len(shardInfo.SourceShards) > 0 { return fmt.Errorf("Shard %v/%v already has SourceShards, not overwriting them", keyspace, shard) } // read the source tablets sourceTablets, err := topo.GetTabletMap(context.TODO(), wr.TopoServer(), sources) if err != nil { return err } // Insert their KeyRange in the SourceShards array. // We use a linear 0-based id, that matches what mysqlctld/split.go // inserts into _vt.blp_checkpoint. shardInfo.SourceShards = make([]topo.SourceShard, len(sourceTablets)) i := 0 for _, ti := range sourceTablets { shardInfo.SourceShards[i] = topo.SourceShard{ Uid: uint32(i), Keyspace: ti.Keyspace, Shard: ti.Shard, KeyRange: ti.KeyRange, Tables: tables, } i++ } // and write the shard if err = topo.UpdateShard(context.TODO(), wr.ts, shardInfo); err != nil { return err } return nil }
func (wr *Wrangler) sourceShardDelete(keyspace, shard string, uid uint32) error { si, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } newSourceShards := make([]topo.SourceShard, 0, 0) for _, ss := range si.SourceShards { if ss.Uid != uid { newSourceShards = append(newSourceShards, ss) } } if len(newSourceShards) == len(si.SourceShards) { return fmt.Errorf("no SourceShard with uid %v", uid) } if len(newSourceShards) == 0 { newSourceShards = nil } si.SourceShards = newSourceShards return topo.UpdateShard(context.TODO(), wr.ts, si) }
func (wr *Wrangler) sourceShardAdd(keyspace, shard string, uid uint32, skeyspace, sshard string, keyRange key.KeyRange, tables []string) error { si, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // check the uid is not used already for _, ss := range si.SourceShards { if ss.Uid == uid { return fmt.Errorf("uid %v is already in use", uid) } } si.SourceShards = append(si.SourceShards, topo.SourceShard{ Uid: uid, Keyspace: skeyspace, Shard: sshard, KeyRange: keyRange, Tables: tables, }) return topo.UpdateShard(context.TODO(), wr.ts, si) }
// Scrap a tablet. If force is used, we write to topo.Server // directly and don't remote-execute the command. // // If we scrap the master for a shard, we will clear its record // from the Shard object (only if that was the right master) func (wr *Wrangler) Scrap(tabletAlias topo.TabletAlias, force, skipRebuild bool) error { // load the tablet, see if we'll need to rebuild ti, err := wr.ts.GetTablet(tabletAlias) if err != nil { return err } rebuildRequired := ti.Tablet.IsInServingGraph() wasMaster := ti.Type == topo.TYPE_MASTER if force { err = topotools.Scrap(wr.ts, ti.Alias, force) } else { err = wr.tmc.Scrap(wr.ctx, ti) } if err != nil { return err } if !rebuildRequired { wr.Logger().Infof("Rebuild not required") return nil } if skipRebuild { wr.Logger().Warningf("Rebuild required, but skipping it") return nil } // update the Shard object if the master was scrapped if wasMaster { actionNode := actionnode.UpdateShard() lockPath, err := wr.lockShard(ti.Keyspace, ti.Shard, actionNode) if err != nil { return err } // read the shard with the lock si, err := wr.ts.GetShard(ti.Keyspace, ti.Shard) if err != nil { return wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } // update it if the right alias is there if si.MasterAlias == tabletAlias { si.MasterAlias = topo.TabletAlias{} // write it back if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err) } } else { wr.Logger().Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias) } // and unlock if err := wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil { return err } } // and rebuild the original shard / keyspace _, err = wr.RebuildShardGraph(ti.Keyspace, ti.Shard, []string{ti.Alias.Cell}) return err }
func CheckShard(t *testing.T, ts topo.Server) { if err := ts.CreateKeyspace("test_keyspace", &topo.Keyspace{}); err != nil { t.Fatalf("CreateKeyspace: %v", err) } if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != nil { t.Fatalf("CreateShard: %v", err) } if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != topo.ErrNodeExists { t.Errorf("CreateShard called second time, got: %v", err) } if _, err := ts.GetShard("test_keyspace", "666"); err != topo.ErrNoNode { t.Errorf("GetShard(666): %v", err) } shardInfo, err := ts.GetShard("test_keyspace", "b0-c0") if err != nil { t.Errorf("GetShard: %v", err) } if want := newKeyRange("b0-c0"); shardInfo.KeyRange != want { t.Errorf("shardInfo.KeyRange: want %v, got %v", want, shardInfo.KeyRange) } master := topo.TabletAlias{Cell: "ny", Uid: 1} shardInfo.MasterAlias = master shardInfo.KeyRange = newKeyRange("b0-c0") shardInfo.ServedTypesMap = map[topo.TabletType]*topo.ShardServedType{ topo.TYPE_MASTER: &topo.ShardServedType{}, topo.TYPE_REPLICA: &topo.ShardServedType{Cells: []string{"c1"}}, topo.TYPE_RDONLY: &topo.ShardServedType{}, } shardInfo.SourceShards = []topo.SourceShard{ topo.SourceShard{ Uid: 1, Keyspace: "source_ks", Shard: "b8-c0", KeyRange: newKeyRange("b8-c0"), Tables: []string{"table1", "table2"}, }, } shardInfo.TabletControlMap = map[topo.TabletType]*topo.TabletControl{ topo.TYPE_MASTER: &topo.TabletControl{ Cells: []string{"c1", "c2"}, BlacklistedTables: []string{"black1", "black2"}, }, topo.TYPE_REPLICA: &topo.TabletControl{ DisableQueryService: true, }, } if err := topo.UpdateShard(context.TODO(), ts, shardInfo); err != nil { t.Errorf("UpdateShard: %v", err) } updatedShardInfo, err := ts.GetShard("test_keyspace", "b0-c0") if err != nil { t.Fatalf("GetShard: %v", err) } if eq, err := shardEqual(shardInfo.Shard, updatedShardInfo.Shard); err != nil { t.Errorf("cannot compare shards: %v", err) } else if !eq { t.Errorf("put and got shards are not identical:\n%#v\n%#v", shardInfo.Shard, updatedShardInfo.Shard) } // test GetShardNames shards, err := ts.GetShardNames("test_keyspace") if err != nil { t.Errorf("GetShardNames: %v", err) } if len(shards) != 1 || shards[0] != "b0-c0" { t.Errorf(`GetShardNames: want [ "b0-c0" ], got %v`, shards) } if _, err := ts.GetShardNames("test_keyspace666"); err != topo.ErrNoNode { t.Errorf("GetShardNames(666): %v", err) } }
// tabletExternallyReparentedLocked is called with the shard lock. // It returns if agent.refreshTablet should be called, and the error. // Note both are set independently (can have both true and an error). func (agent *ActionAgent) tabletExternallyReparentedLocked(ctx context.Context, externalID string, interrupted chan struct{}) (bool, error) { // re-read the tablet record to be sure we have the latest version tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return false, err } // read the shard, make sure again the master is not already good. shardInfo, err := agent.TopoServer.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return false, err } if shardInfo.MasterAlias == tablet.Alias { log.Infof("TabletExternallyReparented: tablet became the master before we get the lock?") return false, nil } log.Infof("TabletExternallyReparented called and we're not the master, doing the work") // Read the tablets, make sure the master elect is known to the shard // (it's this tablet, so it better be!). // Note we will keep going with a partial tablet map, which usually // happens when a cell is not reachable. After these checks, the // guarantees we'll have are: // - global cell is reachable (we just locked and read the shard) // - the local cell that contains the new master is reachable // (as we're going to check the new master is in the list) // That should be enough. tabletMap, err := topo.GetTabletMapForShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard) switch err { case nil: // keep going case topo.ErrPartialResult: log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets") default: return false, err } masterElectTablet, ok := tabletMap[tablet.Alias] if !ok { return false, fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap)) } // Create reusable Reparent event with available info ev := &events.Reparent{ ShardInfo: *shardInfo, NewMaster: *tablet.Tablet, ExternalID: externalID, } if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok { ev.OldMaster = *oldMasterTablet.Tablet } defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() // sort the tablets, and handle them slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap) event.DispatchUpdate(ev, "starting external from tablet") // We fix the new master in the replication graph. // Note after this call, we may have changed the tablet record, // so we will always return true, so the tablet record is re-read // by the agent. event.DispatchUpdate(ev, "mark ourself as new master") err = agent.updateReplicationGraphForPromotedSlave(ctx, tablet) if err != nil { // This suggests we can't talk to topo server. This is bad. return true, fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err) } // Once this tablet is promoted, remove it from our maps delete(slaveTabletMap, tablet.Alias) delete(masterTabletMap, tablet.Alias) // Then fix all the slaves, including the old master. This // last step is very likely to time out for some tablets (one // random guy is dead, the old master is dead, ...). We // execute them all in parallel until we get to // wr.ActionTimeout(). After this, no other action with a // timeout is executed, so even if we got to the timeout, // we're still good. event.DispatchUpdate(ev, "restarting slaves") logger := logutil.NewConsoleLogger() tmc := tmclient.NewTabletManagerClient() topotools.RestartSlavesExternal(agent.TopoServer, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error { return tmc.SlaveWasRestarted(ctx, ti, swrd) }) // Compute the list of Cells we need to rebuild: old master and // all other cells if reparenting to another cell. cells := []string{shardInfo.MasterAlias.Cell} if shardInfo.MasterAlias.Cell != tablet.Alias.Cell { cells = nil } // now update the master record in the shard object event.DispatchUpdate(ev, "updating shard record") log.Infof("Updating Shard's MasterAlias record") shardInfo.MasterAlias = tablet.Alias if err = topo.UpdateShard(ctx, agent.TopoServer, shardInfo); err != nil { return true, err } // and rebuild the shard serving graph event.DispatchUpdate(ev, "rebuilding shard serving graph") log.Infof("Rebuilding shard serving graph data") if _, err = topotools.RebuildShard(ctx, logger, agent.TopoServer, tablet.Keyspace, tablet.Shard, cells, agent.LockTimeout, interrupted); err != nil { return true, err } event.DispatchUpdate(ev, "finished") return true, nil }
// masterMigrateServedFrom handles the master migration. The ordering is // a bit different than for rdonly / replica to guarantee a smooth transition. // // The order is as follows: // - Add BlacklistedTables on the source shard map for master // - Refresh the source master, so it stops writing on the tables // - Get the source master position, wait until destination master reaches it // - Clear SourceShard on the destination Shard // - Refresh the destination master, so its stops its filtered // replication and starts accepting writes func (wr *Wrangler) masterMigrateServedFrom(ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, tables []string, ev *events.MigrateServedFrom) error { // Read the data we need sourceMasterTabletInfo, err := wr.ts.GetTablet(sourceShard.MasterAlias) if err != nil { return err } destinationMasterTabletInfo, err := wr.ts.GetTablet(destinationShard.MasterAlias) if err != nil { return err } // Update source shard (more blacklisted tables) event.DispatchUpdate(ev, "updating source shard") if err := sourceShard.UpdateSourceBlacklistedTables(topo.TYPE_MASTER, nil, false, tables); err != nil { return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } if err := topo.UpdateShard(context.TODO(), wr.ts, sourceShard); err != nil { return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } // Now refresh the blacklisted table list on the source master event.DispatchUpdate(ev, "refreshing source master so it updates its blacklisted tables") if err := wr.tmc.RefreshState(wr.ctx, sourceMasterTabletInfo); err != nil { return err } // get the position event.DispatchUpdate(ev, "getting master position") masterPosition, err := wr.tmc.MasterPosition(wr.ctx, sourceMasterTabletInfo) if err != nil { return err } // wait for it event.DispatchUpdate(ev, "waiting for destination master to catch up to source master") if err := wr.tmc.WaitBlpPosition(context.TODO(), destinationMasterTabletInfo, blproto.BlpPosition{ Uid: 0, Position: masterPosition, }, wr.ActionTimeout()); err != nil { return err } // Update the destination keyspace (its ServedFrom has changed) event.DispatchUpdate(ev, "updating keyspace") if err = topo.UpdateKeyspace(wr.ts, ki); err != nil { return err } // Update the destination shard (no more source shard) event.DispatchUpdate(ev, "updating destination shard") destinationShard.SourceShards = nil if err := topo.UpdateShard(context.TODO(), wr.ts, destinationShard); err != nil { return err } // Tell the new shards masters they can now be read-write. // Invoking a remote action will also make the tablet stop filtered // replication. event.DispatchUpdate(ev, "setting destination shard masters read-write") if err := wr.refreshMasters([]*topo.ShardInfo{destinationShard}); err != nil { return err } return nil }
// migrateServedTypes operates with all concerned shards locked. func (wr *Wrangler) migrateServedTypes(keyspace string, sourceShards, destinationShards []*topo.ShardInfo, cells []string, servedType topo.TabletType, reverse bool) (err error) { // re-read all the shards so we are up to date wr.Logger().Infof("Re-reading all shards") for i, si := range sourceShards { if sourceShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil { return err } } for i, si := range destinationShards { if destinationShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil { return err } } ev := &events.MigrateServedTypes{ Keyspace: *topo.NewKeyspaceInfo(keyspace, nil, -1), SourceShards: sourceShards, DestinationShards: destinationShards, ServedType: servedType, Reverse: reverse, } event.DispatchUpdate(ev, "start") defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() // For master type migration, need to: // - switch the source shards to read-only by disabling query service // - gather all replication points // - wait for filtered replication to catch up before we continue // - disable filtered replication after the fact if servedType == topo.TYPE_MASTER { event.DispatchUpdate(ev, "disabling query service on all source masters") for _, si := range sourceShards { if err := si.UpdateDisableQueryService(topo.TYPE_MASTER, nil, true); err != nil { return err } if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return err } } if err := wr.refreshMasters(sourceShards); err != nil { return err } event.DispatchUpdate(ev, "getting positions of source masters") masterPositions, err := wr.getMastersPosition(sourceShards) if err != nil { return err } event.DispatchUpdate(ev, "waiting for destination masters to catch up") if err := wr.waitForFilteredReplication(masterPositions, destinationShards); err != nil { return err } for _, si := range destinationShards { si.SourceShards = nil } } // Check and update all shard records, in memory only. // We remember if we need to refresh the state of the source tablets // so their query service is enabled again, for reverse migration. needToRefreshSourceTablets := false for _, si := range sourceShards { if err := si.UpdateServedTypesMap(servedType, cells, !reverse); err != nil { return err } if tc, ok := si.TabletControlMap[servedType]; reverse && ok && tc.DisableQueryService { // this is a backward migration, where the // source tablets were disabled previously, so // we need to refresh them if err := si.UpdateDisableQueryService(servedType, cells, false); err != nil { return err } needToRefreshSourceTablets = true } if !reverse && servedType != topo.TYPE_MASTER { // this is a forward migration, we need to disable // query service on the source shards. // (this was already done for masters earlier) if err := si.UpdateDisableQueryService(servedType, cells, true); err != nil { return err } } } for _, si := range destinationShards { if err := si.UpdateServedTypesMap(servedType, cells, reverse); err != nil { return err } } // All is good, we can save the shards now event.DispatchUpdate(ev, "updating source shards") for _, si := range sourceShards { if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return err } } if needToRefreshSourceTablets { event.DispatchUpdate(ev, "refreshing source shard tablets so they restart their query service") for _, si := range sourceShards { wr.RefreshTablesByShard(si, servedType, cells) } } event.DispatchUpdate(ev, "updating destination shards") for _, si := range destinationShards { if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil { return err } } // And tell the new shards masters they can now be read-write. // Invoking a remote action will also make the tablet stop filtered // replication. if servedType == topo.TYPE_MASTER { event.DispatchUpdate(ev, "setting destination masters read-write") if err := wr.refreshMasters(destinationShards); err != nil { return err } } event.DispatchUpdate(ev, "finished") return nil }
func TestTabletExternallyReparented(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave1 := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave2 := NewFakeTablet(t, wr, "cell2", 3, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) badSlave := NewFakeTablet(t, wr, "cell1", 4, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Add a new Cell to the Shard, that doesn't map to any read topo cell, // to simulate a data center being unreachable. si, err := ts.GetShard("test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.Cells = append(si.Cells, "cell666") if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // Slightly unrelated test: make sure we can find the tablets // even with a datacenter being down. tabletMap, err := topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell1"}) if err != nil { t.Fatalf("GetTabletMapForShardByCell should have worked but got: %v", err) } master, err := topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } slave1, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave1.Tablet.IPAddr, "vt", goodSlave1.Tablet.Portmap["vt"]) if err != nil || slave1 != goodSlave1.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(slave1) failed: %v %v", err, master) } slave2, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave2.Tablet.IPAddr, "vt", goodSlave2.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(slave2) worked: %v %v", err, slave2) } // Make sure the master is not exported in other cells tabletMap, err = topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell2"}) master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(master) worked in cell2: %v %v", err, master) } tabletMap, err = topo.GetTabletMapForShard(ctx, ts, "test_keyspace", "0") if err != topo.ErrPartialResult { t.Fatalf("GetTabletMapForShard should have returned ErrPartialResult but got: %v", err) } master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. oldMaster.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave1.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave1.StartActionLoop(t, wr) defer goodSlave1.StopActionLoop(t) goodSlave2.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave2.StartActionLoop(t, wr) defer goodSlave2.StopActionLoop(t) // On the bad slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED with bad data. badSlave.FakeMysqlDaemon.MasterAddr = "234.0.0.1:3301" badSlave.StartActionLoop(t, wr) defer badSlave.StopActionLoop(t) // First test: reparent to the same master, make sure it works // as expected. tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(oldMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(same master) should have worked") } // Second test: reparent to a replica, and pretend the old // master is still good to go. // This tests a bad case; the new designated master is a slave, // but we should do what we're told anyway ti, err = ts.GetTablet(goodSlave1.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(slave) error: %v", err) } // This tests the good case, where everything works as planned t.Logf("TabletExternallyReparented(new master) expecting success") ti, err = ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } }