func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error { // If the majority of slaves restarted, move ahead. if majorityRestart { if leaveMasterReadOnly { wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } else { wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias) if err := wr.tmc.SetReadWrite(masterElect, wr.ActionTimeout()); err != nil { wr.logger.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } } } else { wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias) } // save the new master in the shard info si.MasterAlias = masterElect.Alias if err := topo.UpdateShard(wr.ts, si); err != nil { wr.logger.Errorf("Failed to save new master into shard: %v", err) return err } // We rebuild all the cells, as we may have taken tablets in and // out of the graph. wr.logger.Infof("rebuilding shard serving graph data") return topotools.RebuildShard(wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted) }
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration. func (wr *Wrangler) replicaMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType pb.TabletType, cells []string, reverse bool, tables []string, ev *events.MigrateServedFrom) error { // Save the destination keyspace (its ServedFrom has been changed) event.DispatchUpdate(ev, "updating keyspace") if err := topo.UpdateKeyspace(ctx, wr.ts, ki); err != nil { return err } // Save the source shard (its blacklisted tables field has changed) event.DispatchUpdate(ev, "updating source shard") if err := sourceShard.UpdateSourceBlacklistedTables(servedType, cells, reverse, tables); err != nil { return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } if err := topo.UpdateShard(ctx, wr.ts, sourceShard); err != nil { return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err) } // Now refresh the source servers so they reload their // blacklisted table list event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables") if err := wr.RefreshTablesByShard(ctx, sourceShard, servedType, cells); err != nil { return err } return nil }
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration. func (wr *Wrangler) replicaMigrateServedFrom(ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType topo.TabletType, reverse bool, tables []string, ev *events.MigrateServedFrom) error { // Save the destination keyspace (its ServedFrom has been changed) event.DispatchUpdate(ev, "updating keyspace") if err := topo.UpdateKeyspace(wr.ts, ki); err != nil { return err } // Save the source shard (its blacklisted tables field has changed) event.DispatchUpdate(ev, "updating source shard") if sourceShard.BlacklistedTablesMap == nil { sourceShard.BlacklistedTablesMap = make(map[topo.TabletType][]string) } if reverse { delete(sourceShard.BlacklistedTablesMap, servedType) } else { sourceShard.BlacklistedTablesMap[servedType] = tables } if err := topo.UpdateShard(wr.ts, sourceShard); err != nil { return err } // Now refresh the source servers so they reload their // blacklisted table list event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables") if err := wr.RefreshTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType); err != nil { return err } return nil }
func (wr *Wrangler) setShardServedTypes(keyspace, shard string, servedTypes []topo.TabletType) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } shardInfo.ServedTypes = servedTypes return topo.UpdateShard(wr.ts, shardInfo) }
func (wr *Wrangler) setShardServedTypes(ctx context.Context, keyspace, shard string, cells []string, servedType topo.TabletType, remove bool) error { si, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } if err := si.UpdateServedTypesMap(servedType, cells, remove); err != nil { return err } return topo.UpdateShard(ctx, wr.ts, si) }
// updateShardCellsAndMaster will update the 'Cells' and possibly
// MasterAlias records for the shard, if needed.
//
// It first checks the in-memory shard record to avoid taking the shard
// lock when nothing needs to change; when a change is needed, it locks
// the shard, re-reads the record under the lock, and applies the change.
func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) error {
	// See if we need to update the Shard:
	// - add the tablet's cell to the shard's Cells if needed
	// - change the master if needed
	shardUpdateRequired := false
	if !si.HasCell(tabletAlias.Cell) {
		shardUpdateRequired = true
	}
	if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias {
		shardUpdateRequired = true
	}
	if !shardUpdateRequired {
		return nil
	}

	// Take the shard lock so concurrent updates cannot clobber each other.
	actionNode := actionnode.UpdateShard()
	keyspace := si.Keyspace()
	shard := si.ShardName()
	lockPath, err := wr.lockShard(ctx, keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	// re-read the shard with the lock
	si, err = wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
	}

	// update it
	wasUpdated := false
	if !si.HasCell(tabletAlias.Cell) {
		si.Cells = append(si.Cells, tabletAlias.Cell)
		wasUpdated = true
	}
	if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias {
		if !si.MasterAlias.IsZero() && !force {
			// refuse to steal mastership unless force is set
			return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, keyspace, shard))
		}
		si.MasterAlias = tabletAlias
		wasUpdated = true
	}

	if wasUpdated {
		// write it back
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
		}
	}

	// and unlock (err is nil here; unlockShard returns its own error if any)
	return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
}
func (wr *Wrangler) removeShardCell(keyspace, shard, cell string, force bool) error { shardInfo, err := wr.ts.GetShardCritical(keyspace, shard) if err != nil { return err } // check the cell is in the list already if !topo.InCellList(cell, shardInfo.Cells) { return fmt.Errorf("cell %v in not in shard info", cell) } // check the master alias is not in the cell if shardInfo.MasterAlias.Cell == cell { return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell) } // get the ShardReplication object in the cell sri, err := wr.ts.GetShardReplication(cell, keyspace, shard) switch err { case nil: if len(sri.ReplicationLinks) > 0 { return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.ReplicationLinks)) } // ShardReplication object is now useless, remove it if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil { return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err) } // we keep going case topo.ErrNoNode: // no ShardReplication object, we keep going default: // we can't get the object, assume topo server is down there, // so we look at force flag if !force { return err } log.Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell) } // now we can update the shard log.Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard) newCells := make([]string, 0, len(shardInfo.Cells)-1) for _, c := range shardInfo.Cells { if c != cell { newCells = append(newCells, c) } } shardInfo.Cells = newCells return topo.UpdateShard(wr.ts, shardInfo) }
// TestReparentTablet checks that wr.ReparentTablet makes the slave run
// the SetMaster commands produced by its fake mysqld, pointed at the
// shard master recorded in the topology.
func TestReparentTablet(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)

	// create shard and tablets
	if err := topo.CreateShard(ctx, ts, "test_keyspace", "0"); err != nil {
		t.Fatalf("CreateShard failed: %v", err)
	}
	master := NewFakeTablet(t, wr, "cell1", 1, pb.TabletType_MASTER)
	slave := NewFakeTablet(t, wr, "cell1", 2, pb.TabletType_REPLICA)

	// mark the master inside the shard
	si, err := ts.GetShard(ctx, "test_keyspace", "0")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias = master.Tablet.Alias
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// master action loop (to initialize host and port)
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// slave loop
	// The fake mysqld is primed with the master's address as the expected
	// SetMaster input, and with a canned command it must then execute.
	slave.FakeMysqlDaemon.SetMasterCommandsInput = fmt.Sprintf("%v:%v", master.Tablet.Hostname, master.Tablet.PortMap["mysql"])
	slave.FakeMysqlDaemon.SetMasterCommandsResult = []string{"set master cmd 1"}
	slave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"set master cmd 1",
	}
	slave.StartActionLoop(t, wr)
	defer slave.StopActionLoop(t)

	// run ReparentTablet
	if err := wr.ReparentTablet(ctx, slave.Tablet.Alias); err != nil {
		t.Fatalf("ReparentTablet failed: %v", err)
	}

	// check what was run
	if err := slave.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Fatalf("slave.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
}
func (wr *Wrangler) setShardTabletControl(ctx context.Context, keyspace, shard string, tabletType topo.TabletType, cells []string, remove, disableQueryService bool, tables []string) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } if len(tables) == 0 && !remove { // we are setting the DisableQueryService flag only if err := shardInfo.UpdateDisableQueryService(tabletType, cells, disableQueryService); err != nil { return fmt.Errorf("UpdateDisableQueryService(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err) } } else { // we are setting / removing the blacklisted tables only if err := shardInfo.UpdateSourceBlacklistedTables(tabletType, cells, remove, tables); err != nil { return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err) } } return topo.UpdateShard(ctx, wr.ts, shardInfo) }
// SetSourceShards is a utility function to override the SourceShards fields // on a Shard. func (wr *Wrangler) SetSourceShards(ctx context.Context, keyspace, shard string, sources []topo.TabletAlias, tables []string) error { // read the shard shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } // If the shard already has sources, maybe it's already been restored, // so let's be safe and abort right here. if len(shardInfo.SourceShards) > 0 { return fmt.Errorf("Shard %v/%v already has SourceShards, not overwriting them", keyspace, shard) } // read the source tablets sourceTablets, err := topo.GetTabletMap(ctx, wr.TopoServer(), sources) if err != nil { return err } // Insert their KeyRange in the SourceShards array. // We use a linear 0-based id, that matches what mysqlctld/split.go // inserts into _vt.blp_checkpoint. shardInfo.SourceShards = make([]*pb.Shard_SourceShard, len(sourceTablets)) i := 0 for _, ti := range sourceTablets { shardInfo.SourceShards[i] = &pb.Shard_SourceShard{ Uid: uint32(i), Keyspace: ti.Keyspace, Shard: ti.Shard, KeyRange: key.KeyRangeToProto(ti.KeyRange), Tables: tables, } i++ } // and write the shard if err = topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil { return err } return nil }
func (wr *Wrangler) sourceShardDelete(ctx context.Context, keyspace, shard string, uid uint32) error { si, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } newSourceShards := make([]topo.SourceShard, 0, 0) for _, ss := range si.SourceShards { if ss.Uid != uid { newSourceShards = append(newSourceShards, ss) } } if len(newSourceShards) == len(si.SourceShards) { return fmt.Errorf("no SourceShard with uid %v", uid) } if len(newSourceShards) == 0 { newSourceShards = nil } si.SourceShards = newSourceShards return topo.UpdateShard(ctx, wr.ts, si) }
func (wr *Wrangler) sourceShardAdd(ctx context.Context, keyspace, shard string, uid uint32, skeyspace, sshard string, keyRange key.KeyRange, tables []string) error { si, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } // check the uid is not used already for _, ss := range si.SourceShards { if ss.Uid == uid { return fmt.Errorf("uid %v is already in use", uid) } } si.SourceShards = append(si.SourceShards, topo.SourceShard{ Uid: uid, Keyspace: skeyspace, Shard: sshard, KeyRange: keyRange, Tables: tables, }) return topo.UpdateShard(ctx, wr.ts, si) }
func (wr *Wrangler) setShardBlacklistedTables(keyspace, shard string, tabletType topo.TabletType, tables []string) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } if len(tables) == 0 { // it's a removal if shardInfo.BlacklistedTablesMap != nil { delete(shardInfo.BlacklistedTablesMap, tabletType) if len(shardInfo.BlacklistedTablesMap) == 0 { shardInfo.BlacklistedTablesMap = nil } } } else { // it's an addition if shardInfo.BlacklistedTablesMap == nil { shardInfo.BlacklistedTablesMap = make(map[topo.TabletType][]string) } shardInfo.BlacklistedTablesMap[tabletType] = tables } return topo.UpdateShard(wr.ts, shardInfo) }
// migrateServedTypes operates with all concerned shards locked.
//
// It re-reads all shard records, moves servedType between source and
// destination shards in memory (direction depends on reverse), performs
// the extra master-migration steps (read-only, position gathering,
// filtered-replication catch-up), and only then persists all records.
// shardCache is updated with every record read or written.
func (wr *Wrangler) migrateServedTypes(keyspace string, sourceShards, destinationShards []*topo.ShardInfo, servedType topo.TabletType, reverse bool, shardCache map[string]*topo.ShardInfo) (err error) {

	// re-read all the shards so we are up to date
	for i, si := range sourceShards {
		if sourceShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
		shardCache[si.ShardName()] = sourceShards[i]
	}
	for i, si := range destinationShards {
		if destinationShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
		shardCache[si.ShardName()] = destinationShards[i]
	}

	ev := &events.MigrateServedTypes{
		Keyspace:          *topo.NewKeyspaceInfo(keyspace, nil, -1),
		SourceShards:      sourceShards,
		DestinationShards: destinationShards,
		ServedType:        servedType,
		Reverse:           reverse,
	}
	event.DispatchUpdate(ev, "start")
	// report failure on the event stream whichever return path is taken
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// check and update all shard records, in memory only
	for _, si := range sourceShards {
		if reverse {
			// need to add to source
			if topo.IsTypeInList(servedType, si.ServedTypes) {
				return fmt.Errorf("Source shard %v/%v is already serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
			si.ServedTypes = append(si.ServedTypes, servedType)
		} else {
			// need to remove from source
			var found bool
			if si.ServedTypes, found = removeType(servedType, si.ServedTypes); !found {
				return fmt.Errorf("Source shard %v/%v is not serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
		}
	}
	for _, si := range destinationShards {
		if reverse {
			// need to remove from destination
			var found bool
			if si.ServedTypes, found = removeType(servedType, si.ServedTypes); !found {
				return fmt.Errorf("Destination shard %v/%v is not serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
		} else {
			// need to add to destination
			if topo.IsTypeInList(servedType, si.ServedTypes) {
				return fmt.Errorf("Destination shard %v/%v is already serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
			si.ServedTypes = append(si.ServedTypes, servedType)
		}
	}

	// For master type migration, need to:
	// - switch the source shards to read-only
	// - gather all replication points
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	if servedType == topo.TYPE_MASTER {
		event.DispatchUpdate(ev, "setting all source masters read-only")
		err := wr.makeMastersReadOnly(sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "getting positions of source masters")
		masterPositions, err := wr.getMastersPosition(sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "waiting for destination masters to catch up")
		if err := wr.waitForFilteredReplication(masterPositions, destinationShards); err != nil {
			return err
		}

		// clearing SourceShards (in memory) disables filtered replication
		// when the records are saved below
		for _, si := range destinationShards {
			si.SourceShards = nil
		}
	}

	// All is good, we can save the shards now
	event.DispatchUpdate(ev, "updating source shards")
	for _, si := range sourceShards {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
		shardCache[si.ShardName()] = si
	}
	event.DispatchUpdate(ev, "updating destination shards")
	for _, si := range destinationShards {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
		shardCache[si.ShardName()] = si
	}

	// And tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	if servedType == topo.TYPE_MASTER {
		event.DispatchUpdate(ev, "setting destination masters read-write")
		if err := wr.makeMastersReadWrite(destinationShards); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// migrateServedFrom migrates a served-from type between a source shard
// and the vertical-split destination shard/keyspace.
//
// It updates the keyspace's ServedFrom map (re-adding the entry on a
// reverse migration, deleting it on a forward one), validates the
// destination shard is a vertical split target, performs the extra
// master-migration steps (read-only source, wait for filtered
// replication), persists the records, and finally blacklists the moved
// tables on the source side.
func (wr *Wrangler) migrateServedFrom(ki *topo.KeyspaceInfo, si *topo.ShardInfo, servedType topo.TabletType, reverse bool) (err error) {

	// re-read and update keyspace info record
	ki, err = wr.ts.GetKeyspace(ki.KeyspaceName())
	if err != nil {
		return err
	}
	if reverse {
		if _, ok := ki.ServedFrom[servedType]; ok {
			return fmt.Errorf("Destination Keyspace %s is not serving type %v", ki.KeyspaceName(), servedType)
		}
		// NOTE(review): assumes ki.ServedFrom is non-nil here — a nil map
		// write would panic; confirm GetKeyspace always populates it.
		ki.ServedFrom[servedType] = si.SourceShards[0].Keyspace
	} else {
		if _, ok := ki.ServedFrom[servedType]; !ok {
			return fmt.Errorf("Destination Keyspace %s is already serving type %v", ki.KeyspaceName(), servedType)
		}
		delete(ki.ServedFrom, servedType)
	}

	// re-read and check the destination shard
	si, err = wr.ts.GetShard(si.Keyspace(), si.ShardName())
	if err != nil {
		return err
	}
	if len(si.SourceShards) != 1 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", si.Keyspace(), si.ShardName())
	}
	tables := si.SourceShards[0].Tables

	// read the source shard, we'll need its master
	sourceShard, err := wr.ts.GetShard(si.SourceShards[0].Keyspace, si.SourceShards[0].Shard)
	if err != nil {
		return err
	}

	ev := &events.MigrateServedFrom{
		Keyspace:         *ki,
		SourceShard:      *sourceShard,
		DestinationShard: *si,
		ServedType:       servedType,
		Reverse:          reverse,
	}
	event.DispatchUpdate(ev, "start")
	// report failure on the event stream whichever return path is taken
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// For master type migration, need to:
	// - switch the source shard to read-only
	// - gather the replication point
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	var sourceMasterTabletInfo *topo.TabletInfo
	if servedType == topo.TYPE_MASTER {
		// set master to read-only
		event.DispatchUpdate(ev, "setting source shard master to read-only")
		actionPath, err := wr.ai.SetReadOnly(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		if err := wr.WaitForCompletion(actionPath); err != nil {
			return err
		}

		// get the position
		event.DispatchUpdate(ev, "getting master position")
		sourceMasterTabletInfo, err = wr.ts.GetTablet(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		masterPosition, err := wr.ai.MasterPosition(sourceMasterTabletInfo, wr.ActionTimeout())
		if err != nil {
			return err
		}

		// wait for it
		event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
		if err := wr.ai.WaitBlpPosition(si.MasterAlias, blproto.BlpPosition{
			Uid:      0,
			Position: masterPosition,
		}, wr.ActionTimeout()); err != nil {
			return err
		}

		// and clear the shard record
		si.SourceShards = nil
	}

	// All is good, we can save the keyspace and shard (if needed) now
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(wr.ts, ki); err != nil {
		return err
	}
	event.DispatchUpdate(ev, "updating destination shard")
	if servedType == topo.TYPE_MASTER {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
	}

	// Tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if servedType == topo.TYPE_MASTER {
		if err := wr.makeMastersReadWrite([]*topo.ShardInfo{si}); err != nil {
			return err
		}
	}

	// Now blacklist the table list on the right servers
	event.DispatchUpdate(ev, "setting blacklisted tables on source shard")
	if servedType == topo.TYPE_MASTER {
		if err := wr.ai.SetBlacklistedTables(sourceMasterTabletInfo, tables, wr.ActionTimeout()); err != nil {
			return err
		}
	} else {
		// We use the list of tables that are replicating
		// for the blacklist. In case of a reverse move, we clear the
		// blacklist.
		if reverse {
			tables = nil
		}
		if err := wr.SetBlacklistedTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType, tables); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// TestInitMasterShardOneSlaveFails makes sure that if one slave fails to
// proceed, the action completes anyway
func TestInitMasterShardOneSlaveFails(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)

	// Create a master, a couple slaves
	master := NewFakeTablet(t, wr, "cell1", 0, pb.TabletType_MASTER)
	goodSlave := NewFakeTablet(t, wr, "cell1", 1, pb.TabletType_REPLICA)
	badSlave := NewFakeTablet(t, wr, "cell2", 2, pb.TabletType_REPLICA)

	// Master: set a plausible ReplicationPosition to return,
	// and expect to add entry in _vt.reparent_journal
	master.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{
		GTIDSet: myproto.MariadbGTID{
			Domain:   5,
			Server:   456,
			Sequence: 890,
		},
	}
	master.FakeMysqlDaemon.ReadOnly = true
	// Two InitShardMaster runs below, hence the duplicated CREATE entries.
	master.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, master_alias, replication_position) VALUES",
	}
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// goodSlave: expect to be re-parented
	goodSlave.FakeMysqlDaemon.ReadOnly = true
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         master.Tablet.Hostname,
		MasterPort:         int(master.Tablet.PortMap["mysql"]),
		MasterConnectRetry: 10,
	}
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"cmd1"}
	goodSlave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult
	goodSlave.StartActionLoop(t, wr)
	defer goodSlave.StopActionLoop(t)

	// badSlave: insert an error by failing the ReplicationStatus input
	// on purpose
	badSlave.FakeMysqlDaemon.ReadOnly = true
	badSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         "",
		MasterPort:         0,
		MasterConnectRetry: 10,
	}
	badSlave.StartActionLoop(t, wr)
	defer badSlave.StopActionLoop(t)

	// also change the master alias in the Shard object, to make sure it
	// is set back.
	si, err := ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias.Uid++
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// run InitShardMaster without force, it fails because master is
	// changing.
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, false /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "is not the shard master") {
		t.Errorf("InitShardMaster with mismatched new master returned wrong error: %v", err)
	}

	// run InitShardMaster
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, true /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "wrong status for StartReplicationCommands") {
		t.Errorf("InitShardMaster with one failed slave returned wrong error: %v", err)
	}

	// check what was run: master should still be good
	if master.FakeMysqlDaemon.ReadOnly {
		t.Errorf("master was not turned read-write")
	}
	si, err = ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	if !topo.TabletAliasEqual(si.MasterAlias, master.Tablet.Alias) {
		t.Errorf("unexpected shard master alias, got %v expected %v", si.MasterAlias, master.Tablet.Alias)
	}
}
// TestTabletControl verifies the shard's TabletControl record can disable
// query service in a tablet.
//
// Flow: healthy replica with query service on; a TabletControl record with
// DisableQueryService takes it down on RefreshState; subsequent health
// checks (healthy, unhealthy/spare, healthy again) never turn it back on.
func TestTabletControl(t *testing.T) {
	ctx := context.Background()
	agent := createTestAgent(ctx, t)
	targetTabletType := topo.TYPE_REPLICA

	// first health check, should change us to replica
	before := time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err := agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("First health check failed to go to replica: %v", ti.Type)
	}
	if !agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// now update the shard
	si, err := agent.TopoServer.GetShard(ctx, keyspace, shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.TabletControls = []*pb.Shard_TabletControl{
		&pb.Shard_TabletControl{
			TabletType:          topo.TabletTypeToProto(targetTabletType),
			DisableQueryService: true,
		},
	}
	if err := topo.UpdateShard(ctx, agent.TopoServer, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// now refresh the tablet state, as the resharding process would do
	agent.RPCWrapLockAction(ctx, actionnode.TabletActionRefreshState, "", "", true, func() error {
		agent.RefreshState(ctx)
		return nil
	})

	// check we shutdown query service
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}

	// check running a health check will not start it again
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("Health check failed to go to replica: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// go unhealthy, check we go to spare and QS is not running
	agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy")
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_SPARE {
		t.Errorf("Unhealthy health check should go to spare: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// go back healthy, check QS is still not running
	agent.HealthReporter.(*fakeHealthCheck).reportError = nil
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("Healthy health check should go to replica: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}
}
// migrateServedTypes operates with all concerned shards locked.
//
// It re-reads all shard records, performs the master-migration
// preliminaries when needed (disable query service on source masters,
// wait for filtered replication), updates the served-types maps and
// query-service flags in memory, persists all records, and refreshes the
// affected tablets.
func (wr *Wrangler) migrateServedTypes(ctx context.Context, keyspace string, sourceShards, destinationShards []*topo.ShardInfo, cells []string, servedType pb.TabletType, reverse bool, filteredReplicationWaitTime time.Duration) (err error) {

	// re-read all the shards so we are up to date
	wr.Logger().Infof("Re-reading all shards")
	for i, si := range sourceShards {
		if sourceShards[i], err = wr.ts.GetShard(ctx, si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
	}
	for i, si := range destinationShards {
		if destinationShards[i], err = wr.ts.GetShard(ctx, si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
	}

	ev := &events.MigrateServedTypes{
		Keyspace:          *topo.NewKeyspaceInfo(keyspace, nil, -1),
		SourceShards:      sourceShards,
		DestinationShards: destinationShards,
		ServedType:        servedType,
		Reverse:           reverse,
	}
	event.DispatchUpdate(ev, "start")
	// report failure on the event stream whichever return path is taken
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// For master type migration, need to:
	// - switch the source shards to read-only by disabling query service
	// - gather all replication points
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	if servedType == pb.TabletType_MASTER {
		event.DispatchUpdate(ev, "disabling query service on all source masters")
		for _, si := range sourceShards {
			if err := si.UpdateDisableQueryService(pb.TabletType_MASTER, nil, true); err != nil {
				return err
			}
			if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
				return err
			}
		}
		if err := wr.refreshMasters(ctx, sourceShards); err != nil {
			return err
		}

		event.DispatchUpdate(ev, "getting positions of source masters")
		masterPositions, err := wr.getMastersPosition(ctx, sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "waiting for destination masters to catch up")
		if err := wr.waitForFilteredReplication(ctx, masterPositions, destinationShards, filteredReplicationWaitTime); err != nil {
			return err
		}

		// clearing SourceShards (in memory) disables filtered replication
		// when the records are saved below
		for _, si := range destinationShards {
			si.SourceShards = nil
		}
	}

	// Check and update all shard records, in memory only.
	// We remember if we need to refresh the state of the source tablets
	// so their query service is enabled again, for reverse migration.
	needToRefreshSourceTablets := false
	for _, si := range sourceShards {
		if err := si.UpdateServedTypesMap(servedType, cells, !reverse); err != nil {
			return err
		}
		if tc := si.GetTabletControl(servedType); reverse && tc != nil && tc.DisableQueryService {
			// this is a backward migration, where the
			// source tablets were disabled previously, so
			// we need to refresh them
			if err := si.UpdateDisableQueryService(servedType, cells, false); err != nil {
				return err
			}
			needToRefreshSourceTablets = true
		}
		if !reverse && servedType != pb.TabletType_MASTER {
			// this is a forward migration, we need to disable
			// query service on the source shards.
			// (this was already done for masters earlier)
			if err := si.UpdateDisableQueryService(servedType, cells, true); err != nil {
				return err
			}
		}
	}

	// We remember if we need to refresh the state of the destination tablets
	// so their query service will be enabled.
	needToRefreshDestinationTablets := false
	for _, si := range destinationShards {
		if err := si.UpdateServedTypesMap(servedType, cells, reverse); err != nil {
			return err
		}
		if tc := si.GetTabletControl(servedType); !reverse && tc != nil && tc.DisableQueryService {
			// This is a forwards migration, and the destination query service was already in a disabled state.
			// We need to enable and force a refresh, otherwise it's possible that both the source and destination
			// will have query service disabled at the same time, and queries would have nowhere to go.
			if err := si.UpdateDisableQueryService(servedType, cells, false); err != nil {
				return err
			}
			needToRefreshDestinationTablets = true
		}
		if reverse && servedType != pb.TabletType_MASTER {
			// this is a backwards migration, we need to disable
			// query service on the destination shards.
			// (we're not allowed to reverse a master migration)
			if err := si.UpdateDisableQueryService(servedType, cells, true); err != nil {
				return err
			}
		}
	}

	// All is good, we can save the shards now
	event.DispatchUpdate(ev, "updating source shards")
	for _, si := range sourceShards {
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return err
		}
	}
	if needToRefreshSourceTablets {
		event.DispatchUpdate(ev, "refreshing source shard tablets so they restart their query service")
		for _, si := range sourceShards {
			// NOTE(review): the RefreshTablesByShard error is silently
			// dropped here — confirm this best-effort behavior is intended.
			wr.RefreshTablesByShard(ctx, si, servedType, cells)
		}
	}
	event.DispatchUpdate(ev, "updating destination shards")
	for _, si := range destinationShards {
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return err
		}
	}
	if needToRefreshDestinationTablets {
		event.DispatchUpdate(ev, "refreshing destination shard tablets so they restart their query service")
		for _, si := range destinationShards {
			// NOTE(review): error dropped here as well, see above.
			wr.RefreshTablesByShard(ctx, si, servedType, cells)
		}
	}

	// And tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	if servedType == pb.TabletType_MASTER {
		event.DispatchUpdate(ev, "setting destination masters read-write")
		if err := wr.refreshMasters(ctx, destinationShards); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// TestInitTablet will test the InitTablet code creates / updates the
// tablet node correctly. Note we modify global parameters (the flags)
// so this has to be in one test.
//
// The test walks through the supported configurations in order:
//  1. an idle tablet with no pre-existing record (created, then updated),
//  2. an attempted keyspace/shard change on the idle tablet (must fail),
//  3. a fresh replica tablet that creates its keyspace/shard,
//  4. healthcheck mode (target_tablet_type set) producing a spare,
//  5. healthcheck mode when the shard record says we are the master,
//  6. explicit init_tablet_type again, with db name override and tags.
func TestInitTablet(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	tabletAlias := topo.TabletAlias{
		Cell: "cell1",
		Uid:  1,
	}

	// start with idle, and a tablet record that doesn't exist
	port := 1234
	gRPCPort := 3456
	mysqlDaemon := mysqlctl.NewFakeMysqlDaemon()
	agent := &ActionAgent{
		TopoServer:         ts,
		TabletAlias:        tabletAlias,
		MysqlDaemon:        mysqlDaemon,
		DBConfigs:          nil,
		SchemaOverrides:    nil,
		BinlogPlayerMap:    nil,
		LockTimeout:        10 * time.Second,
		batchCtx:           ctx,
		History:            history.New(historyLength),
		lastHealthMapCount: new(stats.Int),
		_healthy:           fmt.Errorf("healthcheck not run yet"),
	}
	*initTabletType = "idle"
	*tabletHostname = "localhost"
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("NewTestActionAgent(idle) failed: %v", err)
	}
	ti, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_IDLE {
		t.Errorf("wrong type for tablet: %v", ti.Type)
	}
	if ti.Hostname != "localhost" {
		t.Errorf("wrong hostname for tablet: %v", ti.Hostname)
	}
	if ti.Portmap["vt"] != port {
		t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"])
	}
	if ti.Portmap["grpc"] != gRPCPort {
		t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"])
	}

	// try again now that the node exists: InitTablet must update the
	// existing record in place (here, the changed "vt" port)
	port = 3456
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("NewTestActionAgent(idle again) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Portmap["vt"] != port {
		t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"])
	}
	if ti.Portmap["grpc"] != gRPCPort {
		t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"])
	}

	// try with a keyspace and shard on the previously idle tablet,
	// should fail: an existing record's keyspace/shard cannot be changed
	*initTabletType = "replica"
	*initKeyspace = "test_keyspace"
	*initShard = "-80"
	if err := agent.InitTablet(port, gRPCPort); err == nil || !strings.Contains(err.Error(), "InitTablet failed because existing tablet keyspace and shard / differ from the provided ones test_keyspace/-80") {
		t.Fatalf("InitTablet(type over idle) didn't fail correctly: %v", err)
	}

	// now let's use a different real tablet in a shard, that will create
	// the keyspace and shard.
	tabletAlias = topo.TabletAlias{
		Cell: "cell1",
		Uid:  2,
	}
	agent.TabletAlias = tabletAlias
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type) failed: %v", err)
	}
	si, err := ts.GetShard(ctx, "test_keyspace", "-80")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	// InitTablet must have added our cell to the (newly created) shard
	if len(si.Cells) != 1 || si.Cells[0] != "cell1" {
		t.Errorf("shard.Cells not updated properly: %v", si)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_REPLICA {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// try to init again, this time with health check on: with
	// target_tablet_type set, the tablet starts out as a spare
	*initTabletType = ""
	*targetTabletType = "replica"
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_SPARE {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// update shard's master to our alias, then try to init again:
	// InitTablet must notice we are the shard master and keep that type
	si, err = ts.GetShard(ctx, "test_keyspace", "-80")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias = topo.TabletAliasToProto(tabletAlias)
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_MASTER {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// init again with the tablet_type set, no healthcheck
	// (also check db name override and tags here)
	*initTabletType = "replica"
	*targetTabletType = ""
	*initDbNameOverride = "DBNAME"
	initTags.Set("aaa:bbb")
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	// still the master: being shard master overrides init_tablet_type
	if ti.Type != topo.TYPE_MASTER {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}
	if ti.DbNameOverride != "DBNAME" {
		t.Errorf("wrong tablet DbNameOverride: %v", ti.DbNameOverride)
	}
	if len(ti.Tags) != 1 || ti.Tags["aaa"] != "bbb" {
		t.Errorf("wrong tablet tags: %v", ti.Tags)
	}
}
// masterMigrateServedFrom handles the master migration. The ordering is
// a bit different than for rdonly / replica to guarantee a smooth transition.
//
// The order is as follows:
// - Add BlacklistedTables on the source shard map for master
// - Refresh the source master, so it stops writing on the tables
// - Get the source master position, wait until destination master reaches it
// - Clear SourceShard on the destination Shard
// - Refresh the destination master, so its stops its filtered
//   replication and starts accepting writes
//
// Returns an error (and dispatches no "finished" event) as soon as any
// step fails; earlier steps are NOT rolled back, so a failure can leave
// the source tables blacklisted.
func (wr *Wrangler) masterMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, tables []string, ev *events.MigrateServedFrom, filteredReplicationWaitTime time.Duration) error {
	// Read the data we need: both masters must be resolvable up front.
	sourceMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(sourceShard.MasterAlias))
	if err != nil {
		return err
	}
	destinationMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(destinationShard.MasterAlias))
	if err != nil {
		return err
	}

	// Update source shard (more blacklisted tables)
	event.DispatchUpdate(ev, "updating source shard")
	if err := sourceShard.UpdateSourceBlacklistedTables(pb.TabletType_MASTER, nil, false, tables); err != nil {
		return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}
	if err := topo.UpdateShard(ctx, wr.ts, sourceShard); err != nil {
		return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}

	// Now refresh the blacklisted table list on the source master,
	// so it stops accepting writes on the migrated tables.
	event.DispatchUpdate(ev, "refreshing source master so it updates its blacklisted tables")
	if err := wr.tmc.RefreshState(ctx, sourceMasterTabletInfo); err != nil {
		return err
	}

	// get the position (taken AFTER writes stopped, so it is a fence)
	event.DispatchUpdate(ev, "getting master position")
	masterPosition, err := wr.tmc.MasterPosition(ctx, sourceMasterTabletInfo)
	if err != nil {
		return err
	}

	// wait for the destination master's filtered replication (blp uid 0)
	// to catch up to that fence position
	event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
	if err := wr.tmc.WaitBlpPosition(ctx, destinationMasterTabletInfo, blproto.BlpPosition{
		Uid:      0,
		Position: masterPosition,
	}, filteredReplicationWaitTime); err != nil {
		return err
	}

	// Update the destination keyspace (its ServedFrom has changed)
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(ctx, wr.ts, ki); err != nil {
		return err
	}

	// Update the destination shard (no more source shard)
	event.DispatchUpdate(ev, "updating destination shard")
	destinationShard.SourceShards = nil
	if err := topo.UpdateShard(ctx, wr.ts, destinationShard); err != nil {
		return err
	}

	// Tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if err := wr.refreshMasters(ctx, []*topo.ShardInfo{destinationShard}); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// TestRebuildShard creates a shard with a master and a replica, rebuilds
// the serving graph, then turns both tablets into spares (one change per
// rebuild) and checks that the final rebuild removed both endpoint nodes —
// i.e. that no rebuild overwrote the effects of another.
func TestRebuildShard(t *testing.T) {
	ctx := context.Background()
	cells := []string{"test_cell"}
	logger := logutil.NewMemoryLogger()

	// Set up topology.
	ts := zktopo.NewTestServer(t, cells)
	si, err := GetOrCreateShard(ctx, ts, testKeyspace, testShard)
	if err != nil {
		t.Fatalf("GetOrCreateShard: %v", err)
	}
	si.Cells = append(si.Cells, cells[0])
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard: %v", err)
	}

	masterInfo := addTablet(ctx, t, ts, 1, cells[0], topo.TYPE_MASTER)
	replicaInfo := addTablet(ctx, t, ts, 2, cells[0], topo.TYPE_REPLICA)

	// Do an initial rebuild.
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Check initial state: one master endpoint, one replica endpoint.
	ep, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}
	ep, _, err = ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_REPLICA)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}

	// Make a change: master becomes a spare (leaves the serving graph).
	masterInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, masterInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Make another change: replica becomes a spare as well.
	replicaInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, replicaInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Check that the rebuild picked up both changes: both endpoint
	// nodes must be gone now.
	if _, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_MASTER); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("first change wasn't picked up by second rebuild")
	}
	if _, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_REPLICA); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("second change was overwritten by first rebuild finishing late")
	}
}
func TestShardExternallyReparented(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) wr.UseRPCs = false // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave1 := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave2 := NewFakeTablet(t, wr, "cell2", 3, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) badSlave := NewFakeTablet(t, wr, "cell1", 4, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Add a new Cell to the Shard, that doesn't map to any read topo cell, // to simulate a data center being unreachable. si, err := ts.GetShard("test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.Cells = append(si.Cells, "cell666") if err := topo.UpdateShard(ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // Slightly unrelated test: make sure we can find the tablets // even with a datacenter being down. 
tabletMap, err := topo.GetTabletMapForShardByCell(ts, "test_keyspace", "0", []string{"cell1"}) if err != nil { t.Fatalf("GetTabletMapForShardByCell should have worked but got: %v", err) } master, err := topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } slave1, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave1.Tablet.IPAddr, "vt", goodSlave1.Tablet.Portmap["vt"]) if err != nil || slave1 != goodSlave1.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(slave1) failed: %v %v", err, master) } slave2, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave2.Tablet.IPAddr, "vt", goodSlave2.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(slave2) worked: %v %v", err, slave2) } // Make sure the master is not exported in other cells tabletMap, err = topo.GetTabletMapForShardByCell(ts, "test_keyspace", "0", []string{"cell2"}) master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(master) worked in cell2: %v %v", err, master) } tabletMap, err = topo.GetTabletMapForShard(ts, "test_keyspace", "0") if err != topo.ErrPartialResult { t.Fatalf("GetTabletMapForShard should have returned ErrPartialResult but got: %v", err) } master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } // First test: reparent to the same master, make sure it works // as expected. 
if err := wr.ShardExternallyReparented("test_keyspace", "0", oldMaster.Tablet.Alias); err == nil { t.Fatalf("ShardExternallyReparented(same master) should have failed") } else { if !strings.Contains(err.Error(), "already master") { t.Fatalf("ShardExternallyReparented(same master) should have failed with an error that contains 'already master' but got: %v", err) } } // Second test: reparent to the replica, and pretend the old // master is still good to go. // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. oldMaster.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave1.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave1.StartActionLoop(t, wr) defer goodSlave1.StopActionLoop(t) goodSlave2.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave2.StartActionLoop(t, wr) defer goodSlave2.StopActionLoop(t) // On the bad slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED with bad data. 
badSlave.FakeMysqlDaemon.MasterAddr = "234.0.0.1:3301" badSlave.StartActionLoop(t, wr) defer badSlave.StopActionLoop(t) // This tests a bad case; the new designated master is a slave, // but we should do what we're told anyway if err := wr.ShardExternallyReparented("test_keyspace", "0", goodSlave1.Tablet.Alias); err != nil { t.Fatalf("ShardExternallyReparented(slave) error: %v", err) } // This tests the good case, where everything works as planned t.Logf("ShardExternallyReparented(new master) expecting success") if err := wr.ShardExternallyReparented("test_keyspace", "0", newMaster.Tablet.Alias); err != nil { t.Fatalf("ShardExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } }
// Scrap a tablet. If force is used, we write to topo.Server
// directly and don't remote-execute the command.
//
// If we scrap the master for a shard, we will clear its record
// from the Shard object (only if that was the right master)
//
// Returns the remote action path (empty when force is used or when no
// remote action was started) and an error. When the tablet was not in
// the serving graph, or skipRebuild is set, the function returns early
// via the naked returns (actionPath as set above, err nil at that point).
func (wr *Wrangler) Scrap(tabletAlias topo.TabletAlias, force, skipRebuild bool) (actionPath string, err error) {
	// load the tablet, see if we'll need to rebuild
	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return "", err
	}
	rebuildRequired := ti.Tablet.IsInServingGraph()
	wasMaster := ti.Type == topo.TYPE_MASTER

	// force writes topo directly; otherwise start a remote Scrap action
	if force {
		err = topotools.Scrap(wr.ts, ti.Alias, force)
	} else {
		actionPath, err = wr.ai.Scrap(ti.Alias)
	}
	if err != nil {
		return "", err
	}

	if !rebuildRequired {
		log.Infof("Rebuild not required")
		return
	}
	if skipRebuild {
		log.Warningf("Rebuild required, but skipping it")
		return
	}

	// wait for the remote Scrap if necessary
	if actionPath != "" {
		err = wr.WaitForCompletion(actionPath)
		if err != nil {
			return "", err
		}
	}

	// update the Shard object if the master was scrapped
	if wasMaster {
		actionNode := actionnode.UpdateShard()
		// NOTE: `:=` here shadows the named return err inside this block;
		// all exits from the block use explicit returns, so no value leaks.
		lockPath, err := wr.lockShard(ti.Keyspace, ti.Shard, actionNode)
		if err != nil {
			return "", err
		}

		// read the shard with the lock
		si, err := wr.ts.GetShard(ti.Keyspace, ti.Shard)
		if err != nil {
			// unlockShard is given the error so it is recorded with the unlock
			return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err)
		}

		// update it if the right alias is there
		if si.MasterAlias == tabletAlias {
			si.MasterAlias = topo.TabletAlias{}

			// write it back
			if err := topo.UpdateShard(wr.ts, si); err != nil {
				return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err)
			}
		} else {
			// someone else is recorded as master: leave the record alone
			log.Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias)
		}

		// and unlock
		if err := wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil {
			return "", err
		}
	}

	// and rebuild the original shard / keyspace
	return "", wr.RebuildShardGraph(ti.Keyspace, ti.Shard, []string{ti.Alias.Cell})
}
// CheckShard exercises the Shard operations of a topo.Server
// implementation: create, duplicate create, get of a missing shard,
// a full field round-trip through UpdateShard, and GetShardNames.
func CheckShard(t *testing.T, ts topo.Server) {
	if err := ts.CreateKeyspace("test_keyspace", &topo.Keyspace{}); err != nil {
		t.Fatalf("CreateKeyspace: %v", err)
	}

	if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("CreateShard: %v", err)
	}
	// creating the same shard a second time must report ErrNodeExists
	if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != topo.ErrNodeExists {
		t.Errorf("CreateShard called second time, got: %v", err)
	}

	// a shard that was never created must report ErrNoNode
	if _, err := ts.GetShard("test_keyspace", "666"); err != topo.ErrNoNode {
		t.Errorf("GetShard(666): %v", err)
	}

	shardInfo, err := ts.GetShard("test_keyspace", "b0-c0")
	if err != nil {
		t.Errorf("GetShard: %v", err)
	}
	// CreateShard is expected to have derived the KeyRange from the name
	if want := newKeyRange("b0-c0"); shardInfo.KeyRange != want {
		t.Errorf("shardInfo.KeyRange: want %v, got %v", want, shardInfo.KeyRange)
	}

	// populate every updatable field, write it back
	master := topo.TabletAlias{Cell: "ny", Uid: 1}
	shardInfo.MasterAlias = master
	shardInfo.KeyRange = newKeyRange("b0-c0")
	shardInfo.ServedTypes = []topo.TabletType{topo.TYPE_MASTER, topo.TYPE_REPLICA, topo.TYPE_RDONLY}
	shardInfo.SourceShards = []topo.SourceShard{
		topo.SourceShard{
			Uid:      1,
			Keyspace: "source_ks",
			Shard:    "b8-c0",
			KeyRange: newKeyRange("b8-c0"),
			Tables:   []string{"table1", "table2"},
		},
	}
	if err := topo.UpdateShard(ts, shardInfo); err != nil {
		t.Errorf("UpdateShard: %v", err)
	}

	// re-read and verify every field round-tripped unchanged
	shardInfo, err = ts.GetShard("test_keyspace", "b0-c0")
	if err != nil {
		t.Errorf("GetShard: %v", err)
	}
	if shardInfo.MasterAlias != master {
		t.Errorf("after UpdateShard: shardInfo.MasterAlias got %v", shardInfo.MasterAlias)
	}
	if shardInfo.KeyRange != newKeyRange("b0-c0") {
		t.Errorf("after UpdateShard: shardInfo.KeyRange got %v", shardInfo.KeyRange)
	}
	if len(shardInfo.ServedTypes) != 3 || shardInfo.ServedTypes[0] != topo.TYPE_MASTER || shardInfo.ServedTypes[1] != topo.TYPE_REPLICA || shardInfo.ServedTypes[2] != topo.TYPE_RDONLY {
		t.Errorf("after UpdateShard: shardInfo.ServedTypes got %v", shardInfo.ServedTypes)
	}
	if len(shardInfo.SourceShards) != 1 || shardInfo.SourceShards[0].Uid != 1 || shardInfo.SourceShards[0].Keyspace != "source_ks" || shardInfo.SourceShards[0].Shard != "b8-c0" || shardInfo.SourceShards[0].KeyRange != newKeyRange("b8-c0") || len(shardInfo.SourceShards[0].Tables) != 2 || shardInfo.SourceShards[0].Tables[0] != "table1" || shardInfo.SourceShards[0].Tables[1] != "table2" {
		t.Errorf("after UpdateShard: shardInfo.SourceShards got %v", shardInfo.SourceShards)
	}

	shards, err := ts.GetShardNames("test_keyspace")
	if err != nil {
		t.Errorf("GetShardNames: %v", err)
	}
	if len(shards) != 1 || shards[0] != "b0-c0" {
		t.Errorf(`GetShardNames: want [ "b0-c0" ], got %v`, shards)
	}

	// listing shards of a missing keyspace must report ErrNoNode
	if _, err := ts.GetShardNames("test_keyspace666"); err != topo.ErrNoNode {
		t.Errorf("GetShardNames(666): %v", err)
	}
}
// tabletExternallyReparentedLocked performs the work of an external
// reparent once the shard is locked: it promotes `tablet` to master in
// the replication graph, restarts all other tablets against it, updates
// the Shard record's MasterAlias, and rebuilds the serving graph.
// Reparent progress is reported through events; the deferred handler
// dispatches a "failed: ..." update on any error return (err is the
// named result so the defer sees the final value).
func tabletExternallyReparentedLocked(ts topo.Server, tablet *topo.TabletInfo, actionTimeout, lockTimeout time.Duration, interrupted chan struct{}) (err error) {
	// read the shard, make sure again the master is not already good.
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := ts.GetShardCritical(tablet.Keyspace, tablet.Shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == tablet.Alias {
		return fmt.Errorf("this tablet is already the master")
	}

	// Read the tablets, make sure the master elect is known to the shard
	// (it's this tablet, so it better be!).
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(ts, tablet.Keyspace, tablet.Shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[tablet.Alias]
	if !ok {
		return fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *tablet.Tablet,
	}
	// old master may be missing from a partial map; only set when found
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	event.DispatchUpdate(ev, "starting external from tablet")

	// we fix the new master in the replication graph
	event.DispatchUpdate(ev, "mark ourself as new master")
	err = updateReplicationGraphForPromotedSlave(ts, tablet)
	if err != nil {
		// This suggests we can't talk to topo server. This is bad.
		return fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err)
	}

	// Once this tablet is promoted, remove it from our maps
	delete(slaveTabletMap, tablet.Alias)
	delete(masterTabletMap, tablet.Alias)

	// Then fix all the slaves, including the old master. This
	// last step is very likely to time out for some tablets (one
	// random guy is dead, the old master is dead, ...). We
	// execute them all in parallel until we get to
	// wr.ActionTimeout(). After this, no other action with a
	// timeout is executed, so even if we got to the timeout,
	// we're still good.
	event.DispatchUpdate(ev, "restarting slaves")
	logger := logutil.NewConsoleLogger()
	ai := initiator.NewActionInitiator(ts)
	topotools.RestartSlavesExternal(ts, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error {
		return ai.RpcSlaveWasRestarted(ti, swrd, actionTimeout)
	})

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	// (cells == nil means rebuild all cells)
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != tablet.Alias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	log.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = tablet.Alias
	if err = topo.UpdateShard(ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	log.Infof("Rebuilding shard serving graph data")
	if err = topotools.RebuildShard(logger, ts, tablet.Keyspace, tablet.Shard, cells, lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// shardExternallyReparentedLocked performs a shard-level external
// reparent while the shard lock is held: it validates the master-elect,
// runs the reparent over all tablets, updates the Shard record's
// MasterAlias, and rebuilds the serving graph. Progress is reported
// through events; the deferred handler dispatches "failed: ..." on any
// error return (err is the named result so the defer sees the final value).
func (wr *Wrangler) shardExternallyReparentedLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias) (err error) {
	// read the shard, make sure the master is not already good.
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == masterElectTabletAlias {
		return fmt.Errorf("master-elect tablet %v is already master", masterElectTabletAlias)
	}

	// Read the tablets, make sure the master elect is known to us.
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		wr.logger.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *masterElectTablet.Tablet,
	}
	// old master may be missing from a partial map; only set when found
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	err = wr.reparentShardExternal(ev, slaveTabletMap, masterTabletMap, masterElectTablet)
	if err != nil {
		// the shard record still points at the old master; do not rebuild
		wr.logger.Infof("Skipping shard rebuild with failed reparent")
		return err
	}

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	// (cells == nil means rebuild all cells)
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != masterElectTabletAlias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	wr.logger.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = masterElectTabletAlias
	if err = topo.UpdateShard(wr.ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	wr.logger.Infof("Rebuilding shard serving graph data")
	if _, err = topotools.RebuildShard(wr.logger, wr.ts, masterElectTablet.Keyspace, masterElectTablet.Shard, cells, wr.lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
// Scrap a tablet. If force is used, we write to topo.Server
// directly and don't remote-execute the command.
//
// If we scrap the master for a shard, we will clear its record
// from the Shard object (only if that was the right master)
//
// When the tablet was not in the serving graph, or skipRebuild is set,
// the function returns early without rebuilding.
func (wr *Wrangler) Scrap(ctx context.Context, tabletAlias topo.TabletAlias, force, skipRebuild bool) error {
	// load the tablet, see if we'll need to rebuild
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	rebuildRequired := ti.IsInServingGraph()
	wasMaster := ti.Type == topo.TYPE_MASTER

	// force writes topo directly; otherwise ask the tablet to scrap itself
	if force {
		err = topotools.Scrap(ctx, wr.ts, ti.Alias, force)
	} else {
		err = wr.tmc.Scrap(ctx, ti)
	}
	if err != nil {
		return err
	}

	if !rebuildRequired {
		wr.Logger().Infof("Rebuild not required")
		return nil
	}
	if skipRebuild {
		wr.Logger().Warningf("Rebuild required, but skipping it")
		return nil
	}

	// update the Shard object if the master was scrapped
	if wasMaster {
		actionNode := actionnode.UpdateShard()
		lockPath, err := wr.lockShard(ctx, ti.Keyspace, ti.Shard, actionNode)
		if err != nil {
			return err
		}

		// read the shard with the lock
		si, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard)
		if err != nil {
			// unlockShard is given the error so it is recorded with the unlock
			return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err)
		}

		// update it if the right alias is there
		if topo.TabletAliasEqual(si.MasterAlias, topo.TabletAliasToProto(tabletAlias)) {
			si.MasterAlias = nil

			// write it back
			if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
				return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err)
			}
		} else {
			// someone else is recorded as master: leave the record alone
			wr.Logger().Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias)
		}

		// and unlock
		if err := wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil {
			return err
		}
	}

	// and rebuild the original shard
	_, err = wr.RebuildShardGraph(ctx, ti.Keyspace, ti.Shard, []string{ti.Alias.Cell})
	return err
}
func (wr *Wrangler) removeShardCell(ctx context.Context, keyspace, shard, cell string, force, recursive bool) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } // check the cell is in the list already if !topo.InCellList(cell, shardInfo.Cells) { return fmt.Errorf("cell %v in not in shard info", cell) } // check the master alias is not in the cell if shardInfo.MasterAlias.Cell == cell { return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell) } // get the ShardReplication object in the cell sri, err := wr.ts.GetShardReplication(ctx, cell, keyspace, shard) switch err { case nil: if recursive { wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard) for _, node := range sri.Nodes { // We don't care about scrapping or updating the replication graph, // because we're about to delete the entire replication graph. wr.Logger().Infof("Deleting tablet %v", node.TabletAlias) if err := wr.TopoServer().DeleteTablet(ctx, topo.ProtoToTabletAlias(node.TabletAlias)); err != nil && err != topo.ErrNoNode { return fmt.Errorf("can't delete tablet %v: %v", node.TabletAlias, err) } } } else if len(sri.Nodes) > 0 { return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.Nodes)) } // ShardReplication object is now useless, remove it if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode { return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err) } // Rebuild the shard serving graph to reflect the tablets we deleted. // This must be done before removing the cell from the global shard record, // since this cell will be skipped by all future rebuilds. 
if _, err := wr.RebuildShardGraph(ctx, keyspace, shard, []string{cell}); err != nil { return fmt.Errorf("can't rebuild serving graph for shard %v/%v in cell %v: %v", keyspace, shard, cell, err) } // we keep going case topo.ErrNoNode: // no ShardReplication object, we keep going default: // we can't get the object, assume topo server is down there, // so we look at force flag if !force { return err } wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell) } // now we can update the shard wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard) newCells := make([]string, 0, len(shardInfo.Cells)-1) for _, c := range shardInfo.Cells { if c != cell { newCells = append(newCells, c) } } shardInfo.Cells = newCells return topo.UpdateShard(ctx, wr.ts, shardInfo) }
// InitTablet initializes the tablet record if necessary. func (agent *ActionAgent) InitTablet(port, securePort, gRPCPort int) error { // only enabled if one of init_tablet_type (when healthcheck // is disabled) or init_keyspace (when healthcheck is enabled) // is passed in, then check other parameters if *initTabletType == "" && *initKeyspace == "" { return nil } // figure out our default target type var tabletType topo.TabletType if *initTabletType != "" { if *targetTabletType != "" { log.Fatalf("cannot specify both target_tablet_type and init_tablet_type parameters (as they might conflict)") } // use the type specified on the command line tabletType = topo.TabletType(*initTabletType) if !topo.IsTypeInList(tabletType, topo.AllTabletTypes) { log.Fatalf("InitTablet encountered unknown init_tablet_type '%v'", *initTabletType) } if tabletType == topo.TYPE_MASTER || tabletType == topo.TYPE_SCRAP { // We disallow TYPE_MASTER, so we don't have to change // shard.MasterAlias, and deal with the corner cases. // We also disallow TYPE_SCRAP, obviously. log.Fatalf("init_tablet_type cannot be %v", tabletType) } } else if *targetTabletType != "" { if tabletType := topo.TabletType(*targetTabletType); tabletType == topo.TYPE_MASTER { log.Fatalf("target_tablet_type cannot be '%v'. 
Use '%v' instead.", tabletType, topo.TYPE_REPLICA) } // use spare, the healthcheck will turn us into what // we need to be eventually tabletType = topo.TYPE_SPARE } else { log.Fatalf("if init tablet is enabled, one of init_tablet_type or target_tablet_type needs to be specified") } // create a context for this whole operation ctx, cancel := context.WithTimeout(agent.batchCtx, *initTimeout) defer cancel() // if we're assigned to a shard, make sure it exists, see if // we are its master, and update its cells list if necessary if tabletType != topo.TYPE_IDLE { if *initKeyspace == "" || *initShard == "" { log.Fatalf("if init tablet is enabled and the target type is not idle, init_keyspace and init_shard also need to be specified") } shard, _, err := topo.ValidateShardName(*initShard) if err != nil { log.Fatalf("cannot validate shard name: %v", err) } log.Infof("Reading shard record %v/%v", *initKeyspace, shard) // read the shard, create it if necessary si, err := topotools.GetOrCreateShard(ctx, agent.TopoServer, *initKeyspace, shard) if err != nil { return fmt.Errorf("InitTablet cannot GetOrCreateShard shard: %v", err) } if si.MasterAlias == agent.TabletAlias { // we are the current master for this shard (probably // means the master tablet process was just restarted), // so InitTablet as master. tabletType = topo.TYPE_MASTER } // See if we need to add the tablet's cell to the shard's cell // list. If we do, it has to be under the shard lock. 
if !si.HasCell(agent.TabletAlias.Cell) { actionNode := actionnode.UpdateShard() lockPath, err := actionNode.LockShard(ctx, agent.TopoServer, *initKeyspace, shard) if err != nil { return fmt.Errorf("LockShard(%v/%v) failed: %v", *initKeyspace, shard, err) } // re-read the shard with the lock si, err = agent.TopoServer.GetShard(ctx, *initKeyspace, shard) if err != nil { return actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, err) } // see if we really need to update it now if !si.HasCell(agent.TabletAlias.Cell) { si.Cells = append(si.Cells, agent.TabletAlias.Cell) // write it back if err := topo.UpdateShard(ctx, agent.TopoServer, si); err != nil { return actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, err) } } // and unlock if err := actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, nil); err != nil { return err } } } log.Infof("Initializing the tablet for type %v", tabletType) // figure out the hostname hostname := *tabletHostname if hostname == "" { var err error hostname, err = netutil.FullyQualifiedHostname() if err != nil { return err } } // create and populate tablet record tablet := &topo.Tablet{ Alias: agent.TabletAlias, Hostname: hostname, Portmap: make(map[string]int), Keyspace: *initKeyspace, Shard: *initShard, Type: tabletType, DbNameOverride: *initDbNameOverride, Tags: initTags, } if port != 0 { tablet.Portmap["vt"] = port } if securePort != 0 { tablet.Portmap["vts"] = securePort } if gRPCPort != 0 { tablet.Portmap["grpc"] = gRPCPort } if err := tablet.Complete(); err != nil { return fmt.Errorf("InitTablet tablet.Complete failed: %v", err) } // now try to create the record err := topo.CreateTablet(ctx, agent.TopoServer, tablet) switch err { case nil: // it worked, we're good, can update the replication graph if tablet.IsInReplicationGraph() { if err := topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet); err != nil { return 
fmt.Errorf("UpdateTabletReplicationData failed: %v", err) } } case topo.ErrNodeExists: // The node already exists, will just try to update // it. So we read it first. oldTablet, err := agent.TopoServer.GetTablet(ctx, tablet.Alias) if err != nil { fmt.Errorf("InitTablet failed to read existing tablet record: %v", err) } // Sanity check the keyspace and shard if oldTablet.Keyspace != tablet.Keyspace || oldTablet.Shard != tablet.Shard { return fmt.Errorf("InitTablet failed because existing tablet keyspace and shard %v/%v differ from the provided ones %v/%v", oldTablet.Keyspace, oldTablet.Shard, tablet.Keyspace, tablet.Shard) } // And overwrite the rest *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(ctx, agent.TopoServer, oldTablet); err != nil { return fmt.Errorf("UpdateTablet failed: %v", err) } // Note we don't need to UpdateTabletReplicationData // as the tablet already existed with the right data // in the replication graph default: return fmt.Errorf("CreateTablet failed: %v", err) } // and now update the serving graph. Note we do that in any case, // to clean any inaccurate record from any part of the serving graph. if tabletType != topo.TYPE_IDLE { if err := topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, tablet); err != nil { return fmt.Errorf("UpdateTabletEndpoints failed: %v", err) } } return nil }
func TestUpdateTabletEndpoints(t *testing.T) { ctx := context.Background() cell := "test_cell" // Set up topology. ts := zktopo.NewTestServer(t, []string{cell}) si, err := GetOrCreateShard(ctx, ts, testKeyspace, testShard) if err != nil { t.Fatalf("GetOrCreateShard: %v", err) } si.Cells = append(si.Cells, cell) if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard: %v", err) } tablet1 := addTablet(ctx, t, ts, 1, cell, topo.TYPE_MASTER).Tablet tablet2 := addTablet(ctx, t, ts, 2, cell, topo.TYPE_REPLICA).Tablet update := func(tablet *topo.Tablet) { if err := UpdateTabletEndpoints(ctx, ts, tablet); err != nil { t.Fatalf("UpdateTabletEndpoints(%v): %v", tablet, err) } } expect := func(tabletType topo.TabletType, want int) { eps, _, err := ts.GetEndPoints(ctx, cell, testKeyspace, testShard, tabletType) if err != nil && err != topo.ErrNoNode { t.Errorf("GetEndPoints(%v): %v", tabletType, err) return } var got int if err == nil { got = len(eps.Entries) if got == 0 { t.Errorf("len(EndPoints) = 0, expected ErrNoNode instead") } } if got != want { t.Errorf("len(GetEndPoints(%v)) = %v, want %v. EndPoints = %v", tabletType, len(eps.Entries), want, eps) } } // Update tablets. This should create the serving graph dirs too. update(tablet1) expect(topo.TYPE_MASTER, 1) update(tablet2) expect(topo.TYPE_REPLICA, 1) // Re-update an identical tablet. update(tablet1) expect(topo.TYPE_MASTER, 1) // Change a tablet, but keep it the same type. tablet2.Hostname += "extra" update(tablet2) expect(topo.TYPE_REPLICA, 1) // Move the master to replica. tablet1.Type = topo.TYPE_REPLICA update(tablet1) expect(topo.TYPE_MASTER, 0) expect(topo.TYPE_REPLICA, 2) // Take a replica out of serving. tablet1.Type = topo.TYPE_SPARE update(tablet1) expect(topo.TYPE_MASTER, 0) expect(topo.TYPE_REPLICA, 1) // Put it back to serving. tablet1.Type = topo.TYPE_REPLICA update(tablet1) expect(topo.TYPE_MASTER, 0) expect(topo.TYPE_REPLICA, 2) // Move a replica to master. 
tablet2.Type = topo.TYPE_MASTER update(tablet2) expect(topo.TYPE_MASTER, 1) expect(topo.TYPE_REPLICA, 1) }