// FIXME(msolomon) This validate presumes the master is up and running. // Even when that isn't true, there are validation processes that might be valuable. func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err) return } aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err) return } tabletMap, _ := topo.GetTabletMap(ctx, wr.ts, aliases) var masterAlias topo.TabletAlias for _, alias := range aliases { tabletInfo, ok := tabletMap[alias] if !ok { results <- fmt.Errorf("tablet %v not found in map", alias) continue } if tabletInfo.Type == topo.TYPE_MASTER { if masterAlias.Cell != "" { results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, masterAlias, alias) } else { masterAlias = alias } } } if masterAlias.Cell == "" { results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard) } else if !topo.TabletAliasEqual(shardInfo.MasterAlias, topo.TabletAliasToProto(masterAlias)) { results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, masterAlias, shardInfo.MasterAlias) } for _, alias := range aliases { wg.Add(1) go func(alias topo.TabletAlias) { defer wg.Done() if err := topo.Validate(ctx, wr.ts, alias); err != nil { results <- fmt.Errorf("Validate(%v) failed: %v", alias, err) } else { wr.Logger().Infof("tablet %v is valid", alias) } }(alias) } if pingTablets { wr.validateReplication(ctx, shardInfo, tabletMap, results) wr.pingTablets(ctx, tabletMap, wg, results) } return }
// SlaveWasRestarted is part of the tmclient.TabletManagerClient interface func (client *Client) SlaveWasRestarted(ctx context.Context, tablet *topo.TabletInfo, args *actionnode.SlaveWasRestartedArgs) error { cc, c, err := client.dial(ctx, tablet) if err != nil { return err } defer cc.Close() _, err = c.SlaveWasRestarted(ctx, &pb.SlaveWasRestartedRequest{ Parent: topo.TabletAliasToProto(args.Parent), }) return err }
// SetMaster is part of the tmclient.TabletManagerClient interface func (client *Client) SetMaster(ctx context.Context, tablet *topo.TabletInfo, parent topo.TabletAlias, timeCreatedNS int64, forceStartSlave bool) error { cc, c, err := client.dial(ctx, tablet) if err != nil { return err } defer cc.Close() _, err = c.SetMaster(ctx, &pb.SetMasterRequest{ Parent: topo.TabletAliasToProto(parent), TimeCreatedNs: timeCreatedNS, ForceStartSlave: forceStartSlave, }) return err }
// InitSlave is part of the tmclient.TabletManagerClient interface func (client *Client) InitSlave(ctx context.Context, tablet *topo.TabletInfo, parent topo.TabletAlias, replicationPosition myproto.ReplicationPosition, timeCreatedNS int64) error { cc, c, err := client.dial(ctx, tablet) if err != nil { return err } defer cc.Close() _, err = c.InitSlave(ctx, &pb.InitSlaveRequest{ Parent: topo.TabletAliasToProto(parent), ReplicationPosition: myproto.ReplicationPositionToProto(replicationPosition), TimeCreatedNs: timeCreatedNS, }) return err }
// PopulateReparentJournal is part of the tmclient.TabletManagerClient interface func (client *Client) PopulateReparentJournal(ctx context.Context, tablet *topo.TabletInfo, timeCreatedNS int64, actionName string, masterAlias topo.TabletAlias, pos myproto.ReplicationPosition) error { cc, c, err := client.dial(ctx, tablet) if err != nil { return err } defer cc.Close() _, err = c.PopulateReparentJournal(ctx, &pb.PopulateReparentJournalRequest{ TimeCreatedNs: timeCreatedNS, ActionName: actionName, MasterAlias: topo.TabletAliasToProto(masterAlias), ReplicationPosition: myproto.ReplicationPositionToProto(pos), }) return err }
func TestReparentTablet(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second) // create shard and tablets if err := topo.CreateShard(ctx, ts, "test_keyspace", "0"); err != nil { t.Fatalf("CreateShard failed: %v", err) } master := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_MASTER) slave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA) // mark the master inside the shard si, err := ts.GetShard(ctx, "test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.MasterAlias = topo.TabletAliasToProto(master.Tablet.Alias) if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // master action loop (to initialize host and port) master.StartActionLoop(t, wr) defer master.StopActionLoop(t) // slave loop slave.FakeMysqlDaemon.SetMasterCommandsInput = fmt.Sprintf("%v:%v", master.Tablet.Hostname, master.Tablet.Portmap["mysql"]) slave.FakeMysqlDaemon.SetMasterCommandsResult = []string{"set master cmd 1"} slave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ "set master cmd 1", } slave.StartActionLoop(t, wr) defer slave.StopActionLoop(t) // run ReparentTablet if err := wr.ReparentTablet(ctx, slave.Tablet.Alias); err != nil { t.Fatalf("ReparentTablet failed: %v", err) } // check what was run if err := slave.FakeMysqlDaemon.CheckSuperQueryList(); err != nil { t.Fatalf("slave.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err) } }
// InitTablet creates or updates a tablet. If no parent is specified // in the tablet, and the tablet has a slave type, we will find the // appropriate parent. If createShardAndKeyspace is true and the // parent keyspace or shard don't exist, they will be created. If // update is true, and a tablet with the same ID exists, update it. // If Force is true, and a tablet with the same ID already exists, it // will be scrapped and deleted, and then recreated. func (wr *Wrangler) InitTablet(ctx context.Context, tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error { if err := topo.TabletComplete(tablet); err != nil { return err } if topo.IsInReplicationGraph(tablet.Type) { // get the shard, possibly creating it var err error var si *topo.ShardInfo if createShardAndKeyspace { // create the parent keyspace and shard if needed si, err = topotools.GetOrCreateShard(ctx, wr.ts, tablet.Keyspace, tablet.Shard) } else { si, err = wr.ts.GetShard(ctx, tablet.Keyspace, tablet.Shard) if err == topo.ErrNoNode { return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard") } } // get the shard, checks a couple things if err != nil { return fmt.Errorf("cannot get (or create) shard %v/%v: %v", tablet.Keyspace, tablet.Shard, err) } if key.ProtoToKeyRange(si.KeyRange) != tablet.KeyRange { return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange) } if tablet.Type == topo.TYPE_MASTER && !topo.TabletAliasIsZero(si.MasterAlias) && topo.ProtoToTabletAlias(si.MasterAlias) != tablet.Alias && !force { return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard) } // update the shard record if needed if err := wr.updateShardCellsAndMaster(ctx, si, topo.TabletAliasToProto(tablet.Alias), topo.TabletTypeToProto(tablet.Type), force); err != nil { return err } } err := topo.CreateTablet(ctx, wr.ts, tablet) if err != nil && err == topo.ErrNodeExists { // Try to update nicely, but if it fails fall back to force behavior. if update || force { oldTablet, err := wr.ts.GetTablet(ctx, tablet.Alias) if err != nil { wr.Logger().Warningf("failed reading tablet %v: %v", tablet.Alias, err) } else { if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard { *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(ctx, wr.ts, oldTablet); err != nil { wr.Logger().Warningf("failed updating tablet %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { if !topo.IsInReplicationGraph(tablet.Type) { return nil } if err := topo.UpdateTabletReplicationData(ctx, wr.ts, tablet); err != nil { wr.Logger().Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { return nil } } } } } if force { if err = wr.Scrap(ctx, tablet.Alias, force, false); err != nil { wr.Logger().Errorf("failed scrapping tablet %v: %v", tablet.Alias, err) return err } if err := wr.ts.DeleteTablet(ctx, tablet.Alias); err != nil { // we ignore this wr.Logger().Errorf("failed deleting tablet %v: %v", tablet.Alias, err) } return topo.CreateTablet(ctx, wr.ts, tablet) } } return err }
// Scrap a tablet. If force is used, we write to topo.Server // directly and don't remote-execute the command. // // If we scrap the master for a shard, we will clear its record // from the Shard object (only if that was the right master) func (wr *Wrangler) Scrap(ctx context.Context, tabletAlias topo.TabletAlias, force, skipRebuild bool) error { // load the tablet, see if we'll need to rebuild ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } rebuildRequired := ti.IsInServingGraph() wasMaster := ti.Type == topo.TYPE_MASTER if force { err = topotools.Scrap(ctx, wr.ts, ti.Alias, force) } else { err = wr.tmc.Scrap(ctx, ti) } if err != nil { return err } if !rebuildRequired { wr.Logger().Infof("Rebuild not required") return nil } if skipRebuild { wr.Logger().Warningf("Rebuild required, but skipping it") return nil } // update the Shard object if the master was scrapped if wasMaster { actionNode := actionnode.UpdateShard() lockPath, err := wr.lockShard(ctx, ti.Keyspace, ti.Shard, actionNode) if err != nil { return err } // read the shard with the lock si, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard) if err != nil { return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err) } // update it if the right alias is there if topo.TabletAliasEqual(si.MasterAlias, topo.TabletAliasToProto(tabletAlias)) { si.MasterAlias = nil // write it back if err := topo.UpdateShard(ctx, wr.ts, si); err != nil { return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err) } } else { wr.Logger().Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias) } // and unlock if err := wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil { return err } } // and rebuild the original shard _, err = wr.RebuildShardGraph(ctx, ti.Keyspace, ti.Shard, []string{ti.Alias.Cell}) return err }
// TestInitTablet will test the InitTablet code creates / updates the // tablet node correctly. Note we modify global parameters (the flags) // so this has to be in one test. func TestInitTablet(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) tabletAlias := topo.TabletAlias{ Cell: "cell1", Uid: 1, } // start with idle, and a tablet record that doesn't exist port := 1234 gRPCPort := 3456 mysqlDaemon := mysqlctl.NewFakeMysqlDaemon() agent := &ActionAgent{ TopoServer: ts, TabletAlias: tabletAlias, MysqlDaemon: mysqlDaemon, DBConfigs: nil, SchemaOverrides: nil, BinlogPlayerMap: nil, LockTimeout: 10 * time.Second, batchCtx: ctx, History: history.New(historyLength), lastHealthMapCount: new(stats.Int), _healthy: fmt.Errorf("healthcheck not run yet"), } *initTabletType = "idle" *tabletHostname = "localhost" if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("NewTestActionAgent(idle) failed: %v", err) } ti, err := ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Type != topo.TYPE_IDLE { t.Errorf("wrong type for tablet: %v", ti.Type) } if ti.Hostname != "localhost" { t.Errorf("wrong hostname for tablet: %v", ti.Hostname) } if ti.Portmap["vt"] != port { t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"]) } if ti.Portmap["grpc"] != gRPCPort { t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"]) } // try again now that the node exists port = 3456 if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("NewTestActionAgent(idle again) failed: %v", err) } ti, err = ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Portmap["vt"] != port { t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"]) } if ti.Portmap["grpc"] != gRPCPort { t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"]) } // try with a keyspace and shard on the previously idle tablet, // should fail *initTabletType = "replica" *initKeyspace = "test_keyspace" *initShard = "-80" if err := agent.InitTablet(port, gRPCPort); err == nil || !strings.Contains(err.Error(), "InitTablet failed because existing tablet keyspace and shard / differ from the provided ones test_keyspace/-80") { t.Fatalf("InitTablet(type over idle) didn't fail correctly: %v", err) } // now let's use a different real tablet in a shard, that will create // the keyspace and shard. tabletAlias = topo.TabletAlias{ Cell: "cell1", Uid: 2, } agent.TabletAlias = tabletAlias if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("InitTablet(type) failed: %v", err) } si, err := ts.GetShard(ctx, "test_keyspace", "-80") if err != nil { t.Fatalf("GetShard failed: %v", err) } if len(si.Cells) != 1 || si.Cells[0] != "cell1" { t.Errorf("shard.Cells not updated properly: %v", si) } ti, err = ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Type != topo.TYPE_REPLICA { t.Errorf("wrong tablet type: %v", ti.Type) } // try to init again, this time with health check on *initTabletType = "" *targetTabletType = "replica" if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("InitTablet(type, healthcheck) failed: %v", err) } ti, err = ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Type != topo.TYPE_SPARE { t.Errorf("wrong tablet type: %v", ti.Type) } // update shard's master to our alias, then try to init again si, err = ts.GetShard(ctx, "test_keyspace", "-80") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.MasterAlias = topo.TabletAliasToProto(tabletAlias) if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("InitTablet(type, healthcheck) failed: %v", err) } ti, err = ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Type != topo.TYPE_MASTER { t.Errorf("wrong tablet type: %v", ti.Type) } // init again with the tablet_type set, no healthcheck // (also check db name override and tags here) *initTabletType = "replica" *targetTabletType = "" *initDbNameOverride = "DBNAME" initTags.Set("aaa:bbb") if err := agent.InitTablet(port, gRPCPort); err != nil { t.Fatalf("InitTablet(type, healthcheck) failed: %v", err) } ti, err = ts.GetTablet(ctx, tabletAlias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if ti.Type != topo.TYPE_MASTER { t.Errorf("wrong tablet type: %v", ti.Type) } if ti.DbNameOverride != "DBNAME" { t.Errorf("wrong tablet DbNameOverride: %v", ti.DbNameOverride) } if len(ti.Tags) != 1 || ti.Tags["aaa"] != "bbb" { t.Errorf("wrong tablet tags: %v", ti.Tags) } }
// finalizeTabletExternallyReparented performs slow, synchronized reconciliation // tasks that ensure topology is self-consistent, and then marks the reparent as // finished by updating the global shard record. func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context, si *topo.ShardInfo, ev *events.Reparent) (err error) { var wg sync.WaitGroup var errs concurrency.AllErrorRecorder oldMasterAlias := si.MasterAlias // Update the tablet records and serving graph for the old and new master concurrently. event.DispatchUpdate(ev, "updating old and new master tablet records") log.Infof("finalizeTabletExternallyReparented: updating tablet records") wg.Add(1) go func() { defer wg.Done() // Update our own record to master. var updatedTablet *topo.Tablet err := topo.UpdateTabletFields(ctx, agent.TopoServer, agent.TabletAlias, func(tablet *topo.Tablet) error { tablet.Type = topo.TYPE_MASTER tablet.Health = nil updatedTablet = tablet return nil }) if err != nil { errs.RecordError(err) return } // Update the serving graph for the tablet. if updatedTablet != nil { errs.RecordError( topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, updatedTablet)) } }() if !topo.TabletAliasIsZero(oldMasterAlias) { wg.Add(1) go func() { // Force the old master to spare. var oldMasterTablet *topo.Tablet err := topo.UpdateTabletFields(ctx, agent.TopoServer, topo.ProtoToTabletAlias(oldMasterAlias), func(tablet *topo.Tablet) error { tablet.Type = topo.TYPE_SPARE oldMasterTablet = tablet return nil }) if err != nil { errs.RecordError(err) wg.Done() return } if oldMasterTablet != nil { // We now know more about the old master, so add it to event data. ev.OldMaster = *oldMasterTablet // Update the serving graph. errs.RecordError( topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, oldMasterTablet)) wg.Done() // Tell the old master to refresh its state. We don't need to wait for it. tmc := tmclient.NewTabletManagerClient() tmc.RefreshState(ctx, topo.NewTabletInfo(oldMasterTablet, -1)) } }() } tablet := agent.Tablet() // Wait for the tablet records to be updated. At that point, any rebuild will // see the new master, so we're ready to mark the reparent as done in the // global shard record. wg.Wait() if errs.HasErrors() { return errs.Error() } // Update the master field in the global shard record. We don't use a lock // here anymore. The lock was only to ensure that the global shard record // didn't get modified between the time when we read it and the time when we // write it back. Now we use an update loop pattern to do that instead. event.DispatchUpdate(ev, "updating global shard record") log.Infof("finalizeTabletExternallyReparented: updating global shard record") si, err = topo.UpdateShardFields(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard, func(shard *pb.Shard) error { shard.MasterAlias = topo.TabletAliasToProto(tablet.Alias) return nil }) if err != nil { return err } // We already took care of updating the serving graph for the old and new masters. // All that's left now is in case of a cross-cell reparent, we need to update the // master cell setting in the SrvShard records of all cells. if oldMasterAlias == nil || oldMasterAlias.Cell != tablet.Alias.Cell { event.DispatchUpdate(ev, "rebuilding shard serving graph") log.Infof("finalizeTabletExternallyReparented: updating SrvShard in all cells for cross-cell reparent") if err := topotools.UpdateAllSrvShards(ctx, agent.TopoServer, si); err != nil { return err } } event.DispatchUpdate(ev, "finished") return nil }
func TestShardReplicationStatuses(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second) // create shard and tablets if err := topo.CreateShard(ctx, ts, "test_keyspace", "0"); err != nil { t.Fatalf("CreateShard failed: %v", err) } master := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_MASTER) slave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA) // mark the master inside the shard si, err := ts.GetShard(ctx, "test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.MasterAlias = topo.TabletAliasToProto(master.Tablet.Alias) if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // master action loop (to initialize host and port) master.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{ GTIDSet: myproto.MariadbGTID{ Domain: 5, Server: 456, Sequence: 892, }, } master.StartActionLoop(t, wr) defer master.StopActionLoop(t) // slave loop slave.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{ GTIDSet: myproto.MariadbGTID{ Domain: 5, Server: 456, Sequence: 890, }, } slave.FakeMysqlDaemon.CurrentMasterHost = master.Tablet.Hostname slave.FakeMysqlDaemon.CurrentMasterPort = master.Tablet.Portmap["mysql"] slave.StartActionLoop(t, wr) defer slave.StopActionLoop(t) // run ShardReplicationStatuses ti, rs, err := wr.ShardReplicationStatuses(ctx, "test_keyspace", "0") if err != nil { t.Fatalf("ShardReplicationStatuses failed: %v", err) } // check result (make master first in the array) if len(ti) != 2 || len(rs) != 2 { t.Fatalf("ShardReplicationStatuses returned wrong results: %v %v", ti, rs) } if ti[0].Alias == slave.Tablet.Alias { ti[0], ti[1] = ti[1], ti[0] rs[0], rs[1] = rs[1], rs[0] } if ti[0].Alias != master.Tablet.Alias || ti[1].Alias != slave.Tablet.Alias || rs[0].MasterHost != "" || rs[1].MasterHost != master.Tablet.Hostname { t.Fatalf("ShardReplicationStatuses returend wrong results: %v %v", ti, rs) } }
func TestPermissions(t *testing.T) { // Initialize our environment ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second) vp := NewVtctlPipe(t, ts) defer vp.Close() master := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) replica := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA) // mark the master inside the shard si, err := ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard) if err != nil { t.Fatalf("GetShard failed: %v", err) } si.MasterAlias = topo.TabletAliasToProto(master.Tablet.Alias) if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // master will be asked for permissions master.FakeMysqlDaemon.FetchSuperQueryMap = map[string]*mproto.QueryResult{ "SELECT * FROM mysql.user": &mproto.QueryResult{ Fields: []mproto.Field{ mproto.Field{ Name: "Host", Type: 254, Flags: 16515}, mproto.Field{ Name: "User", Type: 254, Flags: 16515}, mproto.Field{ Name: "Password", Type: 254, Flags: 129}, mproto.Field{ Name: "Select_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Insert_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Update_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Delete_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Drop_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Reload_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Shutdown_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Process_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "File_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Grant_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "References_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Index_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Show_db_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Super_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_tmp_table_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Lock_tables_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Execute_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Repl_slave_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Repl_client_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Show_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_user_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Event_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Trigger_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_tablespace_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "ssl_type", Type: 254, Flags: 257}, mproto.Field{ Name: "ssl_cipher", Type: 252, Flags: 4241}, mproto.Field{ Name: "x509_issuer", Type: 252, Flags: 4241}, mproto.Field{ Name: "x509_subject", Type: 252, Flags: 4241}, mproto.Field{ Name: "max_questions", Type: 3, Flags: 32801}, mproto.Field{ Name: "max_updates", Type: 3, Flags: 32801}, mproto.Field{ Name: "max_connections", Type: 3, Flags: 32801}, mproto.Field{ Name: "max_user_connections", Type: 3, Flags: 32769}, mproto.Field{ Name: "plugin", Type: 254, Flags: 1}, mproto.Field{ Name: "authentication_string", Type: 252, Flags: 4241}, mproto.Field{ Name: "password_expired", Type: 254, Flags: 257}, mproto.Field{ Name: "is_role", Type: 254, Flags: 257}}, RowsAffected: 0x6, InsertId: 0x0, Rows: [][]sqltypes.Value{ []sqltypes.Value{ sqltypes.MakeString([]byte("test_host1")), sqltypes.MakeString([]byte("test_user1")), sqltypes.MakeString([]byte("test_password1")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N"))}, []sqltypes.Value{ sqltypes.MakeString([]byte("test_host2")), sqltypes.MakeString([]byte("test_user2")), sqltypes.MakeString([]byte("test_password2")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N"))}, []sqltypes.Value{ sqltypes.MakeString([]byte("test_host3")), sqltypes.MakeString([]byte("test_user3")), sqltypes.MakeString([]byte("test_password3")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N"))}, []sqltypes.Value{ sqltypes.MakeString([]byte("test_host4")), sqltypes.MakeString([]byte("test_user4")), sqltypes.MakeString([]byte("test_password4")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("0")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("N")), }, }, }, "SELECT * FROM mysql.db": &mproto.QueryResult{ Fields: []mproto.Field{ mproto.Field{ Name: "Host", Type: mproto.VT_STRING, Flags: 16515}, mproto.Field{ Name: "Db", Type: mproto.VT_STRING, Flags: 16515}, mproto.Field{ Name: "User", Type: mproto.VT_STRING, Flags: 16515}, mproto.Field{ Name: "Select_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Insert_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Update_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Delete_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Drop_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Grant_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "References_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Index_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_tmp_table_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Lock_tables_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Show_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Execute_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Event_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Trigger_priv", Type: 254, Flags: 257}, }, RowsAffected: 0, InsertId: 0, Rows: [][]sqltypes.Value{ []sqltypes.Value{ sqltypes.MakeString([]byte("test_host")), sqltypes.MakeString([]byte("test_db")), sqltypes.MakeString([]byte("test_user")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), }, }, }, "SELECT * FROM mysql.host": &mproto.QueryResult{ Fields: []mproto.Field{ mproto.Field{ Name: "Host", Type: mproto.VT_STRING, Flags: 16515}, mproto.Field{ Name: "Db", Type: 254, Flags: 16515}, mproto.Field{ Name: "Select_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Insert_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Update_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Delete_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Drop_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Grant_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "References_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Index_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_tmp_table_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Lock_tables_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Show_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Execute_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Trigger_priv", Type: 254, Flags: 257}, }, RowsAffected: 0, InsertId: 0, Rows: [][]sqltypes.Value{ []sqltypes.Value{ sqltypes.MakeString([]byte("test_host")), sqltypes.MakeString([]byte("test_db")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), }, }, }, } master.StartActionLoop(t, wr) defer master.StopActionLoop(t) // replica will be asked for permissions replica.FakeMysqlDaemon.FetchSuperQueryMap = map[string]*mproto.QueryResult{ "SELECT * FROM mysql.user": master.FakeMysqlDaemon.FetchSuperQueryMap["SELECT * FROM mysql.user"], "SELECT * FROM mysql.db": master.FakeMysqlDaemon.FetchSuperQueryMap["SELECT * FROM mysql.db"], "SELECT * FROM mysql.host": master.FakeMysqlDaemon.FetchSuperQueryMap["SELECT * FROM mysql.host"], } replica.StartActionLoop(t, wr) defer replica.StopActionLoop(t) // run ValidatePermissionsKeyspace, this should work if err := vp.Run([]string{"ValidatePermissionsKeyspace", master.Tablet.Keyspace}); err != nil { t.Fatalf("ValidatePermissionsKeyspace failed: %v", err) } // modify one field, this should fail replica.FakeMysqlDaemon.FetchSuperQueryMap["SELECT * FROM mysql.host"] = &mproto.QueryResult{ Fields: []mproto.Field{ mproto.Field{ Name: "Host", Type: mproto.VT_STRING, Flags: 16515}, mproto.Field{ Name: "Db", Type: 254, Flags: 16515}, mproto.Field{ Name: "Select_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Insert_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Update_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Delete_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Drop_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Grant_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "References_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Index_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_tmp_table_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Lock_tables_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Show_view_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Create_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Alter_routine_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Execute_priv", Type: 254, Flags: 257}, mproto.Field{ Name: "Trigger_priv", Type: 254, Flags: 257}, }, RowsAffected: 0, InsertId: 0, Rows: [][]sqltypes.Value{ []sqltypes.Value{ sqltypes.MakeString([]byte("test_host")), sqltypes.MakeString([]byte("test_db")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("Y")), sqltypes.MakeString([]byte("N")), // different }, }, } // run ValidatePermissionsKeyspace again, this should now fail if err := vp.Run([]string{"ValidatePermissionsKeyspace", master.Tablet.Keyspace}); err == nil || !strings.Contains(err.Error(), "disagree on host test_host:test_db") { t.Fatalf("ValidatePermissionsKeyspace has unexpected err: %v", err) } }
func (wr *Wrangler) emergencyReparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, masterElectTabletAlias topo.TabletAlias, waitSlaveTimeout time.Duration) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } ev.ShardInfo = *shardInfo event.DispatchUpdate(ev, "reading all tablets") tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } // Check corner cases we're going to depend on masterElectTabletInfo, ok := tabletMap[masterElectTabletAlias] if !ok { return fmt.Errorf("master-elect tablet %v is not in the shard", masterElectTabletAlias) } ev.NewMaster = *masterElectTabletInfo.Tablet if topo.ProtoToTabletAlias(shardInfo.MasterAlias) == masterElectTabletAlias { return fmt.Errorf("master-elect tablet %v is already the master", masterElectTabletAlias) } // Deal with the old master: try to remote-scrap it, if it's // truely dead we force-scrap it. Remove it from our map in any case. if !topo.TabletAliasIsZero(shardInfo.MasterAlias) { scrapOldMaster := true oldMasterTabletInfo, ok := tabletMap[topo.ProtoToTabletAlias(shardInfo.MasterAlias)] if ok { delete(tabletMap, topo.ProtoToTabletAlias(shardInfo.MasterAlias)) } else { oldMasterTabletInfo, err = wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(shardInfo.MasterAlias)) if err != nil { wr.logger.Warningf("cannot read old master tablet %v, won't touch it: %v", shardInfo.MasterAlias, err) scrapOldMaster = false } } if scrapOldMaster { ev.OldMaster = *oldMasterTabletInfo.Tablet wr.logger.Infof("scrapping old master %v", shardInfo.MasterAlias) ctx, cancel := context.WithTimeout(ctx, waitSlaveTimeout) defer cancel() if err := wr.tmc.Scrap(ctx, oldMasterTabletInfo); err != nil { wr.logger.Warningf("remote scrapping failed master failed, will force the scrap: %v", err) if err := topotools.Scrap(ctx, wr.ts, topo.ProtoToTabletAlias(shardInfo.MasterAlias), true); err != nil { wr.logger.Warningf("old master topo scrapping failed, continuing anyway: %v", err) } } } } // Stop replication on all slaves, get their current // replication position event.DispatchUpdate(ev, "stop replication on all slaves") wg := sync.WaitGroup{} mu := sync.Mutex{} statusMap := make(map[topo.TabletAlias]myproto.ReplicationStatus) for alias, tabletInfo := range tabletMap { wg.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wg.Done() wr.logger.Infof("getting replication position from %v", alias) ctx, cancel := context.WithTimeout(ctx, waitSlaveTimeout) defer cancel() rp, err := wr.TabletManagerClient().StopReplicationAndGetStatus(ctx, tabletInfo) if err != nil { wr.logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", alias, err) return } mu.Lock() statusMap[alias] = rp mu.Unlock() }(alias, tabletInfo) } wg.Wait() // Verify masterElect is alive and has the most advanced position masterElectStatus, ok := statusMap[masterElectTabletAlias] if !ok { return fmt.Errorf("couldn't get master elect %v replication position", masterElectTabletAlias) } for alias, status := range statusMap { if alias == masterElectTabletAlias { continue } if !masterElectStatus.Position.AtLeast(status.Position) { return fmt.Errorf("tablet %v is more advanced than master elect tablet %v: %v > %v", alias, masterElectTabletAlias, status.Position, masterElectStatus) } } // Promote the masterElect wr.logger.Infof("promote slave %v", masterElectTabletAlias) event.DispatchUpdate(ev, "promoting slave") rp, err := wr.tmc.PromoteSlave(ctx, masterElectTabletInfo) if err != nil { return fmt.Errorf("master-elect tablet %v failed to be upgraded to master: %v", masterElectTabletAlias, err) } // Reset replication on all slaves to point to the new master, and // insert test row in the new master. // Go through all the tablets: // - new master: populate the reparent journal // - everybody else: reparent to new master, wait for row event.DispatchUpdate(ev, "reparenting all tablets") now := time.Now().UnixNano() wgMaster := sync.WaitGroup{} wgSlaves := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} var masterErr error for alias, tabletInfo := range tabletMap { if alias == masterElectTabletAlias { wgMaster.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgMaster.Done() wr.logger.Infof("populating reparent journal on new master %v", alias) masterErr = wr.TabletManagerClient().PopulateReparentJournal(ctx, tabletInfo, now, emergencyReparentShardOperation, alias, rp) }(alias, tabletInfo) } else { wgSlaves.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgSlaves.Done() wr.logger.Infof("setting new master on slave %v", alias) forceStartSlave := false if status, ok := statusMap[alias]; ok { forceStartSlave = status.SlaveIORunning || status.SlaveSQLRunning } if err := wr.TabletManagerClient().SetMaster(ctx, tabletInfo, masterElectTabletAlias, now, forceStartSlave); err != nil { rec.RecordError(fmt.Errorf("Tablet %v SetMaster failed: %v", alias, err)) } }(alias, tabletInfo) } } // After the master is done, we can update the shard record // (note with semi-sync, it also means at least one slave is done) wgMaster.Wait() if masterErr != nil { wgSlaves.Wait() return fmt.Errorf("failed to PopulateReparentJournal on master: %v", masterErr) } wr.logger.Infof("updating shard record with new master %v", masterElectTabletAlias) shardInfo.MasterAlias = topo.TabletAliasToProto(masterElectTabletAlias) if err := topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil { wgSlaves.Wait() return fmt.Errorf("failed to update shard master record: %v", err) } // Wait for the slaves to complete. If some of them fail, we // will rebuild the shard serving graph anyway wgSlaves.Wait() if err := rec.Error(); err != nil { wr.Logger().Errorf("Some slaves failed to reparent: %v", err) return err } // Then we rebuild the entire serving graph for the shard, // to account for all changes. wr.logger.Infof("rebuilding shard graph") event.DispatchUpdate(ev, "rebuilding shard serving graph") _, err = wr.RebuildShardGraph(ctx, keyspace, shard, nil) return err }
func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, masterElectTabletAlias topo.TabletAlias, waitSlaveTimeout time.Duration) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } ev.ShardInfo = *shardInfo event.DispatchUpdate(ev, "reading tablet map") tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } // Check corner cases we're going to depend on masterElectTabletInfo, ok := tabletMap[masterElectTabletAlias] if !ok { return fmt.Errorf("master-elect tablet %v is not in the shard", masterElectTabletAlias) } ev.NewMaster = *masterElectTabletInfo.Tablet if topo.ProtoToTabletAlias(shardInfo.MasterAlias) == masterElectTabletAlias { return fmt.Errorf("master-elect tablet %v is already the master", masterElectTabletAlias) } oldMasterTabletInfo, ok := tabletMap[topo.ProtoToTabletAlias(shardInfo.MasterAlias)] if !ok { return fmt.Errorf("old master tablet %v is not in the shard", shardInfo.MasterAlias) } ev.OldMaster = *oldMasterTabletInfo.Tablet // Demote the current master, get its replication position wr.logger.Infof("demote current master %v", shardInfo.MasterAlias) event.DispatchUpdate(ev, "demoting old master") rp, err := wr.tmc.DemoteMaster(ctx, oldMasterTabletInfo) if err != nil { return fmt.Errorf("old master tablet %v DemoteMaster failed: %v", shardInfo.MasterAlias, err) } // Wait on the master-elect tablet until it reaches that position, // then promote it wr.logger.Infof("promote slave %v", masterElectTabletAlias) event.DispatchUpdate(ev, "promoting slave") rp, err = wr.tmc.PromoteSlaveWhenCaughtUp(ctx, masterElectTabletInfo, rp) if err != nil { return fmt.Errorf("master-elect tablet %v failed to catch up with replication or be upgraded to master: %v", masterElectTabletAlias, err) } // Go through all the tablets: // - new master: populate the reparent journal // - everybody else: reparent to new master, wait for row event.DispatchUpdate(ev, "reparenting all tablets") now := time.Now().UnixNano() wgMaster := sync.WaitGroup{} wgSlaves := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} var masterErr error for alias, tabletInfo := range tabletMap { if alias == masterElectTabletAlias { wgMaster.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgMaster.Done() wr.logger.Infof("populating reparent journal on new master %v", alias) masterErr = wr.TabletManagerClient().PopulateReparentJournal(ctx, tabletInfo, now, plannedReparentShardOperation, alias, rp) }(alias, tabletInfo) } else { wgSlaves.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgSlaves.Done() wr.logger.Infof("setting new master on slave %v", alias) // also restart replication on old master forceStartSlave := alias == oldMasterTabletInfo.Alias if err := wr.TabletManagerClient().SetMaster(ctx, tabletInfo, masterElectTabletAlias, now, forceStartSlave); err != nil { rec.RecordError(fmt.Errorf("Tablet %v SetMaster failed: %v", alias, err)) return } }(alias, tabletInfo) } } // After the master is done, we can update the shard record // (note with semi-sync, it also means at least one slave is done) wgMaster.Wait() if masterErr != nil { wgSlaves.Wait() return fmt.Errorf("failed to PopulateReparentJournal on master: %v", masterErr) } wr.logger.Infof("updating shard record with new master %v", masterElectTabletAlias) shardInfo.MasterAlias = topo.TabletAliasToProto(masterElectTabletAlias) if err := topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil { wgSlaves.Wait() return fmt.Errorf("failed to update shard master record: %v", err) } // Wait for the slaves to complete. If some of them fail, we // will rebuild the shard serving graph anyway wgSlaves.Wait() if err := rec.Error(); err != nil { wr.Logger().Errorf("Some slaves failed to reparent: %v", err) return err } // Then we rebuild the entire serving graph for the shard, // to account for all changes. wr.logger.Infof("rebuilding shard graph") event.DispatchUpdate(ev, "rebuilding shard serving graph") _, err = wr.RebuildShardGraph(ctx, keyspace, shard, nil) return err }
func (wr *Wrangler) initShardMasterLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, masterElectTabletAlias topo.TabletAlias, force bool, waitSlaveTimeout time.Duration) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } ev.ShardInfo = *shardInfo event.DispatchUpdate(ev, "reading tablet map") tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } // Check the master elect is in tabletMap masterElectTabletInfo, ok := tabletMap[masterElectTabletAlias] if !ok { return fmt.Errorf("master-elect tablet %v is not in the shard", masterElectTabletAlias) } ev.NewMaster = *masterElectTabletInfo.Tablet // Check the master is the only master is the shard, or -force was used. _, masterTabletMap := topotools.SortedTabletMap(tabletMap) if topo.ProtoToTabletAlias(shardInfo.MasterAlias) != masterElectTabletAlias { if !force { return fmt.Errorf("master-elect tablet %v is not the shard master, use -force to proceed anyway", masterElectTabletAlias) } wr.logger.Warningf("master-elect tablet %v is not the shard master, proceeding anyway as -force was used", masterElectTabletAlias) } if _, ok := masterTabletMap[masterElectTabletAlias]; !ok { if !force { return fmt.Errorf("master-elect tablet %v is not a master in the shard, use -force to proceed anyway", masterElectTabletAlias) } wr.logger.Warningf("master-elect tablet %v is not a master in the shard, proceeding anyway as -force was used", masterElectTabletAlias) } haveOtherMaster := false for alias, ti := range masterTabletMap { if alias != masterElectTabletAlias && ti.Type != topo.TYPE_SCRAP { haveOtherMaster = true } } if haveOtherMaster { if !force { return fmt.Errorf("master-elect tablet %v is not the only master in the shard, use -force to proceed anyway", masterElectTabletAlias) } wr.logger.Warningf("master-elect tablet %v is not the only master in the shard, proceeding anyway as -force was used", masterElectTabletAlias) } // First phase: reset replication on all tablets. If anyone fails, // we stop. It is probably because it is unreachable, and may leave // an unstable database process in the mix, with a database daemon // at a wrong replication spot. event.DispatchUpdate(ev, "resetting replication on all tablets") wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for alias, tabletInfo := range tabletMap { wg.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wg.Done() wr.logger.Infof("resetting replication on tablet %v", alias) if err := wr.TabletManagerClient().ResetReplication(ctx, tabletInfo); err != nil { rec.RecordError(fmt.Errorf("Tablet %v ResetReplication failed (either fix it, or Scrap it): %v", alias, err)) } }(alias, tabletInfo) } wg.Wait() if err := rec.Error(); err != nil { return err } // Tell the new master to break its slaves, return its replication // position wr.logger.Infof("initializing master on %v", masterElectTabletAlias) event.DispatchUpdate(ev, "initializing master") rp, err := wr.TabletManagerClient().InitMaster(ctx, masterElectTabletInfo) if err != nil { return err } // Now tell the new master to insert the reparent_journal row, // and tell everybody else to become a slave of the new master, // and wait for the row in the reparent_journal table. // We start all these in parallel, to handle the semi-sync // case: for the master to be able to commit its row in the // reparent_journal table, it needs connected slaves. event.DispatchUpdate(ev, "reparenting all tablets") now := time.Now().UnixNano() wgMaster := sync.WaitGroup{} wgSlaves := sync.WaitGroup{} var masterErr error for alias, tabletInfo := range tabletMap { if alias == masterElectTabletAlias { wgMaster.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgMaster.Done() wr.logger.Infof("populating reparent journal on new master %v", alias) masterErr = wr.TabletManagerClient().PopulateReparentJournal(ctx, tabletInfo, now, initShardMasterOperation, alias, rp) }(alias, tabletInfo) } else { wgSlaves.Add(1) go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) { defer wgSlaves.Done() wr.logger.Infof("initializing slave %v", alias) if err := wr.TabletManagerClient().InitSlave(ctx, tabletInfo, masterElectTabletAlias, rp, now); err != nil { rec.RecordError(fmt.Errorf("Tablet %v InitSlave failed: %v", alias, err)) } }(alias, tabletInfo) } } // After the master is done, we can update the shard record // (note with semi-sync, it also means at least one slave is done) wgMaster.Wait() if masterErr != nil { wgSlaves.Wait() return fmt.Errorf("failed to PopulateReparentJournal on master: %v", masterErr) } if topo.ProtoToTabletAlias(shardInfo.MasterAlias) != masterElectTabletAlias { shardInfo.MasterAlias = topo.TabletAliasToProto(masterElectTabletAlias) if err := topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil { wgSlaves.Wait() return fmt.Errorf("failed to update shard master record: %v", err) } } // Wait for the slaves to complete. If some of them fail, we // don't want to rebuild the shard serving graph (the failure // will most likely be a timeout, and our context will be // expired, so the rebuild will fail anyway) wgSlaves.Wait() if err := rec.Error(); err != nil { return err } // Then we rebuild the entire serving graph for the shard, // to account for all changes. event.DispatchUpdate(ev, "rebuilding shard graph") _, err = wr.RebuildShardGraph(ctx, keyspace, shard, nil) return err }