// New creates a new Wrangler object. // // actionTimeout: how long should we wait for an action to complete? // - if using wrangler for just one action, this is set properly // upon wrangler creation. // - if re-using wrangler multiple times, call ResetActionTimeout before // every action. Do not use this too much, just for corner cases. // It is just much easier to create a new Wrangler object per action. // // lockTimeout: how long should we wait for the initial lock to start // a complex action? This is distinct from actionTimeout because most // of the time, we want to immediately know that our action will // fail. However, automated action will need some time to arbitrate // the locks. func New(logger logutil.Logger, ts topo.Server, actionTimeout, lockTimeout time.Duration) *Wrangler { ctx, cancel := context.WithTimeout(context.Background(), actionTimeout) return &Wrangler{ logger: logger, ts: ts, tmc: tmclient.NewTabletManagerClient(), ctx: ctx, cancel: cancel, deadline: time.Now().Add(actionTimeout), lockTimeout: lockTimeout, } }
// TestTabletExternallyReparentedWithDifferentMysqlPort makes sure // that if mysql is restarted on the master-elect tablet and has a different // port, we pick it up correctly. func TestTabletExternallyReparentedWithDifferentMysqlPort(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Now we're restarting mysql on a different port, 3301->3303 // but without updating the Tablet record in topology. // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED, so we need a MysqlDaemon // that returns no master, and the new port (as returned by mysql) newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.FakeMysqlDaemon.MysqlPort = 3303 newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED and point to the new mysql port oldMaster.FakeMysqlDaemon.MasterAddr = "101.0.0.1:3303" oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED and point to the new mysql port goodSlave.FakeMysqlDaemon.MasterAddr = "101.0.0.1:3303" goodSlave.StartActionLoop(t, wr) defer goodSlave.StopActionLoop(t) // This tests the good case, where everything works as planned t.Logf("TabletExternallyReparented(new master) expecting success") tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } }
// tabletExternallyReparentedLocked is called with the shard lock. // It returns if agent.refreshTablet should be called, and the error. // Note both are set independently (can have both true and an error). func (agent *ActionAgent) tabletExternallyReparentedLocked(ctx context.Context, externalID string, interrupted chan struct{}) (bool, error) { // re-read the tablet record to be sure we have the latest version tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return false, err } // read the shard, make sure again the master is not already good. shardInfo, err := agent.TopoServer.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return false, err } if shardInfo.MasterAlias == tablet.Alias { log.Infof("TabletExternallyReparented: tablet became the master before we get the lock?") return false, nil } log.Infof("TabletExternallyReparented called and we're not the master, doing the work") // Read the tablets, make sure the master elect is known to the shard // (it's this tablet, so it better be!). // Note we will keep going with a partial tablet map, which usually // happens when a cell is not reachable. After these checks, the // guarantees we'll have are: // - global cell is reachable (we just locked and read the shard) // - the local cell that contains the new master is reachable // (as we're going to check the new master is in the list) // That should be enough. tabletMap, err := topo.GetTabletMapForShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard) switch err { case nil: // keep going case topo.ErrPartialResult: log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets") default: return false, err } masterElectTablet, ok := tabletMap[tablet.Alias] if !ok { return false, fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap)) } // Create reusable Reparent event with available info ev := &events.Reparent{ ShardInfo: *shardInfo, NewMaster: *tablet.Tablet, ExternalID: externalID, } if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok { ev.OldMaster = *oldMasterTablet.Tablet } defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() // sort the tablets, and handle them slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap) event.DispatchUpdate(ev, "starting external from tablet") // We fix the new master in the replication graph. // Note after this call, we may have changed the tablet record, // so we will always return true, so the tablet record is re-read // by the agent. event.DispatchUpdate(ev, "mark ourself as new master") err = agent.updateReplicationGraphForPromotedSlave(ctx, tablet) if err != nil { // This suggests we can't talk to topo server. This is bad. return true, fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err) } // Once this tablet is promoted, remove it from our maps delete(slaveTabletMap, tablet.Alias) delete(masterTabletMap, tablet.Alias) // Then fix all the slaves, including the old master. This // last step is very likely to time out for some tablets (one // random guy is dead, the old master is dead, ...). We // execute them all in parallel until we get to // wr.ActionTimeout(). After this, no other action with a // timeout is executed, so even if we got to the timeout, // we're still good. event.DispatchUpdate(ev, "restarting slaves") logger := logutil.NewConsoleLogger() tmc := tmclient.NewTabletManagerClient() topotools.RestartSlavesExternal(agent.TopoServer, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error { return tmc.SlaveWasRestarted(ctx, ti, swrd) }) // Compute the list of Cells we need to rebuild: old master and // all other cells if reparenting to another cell. cells := []string{shardInfo.MasterAlias.Cell} if shardInfo.MasterAlias.Cell != tablet.Alias.Cell { cells = nil } // now update the master record in the shard object event.DispatchUpdate(ev, "updating shard record") log.Infof("Updating Shard's MasterAlias record") shardInfo.MasterAlias = tablet.Alias if err = topo.UpdateShard(ctx, agent.TopoServer, shardInfo); err != nil { return true, err } // and rebuild the shard serving graph event.DispatchUpdate(ev, "rebuilding shard serving graph") log.Infof("Rebuilding shard serving graph data") if _, err = topotools.RebuildShard(ctx, logger, agent.TopoServer, tablet.Keyspace, tablet.Shard, cells, agent.LockTimeout, interrupted); err != nil { return true, err } event.DispatchUpdate(ev, "finished") return true, nil }
func TestTabletExternallyReparentedFailedOldMaster(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Reparent to a replica, and pretend the old master is not responding // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only get a // TABLET_ACTION_SLAVE_WAS_RESTARTED call, let's just not // respond to it at all // On the good slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave.StartActionLoop(t, wr) defer goodSlave.StopActionLoop(t) // The reparent should work as expected here t.Logf("TabletExternallyReparented(new master) expecting success") tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } // check the old master was converted to spare tablet, err := ts.GetTablet(oldMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) } if tablet.Type != topo.TYPE_SPARE { t.Fatalf("old master should be spare but is: %v", tablet.Type) } if tablet.Parent != newMaster.Tablet.Alias { t.Fatalf("old master has the wrong master, got %v expected %v", tablet.Parent, newMaster.Tablet.Alias) } }
func TestTabletExternallyReparented(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave1 := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave2 := NewFakeTablet(t, wr, "cell2", 3, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) badSlave := NewFakeTablet(t, wr, "cell1", 4, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Add a new Cell to the Shard, that doesn't map to any read topo cell, // to simulate a data center being unreachable. si, err := ts.GetShard("test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.Cells = append(si.Cells, "cell666") if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // Slightly unrelated test: make sure we can find the tablets // even with a datacenter being down. tabletMap, err := topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell1"}) if err != nil { t.Fatalf("GetTabletMapForShardByCell should have worked but got: %v", err) } master, err := topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } slave1, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave1.Tablet.IPAddr, "vt", goodSlave1.Tablet.Portmap["vt"]) if err != nil || slave1 != goodSlave1.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(slave1) failed: %v %v", err, master) } slave2, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave2.Tablet.IPAddr, "vt", goodSlave2.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(slave2) worked: %v %v", err, slave2) } // Make sure the master is not exported in other cells tabletMap, err = topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell2"}) master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(master) worked in cell2: %v %v", err, master) } tabletMap, err = topo.GetTabletMapForShard(ctx, ts, "test_keyspace", "0") if err != topo.ErrPartialResult { t.Fatalf("GetTabletMapForShard should have returned ErrPartialResult but got: %v", err) } master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. oldMaster.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave1.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave1.StartActionLoop(t, wr) defer goodSlave1.StopActionLoop(t) goodSlave2.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave2.StartActionLoop(t, wr) defer goodSlave2.StopActionLoop(t) // On the bad slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED with bad data. badSlave.FakeMysqlDaemon.MasterAddr = "234.0.0.1:3301" badSlave.StartActionLoop(t, wr) defer badSlave.StopActionLoop(t) // First test: reparent to the same master, make sure it works // as expected. tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(oldMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(same master) should have worked") } // Second test: reparent to a replica, and pretend the old // master is still good to go. // This tests a bad case; the new designated master is a slave, // but we should do what we're told anyway ti, err = ts.GetTablet(goodSlave1.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(slave) error: %v", err) } // This tests the good case, where everything works as planned t.Logf("TabletExternallyReparented(new master) expecting success") ti, err = ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } }