// SnapshotSourceEnd restores the state of the server after a
// Snapshot(server_mode = true).
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) SnapshotSourceEnd(ctx context.Context, args *actionnode.SnapshotSourceEndArgs) error {
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}
	if tablet.Type != topo.TYPE_SNAPSHOT_SOURCE {
		return fmt.Errorf("expected snapshot_source type, not %v", tablet.Type)
	}

	if err := agent.Mysqld.SnapshotSourceEnd(args.SlaveStartRequired, args.ReadOnly, true, agent.hookExtraEnv()); err != nil {
		log.Errorf("SnapshotSourceEnd failed, leaving tablet type alone: %v", err)
		return err
	}

	// change the type back
	if args.OriginalType == topo.TYPE_MASTER {
		// force the master update
		tablet.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, args.OriginalType, make(map[string]string), true /*runHooks*/)
	}
	return err
}
// updateReplicationGraphForPromotedSlave makes sure the newly promoted slave
// is correctly represented in the replication graph
func (agent *ActionAgent) updateReplicationGraphForPromotedSlave(ctx context.Context, tablet *topo.TabletInfo) error {
	// Update tablet regardless - trend towards consistency.
	tablet.State = topo.STATE_READ_WRITE
	tablet.Type = topo.TYPE_MASTER
	tablet.Parent.Cell = ""
	tablet.Parent.Uid = topo.NO_TABLET
	tablet.Health = nil
	err := topo.UpdateTablet(ctx, agent.TopoServer, tablet)
	if err != nil {
		return err
	}
	// NOTE(msolomon) A serving graph update is required, but in
	// order for the shard to be consistent the old master must be
	// scrapped first. That is externally coordinated by the
	// wrangler reparent action.

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}
	return nil
}
// SlaveWasRestarted updates the parent record for a tablet.
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) SlaveWasRestarted(ctx context.Context, swrd *actionnode.SlaveWasRestartedArgs) error {
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}

	// Once this action completes, update authoritative tablet node first.
	tablet.Parent = swrd.Parent
	if tablet.Type == topo.TYPE_MASTER {
		tablet.Type = topo.TYPE_SPARE
		tablet.State = topo.STATE_READ_ONLY
	}
	err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
	if err != nil {
		return err
	}

	// Update the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}
	return nil
}
// SetReadOnly makes the mysql instance read-only or read-write
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) SetReadOnly(ctx context.Context, rdonly bool) error {
	err := agent.Mysqld.SetReadOnly(rdonly)
	if err != nil {
		return err
	}

	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}
	if rdonly {
		tablet.State = topo.STATE_READ_ONLY
	} else {
		tablet.State = topo.STATE_READ_WRITE
	}
	return topo.UpdateTablet(ctx, agent.TopoServer, tablet)
}
// Scrap will update the tablet type to 'Scrap', and remove it from
// the serving graph.
//
// 'force' means we are not on the tablet being scrapped, so it is
// probably dead. So if 'force' is true, we will also remove pending
// remote actions. And if 'force' is false, we also run an optional
// hook.
func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// If the tablet is already scrapped, skip updating the replication
	// data. It won't be there anyway.
	wasAssigned := tablet.IsAssigned()
	tablet.Type = topo.TYPE_SCRAP
	tablet.Parent = topo.TabletAlias{}
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(context.TODO(), ts, tablet)
	if err != nil {
		return err
	}

	if wasAssigned {
		err = topo.DeleteTabletReplicationData(ts, tablet.Tablet)
		if err != nil {
			if err == topo.ErrNoNode {
				log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell)
				err = nil
			}
			if err != nil {
				log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err)
			}
		}
	}

	// run a hook for final cleanup, only in non-force mode.
	// (force mode executes on the vtctl side, not on the vttablet side)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		ConfigureTabletHook(hk, tablet.Alias)
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// we don't want to return an error, the server
			// is already in bad shape probably.
			log.Warningf("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}

	return nil
}
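// The helper below is an illustrative sketch, not part of the original code:
// it shows how a hypothetical caller that already has a topo.Server and a
// tablet alias might scrap a tablet that is presumed dead. With force=true
// the postflight_scrap hook is skipped here, since force-mode cleanup is
// expected to happen on the vtctl side.
func exampleScrapDeadTablet(ts topo.Server, tabletAlias topo.TabletAlias) {
	// force=true: the tablet is probably dead, so don't rely on it to run hooks.
	if err := Scrap(ts, tabletAlias, true /*force*/); err != nil {
		log.Warningf("Scrap of %v failed: %v", tabletAlias, err)
	}
}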
// RestartSlavesExternal will tell all the slaves in the provided list
// that they have a new master, and also tell all the masters. The old
// masters will be forced to the spare type if they don't answer.
// We execute all the actions in parallel.
func RestartSlavesExternal(ts topo.Server, log logutil.Logger, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTabletAlias topo.TabletAlias, slaveWasRestarted func(*topo.TabletInfo, *actionnode.SlaveWasRestartedArgs) error) {
	wg := sync.WaitGroup{}
	swrd := actionnode.SlaveWasRestartedArgs{
		Parent: masterElectTabletAlias,
	}

	log.Infof("Updating individual tablets with the right master...")

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			if err := slaveWasRestarted(ti, &swrd); err != nil {
				log.Warningf("Slave %v had an error: %v", ti.Alias, err)
			}
			wg.Done()
		}(ti)
	}

	// and do the old master and any straggler, if possible.
	for _, ti := range masterTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			err := slaveWasRestarted(ti, &swrd)
			if err != nil {
				// the old master can be annoying if left
				// around in the replication graph, so if we
				// can't restart it, we just force it to spare.
				// We don't rebuild the Shard just yet though.
				log.Warningf("Old master %v is not restarting in time, forcing it to spare: %v", ti.Alias, err)

				ti.Type = topo.TYPE_SPARE
				ti.Parent = masterElectTabletAlias
				if err := topo.UpdateTablet(context.TODO(), ts, ti); err != nil {
					log.Warningf("Failed to change old master %v to spare: %v", ti.Alias, err)
				}
			}
			wg.Done()
		}(ti)
	}
	wg.Wait()
}
// checkTabletMysqlPort will check that the mysql port recorded for the
// tablet is correct, and if not will try to update it.
func (agent *ActionAgent) checkTabletMysqlPort(tablet *topo.TabletInfo) *topo.TabletInfo {
	mport, err := agent.MysqlDaemon.GetMysqlPort()
	if err != nil {
		log.Warningf("Cannot get current mysql port, not checking it: %v", err)
		return nil
	}

	if mport == tablet.Portmap["mysql"] {
		return nil
	}

	log.Warningf("MySQL port has changed from %v to %v, updating it in tablet record", tablet.Portmap["mysql"], mport)
	tablet.Portmap["mysql"] = mport
	if err := topo.UpdateTablet(context.TODO(), agent.TopoServer, tablet); err != nil {
		log.Warningf("Failed to update tablet record, may use old mysql port")
		return nil
	}

	return tablet
}
// changeTypeToRestore changes a tablet type to RESTORE and sets all the
// other arguments. From now on, we can go:
// - back to IDLE if we don't use the tablet at all (after, for instance,
//   a successful ReserveForRestore but a failed Snapshot)
// - to SCRAP if something in the process on the target host fails
// - to SPARE if the clone works
func (agent *ActionAgent) changeTypeToRestore(ctx context.Context, tablet, sourceTablet *topo.TabletInfo, parentAlias topo.TabletAlias, keyRange key.KeyRange) error {
	// run the optional preflight_assigned hook
	hk := hook.NewSimpleHook("preflight_assigned")
	topotools.ConfigureTabletHook(hk, agent.TabletAlias)
	if err := hk.ExecuteOptional(); err != nil {
		return err
	}

	// change the type
	tablet.Parent = parentAlias
	tablet.Keyspace = sourceTablet.Keyspace
	tablet.Shard = sourceTablet.Shard
	tablet.Type = topo.TYPE_RESTORE
	tablet.KeyRange = keyRange
	tablet.DbNameOverride = sourceTablet.DbNameOverride
	if err := topo.UpdateTablet(ctx, agent.TopoServer, tablet); err != nil {
		return err
	}

	// and create the replication graph items
	return topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet)
}
// InitTablet creates or updates a tablet. If no parent is specified
// in the tablet, and the tablet has a slave type, we will find the
// appropriate parent. If createShardAndKeyspace is true and the
// parent keyspace or shard don't exist, they will be created. If
// update is true, and a tablet with the same ID exists, update it.
// If force is true, and a tablet with the same ID already exists, it
// will be scrapped and deleted, and then recreated.
func (wr *Wrangler) InitTablet(tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error {
	if err := tablet.Complete(); err != nil {
		return err
	}

	if tablet.IsInReplicationGraph() {
		// create the parent keyspace and shard if needed
		if createShardAndKeyspace {
			if err := wr.ts.CreateKeyspace(tablet.Keyspace, &topo.Keyspace{}); err != nil && err != topo.ErrNodeExists {
				return err
			}

			if err := topo.CreateShard(wr.ts, tablet.Keyspace, tablet.Shard); err != nil && err != topo.ErrNodeExists {
				return err
			}
		}

		// get the shard, checks a couple things
		si, err := wr.ts.GetShard(tablet.Keyspace, tablet.Shard)
		if err != nil {
			return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard")
		}
		if si.KeyRange != tablet.KeyRange {
			return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange)
		}
		if tablet.Type == topo.TYPE_MASTER && !si.MasterAlias.IsZero() && si.MasterAlias != tablet.Alias && !force {
			return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard)
		}

		// see if we specified a parent, otherwise get it from the shard
		if tablet.Parent.IsZero() && tablet.Type.IsSlaveType() {
			if si.MasterAlias.IsZero() {
				return fmt.Errorf("trying to create tablet %v in shard %v/%v without a master", tablet.Alias, tablet.Keyspace, tablet.Shard)
			}
			tablet.Parent = si.MasterAlias
		}

		// update the shard record if needed
		if err := wr.updateShardCellsAndMaster(si, tablet.Alias, tablet.Type, force); err != nil {
			return err
		}
	}

	err := topo.CreateTablet(wr.ts, tablet)
	if err != nil && err == topo.ErrNodeExists {
		// Try to update nicely, but if it fails fall back to force behavior.
		if update || force {
			oldTablet, err := wr.ts.GetTablet(tablet.Alias)
			if err != nil {
				wr.Logger().Warningf("failed reading tablet %v: %v", tablet.Alias, err)
			} else {
				if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard {
					*(oldTablet.Tablet) = *tablet
					if err := topo.UpdateTablet(context.TODO(), wr.ts, oldTablet); err != nil {
						wr.Logger().Warningf("failed updating tablet %v: %v", tablet.Alias, err)
						// now fall through the Scrap case
					} else {
						if !tablet.IsInReplicationGraph() {
							return nil
						}
						if err := topo.UpdateTabletReplicationData(context.TODO(), wr.ts, tablet); err != nil {
							wr.Logger().Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err)
							// now fall through the Scrap case
						} else {
							return nil
						}
					}
				}
			}
		}
		if force {
			if err = wr.Scrap(tablet.Alias, force, false); err != nil {
				wr.Logger().Errorf("failed scrapping tablet %v: %v", tablet.Alias, err)
				return err
			}
			if err := wr.ts.DeleteTablet(tablet.Alias); err != nil {
				// we ignore this
				wr.Logger().Errorf("failed deleting tablet %v: %v", tablet.Alias, err)
			}
			return topo.CreateTablet(wr.ts, tablet)
		}
	}
	return err
}
// CheckTablet verifies the basic tablet CRUD operations of a topo.Server
// implementation: create, get, list by cell, update, and delete.
func CheckTablet(ctx context.Context, t *testing.T, ts topo.Server) {
	cell := getLocalCell(t, ts)
	tablet := &topo.Tablet{
		Alias:    topo.TabletAlias{Cell: cell, Uid: 1},
		Hostname: "localhost",
		IPAddr:   "10.11.12.13",
		Portmap: map[string]int{
			"vt":    3333,
			"mysql": 3334,
		},
		Tags:     map[string]string{"tag": "value"},
		Keyspace: "test_keyspace",
		Type:     topo.TYPE_MASTER,
		State:    topo.STATE_READ_WRITE,
		KeyRange: newKeyRange("-10"),
	}
	if err := ts.CreateTablet(tablet); err != nil {
		t.Errorf("CreateTablet: %v", err)
	}
	if err := ts.CreateTablet(tablet); err != topo.ErrNodeExists {
		t.Errorf("CreateTablet(again): %v", err)
	}

	if _, err := ts.GetTablet(topo.TabletAlias{Cell: cell, Uid: 666}); err != topo.ErrNoNode {
		t.Errorf("GetTablet(666): %v", err)
	}

	ti, err := ts.GetTablet(tablet.Alias)
	if err != nil {
		t.Errorf("GetTablet %v: %v", tablet.Alias, err)
	}
	if eq, err := tabletEqual(ti.Tablet, tablet); err != nil {
		t.Errorf("cannot compare tablets: %v", err)
	} else if !eq {
		t.Errorf("put and got tablets are not identical:\n%#v\n%#v", tablet, ti.Tablet)
	}

	if _, err := ts.GetTabletsByCell("666"); err != topo.ErrNoNode {
		t.Errorf("GetTabletsByCell(666): %v", err)
	}

	inCell, err := ts.GetTabletsByCell(cell)
	if err != nil {
		t.Errorf("GetTabletsByCell: %v", err)
	}
	if len(inCell) != 1 || inCell[0] != tablet.Alias {
		t.Errorf("GetTabletsByCell: want [%v], got %v", tablet.Alias, inCell)
	}

	ti.State = topo.STATE_READ_ONLY
	if err := topo.UpdateTablet(ctx, ts, ti); err != nil {
		t.Errorf("UpdateTablet: %v", err)
	}

	ti, err = ts.GetTablet(tablet.Alias)
	if err != nil {
		t.Errorf("GetTablet %v: %v", tablet.Alias, err)
	}
	if want := topo.STATE_READ_ONLY; ti.State != want {
		t.Errorf("ti.State: want %v, got %v", want, ti.State)
	}

	if err := ts.UpdateTabletFields(tablet.Alias, func(t *topo.Tablet) error {
		t.State = topo.STATE_READ_WRITE
		return nil
	}); err != nil {
		t.Errorf("UpdateTabletFields: %v", err)
	}
	ti, err = ts.GetTablet(tablet.Alias)
	if err != nil {
		t.Errorf("GetTablet %v: %v", tablet.Alias, err)
	}
	if want := topo.STATE_READ_WRITE; ti.State != want {
		t.Errorf("ti.State: want %v, got %v", want, ti.State)
	}

	if err := ts.DeleteTablet(tablet.Alias); err != nil {
		t.Errorf("DeleteTablet: %v", err)
	}
	if err := ts.DeleteTablet(tablet.Alias); err != topo.ErrNoNode {
		t.Errorf("DeleteTablet(again): %v", err)
	}

	if _, err := ts.GetTablet(tablet.Alias); err != topo.ErrNoNode {
		t.Errorf("GetTablet: expected error, tablet was deleted: %v", err)
	}
}
// Snapshot takes a db snapshot
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) Snapshot(ctx context.Context, args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) {
	// update our type to TYPE_BACKUP
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return nil, err
	}
	originalType := tablet.Type

	// ForceMasterSnapshot: Normally a master is not a viable tablet
	// to snapshot. However, there are degenerate cases where you need
	// to override this, for instance the initial clone of a new master.
	if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot {
		// In this case, we don't bother recomputing the serving graph.
		// All queries will have to fail anyway.
		log.Infof("force change type master -> backup")
		// There is a legitimate reason to force in the case of a single
		// master.
		tablet.Tablet.Type = topo.TYPE_BACKUP
		err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/)
	}
	if err != nil {
		return nil, err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet(ctx, "snapshotStart"); err != nil {
		return nil, fmt.Errorf("failed to update state before snapshot: %v", err)
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// now we can run the backup
	filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv())

	// and change our type to the appropriate value
	newType := originalType
	if returnErr != nil {
		log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr)
	} else {
		if args.ServerMode {
			log.Infof("server mode specified, switching tablet to snapshot_source mode")
			newType = topo.TYPE_SNAPSHOT_SOURCE
		} else {
			log.Infof("change type back after snapshot: %v", newType)
		}
	}

	if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE {
		log.Infof("force change type backup -> master: %v", tablet.Alias)
		tablet.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
	} else {
		err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/)
	}
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged the snapshot error anyway)
		returnErr = err
	}

	// if anything failed, don't return anything
	if returnErr != nil {
		return nil, returnErr
	}

	// it all worked, return the required information
	sr := &actionnode.SnapshotReply{
		ManifestPath:       filename,
		SlaveStartRequired: slaveStartRequired,
		ReadOnly:           readOnly,
	}
	if tablet.Parent.Uid == topo.NO_TABLET {
		// If this is a master, this will be the new parent.
		// FIXME(msolomon) this doesn't work in hierarchical replication.
		sr.ParentAlias = tablet.Alias
	} else {
		sr.ParentAlias = tablet.Parent
	}
	return sr, nil
}
// RestartSlave tells the tablet it has a new master
// Should be called under RpcWrapLockAction.
func (agent *ActionAgent) RestartSlave(ctx context.Context, rsd *actionnode.RestartSlaveData) error {
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return err
	}

	// If this check fails, we seem reparented. The only part that
	// could have failed is the insert in the replication
	// graph. Do NOT try to reparent again. That will either wedge
	// replication or corrupt data.
	if tablet.Parent != rsd.Parent {
		log.V(6).Infof("restart with new parent")
		// Remove tablet from the replication graph.
		if err = topo.DeleteTabletReplicationData(agent.TopoServer, tablet.Tablet); err != nil && err != topo.ErrNoNode {
			return err
		}

		// Move a lag slave into the orphan lag type so we can safely ignore
		// this reparenting until replication catches up.
		if tablet.Type == topo.TYPE_LAG {
			tablet.Type = topo.TYPE_LAG_ORPHAN
		} else {
			err = agent.Mysqld.RestartSlave(rsd.ReplicationStatus, rsd.WaitPosition, rsd.TimePromoted)
			if err != nil {
				return err
			}
		}
		// Once this action completes, update authoritative tablet node first.
		tablet.Parent = rsd.Parent
		err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
		if err != nil {
			return err
		}
	} else if rsd.Force {
		err = agent.Mysqld.RestartSlave(rsd.ReplicationStatus, rsd.WaitPosition, rsd.TimePromoted)
		if err != nil {
			return err
		}
		// Complete the special orphan accounting.
		if tablet.Type == topo.TYPE_LAG_ORPHAN {
			tablet.Type = topo.TYPE_LAG
			err = topo.UpdateTablet(ctx, agent.TopoServer, tablet)
			if err != nil {
				return err
			}
		}
	} else {
		// There is nothing to safely reparent, so check replication. If
		// either replication thread is not running, report an error.
		status, err := agent.Mysqld.SlaveStatus()
		if err != nil {
			return fmt.Errorf("cannot verify replication for slave: %v", err)
		}
		if !status.SlaveRunning() {
			return fmt.Errorf("replication not running for slave")
		}
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}
	return nil
}
// ChangeType changes the type of the tablet and possibly also updates
// the health information for it. Make this external, since these
// transitions need to be forced from time to time.
//
// - if health is nil, we don't touch the Tablet's Health record.
// - if health is an empty map, we clear the Tablet's Health record.
// - if health has values, we overwrite the Tablet's Health record.
func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, health map[string]string, runHooks bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) {
		return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias)
	}

	if runHooks {
		// Only run the preflight_serving_type hook when
		// transitioning from non-serving to serving.
		if !topo.IsInServingGraph(tablet.Type) && topo.IsInServingGraph(newType) {
			if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil {
				return err
			}
		}
	}

	tablet.Type = newType
	if newType == topo.TYPE_IDLE {
		if tablet.Parent.IsZero() {
			si, err := ts.GetShard(tablet.Keyspace, tablet.Shard)
			if err != nil {
				return err
			}
			rec := concurrency.AllErrorRecorder{}
			wg := sync.WaitGroup{}
			for _, cell := range si.Cells {
				wg.Add(1)
				go func(cell string) {
					defer wg.Done()
					sri, err := ts.GetShardReplication(cell, tablet.Keyspace, tablet.Shard)
					if err != nil {
						log.Warningf("Cannot check cell %v for extra replication paths, assuming it's good", cell)
						return
					}
					for _, rl := range sri.ReplicationLinks {
						if rl.Parent == tabletAlias {
							rec.RecordError(fmt.Errorf("Still have a ReplicationLink in cell %v", cell))
						}
					}
				}(cell)
			}
			wg.Wait()
			if rec.HasErrors() {
				return rec.Error()
			}
		}
		tablet.Parent = topo.TabletAlias{}
		tablet.Keyspace = ""
		tablet.Shard = ""
		tablet.KeyRange = key.KeyRange{}
		tablet.Health = health
	}
	if health != nil {
		if len(health) == 0 {
			tablet.Health = nil
		} else {
			tablet.Health = health
		}
	}
	return topo.UpdateTablet(context.TODO(), ts, tablet)
}
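// The helper below is an illustrative sketch, not part of the original code:
// it demonstrates the three health-map conventions documented on ChangeType.
// The tablet types and the health key/value are examples only; the caller is
// assumed to pick a transition that IsTrivialTypeChange/IsValidTypeChange
// will accept.
func exampleChangeTypeHealthUsage(ts topo.Server, tabletAlias topo.TabletAlias) error {
	// health == nil: change the type and leave the Health record untouched.
	if err := ChangeType(ts, tabletAlias, topo.TYPE_REPLICA, nil, true /*runHooks*/); err != nil {
		return err
	}
	// empty map: change the type and clear the Health record.
	if err := ChangeType(ts, tabletAlias, topo.TYPE_SPARE, make(map[string]string), true /*runHooks*/); err != nil {
		return err
	}
	// non-empty map: change the type and overwrite the Health record.
	return ChangeType(ts, tabletAlias, topo.TYPE_REPLICA, map[string]string{"replication_lag": "high"}, true /*runHooks*/)
}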
func TestRebuildShardRace(t *testing.T) {
	ctx := context.Background()
	cells := []string{"test_cell"}
	logger := logutil.NewMemoryLogger()
	timeout := 10 * time.Second
	interrupted := make(chan struct{})

	// Set up topology.
	ts := zktopo.NewTestServer(t, cells)
	f := faketopo.New(t, logger, ts, cells)
	defer f.TearDown()
	keyspace := faketopo.TestKeyspace
	shard := faketopo.TestShard
	master := f.AddTablet(1, "test_cell", topo.TYPE_MASTER, nil)
	f.AddTablet(2, "test_cell", topo.TYPE_REPLICA, master)

	// Do an initial rebuild.
	if _, err := RebuildShard(ctx, logger, f.Topo, keyspace, shard, cells, timeout, interrupted); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Check initial state.
	ep, err := ts.GetEndPoints(cells[0], keyspace, shard, topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}
	ep, err = ts.GetEndPoints(cells[0], keyspace, shard, topo.TYPE_REPLICA)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}

	// Install a hook that hands out locks out of order to simulate a race.
	trigger := make(chan struct{})
	stalled := make(chan struct{})
	done := make(chan struct{})
	wait := make(chan bool, 2)
	wait <- true  // first guy waits for trigger
	wait <- false // second guy doesn't wait
	ts.HookLockSrvShardForAction = func() {
		if <-wait {
			close(stalled)
			<-trigger
		}
	}

	// Make a change and start a rebuild that will stall when it tries to get
	// the SrvShard lock.
	masterInfo := f.GetTablet(1)
	masterInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, masterInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	go func() {
		if _, err := RebuildShard(ctx, logger, f.Topo, keyspace, shard, cells, timeout, interrupted); err != nil {
			t.Fatalf("RebuildShard: %v", err)
		}
		close(done)
	}()

	// Wait for first rebuild to stall.
	<-stalled

	// While the first rebuild is stalled, make another change and start a rebuild
	// that doesn't stall.
	replicaInfo := f.GetTablet(2)
	replicaInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, replicaInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	if _, err := RebuildShard(ctx, logger, f.Topo, keyspace, shard, cells, timeout, interrupted); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Now that the second rebuild is done, un-stall the first rebuild and wait
	// for it to finish.
	close(trigger)
	<-done

	// Check that the rebuild picked up both changes.
	if _, err := ts.GetEndPoints(cells[0], keyspace, shard, topo.TYPE_MASTER); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("first change wasn't picked up by second rebuild")
	}
	if _, err := ts.GetEndPoints(cells[0], keyspace, shard, topo.TYPE_REPLICA); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("second change was overwritten by first rebuild finishing late")
	}
}