// updateReplicationGraphForPromotedSlave makes sure the newly promoted slave // is correctly represented in the replication graph func (agent *ActionAgent) updateReplicationGraphForPromotedSlave(ctx context.Context, tablet *topo.TabletInfo) error { // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET tablet.Health = nil err := topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// SlaveWasRestarted updates the parent record for a tablet. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) SlaveWasRestarted(ctx context.Context, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } // Once this action completes, update authoritative tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// change a tablet type to RESTORE and set all the other arguments. // from now on, we can go to: // - back to IDLE if we don't use the tablet at all (after for instance // a successful ReserveForRestore but a failed Snapshot) // - to SCRAP if something in the process on the target host fails // - to SPARE if the clone works func (agent *ActionAgent) changeTypeToRestore(ctx context.Context, tablet, sourceTablet *topo.TabletInfo, parentAlias topo.TabletAlias, keyRange key.KeyRange) error { // run the optional preflight_assigned hook hk := hook.NewSimpleHook("preflight_assigned") topotools.ConfigureTabletHook(hk, agent.TabletAlias) if err := hk.ExecuteOptional(); err != nil { return err } // change the type tablet.Parent = parentAlias tablet.Keyspace = sourceTablet.Keyspace tablet.Shard = sourceTablet.Shard tablet.Type = topo.TYPE_RESTORE tablet.KeyRange = keyRange tablet.DbNameOverride = sourceTablet.DbNameOverride if err := topo.UpdateTablet(ctx, agent.TopoServer, tablet); err != nil { return err } // and create the replication graph items return topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) }
// InitTablet creates or updates a tablet. If no parent is specified // in the tablet, and the tablet has a slave type, we will find the // appropriate parent. If createShardAndKeyspace is true and the // parent keyspace or shard don't exist, they will be created. If // update is true, and a tablet with the same ID exists, update it. // If Force is true, and a tablet with the same ID already exists, it // will be scrapped and deleted, and then recreated. func (wr *Wrangler) InitTablet(tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error { if err := tablet.Complete(); err != nil { return err } if tablet.IsInReplicationGraph() { // create the parent keyspace and shard if needed if createShardAndKeyspace { if err := wr.ts.CreateKeyspace(tablet.Keyspace, &topo.Keyspace{}); err != nil && err != topo.ErrNodeExists { return err } if err := topo.CreateShard(wr.ts, tablet.Keyspace, tablet.Shard); err != nil && err != topo.ErrNodeExists { return err } } // get the shard, checks a couple things si, err := wr.ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard") } if si.KeyRange != tablet.KeyRange { return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange) } if tablet.Type == topo.TYPE_MASTER && !si.MasterAlias.IsZero() && si.MasterAlias != tablet.Alias && !force { return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard) } // see if we specified a parent, otherwise get it from the shard if tablet.Parent.IsZero() && tablet.Type.IsSlaveType() { if si.MasterAlias.IsZero() { return fmt.Errorf("trying to create tablet %v in shard %v/%v without a master", tablet.Alias, tablet.Keyspace, tablet.Shard) } tablet.Parent = si.MasterAlias } // update the shard record if needed if err := wr.updateShardCellsAndMaster(si, tablet.Alias, tablet.Type, force); err != nil { return err } } err := topo.CreateTablet(wr.ts, tablet) if err != nil && err == topo.ErrNodeExists { // Try to update nicely, but if it fails fall back to force behavior. if update || force { oldTablet, err := wr.ts.GetTablet(tablet.Alias) if err != nil { wr.Logger().Warningf("failed reading tablet %v: %v", tablet.Alias, err) } else { if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard { *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(context.TODO(), wr.ts, oldTablet); err != nil { wr.Logger().Warningf("failed updating tablet %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { if !tablet.IsInReplicationGraph() { return nil } if err := topo.UpdateTabletReplicationData(context.TODO(), wr.ts, tablet); err != nil { wr.Logger().Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { return nil } } } } } if force { if err = wr.Scrap(tablet.Alias, force, false); err != nil { wr.Logger().Errorf("failed scrapping tablet %v: %v", tablet.Alias, err) return err } if err := wr.ts.DeleteTablet(tablet.Alias); err != nil { // we ignore this wr.Logger().Errorf("failed deleting tablet %v: %v", tablet.Alias, err) } return topo.CreateTablet(wr.ts, tablet) } } return err }
// This is a quick and dirty tool to resurrect the TopologyServer data from the // canonical data stored in the tablet nodes. // // cells: local vt cells to scan for all tablets // keyspaces: list of keyspaces to rebuild func (wr *Wrangler) RebuildReplicationGraph(cells []string, keyspaces []string) error { if cells == nil || len(cells) == 0 { return fmt.Errorf("must specify cells to rebuild replication graph") } if keyspaces == nil || len(keyspaces) == 0 { return fmt.Errorf("must specify keyspaces to rebuild replication graph") } allTablets := make([]*topo.TabletInfo, 0, 1024) for _, cell := range cells { tablets, err := topotools.GetAllTablets(context.TODO(), wr.ts, cell) if err != nil { return err } allTablets = append(allTablets, tablets...) } for _, keyspace := range keyspaces { wr.logger.Infof("delete keyspace shards: %v", keyspace) if err := wr.ts.DeleteKeyspaceShards(keyspace); err != nil { return err } } keyspacesToRebuild := make(map[string]bool) shardsCreated := make(map[string]bool) hasErr := false mu := sync.Mutex{} wg := sync.WaitGroup{} for _, ti := range allTablets { wg.Add(1) go func(ti *topo.TabletInfo) { defer wg.Done() if !ti.IsInReplicationGraph() { return } if !strInList(keyspaces, ti.Keyspace) { return } mu.Lock() keyspacesToRebuild[ti.Keyspace] = true shardPath := ti.Keyspace + "/" + ti.Shard if !shardsCreated[shardPath] { if err := topo.CreateShard(wr.ts, ti.Keyspace, ti.Shard); err != nil && err != topo.ErrNodeExists { wr.logger.Warningf("failed re-creating shard %v: %v", shardPath, err) hasErr = true } else { shardsCreated[shardPath] = true } } mu.Unlock() err := topo.UpdateTabletReplicationData(context.TODO(), wr.ts, ti.Tablet) if err != nil { mu.Lock() hasErr = true mu.Unlock() wr.logger.Warningf("failed updating replication data: %v", err) } }(ti) } wg.Wait() for keyspace := range keyspacesToRebuild { wg.Add(1) go func(keyspace string) { defer wg.Done() if err := wr.RebuildKeyspaceGraph(keyspace, nil); err != nil { mu.Lock() hasErr = true mu.Unlock() wr.logger.Warningf("RebuildKeyspaceGraph(%v) failed: %v", keyspace, err) return } }(keyspace) } wg.Wait() if hasErr { return fmt.Errorf("some errors occurred rebuilding replication graph, consult log") } return nil }
// RestartSlave tells the tablet it has a new master // Should be called under RpcWrapLockAction. func (agent *ActionAgent) RestartSlave(ctx context.Context, rsd *actionnode.RestartSlaveData) error { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } // If this check fails, we seem reparented. The only part that // could have failed is the insert in the replication // graph. Do NOT try to reparent again. That will either wedge // replication or corrupt data. if tablet.Parent != rsd.Parent { log.V(6).Infof("restart with new parent") // Remove tablet from the replication graph. if err = topo.DeleteTabletReplicationData(agent.TopoServer, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } // Move a lag slave into the orphan lag type so we can safely ignore // this reparenting until replication catches up. if tablet.Type == topo.TYPE_LAG { tablet.Type = topo.TYPE_LAG_ORPHAN } else { err = agent.Mysqld.RestartSlave(rsd.ReplicationStatus, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } } // Once this action completes, update authoritative tablet node first. tablet.Parent = rsd.Parent err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } } else if rsd.Force { err = agent.Mysqld.RestartSlave(rsd.ReplicationStatus, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } // Complete the special orphan accounting. if tablet.Type == topo.TYPE_LAG_ORPHAN { tablet.Type = topo.TYPE_LAG err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } } } else { // There is nothing to safely reparent, so check replication. If // either replication thread is not running, report an error. status, err := agent.Mysqld.SlaveStatus() if err != nil { return fmt.Errorf("cannot verify replication for slave: %v", err) } if !status.SlaveRunning() { return fmt.Errorf("replication not running for slave") } } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }