func (wr *Wrangler) removeShardCell(keyspace, shard, cell string, force bool) error { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // check the cell is in the list already if !topo.InCellList(cell, shardInfo.Cells) { return fmt.Errorf("cell %v in not in shard info", cell) } // check the master alias is not in the cell if shardInfo.MasterAlias.Cell == cell { return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell) } // get the ShardReplication object in the cell sri, err := wr.ts.GetShardReplication(cell, keyspace, shard) switch err { case nil: if len(sri.ReplicationLinks) > 0 { return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.ReplicationLinks)) } // ShardReplication object is now useless, remove it if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil { return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err) } // we keep going case topo.ErrNoNode: // no ShardReplication object, we keep going default: // we can't get the object, assume topo server is down there, // so we look at force flag if !force { return err } wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell) } // now we can update the shard wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard) newCells := make([]string, 0, len(shardInfo.Cells)-1) for _, c := range shardInfo.Cells { if c != cell { newCells = append(newCells, c) } } shardInfo.Cells = newCells return topo.UpdateShard(context.TODO(), wr.ts, shardInfo) }
// findCellsForRebuild will find all the cells in the given keyspace // and create an entry if the map for them func (wr *Wrangler) findCellsForRebuild(ki *topo.KeyspaceInfo, shardMap map[string]*topo.ShardInfo, cells []string, srvKeyspaceMap map[string]*topo.SrvKeyspace) { for _, si := range shardMap { for _, cell := range si.Cells { if !topo.InCellList(cell, cells) { continue } if _, ok := srvKeyspaceMap[cell]; !ok { srvKeyspaceMap[cell] = &topo.SrvKeyspace{ Shards: make([]topo.SrvShard, 0, 16), ShardingColumnName: ki.ShardingColumnName, ShardingColumnType: ki.ShardingColumnType, ServedFrom: ki.ComputeCellServedFrom(cell), SplitShardCount: ki.SplitShardCount, } } } } }
// Update shard file with new master, replicas, etc. // // Re-read from TopologyServer to make sure we are using the side // effects of all actions. // // This function locks individual SvrShard paths, so it doesn't need a lock // on the shard. func RebuildShard(ctx context.Context, log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, timeout time.Duration, interrupted chan struct{}) (*topo.ShardInfo, error) { log.Infof("RebuildShard %v/%v", keyspace, shard) span := trace.NewSpanFromContext(ctx) span.StartLocal("topotools.RebuildShard") defer span.Finish() ctx = trace.NewContext(ctx, span) // read the existing shard info. It has to exist. shardInfo, err := ts.GetShard(keyspace, shard) if err != nil { return nil, err } // rebuild all cells in parallel wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, cell := range shardInfo.Cells { // skip this cell if we shouldn't rebuild it if !topo.InCellList(cell, cells) { continue } // start with the master if it's in the current cell tabletsAsMap := make(map[topo.TabletAlias]bool) if shardInfo.MasterAlias.Cell == cell { tabletsAsMap[shardInfo.MasterAlias] = true } wg.Add(1) go func(cell string) { defer wg.Done() // Lock the SrvShard so we don't race with other rebuilds of the same // shard in the same cell (e.g. from our peer tablets). actionNode := actionnode.RebuildSrvShard() lockPath, err := actionNode.LockSrvShard(ctx, ts, cell, keyspace, shard, timeout, interrupted) if err != nil { rec.RecordError(err) return } // read the ShardReplication object to find tablets sri, err := ts.GetShardReplication(cell, keyspace, shard) if err != nil { rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err)) return } // add all relevant tablets to the map for _, rl := range sri.ReplicationLinks { tabletsAsMap[rl.TabletAlias] = true if rl.Parent.Cell == cell { tabletsAsMap[rl.Parent] = true } } // convert the map to a list aliases := make([]topo.TabletAlias, 0, len(tabletsAsMap)) for a := range tabletsAsMap { aliases = append(aliases, a) } // read all the Tablet records tablets, err := topo.GetTabletMap(ctx, ts, aliases) switch err { case nil: // keep going, we're good case topo.ErrPartialResult: log.Warningf("Got ErrPartialResult from topo.GetTabletMap in cell %v, some tablets may not be added properly to serving graph", cell) default: rec.RecordError(fmt.Errorf("GetTabletMap in cell %v failed: %v", cell, err)) return } // write the data we need to rebuildErr := rebuildCellSrvShard(ctx, log, ts, shardInfo, cell, tablets) // and unlock if err := actionNode.UnlockSrvShard(ctx, ts, cell, keyspace, shard, lockPath, rebuildErr); err != nil { rec.RecordError(err) } }(cell) } wg.Wait() return shardInfo, rec.Error() }
// changeCallback is run after every action that might // have changed something in the tablet record. func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error { span := trace.NewSpanFromContext(ctx) span.StartLocal("ActionAgent.changeCallback") defer span.Finish() allowQuery := newTablet.IsRunningQueryService() // Read the shard to get SourceShards / TabletControlMap if // we're going to use it. var shardInfo *topo.ShardInfo var tabletControl *topo.TabletControl var blacklistedTables []string var err error if allowQuery { shardInfo, err = agent.TopoServer.GetShard(newTablet.Keyspace, newTablet.Shard) if err != nil { log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err) } else { if newTablet.Type == topo.TYPE_MASTER { allowQuery = len(shardInfo.SourceShards) == 0 } if tc, ok := shardInfo.TabletControlMap[newTablet.Type]; ok { if topo.InCellList(newTablet.Alias.Cell, tc.Cells) { if tc.DisableQueryService { allowQuery = false } blacklistedTables = tc.BlacklistedTables tabletControl = tc } } } } // Read the keyspace on masters to get ShardingColumnType, // for binlog replication, only if source shards are set. var keyspaceInfo *topo.KeyspaceInfo if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 { keyspaceInfo, err = agent.TopoServer.GetKeyspace(newTablet.Keyspace) if err != nil { log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err) keyspaceInfo = nil } } if allowQuery { // There are a few transitions when we're // going to need to restart the query service: // - transitioning from replica to master, so clients // that were already connected don't keep on using // the master as replica or rdonly. // - having different parameters for the query // service. It needs to stop and restart with the // new parameters. That includes: // - changing KeyRange // - changing the BlacklistedTables list if (newTablet.Type == topo.TYPE_MASTER && oldTablet.Type != topo.TYPE_MASTER) || (newTablet.KeyRange != oldTablet.KeyRange) || !reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()) { agent.disallowQueries() } if err := agent.allowQueries(newTablet, blacklistedTables); err != nil { log.Errorf("Cannot start query service: %v", err) } } else { agent.disallowQueries() } // save the tabletControl we've been using, so the background // healthcheck makes the same decisions as we've been making. agent.setTabletControl(tabletControl) // update stream needs to be started or stopped too if agent.DBConfigs != nil { if topo.IsRunningUpdateStream(newTablet.Type) { binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.Mysqld) } else { binlog.DisableUpdateStreamService() } } statsType.Set(string(newTablet.Type)) statsKeyspace.Set(newTablet.Keyspace) statsShard.Set(newTablet.Shard) statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex())) statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex())) // See if we need to start or stop any binlog player if agent.BinlogPlayerMap != nil { if newTablet.Type == topo.TYPE_MASTER { agent.BinlogPlayerMap.RefreshMap(newTablet, keyspaceInfo, shardInfo) } else { agent.BinlogPlayerMap.StopAllPlayersAndReset() } } return nil }