Example #1
0
func (wr *Wrangler) removeShardCell(keyspace, shard, cell string, force bool) error {
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}

	// check the cell is in the list already
	if !topo.InCellList(cell, shardInfo.Cells) {
		return fmt.Errorf("cell %v in not in shard info", cell)
	}

	// check the master alias is not in the cell
	if shardInfo.MasterAlias.Cell == cell {
		return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell)
	}

	// get the ShardReplication object in the cell
	sri, err := wr.ts.GetShardReplication(cell, keyspace, shard)
	switch err {
	case nil:
		if len(sri.ReplicationLinks) > 0 {
			return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.ReplicationLinks))
		}

		// ShardReplication object is now useless, remove it
		if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil {
			return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err)
		}

		// we keep going
	case topo.ErrNoNode:
		// no ShardReplication object, we keep going
	default:
		// we can't get the object, assume topo server is down there,
		// so we look at force flag
		if !force {
			return err
		}
		wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell)
	}

	// now we can update the shard
	wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard)
	newCells := make([]string, 0, len(shardInfo.Cells)-1)
	for _, c := range shardInfo.Cells {
		if c != cell {
			newCells = append(newCells, c)
		}
	}
	shardInfo.Cells = newCells

	return topo.UpdateShard(context.TODO(), wr.ts, shardInfo)
}
Example #2
0
// findCellsForRebuild will find all the cells in the given keyspace
// and create an entry if the map for them
func (wr *Wrangler) findCellsForRebuild(ki *topo.KeyspaceInfo, shardMap map[string]*topo.ShardInfo, cells []string, srvKeyspaceMap map[string]*topo.SrvKeyspace) {
	for _, si := range shardMap {
		for _, cell := range si.Cells {
			if !topo.InCellList(cell, cells) {
				continue
			}
			if _, ok := srvKeyspaceMap[cell]; !ok {
				srvKeyspaceMap[cell] = &topo.SrvKeyspace{
					Shards:             make([]topo.SrvShard, 0, 16),
					ShardingColumnName: ki.ShardingColumnName,
					ShardingColumnType: ki.ShardingColumnType,
					ServedFrom:         ki.ComputeCellServedFrom(cell),
					SplitShardCount:    ki.SplitShardCount,
				}
			}
		}
	}
}
Example #3
0
// Update shard file with new master, replicas, etc.
//
// Re-read from TopologyServer to make sure we are using the side
// effects of all actions.
//
// This function locks individual SvrShard paths, so it doesn't need a lock
// on the shard.
func RebuildShard(ctx context.Context, log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, timeout time.Duration, interrupted chan struct{}) (*topo.ShardInfo, error) {
	log.Infof("RebuildShard %v/%v", keyspace, shard)

	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("topotools.RebuildShard")
	defer span.Finish()
	ctx = trace.NewContext(ctx, span)

	// read the existing shard info. It has to exist.
	shardInfo, err := ts.GetShard(keyspace, shard)
	if err != nil {
		return nil, err
	}

	// rebuild all cells in parallel
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range shardInfo.Cells {
		// skip this cell if we shouldn't rebuild it
		if !topo.InCellList(cell, cells) {
			continue
		}

		// start with the master if it's in the current cell
		tabletsAsMap := make(map[topo.TabletAlias]bool)
		if shardInfo.MasterAlias.Cell == cell {
			tabletsAsMap[shardInfo.MasterAlias] = true
		}

		wg.Add(1)
		go func(cell string) {
			defer wg.Done()

			// Lock the SrvShard so we don't race with other rebuilds of the same
			// shard in the same cell (e.g. from our peer tablets).
			actionNode := actionnode.RebuildSrvShard()
			lockPath, err := actionNode.LockSrvShard(ctx, ts, cell, keyspace, shard, timeout, interrupted)
			if err != nil {
				rec.RecordError(err)
				return
			}

			// read the ShardReplication object to find tablets
			sri, err := ts.GetShardReplication(cell, keyspace, shard)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err))
				return
			}

			// add all relevant tablets to the map
			for _, rl := range sri.ReplicationLinks {
				tabletsAsMap[rl.TabletAlias] = true
				if rl.Parent.Cell == cell {
					tabletsAsMap[rl.Parent] = true
				}
			}

			// convert the map to a list
			aliases := make([]topo.TabletAlias, 0, len(tabletsAsMap))
			for a := range tabletsAsMap {
				aliases = append(aliases, a)
			}

			// read all the Tablet records
			tablets, err := topo.GetTabletMap(ctx, ts, aliases)
			switch err {
			case nil:
				// keep going, we're good
			case topo.ErrPartialResult:
				log.Warningf("Got ErrPartialResult from topo.GetTabletMap in cell %v, some tablets may not be added properly to serving graph", cell)
			default:
				rec.RecordError(fmt.Errorf("GetTabletMap in cell %v failed: %v", cell, err))
				return
			}

			// write the data we need to
			rebuildErr := rebuildCellSrvShard(ctx, log, ts, shardInfo, cell, tablets)

			// and unlock
			if err := actionNode.UnlockSrvShard(ctx, ts, cell, keyspace, shard, lockPath, rebuildErr); err != nil {
				rec.RecordError(err)
			}
		}(cell)
	}
	wg.Wait()

	return shardInfo, rec.Error()
}
Example #4
0
// changeCallback is run after every action that might
// have changed something in the tablet record.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	allowQuery := newTablet.IsRunningQueryService()

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var tabletControl *topo.TabletControl
	var blacklistedTables []string
	var err error
	if allowQuery {
		shardInfo, err = agent.TopoServer.GetShard(newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
		} else {
			if newTablet.Type == topo.TYPE_MASTER {
				allowQuery = len(shardInfo.SourceShards) == 0
			}
			if tc, ok := shardInfo.TabletControlMap[newTablet.Type]; ok {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
					}
					blacklistedTables = tc.BlacklistedTables
					tabletControl = tc
				}
			}
		}
	}

	// Read the keyspace on masters to get ShardingColumnType,
	// for binlog replication, only if source shards are set.
	var keyspaceInfo *topo.KeyspaceInfo
	if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 {
		keyspaceInfo, err = agent.TopoServer.GetKeyspace(newTablet.Keyspace)
		if err != nil {
			log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err)
			keyspaceInfo = nil
		}
	}

	if allowQuery {
		// There are a few transitions when we're
		// going to need to restart the query service:
		// - transitioning from replica to master, so clients
		//   that were already connected don't keep on using
		//   the master as replica or rdonly.
		// - having different parameters for the query
		//   service. It needs to stop and restart with the
		//   new parameters. That includes:
		//   - changing KeyRange
		//   - changing the BlacklistedTables list
		if (newTablet.Type == topo.TYPE_MASTER &&
			oldTablet.Type != topo.TYPE_MASTER) ||
			(newTablet.KeyRange != oldTablet.KeyRange) ||
			!reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()) {
			agent.disallowQueries()
		}
		if err := agent.allowQueries(newTablet, blacklistedTables); err != nil {
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		agent.disallowQueries()
	}

	// save the tabletControl we've been using, so the background
	// healthcheck makes the same decisions as we've been making.
	agent.setTabletControl(tabletControl)

	// update stream needs to be started or stopped too
	if agent.DBConfigs != nil {
		if topo.IsRunningUpdateStream(newTablet.Type) {
			binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.Mysqld)
		} else {
			binlog.DisableUpdateStreamService()
		}
	}

	statsType.Set(string(newTablet.Type))
	statsKeyspace.Set(newTablet.Keyspace)
	statsShard.Set(newTablet.Shard)
	statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex()))
	statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex()))

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topo.TYPE_MASTER {
			agent.BinlogPlayerMap.RefreshMap(newTablet, keyspaceInfo, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}
	return nil
}