Exemplo n.º 1
0
// ReparentTablet tells a tablet to reparent this tablet to the current
// master, based on the current replication position. If there is no
// match, it will fail.
func (wr *Wrangler) ReparentTablet(ctx context.Context, tabletAlias topo.TabletAlias) error {
	// Get specified tablet.
	// Get current shard master tablet.
	// Sanity check they are in the same keyspace/shard.
	// Issue a SetMaster to the tablet.
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}

	shardInfo, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard)
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(shardInfo.MasterAlias) {
		return fmt.Errorf("no master tablet for shard %v/%v", ti.Keyspace, ti.Shard)
	}

	masterTi, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(shardInfo.MasterAlias))
	if err != nil {
		return err
	}

	// Basic sanity checking.
	if masterTi.Type != topo.TYPE_MASTER {
		return fmt.Errorf("TopologyServer has inconsistent state for shard master %v", shardInfo.MasterAlias)
	}
	if masterTi.Keyspace != ti.Keyspace || masterTi.Shard != ti.Shard {
		return fmt.Errorf("master %v and potential slave not in same keyspace/shard", shardInfo.MasterAlias)
	}

	// and do the remote command
	return wr.TabletManagerClient().SetMaster(ctx, ti, topo.ProtoToTabletAlias(shardInfo.MasterAlias), 0, false)
}
Exemplo n.º 2
0
// CopySchemaShard copies the schema from a source tablet to the
// specified shard.  The schema is applied directly on the master of
// the destination shard, and is propogated to the replicas through
// binlogs.
func (wr *Wrangler) CopySchemaShard(ctx context.Context, sourceTabletAlias topo.TabletAlias, tables, excludeTables []string, includeViews bool, destKeyspace, destShard string) error {
	destShardInfo, err := wr.ts.GetShard(ctx, destKeyspace, destShard)
	if err != nil {
		return err
	}

	sourceSd, err := wr.GetSchema(ctx, sourceTabletAlias, tables, excludeTables, includeViews)
	if err != nil {
		return err
	}
	destSd, err := wr.GetSchema(ctx, topo.ProtoToTabletAlias(destShardInfo.MasterAlias), tables, excludeTables, includeViews)
	if err != nil {
		destSd = nil
	}
	if destSd != nil {
		diffs := myproto.DiffSchemaToArray("source", sourceSd, "dest", destSd)
		if diffs == nil {
			// Return early because dest has already the same schema as source.
			return nil
		}
	}

	createSql := sourceSd.ToSQLStrings()
	destTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(destShardInfo.MasterAlias))
	if err != nil {
		return err
	}
	for i, sqlLine := range createSql {
		err = wr.applySqlShard(ctx, destTabletInfo, sqlLine, i == len(createSql)-1)
		if err != nil {
			return err
		}
	}
	return nil
}
Exemplo n.º 3
0
// ValidatePermissionsKeyspace validates all the permissions are the same
// in a keyspace
func (wr *Wrangler) ValidatePermissionsKeyspace(ctx context.Context, keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidatePermissionsShard(ctx, keyspace, shards[0])
	}

	// find the reference permissions using the first shard's master
	si, err := wr.ts.GetShard(ctx, keyspace, shards[0])
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := topo.ProtoToTabletAlias(si.MasterAlias)
	log.Infof("Gathering permissions for reference master %v", referenceAlias)
	referencePermissions, err := wr.GetPermissions(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
	if err != nil {
		return err
	}

	// then diff with all tablets but master 0
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		for _, alias := range aliases {
			if alias == topo.ProtoToTabletAlias(si.MasterAlias) {
				continue
			}

			wg.Add(1)
			go wr.diffPermissions(ctx, referencePermissions, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Permissions diffs:\n%v", er.Error().Error())
	}
	return nil
}
Exemplo n.º 4
0
func (s *server) SetMaster(ctx context.Context, request *pb.SetMasterRequest) (*pb.SetMasterResponse, error) {
	ctx = callinfo.GRPCCallInfo(ctx)
	response := &pb.SetMasterResponse{}
	return response, s.agent.RPCWrapLockAction(ctx, actionnode.TabletActionSetMaster, request, response, true, func() error {
		return s.agent.SetMaster(ctx, topo.ProtoToTabletAlias(request.Parent), request.TimeCreatedNs, request.ForceStartSlave)
	})
}
Exemplo n.º 5
0
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]myproto.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]myproto.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Gathering master position for %v", si.MasterAlias)
			ti, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.tmc.MasterPosition(ctx, ti)
			if err != nil {
				rec.RecordError(err)
				return
			}

			wr.Logger().Infof("Got master position for %v", si.MasterAlias)
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
Exemplo n.º 6
0
func (s *server) InitSlave(ctx context.Context, request *pb.InitSlaveRequest) (*pb.InitSlaveResponse, error) {
	ctx = callinfo.GRPCCallInfo(ctx)
	response := &pb.InitSlaveResponse{}
	return response, s.agent.RPCWrapLockAction(ctx, actionnode.TabletActionInitSlave, request, response, true, func() error {
		return s.agent.InitSlave(ctx, topo.ProtoToTabletAlias(request.Parent), myproto.ProtoToReplicationPosition(request.ReplicationPosition), request.TimeCreatedNs)
	})
}
Exemplo n.º 7
0
// Open opens a connection to the master for every shard
func (exec *TabletExecutor) Open(ctx context.Context, keyspace string) error {
	if !exec.isClosed {
		return nil
	}
	shardNames, err := exec.topoServer.GetShardNames(ctx, keyspace)
	if err != nil {
		return fmt.Errorf("unable to get shard names for keyspace: %s, error: %v", keyspace, err)
	}
	log.Infof("Keyspace: %v, Shards: %v\n", keyspace, shardNames)
	exec.tabletInfos = make([]*topo.TabletInfo, len(shardNames))
	for i, shardName := range shardNames {
		shardInfo, err := exec.topoServer.GetShard(ctx, keyspace, shardName)
		log.Infof("\tShard: %s, ShardInfo: %v\n", shardName, shardInfo)
		if err != nil {
			return fmt.Errorf("unable to get shard info, keyspace: %s, shard: %s, error: %v", keyspace, shardName, err)
		}
		tabletInfo, err := exec.topoServer.GetTablet(ctx, topo.ProtoToTabletAlias(shardInfo.MasterAlias))
		if err != nil {
			return fmt.Errorf("unable to get master tablet info, keyspace: %s, shard: %s, error: %v", keyspace, shardName, err)
		}
		exec.tabletInfos[i] = tabletInfo
		log.Infof("\t\tTabletInfo: %+v\n", tabletInfo)
	}

	if len(exec.tabletInfos) == 0 {
		return fmt.Errorf("keyspace: %s does not contain any master tablets", keyspace)
	}
	exec.isClosed = false
	return nil
}
Exemplo n.º 8
0
// CopySchemaShardFromShard copies the schema from a source shard to the specified destination shard.
// For both source and destination it picks the master tablet. See also CopySchemaShard.
func (wr *Wrangler) CopySchemaShardFromShard(ctx context.Context, tables, excludeTables []string, includeViews bool, sourceKeyspace, sourceShard, destKeyspace, destShard string) error {
	sourceShardInfo, err := wr.ts.GetShard(ctx, sourceKeyspace, sourceShard)
	if err != nil {
		return err
	}

	return wr.CopySchemaShard(ctx, topo.ProtoToTabletAlias(sourceShardInfo.MasterAlias), tables, excludeTables, includeViews, destKeyspace, destShard)
}
Exemplo n.º 9
0
func (s *server) SlaveWasRestarted(ctx context.Context, request *pb.SlaveWasRestartedRequest) (*pb.SlaveWasRestartedResponse, error) {
	ctx = callinfo.GRPCCallInfo(ctx)
	response := &pb.SlaveWasRestartedResponse{}
	return response, s.agent.RPCWrapLockAction(ctx, actionnode.TabletActionSlaveWasRestarted, request, response, true, func() error {
		return s.agent.SlaveWasRestarted(ctx, &actionnode.SlaveWasRestartedArgs{
			Parent: topo.ProtoToTabletAlias(request.Parent),
		})
	})
}
Exemplo n.º 10
0
// ValidateSchemaShard will diff the schema from all the tablets in the shard.
func (wr *Wrangler) ValidateSchemaShard(ctx context.Context, keyspace, shard string, excludeTables []string, includeViews bool) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// get schema from the master, or error
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shard)
	}
	log.Infof("Gathering schema for master %v", si.MasterAlias)
	masterSchema, err := wr.GetSchema(ctx, topo.ProtoToTabletAlias(si.MasterAlias), nil, excludeTables, includeViews)
	if err != nil {
		return err
	}

	// read all the aliases in the shard, that is all tablets that are
	// replicating from the master
	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// then diff with all slaves
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, alias := range aliases {
		if alias == topo.ProtoToTabletAlias(si.MasterAlias) {
			continue
		}

		wg.Add(1)
		go wr.diffSchema(ctx, masterSchema, topo.ProtoToTabletAlias(si.MasterAlias), alias, excludeTables, includeViews, &wg, &er)
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Schema diffs:\n%v", er.Error().Error())
	}
	return nil
}
Exemplo n.º 11
0
// ValidatePermissionsShard validates all the permissions are the same
// in a shard
func (wr *Wrangler) ValidatePermissionsShard(ctx context.Context, keyspace, shard string) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// get permissions from the master, or error
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shard)
	}
	log.Infof("Gathering permissions for master %v", si.MasterAlias)
	masterPermissions, err := wr.GetPermissions(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
	if err != nil {
		return err
	}

	// read all the aliases in the shard, that is all tablets that are
	// replicating from the master
	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// then diff all of them, except master
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, alias := range aliases {
		if alias == topo.ProtoToTabletAlias(si.MasterAlias) {
			continue
		}
		wg.Add(1)
		go wr.diffPermissions(ctx, masterPermissions, topo.ProtoToTabletAlias(si.MasterAlias), alias, &wg, &er)
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Permissions diffs:\n%v", er.Error().Error())
	}
	return nil
}
Exemplo n.º 12
0
// ApplySchemaShard applies a schema change on a shard.
// Note for 'complex' mode (the 'simple' mode is easy enough that we
// don't need to handle recovery that much): this method is able to
// recover if interrupted in the middle, because it knows which server
// has the schema change already applied, and will just pass through them
// very quickly.
func (wr *Wrangler) ApplySchemaShard(ctx context.Context, keyspace, shard, change string, newParentTabletAlias topo.TabletAlias, simple, force bool, waitSlaveTimeout time.Duration) (*myproto.SchemaChangeResult, error) {
	// read the shard
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return nil, err
	}

	// preflight on the master, to get baseline
	// this assumes the master doesn't have the schema upgrade applied
	// If the master does, and some slaves don't, may have to
	// fix them manually one at a time, or re-clone them.
	// we do this outside of the shard lock because we can.
	log.Infof("Running Preflight on Master %v", shardInfo.MasterAlias)
	if err != nil {
		return nil, err
	}
	preflight, err := wr.PreflightSchema(ctx, topo.ProtoToTabletAlias(shardInfo.MasterAlias), change)
	if err != nil {
		return nil, err
	}

	return wr.lockAndApplySchemaShard(ctx, shardInfo, preflight, keyspace, shard, topo.ProtoToTabletAlias(shardInfo.MasterAlias), change, newParentTabletAlias, simple, force, waitSlaveTimeout)
}
Exemplo n.º 13
0
func (wr *Wrangler) validateReplication(ctx context.Context, shardInfo *topo.ShardInfo, tabletMap map[topo.TabletAlias]*topo.TabletInfo, results chan<- error) {
	masterTablet, ok := tabletMap[topo.ProtoToTabletAlias(shardInfo.MasterAlias)]
	if !ok {
		results <- fmt.Errorf("master %v not in tablet map", shardInfo.MasterAlias)
		return
	}

	slaveList, err := wr.tmc.GetSlaves(ctx, masterTablet)
	if err != nil {
		results <- fmt.Errorf("GetSlaves(%v) failed: %v", masterTablet, err)
		return
	}
	if len(slaveList) == 0 {
		results <- fmt.Errorf("no slaves of tablet %v found", shardInfo.MasterAlias)
		return
	}

	tabletIPMap := make(map[string]*topo.Tablet)
	slaveIPMap := make(map[string]bool)
	for _, tablet := range tabletMap {
		tabletIPMap[normalizeIP(tablet.IPAddr)] = tablet.Tablet
	}

	// See if every slave is in the replication graph.
	for _, slaveAddr := range slaveList {
		if tabletIPMap[normalizeIP(slaveAddr)] == nil {
			results <- fmt.Errorf("slave %v not in replication graph for shard %v/%v (mysql instance without vttablet?)", slaveAddr, shardInfo.Keyspace(), shardInfo.ShardName())
		}
		slaveIPMap[normalizeIP(slaveAddr)] = true
	}

	// See if every entry in the replication graph is connected to the master.
	for _, tablet := range tabletMap {
		if !tablet.IsSlaveType() {
			continue
		}

		if !slaveIPMap[normalizeIP(tablet.IPAddr)] {
			results <- fmt.Errorf("slave %v not replicating: %v slave list: %q", tablet.Alias, tablet.IPAddr, slaveList)
		}
	}
}
Exemplo n.º 14
0
func (wr *Wrangler) waitForFilteredReplication(ctx context.Context, sourcePositions map[*topo.ShardInfo]myproto.ReplicationPosition, destinationShards []*topo.ShardInfo, waitTime time.Duration) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range destinationShards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			for _, sourceShard := range si.SourceShards {
				// we're waiting on this guy
				blpPosition := blproto.BlpPosition{
					Uid: sourceShard.Uid,
				}

				// find the position it should be at
				for s, pos := range sourcePositions {
					if s.Keyspace() == sourceShard.Keyspace && s.ShardName() == sourceShard.Shard {
						blpPosition.Position = pos
					}
				}

				// and wait for it
				wr.Logger().Infof("Waiting for %v to catch up", si.MasterAlias)
				tablet, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
				if err != nil {
					rec.RecordError(err)
					return
				}

				if err := wr.tmc.WaitBlpPosition(ctx, tablet, blpPosition, waitTime); err != nil {
					rec.RecordError(err)
				} else {
					wr.Logger().Infof("%v caught up", si.MasterAlias)
				}
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Exemplo n.º 15
0
func newCellShardTabletsCache(ts topo.Server) *VersionedObjectCacheMap {
	return NewVersionedObjectCacheMap(func(key string) *VersionedObjectCache {
		return NewVersionedObjectCache(func(ctx context.Context) (VersionedObject, error) {
			parts := strings.Split(key, "/")
			if len(parts) != 3 {
				return nil, fmt.Errorf("Invalid shard tablets path: %v", key)
			}
			sr, err := ts.GetShardReplication(ctx, parts[0], parts[1], parts[2])
			if err != nil {
				return nil, err
			}
			result := &CellShardTablets{
				Cell:          parts[0],
				KeyspaceName:  parts[1],
				ShardName:     parts[2],
				TabletAliases: make([]topo.TabletAlias, len(sr.Nodes)),
			}
			for i, node := range sr.Nodes {
				result.TabletAliases[i] = topo.ProtoToTabletAlias(node.TabletAlias)
			}
			return result, nil
		})
	})
}
Exemplo n.º 16
0
// refreshMasters will just RPC-ping all the masters with RefreshState
func (wr *Wrangler) refreshMasters(ctx context.Context, shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("RefreshState master %v", si.MasterAlias)
			ti, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
			if err != nil {
				rec.RecordError(err)
				return
			}

			if err := wr.tmc.RefreshState(ctx, ti); err != nil {
				rec.RecordError(err)
			} else {
				wr.Logger().Infof("%v responded", si.MasterAlias)
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Exemplo n.º 17
0
// Does a topo lookup for a single shard, and returns the tablet record of the master tablet.
func resolveDestinationShardMaster(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (*topo.TabletInfo, error) {
	var ti *topo.TabletInfo
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	si, err := topo.GetShard(shortCtx, wr.TopoServer(), keyspace, shard)
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to resolve destination shard %v/%v", keyspace, shard)
	}

	if topo.TabletAliasIsZero(si.MasterAlias) {
		return ti, fmt.Errorf("no master in destination shard %v/%v", keyspace, shard)
	}

	wr.Logger().Infof("Found target master alias %v in shard %v/%v", si.MasterAlias, keyspace, shard)

	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	ti, err = topo.GetTablet(shortCtx, wr.TopoServer(), topo.ProtoToTabletAlias(si.MasterAlias))
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to get master tablet from alias %v in shard %v/%v",
			si.MasterAlias, keyspace, shard)
	}
	return ti, nil
}
Exemplo n.º 18
0
func (sdw *SplitDiffWorker) synchronizeReplication(ctx context.Context) error {
	sdw.SetState(WorkerStateSyncReplication)

	masterInfo, err := sdw.wr.TopoServer().GetTablet(ctx, topo.ProtoToTabletAlias(sdw.shardInfo.MasterAlias))
	if err != nil {
		return fmt.Errorf("synchronizeReplication: cannot get Tablet record for master %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	// 1 - stop the master binlog replication, get its current position
	sdw.wr.Logger().Infof("Stopping master binlog replication on %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	blpPositionList, err := sdw.wr.TabletManagerClient().StopBlp(shortCtx, masterInfo)
	cancel()
	if err != nil {
		return fmt.Errorf("StopBlp for %v failed: %v", sdw.shardInfo.MasterAlias, err)
	}
	wrangler.RecordStartBlpAction(sdw.cleaner, masterInfo)

	// 2 - stop all the source tablets at a binlog position
	//     higher than the destination master
	stopPositionList := blproto.BlpPositionList{
		Entries: make([]blproto.BlpPosition, len(sdw.shardInfo.SourceShards)),
	}
	for i, ss := range sdw.shardInfo.SourceShards {
		// find where we should be stopping
		blpPos, err := blpPositionList.FindBlpPositionById(ss.Uid)
		if err != nil {
			return fmt.Errorf("no binlog position on the master for Uid %v", ss.Uid)
		}

		// read the tablet
		sourceTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.sourceAliases[i])
		if err != nil {
			return err
		}

		// stop replication
		sdw.wr.Logger().Infof("Stopping slave[%v] %v at a minimum of %v", i, sdw.sourceAliases[i], blpPos.Position)
		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		stoppedAt, err := sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, sourceTablet, blpPos.Position, *remoteActionsTimeout)
		cancel()
		if err != nil {
			return fmt.Errorf("cannot stop slave %v at right binlog position %v: %v", sdw.sourceAliases[i], blpPos.Position, err)
		}
		stopPositionList.Entries[i].Uid = ss.Uid
		stopPositionList.Entries[i].Position = stoppedAt

		// change the cleaner actions from ChangeSlaveType(rdonly)
		// to StartSlave() + ChangeSlaveType(spare)
		wrangler.RecordStartSlaveAction(sdw.cleaner, sourceTablet)
		action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.sourceAliases[i])
		if err != nil {
			return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.sourceAliases[i], err)
		}
		action.TabletType = topo.TYPE_SPARE
	}

	// 3 - ask the master of the destination shard to resume filtered
	//     replication up to the new list of positions
	sdw.wr.Logger().Infof("Restarting master %v until it catches up to %v", sdw.shardInfo.MasterAlias, stopPositionList)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	masterPos, err := sdw.wr.TabletManagerClient().RunBlpUntil(shortCtx, masterInfo, &stopPositionList, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("RunBlpUntil for %v until %v failed: %v", sdw.shardInfo.MasterAlias, stopPositionList, err)
	}

	// 4 - wait until the destination tablet is equal or passed
	//     that master binlog position, and stop its replication.
	sdw.wr.Logger().Infof("Waiting for destination tablet %v to catch up to %v", sdw.destinationAlias, masterPos)
	destinationTablet, err := sdw.wr.TopoServer().GetTablet(ctx, sdw.destinationAlias)
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	_, err = sdw.wr.TabletManagerClient().StopSlaveMinimum(shortCtx, destinationTablet, masterPos, *remoteActionsTimeout)
	cancel()
	if err != nil {
		return fmt.Errorf("StopSlaveMinimum for %v at %v failed: %v", sdw.destinationAlias, masterPos, err)
	}
	wrangler.RecordStartSlaveAction(sdw.cleaner, destinationTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(sdw.cleaner, sdw.destinationAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", sdw.destinationAlias, err)
	}
	action.TabletType = topo.TYPE_SPARE

	// 5 - restart filtered replication on destination master
	sdw.wr.Logger().Infof("Restarting filtered replication on master %v", sdw.shardInfo.MasterAlias)
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	err = sdw.wr.TabletManagerClient().StartBlp(shortCtx, masterInfo)
	if err := sdw.cleaner.RemoveActionByName(wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String()); err != nil {
		sdw.wr.Logger().Warningf("Cannot find cleaning action %v/%v: %v", wrangler.StartBlpActionName, sdw.shardInfo.MasterAlias.String(), err)
	}
	cancel()
	if err != nil {
		return fmt.Errorf("StartBlp failed for %v: %v", sdw.shardInfo.MasterAlias, err)
	}

	return nil
}
Exemplo n.º 19
0
// RestoreFromBackup is the main entry point for backup restore.
// It will either work, fail gracefully, or return
// an error in case of a non-recoverable error.
// It takes the action lock so no RPC interferes.
func (agent *ActionAgent) RestoreFromBackup(ctx context.Context) error {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	// change type to RESTORE (using UpdateTabletFields so it's
	// always authorized)
	tablet := agent.Tablet()
	originalType := tablet.Type
	if err := agent.TopoServer.UpdateTabletFields(ctx, tablet.Alias, func(tablet *topo.Tablet) error {
		tablet.Type = topo.TYPE_RESTORE
		return nil
	}); err != nil {
		return fmt.Errorf("Cannot change type to RESTORE: %v", err)
	}

	// do the optional restore, if that fails we are in a bad state,
	// just log.Fatalf out.
	bucket := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
	pos, err := mysqlctl.Restore(ctx, agent.MysqlDaemon, bucket, *restoreConcurrency, agent.hookExtraEnv())
	if err != nil && err != mysqlctl.ErrNoBackup {
		return fmt.Errorf("Cannot restore original backup: %v", err)
	}

	if err == nil {
		// now read the shard to find the current master, and its location
		si, err := agent.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
		if err != nil {
			return fmt.Errorf("Cannot read shard: %v", err)
		}
		if si.MasterAlias == nil {
			return fmt.Errorf("Shard %v/%v has no master", tablet.Keyspace, tablet.Shard)
		}
		ti, err := agent.TopoServer.GetTablet(ctx, topo.ProtoToTabletAlias(si.MasterAlias))
		if err != nil {
			return fmt.Errorf("Cannot read master tablet %v: %v", si.MasterAlias, err)
		}

		// set replication straight
		status := &myproto.ReplicationStatus{
			Position:   pos,
			MasterHost: ti.Hostname,
			MasterPort: ti.Portmap["mysql"],
		}
		cmds, err := agent.MysqlDaemon.StartReplicationCommands(status)
		if err != nil {
			return fmt.Errorf("MysqlDaemon.StartReplicationCommands failed: %v", err)
		}
		if err := agent.MysqlDaemon.ExecuteSuperQueryList(cmds); err != nil {
			return fmt.Errorf("MysqlDaemon.ExecuteSuperQueryList failed: %v", err)
		}
	}

	// change type back to original type
	if err := agent.TopoServer.UpdateTabletFields(ctx, tablet.Alias, func(tablet *topo.Tablet) error {
		tablet.Type = originalType
		return nil
	}); err != nil {
		return fmt.Errorf("Cannot change type back to %v: %v", originalType, err)
	}
	return nil
}
Exemplo n.º 20
0
// masterMigrateServedFrom handles the master migration. The ordering is
// a bit different than for rdonly / replica to guarantee a smooth transition.
//
// The order is as follows:
// - Add BlacklistedTables on the source shard map for master
// - Refresh the source master, so it stops writing on the tables
// - Get the source master position, wait until destination master reaches it
// - Clear SourceShard on the destination Shard
// - Refresh the destination master, so its stops its filtered
//   replication and starts accepting writes
func (wr *Wrangler) masterMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, tables []string, ev *events.MigrateServedFrom, filteredReplicationWaitTime time.Duration) error {
	// Read the data we need
	sourceMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(sourceShard.MasterAlias))
	if err != nil {
		return err
	}
	destinationMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(destinationShard.MasterAlias))
	if err != nil {
		return err
	}

	// Update source shard (more blacklisted tables)
	event.DispatchUpdate(ev, "updating source shard")
	if err := sourceShard.UpdateSourceBlacklistedTables(pb.TabletType_MASTER, nil, false, tables); err != nil {
		return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}
	if err := topo.UpdateShard(ctx, wr.ts, sourceShard); err != nil {
		return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}

	// Now refresh the blacklisted table list on the source master
	event.DispatchUpdate(ev, "refreshing source master so it updates its blacklisted tables")
	if err := wr.tmc.RefreshState(ctx, sourceMasterTabletInfo); err != nil {
		return err
	}

	// get the position
	event.DispatchUpdate(ev, "getting master position")
	masterPosition, err := wr.tmc.MasterPosition(ctx, sourceMasterTabletInfo)
	if err != nil {
		return err
	}

	// wait for it
	event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
	if err := wr.tmc.WaitBlpPosition(ctx, destinationMasterTabletInfo, blproto.BlpPosition{
		Uid:      0,
		Position: masterPosition,
	}, filteredReplicationWaitTime); err != nil {
		return err
	}

	// Update the destination keyspace (its ServedFrom has changed)
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(ctx, wr.ts, ki); err != nil {
		return err
	}

	// Update the destination shard (no more source shard)
	event.DispatchUpdate(ev, "updating destination shard")
	destinationShard.SourceShards = nil
	if err := topo.UpdateShard(ctx, wr.ts, destinationShard); err != nil {
		return err
	}

	// Tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if err := wr.refreshMasters(ctx, []*topo.ShardInfo{destinationShard}); err != nil {
		return err
	}

	return nil
}
Exemplo n.º 21
0
// TestInitMasterShard is the good scenario test, where everything
// works as planned
func TestInitMasterShard(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)
	vp := NewVtctlPipe(t, ts)
	defer vp.Close()

	// Create a master, a couple good slaves
	master := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER)
	goodSlave1 := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA)
	goodSlave2 := NewFakeTablet(t, wr, "cell2", 2, topo.TYPE_REPLICA)

	// Master: set a plausible ReplicationPosition to return,
	// and expect to add entry in _vt.reparent_journal
	master.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{
		GTIDSet: myproto.MariadbGTID{
			Domain:   5,
			Server:   456,
			Sequence: 890,
		},
	}
	master.FakeMysqlDaemon.ReadOnly = true
	master.FakeMysqlDaemon.ResetReplicationResult = []string{"reset rep 1"}
	master.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"new master shouldn't use this"}
	master.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"reset rep 1",
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, master_alias, replication_position) VALUES",
	}
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// Slave1: expect to be reset and re-parented
	goodSlave1.FakeMysqlDaemon.ReadOnly = true
	goodSlave1.FakeMysqlDaemon.ResetReplicationResult = []string{"reset rep 1"}
	goodSlave1.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         master.Tablet.Hostname,
		MasterPort:         master.Tablet.Portmap["mysql"],
		MasterConnectRetry: 10,
	}
	goodSlave1.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"cmd1"}
	goodSlave1.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"reset rep 1",
		"cmd1",
	}
	goodSlave1.StartActionLoop(t, wr)
	defer goodSlave1.StopActionLoop(t)

	// Slave2: expect to be re-parented
	goodSlave2.FakeMysqlDaemon.ReadOnly = true
	goodSlave2.FakeMysqlDaemon.ResetReplicationResult = []string{"reset rep 2"}
	goodSlave2.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         master.Tablet.Hostname,
		MasterPort:         master.Tablet.Portmap["mysql"],
		MasterConnectRetry: 10,
	}
	goodSlave2.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"cmd1", "cmd2"}
	goodSlave2.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"reset rep 2",
		"cmd1",
		"cmd2",
	}
	goodSlave2.StartActionLoop(t, wr)
	defer goodSlave2.StopActionLoop(t)

	// run InitShardMaster
	if err := vp.Run([]string{"InitShardMaster", "-wait_slave_timeout", "10s", master.Tablet.Keyspace + "/" + master.Tablet.Shard, master.Tablet.Alias.String()}); err != nil {
		t.Fatalf("InitShardMaster failed: %v", err)
	}

	// check what was run
	if master.FakeMysqlDaemon.ReadOnly {
		t.Errorf("master was not turned read-write")
	}
	si, err := ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	if topo.ProtoToTabletAlias(si.MasterAlias) != master.Tablet.Alias {
		t.Errorf("unexpected shard master alias, got %v expected %v", si.MasterAlias, master.Tablet.Alias)
	}
	if err := master.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Fatalf("master.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
	if err := goodSlave1.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Fatalf("goodSlave1.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
	if err := goodSlave2.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Fatalf("goodSlave2.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
}
Exemplo n.º 22
0
// finalizeTabletExternallyReparented performs slow, synchronized reconciliation
// tasks that ensure topology is self-consistent, and then marks the reparent as
// finished by updating the global shard record.
func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context, si *topo.ShardInfo, ev *events.Reparent) (err error) {
	var wg sync.WaitGroup
	var errs concurrency.AllErrorRecorder
	oldMasterAlias := si.MasterAlias

	// Update the tablet records and serving graph for the old and new master concurrently.
	event.DispatchUpdate(ev, "updating old and new master tablet records")
	log.Infof("finalizeTabletExternallyReparented: updating tablet records")
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Update our own record to master.
		var updatedTablet *topo.Tablet
		err := topo.UpdateTabletFields(ctx, agent.TopoServer, agent.TabletAlias,
			func(tablet *topo.Tablet) error {
				tablet.Type = topo.TYPE_MASTER
				tablet.Health = nil
				updatedTablet = tablet
				return nil
			})
		if err != nil {
			errs.RecordError(err)
			return
		}

		// Update the serving graph for the tablet.
		if updatedTablet != nil {
			errs.RecordError(
				topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, updatedTablet))
		}
	}()

	if !topo.TabletAliasIsZero(oldMasterAlias) {
		wg.Add(1)
		go func() {
			// Force the old master to spare.
			var oldMasterTablet *topo.Tablet
			err := topo.UpdateTabletFields(ctx, agent.TopoServer, topo.ProtoToTabletAlias(oldMasterAlias),
				func(tablet *topo.Tablet) error {
					tablet.Type = topo.TYPE_SPARE
					oldMasterTablet = tablet
					return nil
				})
			if err != nil {
				errs.RecordError(err)
				wg.Done()
				return
			}

			if oldMasterTablet != nil {
				// We now know more about the old master, so add it to event data.
				ev.OldMaster = *oldMasterTablet

				// Update the serving graph.
				errs.RecordError(
					topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, oldMasterTablet))
				wg.Done()

				// Tell the old master to refresh its state. We don't need to wait for it.
				tmc := tmclient.NewTabletManagerClient()
				tmc.RefreshState(ctx, topo.NewTabletInfo(oldMasterTablet, -1))
			}
		}()
	}

	tablet := agent.Tablet()

	// Wait for the tablet records to be updated. At that point, any rebuild will
	// see the new master, so we're ready to mark the reparent as done in the
	// global shard record.
	wg.Wait()
	if errs.HasErrors() {
		return errs.Error()
	}

	// Update the master field in the global shard record. We don't use a lock
	// here anymore. The lock was only to ensure that the global shard record
	// didn't get modified between the time when we read it and the time when we
	// write it back. Now we use an update loop pattern to do that instead.
	event.DispatchUpdate(ev, "updating global shard record")
	log.Infof("finalizeTabletExternallyReparented: updating global shard record")
	si, err = topo.UpdateShardFields(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard, func(shard *pb.Shard) error {
		shard.MasterAlias = topo.TabletAliasToProto(tablet.Alias)
		return nil
	})
	if err != nil {
		return err
	}

	// We already took care of updating the serving graph for the old and new masters.
	// All that's left now is in case of a cross-cell reparent, we need to update the
	// master cell setting in the SrvShard records of all cells.
	if oldMasterAlias == nil || oldMasterAlias.Cell != tablet.Alias.Cell {
		event.DispatchUpdate(ev, "rebuilding shard serving graph")
		log.Infof("finalizeTabletExternallyReparented: updating SrvShard in all cells for cross-cell reparent")
		if err := topotools.UpdateAllSrvShards(ctx, agent.TopoServer, si); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
Exemplo n.º 23
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := topo.GetShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if si.MasterAlias != nil && *si.MasterAlias == *topo.TabletAliasToProto(tablet.Alias) {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo: *si,
		NewMaster: *tablet.Tablet,
		OldMaster: topo.Tablet{
			Alias: topo.ProtoToTabletAlias(si.MasterAlias),
			Type:  topo.TYPE_MASTER,
		},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := *tablet.Tablet
	newTablet := oldTablet
	newTablet.Type = topo.TYPE_MASTER
	newTablet.Health = nil
	agent.setTablet(topo.NewTabletInfo(&newTablet, -1))
	if err := agent.updateState(ctx, &oldTablet, "fastTabletExternallyReparented"); err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err)
	}

	agent.mutex.Lock()
	agent._tabletExternallyReparentedTime = time.Now()
	agent.mutex.Unlock()

	// Directly write the new master endpoint in the serving graph.
	// We will do a true rebuild in the background soon, but in the meantime,
	// this will be enough for clients to re-resolve the new master.
	event.DispatchUpdate(ev, "writing new master endpoint")
	log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
	ep, err := tablet.EndPoint()
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err)
	}
	err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
		si.Keyspace(), si.ShardName(), topo.TYPE_MASTER,
		&pb.EndPoints{Entries: []*pb.EndPoint{ep}}, -1)
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err)
	}
	externalReparentStats.Record("NewMasterVisible", startTime)

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}
Exemplo n.º 24
0
func (s *server) PopulateReparentJournal(ctx context.Context, request *pb.PopulateReparentJournalRequest) (*pb.PopulateReparentJournalResponse, error) {
	ctx = callinfo.GRPCCallInfo(ctx)
	response := &pb.PopulateReparentJournalResponse{}
	return response, s.agent.RPCWrap(ctx, actionnode.TabletActionPopulateReparentJournal, request, response, func() error {
		position, err := myproto.DecodeReplicationPosition(request.ReplicationPosition)
		if err != nil {
			return err
		}
		return s.agent.PopulateReparentJournal(ctx, request.TimeCreatedNs, request.ActionName, topo.ProtoToTabletAlias(request.MasterAlias), position)
	})
}
Exemplo n.º 25
0
func (wr *Wrangler) applySchemaShardComplex(ctx context.Context, statusArray []*tabletStatus, shardInfo *topo.ShardInfo, preflight *myproto.SchemaChangeResult, masterTabletAlias topo.TabletAlias, change string, newParentTabletAlias topo.TabletAlias, force bool, waitSlaveTimeout time.Duration) (*myproto.SchemaChangeResult, error) {
	// apply the schema change to all replica / slave tablets
	for _, status := range statusArray {
		// if already applied, we skip this guy
		diffs := myproto.DiffSchemaToArray("after", preflight.AfterSchema, status.ti.Alias.String(), status.beforeSchema)
		if len(diffs) == 0 {
			log.Infof("Tablet %v already has the AfterSchema, skipping", status.ti.Alias)
			continue
		}

		// make sure the before schema matches
		diffs = myproto.DiffSchemaToArray("master", preflight.BeforeSchema, status.ti.Alias.String(), status.beforeSchema)
		if len(diffs) > 0 {
			if force {
				log.Warningf("Tablet %v has inconsistent schema, ignoring: %v", status.ti.Alias, strings.Join(diffs, "\n"))
			} else {
				return nil, fmt.Errorf("Tablet %v has inconsistent schema: %v", status.ti.Alias, strings.Join(diffs, "\n"))
			}
		}

		// take this guy out of the serving graph if necessary
		ti, err := wr.ts.GetTablet(ctx, status.ti.Alias)
		if err != nil {
			return nil, err
		}
		typeChangeRequired := ti.Tablet.IsInServingGraph()
		if typeChangeRequired {
			// note we want to update the serving graph there
			err = wr.changeTypeInternal(ctx, ti.Alias, topo.TYPE_SCHEMA_UPGRADE)
			if err != nil {
				return nil, err
			}
		}

		// apply the schema change
		log.Infof("Applying schema change to slave %v in complex mode", status.ti.Alias)
		sc := &myproto.SchemaChange{Sql: change, Force: force, AllowReplication: false, BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema}
		_, err = wr.ApplySchema(ctx, status.ti.Alias, sc)
		if err != nil {
			return nil, err
		}

		// put this guy back into the serving graph
		if typeChangeRequired {
			err = wr.changeTypeInternal(ctx, ti.Alias, ti.Tablet.Type)
			if err != nil {
				return nil, err
			}
		}
	}

	// if newParentTabletAlias is passed in, use that as the new master
	if !newParentTabletAlias.IsZero() {
		log.Infof("Reparenting with new master set to %v", newParentTabletAlias)
		oldMasterAlias := topo.ProtoToTabletAlias(shardInfo.MasterAlias)

		// Create reusable Reparent event with available info
		ev := &events.Reparent{}

		if err := wr.plannedReparentShardLocked(ctx, ev, shardInfo.Keyspace(), shardInfo.ShardName(), newParentTabletAlias, waitSlaveTimeout); err != nil {
			return nil, err
		}

		// Here we would apply the schema change to the old
		// master, but we just scrap it, to be consistent
		// with the previous implementation of the reparent.
		// (this code will be refactored at some point anyway)
		if err := wr.Scrap(ctx, oldMasterAlias, false, false); err != nil {
			wr.Logger().Warningf("Scrapping old master %v from shard %v/%v failed: %v", oldMasterAlias, shardInfo.Keyspace(), shardInfo.ShardName(), err)
		}
	}
	return &myproto.SchemaChangeResult{BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema}, nil
}
Exemplo n.º 26
0
func (wr *Wrangler) emergencyReparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, masterElectTabletAlias topo.TabletAlias, waitSlaveTimeout time.Duration) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}
	ev.ShardInfo = *shardInfo

	event.DispatchUpdate(ev, "reading all tablets")
	tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// Check corner cases we're going to depend on
	masterElectTabletInfo, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v is not in the shard", masterElectTabletAlias)
	}
	ev.NewMaster = *masterElectTabletInfo.Tablet
	if topo.ProtoToTabletAlias(shardInfo.MasterAlias) == masterElectTabletAlias {
		return fmt.Errorf("master-elect tablet %v is already the master", masterElectTabletAlias)
	}

	// Deal with the old master: try to remote-scrap it, if it's
	// truely dead we force-scrap it. Remove it from our map in any case.
	if !topo.TabletAliasIsZero(shardInfo.MasterAlias) {
		scrapOldMaster := true
		oldMasterTabletInfo, ok := tabletMap[topo.ProtoToTabletAlias(shardInfo.MasterAlias)]
		if ok {
			delete(tabletMap, topo.ProtoToTabletAlias(shardInfo.MasterAlias))
		} else {
			oldMasterTabletInfo, err = wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(shardInfo.MasterAlias))
			if err != nil {
				wr.logger.Warningf("cannot read old master tablet %v, won't touch it: %v", shardInfo.MasterAlias, err)
				scrapOldMaster = false
			}
		}

		if scrapOldMaster {
			ev.OldMaster = *oldMasterTabletInfo.Tablet
			wr.logger.Infof("scrapping old master %v", shardInfo.MasterAlias)

			ctx, cancel := context.WithTimeout(ctx, waitSlaveTimeout)
			defer cancel()

			if err := wr.tmc.Scrap(ctx, oldMasterTabletInfo); err != nil {
				wr.logger.Warningf("remote scrapping failed master failed, will force the scrap: %v", err)

				if err := topotools.Scrap(ctx, wr.ts, topo.ProtoToTabletAlias(shardInfo.MasterAlias), true); err != nil {
					wr.logger.Warningf("old master topo scrapping failed, continuing anyway: %v", err)
				}
			}
		}
	}

	// Stop replication on all slaves, get their current
	// replication position
	event.DispatchUpdate(ev, "stop replication on all slaves")
	wg := sync.WaitGroup{}
	mu := sync.Mutex{}
	statusMap := make(map[topo.TabletAlias]myproto.ReplicationStatus)
	for alias, tabletInfo := range tabletMap {
		wg.Add(1)
		go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) {
			defer wg.Done()
			wr.logger.Infof("getting replication position from %v", alias)
			ctx, cancel := context.WithTimeout(ctx, waitSlaveTimeout)
			defer cancel()
			rp, err := wr.TabletManagerClient().StopReplicationAndGetStatus(ctx, tabletInfo)
			if err != nil {
				wr.logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", alias, err)
				return
			}
			mu.Lock()
			statusMap[alias] = rp
			mu.Unlock()
		}(alias, tabletInfo)
	}
	wg.Wait()

	// Verify masterElect is alive and has the most advanced position
	masterElectStatus, ok := statusMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("couldn't get master elect %v replication position", masterElectTabletAlias)
	}
	for alias, status := range statusMap {
		if alias == masterElectTabletAlias {
			continue
		}
		if !masterElectStatus.Position.AtLeast(status.Position) {
			return fmt.Errorf("tablet %v is more advanced than master elect tablet %v: %v > %v", alias, masterElectTabletAlias, status.Position, masterElectStatus)
		}
	}

	// Promote the masterElect
	wr.logger.Infof("promote slave %v", masterElectTabletAlias)
	event.DispatchUpdate(ev, "promoting slave")
	rp, err := wr.tmc.PromoteSlave(ctx, masterElectTabletInfo)
	if err != nil {
		return fmt.Errorf("master-elect tablet %v failed to be upgraded to master: %v", masterElectTabletAlias, err)
	}

	// Reset replication on all slaves to point to the new master, and
	// insert test row in the new master.
	// Go through all the tablets:
	// - new master: populate the reparent journal
	// - everybody else: reparent to new master, wait for row
	event.DispatchUpdate(ev, "reparenting all tablets")
	now := time.Now().UnixNano()
	wgMaster := sync.WaitGroup{}
	wgSlaves := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	var masterErr error
	for alias, tabletInfo := range tabletMap {
		if alias == masterElectTabletAlias {
			wgMaster.Add(1)
			go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) {
				defer wgMaster.Done()
				wr.logger.Infof("populating reparent journal on new master %v", alias)
				masterErr = wr.TabletManagerClient().PopulateReparentJournal(ctx, tabletInfo, now, emergencyReparentShardOperation, alias, rp)
			}(alias, tabletInfo)
		} else {
			wgSlaves.Add(1)
			go func(alias topo.TabletAlias, tabletInfo *topo.TabletInfo) {
				defer wgSlaves.Done()
				wr.logger.Infof("setting new master on slave %v", alias)
				forceStartSlave := false
				if status, ok := statusMap[alias]; ok {
					forceStartSlave = status.SlaveIORunning || status.SlaveSQLRunning
				}
				if err := wr.TabletManagerClient().SetMaster(ctx, tabletInfo, masterElectTabletAlias, now, forceStartSlave); err != nil {
					rec.RecordError(fmt.Errorf("Tablet %v SetMaster failed: %v", alias, err))
				}
			}(alias, tabletInfo)
		}
	}

	// After the master is done, we can update the shard record
	// (note with semi-sync, it also means at least one slave is done)
	wgMaster.Wait()
	if masterErr != nil {
		wgSlaves.Wait()
		return fmt.Errorf("failed to PopulateReparentJournal on master: %v", masterErr)
	}
	wr.logger.Infof("updating shard record with new master %v", masterElectTabletAlias)
	shardInfo.MasterAlias = topo.TabletAliasToProto(masterElectTabletAlias)
	if err := topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil {
		wgSlaves.Wait()
		return fmt.Errorf("failed to update shard master record: %v", err)
	}

	// Wait for the slaves to complete. If some of them fail, we
	// will rebuild the shard serving graph anyway
	wgSlaves.Wait()
	if err := rec.Error(); err != nil {
		wr.Logger().Errorf("Some slaves failed to reparent: %v", err)
		return err
	}

	// Then we rebuild the entire serving graph for the shard,
	// to account for all changes.
	wr.logger.Infof("rebuilding shard graph")
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	_, err = wr.RebuildShardGraph(ctx, keyspace, shard, nil)
	return err
}
Exemplo n.º 27
0
// ValidateSchemaKeyspace will diff the schema from all the tablets in
// the keyspace.
func (wr *Wrangler) ValidateSchemaKeyspace(ctx context.Context, keyspace string, excludeTables []string, includeViews bool) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidateSchemaShard(ctx, keyspace, shards[0], excludeTables, includeViews)
	}

	// find the reference schema using the first shard's master
	si, err := wr.ts.GetShard(ctx, keyspace, shards[0])
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := topo.ProtoToTabletAlias(si.MasterAlias)
	log.Infof("Gathering schema for reference master %v", referenceAlias)
	referenceSchema, err := wr.GetSchema(ctx, referenceAlias, nil, excludeTables, includeViews)
	if err != nil {
		return err
	}

	// then diff with all other tablets everywhere
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	// first diff the slaves in the reference shard 0
	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shards[0])
	if err != nil {
		return err
	}

	for _, alias := range aliases {
		if alias == topo.ProtoToTabletAlias(si.MasterAlias) {
			continue
		}

		wg.Add(1)
		go wr.diffSchema(ctx, referenceSchema, referenceAlias, alias, excludeTables, includeViews, &wg, &er)
	}

	// then diffs all tablets in the other shards
	for _, shard := range shards[1:] {
		si, err := wr.ts.GetShard(ctx, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		if topo.TabletAliasIsZero(si.MasterAlias) {
			er.RecordError(fmt.Errorf("No master in shard %v/%v", keyspace, shard))
			continue
		}

		aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		for _, alias := range aliases {
			wg.Add(1)
			go wr.diffSchema(ctx, referenceSchema, referenceAlias, alias, excludeTables, includeViews, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Schema diffs:\n%v", er.Error().Error())
	}
	return nil
}
Exemplo n.º 28
0
func (wr *Wrangler) removeShardCell(ctx context.Context, keyspace, shard, cell string, force, recursive bool) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// check the cell is in the list already
	if !topo.InCellList(cell, shardInfo.Cells) {
		return fmt.Errorf("cell %v in not in shard info", cell)
	}

	// check the master alias is not in the cell
	if shardInfo.MasterAlias.Cell == cell {
		return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell)
	}

	// get the ShardReplication object in the cell
	sri, err := wr.ts.GetShardReplication(ctx, cell, keyspace, shard)
	switch err {
	case nil:
		if recursive {
			wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard)
			for _, node := range sri.Nodes {
				// We don't care about scrapping or updating the replication graph,
				// because we're about to delete the entire replication graph.
				wr.Logger().Infof("Deleting tablet %v", node.TabletAlias)
				if err := wr.TopoServer().DeleteTablet(ctx, topo.ProtoToTabletAlias(node.TabletAlias)); err != nil && err != topo.ErrNoNode {
					return fmt.Errorf("can't delete tablet %v: %v", node.TabletAlias, err)
				}
			}
		} else if len(sri.Nodes) > 0 {
			return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.Nodes))
		}

		// ShardReplication object is now useless, remove it
		if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err)
		}

		// Rebuild the shard serving graph to reflect the tablets we deleted.
		// This must be done before removing the cell from the global shard record,
		// since this cell will be skipped by all future rebuilds.
		if _, err := wr.RebuildShardGraph(ctx, keyspace, shard, []string{cell}); err != nil {
			return fmt.Errorf("can't rebuild serving graph for shard %v/%v in cell %v: %v", keyspace, shard, cell, err)
		}

		// we keep going
	case topo.ErrNoNode:
		// no ShardReplication object, we keep going
	default:
		// we can't get the object, assume topo server is down there,
		// so we look at force flag
		if !force {
			return err
		}
		wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell)
	}

	// now we can update the shard
	wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard)
	newCells := make([]string, 0, len(shardInfo.Cells)-1)
	for _, c := range shardInfo.Cells {
		if c != cell {
			newCells = append(newCells, c)
		}
	}
	shardInfo.Cells = newCells

	return topo.UpdateShard(ctx, wr.ts, shardInfo)
}
Exemplo n.º 29
0
// InitTablet creates or updates a tablet. If no parent is specified
// in the tablet, and the tablet has a slave type, we will find the
// appropriate parent. If createShardAndKeyspace is true and the
// parent keyspace or shard don't exist, they will be created.  If
// update is true, and a tablet with the same ID exists, update it.
// If Force is true, and a tablet with the same ID already exists, it
// will be scrapped and deleted, and then recreated.
func (wr *Wrangler) InitTablet(ctx context.Context, tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error {
	if err := topo.TabletComplete(tablet); err != nil {
		return err
	}

	if topo.IsInReplicationGraph(tablet.Type) {
		// get the shard, possibly creating it
		var err error
		var si *topo.ShardInfo

		if createShardAndKeyspace {
			// create the parent keyspace and shard if needed
			si, err = topotools.GetOrCreateShard(ctx, wr.ts, tablet.Keyspace, tablet.Shard)
		} else {
			si, err = wr.ts.GetShard(ctx, tablet.Keyspace, tablet.Shard)
			if err == topo.ErrNoNode {
				return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard")
			}
		}

		// get the shard, checks a couple things
		if err != nil {
			return fmt.Errorf("cannot get (or create) shard %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		}
		if key.ProtoToKeyRange(si.KeyRange) != tablet.KeyRange {
			return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange)
		}
		if tablet.Type == topo.TYPE_MASTER && !topo.TabletAliasIsZero(si.MasterAlias) && topo.ProtoToTabletAlias(si.MasterAlias) != tablet.Alias && !force {
			return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard)
		}

		// update the shard record if needed
		if err := wr.updateShardCellsAndMaster(ctx, si, topo.TabletAliasToProto(tablet.Alias), topo.TabletTypeToProto(tablet.Type), force); err != nil {
			return err
		}
	}

	err := topo.CreateTablet(ctx, wr.ts, tablet)
	if err != nil && err == topo.ErrNodeExists {
		// Try to update nicely, but if it fails fall back to force behavior.
		if update || force {
			oldTablet, err := wr.ts.GetTablet(ctx, tablet.Alias)
			if err != nil {
				wr.Logger().Warningf("failed reading tablet %v: %v", tablet.Alias, err)
			} else {
				if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard {
					*(oldTablet.Tablet) = *tablet
					if err := topo.UpdateTablet(ctx, wr.ts, oldTablet); err != nil {
						wr.Logger().Warningf("failed updating tablet %v: %v", tablet.Alias, err)
						// now fall through the Scrap case
					} else {
						if !topo.IsInReplicationGraph(tablet.Type) {
							return nil
						}

						if err := topo.UpdateTabletReplicationData(ctx, wr.ts, tablet); err != nil {
							wr.Logger().Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err)
							// now fall through the Scrap case
						} else {
							return nil
						}
					}
				}
			}
		}
		if force {
			if err = wr.Scrap(ctx, tablet.Alias, force, false); err != nil {
				wr.Logger().Errorf("failed scrapping tablet %v: %v", tablet.Alias, err)
				return err
			}
			if err := wr.ts.DeleteTablet(ctx, tablet.Alias); err != nil {
				// we ignore this
				wr.Logger().Errorf("failed deleting tablet %v: %v", tablet.Alias, err)
			}
			return topo.CreateTablet(ctx, wr.ts, tablet)
		}
	}
	return err
}
Exemplo n.º 30
0
// TestInitMasterShardOneSlaveFails makes sure that if one slave fails to
// proceed, the action completes anyway
func TestInitMasterShardOneSlaveFails(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)

	// Create a master, a couple slaves
	master := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER)
	goodSlave := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA)
	badSlave := NewFakeTablet(t, wr, "cell2", 2, topo.TYPE_REPLICA)

	// Master: set a plausible ReplicationPosition to return,
	// and expect to add entry in _vt.reparent_journal
	master.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{
		GTIDSet: myproto.MariadbGTID{
			Domain:   5,
			Server:   456,
			Sequence: 890,
		},
	}
	master.FakeMysqlDaemon.ReadOnly = true
	master.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, master_alias, replication_position) VALUES",
	}
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// goodSlave: expect to be re-parented
	goodSlave.FakeMysqlDaemon.ReadOnly = true
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         master.Tablet.Hostname,
		MasterPort:         master.Tablet.Portmap["mysql"],
		MasterConnectRetry: 10,
	}
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"cmd1"}
	goodSlave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult
	goodSlave.StartActionLoop(t, wr)
	defer goodSlave.StopActionLoop(t)

	// badSlave: insert an error by failing the ReplicationStatus input
	// on purpose
	badSlave.FakeMysqlDaemon.ReadOnly = true
	badSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         "",
		MasterPort:         0,
		MasterConnectRetry: 10,
	}
	badSlave.StartActionLoop(t, wr)
	defer badSlave.StopActionLoop(t)

	// also change the master alias in the Shard object, to make sure it
	// is set back.
	si, err := ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias.Uid++
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// run InitShardMaster without force, it fails because master is
	// changing.
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, false /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "is not the shard master") {
		t.Errorf("InitShardMaster with mismatched new master returned wrong error: %v", err)
	}

	// run InitShardMaster
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, true /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "wrong status for StartReplicationCommands") {
		t.Errorf("InitShardMaster with one failed slave returned wrong error: %v", err)
	}

	// check what was run: master should still be good
	if master.FakeMysqlDaemon.ReadOnly {
		t.Errorf("master was not turned read-write")
	}
	si, err = ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	if topo.ProtoToTabletAlias(si.MasterAlias) != master.Tablet.Alias {
		t.Errorf("unexpected shard master alias, got %v expected %v", si.MasterAlias, master.Tablet.Alias)
	}
}