Пример #1
0
// rpcWrapper handles all the logic for rpc calls.
func (agent *ActionAgent) rpcWrapper(ctx context.Context, name string, args, reply interface{}, verbose bool, f func() error, lock, runAfterAction bool) (err error) {
	defer func() {
		if x := recover(); x != nil {
			log.Errorf("TabletManager.%v(%v) on %v panic: %v\n%s", name, args, topo.TabletAliasString(agent.TabletAlias), x, tb.Stack(4))
			err = fmt.Errorf("caught panic during %v: %v", name, x)
		}
	}()

	from := ""
	ci, ok := callinfo.FromContext(ctx)
	if ok {
		from = ci.Text()
	}

	if lock {
		beforeLock := time.Now()
		agent.actionMutex.Lock()
		defer agent.actionMutex.Unlock()
		if time.Now().Sub(beforeLock) > rpcTimeout {
			return fmt.Errorf("server timeout for " + name)
		}
	}

	if err = f(); err != nil {
		log.Warningf("TabletManager.%v(%v)(on %v from %v) error: %v", name, args, topo.TabletAliasString(agent.TabletAlias), from, err.Error())
		return fmt.Errorf("TabletManager.%v on %v error: %v", name, topo.TabletAliasString(agent.TabletAlias), err)
	}
	if verbose {
		log.Infof("TabletManager.%v(%v)(on %v from %v): %#v", name, args, topo.TabletAliasString(agent.TabletAlias), from, reply)
	}
	if runAfterAction {
		err = agent.refreshTablet(ctx, "RPC("+name+")")
	}
	return
}
Пример #2
0
// Syslog writes a Reparent event to syslog.
func (r *Reparent) Syslog() (syslog.Priority, string) {
	return syslog.LOG_INFO, fmt.Sprintf("%s/%s [reparent %v -> %v] %s (%s)",
		r.ShardInfo.Keyspace(), r.ShardInfo.ShardName(),
		topo.TabletAliasString(r.OldMaster.Alias),
		topo.TabletAliasString(r.NewMaster.Alias),
		r.Status, r.ExternalID)
}
Пример #3
0
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]myproto.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]myproto.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Gathering master position for %v", topo.TabletAliasString(si.MasterAlias))
			ti, err := wr.ts.GetTablet(ctx, si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.tmc.MasterPosition(ctx, ti)
			if err != nil {
				rec.RecordError(err)
				return
			}

			wr.Logger().Infof("Got master position for %v", topo.TabletAliasString(si.MasterAlias))
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
Пример #4
0
// ReparentTablet tells a tablet to reparent this tablet to the current
// master, based on the current replication position. If there is no
// match, it will fail.
func (wr *Wrangler) ReparentTablet(ctx context.Context, tabletAlias *pb.TabletAlias) error {
	// Get specified tablet.
	// Get current shard master tablet.
	// Sanity check they are in the same keyspace/shard.
	// Issue a SetMaster to the tablet.
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}

	shardInfo, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard)
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(shardInfo.MasterAlias) {
		return fmt.Errorf("no master tablet for shard %v/%v", ti.Keyspace, ti.Shard)
	}

	masterTi, err := wr.ts.GetTablet(ctx, shardInfo.MasterAlias)
	if err != nil {
		return err
	}

	// Basic sanity checking.
	if masterTi.Type != pb.TabletType_MASTER {
		return fmt.Errorf("TopologyServer has inconsistent state for shard master %v", topo.TabletAliasString(shardInfo.MasterAlias))
	}
	if masterTi.Keyspace != ti.Keyspace || masterTi.Shard != ti.Shard {
		return fmt.Errorf("master %v and potential slave not in same keyspace/shard", topo.TabletAliasString(shardInfo.MasterAlias))
	}

	// and do the remote command
	return wr.TabletManagerClient().SetMaster(ctx, ti, shardInfo.MasterAlias, 0, false)
}
Пример #5
0
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'worker' pointing back to us
// - get the aliases of all the targets
func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error {
	vscw.setState(WorkerStateFindTargets)

	// find an appropriate endpoint in the source shard
	var err error
	vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0")
	if err != nil {
		return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err)
	}
	vscw.wr.Logger().Infof("Using tablet %v as the source", topo.TabletAliasString(vscw.sourceAlias))

	// get the tablet info for it
	vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(ctx, vscw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot read tablet %v: %v", topo.TabletAliasString(vscw.sourceAlias), err)
	}

	// stop replication on it
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet)
	cancel()
	if err != nil {
		return fmt.Errorf("cannot stop replication on tablet %v", topo.TabletAliasString(vscw.sourceAlias))
	}

	wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(vscw.sourceAlias), err)
	}
	action.TabletType = pb.TabletType_SPARE

	return vscw.ResolveDestinationMasters(ctx)
}
Пример #6
0
// FIXME(msolomon) This validate presumes the master is up and running.
// Even when that isn't true, there are validation processes that might be valuable.
func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	tabletMap, _ := topo.GetTabletMap(ctx, wr.ts, aliases)

	var masterAlias *pb.TabletAlias
	for _, alias := range aliases {
		tabletInfo, ok := tabletMap[*alias]
		if !ok {
			results <- fmt.Errorf("tablet %v not found in map", topo.TabletAliasString(alias))
			continue
		}
		if tabletInfo.Type == pb.TabletType_MASTER {
			if masterAlias != nil {
				results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, topo.TabletAliasString(masterAlias), topo.TabletAliasString(alias))
			} else {
				masterAlias = alias
			}
		}
	}

	if masterAlias == nil {
		results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard)
	} else if !topo.TabletAliasEqual(shardInfo.MasterAlias, masterAlias) {
		results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, topo.TabletAliasString(masterAlias), topo.TabletAliasString(shardInfo.MasterAlias))
	}

	for _, alias := range aliases {
		wg.Add(1)
		go func(alias *pb.TabletAlias) {
			defer wg.Done()
			if err := topo.Validate(ctx, wr.ts, alias); err != nil {
				results <- fmt.Errorf("Validate(%v) failed: %v", topo.TabletAliasString(alias), err)
			} else {
				wr.Logger().Infof("tablet %v is valid", topo.TabletAliasString(alias))
			}
		}(alias)
	}

	if pingTablets {
		wr.validateReplication(ctx, shardInfo, tabletMap, results)
		wr.pingTablets(ctx, tabletMap, wg, results)
	}

	return
}
Пример #7
0
// DeleteShard will do all the necessary changes in the topology server
// to entirely remove a shard.
func (wr *Wrangler) DeleteShard(ctx context.Context, keyspace, shard string, recursive bool) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		return err
	}
	if recursive {
		wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard)
		for tabletAlias := range tabletMap {
			// We don't care about scrapping or updating the replication graph,
			// because we're about to delete the entire replication graph.
			wr.Logger().Infof("Deleting tablet %v", topo.TabletAliasString(&tabletAlias))
			if err := wr.TopoServer().DeleteTablet(ctx, &tabletAlias); err != nil && err != topo.ErrNoNode {
				// Unlike the errors below in non-recursive steps, we don't want to
				// continue if a DeleteTablet fails. If we continue and delete the
				// replication graph, the tablet record will be orphaned, since we'll
				// no longer know it belongs to this shard.
				//
				// If the problem is temporary, or resolved externally, re-running
				// DeleteShard will skip over tablets that were already deleted.
				return fmt.Errorf("can't delete tablet %v: %v", topo.TabletAliasString(&tabletAlias), err)
			}
		}
	} else if len(tabletMap) > 0 {
		return fmt.Errorf("shard %v/%v still has %v tablets; use -recursive or remove them manually", keyspace, shard, len(tabletMap))
	}

	// remove the replication graph and serving graph in each cell
	for _, cell := range shardInfo.Cells {
		if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			wr.Logger().Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err)
		}

		for _, t := range topo.AllTabletTypes {
			if !topo.IsInServingGraph(t) {
				continue
			}

			if err := wr.ts.DeleteEndPoints(ctx, cell, keyspace, shard, t, -1); err != nil && err != topo.ErrNoNode {
				wr.Logger().Warningf("Cannot delete EndPoints in cell %v for %v/%v/%v: %v", cell, keyspace, shard, t, err)
			}
		}

		if err := wr.ts.DeleteSrvShard(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			wr.Logger().Warningf("Cannot delete SrvShard in cell %v for %v/%v: %v", cell, keyspace, shard, err)
		}
	}

	return wr.ts.DeleteShard(ctx, keyspace, shard)
}
Пример #8
0
// diffPermissions is a helper method to asynchronously diff a permissions
func (wr *Wrangler) diffPermissions(ctx context.Context, masterPermissions *myproto.Permissions, masterAlias *pb.TabletAlias, alias *pb.TabletAlias, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
	defer wg.Done()
	log.Infof("Gathering permissions for %v", alias)
	slavePermissions, err := wr.GetPermissions(ctx, alias)
	if err != nil {
		er.RecordError(err)
		return
	}

	log.Infof("Diffing permissions for %v", alias)
	myproto.DiffPermissions(topo.TabletAliasString(masterAlias), masterPermissions, topo.TabletAliasString(alias), slavePermissions, er)
}
Пример #9
0
// helper method to asynchronously diff a schema
func (wr *Wrangler) diffSchema(ctx context.Context, masterSchema *myproto.SchemaDefinition, masterTabletAlias, alias *pb.TabletAlias, excludeTables []string, includeViews bool, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
	defer wg.Done()
	log.Infof("Gathering schema for %v", alias)
	slaveSchema, err := wr.GetSchema(ctx, alias, nil, excludeTables, includeViews)
	if err != nil {
		er.RecordError(err)
		return
	}

	log.Infof("Diffing schema for %v", alias)
	myproto.DiffSchema(topo.TabletAliasString(masterTabletAlias), masterSchema, topo.TabletAliasString(alias), slaveSchema, er)
}
Пример #10
0
// RecordTabletTagAction records a new TabletTagAction
// into the specified Cleaner
func RecordTabletTagAction(cleaner *Cleaner, tabletAlias *pb.TabletAlias, name, value string) {
	cleaner.Record(TabletTagActionName, topo.TabletAliasString(tabletAlias), &TabletTagAction{
		TabletAlias: tabletAlias,
		Name:        name,
		Value:       value,
	})
}
Пример #11
0
func (scw *SplitCloneWorker) formatSources() string {
	result := ""
	for _, alias := range scw.sourceAliases {
		result += " " + topo.TabletAliasString(alias)
	}
	return result
}
Пример #12
0
// Validate all tablets in all discoverable cells, even if they are
// not in the replication graph.
func (wr *Wrangler) validateAllTablets(ctx context.Context, wg *sync.WaitGroup, results chan<- error) {
	cellSet := make(map[string]bool, 16)

	keyspaces, err := wr.ts.GetKeyspaces(ctx)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.GetKeyspaces failed: %v", err)
		return
	}
	for _, keyspace := range keyspaces {
		shards, err := wr.ts.GetShardNames(ctx, keyspace)
		if err != nil {
			results <- fmt.Errorf("TopologyServer.GetShardNames(%v) failed: %v", keyspace, err)
			return
		}

		for _, shard := range shards {
			aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
			if err != nil {
				results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err)
				return
			}
			for _, alias := range aliases {
				cellSet[alias.Cell] = true
			}
		}
	}

	for cell := range cellSet {
		aliases, err := wr.ts.GetTabletsByCell(ctx, cell)
		if err != nil {
			results <- fmt.Errorf("TopologyServer.GetTabletsByCell(%v) failed: %v", cell, err)
			continue
		}

		for _, alias := range aliases {
			wg.Add(1)
			go func(alias *pb.TabletAlias) {
				defer wg.Done()
				if err := topo.Validate(ctx, wr.ts, alias); err != nil {
					results <- fmt.Errorf("Validate(%v) failed: %v", topo.TabletAliasString(alias), err)
				} else {
					wr.Logger().Infof("tablet %v is valid", topo.TabletAliasString(alias))
				}
			}(alias)
		}
	}
}
Пример #13
0
// ValidateVersionKeyspace validates all versions are the same in all
// tablets in a keyspace
func (wr *Wrangler) ValidateVersionKeyspace(ctx context.Context, keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidateVersionShard(ctx, keyspace, shards[0])
	}

	// find the reference version using the first shard's master
	si, err := wr.ts.GetShard(ctx, keyspace, shards[0])
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := si.MasterAlias
	log.Infof("Gathering version for reference master %v", topo.TabletAliasString(referenceAlias))
	referenceVersion, err := wr.GetVersion(ctx, referenceAlias)
	if err != nil {
		return err
	}

	// then diff with all tablets but master 0
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		for _, alias := range aliases {
			if topo.TabletAliasEqual(alias, si.MasterAlias) {
				continue
			}

			wg.Add(1)
			go wr.diffVersion(ctx, referenceVersion, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Version diffs:\n%v", er.Error().Error())
	}
	return nil
}
Пример #14
0
func (wr *Wrangler) validateReplication(ctx context.Context, shardInfo *topo.ShardInfo, tabletMap map[pb.TabletAlias]*topo.TabletInfo, results chan<- error) {
	masterTablet, ok := tabletMap[*shardInfo.MasterAlias]
	if !ok {
		results <- fmt.Errorf("master %v not in tablet map", topo.TabletAliasString(shardInfo.MasterAlias))
		return
	}

	slaveList, err := wr.tmc.GetSlaves(ctx, masterTablet)
	if err != nil {
		results <- fmt.Errorf("GetSlaves(%v) failed: %v", masterTablet, err)
		return
	}
	if len(slaveList) == 0 {
		results <- fmt.Errorf("no slaves of tablet %v found", topo.TabletAliasString(shardInfo.MasterAlias))
		return
	}

	tabletIPMap := make(map[string]*pb.Tablet)
	slaveIPMap := make(map[string]bool)
	for _, tablet := range tabletMap {
		tabletIPMap[normalizeIP(tablet.Ip)] = tablet.Tablet
	}

	// See if every slave is in the replication graph.
	for _, slaveAddr := range slaveList {
		if tabletIPMap[normalizeIP(slaveAddr)] == nil {
			results <- fmt.Errorf("slave %v not in replication graph for shard %v/%v (mysql instance without vttablet?)", slaveAddr, shardInfo.Keyspace(), shardInfo.ShardName())
		}
		slaveIPMap[normalizeIP(slaveAddr)] = true
	}

	// See if every entry in the replication graph is connected to the master.
	for _, tablet := range tabletMap {
		if !tablet.IsSlaveType() {
			continue
		}

		if !slaveIPMap[normalizeIP(tablet.Ip)] {
			results <- fmt.Errorf("slave %v not replicating: %v slave list: %q", topo.TabletAliasString(tablet.Alias), tablet.Ip, slaveList)
		}
	}
}
Пример #15
0
// FindChangeSlaveTypeActionByTarget finds the first action for the target
func FindChangeSlaveTypeActionByTarget(cleaner *Cleaner, tabletAlias *pb.TabletAlias) (*ChangeSlaveTypeAction, error) {
	action, err := cleaner.GetActionByName(ChangeSlaveTypeActionName, topo.TabletAliasString(tabletAlias))
	if err != nil {
		return nil, err
	}
	result, ok := action.(*ChangeSlaveTypeAction)
	if !ok {
		return nil, fmt.Errorf("Action with wrong type: %v", action)
	}
	return result, nil
}
Пример #16
0
func (wr *Wrangler) pingTablets(ctx context.Context, tabletMap map[pb.TabletAlias]*topo.TabletInfo, wg *sync.WaitGroup, results chan<- error) {
	for tabletAlias, tabletInfo := range tabletMap {
		wg.Add(1)
		go func(tabletAlias pb.TabletAlias, tabletInfo *topo.TabletInfo) {
			defer wg.Done()

			if err := wr.tmc.Ping(ctx, tabletInfo); err != nil {
				results <- fmt.Errorf("Ping(%v) failed: %v tablet hostname: %v", topo.TabletAliasString(&tabletAlias), err, tabletInfo.Hostname)
			}
		}(tabletAlias, tabletInfo)
	}
}
Пример #17
0
func (wr *Wrangler) waitForFilteredReplication(ctx context.Context, sourcePositions map[*topo.ShardInfo]myproto.ReplicationPosition, destinationShards []*topo.ShardInfo, waitTime time.Duration) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range destinationShards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			for _, sourceShard := range si.SourceShards {
				// we're waiting on this guy
				blpPosition := blproto.BlpPosition{
					Uid: sourceShard.Uid,
				}

				// find the position it should be at
				for s, pos := range sourcePositions {
					if s.Keyspace() == sourceShard.Keyspace && s.ShardName() == sourceShard.Shard {
						blpPosition.Position = pos
					}
				}

				// and wait for it
				wr.Logger().Infof("Waiting for %v to catch up", topo.TabletAliasString(si.MasterAlias))
				tablet, err := wr.ts.GetTablet(ctx, si.MasterAlias)
				if err != nil {
					rec.RecordError(err)
					return
				}

				if err := wr.tmc.WaitBlpPosition(ctx, tablet, blpPosition, waitTime); err != nil {
					rec.RecordError(err)
				} else {
					wr.Logger().Infof("%v caught up", topo.TabletAliasString(si.MasterAlias))
				}
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Пример #18
0
// helper method to asynchronously get and diff a version
func (wr *Wrangler) diffVersion(ctx context.Context, masterVersion string, masterAlias *pb.TabletAlias, alias *pb.TabletAlias, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
	defer wg.Done()
	log.Infof("Gathering version for %v", topo.TabletAliasString(alias))
	slaveVersion, err := wr.GetVersion(ctx, alias)
	if err != nil {
		er.RecordError(err)
		return
	}

	if masterVersion != slaveVersion {
		er.RecordError(fmt.Errorf("Master %v version %v is different than slave %v version %v", topo.TabletAliasString(masterAlias), masterVersion, topo.TabletAliasString(alias), slaveVersion))
	}
}
Пример #19
0
// GetVersion returns the version string from a tablet
func (wr *Wrangler) GetVersion(ctx context.Context, tabletAlias *pb.TabletAlias) (string, error) {
	tablet, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return "", err
	}

	version, err := getVersionFromTablet(tablet.Addr())
	if err != nil {
		return "", err
	}
	log.Infof("Tablet %v is running version '%v'", topo.TabletAliasString(tabletAlias), version)
	return version, err
}
Пример #20
0
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'worker' pointing back to us
// - get the aliases of all the targets
func (scw *SplitCloneWorker) findTargets(ctx context.Context) error {
	scw.setState(WorkerStateFindTargets)
	var err error

	// find an appropriate endpoint in the source shards
	scw.sourceAliases = make([]*pb.TabletAlias, len(scw.sourceShards))
	for i, si := range scw.sourceShards {
		scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName())
		if err != nil {
			return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err)
		}
		scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topo.TabletAliasString(scw.sourceAliases[i]), si.Keyspace(), si.ShardName())
	}

	// get the tablet info for them, and stop their replication
	scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases))
	for i, alias := range scw.sourceAliases {
		scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(ctx, alias)
		if err != nil {
			return fmt.Errorf("cannot read tablet %v: %v", topo.TabletAliasString(alias), err)
		}

		shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
		err := scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i])
		cancel()
		if err != nil {
			return fmt.Errorf("cannot stop replication on tablet %v", topo.TabletAliasString(alias))
		}

		wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i])
		action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias)
		if err != nil {
			return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(alias), err)
		}
		action.TabletType = pb.TabletType_SPARE
	}

	return scw.ResolveDestinationMasters(ctx)
}
Пример #21
0
// FindWorkerTablet will:
// - find a rdonly instance in the keyspace / shard
// - mark it as worker
// - tag it with our worker process
func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrangler.Cleaner, cell, keyspace, shard string) (*pb.TabletAlias, error) {
	tabletAlias, err := FindHealthyRdonlyEndPoint(ctx, wr, cell, keyspace, shard)
	if err != nil {
		return nil, err
	}

	// We add the tag before calling ChangeSlaveType, so the destination
	// vttablet reloads the worker URL when it reloads the tablet.
	ourURL := servenv.ListeningURL.String()
	wr.Logger().Infof("Adding tag[worker]=%v to tablet %v", ourURL, topo.TabletAliasString(tabletAlias))
	if err := wr.TopoServer().UpdateTabletFields(ctx, tabletAlias, func(tablet *pb.Tablet) error {
		if tablet.Tags == nil {
			tablet.Tags = make(map[string]string)
		}
		tablet.Tags["worker"] = ourURL
		return nil
	}); err != nil {
		return nil, err
	}
	// we remove the tag *before* calling ChangeSlaveType back, so
	// we need to record this tag change after the change slave
	// type change in the cleaner.
	defer wrangler.RecordTabletTagAction(cleaner, tabletAlias, "worker", "")

	wr.Logger().Infof("Changing tablet %v to '%v'", topo.TabletAliasString(tabletAlias), pb.TabletType_WORKER)
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	err = wr.ChangeType(shortCtx, tabletAlias, pb.TabletType_WORKER, false /*force*/)
	cancel()
	if err != nil {
		return nil, err
	}

	// Record a clean-up action to take the tablet back to rdonly.
	// We will alter this one later on and let the tablet go back to
	// 'spare' if we have stopped replication for too long on it.
	wrangler.RecordChangeSlaveTypeAction(cleaner, tabletAlias, pb.TabletType_RDONLY)
	return tabletAlias, nil
}
Пример #22
0
// synchronizeReplication phase:
// 1 - ask the subset slave to stop replication
// 2 - sleep for 5 seconds
// 3 - ask the superset slave to stop replication
// Note this is not 100% correct, but good enough for now
func (worker *SQLDiffWorker) synchronizeReplication(ctx context.Context) error {
	worker.SetState(WorkerStateSyncReplication)

	// stop replication on subset slave
	worker.wr.Logger().Infof("Stopping replication on subset slave %v", topo.TabletAliasString(worker.subset.alias))
	subsetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.subset.alias)
	if err != nil {
		return err
	}
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	err = worker.wr.TabletManagerClient().StopSlave(shortCtx, subsetTablet)
	cancel()
	if err != nil {
		return fmt.Errorf("Cannot stop slave %v: %v", topo.TabletAliasString(worker.subset.alias), err)
	}
	if err := checkDone(ctx); err != nil {
		return err
	}

	// change the cleaner actions from ChangeSlaveType(rdonly)
	// to StartSlave() + ChangeSlaveType(spare)
	wrangler.RecordStartSlaveAction(worker.cleaner, subsetTablet)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(worker.subset.alias), err)
	}
	action.TabletType = pb.TabletType_SPARE

	// sleep for a few seconds
	time.Sleep(5 * time.Second)
	if err := checkDone(ctx); err != nil {
		return err
	}

	// stop replication on superset slave
	worker.wr.Logger().Infof("Stopping replication on superset slave %v", topo.TabletAliasString(worker.superset.alias))
	supersetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.superset.alias)
	if err != nil {
		return err
	}
	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	err = worker.wr.TabletManagerClient().StopSlave(shortCtx, supersetTablet)
	cancel()
	if err != nil {
		return fmt.Errorf("Cannot stop slave %v: %v", topo.TabletAliasString(worker.superset.alias), err)
	}

	// change the cleaner actions from ChangeSlaveType(rdonly)
	// to StartSlave() + ChangeSlaveType(spare)
	wrangler.RecordStartSlaveAction(worker.cleaner, supersetTablet)
	action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(worker.superset.alias), err)
	}
	action.TabletType = pb.TabletType_SPARE

	return nil
}
Пример #23
0
// Does a topo lookup for a single shard, and returns the tablet record of the master tablet.
func resolveDestinationShardMaster(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (*topo.TabletInfo, error) {
	var ti *topo.TabletInfo
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	si, err := topo.GetShard(shortCtx, wr.TopoServer(), keyspace, shard)
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to resolve destination shard %v/%v", keyspace, shard)
	}

	if topo.TabletAliasIsZero(si.MasterAlias) {
		return ti, fmt.Errorf("no master in destination shard %v/%v", keyspace, shard)
	}

	wr.Logger().Infof("Found target master alias %v in shard %v/%v", topo.TabletAliasString(si.MasterAlias), keyspace, shard)

	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	ti, err = topo.GetTablet(shortCtx, wr.TopoServer(), si.MasterAlias)
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to get master tablet from alias %v in shard %v/%v",
			topo.TabletAliasString(si.MasterAlias), keyspace, shard)
	}
	return ti, nil
}
Пример #24
0
// refreshMasters will just RPC-ping all the masters with RefreshState
func (wr *Wrangler) refreshMasters(ctx context.Context, shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("RefreshState master %v", topo.TabletAliasString(si.MasterAlias))
			ti, err := wr.ts.GetTablet(ctx, si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			if err := wr.tmc.RefreshState(ctx, ti); err != nil {
				rec.RecordError(err)
			} else {
				wr.Logger().Infof("%v responded", topo.TabletAliasString(si.MasterAlias))
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Пример #25
0
// Backup takes a db backup and sends it to the BackupStorage
// Should be called under RPCWrapLockAction.
func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error {
	// update our type to BACKUP
	tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias)
	if err != nil {
		return err
	}
	if tablet.Type == pb.TabletType_MASTER {
		return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode")
	}
	originalType := tablet.Type
	if err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, pb.TabletType_BACKUP, make(map[string]string)); err != nil {
		return err
	}

	// let's update our internal state (stop query service and other things)
	if err := agent.refreshTablet(ctx, "backup"); err != nil {
		return fmt.Errorf("failed to update state before backup: %v", err)
	}

	// create the loggers: tee to console and source
	l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

	// now we can run the backup
	bucket := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
	name := fmt.Sprintf("%v.%v", topo.TabletAliasString(tablet.Alias), time.Now().UTC().Format("2006-01-02.150405"))
	returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, bucket, name, concurrency, agent.hookExtraEnv())

	// and change our type back to the appropriate value:
	// - if healthcheck is enabled, go to spare
	// - if not, go back to original type
	if agent.IsRunningHealthCheck() {
		originalType = pb.TabletType_SPARE
	}
	err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType, nil)
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged the snapshot error anyway)
		if returnErr != nil {
			l.Errorf("mysql backup command returned error: %v", returnErr)
		}
		returnErr = err
	}

	return returnErr
}
Пример #26
0
// TableScanByKeyRange returns a QueryResultReader that gets all the
// rows from a table that match the supplied KeyRange, ordered by
// Primary Key. The returned columns are ordered with the Primary Key
// columns in front.
func TableScanByKeyRange(ctx context.Context, log logutil.Logger, ts topo.Server, tabletAlias *pb.TabletAlias, tableDefinition *myproto.TableDefinition, keyRange *pb.KeyRange, keyspaceIDType key.KeyspaceIdType) (*QueryResultReader, error) {
	where := ""
	if keyRange != nil {
		switch keyspaceIDType {
		case key.KIT_UINT64:
			if len(keyRange.Start) > 0 {
				if len(keyRange.End) > 0 {
					// have start & end
					where = fmt.Sprintf("WHERE keyspace_id >= %v AND keyspace_id < %v ", uint64FromKeyspaceID(keyRange.Start), uint64FromKeyspaceID(keyRange.End))
				} else {
					// have start only
					where = fmt.Sprintf("WHERE keyspace_id >= %v ", uint64FromKeyspaceID(keyRange.Start))
				}
			} else {
				if len(keyRange.End) > 0 {
					// have end only
					where = fmt.Sprintf("WHERE keyspace_id < %v ", uint64FromKeyspaceID(keyRange.End))
				}
			}
		case key.KIT_BYTES:
			if len(keyRange.Start) > 0 {
				if len(keyRange.End) > 0 {
					// have start & end
					where = fmt.Sprintf("WHERE HEX(keyspace_id) >= '%v' AND HEX(keyspace_id) < '%v' ", hex.EncodeToString(keyRange.Start), hex.EncodeToString(keyRange.End))
				} else {
					// have start only
					where = fmt.Sprintf("WHERE HEX(keyspace_id) >= '%v' ", hex.EncodeToString(keyRange.Start))
				}
			} else {
				if len(keyRange.End) > 0 {
					// have end only
					where = fmt.Sprintf("WHERE HEX(keyspace_id) < '%v' ", hex.EncodeToString(keyRange.End))
				}
			}
		default:
			return nil, fmt.Errorf("Unsupported KeyspaceIdType: %v", keyspaceIDType)
		}
	}

	sql := fmt.Sprintf("SELECT %v FROM %v %vORDER BY %v", strings.Join(orderedColumns(tableDefinition), ", "), tableDefinition.Name, where, strings.Join(tableDefinition.PrimaryKeyColumns, ", "))
	log.Infof("SQL query for %v/%v: %v", topo.TabletAliasString(tabletAlias), tableDefinition.Name, sql)
	return NewQueryResultReaderForTablet(ctx, ts, tabletAlias, sql)
}
Пример #27
0
// StatusAsText implements the Worker interface
func (vscw *VerticalSplitCloneWorker) StatusAsText() string {
	vscw.Mu.Lock()
	defer vscw.Mu.Unlock()
	result := "Working on: " + vscw.destinationKeyspace + "/" + vscw.destinationShard + "\n"
	result += "State: " + vscw.State.String() + "\n"
	switch vscw.State {
	case WorkerStateCopy:
		result += "Running:\n"
		result += "Copying from: " + topo.TabletAliasString(vscw.sourceAlias) + "\n"
		statuses, eta := formatTableStatuses(vscw.tableStatus, vscw.startTime)
		result += "ETA: " + eta.String() + "\n"
		result += strings.Join(statuses, "\n")
	case WorkerStateDone:
		result += "Success:\n"
		statuses, _ := formatTableStatuses(vscw.tableStatus, vscw.startTime)
		result += strings.Join(statuses, "\n")
	}
	return result
}
Пример #28
0
func (wr *Wrangler) applySchemaShardSimple(ctx context.Context, statusArray []*tabletStatus, preflight *myproto.SchemaChangeResult, masterTabletAlias *pb.TabletAlias, change string, force bool) (*myproto.SchemaChangeResult, error) {
	// check all tablets have the same schema as the master's
	// BeforeSchema. If not, we shouldn't proceed
	log.Infof("Checking schema on all tablets")
	for _, status := range statusArray {
		diffs := myproto.DiffSchemaToArray("master", preflight.BeforeSchema, topo.TabletAliasString(status.ti.Alias), status.beforeSchema)
		if len(diffs) > 0 {
			if force {
				log.Warningf("Tablet %v has inconsistent schema, ignoring: %v", status.ti.Alias, strings.Join(diffs, "\n"))
			} else {
				return nil, fmt.Errorf("Tablet %v has inconsistent schema: %v", status.ti.Alias, strings.Join(diffs, "\n"))
			}
		}
	}

	// we're good, just send to the master
	log.Infof("Applying schema change to master in simple mode")
	sc := &myproto.SchemaChange{Sql: change, Force: force, AllowReplication: true, BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema}
	return wr.ApplySchema(ctx, masterTabletAlias, sc)
}
Пример #29
0
// StatusAsHTML implements the Worker interface
func (vscw *VerticalSplitCloneWorker) StatusAsHTML() template.HTML {
	vscw.Mu.Lock()
	defer vscw.Mu.Unlock()
	result := "<b>Working on:</b> " + vscw.destinationKeyspace + "/" + vscw.destinationShard + "</br>\n"
	result += "<b>State:</b> " + vscw.State.String() + "</br>\n"
	switch vscw.State {
	case WorkerStateCopy:
		result += "<b>Running</b>:</br>\n"
		result += "<b>Copying from</b>: " + topo.TabletAliasString(vscw.sourceAlias) + "</br>\n"
		statuses, eta := formatTableStatuses(vscw.tableStatus, vscw.startTime)
		result += "<b>ETA</b>: " + eta.String() + "</br>\n"
		result += strings.Join(statuses, "</br>\n")
	case WorkerStateDone:
		result += "<b>Success</b>:</br>\n"
		statuses, _ := formatTableStatuses(vscw.tableStatus, vscw.startTime)
		result += strings.Join(statuses, "</br>\n")
	}

	return template.HTML(result)
}
Пример #30
0
// ValidateVersionShard validates all versions are the same in all
// tablets in a shard
func (wr *Wrangler) ValidateVersionShard(ctx context.Context, keyspace, shard string) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// get version from the master, or error
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shard)
	}
	log.Infof("Gathering version for master %v", topo.TabletAliasString(si.MasterAlias))
	masterVersion, err := wr.GetVersion(ctx, si.MasterAlias)
	if err != nil {
		return err
	}

	// read all the aliases in the shard, that is all tablets that are
	// replicating from the master
	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// then diff with all slaves
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, alias := range aliases {
		if topo.TabletAliasEqual(alias, si.MasterAlias) {
			continue
		}

		wg.Add(1)
		go wr.diffVersion(ctx, masterVersion, si.MasterAlias, alias, &wg, &er)
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Version diffs:\n%v", er.Error().Error())
	}
	return nil
}