Пример #1
0
// GetAllTablets returns a sorted list of tablets.
func GetAllTablets(ctx context.Context, ts topo.Server, cell string) ([]*topo.TabletInfo, error) {
	aliases, err := ts.GetTabletsByCell(ctx, cell)
	if err != nil {
		return nil, err
	}
	sort.Sort(topo.TabletAliasList(aliases))

	tabletMap, err := topo.GetTabletMap(ctx, ts, aliases)
	if err != nil {
		// we got another error than topo.ErrNoNode
		return nil, err
	}
	tablets := make([]*topo.TabletInfo, 0, len(aliases))
	for _, tabletAlias := range aliases {
		tabletInfo, ok := tabletMap[*tabletAlias]
		if !ok {
			// tablet disappeared on us (GetTabletMap ignores
			// topo.ErrNoNode), just echo a warning
			log.Warningf("failed to load tablet %v", tabletAlias)
		} else {
			tablets = append(tablets, tabletInfo)
		}
	}

	return tablets, nil
}
Пример #2
0
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'checker' pointing back to us
// - get the aliases of all the targets
func (vscw *VerticalSplitCloneWorker) findTargets() error {
	vscw.setState(stateVSCFindTargets)

	// find an appropriate endpoint in the source shard
	var err error
	vscw.sourceAlias, err = findChecker(vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0")
	if err != nil {
		return fmt.Errorf("cannot find checker for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err)
	}
	vscw.wr.Logger().Infof("Using tablet %v as the source", vscw.sourceAlias)

	// get the tablet info for it
	vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(vscw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot read tablet %v: %v", vscw.sourceTablet, err)
	}

	// stop replication on it
	if err := vscw.wr.TabletManagerClient().StopSlave(vscw.sourceTablet, 30*time.Second); err != nil {
		return fmt.Errorf("cannot stop replication on tablet %v", vscw.sourceAlias)
	}

	wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet, 30*time.Second)
	action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias)
	if err != nil {
		return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vscw.sourceAlias, err)
	}
	action.TabletType = topo.TYPE_SPARE

	// find all the targets in the destination keyspace / shard
	vscw.destinationAliases, err = topo.FindAllTabletAliasesInShard(vscw.wr.TopoServer(), vscw.destinationKeyspace, vscw.destinationShard)
	if err != nil {
		return fmt.Errorf("cannot find all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err)
	}
	vscw.wr.Logger().Infof("Found %v target aliases", len(vscw.destinationAliases))

	// get the TabletInfo for all targets
	vscw.destinationTablets, err = topo.GetTabletMap(vscw.wr.TopoServer(), vscw.destinationAliases)
	if err != nil {
		return fmt.Errorf("cannot read all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err)
	}

	// find and validate the master
	for tabletAlias, ti := range vscw.destinationTablets {
		if ti.Type == topo.TYPE_MASTER {
			if vscw.destinationMasterAlias.IsZero() {
				vscw.destinationMasterAlias = tabletAlias
			} else {
				return fmt.Errorf("multiple masters in destination shard: %v and %v at least", vscw.destinationMasterAlias, tabletAlias)
			}
		}
	}
	if vscw.destinationMasterAlias.IsZero() {
		return fmt.Errorf("no master in destination shard")
	}

	return nil
}
Пример #3
0
// findTargets phase:
// - find one rdonly in the source shard
// - mark it as 'checker' pointing back to us
// - get the aliases of all the targets
func (scw *SplitCloneWorker) findTargets() error {
	scw.setState(stateSCFindTargets)
	var err error

	// find an appropriate endpoint in the source shards
	scw.sourceAliases = make([]topo.TabletAlias, len(scw.sourceShards))
	for i, si := range scw.sourceShards {
		scw.sourceAliases[i], err = findChecker(scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName())
		if err != nil {
			return fmt.Errorf("cannot find checker for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err)
		}
		scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", scw.sourceAliases[i], si.Keyspace(), si.ShardName())
	}

	// get the tablet info for them
	scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases))
	for i, alias := range scw.sourceAliases {
		scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(alias)
		if err != nil {
			return fmt.Errorf("cannot read tablet %v: %v", alias, err)
		}
	}

	// find all the targets in the destination shards
	scw.destinationAliases = make([][]topo.TabletAlias, len(scw.destinationShards))
	scw.destinationTablets = make([]map[topo.TabletAlias]*topo.TabletInfo, len(scw.destinationShards))
	scw.destinationMasterAliases = make([]topo.TabletAlias, len(scw.destinationShards))
	for shardIndex, si := range scw.destinationShards {
		scw.destinationAliases[shardIndex], err = topo.FindAllTabletAliasesInShard(scw.wr.TopoServer(), si.Keyspace(), si.ShardName())
		if err != nil {
			return fmt.Errorf("cannot find all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err)
		}
		scw.wr.Logger().Infof("Found %v target aliases in shard %v/%v", len(scw.destinationAliases[shardIndex]), si.Keyspace(), si.ShardName())

		// get the TabletInfo for all targets
		scw.destinationTablets[shardIndex], err = topo.GetTabletMap(scw.wr.TopoServer(), scw.destinationAliases[shardIndex])
		if err != nil {
			return fmt.Errorf("cannot read all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err)
		}

		// find and validate the master
		for tabletAlias, ti := range scw.destinationTablets[shardIndex] {
			if ti.Type == topo.TYPE_MASTER {
				if scw.destinationMasterAliases[shardIndex].IsZero() {
					scw.destinationMasterAliases[shardIndex] = tabletAlias
				} else {
					return fmt.Errorf("multiple masters in destination shard: %v and %v at least", scw.destinationMasterAliases[shardIndex], tabletAlias)
				}
			}
		}
		if scw.destinationMasterAliases[shardIndex].IsZero() {
			return fmt.Errorf("no master in destination shard")
		}
	}

	return nil
}
Пример #4
0
// FIXME(msolomon) This validate presumes the master is up and running.
// Even when that isn't true, there are validation processes that might be valuable.
func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	tabletMap, _ := topo.GetTabletMap(ctx, wr.ts, aliases)

	var masterAlias *pb.TabletAlias
	for _, alias := range aliases {
		tabletInfo, ok := tabletMap[*alias]
		if !ok {
			results <- fmt.Errorf("tablet %v not found in map", alias)
			continue
		}
		if tabletInfo.Type == pb.TabletType_MASTER {
			if masterAlias != nil {
				results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, masterAlias, alias)
			} else {
				masterAlias = alias
			}
		}
	}

	if masterAlias == nil {
		results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard)
	} else if !topo.TabletAliasEqual(shardInfo.MasterAlias, masterAlias) {
		results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, masterAlias, shardInfo.MasterAlias)
	}

	for _, alias := range aliases {
		wg.Add(1)
		go func(alias *pb.TabletAlias) {
			defer wg.Done()
			if err := topo.Validate(ctx, wr.ts, alias); err != nil {
				results <- fmt.Errorf("Validate(%v) failed: %v", alias, err)
			} else {
				wr.Logger().Infof("tablet %v is valid", alias)
			}
		}(alias)
	}

	if pingTablets {
		wr.validateReplication(ctx, shardInfo, tabletMap, results)
		wr.pingTablets(ctx, tabletMap, wg, results)
	}

	return
}
Пример #5
0
// FIXME(msolomon) This validate presumes the master is up and running.
// Even when that isn't true, there are validation processes that might be valuable.
func (wr *Wrangler) validateShard(keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- vresult) {
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		results <- vresult{keyspace + "/" + shard, err}
		return
	}

	aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard)
	if err != nil {
		results <- vresult{keyspace + "/" + shard, err}
	}

	tabletMap, _ := topo.GetTabletMap(wr.ts, aliases)

	var masterAlias topo.TabletAlias
	for _, alias := range aliases {
		tabletInfo, ok := tabletMap[alias]
		if !ok {
			results <- vresult{alias.String(), fmt.Errorf("tablet not found in map")}
			continue
		}
		if tabletInfo.Parent.Uid == topo.NO_TABLET {
			if masterAlias.Cell != "" {
				results <- vresult{alias.String(), fmt.Errorf("tablet already has a master %v", masterAlias)}
			} else {
				masterAlias = alias
			}
		}
	}

	if masterAlias.Cell == "" {
		results <- vresult{keyspace + "/" + shard, fmt.Errorf("no master for shard")}
	} else if shardInfo.MasterAlias != masterAlias {
		results <- vresult{keyspace + "/" + shard, fmt.Errorf("master mismatch for shard: found %v, expected %v", masterAlias, shardInfo.MasterAlias)}
	}

	for _, alias := range aliases {
		wg.Add(1)
		go func(alias topo.TabletAlias) {
			results <- vresult{alias.String(), topo.Validate(wr.ts, alias)}
			wg.Done()
		}(alias)
	}

	if pingTablets {
		wr.validateReplication(shardInfo, tabletMap, results)
		wr.pingTablets(tabletMap, wg, results)
	}

	return
}
Пример #6
0
// Does a topo lookup for a single shard, and returns:
//	1. Slice of all tablet aliases for the shard.
//	2. Map of tablet alias : tablet record for all tablets.
func resolveReloadTabletsForShard(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (reloadAliases []topo.TabletAlias, reloadTablets map[topo.TabletAlias]*topo.TabletInfo, err error) {
	// Keep a long timeout, because we really don't want the copying to succeed, and then the worker to fail at the end.
	shortCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	reloadAliases, err = topo.FindAllTabletAliasesInShard(shortCtx, wr.TopoServer(), keyspace, shard)
	cancel()
	if err != nil {
		return nil, nil, fmt.Errorf("cannot find all reload target tablets in %v/%v: %v", keyspace, shard, err)
	}
	wr.Logger().Infof("Found %v reload target aliases in shard %v/%v", len(reloadAliases), keyspace, shard)

	shortCtx, cancel = context.WithTimeout(ctx, 5*time.Minute)
	reloadTablets, err = topo.GetTabletMap(shortCtx, wr.TopoServer(), reloadAliases)
	cancel()
	if err != nil {
		return nil, nil, fmt.Errorf("cannot read all reload target tablets in %v/%v: %v",
			keyspace, shard, err)
	}
	return reloadAliases, reloadTablets, nil
}
Пример #7
0
// SetSourceShards is a utility function to override the SourceShards fields
// on a Shard.
func (wr *Wrangler) SetSourceShards(ctx context.Context, keyspace, shard string, sources []topo.TabletAlias, tables []string) error {
	// read the shard
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// If the shard already has sources, maybe it's already been restored,
	// so let's be safe and abort right here.
	if len(shardInfo.SourceShards) > 0 {
		return fmt.Errorf("Shard %v/%v already has SourceShards, not overwriting them", keyspace, shard)
	}

	// read the source tablets
	sourceTablets, err := topo.GetTabletMap(ctx, wr.TopoServer(), sources)
	if err != nil {
		return err
	}

	// Insert their KeyRange in the SourceShards array.
	// We use a linear 0-based id, that matches what mysqlctld/split.go
	// inserts into _vt.blp_checkpoint.
	shardInfo.SourceShards = make([]*pb.Shard_SourceShard, len(sourceTablets))
	i := 0
	for _, ti := range sourceTablets {
		shardInfo.SourceShards[i] = &pb.Shard_SourceShard{
			Uid:      uint32(i),
			Keyspace: ti.Keyspace,
			Shard:    ti.Shard,
			KeyRange: key.KeyRangeToProto(ti.KeyRange),
			Tables:   tables,
		}
		i++
	}

	// and write the shard
	if err = topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil {
		return err
	}

	return nil
}
Пример #8
0
func (wr *Wrangler) shardMultiRestore(keyspace, shard string, sources []topo.TabletAlias, tables []string, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) error {
	// read the shard
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}

	// read the source tablets
	sourceTablets, err := topo.GetTabletMap(wr.TopoServer(), sources)
	if err != nil {
		return err
	}

	// Insert their KeyRange in the SourceShards array.
	// We use a linear 0-based id, that matches what mysqlctld/split.go
	// inserts into _vt.blp_checkpoint.
	shardInfo.SourceShards = make([]topo.SourceShard, len(sourceTablets))
	i := 0
	for _, ti := range sourceTablets {
		shardInfo.SourceShards[i] = topo.SourceShard{
			Uid:      uint32(i),
			Keyspace: ti.Keyspace,
			Shard:    ti.Shard,
			KeyRange: ti.KeyRange,
			Tables:   tables,
		}
		i++
	}

	// and write the shard
	if err = wr.ts.UpdateShard(shardInfo); err != nil {
		return err
	}

	return nil
}
Пример #9
0
// Update shard file with new master, replicas, etc.
//
// Re-read from TopologyServer to make sure we are using the side
// effects of all actions.
//
// This function locks individual SvrShard paths, so it doesn't need a lock
// on the shard.
func RebuildShard(log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, timeout time.Duration, interrupted chan struct{}) error {
	log.Infof("RebuildShard %v/%v", keyspace, shard)

	// read the existing shard info. It has to exist.
	shardInfo, err := ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}

	// rebuild all cells in parallel
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range shardInfo.Cells {
		// skip this cell if we shouldn't rebuild it
		if !topo.InCellList(cell, cells) {
			continue
		}

		// start with the master if it's in the current cell
		tabletsAsMap := make(map[topo.TabletAlias]bool)
		if shardInfo.MasterAlias.Cell == cell {
			tabletsAsMap[shardInfo.MasterAlias] = true
		}

		wg.Add(1)
		go func(cell string) {
			defer wg.Done()

			// read the ShardReplication object to find tablets
			sri, err := ts.GetShardReplication(cell, keyspace, shard)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err))
				return
			}

			// add all relevant tablets to the map
			for _, rl := range sri.ReplicationLinks {
				tabletsAsMap[rl.TabletAlias] = true
				if rl.Parent.Cell == cell {
					tabletsAsMap[rl.Parent] = true
				}
			}

			// convert the map to a list
			aliases := make([]topo.TabletAlias, 0, len(tabletsAsMap))
			for a := range tabletsAsMap {
				aliases = append(aliases, a)
			}

			// read all the Tablet records
			tablets, err := topo.GetTabletMap(ts, aliases)
			switch err {
			case nil:
				// keep going, we're good
			case topo.ErrPartialResult:
				log.Warningf("Got ErrPartialResult from topo.GetTabletMap in cell %v, some tablets may not be added properly to serving graph", cell)
			default:
				rec.RecordError(fmt.Errorf("GetTabletMap in cell %v failed: %v", cell, err))
				return
			}

			// Lock the SrvShard so we write a consistent data set.
			actionNode := actionnode.RebuildSrvShard()
			lockPath, err := actionNode.LockSrvShard(ts, cell, keyspace, shard, timeout, interrupted)
			if err != nil {
				rec.RecordError(err)
				return
			}

			// write the data we need to
			rebuildErr := rebuildCellSrvShard(log, ts, shardInfo, cell, tablets)

			// and unlock
			if err := actionNode.UnlockSrvShard(ts, cell, keyspace, shard, lockPath, rebuildErr); err != nil {
				rec.RecordError(err)
			}
		}(cell)
	}
	wg.Wait()

	return rec.Error()
}