Example #1
func (wr *Wrangler) makeMastersReadOnly(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		if si.MasterAlias.IsZero() {
			rec.RecordError(fmt.Errorf("Shard %v/%v has no master?", si.Keyspace(), si.ShardName()))
			continue
		}

		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()

			log.Infof("Making master %v read-only", si.MasterAlias)
			actionPath, err := wr.ai.SetReadOnly(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}
			rec.RecordError(wr.WaitForCompletion(actionPath))
			log.Infof("Master %v is now read-only", si.MasterAlias)
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
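
Every example in this collection uses the same fan-out idiom: one goroutine per item, joined by a sync.WaitGroup, with failures funneled into a concurrency.AllErrorRecorder. The recorder itself never appears in the excerpts. The following is a minimal sketch inferred from how it is used above, not the actual Vitess implementation: RecordError has to ignore nil and be safe to call from many goroutines, and Error has to return nil when nothing was recorded. Aggregating with errors.Join is an assumption of this sketch.

package concurrency

import (
	"errors"
	"sync"
)

// AllErrorRecorder (sketch) collects every non-nil error it is handed.
type AllErrorRecorder struct {
	mu   sync.Mutex
	errs []error
}

// RecordError stores err unless it is nil. It is safe for concurrent
// use, which is why the goroutines above call it without locking.
func (rec *AllErrorRecorder) RecordError(err error) {
	if err == nil {
		return
	}
	rec.mu.Lock()
	defer rec.mu.Unlock()
	rec.errs = append(rec.errs, err)
}

// HasErrors reports whether anything was recorded.
func (rec *AllErrorRecorder) HasErrors() bool {
	rec.mu.Lock()
	defer rec.mu.Unlock()
	return len(rec.errs) > 0
}

// Error returns nil when no error was recorded, else all of them joined.
func (rec *AllErrorRecorder) Error() error {
	rec.mu.Lock()
	defer rec.mu.Unlock()
	return errors.Join(rec.errs...)
}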
Example #2
// WaitForDrain blocks until the selected tablets (cells/keyspace/shard/tablet_type)
// have reported a QPS rate of 0.0.
// NOTE: This is only an observation at one point in time; it is no
// guarantee that the tablet was actually drained. At later times, a QPS
// rate > 0.0 could still be observed.
func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, shard string, servedType topodatapb.TabletType,
	retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error {
	if len(cells) == 0 {
		// Retrieve list of cells for the shard from the topology.
		shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
		if err != nil {
			return fmt.Errorf("failed to retrieve list of all cells. GetShard() failed: %v", err)
		}
		cells = shardInfo.Cells
	}

	// Check all cells in parallel.
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType,
				retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout))
		}(cell)
	}
	wg.Wait()

	return rec.Error()
}
Example #3
func (wr *Wrangler) getMastersPosition(shards []*topo.ShardInfo) (map[*topo.ShardInfo]myproto.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]myproto.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Gathering master position for %v", si.MasterAlias)
			ti, err := wr.ts.GetTablet(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.ai.MasterPosition(ti, wr.ActionTimeout())
			if err != nil {
				rec.RecordError(err)
				return
			}

			log.Infof("Got master position for %v", si.MasterAlias)
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
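
Example #3 guards the shared result map with a sync.Mutex because concurrent map writes are a data race in Go. An alternative with the same behavior is to have each worker send its result over a channel, so only the parent goroutine writes the map. The helper below illustrates that variant with placeholder string types; it is a hypothetical sketch, not code from this repository.

package sketch

import (
	"errors"
	"sync"
)

// gatherParallel fetches a value per key concurrently. Results travel
// over a channel, so the map is only ever written by this goroutine and
// needs no mutex.
func gatherParallel(keys []string, fetch func(string) (string, error)) (map[string]string, error) {
	type kv struct {
		key, val string
		err      error
	}
	ch := make(chan kv, len(keys)) // buffered so senders never block
	var wg sync.WaitGroup
	for _, key := range keys {
		wg.Add(1)
		go func(key string) {
			defer wg.Done()
			val, err := fetch(key)
			ch <- kv{key: key, val: val, err: err}
		}(key)
	}
	wg.Wait()
	close(ch)

	result := make(map[string]string, len(keys))
	var errs []error
	for r := range ch {
		if r.err != nil {
			errs = append(errs, r.err)
			continue
		}
		result[r.key] = r.val
	}
	return result, errors.Join(errs...)
}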
Example #4
// CleanUp will run the recorded actions.
// If an action on a target fails, it will not run the next action on
// the same target.
// We return the aggregate errors for all cleanups.
// CleanUp uses its own context, with a timeout of 5 minutes, so that
// cleanup actions run even if the original context has timed out.
// TODO(alainjobart) Actions should run concurrently on a per target
// basis. They are then serialized on each target.
func (cleaner *Cleaner) CleanUp(wr *Wrangler) error {
	// we use a background context so we're not dependent on the original context timeout
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	actionMap := make(map[string]*cleanUpHelper)
	rec := concurrency.AllErrorRecorder{}
	cleaner.mu.Lock()
	for i := len(cleaner.actions) - 1; i >= 0; i-- {
		actionReference := cleaner.actions[i]
		helper, ok := actionMap[actionReference.target]
		if !ok {
			helper = &cleanUpHelper{
				err: nil,
			}
			actionMap[actionReference.target] = helper
		}
		if helper.err != nil {
			wr.Logger().Warningf("previous action failed on target %v, no running %v", actionReference.target, actionReference.name)
			continue
		}
		err := actionReference.action.CleanUp(ctx, wr)
		if err != nil {
			helper.err = err
			rec.RecordError(err)
			wr.Logger().Errorf("action %v failed on %v: %v", actionReference.name, actionReference.target, err)
		} else {
			wr.Logger().Infof("action %v successful on %v", actionReference.name, actionReference.target)
		}
	}
	cleaner.mu.Unlock()
	cancel()
	return rec.Error()
}
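
CleanUp iterates the recorded actions from newest to oldest and keeps a cleanUpHelper per target so that, once an action fails on a target, the remaining (earlier-recorded) actions on that target are skipped. The declarations behind this are not part of the excerpt; the sketch below reconstructs the shapes implied by the usage, and every name not visible above is an assumption.

package wrangler

import (
	"context"
	"sync"
)

// Wrangler is elided; see the surrounding examples.
type Wrangler struct{}

// CleanerAction is the assumed interface behind actionReference.action:
// CleanUp above calls exactly this method on it.
type CleanerAction interface {
	CleanUp(ctx context.Context, wr *Wrangler) error
}

// cleanerActionReference matches the fields CleanUp reads: name, target
// and action. The real type name is not shown in the excerpt.
type cleanerActionReference struct {
	name   string
	target string
	action CleanerAction
}

// Cleaner records actions to run later; mu guards the actions slice.
type Cleaner struct {
	mu      sync.Mutex
	actions []cleanerActionReference
}

// cleanUpHelper remembers the first failure per target, which is what
// makes CleanUp skip later actions on an already-failed target.
type cleanUpHelper struct {
	err error
}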
Example #5
// FIXME(alainjobart) no action to become read-write now, just use Ping,
// that forces the shard reload and will stop replication.
func (wr *Wrangler) makeMastersReadWrite(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Pinging master %v", si.MasterAlias)

			actionPath, err := wr.ai.Ping(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			if err := wr.WaitForCompletion(actionPath); err != nil {
				rec.RecordError(err)
			} else {
				log.Infof("%v responded", si.MasterAlias)
			}

		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Example #6
func (wr *Wrangler) waitForFilteredReplication(sourcePositions map[*topo.ShardInfo]myproto.ReplicationPosition, destinationShards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range destinationShards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			// Done must run exactly once per Add, no matter how many
			// source shards this destination shard has.
			defer wg.Done()
			for _, sourceShard := range si.SourceShards {
				// this is the source shard we are waiting on
				blpPosition := blproto.BlpPosition{
					Uid: sourceShard.Uid,
				}

				// find the position it should be at
				for s, pos := range sourcePositions {
					if s.Keyspace() == sourceShard.Keyspace && s.ShardName() == sourceShard.Shard {
						blpPosition.Position = pos
					}
				}

				log.Infof("Waiting for %v to catch up", si.MasterAlias)
				if err := wr.ai.WaitBlpPosition(si.MasterAlias, blpPosition, wr.ActionTimeout()); err != nil {
					rec.RecordError(err)
				} else {
					log.Infof("%v caught up", si.MasterAlias)
				}
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Example #7
// RebuildShard updates the SrvShard objects and underlying serving graph.
//
// Re-read from TopologyServer to make sure we are using the side
// effects of all actions.
//
// This function will start each cell over from the beginning on ErrBadVersion,
// so it doesn't need a lock on the shard.
func RebuildShard(ctx context.Context, log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, lockTimeout time.Duration) (*topo.ShardInfo, error) {
	log.Infof("RebuildShard %v/%v", keyspace, shard)

	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("topotools.RebuildShard")
	defer span.Finish()
	ctx = trace.NewContext(ctx, span)

	// read the existing shard info. It has to exist.
	shardInfo, err := ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return nil, err
	}

	// rebuild all cells in parallel
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range shardInfo.Cells {
		// skip this cell if we shouldn't rebuild it
		if !topo.InCellList(cell, cells) {
			continue
		}

		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			rec.RecordError(rebuildCellSrvShard(ctx, log, ts, shardInfo, cell))
		}(cell)
	}
	wg.Wait()

	return shardInfo, rec.Error()
}
Example #8
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			defer wg.Done()
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
		}(ti)
	}
	wg.Wait()

	// then do the master
	recorder.RecordError(wr.slaveWasRestarted(masterTablet, &swrd))
	return recorder.Error()
}
Example #9
func (wr *Wrangler) getMastersPosition(shards []*topo.ShardInfo) (map[*topo.ShardInfo]*mysqlctl.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]*mysqlctl.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Gathering master position for %v", si.MasterAlias)
			pos, err := wr.getMasterPosition(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
			} else {
				log.Infof("Got master position for %v", si.MasterAlias)
				mu.Lock()
				result[si] = pos
				mu.Unlock()
			}
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
Example #10
// CopyKeyspaces will create the keyspaces in the destination topo
func CopyKeyspaces(fromTS, toTS topo.Server) {
	keyspaces, err := fromTS.GetKeyspaces()
	if err != nil {
		log.Fatalf("GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()

			k, err := fromTS.GetKeyspace(keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetKeyspace(%v): %v", keyspace, err))
				return
			}

			if err := toTS.CreateKeyspace(keyspace, k.Keyspace); err != nil {
				if err == topo.ErrNodeExists {
					log.Warningf("keyspace %v already exists", keyspace)
				} else {
					rec.RecordError(fmt.Errorf("CreateKeyspace(%v): %v", keyspace, err))
				}
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyKeyspaces failed: %v", rec.Error())
	}
}
Example #11
// DeleteKeyspaceShards implements topo.Server.
func (s *Server) DeleteKeyspaceShards(ctx context.Context, keyspace string) error {
	shards, err := s.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	global := s.getGlobal()
	for _, shard := range shards {
		wg.Add(1)
		go func(shard string) {
			defer wg.Done()
			_, err := global.Delete(shardDirPath(keyspace, shard), true /* recursive */)
			rec.RecordError(convertError(err))
		}(shard)
	}
	wg.Wait()

	if err = rec.Error(); err != nil {
		return err
	}

	event.Dispatch(&events.KeyspaceChange{
		KeyspaceInfo: *topo.NewKeyspaceInfo(keyspace, nil, -1),
		Status:       "deleted all shards",
	})
	return nil
}
Example #12
// CleanUp will run the recorded actions.
// If an action on a target fails, it will not run the next action on
// the same target.
// We return the aggregate errors for all cleanups.
// TODO(alainjobart) Actions should run concurrently on a per target
// basis. They are then serialized on each target.
func (cleaner *Cleaner) CleanUp(wr *Wrangler) error {
	actionMap := make(map[string]*cleanUpHelper)
	rec := concurrency.AllErrorRecorder{}
	cleaner.mu.Lock()
	for i := len(cleaner.actions) - 1; i >= 0; i-- {
		actionReference := cleaner.actions[i]
		helper, ok := actionMap[actionReference.target]
		if !ok {
			helper = &cleanUpHelper{
				err: nil,
			}
			actionMap[actionReference.target] = helper
		}
		if helper.err != nil {
			log.Warningf("previous action failed on target %v, no running %v", actionReference.target, actionReference.name)
			continue
		}
		err := actionReference.action.CleanUp(wr)
		if err != nil {
			helper.err = err
			rec.RecordError(err)
			log.Errorf("action %v failed on %v: %v", actionReference.name, actionReference.target, err)
		} else {
			log.Infof("action %v successfull on %v", actionReference.name, actionReference.target)
		}
	}
	cleaner.mu.Unlock()
	return rec.Error()
}
Example #13
func (wr *Wrangler) makeMastersReadOnly(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		if si.MasterAlias.IsZero() {
			rec.RecordError(fmt.Errorf("Shard %v/%v has no master?", si.Keyspace(), si.ShardName()))
			continue
		}

		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()

			wr.Logger().Infof("Making master %v read-only", si.MasterAlias)
			ti, err := wr.ts.GetTablet(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			if err = wr.tmc.SetReadOnly(ti, wr.ActionTimeout()); err != nil {
				rec.RecordError(err)
				return
			}
			wr.Logger().Infof("Master %v is now read-only", si.MasterAlias)
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
Example #14
// CopyTablets will create the tablets in the destination topo
func CopyTablets(fromTS, toTS topo.Server) {
	cells, err := fromTS.GetKnownCells()
	if err != nil {
		log.Fatalf("fromTS.GetKnownCells failed: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			tabletAliases, err := fromTS.GetTabletsByCell(cell)
			if err != nil {
				rec.RecordError(err)
			} else {
				for _, tabletAlias := range tabletAliases {
					wg.Add(1)
					go func(tabletAlias topo.TabletAlias) {
						defer wg.Done()

						// read the source tablet
						ti, err := fromTS.GetTablet(tabletAlias)
						if err != nil {
							rec.RecordError(err)
							return
						}

						// try to create the destination
						err = toTS.CreateTablet(ti.Tablet)
						if err == topo.ErrNodeExists {
							// update the destination tablet
							log.Warningf("tablet %v already exists, updating it", tabletAlias)
							err = toTS.UpdateTabletFields(ti.Alias(), func(t *topo.Tablet) error {
								*t = *ti.Tablet
								return nil
							})
						}
						if err != nil {
							rec.RecordError(err)
							return
						}

						// create the replication paths
						// for masters only here
						if ti.Type == topo.TYPE_MASTER {
							if err = toTS.CreateReplicationPath(ti.Keyspace, ti.Shard, ti.Alias().String()); err != nil && err != topo.ErrNodeExists {
								rec.RecordError(err)
							}
						}
					}(tabletAlias)
				}
			}
		}(cell)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyTablets failed: %v", rec.Error())
	}
}
Example #15
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]string, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]string)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Gathering master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			ti, err := wr.ts.GetTablet(ctx, si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.tmc.MasterPosition(ctx, ti)
			if err != nil {
				rec.RecordError(err)
				return
			}

			wr.Logger().Infof("Got master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
Example #16
// UpdateTabletEndpoints fixes up any entries in the serving graph that relate
// to a given tablet.
func UpdateTabletEndpoints(ctx context.Context, ts topo.Server, tablet *topo.Tablet) (err error) {
	if *lockSrvShard {
		// This lock is only necessary until all tablets are upgraded to lock-free.
		actionNode := actionnode.RebuildSrvShard()
		lockPath, err := actionNode.LockSrvShard(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
		if err != nil {
			return fmt.Errorf("can't lock shard for UpdateTabletEndpoints(%v): %v", tablet, err)
		}

		defer func() {
			actionNode.UnlockSrvShard(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard, lockPath, err)
		}()
	}

	srvTypes, err := ts.GetSrvTabletTypesPerShard(ctx, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
	if err != nil {
		if err != topo.ErrNoNode {
			return err
		}
		// It's fine if there are no existing types.
		srvTypes = nil
	}

	wg := sync.WaitGroup{}
	errs := concurrency.AllErrorRecorder{}

	// Update the list that the tablet is supposed to be in (if any).
	if tablet.IsInServingGraph() {
		endpoint, err := tablet.EndPoint()
		if err != nil {
			return err
		}

		wg.Add(1)
		go func() {
			defer wg.Done()
			errs.RecordError(
				updateEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
					tablet.Type, endpoint))
		}()
	}

	// Remove it from any other lists it isn't supposed to be in.
	for _, srvType := range srvTypes {
		if srvType != tablet.Type {
			wg.Add(1)
			go func(tabletType topo.TabletType) {
				defer wg.Done()
				errs.RecordError(
					removeEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
						tabletType, tablet.Alias.Uid))
			}(srvType)
		}
	}

	wg.Wait()
	return errs.Error()
}
Example #17
// shardsWithTablesSources returns all the shards that have SourceShards set
// to one value, with an array of Tables.
func shardsWithTablesSources(ctx context.Context, wr *wrangler.Wrangler) ([]map[string]string, error) {
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	keyspaces, err := wr.TopoServer().GetKeyspaces(shortCtx)
	cancel()
	if err != nil {
		return nil, err
	}

	wg := sync.WaitGroup{}
	mu := sync.Mutex{} // protects result
	result := make([]map[string]string, 0, len(keyspaces))
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			shards, err := wr.TopoServer().GetShardNames(shortCtx, keyspace)
			cancel()
			if err != nil {
				rec.RecordError(err)
				return
			}
			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
					si, err := wr.TopoServer().GetShard(shortCtx, keyspace, shard)
					cancel()
					if err != nil {
						rec.RecordError(err)
						return
					}

					if len(si.SourceShards) == 1 && len(si.SourceShards[0].Tables) > 0 {
						mu.Lock()
						result = append(result, map[string]string{
							"Keyspace": keyspace,
							"Shard":    shard,
						})
						mu.Unlock()
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()

	if rec.HasErrors() {
		return nil, rec.Error()
	}
	if len(result) == 0 {
		return nil, fmt.Errorf("There are no shards with SourceShards")
	}
	return result, nil
}
Example #18
// Make this external, since these transitions need to be forced from time to time.
func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, runHooks bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) {
		return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias)
	}

	if runHooks {
		// Only run the preflight_serving_type hook when
		// transitioning from non-serving to serving.
		if !topo.IsInServingGraph(tablet.Type) && topo.IsInServingGraph(newType) {
			if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil {
				return err
			}
		}
	}

	tablet.Type = newType
	if newType == topo.TYPE_IDLE {
		if tablet.Parent.IsZero() {
			si, err := ts.GetShard(tablet.Keyspace, tablet.Shard)
			if err != nil {
				return err
			}
			rec := concurrency.AllErrorRecorder{}
			wg := sync.WaitGroup{}
			for _, cell := range si.Cells {
				wg.Add(1)
				go func(cell string) {
					defer wg.Done()
					sri, err := ts.GetShardReplication(cell, tablet.Keyspace, tablet.Shard)
					if err != nil {
						log.Warningf("Cannot check cell %v for extra replication paths, assuming it's good", cell)
						return
					}
					for _, rl := range sri.ReplicationLinks {
						if rl.Parent == tabletAlias {
							rec.RecordError(fmt.Errorf("Still have a ReplicationLink in cell %v", cell))
						}
					}
				}(cell)
			}
			wg.Wait()
			if rec.HasErrors() {
				return rec.Error()
			}
		}
		tablet.Parent = topo.TabletAlias{}
		tablet.Keyspace = ""
		tablet.Shard = ""
		tablet.KeyRange = key.KeyRange{}
	}
	return topo.UpdateTablet(ts, tablet)
}
Example #19
// ValidateVersionKeyspace validates all versions are the same in all
// tablets in a keyspace
func (wr *Wrangler) ValidateVersionKeyspace(ctx context.Context, keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidateVersionShard(ctx, keyspace, shards[0])
	}

	// find the reference version using the first shard's master
	si, err := wr.ts.GetShard(ctx, keyspace, shards[0])
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := si.MasterAlias
	log.Infof("Gathering version for reference master %v", topo.TabletAliasString(referenceAlias))
	referenceVersion, err := wr.GetVersion(ctx, referenceAlias)
	if err != nil {
		return err
	}

	// then diff all other tablets against the reference master
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		for _, alias := range aliases {
			if topo.TabletAliasEqual(alias, si.MasterAlias) {
				continue
			}

			wg.Add(1)
			go wr.diffVersion(ctx, referenceVersion, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Version diffs:\n%v", er.Error().Error())
	}
	return nil
}
Example #20
func (wr *Wrangler) ValidatePermissionsKeyspace(keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidatePermissionsShard(keyspace, shards[0])
	}

	// find the reference permissions using the first shard's master
	si, err := wr.ts.GetShard(keyspace, shards[0])
	if err != nil {
		return err
	}
	if si.MasterAlias.Uid == topo.NO_TABLET {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := si.MasterAlias
	log.Infof("Gathering permissions for reference master %v", referenceAlias)
	referencePermissions, err := wr.GetPermissions(si.MasterAlias)
	if err != nil {
		return err
	}

	// then diff all other tablets against the reference master
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}

		for _, alias := range aliases {
			if alias == si.MasterAlias {
				continue
			}

			wg.Add(1)
			go wr.diffPermissions(referencePermissions, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Permissions diffs:\n%v", er.Error().Error())
	}
	return nil
}
Example #21
// CopyShardReplications will create the ShardReplication objects in
// the destination topo
func CopyShardReplications(ctx context.Context, fromTS, toTS topo.Impl) {
	keyspaces, err := fromTS.GetKeyspaces(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}
	tts := topo.Server{
		Impl: toTS,
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(ctx, keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()

					// read the source shard to get the cells
					s, _, err := fromTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					for _, cell := range s.Cells {
						sri, err := fromTS.GetShardReplication(ctx, cell, keyspace, shard)
						if err != nil {
							rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v): %v", cell, keyspace, shard, err))
							continue
						}

						if err := tts.UpdateShardReplicationFields(ctx, cell, keyspace, shard, func(oldSR *topodatapb.ShardReplication) error {
							*oldSR = *sri.ShardReplication
							return nil
						}); err != nil {
							rec.RecordError(fmt.Errorf("UpdateShardReplicationFields(%v, %v, %v): %v", cell, keyspace, shard, err))
						}
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
Example #22
// CopyShards will create the shards in the destination topo
func CopyShards(fromTS, toTS topo.Server, deleteKeyspaceShards bool) {
	keyspaces, err := fromTS.GetKeyspaces()
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			if deleteKeyspaceShards {
				if err := toTS.DeleteKeyspaceShards(keyspace); err != nil {
					rec.RecordError(fmt.Errorf("DeleteKeyspaceShards(%v): %v", keyspace, err))
					return
				}
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					if err := topo.CreateShard(toTS, keyspace, shard); err != nil {
						if err == topo.ErrNodeExists {
							log.Warningf("shard %v/%v already exists", keyspace, shard)
						} else {
							rec.RecordError(fmt.Errorf("CreateShard(%v, %v): %v", keyspace, shard, err))
							return
						}
					}

					si, err := fromTS.GetShard(keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					if err := toTS.UpdateShard(si); err != nil {
						rec.RecordError(fmt.Errorf("UpdateShard(%v, %v): %v", keyspace, shard, err))
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
Example #23
func (wr *Wrangler) ShardMultiRestore(keyspace, shard string, sources []topo.TabletAlias, tables []string, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) error {

	// check parameters
	if len(tables) > 0 && len(sources) > 1 {
		return fmt.Errorf("ShardMultiRestore can only handle one source when tables are specified")
	}

	// lock the shard to perform the changes we need done
	actionNode := actionnode.ShardMultiRestore(&actionnode.MultiRestoreArgs{
		SrcTabletAliases:       sources,
		Concurrency:            concurrency,
		FetchConcurrency:       fetchConcurrency,
		InsertTableConcurrency: insertTableConcurrency,
		FetchRetryCount:        fetchRetryCount,
		Strategy:               strategy})
	lockPath, err := wr.lockShard(keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	mrErr := wr.SetSourceShards(keyspace, shard, sources, tables)
	err = wr.unlockShard(keyspace, shard, actionNode, lockPath, mrErr)
	if err != nil {
		if mrErr != nil {
			log.Errorf("unlockShard got error back: %v", err)
			return mrErr
		}
		return err
	}
	if mrErr != nil {
		return mrErr
	}

	// find all tablets in the shard
	destTablets, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// now launch MultiRestore on all tablets we need to do
	rec := cc.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, tabletAlias := range destTablets {
		wg.Add(1)
		go func(tabletAlias topo.TabletAlias) {
			defer wg.Done()
			log.Infof("Starting multirestore on tablet %v", tabletAlias)
			err := wr.MultiRestore(tabletAlias, sources, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount, strategy)
			log.Infof("Multirestore on tablet %v is done (err=%v)", tabletAlias, err)
			rec.RecordError(err)
		}(tabletAlias)
	}
	wg.Wait()

	return rec.Error()
}
Example #24
// CopyShards will create the shards in the destination topo
func CopyShards(ctx context.Context, fromTS, toTS topo.Impl) {
	keyspaces, err := fromTS.GetKeyspaces(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(ctx, keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					if err := toTS.CreateShard(ctx, keyspace, shard, &topodatapb.Shard{}); err != nil {
						if err == topo.ErrNodeExists {
							log.Warningf("shard %v/%v already exists", keyspace, shard)
						} else {
							rec.RecordError(fmt.Errorf("CreateShard(%v, %v): %v", keyspace, shard, err))
							return
						}
					}

					s, _, err := fromTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					_, toV, err := toTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("toTS.GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					if _, err := toTS.UpdateShard(ctx, keyspace, shard, s, toV); err != nil {
						rec.RecordError(fmt.Errorf("UpdateShard(%v, %v): %v", keyspace, shard, err))
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
Example #25
// CopyTablets will create the tablets in the destination topo
func CopyTablets(ctx context.Context, fromTS, toTS topo.Impl) {
	cells, err := fromTS.GetKnownCells(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKnownCells: %v", err)
	}
	tts := topo.Server{
		Impl: toTS,
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			tabletAliases, err := fromTS.GetTabletsByCell(ctx, cell)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetTabletsByCell(%v): %v", cell, err))
			} else {
				for _, tabletAlias := range tabletAliases {
					wg.Add(1)
					go func(tabletAlias *topodatapb.TabletAlias) {
						defer wg.Done()

						// read the source tablet
						tablet, _, err := fromTS.GetTablet(ctx, tabletAlias)
						if err != nil {
							rec.RecordError(fmt.Errorf("GetTablet(%v): %v", tabletAlias, err))
							return
						}

						// try to create the destination
						err = toTS.CreateTablet(ctx, tablet)
						if err == topo.ErrNodeExists {
							// update the destination tablet
							log.Warningf("tablet %v already exists, updating it", tabletAlias)
							_, err = tts.UpdateTabletFields(ctx, tablet.Alias, func(t *topodatapb.Tablet) error {
								*t = *tablet
								return nil
							})
						}
						if err != nil {
							rec.RecordError(fmt.Errorf("CreateTablet(%v): %v", tabletAlias, err))
							return
						}
					}(tabletAlias)
				}
			}
		}(cell)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyTablets failed: %v", rec.Error())
	}
}
Example #26
// getNewConn creates a new tablet connection with a separate per conn timeout.
// It limits the overall timeout to connTimeoutTotal by checking elapsed time after each blocking call.
func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) {
	startTime := time.Now()

	endPoints, err := sdc.balancer.Get()
	if err != nil {
		// Error when getting endpoint
		return nil, nil, false, err
	}
	if len(endPoints) == 0 {
		// No valid endpoint
		return nil, nil, false, vterrors.FromError(
			vtrpcpb.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no valid endpoint"),
		)
	}
	if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
		return nil, nil, true, vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			fmt.Errorf("timeout when getting endpoints"),
		)
	}

	// Iterate through all endpoints to create a connection
	perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints))
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		perConnStartTime := time.Now()
		conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout)
		if err == nil {
			sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime)
			sdc.mu.Lock()
			defer sdc.mu.Unlock()
			sdc.conn = conn
			return conn, endPoint, false, nil
		}
		// Mark the endpoint down since it failed to connect
		sdc.balancer.MarkDown(endPoint.Uid, err.Error())
		vtErr := vterrors.NewVitessError(
			// TODO(aaijazi): what about OperationalErrors here?
			vterrors.RecoverVtErrorCode(err), err,
			"%v %+v", err, endPoint,
		)
		allErrors.RecordError(vtErr)
		if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
			err = vterrors.FromError(
				vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
				fmt.Errorf("timeout when connecting to %+v", endPoint),
			)
			allErrors.RecordError(err)
			return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors)
		}
	}
	return nil, nil, false, allErrors.Error()
}
Example #27
func (stc *ScatterConn) rollbackIfNeeded(ctx context.Context, allErrors *concurrency.AllErrorRecorder, session *SafeSession) {
	if session.InTransaction() {
		errstr := allErrors.Error().Error()
		// We cannot recover from these errors
		// TODO(aaijazi): get rid of this string parsing. Might
		// want a function that searches through a deeply
		// nested error chain for a particular error.
		if strings.Contains(errstr, "tx_pool_full") || strings.Contains(errstr, "not_in_tx") {
			stc.Rollback(ctx, session)
		}
	}
}
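
The TODO in rollbackIfNeeded asks for a proper search through a nested error chain instead of substring matching. If the errors were created from sentinels and wrapped with fmt.Errorf's %w verb, the standard errors package could do that walk. The sentinels below are hypothetical; this vintage of the code base does not define them, which is exactly why the string parsing exists.

package sketch

import "errors"

// Hypothetical sentinel errors. With these wrapped via %w at the point
// of failure, errors.Is finds them anywhere in the chain.
var (
	errTxPoolFull = errors.New("tx_pool_full")
	errNotInTx    = errors.New("not_in_tx")
)

// isUnrecoverableTxError replaces the strings.Contains checks: it walks
// the wrapped-error chain instead of parsing the message text.
func isUnrecoverableTxError(err error) bool {
	return errors.Is(err, errTxPoolFull) || errors.Is(err, errNotInTx)
}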
Example #28
// Reload reloads the schema info from the db.
// Any tables that have changed since the last load are updated.
// This is a no-op if the SchemaInfo is closed.
func (si *SchemaInfo) Reload(ctx context.Context) error {
	defer logError(si.queryServiceStats)

	// Reload() gets called both from the ticker, and from external RPCs.
	// We don't want them to race over writing data that was read concurrently.
	si.actionMutex.Lock()
	defer si.actionMutex.Unlock()

	if si.IsClosed() {
		return nil
	}

	// Get time first because it needs a connection from the pool.
	curTime := si.mysqlTime(ctx)

	var tableData *sqltypes.Result
	var err error
	func() {
		conn := getOrPanic(ctx, si.connPool)
		defer conn.Recycle()
		tableData, err = conn.Exec(ctx, baseShowTables, maxTableCount, false)
	}()
	if err != nil {
		return fmt.Errorf("could not get table list for reload: %v", err)
	}

	// Reload any tables that have changed. We try every table even if some fail,
	// but we return success only if all tables succeed.
	// The following section requires us to hold mu.
	rec := concurrency.AllErrorRecorder{}
	si.mu.Lock()
	defer si.mu.Unlock()
	for _, row := range tableData.Rows {
		tableName := row[0].String()
		createTime, _ := row[2].ParseInt64()
		// Check if we know about the table or it has been recreated.
		if _, ok := si.tables[tableName]; !ok || createTime >= si.lastChange {
			func() {
				// Unlock so CreateOrUpdateTable can lock.
				si.mu.Unlock()
				defer si.mu.Lock()
				log.Infof("Reloading schema for table: %s", tableName)
				rec.RecordError(si.createOrUpdateTableLocked(ctx, tableName))
			}()
			continue
		}
		// Only update table_rows, data_length, index_length, max_data_length
		si.tables[tableName].SetMysqlStats(row[4], row[5], row[6], row[7], row[8])
	}
	si.lastChange = curTime
	return rec.Error()
}
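
The func literal in the middle of Reload temporarily releases si.mu so the callee can take the lock itself, and the deferred si.mu.Lock() reacquires it when the literal returns, preserving the invariant behind the outer defer si.mu.Unlock(). The same idiom in isolation, as a hypothetical helper:

package sketch

import "sync"

// withUnlocked runs f with mu released and reacquires mu before
// returning. The caller must hold mu on entry and gets it back held,
// mirroring the unlock/defer-lock pair inside Reload.
func withUnlocked(mu *sync.Mutex, f func()) {
	mu.Unlock()
	defer mu.Lock()
	f()
}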
Example #29
// runOnAllShards is a helper method that executes the passed function for all shards in parallel.
// The method returns no error if the function succeeds on all shards. If on any of the shards
// the function fails then the method returns error. If several shards return error then only one
// of them is returned.
func (schemaSwap *Swap) runOnAllShards(shardFunc func(shard *shardSchemaSwap) error) error {
	var errorRecorder concurrency.AllErrorRecorder
	var waitGroup sync.WaitGroup
	for _, shardSwap := range schemaSwap.allShards {
		waitGroup.Add(1)
		go func(shard *shardSchemaSwap) {
			defer waitGroup.Done()
			errorRecorder.RecordError(shardFunc(shard))
		}(shardSwap)
	}
	waitGroup.Wait()
	return errorRecorder.Error()
}
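
runOnAllShards is the fan-out idiom of the previous examples distilled into a reusable method. With type parameters (Go 1.18+) the same shape can be written once for any element type. The helper below is a sketch in that spirit, not something this code base provides; it aggregates with errors.Join instead of an AllErrorRecorder.

package sketch

import (
	"errors"
	"sync"
)

// forEachParallel runs fn on every item concurrently and returns the
// joined failures, or nil if every call succeeded.
func forEachParallel[T any](items []T, fn func(T) error) error {
	var (
		mu   sync.Mutex
		errs []error
		wg   sync.WaitGroup
	)
	for _, item := range items {
		wg.Add(1)
		go func(item T) { // item passed as a parameter: safe capture pre-Go 1.22
			defer wg.Done()
			if err := fn(item); err != nil {
				mu.Lock()
				errs = append(errs, err)
				mu.Unlock()
			}
		}(item)
	}
	wg.Wait()
	return errors.Join(errs...)
}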
Example #30
// UpdateAllSrvShards calls UpdateSrvShard for all cells concurrently.
func UpdateAllSrvShards(ctx context.Context, ts topo.Server, si *topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	errs := concurrency.AllErrorRecorder{}

	for _, cell := range si.Cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			errs.RecordError(UpdateSrvShard(ctx, ts, cell, si))
		}(cell)
	}
	wg.Wait()
	return errs.Error()
}