func (wr *Wrangler) makeMastersReadOnly(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		if si.MasterAlias.IsZero() {
			rec.RecordError(fmt.Errorf("Shard %v/%v has no master?", si.Keyspace(), si.ShardName()))
			continue
		}
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Making master %v read-only", si.MasterAlias)
			actionPath, err := wr.ai.SetReadOnly(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}
			rec.RecordError(wr.WaitForCompletion(actionPath))
			log.Infof("Master %v is now read-only", si.MasterAlias)
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
// WaitForDrain blocks until the selected tablets (cells/keyspace/shard/tablet_type)
// have reported a QPS rate of 0.0.
// NOTE: This is just an observation of one point in time and no guarantee that
// the tablet was actually drained. At later times, a QPS rate > 0.0 could still
// be observed.
func (wr *Wrangler) WaitForDrain(ctx context.Context, cells []string, keyspace, shard string, servedType topodatapb.TabletType,
	retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout time.Duration) error {
	if len(cells) == 0 {
		// Retrieve list of cells for the shard from the topology.
		shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
		if err != nil {
			return fmt.Errorf("failed to retrieve list of all cells. GetShard() failed: %v", err)
		}
		cells = shardInfo.Cells
	}

	// Check all cells in parallel.
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			rec.RecordError(wr.waitForDrainInCell(ctx, cell, keyspace, shard, servedType,
				retryDelay, healthCheckTopologyRefresh, healthcheckRetryDelay, healthCheckTimeout))
		}(cell)
	}
	wg.Wait()

	return rec.Error()
}
func (wr *Wrangler) getMastersPosition(shards []*topo.ShardInfo) (map[*topo.ShardInfo]myproto.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]myproto.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Gathering master position for %v", si.MasterAlias)
			ti, err := wr.ts.GetTablet(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}
			pos, err := wr.ai.MasterPosition(ti, wr.ActionTimeout())
			if err != nil {
				rec.RecordError(err)
				return
			}
			log.Infof("Got master position for %v", si.MasterAlias)
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
// CleanUp will run the recorded actions.
// If an action on a target fails, it will not run the next action on
// the same target.
// We return the aggregate errors for all cleanups.
// CleanUp uses its own context, with a timeout of 5 minutes, so that
// cleanup actions will run even if the original context times out.
// TODO(alainjobart) Actions should run concurrently on a per target
// basis. They are then serialized on each target.
func (cleaner *Cleaner) CleanUp(wr *Wrangler) error {
	// We use a background context so we're not dependent on the original context timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	actionMap := make(map[string]*cleanUpHelper)
	rec := concurrency.AllErrorRecorder{}
	cleaner.mu.Lock()
	for i := len(cleaner.actions) - 1; i >= 0; i-- {
		actionReference := cleaner.actions[i]
		helper, ok := actionMap[actionReference.target]
		if !ok {
			helper = &cleanUpHelper{
				err: nil,
			}
			actionMap[actionReference.target] = helper
		}
		if helper.err != nil {
			wr.Logger().Warningf("previous action failed on target %v, not running %v", actionReference.target, actionReference.name)
			continue
		}
		err := actionReference.action.CleanUp(ctx, wr)
		if err != nil {
			helper.err = err
			rec.RecordError(err)
			wr.Logger().Errorf("action %v failed on %v: %v", actionReference.name, actionReference.target, err)
		} else {
			wr.Logger().Infof("action %v successful on %v", actionReference.name, actionReference.target)
		}
	}
	cleaner.mu.Unlock()
	cancel()
	return rec.Error()
}
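// The detached-context pattern used by CleanUp above (derive the cleanup
// context from context.Background() with its own timeout, so cleanup still
// runs after the caller's context has expired) is shown below in isolation.
// This is a minimal, self-contained sketch using only the standard library;
// the names doCleanup and runWithCaller are hypothetical, not Vitess APIs.
package main

import (
	"context"
	"fmt"
	"time"
)

// doCleanup pretends to do cleanup work bounded by ctx.
func doCleanup(ctx context.Context) error {
	select {
	case <-time.After(50 * time.Millisecond): // the "work"
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func runWithCaller(callerCtx context.Context) error {
	// Deliberately ignore callerCtx for the cleanup itself: the cleanup
	// context comes from context.Background(), so it survives callerCtx.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()
	return doCleanup(ctx)
}

func main() {
	// The caller's context is already expired...
	callerCtx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
	defer cancel()
	time.Sleep(time.Millisecond)

	// ...but the cleanup still completes because it has its own deadline.
	fmt.Println("cleanup error:", runWithCaller(callerCtx))
}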
// FIXME(alainjobart) no action to become read-write now, just use Ping,
// that forces the shard reload and will stop replication.
func (wr *Wrangler) makeMastersReadWrite(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			log.Infof("Pinging master %v", si.MasterAlias)

			actionPath, err := wr.ai.Ping(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			if err := wr.WaitForCompletion(actionPath); err != nil {
				rec.RecordError(err)
			} else {
				log.Infof("%v responded", si.MasterAlias)
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
func (wr *Wrangler) waitForFilteredReplication(sourcePositions map[*topo.ShardInfo]myproto.ReplicationPosition, destinationShards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range destinationShards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			// Signal completion exactly once per destination shard, no matter
			// how many source shards it has.
			defer wg.Done()
			for _, sourceShard := range si.SourceShards {
				// we're waiting on this guy
				blpPosition := blproto.BlpPosition{
					Uid: sourceShard.Uid,
				}

				// find the position it should be at
				for s, pos := range sourcePositions {
					if s.Keyspace() == sourceShard.Keyspace && s.ShardName() == sourceShard.Shard {
						blpPosition.Position = pos
					}
				}

				log.Infof("Waiting for %v to catch up", si.MasterAlias)
				if err := wr.ai.WaitBlpPosition(si.MasterAlias, blpPosition, wr.ActionTimeout()); err != nil {
					rec.RecordError(err)
				} else {
					log.Infof("%v caught up", si.MasterAlias)
				}
			}
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
// RebuildShard updates the SrvShard objects and underlying serving graph.
//
// Re-read from TopologyServer to make sure we are using the side
// effects of all actions.
//
// This function will start each cell over from the beginning on ErrBadVersion,
// so it doesn't need a lock on the shard.
func RebuildShard(ctx context.Context, log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, lockTimeout time.Duration) (*topo.ShardInfo, error) {
	log.Infof("RebuildShard %v/%v", keyspace, shard)

	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("topotools.RebuildShard")
	defer span.Finish()
	ctx = trace.NewContext(ctx, span)

	// read the existing shard info. It has to exist.
	shardInfo, err := ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return nil, err
	}

	// rebuild all cells in parallel
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range shardInfo.Cells {
		// skip this cell if we shouldn't rebuild it
		if !topo.InCellList(cell, cells) {
			continue
		}

		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			rec.RecordError(rebuildCellSrvShard(ctx, log, ts, shardInfo, cell))
		}(cell)
	}
	wg.Wait()

	return shardInfo, rec.Error()
}
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}
	wg.Wait()

	// then do the master
	recorder.RecordError(wr.slaveWasRestarted(masterTablet, &swrd))
	return recorder.Error()
}
func (wr *Wrangler) getMastersPosition(shards []*topo.ShardInfo) (map[*topo.ShardInfo]*mysqlctl.ReplicationPosition, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]*mysqlctl.ReplicationPosition)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			log.Infof("Gathering master position for %v", si.MasterAlias)
			pos, err := wr.getMasterPosition(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
			} else {
				log.Infof("Got master position for %v", si.MasterAlias)
				mu.Lock()
				result[si] = pos
				mu.Unlock()
			}
			wg.Done()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
// CopyKeyspaces will create the keyspaces in the destination topo
func CopyKeyspaces(fromTS, toTS topo.Server) {
	keyspaces, err := fromTS.GetKeyspaces()
	if err != nil {
		log.Fatalf("GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()

			k, err := fromTS.GetKeyspace(keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetKeyspace(%v): %v", keyspace, err))
				return
			}

			if err := toTS.CreateKeyspace(keyspace, k.Keyspace); err != nil {
				if err == topo.ErrNodeExists {
					log.Warningf("keyspace %v already exists", keyspace)
				} else {
					rec.RecordError(fmt.Errorf("CreateKeyspace(%v): %v", keyspace, err))
				}
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyKeyspaces failed: %v", rec.Error())
	}
}
// DeleteKeyspaceShards implements topo.Server.
func (s *Server) DeleteKeyspaceShards(ctx context.Context, keyspace string) error {
	shards, err := s.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	global := s.getGlobal()
	for _, shard := range shards {
		wg.Add(1)
		go func(shard string) {
			defer wg.Done()
			_, err := global.Delete(shardDirPath(keyspace, shard), true /* recursive */)
			rec.RecordError(convertError(err))
		}(shard)
	}
	wg.Wait()
	if err = rec.Error(); err != nil {
		return err
	}

	event.Dispatch(&events.KeyspaceChange{
		KeyspaceInfo: *topo.NewKeyspaceInfo(keyspace, nil, -1),
		Status:       "deleted all shards",
	})
	return nil
}
// CleanUp will run the recorded actions.
// If an action on a target fails, it will not run the next action on
// the same target.
// We return the aggregate errors for all cleanups.
// TODO(alainjobart) Actions should run concurrently on a per target
// basis. They are then serialized on each target.
func (cleaner *Cleaner) CleanUp(wr *Wrangler) error {
	actionMap := make(map[string]*cleanUpHelper)
	rec := concurrency.AllErrorRecorder{}
	cleaner.mu.Lock()
	for i := len(cleaner.actions) - 1; i >= 0; i-- {
		actionReference := cleaner.actions[i]
		helper, ok := actionMap[actionReference.target]
		if !ok {
			helper = &cleanUpHelper{
				err: nil,
			}
			actionMap[actionReference.target] = helper
		}
		if helper.err != nil {
			log.Warningf("previous action failed on target %v, not running %v", actionReference.target, actionReference.name)
			continue
		}
		err := actionReference.action.CleanUp(wr)
		if err != nil {
			helper.err = err
			rec.RecordError(err)
			log.Errorf("action %v failed on %v: %v", actionReference.name, actionReference.target, err)
		} else {
			log.Infof("action %v successful on %v", actionReference.name, actionReference.target)
		}
	}
	cleaner.mu.Unlock()
	return rec.Error()
}
func (wr *Wrangler) makeMastersReadOnly(shards []*topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		if si.MasterAlias.IsZero() {
			rec.RecordError(fmt.Errorf("Shard %v/%v has no master?", si.Keyspace(), si.ShardName()))
			continue
		}
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Making master %v read-only", si.MasterAlias)
			ti, err := wr.ts.GetTablet(si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}
			if err = wr.tmc.SetReadOnly(ti, wr.ActionTimeout()); err != nil {
				rec.RecordError(err)
				return
			}
			wr.Logger().Infof("Master %v is now read-only", si.MasterAlias)
		}(si)
	}
	wg.Wait()
	return rec.Error()
}
// CopyTablets will create the tablets in the destination topo
func CopyTablets(fromTS, toTS topo.Server) {
	cells, err := fromTS.GetKnownCells()
	if err != nil {
		log.Fatalf("fromTS.GetKnownCells failed: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			tabletAliases, err := fromTS.GetTabletsByCell(cell)
			if err != nil {
				rec.RecordError(err)
			} else {
				for _, tabletAlias := range tabletAliases {
					wg.Add(1)
					go func(tabletAlias topo.TabletAlias) {
						defer wg.Done()

						// read the source tablet
						ti, err := fromTS.GetTablet(tabletAlias)
						if err != nil {
							rec.RecordError(err)
							return
						}

						// try to create the destination
						err = toTS.CreateTablet(ti.Tablet)
						if err == topo.ErrNodeExists {
							// update the destination tablet
							log.Warningf("tablet %v already exists, updating it", tabletAlias)
							err = toTS.UpdateTabletFields(ti.Alias(), func(t *topo.Tablet) error {
								*t = *ti.Tablet
								return nil
							})
						}
						if err != nil {
							rec.RecordError(err)
							return
						}

						// create the replication paths
						// for masters only here
						if ti.Type == topo.TYPE_MASTER {
							if err = toTS.CreateReplicationPath(ti.Keyspace, ti.Shard, ti.Alias().String()); err != nil && err != topo.ErrNodeExists {
								rec.RecordError(err)
							}
						}
					}(tabletAlias)
				}
			}
		}(cell)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyTablets failed: %v", rec.Error())
	}
}
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]string, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]string)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Gathering master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			ti, err := wr.ts.GetTablet(ctx, si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.tmc.MasterPosition(ctx, ti)
			if err != nil {
				rec.RecordError(err)
				return
			}

			wr.Logger().Infof("Got master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
// UpdateTabletEndpoints fixes up any entries in the serving graph that relate
// to a given tablet.
func UpdateTabletEndpoints(ctx context.Context, ts topo.Server, tablet *topo.Tablet) (err error) {
	if *lockSrvShard {
		// This lock is only necessary until all tablets are upgraded to lock-free.
		actionNode := actionnode.RebuildSrvShard()
		lockPath, err := actionNode.LockSrvShard(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
		if err != nil {
			return fmt.Errorf("can't lock shard for UpdateTabletEndpoints(%v): %v", tablet, err)
		}

		defer func() {
			actionNode.UnlockSrvShard(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard, lockPath, err)
		}()
	}

	srvTypes, err := ts.GetSrvTabletTypesPerShard(ctx, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
	if err != nil {
		if err != topo.ErrNoNode {
			return err
		}
		// It's fine if there are no existing types.
		srvTypes = nil
	}

	wg := sync.WaitGroup{}
	errs := concurrency.AllErrorRecorder{}

	// Update the list that the tablet is supposed to be in (if any).
	if tablet.IsInServingGraph() {
		endpoint, err := tablet.EndPoint()
		if err != nil {
			return err
		}

		wg.Add(1)
		go func() {
			defer wg.Done()
			errs.RecordError(
				updateEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
					tablet.Type, endpoint))
		}()
	}

	// Remove it from any other lists it isn't supposed to be in.
	for _, srvType := range srvTypes {
		if srvType != tablet.Type {
			wg.Add(1)
			go func(tabletType topo.TabletType) {
				defer wg.Done()
				errs.RecordError(
					removeEndpoint(ctx, ts, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard,
						tabletType, tablet.Alias.Uid))
			}(srvType)
		}
	}

	wg.Wait()
	return errs.Error()
}
// shardsWithTablesSources returns all the shards that have SourceShards set
// to one value, with an array of Tables.
func shardsWithTablesSources(ctx context.Context, wr *wrangler.Wrangler) ([]map[string]string, error) {
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	keyspaces, err := wr.TopoServer().GetKeyspaces(shortCtx)
	cancel()
	if err != nil {
		return nil, err
	}

	wg := sync.WaitGroup{}
	mu := sync.Mutex{} // protects result
	result := make([]map[string]string, 0, len(keyspaces))
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
			shards, err := wr.TopoServer().GetShardNames(shortCtx, keyspace)
			cancel()
			if err != nil {
				rec.RecordError(err)
				return
			}
			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
					si, err := wr.TopoServer().GetShard(shortCtx, keyspace, shard)
					cancel()
					if err != nil {
						rec.RecordError(err)
						return
					}
					if len(si.SourceShards) == 1 && len(si.SourceShards[0].Tables) > 0 {
						mu.Lock()
						result = append(result, map[string]string{
							"Keyspace": keyspace,
							"Shard":    shard,
						})
						mu.Unlock()
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()

	if rec.HasErrors() {
		return nil, rec.Error()
	}
	if len(result) == 0 {
		return nil, fmt.Errorf("There are no shards with SourceShards")
	}
	return result, nil
}
// Make this external, since these transitions need to be forced from time to time.
func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, runHooks bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) {
		return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias)
	}

	if runHooks {
		// Only run the preflight_serving_type hook when
		// transitioning from non-serving to serving.
		if !topo.IsInServingGraph(tablet.Type) && topo.IsInServingGraph(newType) {
			if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil {
				return err
			}
		}
	}

	tablet.Type = newType
	if newType == topo.TYPE_IDLE {
		if tablet.Parent.IsZero() {
			si, err := ts.GetShard(tablet.Keyspace, tablet.Shard)
			if err != nil {
				return err
			}
			rec := concurrency.AllErrorRecorder{}
			wg := sync.WaitGroup{}
			for _, cell := range si.Cells {
				wg.Add(1)
				go func(cell string) {
					defer wg.Done()
					sri, err := ts.GetShardReplication(cell, tablet.Keyspace, tablet.Shard)
					if err != nil {
						log.Warningf("Cannot check cell %v for extra replication paths, assuming it's good", cell)
						return
					}
					for _, rl := range sri.ReplicationLinks {
						if rl.Parent == tabletAlias {
							rec.RecordError(fmt.Errorf("Still have a ReplicationLink in cell %v", cell))
						}
					}
				}(cell)
			}
			wg.Wait()
			if rec.HasErrors() {
				return rec.Error()
			}
		}
		tablet.Parent = topo.TabletAlias{}
		tablet.Keyspace = ""
		tablet.Shard = ""
		tablet.KeyRange = key.KeyRange{}
	}
	return topo.UpdateTablet(ts, tablet)
}
// ValidateVersionKeyspace validates all versions are the same in all
// tablets in a keyspace
func (wr *Wrangler) ValidateVersionKeyspace(ctx context.Context, keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(ctx, keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidateVersionShard(ctx, keyspace, shards[0])
	}

	// find the reference version using the first shard's master
	si, err := wr.ts.GetShard(ctx, keyspace, shards[0])
	if err != nil {
		return err
	}
	if topo.TabletAliasIsZero(si.MasterAlias) {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := si.MasterAlias
	log.Infof("Gathering version for reference master %v", topo.TabletAliasString(referenceAlias))
	referenceVersion, err := wr.GetVersion(ctx, referenceAlias)
	if err != nil {
		return err
	}

	// then diff with all tablets but master 0
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}
		for _, alias := range aliases {
			if topo.TabletAliasEqual(alias, si.MasterAlias) {
				continue
			}

			wg.Add(1)
			go wr.diffVersion(ctx, referenceVersion, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Version diffs:\n%v", er.Error().Error())
	}
	return nil
}
func (wr *Wrangler) ValidatePermissionsKeyspace(keyspace string) error {
	// find all the shards
	shards, err := wr.ts.GetShardNames(keyspace)
	if err != nil {
		return err
	}

	// corner cases
	if len(shards) == 0 {
		return fmt.Errorf("No shards in keyspace %v", keyspace)
	}
	sort.Strings(shards)
	if len(shards) == 1 {
		return wr.ValidatePermissionsShard(keyspace, shards[0])
	}

	// find the reference permissions using the first shard's master
	si, err := wr.ts.GetShard(keyspace, shards[0])
	if err != nil {
		return err
	}
	if si.MasterAlias.Uid == topo.NO_TABLET {
		return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0])
	}
	referenceAlias := si.MasterAlias
	log.Infof("Gathering permissions for reference master %v", referenceAlias)
	referencePermissions, err := wr.GetPermissions(si.MasterAlias)
	if err != nil {
		return err
	}

	// then diff with all tablets but master 0
	er := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, shard := range shards {
		aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard)
		if err != nil {
			er.RecordError(err)
			continue
		}
		for _, alias := range aliases {
			if alias == si.MasterAlias {
				continue
			}

			wg.Add(1)
			go wr.diffPermissions(referencePermissions, referenceAlias, alias, &wg, &er)
		}
	}
	wg.Wait()
	if er.HasErrors() {
		return fmt.Errorf("Permissions diffs:\n%v", er.Error().Error())
	}
	return nil
}
// CopyShardReplications will create the ShardReplication objects in
// the destination topo
func CopyShardReplications(ctx context.Context, fromTS, toTS topo.Impl) {
	keyspaces, err := fromTS.GetKeyspaces(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}
	tts := topo.Server{
		Impl: toTS,
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(ctx, keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()

					// read the source shard to get the cells
					s, _, err := fromTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					for _, cell := range s.Cells {
						sri, err := fromTS.GetShardReplication(ctx, cell, keyspace, shard)
						if err != nil {
							rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v): %v", cell, keyspace, shard, err))
							continue
						}

						if err := tts.UpdateShardReplicationFields(ctx, cell, keyspace, shard, func(oldSR *topodatapb.ShardReplication) error {
							*oldSR = *sri.ShardReplication
							return nil
						}); err != nil {
							rec.RecordError(fmt.Errorf("UpdateShardReplicationFields(%v, %v, %v): %v", cell, keyspace, shard, err))
						}
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
// CopyShards will create the shards in the destination topo
func CopyShards(fromTS, toTS topo.Server, deleteKeyspaceShards bool) {
	keyspaces, err := fromTS.GetKeyspaces()
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			if deleteKeyspaceShards {
				if err := toTS.DeleteKeyspaceShards(keyspace); err != nil {
					rec.RecordError(fmt.Errorf("DeleteKeyspaceShards(%v): %v", keyspace, err))
					return
				}
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					if err := topo.CreateShard(toTS, keyspace, shard); err != nil {
						if err == topo.ErrNodeExists {
							log.Warningf("shard %v/%v already exists", keyspace, shard)
						} else {
							rec.RecordError(fmt.Errorf("CreateShard(%v, %v): %v", keyspace, shard, err))
							return
						}
					}

					si, err := fromTS.GetShard(keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					if err := toTS.UpdateShard(si); err != nil {
						rec.RecordError(fmt.Errorf("UpdateShard(%v, %v): %v", keyspace, shard, err))
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
func (wr *Wrangler) ShardMultiRestore(keyspace, shard string, sources []topo.TabletAlias, tables []string, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) error {
	// check parameters
	if len(tables) > 0 && len(sources) > 1 {
		return fmt.Errorf("ShardMultiRestore can only handle one source when tables are specified")
	}

	// lock the shard to perform the changes we need done
	actionNode := actionnode.ShardMultiRestore(&actionnode.MultiRestoreArgs{
		SrcTabletAliases:       sources,
		Concurrency:            concurrency,
		FetchConcurrency:       fetchConcurrency,
		InsertTableConcurrency: insertTableConcurrency,
		FetchRetryCount:        fetchRetryCount,
		Strategy:               strategy})
	lockPath, err := wr.lockShard(keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	mrErr := wr.SetSourceShards(keyspace, shard, sources, tables)
	err = wr.unlockShard(keyspace, shard, actionNode, lockPath, mrErr)
	if err != nil {
		if mrErr != nil {
			log.Errorf("unlockShard got error back: %v", err)
			return mrErr
		}
		return err
	}
	if mrErr != nil {
		return mrErr
	}

	// find all tablets in the shard
	destTablets, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard)
	if err != nil {
		return err
	}

	// now launch MultiRestore on all tablets we need to do
	rec := cc.AllErrorRecorder{}
	wg := sync.WaitGroup{}
	for _, tabletAlias := range destTablets {
		wg.Add(1)
		go func(tabletAlias topo.TabletAlias) {
			log.Infof("Starting multirestore on tablet %v", tabletAlias)
			err := wr.MultiRestore(tabletAlias, sources, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount, strategy)
			log.Infof("Multirestore on tablet %v is done (err=%v)", tabletAlias, err)
			rec.RecordError(err)
			wg.Done()
		}(tabletAlias)
	}
	wg.Wait()

	return rec.Error()
}
// CopyShards will create the shards in the destination topo
func CopyShards(ctx context.Context, fromTS, toTS topo.Impl) {
	keyspaces, err := fromTS.GetKeyspaces(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKeyspaces: %v", err)
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, keyspace := range keyspaces {
		wg.Add(1)
		go func(keyspace string) {
			defer wg.Done()
			shards, err := fromTS.GetShardNames(ctx, keyspace)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardNames(%v): %v", keyspace, err))
				return
			}

			for _, shard := range shards {
				wg.Add(1)
				go func(keyspace, shard string) {
					defer wg.Done()
					if err := toTS.CreateShard(ctx, keyspace, shard, &topodatapb.Shard{}); err != nil {
						if err == topo.ErrNodeExists {
							log.Warningf("shard %v/%v already exists", keyspace, shard)
						} else {
							rec.RecordError(fmt.Errorf("CreateShard(%v, %v): %v", keyspace, shard, err))
							return
						}
					}

					s, _, err := fromTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					_, toV, err := toTS.GetShard(ctx, keyspace, shard)
					if err != nil {
						rec.RecordError(fmt.Errorf("toTS.GetShard(%v, %v): %v", keyspace, shard, err))
						return
					}

					if _, err := toTS.UpdateShard(ctx, keyspace, shard, s, toV); err != nil {
						rec.RecordError(fmt.Errorf("UpdateShard(%v, %v): %v", keyspace, shard, err))
					}
				}(keyspace, shard)
			}
		}(keyspace)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyShards failed: %v", rec.Error())
	}
}
// CopyTablets will create the tablets in the destination topo
func CopyTablets(ctx context.Context, fromTS, toTS topo.Impl) {
	cells, err := fromTS.GetKnownCells(ctx)
	if err != nil {
		log.Fatalf("fromTS.GetKnownCells: %v", err)
	}
	tts := topo.Server{
		Impl: toTS,
	}

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range cells {
		wg.Add(1)
		go func(cell string) {
			defer wg.Done()
			tabletAliases, err := fromTS.GetTabletsByCell(ctx, cell)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetTabletsByCell(%v): %v", cell, err))
			} else {
				for _, tabletAlias := range tabletAliases {
					wg.Add(1)
					go func(tabletAlias *topodatapb.TabletAlias) {
						defer wg.Done()

						// read the source tablet
						tablet, _, err := fromTS.GetTablet(ctx, tabletAlias)
						if err != nil {
							rec.RecordError(fmt.Errorf("GetTablet(%v): %v", tabletAlias, err))
							return
						}

						// try to create the destination
						err = toTS.CreateTablet(ctx, tablet)
						if err == topo.ErrNodeExists {
							// update the destination tablet
							log.Warningf("tablet %v already exists, updating it", tabletAlias)
							_, err = tts.UpdateTabletFields(ctx, tablet.Alias, func(t *topodatapb.Tablet) error {
								*t = *tablet
								return nil
							})
						}
						if err != nil {
							rec.RecordError(fmt.Errorf("CreateTablet(%v): %v", tabletAlias, err))
							return
						}
					}(tabletAlias)
				}
			}
		}(cell)
	}
	wg.Wait()
	if rec.HasErrors() {
		log.Fatalf("copyTablets failed: %v", rec.Error())
	}
}
// getNewConn creates a new tablet connection with a separate per conn timeout.
// It limits the overall timeout to connTimeoutTotal by checking elapsed time after each blocking call.
func (sdc *ShardConn) getNewConn(ctx context.Context) (conn tabletconn.TabletConn, endPoint *topodatapb.EndPoint, isTimeout bool, err error) {
	startTime := time.Now()

	endPoints, err := sdc.balancer.Get()
	if err != nil {
		// Error when getting endpoint
		return nil, nil, false, err
	}
	if len(endPoints) == 0 {
		// No valid endpoint
		return nil, nil, false, vterrors.FromError(
			vtrpcpb.ErrorCode_INTERNAL_ERROR,
			fmt.Errorf("no valid endpoint"),
		)
	}
	if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
		return nil, nil, true, vterrors.FromError(
			vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
			fmt.Errorf("timeout when getting endpoints"),
		)
	}

	// Iterate through all endpoints to create a connection
	perConnTimeout := sdc.getConnTimeoutPerConn(len(endPoints))
	allErrors := new(concurrency.AllErrorRecorder)
	for _, endPoint := range endPoints {
		perConnStartTime := time.Now()
		conn, err = tabletconn.GetDialer()(ctx, endPoint, sdc.keyspace, sdc.shard, topodatapb.TabletType_UNKNOWN, perConnTimeout)
		if err == nil {
			sdc.connectTimings.Record([]string{sdc.keyspace, sdc.shard, strings.ToLower(sdc.tabletType.String())}, perConnStartTime)
			sdc.mu.Lock()
			defer sdc.mu.Unlock()
			sdc.conn = conn
			return conn, endPoint, false, nil
		}
		// Markdown the endpoint if it failed to connect
		sdc.balancer.MarkDown(endPoint.Uid, err.Error())
		vtErr := vterrors.NewVitessError(
			// TODO(aaijazi): what about OperationalErrors here?
			vterrors.RecoverVtErrorCode(err), err,
			"%v %+v", err, endPoint,
		)
		allErrors.RecordError(vtErr)
		if time.Now().Sub(startTime) >= sdc.connTimeoutTotal {
			err = vterrors.FromError(
				vtrpcpb.ErrorCode_DEADLINE_EXCEEDED,
				fmt.Errorf("timeout when connecting to %+v", endPoint),
			)
			allErrors.RecordError(err)
			return nil, nil, true, allErrors.AggrError(AggregateVtGateErrors)
		}
	}
	return nil, nil, false, allErrors.Error()
}
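// The budget logic in getNewConn above (a total connect timeout split across
// candidate endpoints, with an elapsed-time check after every blocking
// attempt) can be shown in isolation. This is a minimal sketch using only the
// standard library; the endpoint list and dial function are placeholders, not
// the Vitess dialer.
package main

import (
	"errors"
	"fmt"
	"time"
)

var errNoEndpoints = errors.New("no endpoint could be reached within the total timeout")

// dialWithBudget tries each endpoint with an equal share of totalTimeout and
// gives up as soon as the overall budget is exhausted.
func dialWithBudget(endpoints []string, totalTimeout time.Duration,
	dial func(endpoint string, timeout time.Duration) error) (string, error) {
	if len(endpoints) == 0 {
		return "", errNoEndpoints
	}
	start := time.Now()
	perAttempt := totalTimeout / time.Duration(len(endpoints))
	for _, ep := range endpoints {
		if err := dial(ep, perAttempt); err == nil {
			return ep, nil
		}
		// Check the overall budget after every blocking attempt.
		if time.Since(start) >= totalTimeout {
			return "", fmt.Errorf("timeout after %v: %w", time.Since(start), errNoEndpoints)
		}
	}
	return "", errNoEndpoints
}

func main() {
	// Simulated dialer: only "ep2" connects; others block for their full timeout.
	dial := func(endpoint string, timeout time.Duration) error {
		if endpoint == "ep2" {
			return nil
		}
		time.Sleep(timeout)
		return fmt.Errorf("%v unreachable", endpoint)
	}
	ep, err := dialWithBudget([]string{"ep1", "ep2", "ep3"}, 300*time.Millisecond, dial)
	fmt.Println(ep, err)
}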
func (stc *ScatterConn) rollbackIfNeeded(ctx context.Context, allErrors *concurrency.AllErrorRecorder, session *SafeSession) {
	if session.InTransaction() {
		errstr := allErrors.Error().Error()
		// We cannot recover from these errors
		// TODO(aaijazi): get rid of this string parsing. Might
		// want a function that searches through a deeply
		// nested error chain for a particular error.
		if strings.Contains(errstr, "tx_pool_full") || strings.Contains(errstr, "not_in_tx") {
			stc.Rollback(ctx, session)
		}
	}
}
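// The TODO above asks for error-chain inspection instead of string matching.
// A minimal sketch of how that can look with the standard library errors
// package (Go 1.13+); the sentinel errors and wrapping below are hypothetical
// illustrations, not the actual vterrors types.
package main

import (
	"errors"
	"fmt"
)

var (
	errTxPoolFull = errors.New("tx_pool_full")
	errNotInTx    = errors.New("not_in_tx")
)

// isUnrecoverable walks the wrapped error chain instead of parsing strings.
func isUnrecoverable(err error) bool {
	return errors.Is(err, errTxPoolFull) || errors.Is(err, errNotInTx)
}

func main() {
	// A deeply wrapped error still matches via errors.Is.
	err := fmt.Errorf("scatter query failed: %w", fmt.Errorf("shard -80: %w", errTxPoolFull))
	fmt.Println(isUnrecoverable(err)) // true
}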
// Reload reloads the schema info from the db.
// Any tables that have changed since the last load are updated.
// This is a no-op if the SchemaInfo is closed.
func (si *SchemaInfo) Reload(ctx context.Context) error {
	defer logError(si.queryServiceStats)

	// Reload() gets called both from the ticker, and from external RPCs.
	// We don't want them to race over writing data that was read concurrently.
	si.actionMutex.Lock()
	defer si.actionMutex.Unlock()

	if si.IsClosed() {
		return nil
	}

	// Get time first because it needs a connection from the pool.
	curTime := si.mysqlTime(ctx)

	var tableData *sqltypes.Result
	var err error
	func() {
		conn := getOrPanic(ctx, si.connPool)
		defer conn.Recycle()
		tableData, err = conn.Exec(ctx, baseShowTables, maxTableCount, false)
	}()
	if err != nil {
		return fmt.Errorf("could not get table list for reload: %v", err)
	}

	// Reload any tables that have changed. We try every table even if some fail,
	// but we return success only if all tables succeed.
	// The following section requires us to hold mu.
	rec := concurrency.AllErrorRecorder{}
	si.mu.Lock()
	defer si.mu.Unlock()
	for _, row := range tableData.Rows {
		tableName := row[0].String()
		createTime, _ := row[2].ParseInt64()
		// Check if we know about the table or it has been recreated.
		if _, ok := si.tables[tableName]; !ok || createTime >= si.lastChange {
			func() {
				// Unlock so CreateOrUpdateTable can lock.
				si.mu.Unlock()
				defer si.mu.Lock()
				log.Infof("Reloading schema for table: %s", tableName)
				rec.RecordError(si.createOrUpdateTableLocked(ctx, tableName))
			}()
			continue
		}
		// Only update table_rows, data_length, index_length, max_data_length
		si.tables[tableName].SetMysqlStats(row[4], row[5], row[6], row[7], row[8])
	}
	si.lastChange = curTime
	return rec.Error()
}
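// Reload above uses an inner func() { si.mu.Unlock(); defer si.mu.Lock(); ... }()
// to release a held mutex for one loop iteration and reliably re-acquire it
// before the next one. A minimal standalone sketch of that idiom, with
// hypothetical names (cache, slowRefresh), not the SchemaInfo types.
package main

import (
	"fmt"
	"sync"
)

type cache struct {
	mu    sync.Mutex
	items map[string]int
}

// slowRefresh stands in for work (like a schema reload per table) that must
// not run while holding c.mu, e.g. because it blocks or takes other locks.
func slowRefresh(key string) int {
	return len(key)
}

func (c *cache) refreshAll(keys []string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, key := range keys {
		var v int
		func() {
			// Drop the lock for the slow call; re-acquire it before touching the map.
			c.mu.Unlock()
			defer c.mu.Lock()
			v = slowRefresh(key)
		}()
		c.items[key] = v
	}
}

func main() {
	c := &cache{items: map[string]int{}}
	c.refreshAll([]string{"a", "bb", "ccc"})
	fmt.Println(c.items)
}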
// runOnAllShards is a helper method that executes the passed function for all
// shards in parallel. It returns nil if the function succeeds on all shards;
// if the function fails on any shard, an error is returned. If several shards
// fail, only one of their errors is returned.
func (schemaSwap *Swap) runOnAllShards(shardFunc func(shard *shardSchemaSwap) error) error {
	var errorRecorder concurrency.AllErrorRecorder
	var waitGroup sync.WaitGroup
	for _, shardSwap := range schemaSwap.allShards {
		waitGroup.Add(1)
		go func(shard *shardSchemaSwap) {
			defer waitGroup.Done()
			errorRecorder.RecordError(shardFunc(shard))
		}(shardSwap)
	}
	waitGroup.Wait()
	return errorRecorder.Error()
}
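// The fan-out/collect pattern that runs through all of these snippets (a
// sync.WaitGroup plus a concurrency-safe error recorder) can be reproduced
// with the standard library alone. This is an illustrative sketch, not the
// Vitess concurrency package: the recorder below only mimics the
// RecordError/Error surface these functions rely on, and it aggregates all
// errors with errors.Join (Go 1.20+) rather than keeping a single one.
package main

import (
	"errors"
	"fmt"
	"sync"
)

// errorRecorder collects errors from concurrent goroutines.
type errorRecorder struct {
	mu   sync.Mutex
	errs []error
}

// RecordError is safe to call from multiple goroutines; nil errors are ignored.
func (r *errorRecorder) RecordError(err error) {
	if err == nil {
		return
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	r.errs = append(r.errs, err)
}

// Error returns the aggregate of everything recorded, or nil if nothing failed.
func (r *errorRecorder) Error() error {
	r.mu.Lock()
	defer r.mu.Unlock()
	return errors.Join(r.errs...)
}

// runOnAll runs fn once per item in parallel and returns the aggregate error.
func runOnAll(items []string, fn func(string) error) error {
	wg := sync.WaitGroup{}
	rec := errorRecorder{}
	for _, item := range items {
		wg.Add(1)
		go func(item string) {
			defer wg.Done()
			rec.RecordError(fn(item))
		}(item)
	}
	wg.Wait()
	return rec.Error()
}

func main() {
	err := runOnAll([]string{"-80", "80-"}, func(shard string) error {
		if shard == "80-" {
			return fmt.Errorf("shard %v: simulated failure", shard)
		}
		return nil
	})
	fmt.Println(err)
}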
// UpdateAllSrvShards calls UpdateSrvShard for all cells concurrently.
func UpdateAllSrvShards(ctx context.Context, ts topo.Server, si *topo.ShardInfo) error {
	wg := sync.WaitGroup{}
	errs := concurrency.AllErrorRecorder{}

	for _, cell := range si.Cells {
		wg.Add(1)
		go func(cell string) {
			errs.RecordError(UpdateSrvShard(ctx, ts, cell, si))
			wg.Done()
		}(cell)
	}
	wg.Wait()
	return errs.Error()
}