// rpcWrapper handles all the logic for rpc calls. func (agent *ActionAgent) rpcWrapper(ctx context.Context, name string, args, reply interface{}, verbose bool, f func() error, lock, runAfterAction bool) (err error) { defer func() { if x := recover(); x != nil { log.Errorf("TabletManager.%v(%v) on %v panic: %v\n%s", name, args, topo.TabletAliasString(agent.TabletAlias), x, tb.Stack(4)) err = fmt.Errorf("caught panic during %v: %v", name, x) } }() from := "" ci, ok := callinfo.FromContext(ctx) if ok { from = ci.Text() } if lock { beforeLock := time.Now() agent.actionMutex.Lock() defer agent.actionMutex.Unlock() if time.Now().Sub(beforeLock) > rpcTimeout { return fmt.Errorf("server timeout for " + name) } } if err = f(); err != nil { log.Warningf("TabletManager.%v(%v)(on %v from %v) error: %v", name, args, topo.TabletAliasString(agent.TabletAlias), from, err.Error()) return fmt.Errorf("TabletManager.%v on %v error: %v", name, topo.TabletAliasString(agent.TabletAlias), err) } if verbose { log.Infof("TabletManager.%v(%v)(on %v from %v): %#v", name, args, topo.TabletAliasString(agent.TabletAlias), from, reply) } if runAfterAction { err = agent.refreshTablet(ctx, "RPC("+name+")") } return }
// Syslog writes a Reparent event to syslog. func (r *Reparent) Syslog() (syslog.Priority, string) { return syslog.LOG_INFO, fmt.Sprintf("%s/%s [reparent %v -> %v] %s (%s)", r.ShardInfo.Keyspace(), r.ShardInfo.ShardName(), topo.TabletAliasString(r.OldMaster.Alias), topo.TabletAliasString(r.NewMaster.Alias), r.Status, r.ExternalID) }
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]myproto.ReplicationPosition, error) { mu := sync.Mutex{} result := make(map[*topo.ShardInfo]myproto.ReplicationPosition) wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, si := range shards { wg.Add(1) go func(si *topo.ShardInfo) { defer wg.Done() wr.Logger().Infof("Gathering master position for %v", topo.TabletAliasString(si.MasterAlias)) ti, err := wr.ts.GetTablet(ctx, si.MasterAlias) if err != nil { rec.RecordError(err) return } pos, err := wr.tmc.MasterPosition(ctx, ti) if err != nil { rec.RecordError(err) return } wr.Logger().Infof("Got master position for %v", topo.TabletAliasString(si.MasterAlias)) mu.Lock() result[si] = pos mu.Unlock() }(si) } wg.Wait() return result, rec.Error() }
// ReparentTablet tells a tablet to reparent this tablet to the current // master, based on the current replication position. If there is no // match, it will fail. func (wr *Wrangler) ReparentTablet(ctx context.Context, tabletAlias *pb.TabletAlias) error { // Get specified tablet. // Get current shard master tablet. // Sanity check they are in the same keyspace/shard. // Issue a SetMaster to the tablet. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } shardInfo, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard) if err != nil { return err } if topo.TabletAliasIsZero(shardInfo.MasterAlias) { return fmt.Errorf("no master tablet for shard %v/%v", ti.Keyspace, ti.Shard) } masterTi, err := wr.ts.GetTablet(ctx, shardInfo.MasterAlias) if err != nil { return err } // Basic sanity checking. if masterTi.Type != pb.TabletType_MASTER { return fmt.Errorf("TopologyServer has inconsistent state for shard master %v", topo.TabletAliasString(shardInfo.MasterAlias)) } if masterTi.Keyspace != ti.Keyspace || masterTi.Shard != ti.Shard { return fmt.Errorf("master %v and potential slave not in same keyspace/shard", topo.TabletAliasString(shardInfo.MasterAlias)) } // and do the remote command return wr.TabletManagerClient().SetMaster(ctx, ti, shardInfo.MasterAlias, 0, false) }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets(ctx context.Context) error { vscw.setState(WorkerStateFindTargets) // find an appropriate endpoint in the source shard var err error vscw.sourceAlias, err = FindWorkerTablet(ctx, vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0") if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", topo.TabletAliasString(vscw.sourceAlias)) // get the tablet info for it vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(ctx, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topo.TabletAliasString(vscw.sourceAlias), err) } // stop replication on it shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err = vscw.wr.TabletManagerClient().StopSlave(shortCtx, vscw.sourceTablet) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topo.TabletAliasString(vscw.sourceAlias)) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(vscw.sourceAlias), err) } action.TabletType = pb.TabletType_SPARE return vscw.ResolveDestinationMasters(ctx) }
// FIXME(msolomon) This validate presumes the master is up and running. // Even when that isn't true, there are validation processes that might be valuable. func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err) return } aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err) return } tabletMap, _ := topo.GetTabletMap(ctx, wr.ts, aliases) var masterAlias *pb.TabletAlias for _, alias := range aliases { tabletInfo, ok := tabletMap[*alias] if !ok { results <- fmt.Errorf("tablet %v not found in map", topo.TabletAliasString(alias)) continue } if tabletInfo.Type == pb.TabletType_MASTER { if masterAlias != nil { results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, topo.TabletAliasString(masterAlias), topo.TabletAliasString(alias)) } else { masterAlias = alias } } } if masterAlias == nil { results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard) } else if !topo.TabletAliasEqual(shardInfo.MasterAlias, masterAlias) { results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, topo.TabletAliasString(masterAlias), topo.TabletAliasString(shardInfo.MasterAlias)) } for _, alias := range aliases { wg.Add(1) go func(alias *pb.TabletAlias) { defer wg.Done() if err := topo.Validate(ctx, wr.ts, alias); err != nil { results <- fmt.Errorf("Validate(%v) failed: %v", topo.TabletAliasString(alias), err) } else { wr.Logger().Infof("tablet %v is valid", topo.TabletAliasString(alias)) } }(alias) } if pingTablets { wr.validateReplication(ctx, shardInfo, tabletMap, results) wr.pingTablets(ctx, tabletMap, wg, results) } return }
// DeleteShard will do all the necessary changes in the topology server // to entirely remove a shard. func (wr *Wrangler) DeleteShard(ctx context.Context, keyspace, shard string, recursive bool) error { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } tabletMap, err := topo.GetTabletMapForShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } if recursive { wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard) for tabletAlias := range tabletMap { // We don't care about scrapping or updating the replication graph, // because we're about to delete the entire replication graph. wr.Logger().Infof("Deleting tablet %v", topo.TabletAliasString(&tabletAlias)) if err := wr.TopoServer().DeleteTablet(ctx, &tabletAlias); err != nil && err != topo.ErrNoNode { // Unlike the errors below in non-recursive steps, we don't want to // continue if a DeleteTablet fails. If we continue and delete the // replication graph, the tablet record will be orphaned, since we'll // no longer know it belongs to this shard. // // If the problem is temporary, or resolved externally, re-running // DeleteShard will skip over tablets that were already deleted. return fmt.Errorf("can't delete tablet %v: %v", topo.TabletAliasString(&tabletAlias), err) } } } else if len(tabletMap) > 0 { return fmt.Errorf("shard %v/%v still has %v tablets; use -recursive or remove them manually", keyspace, shard, len(tabletMap)) } // remove the replication graph and serving graph in each cell for _, cell := range shardInfo.Cells { if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err) } for _, t := range topo.AllTabletTypes { if !topo.IsInServingGraph(t) { continue } if err := wr.ts.DeleteEndPoints(ctx, cell, keyspace, shard, t, -1); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete EndPoints in cell %v for %v/%v/%v: %v", cell, keyspace, shard, t, err) } } if err := wr.ts.DeleteSrvShard(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode { wr.Logger().Warningf("Cannot delete SrvShard in cell %v for %v/%v: %v", cell, keyspace, shard, err) } } return wr.ts.DeleteShard(ctx, keyspace, shard) }
// diffPermissions is a helper method to asynchronously diff a permissions func (wr *Wrangler) diffPermissions(ctx context.Context, masterPermissions *myproto.Permissions, masterAlias *pb.TabletAlias, alias *pb.TabletAlias, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { defer wg.Done() log.Infof("Gathering permissions for %v", alias) slavePermissions, err := wr.GetPermissions(ctx, alias) if err != nil { er.RecordError(err) return } log.Infof("Diffing permissions for %v", alias) myproto.DiffPermissions(topo.TabletAliasString(masterAlias), masterPermissions, topo.TabletAliasString(alias), slavePermissions, er) }
// helper method to asynchronously diff a schema func (wr *Wrangler) diffSchema(ctx context.Context, masterSchema *myproto.SchemaDefinition, masterTabletAlias, alias *pb.TabletAlias, excludeTables []string, includeViews bool, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { defer wg.Done() log.Infof("Gathering schema for %v", alias) slaveSchema, err := wr.GetSchema(ctx, alias, nil, excludeTables, includeViews) if err != nil { er.RecordError(err) return } log.Infof("Diffing schema for %v", alias) myproto.DiffSchema(topo.TabletAliasString(masterTabletAlias), masterSchema, topo.TabletAliasString(alias), slaveSchema, er) }
// RecordTabletTagAction records a new TabletTagAction // into the specified Cleaner func RecordTabletTagAction(cleaner *Cleaner, tabletAlias *pb.TabletAlias, name, value string) { cleaner.Record(TabletTagActionName, topo.TabletAliasString(tabletAlias), &TabletTagAction{ TabletAlias: tabletAlias, Name: name, Value: value, }) }
func (scw *SplitCloneWorker) formatSources() string { result := "" for _, alias := range scw.sourceAliases { result += " " + topo.TabletAliasString(alias) } return result }
// Validate all tablets in all discoverable cells, even if they are // not in the replication graph. func (wr *Wrangler) validateAllTablets(ctx context.Context, wg *sync.WaitGroup, results chan<- error) { cellSet := make(map[string]bool, 16) keyspaces, err := wr.ts.GetKeyspaces(ctx) if err != nil { results <- fmt.Errorf("TopologyServer.GetKeyspaces failed: %v", err) return } for _, keyspace := range keyspaces { shards, err := wr.ts.GetShardNames(ctx, keyspace) if err != nil { results <- fmt.Errorf("TopologyServer.GetShardNames(%v) failed: %v", keyspace, err) return } for _, shard := range shards { aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err) return } for _, alias := range aliases { cellSet[alias.Cell] = true } } } for cell := range cellSet { aliases, err := wr.ts.GetTabletsByCell(ctx, cell) if err != nil { results <- fmt.Errorf("TopologyServer.GetTabletsByCell(%v) failed: %v", cell, err) continue } for _, alias := range aliases { wg.Add(1) go func(alias *pb.TabletAlias) { defer wg.Done() if err := topo.Validate(ctx, wr.ts, alias); err != nil { results <- fmt.Errorf("Validate(%v) failed: %v", topo.TabletAliasString(alias), err) } else { wr.Logger().Infof("tablet %v is valid", topo.TabletAliasString(alias)) } }(alias) } } }
// ValidateVersionKeyspace validates all versions are the same in all // tablets in a keyspace func (wr *Wrangler) ValidateVersionKeyspace(ctx context.Context, keyspace string) error { // find all the shards shards, err := wr.ts.GetShardNames(ctx, keyspace) if err != nil { return err } // corner cases if len(shards) == 0 { return fmt.Errorf("No shards in keyspace %v", keyspace) } sort.Strings(shards) if len(shards) == 1 { return wr.ValidateVersionShard(ctx, keyspace, shards[0]) } // find the reference version using the first shard's master si, err := wr.ts.GetShard(ctx, keyspace, shards[0]) if err != nil { return err } if topo.TabletAliasIsZero(si.MasterAlias) { return fmt.Errorf("No master in shard %v/%v", keyspace, shards[0]) } referenceAlias := si.MasterAlias log.Infof("Gathering version for reference master %v", topo.TabletAliasString(referenceAlias)) referenceVersion, err := wr.GetVersion(ctx, referenceAlias) if err != nil { return err } // then diff with all tablets but master 0 er := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} for _, shard := range shards { aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { er.RecordError(err) continue } for _, alias := range aliases { if topo.TabletAliasEqual(alias, si.MasterAlias) { continue } wg.Add(1) go wr.diffVersion(ctx, referenceVersion, referenceAlias, alias, &wg, &er) } } wg.Wait() if er.HasErrors() { return fmt.Errorf("Version diffs:\n%v", er.Error().Error()) } return nil }
func (wr *Wrangler) validateReplication(ctx context.Context, shardInfo *topo.ShardInfo, tabletMap map[pb.TabletAlias]*topo.TabletInfo, results chan<- error) { masterTablet, ok := tabletMap[*shardInfo.MasterAlias] if !ok { results <- fmt.Errorf("master %v not in tablet map", topo.TabletAliasString(shardInfo.MasterAlias)) return } slaveList, err := wr.tmc.GetSlaves(ctx, masterTablet) if err != nil { results <- fmt.Errorf("GetSlaves(%v) failed: %v", masterTablet, err) return } if len(slaveList) == 0 { results <- fmt.Errorf("no slaves of tablet %v found", topo.TabletAliasString(shardInfo.MasterAlias)) return } tabletIPMap := make(map[string]*pb.Tablet) slaveIPMap := make(map[string]bool) for _, tablet := range tabletMap { tabletIPMap[normalizeIP(tablet.Ip)] = tablet.Tablet } // See if every slave is in the replication graph. for _, slaveAddr := range slaveList { if tabletIPMap[normalizeIP(slaveAddr)] == nil { results <- fmt.Errorf("slave %v not in replication graph for shard %v/%v (mysql instance without vttablet?)", slaveAddr, shardInfo.Keyspace(), shardInfo.ShardName()) } slaveIPMap[normalizeIP(slaveAddr)] = true } // See if every entry in the replication graph is connected to the master. for _, tablet := range tabletMap { if !tablet.IsSlaveType() { continue } if !slaveIPMap[normalizeIP(tablet.Ip)] { results <- fmt.Errorf("slave %v not replicating: %v slave list: %q", topo.TabletAliasString(tablet.Alias), tablet.Ip, slaveList) } } }
// FindChangeSlaveTypeActionByTarget finds the first action for the target func FindChangeSlaveTypeActionByTarget(cleaner *Cleaner, tabletAlias *pb.TabletAlias) (*ChangeSlaveTypeAction, error) { action, err := cleaner.GetActionByName(ChangeSlaveTypeActionName, topo.TabletAliasString(tabletAlias)) if err != nil { return nil, err } result, ok := action.(*ChangeSlaveTypeAction) if !ok { return nil, fmt.Errorf("Action with wrong type: %v", action) } return result, nil }
func (wr *Wrangler) pingTablets(ctx context.Context, tabletMap map[pb.TabletAlias]*topo.TabletInfo, wg *sync.WaitGroup, results chan<- error) { for tabletAlias, tabletInfo := range tabletMap { wg.Add(1) go func(tabletAlias pb.TabletAlias, tabletInfo *topo.TabletInfo) { defer wg.Done() if err := wr.tmc.Ping(ctx, tabletInfo); err != nil { results <- fmt.Errorf("Ping(%v) failed: %v tablet hostname: %v", topo.TabletAliasString(&tabletAlias), err, tabletInfo.Hostname) } }(tabletAlias, tabletInfo) } }
func (wr *Wrangler) waitForFilteredReplication(ctx context.Context, sourcePositions map[*topo.ShardInfo]myproto.ReplicationPosition, destinationShards []*topo.ShardInfo, waitTime time.Duration) error { wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, si := range destinationShards { wg.Add(1) go func(si *topo.ShardInfo) { defer wg.Done() for _, sourceShard := range si.SourceShards { // we're waiting on this guy blpPosition := blproto.BlpPosition{ Uid: sourceShard.Uid, } // find the position it should be at for s, pos := range sourcePositions { if s.Keyspace() == sourceShard.Keyspace && s.ShardName() == sourceShard.Shard { blpPosition.Position = pos } } // and wait for it wr.Logger().Infof("Waiting for %v to catch up", topo.TabletAliasString(si.MasterAlias)) tablet, err := wr.ts.GetTablet(ctx, si.MasterAlias) if err != nil { rec.RecordError(err) return } if err := wr.tmc.WaitBlpPosition(ctx, tablet, blpPosition, waitTime); err != nil { rec.RecordError(err) } else { wr.Logger().Infof("%v caught up", topo.TabletAliasString(si.MasterAlias)) } } }(si) } wg.Wait() return rec.Error() }
// helper method to asynchronously get and diff a version func (wr *Wrangler) diffVersion(ctx context.Context, masterVersion string, masterAlias *pb.TabletAlias, alias *pb.TabletAlias, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { defer wg.Done() log.Infof("Gathering version for %v", topo.TabletAliasString(alias)) slaveVersion, err := wr.GetVersion(ctx, alias) if err != nil { er.RecordError(err) return } if masterVersion != slaveVersion { er.RecordError(fmt.Errorf("Master %v version %v is different than slave %v version %v", topo.TabletAliasString(masterAlias), masterVersion, topo.TabletAliasString(alias), slaveVersion)) } }
// GetVersion returns the version string from a tablet func (wr *Wrangler) GetVersion(ctx context.Context, tabletAlias *pb.TabletAlias) (string, error) { tablet, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return "", err } version, err := getVersionFromTablet(tablet.Addr()) if err != nil { return "", err } log.Infof("Tablet %v is running version '%v'", topo.TabletAliasString(tabletAlias), version) return version, err }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'worker' pointing back to us // - get the aliases of all the targets func (scw *SplitCloneWorker) findTargets(ctx context.Context) error { scw.setState(WorkerStateFindTargets) var err error // find an appropriate endpoint in the source shards scw.sourceAliases = make([]*pb.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = FindWorkerTablet(ctx, scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("FindWorkerTablet() failed for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", topo.TabletAliasString(scw.sourceAliases[i]), si.Keyspace(), si.ShardName()) } // get the tablet info for them, and stop their replication scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(ctx, alias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", topo.TabletAliasString(alias), err) } shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err := scw.wr.TabletManagerClient().StopSlave(shortCtx, scw.sourceTablets[i]) cancel() if err != nil { return fmt.Errorf("cannot stop replication on tablet %v", topo.TabletAliasString(alias)) } wrangler.RecordStartSlaveAction(scw.cleaner, scw.sourceTablets[i]) action, err := wrangler.FindChangeSlaveTypeActionByTarget(scw.cleaner, alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(alias), err) } action.TabletType = pb.TabletType_SPARE } return scw.ResolveDestinationMasters(ctx) }
// FindWorkerTablet will: // - find a rdonly instance in the keyspace / shard // - mark it as worker // - tag it with our worker process func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrangler.Cleaner, cell, keyspace, shard string) (*pb.TabletAlias, error) { tabletAlias, err := FindHealthyRdonlyEndPoint(ctx, wr, cell, keyspace, shard) if err != nil { return nil, err } // We add the tag before calling ChangeSlaveType, so the destination // vttablet reloads the worker URL when it reloads the tablet. ourURL := servenv.ListeningURL.String() wr.Logger().Infof("Adding tag[worker]=%v to tablet %v", ourURL, topo.TabletAliasString(tabletAlias)) if err := wr.TopoServer().UpdateTabletFields(ctx, tabletAlias, func(tablet *pb.Tablet) error { if tablet.Tags == nil { tablet.Tags = make(map[string]string) } tablet.Tags["worker"] = ourURL return nil }); err != nil { return nil, err } // we remove the tag *before* calling ChangeSlaveType back, so // we need to record this tag change after the change slave // type change in the cleaner. defer wrangler.RecordTabletTagAction(cleaner, tabletAlias, "worker", "") wr.Logger().Infof("Changing tablet %v to '%v'", topo.TabletAliasString(tabletAlias), pb.TabletType_WORKER) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err = wr.ChangeType(shortCtx, tabletAlias, pb.TabletType_WORKER, false /*force*/) cancel() if err != nil { return nil, err } // Record a clean-up action to take the tablet back to rdonly. // We will alter this one later on and let the tablet go back to // 'spare' if we have stopped replication for too long on it. wrangler.RecordChangeSlaveTypeAction(cleaner, tabletAlias, pb.TabletType_RDONLY) return tabletAlias, nil }
// synchronizeReplication phase: // 1 - ask the subset slave to stop replication // 2 - sleep for 5 seconds // 3 - ask the superset slave to stop replication // Note this is not 100% correct, but good enough for now func (worker *SQLDiffWorker) synchronizeReplication(ctx context.Context) error { worker.SetState(WorkerStateSyncReplication) // stop replication on subset slave worker.wr.Logger().Infof("Stopping replication on subset slave %v", topo.TabletAliasString(worker.subset.alias)) subsetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.subset.alias) if err != nil { return err } shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) err = worker.wr.TabletManagerClient().StopSlave(shortCtx, subsetTablet) cancel() if err != nil { return fmt.Errorf("Cannot stop slave %v: %v", topo.TabletAliasString(worker.subset.alias), err) } if err := checkDone(ctx); err != nil { return err } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, subsetTablet) action, err := wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.subset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(worker.subset.alias), err) } action.TabletType = pb.TabletType_SPARE // sleep for a few seconds time.Sleep(5 * time.Second) if err := checkDone(ctx); err != nil { return err } // stop replication on superset slave worker.wr.Logger().Infof("Stopping replication on superset slave %v", topo.TabletAliasString(worker.superset.alias)) supersetTablet, err := worker.wr.TopoServer().GetTablet(ctx, worker.superset.alias) if err != nil { return err } shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) err = worker.wr.TabletManagerClient().StopSlave(shortCtx, supersetTablet) cancel() if err != nil { return fmt.Errorf("Cannot stop slave %v: %v", topo.TabletAliasString(worker.superset.alias), err) } // change the cleaner actions from ChangeSlaveType(rdonly) // to StartSlave() + ChangeSlaveType(spare) wrangler.RecordStartSlaveAction(worker.cleaner, supersetTablet) action, err = wrangler.FindChangeSlaveTypeActionByTarget(worker.cleaner, worker.superset.alias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", topo.TabletAliasString(worker.superset.alias), err) } action.TabletType = pb.TabletType_SPARE return nil }
// Does a topo lookup for a single shard, and returns the tablet record of the master tablet. func resolveDestinationShardMaster(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (*topo.TabletInfo, error) { var ti *topo.TabletInfo shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) si, err := topo.GetShard(shortCtx, wr.TopoServer(), keyspace, shard) cancel() if err != nil { return ti, fmt.Errorf("unable to resolve destination shard %v/%v", keyspace, shard) } if topo.TabletAliasIsZero(si.MasterAlias) { return ti, fmt.Errorf("no master in destination shard %v/%v", keyspace, shard) } wr.Logger().Infof("Found target master alias %v in shard %v/%v", topo.TabletAliasString(si.MasterAlias), keyspace, shard) shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout) ti, err = topo.GetTablet(shortCtx, wr.TopoServer(), si.MasterAlias) cancel() if err != nil { return ti, fmt.Errorf("unable to get master tablet from alias %v in shard %v/%v", topo.TabletAliasString(si.MasterAlias), keyspace, shard) } return ti, nil }
// refreshMasters will just RPC-ping all the masters with RefreshState func (wr *Wrangler) refreshMasters(ctx context.Context, shards []*topo.ShardInfo) error { wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, si := range shards { wg.Add(1) go func(si *topo.ShardInfo) { defer wg.Done() wr.Logger().Infof("RefreshState master %v", topo.TabletAliasString(si.MasterAlias)) ti, err := wr.ts.GetTablet(ctx, si.MasterAlias) if err != nil { rec.RecordError(err) return } if err := wr.tmc.RefreshState(ctx, ti); err != nil { rec.RecordError(err) } else { wr.Logger().Infof("%v responded", topo.TabletAliasString(si.MasterAlias)) } }(si) } wg.Wait() return rec.Error() }
// Backup takes a db backup and sends it to the BackupStorage // Should be called under RPCWrapLockAction. func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error { // update our type to BACKUP tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == pb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode") } originalType := tablet.Type if err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, pb.TabletType_BACKUP, make(map[string]string)); err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "backup"); err != nil { return fmt.Errorf("failed to update state before backup: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup bucket := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) name := fmt.Sprintf("%v.%v", topo.TabletAliasString(tablet.Alias), time.Now().UTC().Format("2006-01-02.150405")) returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, bucket, name, concurrency, agent.hookExtraEnv()) // and change our type back to the appropriate value: // - if healthcheck is enabled, go to spare // - if not, go back to original type if agent.IsRunningHealthCheck() { originalType = pb.TabletType_SPARE } err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType, nil) if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) if returnErr != nil { l.Errorf("mysql backup command returned error: %v", returnErr) } returnErr = err } return returnErr }
// TableScanByKeyRange returns a QueryResultReader that gets all the // rows from a table that match the supplied KeyRange, ordered by // Primary Key. The returned columns are ordered with the Primary Key // columns in front. func TableScanByKeyRange(ctx context.Context, log logutil.Logger, ts topo.Server, tabletAlias *pb.TabletAlias, tableDefinition *myproto.TableDefinition, keyRange *pb.KeyRange, keyspaceIDType key.KeyspaceIdType) (*QueryResultReader, error) { where := "" if keyRange != nil { switch keyspaceIDType { case key.KIT_UINT64: if len(keyRange.Start) > 0 { if len(keyRange.End) > 0 { // have start & end where = fmt.Sprintf("WHERE keyspace_id >= %v AND keyspace_id < %v ", uint64FromKeyspaceID(keyRange.Start), uint64FromKeyspaceID(keyRange.End)) } else { // have start only where = fmt.Sprintf("WHERE keyspace_id >= %v ", uint64FromKeyspaceID(keyRange.Start)) } } else { if len(keyRange.End) > 0 { // have end only where = fmt.Sprintf("WHERE keyspace_id < %v ", uint64FromKeyspaceID(keyRange.End)) } } case key.KIT_BYTES: if len(keyRange.Start) > 0 { if len(keyRange.End) > 0 { // have start & end where = fmt.Sprintf("WHERE HEX(keyspace_id) >= '%v' AND HEX(keyspace_id) < '%v' ", hex.EncodeToString(keyRange.Start), hex.EncodeToString(keyRange.End)) } else { // have start only where = fmt.Sprintf("WHERE HEX(keyspace_id) >= '%v' ", hex.EncodeToString(keyRange.Start)) } } else { if len(keyRange.End) > 0 { // have end only where = fmt.Sprintf("WHERE HEX(keyspace_id) < '%v' ", hex.EncodeToString(keyRange.End)) } } default: return nil, fmt.Errorf("Unsupported KeyspaceIdType: %v", keyspaceIDType) } } sql := fmt.Sprintf("SELECT %v FROM %v %vORDER BY %v", strings.Join(orderedColumns(tableDefinition), ", "), tableDefinition.Name, where, strings.Join(tableDefinition.PrimaryKeyColumns, ", ")) log.Infof("SQL query for %v/%v: %v", topo.TabletAliasString(tabletAlias), tableDefinition.Name, sql) return NewQueryResultReaderForTablet(ctx, ts, tabletAlias, sql) }
// StatusAsText implements the Worker interface func (vscw *VerticalSplitCloneWorker) StatusAsText() string { vscw.Mu.Lock() defer vscw.Mu.Unlock() result := "Working on: " + vscw.destinationKeyspace + "/" + vscw.destinationShard + "\n" result += "State: " + vscw.State.String() + "\n" switch vscw.State { case WorkerStateCopy: result += "Running:\n" result += "Copying from: " + topo.TabletAliasString(vscw.sourceAlias) + "\n" statuses, eta := formatTableStatuses(vscw.tableStatus, vscw.startTime) result += "ETA: " + eta.String() + "\n" result += strings.Join(statuses, "\n") case WorkerStateDone: result += "Success:\n" statuses, _ := formatTableStatuses(vscw.tableStatus, vscw.startTime) result += strings.Join(statuses, "\n") } return result }
func (wr *Wrangler) applySchemaShardSimple(ctx context.Context, statusArray []*tabletStatus, preflight *myproto.SchemaChangeResult, masterTabletAlias *pb.TabletAlias, change string, force bool) (*myproto.SchemaChangeResult, error) { // check all tablets have the same schema as the master's // BeforeSchema. If not, we shouldn't proceed log.Infof("Checking schema on all tablets") for _, status := range statusArray { diffs := myproto.DiffSchemaToArray("master", preflight.BeforeSchema, topo.TabletAliasString(status.ti.Alias), status.beforeSchema) if len(diffs) > 0 { if force { log.Warningf("Tablet %v has inconsistent schema, ignoring: %v", status.ti.Alias, strings.Join(diffs, "\n")) } else { return nil, fmt.Errorf("Tablet %v has inconsistent schema: %v", status.ti.Alias, strings.Join(diffs, "\n")) } } } // we're good, just send to the master log.Infof("Applying schema change to master in simple mode") sc := &myproto.SchemaChange{Sql: change, Force: force, AllowReplication: true, BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema} return wr.ApplySchema(ctx, masterTabletAlias, sc) }
// StatusAsHTML implements the Worker interface func (vscw *VerticalSplitCloneWorker) StatusAsHTML() template.HTML { vscw.Mu.Lock() defer vscw.Mu.Unlock() result := "<b>Working on:</b> " + vscw.destinationKeyspace + "/" + vscw.destinationShard + "</br>\n" result += "<b>State:</b> " + vscw.State.String() + "</br>\n" switch vscw.State { case WorkerStateCopy: result += "<b>Running</b>:</br>\n" result += "<b>Copying from</b>: " + topo.TabletAliasString(vscw.sourceAlias) + "</br>\n" statuses, eta := formatTableStatuses(vscw.tableStatus, vscw.startTime) result += "<b>ETA</b>: " + eta.String() + "</br>\n" result += strings.Join(statuses, "</br>\n") case WorkerStateDone: result += "<b>Success</b>:</br>\n" statuses, _ := formatTableStatuses(vscw.tableStatus, vscw.startTime) result += strings.Join(statuses, "</br>\n") } return template.HTML(result) }
// ValidateVersionShard validates all versions are the same in all // tablets in a shard func (wr *Wrangler) ValidateVersionShard(ctx context.Context, keyspace, shard string) error { si, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } // get version from the master, or error if topo.TabletAliasIsZero(si.MasterAlias) { return fmt.Errorf("No master in shard %v/%v", keyspace, shard) } log.Infof("Gathering version for master %v", topo.TabletAliasString(si.MasterAlias)) masterVersion, err := wr.GetVersion(ctx, si.MasterAlias) if err != nil { return err } // read all the aliases in the shard, that is all tablets that are // replicating from the master aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { return err } // then diff with all slaves er := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} for _, alias := range aliases { if topo.TabletAliasEqual(alias, si.MasterAlias) { continue } wg.Add(1) go wr.diffVersion(ctx, masterVersion, si.MasterAlias, alias, &wg, &er) } wg.Wait() if er.HasErrors() { return fmt.Errorf("Version diffs:\n%v", er.Error().Error()) } return nil }