// GetAllTablets returns a sorted list of tablets. func GetAllTablets(ctx context.Context, ts topo.Server, cell string) ([]*topo.TabletInfo, error) { aliases, err := ts.GetTabletsByCell(ctx, cell) if err != nil { return nil, err } sort.Sort(topo.TabletAliasList(aliases)) tabletMap, err := topo.GetTabletMap(ctx, ts, aliases) if err != nil { // we got another error than topo.ErrNoNode return nil, err } tablets := make([]*topo.TabletInfo, 0, len(aliases)) for _, tabletAlias := range aliases { tabletInfo, ok := tabletMap[*tabletAlias] if !ok { // tablet disappeared on us (GetTabletMap ignores // topo.ErrNoNode), just echo a warning log.Warningf("failed to load tablet %v", tabletAlias) } else { tablets = append(tablets, tabletInfo) } } return tablets, nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (vscw *VerticalSplitCloneWorker) findTargets() error { vscw.setState(stateVSCFindTargets) // find an appropriate endpoint in the source shard var err error vscw.sourceAlias, err = findChecker(vscw.wr, vscw.cleaner, vscw.cell, vscw.sourceKeyspace, "0") if err != nil { return fmt.Errorf("cannot find checker for %v/%v/0: %v", vscw.cell, vscw.sourceKeyspace, err) } vscw.wr.Logger().Infof("Using tablet %v as the source", vscw.sourceAlias) // get the tablet info for it vscw.sourceTablet, err = vscw.wr.TopoServer().GetTablet(vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", vscw.sourceTablet, err) } // stop replication on it if err := vscw.wr.TabletManagerClient().StopSlave(vscw.sourceTablet, 30*time.Second); err != nil { return fmt.Errorf("cannot stop replication on tablet %v", vscw.sourceAlias) } wrangler.RecordStartSlaveAction(vscw.cleaner, vscw.sourceTablet, 30*time.Second) action, err := wrangler.FindChangeSlaveTypeActionByTarget(vscw.cleaner, vscw.sourceAlias) if err != nil { return fmt.Errorf("cannot find ChangeSlaveType action for %v: %v", vscw.sourceAlias, err) } action.TabletType = topo.TYPE_SPARE // find all the targets in the destination keyspace / shard vscw.destinationAliases, err = topo.FindAllTabletAliasesInShard(vscw.wr.TopoServer(), vscw.destinationKeyspace, vscw.destinationShard) if err != nil { return fmt.Errorf("cannot find all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err) } vscw.wr.Logger().Infof("Found %v target aliases", len(vscw.destinationAliases)) // get the TabletInfo for all targets vscw.destinationTablets, err = topo.GetTabletMap(vscw.wr.TopoServer(), vscw.destinationAliases) if err != nil { return fmt.Errorf("cannot read all target tablets in %v/%v: %v", vscw.destinationKeyspace, vscw.destinationShard, err) } // find and validate the master for tabletAlias, ti := range vscw.destinationTablets { if ti.Type == topo.TYPE_MASTER { if vscw.destinationMasterAlias.IsZero() { vscw.destinationMasterAlias = tabletAlias } else { return fmt.Errorf("multiple masters in destination shard: %v and %v at least", vscw.destinationMasterAlias, tabletAlias) } } } if vscw.destinationMasterAlias.IsZero() { return fmt.Errorf("no master in destination shard") } return nil }
// findTargets phase: // - find one rdonly in the source shard // - mark it as 'checker' pointing back to us // - get the aliases of all the targets func (scw *SplitCloneWorker) findTargets() error { scw.setState(stateSCFindTargets) var err error // find an appropriate endpoint in the source shards scw.sourceAliases = make([]topo.TabletAlias, len(scw.sourceShards)) for i, si := range scw.sourceShards { scw.sourceAliases[i], err = findChecker(scw.wr, scw.cleaner, scw.cell, si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("cannot find checker for %v/%v/%v: %v", scw.cell, si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Using tablet %v as source for %v/%v", scw.sourceAliases[i], si.Keyspace(), si.ShardName()) } // get the tablet info for them scw.sourceTablets = make([]*topo.TabletInfo, len(scw.sourceAliases)) for i, alias := range scw.sourceAliases { scw.sourceTablets[i], err = scw.wr.TopoServer().GetTablet(alias) if err != nil { return fmt.Errorf("cannot read tablet %v: %v", alias, err) } } // find all the targets in the destination shards scw.destinationAliases = make([][]topo.TabletAlias, len(scw.destinationShards)) scw.destinationTablets = make([]map[topo.TabletAlias]*topo.TabletInfo, len(scw.destinationShards)) scw.destinationMasterAliases = make([]topo.TabletAlias, len(scw.destinationShards)) for shardIndex, si := range scw.destinationShards { scw.destinationAliases[shardIndex], err = topo.FindAllTabletAliasesInShard(scw.wr.TopoServer(), si.Keyspace(), si.ShardName()) if err != nil { return fmt.Errorf("cannot find all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err) } scw.wr.Logger().Infof("Found %v target aliases in shard %v/%v", len(scw.destinationAliases[shardIndex]), si.Keyspace(), si.ShardName()) // get the TabletInfo for all targets scw.destinationTablets[shardIndex], err = topo.GetTabletMap(scw.wr.TopoServer(), scw.destinationAliases[shardIndex]) if err != nil { return fmt.Errorf("cannot read all target tablets in %v/%v: %v", si.Keyspace(), si.ShardName(), err) } // find and validate the master for tabletAlias, ti := range scw.destinationTablets[shardIndex] { if ti.Type == topo.TYPE_MASTER { if scw.destinationMasterAliases[shardIndex].IsZero() { scw.destinationMasterAliases[shardIndex] = tabletAlias } else { return fmt.Errorf("multiple masters in destination shard: %v and %v at least", scw.destinationMasterAliases[shardIndex], tabletAlias) } } } if scw.destinationMasterAliases[shardIndex].IsZero() { return fmt.Errorf("no master in destination shard") } } return nil }
// FIXME(msolomon) This validate presumes the master is up and running. // Even when that isn't true, there are validation processes that might be valuable. func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) { shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err) return } aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, keyspace, shard) if err != nil { results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err) return } tabletMap, _ := topo.GetTabletMap(ctx, wr.ts, aliases) var masterAlias *pb.TabletAlias for _, alias := range aliases { tabletInfo, ok := tabletMap[*alias] if !ok { results <- fmt.Errorf("tablet %v not found in map", alias) continue } if tabletInfo.Type == pb.TabletType_MASTER { if masterAlias != nil { results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, masterAlias, alias) } else { masterAlias = alias } } } if masterAlias == nil { results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard) } else if !topo.TabletAliasEqual(shardInfo.MasterAlias, masterAlias) { results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, masterAlias, shardInfo.MasterAlias) } for _, alias := range aliases { wg.Add(1) go func(alias *pb.TabletAlias) { defer wg.Done() if err := topo.Validate(ctx, wr.ts, alias); err != nil { results <- fmt.Errorf("Validate(%v) failed: %v", alias, err) } else { wr.Logger().Infof("tablet %v is valid", alias) } }(alias) } if pingTablets { wr.validateReplication(ctx, shardInfo, tabletMap, results) wr.pingTablets(ctx, tabletMap, wg, results) } return }
// FIXME(msolomon) This validate presumes the master is up and running. // Even when that isn't true, there are validation processes that might be valuable. func (wr *Wrangler) validateShard(keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- vresult) { shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { results <- vresult{keyspace + "/" + shard, err} return } aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, keyspace, shard) if err != nil { results <- vresult{keyspace + "/" + shard, err} } tabletMap, _ := topo.GetTabletMap(wr.ts, aliases) var masterAlias topo.TabletAlias for _, alias := range aliases { tabletInfo, ok := tabletMap[alias] if !ok { results <- vresult{alias.String(), fmt.Errorf("tablet not found in map")} continue } if tabletInfo.Parent.Uid == topo.NO_TABLET { if masterAlias.Cell != "" { results <- vresult{alias.String(), fmt.Errorf("tablet already has a master %v", masterAlias)} } else { masterAlias = alias } } } if masterAlias.Cell == "" { results <- vresult{keyspace + "/" + shard, fmt.Errorf("no master for shard")} } else if shardInfo.MasterAlias != masterAlias { results <- vresult{keyspace + "/" + shard, fmt.Errorf("master mismatch for shard: found %v, expected %v", masterAlias, shardInfo.MasterAlias)} } for _, alias := range aliases { wg.Add(1) go func(alias topo.TabletAlias) { results <- vresult{alias.String(), topo.Validate(wr.ts, alias)} wg.Done() }(alias) } if pingTablets { wr.validateReplication(shardInfo, tabletMap, results) wr.pingTablets(tabletMap, wg, results) } return }
// Does a topo lookup for a single shard, and returns: // 1. Slice of all tablet aliases for the shard. // 2. Map of tablet alias : tablet record for all tablets. func resolveReloadTabletsForShard(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (reloadAliases []topo.TabletAlias, reloadTablets map[topo.TabletAlias]*topo.TabletInfo, err error) { // Keep a long timeout, because we really don't want the copying to succeed, and then the worker to fail at the end. shortCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) reloadAliases, err = topo.FindAllTabletAliasesInShard(shortCtx, wr.TopoServer(), keyspace, shard) cancel() if err != nil { return nil, nil, fmt.Errorf("cannot find all reload target tablets in %v/%v: %v", keyspace, shard, err) } wr.Logger().Infof("Found %v reload target aliases in shard %v/%v", len(reloadAliases), keyspace, shard) shortCtx, cancel = context.WithTimeout(ctx, 5*time.Minute) reloadTablets, err = topo.GetTabletMap(shortCtx, wr.TopoServer(), reloadAliases) cancel() if err != nil { return nil, nil, fmt.Errorf("cannot read all reload target tablets in %v/%v: %v", keyspace, shard, err) } return reloadAliases, reloadTablets, nil }
// SetSourceShards is a utility function to override the SourceShards fields // on a Shard. func (wr *Wrangler) SetSourceShards(ctx context.Context, keyspace, shard string, sources []topo.TabletAlias, tables []string) error { // read the shard shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard) if err != nil { return err } // If the shard already has sources, maybe it's already been restored, // so let's be safe and abort right here. if len(shardInfo.SourceShards) > 0 { return fmt.Errorf("Shard %v/%v already has SourceShards, not overwriting them", keyspace, shard) } // read the source tablets sourceTablets, err := topo.GetTabletMap(ctx, wr.TopoServer(), sources) if err != nil { return err } // Insert their KeyRange in the SourceShards array. // We use a linear 0-based id, that matches what mysqlctld/split.go // inserts into _vt.blp_checkpoint. shardInfo.SourceShards = make([]*pb.Shard_SourceShard, len(sourceTablets)) i := 0 for _, ti := range sourceTablets { shardInfo.SourceShards[i] = &pb.Shard_SourceShard{ Uid: uint32(i), Keyspace: ti.Keyspace, Shard: ti.Shard, KeyRange: key.KeyRangeToProto(ti.KeyRange), Tables: tables, } i++ } // and write the shard if err = topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil { return err } return nil }
func (wr *Wrangler) shardMultiRestore(keyspace, shard string, sources []topo.TabletAlias, tables []string, concurrency, fetchConcurrency, insertTableConcurrency, fetchRetryCount int, strategy string) error { // read the shard shardInfo, err := wr.ts.GetShard(keyspace, shard) if err != nil { return err } // read the source tablets sourceTablets, err := topo.GetTabletMap(wr.TopoServer(), sources) if err != nil { return err } // Insert their KeyRange in the SourceShards array. // We use a linear 0-based id, that matches what mysqlctld/split.go // inserts into _vt.blp_checkpoint. shardInfo.SourceShards = make([]topo.SourceShard, len(sourceTablets)) i := 0 for _, ti := range sourceTablets { shardInfo.SourceShards[i] = topo.SourceShard{ Uid: uint32(i), Keyspace: ti.Keyspace, Shard: ti.Shard, KeyRange: ti.KeyRange, Tables: tables, } i++ } // and write the shard if err = wr.ts.UpdateShard(shardInfo); err != nil { return err } return nil }
// Update shard file with new master, replicas, etc. // // Re-read from TopologyServer to make sure we are using the side // effects of all actions. // // This function locks individual SvrShard paths, so it doesn't need a lock // on the shard. func RebuildShard(log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, timeout time.Duration, interrupted chan struct{}) error { log.Infof("RebuildShard %v/%v", keyspace, shard) // read the existing shard info. It has to exist. shardInfo, err := ts.GetShard(keyspace, shard) if err != nil { return err } // rebuild all cells in parallel wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for _, cell := range shardInfo.Cells { // skip this cell if we shouldn't rebuild it if !topo.InCellList(cell, cells) { continue } // start with the master if it's in the current cell tabletsAsMap := make(map[topo.TabletAlias]bool) if shardInfo.MasterAlias.Cell == cell { tabletsAsMap[shardInfo.MasterAlias] = true } wg.Add(1) go func(cell string) { defer wg.Done() // read the ShardReplication object to find tablets sri, err := ts.GetShardReplication(cell, keyspace, shard) if err != nil { rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err)) return } // add all relevant tablets to the map for _, rl := range sri.ReplicationLinks { tabletsAsMap[rl.TabletAlias] = true if rl.Parent.Cell == cell { tabletsAsMap[rl.Parent] = true } } // convert the map to a list aliases := make([]topo.TabletAlias, 0, len(tabletsAsMap)) for a := range tabletsAsMap { aliases = append(aliases, a) } // read all the Tablet records tablets, err := topo.GetTabletMap(ts, aliases) switch err { case nil: // keep going, we're good case topo.ErrPartialResult: log.Warningf("Got ErrPartialResult from topo.GetTabletMap in cell %v, some tablets may not be added properly to serving graph", cell) default: rec.RecordError(fmt.Errorf("GetTabletMap in cell %v failed: %v", cell, err)) return } // Lock the SrvShard so we write a consistent data set. actionNode := actionnode.RebuildSrvShard() lockPath, err := actionNode.LockSrvShard(ts, cell, keyspace, shard, timeout, interrupted) if err != nil { rec.RecordError(err) return } // write the data we need to rebuildErr := rebuildCellSrvShard(log, ts, shardInfo, cell, tablets) // and unlock if err := actionNode.UnlockSrvShard(ts, cell, keyspace, shard, lockPath, rebuildErr); err != nil { rec.RecordError(err) } }(cell) } wg.Wait() return rec.Error() }