// shardReplicationStatuses returns the tablets in the shard and their replication statuses.
func (wr *Wrangler) shardReplicationStatuses(ctx context.Context, shardInfo *topo.ShardInfo) ([]*topo.TabletInfo, []*myproto.ReplicationStatus, error) {
	// FIXME(msolomon) this assumes no hierarchical replication, which is currently the case.
	tabletMap, err := wr.ts.GetTabletMapForShard(ctx, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		return nil, nil, err
	}
	tablets := topotools.CopyMapValues(tabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo)
	stats, err := wr.tabletReplicationStatuses(ctx, tablets)
	return tablets, stats, err
}
// ShardReplicationStatuses returns the ReplicationStatus for each tablet in a shard.
func (wr *Wrangler) ShardReplicationStatuses(ctx context.Context, keyspace, shard string) ([]*topo.TabletInfo, []*replicationdatapb.Status, error) {
	tabletMap, err := wr.ts.GetTabletMapForShard(ctx, keyspace, shard)
	if err != nil {
		return nil, nil, err
	}
	tablets := topotools.CopyMapValues(tabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo)

	wr.logger.Infof("Gathering tablet replication status for: %v", tablets)
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	result := make([]*replicationdatapb.Status, len(tablets))

	for i, ti := range tablets {
		// Don't scan tablets that won't return something
		// useful. Otherwise, you'll end up waiting for a timeout.
		if ti.Type == topodatapb.TabletType_MASTER {
			wg.Add(1)
			go func(i int, ti *topo.TabletInfo) {
				defer wg.Done()
				pos, err := wr.tmc.MasterPosition(ctx, ti.Tablet)
				if err != nil {
					rec.RecordError(fmt.Errorf("MasterPosition(%v) failed: %v", ti.AliasString(), err))
					return
				}
				result[i] = &replicationdatapb.Status{
					Position: pos,
				}
			}(i, ti)
		} else if ti.IsSlaveType() {
			wg.Add(1)
			go func(i int, ti *topo.TabletInfo) {
				defer wg.Done()
				status, err := wr.tmc.SlaveStatus(ctx, ti.Tablet)
				if err != nil {
					rec.RecordError(fmt.Errorf("SlaveStatus(%v) failed: %v", ti.AliasString(), err))
					return
				}
				result[i] = status
			}(i, ti)
		}
	}
	wg.Wait()
	return tablets, result, rec.Error()
}
// restartSlaves restarts replication on every slave in slaveTabletMap and
// reports whether a majority of them restarted successfully.
func (wr *Wrangler) restartSlaves(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, rsd *actionnode.RestartSlaveData) (majorityRestart bool, err error) {
	wg := new(sync.WaitGroup)
	slaves := topotools.CopyMapValues(slaveTabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo)
	errs := make([]error, len(slaveTabletMap))

	f := func(i int) {
		errs[i] = wr.restartSlave(slaves[i], rsd)
		if errs[i] != nil {
			// FIXME(msolomon) Don't bail early, just mark this phase as
			// failed. We might decide to proceed if enough of these
			// succeed.
			//
			// FIXME(msolomon) This is a somewhat delicate retry - have to
			// figure out why it failed on the tablet end. This could lead
			// to a nasty case of having to recompute where to start
			// replication. Practically speaking, that chance is pretty low.
			wr.logger.Warningf("restart slave failed: %v %v", slaves[i].Alias, errs[i])
		}
		wg.Done()
	}

	for i := range slaves {
		wg.Add(1)
		go f(i)
	}
	wg.Wait()

	errCount := 0
	badTablets := make([]string, 0, 16)
	for i, err := range errs {
		if err != nil {
			errCount++
			badTablets = append(badTablets, slaves[i].Alias.String())
		}
	}
	// Phrase the question with multiplication so we don't get caught by int
	// division rounding.
	majorityRestart = errCount*2 < len(slaveTabletMap)
	if errCount > 0 {
		err = fmt.Errorf("restart slave failed on some tablets (%v): %v", errCount, strings.Join(badTablets, ", "))
	}
	return
}