func (wr *Wrangler) shardReplicationStatuses(shardInfo *topo.ShardInfo) ([]*topo.TabletInfo, []*myproto.ReplicationStatus, error) { // FIXME(msolomon) this assumes no hierarchical replication, which is currently the case. tabletMap, err := topo.GetTabletMapForShard(context.TODO(), wr.ts, shardInfo.Keyspace(), shardInfo.ShardName()) if err != nil { return nil, nil, err } tablets := topotools.CopyMapValues(tabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo) stats, err := wr.tabletReplicationStatuses(tablets) return tablets, stats, err }
func (wr *Wrangler) restartSlaves(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, rsd *actionnode.RestartSlaveData) (majorityRestart bool, err error) { wg := new(sync.WaitGroup) slaves := topotools.CopyMapValues(slaveTabletMap, []*topo.TabletInfo{}).([]*topo.TabletInfo) errs := make([]error, len(slaveTabletMap)) f := func(i int) { errs[i] = wr.restartSlave(slaves[i], rsd) if errs[i] != nil { // FIXME(msolomon) Don't bail early, just mark this phase as // failed. We might decide to proceed if enough of these // succeed. // // FIXME(msolomon) This is a somewhat delicate retry - have to // figure out why it failed on the tablet end. This could lead // to a nasty case of having to recompute where to start // replication. Practically speaking, that chance is pretty low. wr.logger.Warningf("restart slave failed: %v %v", slaves[i].Alias, errs[i]) } wg.Done() } for i := range slaves { wg.Add(1) go f(i) } wg.Wait() errCount := 0 badTablets := make([]string, 0, 16) for i, err := range errs { if err != nil { errCount++ badTablets = append(badTablets, slaves[i].Alias.String()) } } // Phrase the question with multiplication so we don't get caught by int // division rounding. majorityRestart = errCount*2 < len(slaveTabletMap) if errCount > 0 { err = fmt.Errorf("restart slave failed on some tablets (%v): %v", errCount, strings.Join(badTablets, ", ")) } return }