// finishReparent makes the master-elect read-write (unless asked not to),
// writes the new master into the shard record, and rebuilds the serving graph.
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
		} else {
			wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias)
			if err := wr.tmc.SetReadWrite(wr.ctx, masterElect); err != nil {
				wr.logger.Warningf("marking master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
			}
		}
	} else {
		wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
	}

	// save the new master in the shard info
	si.MasterAlias = masterElect.Alias
	if err := topo.UpdateShard(context.TODO(), wr.ts, si); err != nil {
		wr.logger.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	wr.logger.Infof("rebuilding shard serving graph data")
	_, err := topotools.RebuildShard(context.TODO(), wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted)
	return err
}
// rebuildShardIfNeeded will rebuild the serving graph if we need to
func (agent *ActionAgent) rebuildShardIfNeeded(tablet *topo.TabletInfo, targetTabletType topo.TabletType) error {
	if topo.IsInServingGraph(targetTabletType) {
		// TODO: interrupted may need to be a global one closed when we exit
		interrupted := make(chan struct{})

		// no need to take the shard lock in this case
		if _, err := topotools.RebuildShard(context.TODO(), logutil.NewConsoleLogger(), agent.TopoServer, tablet.Keyspace, tablet.Shard, []string{tablet.Alias.Cell}, agent.LockTimeout, interrupted); err != nil {
			return fmt.Errorf("topotools.RebuildShard returned an error: %v", err)
		}
	}
	return nil
}
// changeTypeInternal is the same as ChangeType, but assumes we already have
// the shard lock, and does not have the option to force anything.
func (wr *Wrangler) changeTypeInternal(tabletAlias topo.TabletAlias, dbType topo.TabletType) error {
	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}
	rebuildRequired := ti.Tablet.IsInServingGraph()

	// change the type
	if err := wr.tmc.ChangeType(wr.ctx, ti, dbType); err != nil {
		return err
	}

	// rebuild if necessary
	if rebuildRequired {
		_, err = topotools.RebuildShard(context.TODO(), wr.logger, wr.ts, ti.Keyspace, ti.Shard, []string{ti.Alias.Cell}, wr.lockTimeout, interrupted)
		if err != nil {
			return err
		}
	}
	return nil
}
// RebuildShardGraph rebuilds the serving and replication rollup data while
// locking out other changes.
func (wr *Wrangler) RebuildShardGraph(keyspace, shard string, cells []string) (*topo.ShardInfo, error) {
	return topotools.RebuildShard(context.TODO(), wr.logger, wr.ts, keyspace, shard, cells, wr.lockTimeout, interrupted)
}
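// A minimal usage sketch (not from the original source) of the wrapper above.
// The keyspace and shard names are hypothetical; passing nil for cells asks
// the rebuild to cover every cell, as finishReparent does above.
func exampleRebuildAllCells(wr *Wrangler) error {
	if _, err := wr.RebuildShardGraph("test_keyspace", "-80", nil); err != nil {
		return fmt.Errorf("RebuildShardGraph failed: %v", err)
	}
	return nil
}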
// tabletExternallyReparentedLocked is called with the shard lock.
// It returns whether agent.refreshTablet should be called, and the error.
// Note both are set independently (we can have both true and an error).
func (agent *ActionAgent) tabletExternallyReparentedLocked(ctx context.Context, externalID string, interrupted chan struct{}) (bool, error) {
	// re-read the tablet record to be sure we have the latest version
	tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias)
	if err != nil {
		return false, err
	}

	// read the shard, make sure again the master is not already good.
	shardInfo, err := agent.TopoServer.GetShard(tablet.Keyspace, tablet.Shard)
	if err != nil {
		return false, err
	}
	if shardInfo.MasterAlias == tablet.Alias {
		log.Infof("TabletExternallyReparented: tablet became the master before we got the lock?")
		return false, nil
	}
	log.Infof("TabletExternallyReparented called and we're not the master, doing the work")

	// Read the tablets, make sure the master elect is known to the shard
	// (it's this tablet, so it better be!).
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return false, err
	}
	masterElectTablet, ok := tabletMap[tablet.Alias]
	if !ok {
		return false, fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo:  *shardInfo,
		NewMaster:  *tablet.Tablet,
		ExternalID: externalID,
	}
	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	event.DispatchUpdate(ev, "starting external from tablet")

	// We fix the new master in the replication graph.
	// Note after this call, we may have changed the tablet record,
	// so we will always return true, so the tablet record is re-read
	// by the agent.
	event.DispatchUpdate(ev, "mark ourself as new master")
	err = agent.updateReplicationGraphForPromotedSlave(ctx, tablet)
	if err != nil {
		// This suggests we can't talk to topo server. This is bad.
		return true, fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err)
	}

	// Once this tablet is promoted, remove it from our maps
	delete(slaveTabletMap, tablet.Alias)
	delete(masterTabletMap, tablet.Alias)

	// Then fix all the slaves, including the old master. This
	// last step is very likely to time out for some tablets (one
	// random guy is dead, the old master is dead, ...). We
	// execute them all in parallel until we get to
	// wr.ActionTimeout(). After this, no other action with a
	// timeout is executed, so even if we got to the timeout,
	// we're still good.
	event.DispatchUpdate(ev, "restarting slaves")
	logger := logutil.NewConsoleLogger()
	tmc := tmclient.NewTabletManagerClient()
	topotools.RestartSlavesExternal(agent.TopoServer, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error {
		return tmc.SlaveWasRestarted(ctx, ti, swrd)
	})

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != tablet.Alias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	log.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = tablet.Alias
	if err = topo.UpdateShard(ctx, agent.TopoServer, shardInfo); err != nil {
		return true, err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	log.Infof("Rebuilding shard serving graph data")
	if _, err = topotools.RebuildShard(ctx, logger, agent.TopoServer, tablet.Keyspace, tablet.Shard, cells, agent.LockTimeout, interrupted); err != nil {
		return true, err
	}

	event.DispatchUpdate(ev, "finished")
	return true, nil
}