Beispiel #1
0
// New creates a new Wrangler object.
//
// actionTimeout: how long should we wait for an action to complete?
// - if using wrangler for just one action, this is set properly
//   upon wrangler creation.
// - if re-using wrangler multiple times, call ResetActionTimeout before
//   every action.
//
// lockTimeout: how long should we wait for the initial lock to start
// a complex action?  This is distinct from actionTimeout because most
// of the time, we want to immediately know that our action will
// fail. However, automated action will need some time to arbitrate
// the locks.
func New(logger logutil.Logger, ts topo.Server, actionTimeout, lockTimeout time.Duration) *Wrangler {
	return &Wrangler{logger, ts, initiator.NewActionInitiator(ts), time.Now().Add(actionTimeout), lockTimeout}
}
Beispiel #2
0
func tabletExternallyReparentedLocked(ts topo.Server, tablet *topo.TabletInfo, actionTimeout, lockTimeout time.Duration, interrupted chan struct{}) (err error) {
	// read the shard, make sure again the master is not already good.
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := ts.GetShardCritical(tablet.Keyspace, tablet.Shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == tablet.Alias {
		return fmt.Errorf("this tablet is already the master")
	}

	// Read the tablets, make sure the master elect is known to the shard
	// (it's this tablet, so it better be!).
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(ts, tablet.Keyspace, tablet.Shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[tablet.Alias]
	if !ok {
		return fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *tablet.Tablet,
	}

	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}

	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	event.DispatchUpdate(ev, "starting external from tablet")

	// we fix the new master in the replication graph
	event.DispatchUpdate(ev, "mark ourself as new master")
	err = updateReplicationGraphForPromotedSlave(ts, tablet)
	if err != nil {
		// This suggests we can't talk to topo server. This is bad.
		return fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err)
	}

	// Once this tablet is promoted, remove it from our maps
	delete(slaveTabletMap, tablet.Alias)
	delete(masterTabletMap, tablet.Alias)

	// Then fix all the slaves, including the old master.  This
	// last step is very likely to time out for some tablets (one
	// random guy is dead, the old master is dead, ...). We
	// execute them all in parallel until we get to
	// wr.ActionTimeout(). After this, no other action with a
	// timeout is executed, so even if we got to the timeout,
	// we're still good.
	event.DispatchUpdate(ev, "restarting slaves")
	logger := logutil.NewConsoleLogger()
	ai := initiator.NewActionInitiator(ts)
	topotools.RestartSlavesExternal(ts, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error {
		return ai.RpcSlaveWasRestarted(ti, swrd, actionTimeout)
	})

	// Compute the list of Cells we need to rebuild: old master and
	// all other cells if reparenting to another cell.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != tablet.Alias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	log.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = tablet.Alias
	if err = topo.UpdateShard(ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	log.Infof("Rebuilding shard serving graph data")
	if err = topotools.RebuildShard(logger, ts, tablet.Keyspace, tablet.Shard, cells, lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
Beispiel #3
0
// actionTimeout: how long should we wait for an action to complete?
// lockTimeout: how long should we wait for the initial lock to start a complex action?
//   This is distinct from actionTimeout because most of the time, we want to immediately
//   know that out action will fail. However, automated action will need some time to
//   arbitrate the locks.
func New(ts topo.Server, actionTimeout, lockTimeout time.Duration) *Wrangler {
	return &Wrangler{ts, initiator.NewActionInitiator(ts, *tabletManagerProtocol), time.Now().Add(actionTimeout), lockTimeout, true}
}