Example 1
// reparentShardExternal handles an external reparent.
//
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
func (wr *Wrangler) reparentShardExternal(ev *events.Reparent, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo) error {
	ev.UpdateStatus("starting external")

	// Fix the new master in the replication graph.
	ev.UpdateStatus("checking if new master was promoted")
	err := wr.slaveWasPromoted(masterElectTablet)
	if err != nil {
		// This suggests that the master-elect is dead. This is bad.
		return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err)
	}

	// Once the slave is promoted, remove it from our maps
	delete(slaveTabletMap, masterElectTablet.Alias)
	delete(masterTabletMap, masterElectTablet.Alias)

	// Re-read the master elect tablet, as its mysql port may have been updated
	ev.UpdateStatus("re-reading new master record")
	masterElectTablet, err = wr.TopoServer().GetTablet(masterElectTablet.Alias)
	if err != nil {
		return fmt.Errorf("cannot re-read the master record, something is seriously wrong: %v", err)
	}

	// Then fix all the slaves, including the old master.
	ev.UpdateStatus("restarting slaves")
	wr.restartSlavesExternal(slaveTabletMap, masterTabletMap, masterElectTablet)
	return nil
}
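
The ev.UpdateStatus calls above assume an events.Reparent type whose UpdateStatus method publishes progress to observers. As a minimal sketch of what such a type could look like (the field names here are hypothetical; the real events.Reparent struct in Vitess carries more information):

// Reparent is a minimal sketch of an event struct with an UpdateStatus
// method, assuming only the behavior the examples rely on; the real
// events.Reparent type is more elaborate.
type Reparent struct {
	Keyspace string
	Shard    string
	Status   string
}

// UpdateStatus records the current phase of the reparent and logs it so
// that observers can follow progress.
func (r *Reparent) UpdateStatus(status string) {
	r.Status = status
	log.Infof("reparent %v/%v: %v", r.Keyspace, r.Shard, status)
}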
Example 2
// reparentShardGraceful executes a graceful reparent.
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
func (wr *Wrangler) reparentShardGraceful(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly bool) (err error) {
	ev.UpdateStatus("starting graceful")

	defer func() {
		if err != nil {
			ev.UpdateStatus("failed: " + err.Error())
		}
	}()

	// Validate a bunch of assumptions we make about the replication graph.
	if len(masterTabletMap) != 1 {
		aliases := make([]string, 0, len(masterTabletMap))
		for _, v := range masterTabletMap {
			aliases = append(aliases, v.String())
		}
		return fmt.Errorf("I have 0 or multiple masters / scrapped tablets in this shard replication graph, please scrap the non-master ones: %v", strings.Join(aliases, " "))
	}
	// The map has exactly one entry at this point; grab it.
	var masterTablet *topo.TabletInfo
	for _, v := range masterTabletMap {
		masterTablet = v
	}

	if masterTablet.Parent.Uid != topo.NO_TABLET {
		return fmt.Errorf("master tablet should not have a ParentUid: %v %v", masterTablet.Parent.Uid, masterTablet.Alias)
	}

	if masterTablet.Type != topo.TYPE_MASTER {
		return fmt.Errorf("master tablet should not be type: %v %v", masterTablet.Type, masterTablet.Alias)
	}

	if masterTablet.Alias.Uid == masterElectTablet.Alias.Uid {
		return fmt.Errorf("master tablet should not match master elect - this must be forced: %v", masterTablet.Alias)
	}

	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, masterTablet.Keyspace, masterTablet.Shard, mapKeys(slaveTabletMap))
	}

	if err := wr.ValidateShard(masterTablet.Keyspace, masterTablet.Shard, true); err != nil {
		return fmt.Errorf("ValidateShard verification failed: %v, if the master is dead, run: vtctl ScrapTablet -force %v", err, masterTablet.Alias)
	}

	// Make sure all tablets have the right parent and reasonable positions.
	ev.UpdateStatus("checking slave replication positions")
	err = wr.checkSlaveReplication(slaveTabletMap, masterTablet.Alias.Uid)
	if err != nil {
		return err
	}

	// Check the master-elect is fit for duty - call out for hardware checks.
	ev.UpdateStatus("checking that new master is ready to serve")
	err = wr.checkMasterElect(masterElectTablet)
	if err != nil {
		return err
	}

	ev.UpdateStatus("demoting old master")
	masterPosition, err := wr.demoteMaster(masterTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master is dead and we
		// need to take steps. We could either pop a prompt, or make
		// retrying the action painless.
		return fmt.Errorf("demote master failed: %v, if the master is dead, run: vtctl -force ScrapTablet %v", err, masterTablet.Alias)
	}

	ev.UpdateStatus("checking slave consistency")
	log.Infof("check slaves %v/%v", masterTablet.Keyspace, masterTablet.Shard)
	restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
	err = wr.checkSlaveConsistency(restartableSlaveTabletMap, masterPosition)
	if err != nil {
		return fmt.Errorf("check slave consistency failed %v, demoted master is still read only, run: vtctl SetReadWrite %v", err, masterTablet.Alias)
	}

	ev.UpdateStatus("promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v, demoted master is still read only: vtctl SetReadWrite %v", err, masterTablet.Alias)
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias)

	ev.UpdateStatus("restarting slaves")
	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	// For now, scrap the old master regardless of how many
	// slaves restarted.
	//
	// FIXME(msolomon) We could reintroduce it, reparent it, and use
	// it as a new replica.
	ev.UpdateStatus("scrapping old master")
	log.Infof("scrap demoted master %v", masterTablet.Alias)
	scrapActionPath, scrapErr := wr.ai.Scrap(masterTablet.Alias)
	if scrapErr == nil {
		scrapErr = wr.WaitForCompletion(scrapActionPath)
	}
	if scrapErr != nil {
		// The sub action is non-critical, so just warn.
		log.Warningf("scrap demoted master failed: %v", scrapErr)
	}

	ev.UpdateStatus("rebuilding shard serving graph")
	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	ev.UpdateStatus("finished")

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}
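
The named return value err combined with the deferred closure at the top of reparentShardGraceful is what turns every error return into a "failed: ..." status update. A stripped-down sketch of that idiom, with a hypothetical doWork step standing in for the real checks:

// deferredStatus illustrates the named-return / deferred-closure idiom
// used above: because err is a named return value, the deferred closure
// observes the final error regardless of which return produced it.
func deferredStatus(ev *events.Reparent) (err error) {
	defer func() {
		if err != nil {
			ev.UpdateStatus("failed: " + err.Error())
		}
	}()

	ev.UpdateStatus("working")
	if err = doWork(); err != nil { // doWork is a hypothetical step
		return err // the deferred closure reports this failure
	}
	ev.UpdateStatus("finished")
	return nil
}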
Example 3
// reparentShardBrutal executes a brutal reparent.
//
// Assume the master is dead and not coming back. Just push your way
// forward.  Force means we are reparenting to the same master
// (assuming the data has been externally synched).
//
// The ev parameter is an event struct prefilled with information that the
// caller has on hand, which would be expensive for us to re-query.
func (wr *Wrangler) reparentShardBrutal(ev *events.Reparent, si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) (err error) {
	ev.UpdateStatus("starting brutal")

	defer func() {
		if err != nil {
			ev.UpdateStatus("failed: " + err.Error())
		}
	}()

	log.Infof("Skipping ValidateShard - not a graceful situation")

	if _, ok := slaveTabletMap[masterElectTablet.Alias]; !ok && !force {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias, si.Keyspace(), si.ShardName(), mapKeys(slaveTabletMap))
	}

	// Check the master-elect and slaves are in good shape when the action
	// has not been forced.
	if !force {
		// Make sure all tablets have the right parent and reasonable positions.
		ev.UpdateStatus("checking slave replication positions")
		if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil {
			return err
		}

		// Check the master-elect is fit for duty - call out for hardware checks.
		ev.UpdateStatus("checking that new master is ready to serve")
		if err := wr.checkMasterElect(masterElectTablet); err != nil {
			return err
		}

		ev.UpdateStatus("checking slave consistency")
		log.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard)
		restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
		err = wr.checkSlaveConsistency(restartableSlaveTabletMap, nil)
		if err != nil {
			return err
		}
	} else {
		ev.UpdateStatus("stopping slave replication")
		log.Infof("forcing reparent to same master %v", masterElectTablet.Alias)
		err := wr.breakReplication(slaveTabletMap, masterElectTablet)
		if err != nil {
			return err
		}
	}

	ev.UpdateStatus("promoting new master")
	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias)
	}

	// Once the slave is promoted, remove it from our maps
	delete(slaveTabletMap, masterElectTablet.Alias)
	delete(masterTabletMap, masterElectTablet.Alias)

	ev.UpdateStatus("restarting slaves")
	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	if !force {
		for _, failedMaster := range masterTabletMap {
			ev.UpdateStatus("scrapping old master")
			log.Infof("scrap dead master %v", failedMaster.Alias)
			// The master is dead, so execute the action locally instead of
			// enqueuing the scrap action for an arbitrary amount of time.
			if scrapErr := topotools.Scrap(wr.ts, failedMaster.Alias, false); scrapErr != nil {
				log.Warningf("scrapping failed master failed: %v", scrapErr)
			}
		}
	}

	ev.UpdateStatus("rebuilding shard serving graph")
	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	ev.UpdateStatus("finished")

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}
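
Examples 2 and 3 format tablet maps into their error messages via a mapKeys helper that is not shown here. A plausible implementation for the map type used in these functions (a sketch, not necessarily the helper's actual code):

// mapKeys collects the alias keys of a tablet map so they can be
// embedded in diagnostic messages; this is a plausible sketch, not
// necessarily the real helper's implementation.
func mapKeys(m map[topo.TabletAlias]*topo.TabletInfo) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k.String())
	}
	return keys
}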