Example #1
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}
	wg.Wait()

	// then do the master
	recorder.RecordError(wr.slaveWasRestarted(masterTablet, &swrd))
	return recorder.Error()
}
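The shape of Example #1 (fan out one action per tablet, collect every error, inspect them once all goroutines are done) is a reusable Go pattern. Below is a minimal self-contained sketch of it; the errorRecorder type is a stand-in for concurrency.AllErrorRecorder and the tablet names are made up:

package main

import (
	"errors"
	"fmt"
	"sync"
)

// errorRecorder stands in for concurrency.AllErrorRecorder: it
// collects every non-nil error, safely across goroutines.
type errorRecorder struct {
	mu   sync.Mutex
	errs []error
}

func (r *errorRecorder) RecordError(err error) {
	if err == nil {
		return
	}
	r.mu.Lock()
	r.errs = append(r.errs, err)
	r.mu.Unlock()
}

func (r *errorRecorder) Error() error {
	if len(r.errs) == 0 {
		return nil
	}
	return fmt.Errorf("%d error(s), first: %v", len(r.errs), r.errs[0])
}

func main() {
	recorder := &errorRecorder{}
	wg := sync.WaitGroup{}

	// fan out one goroutine per tablet, as the slave loop above does
	for _, name := range []string{"tablet-1", "tablet-2", "tablet-3"} {
		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			if name == "tablet-2" { // simulate one failed restart
				recorder.RecordError(errors.New(name + ": restart failed"))
			}
		}(name)
	}
	wg.Wait()

	// only now, after all workers finished, inspect the errors
	fmt.Println(recorder.Error())
}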
Example #2
func (wr *Wrangler) SnapshotSourceEnd(tabletAlias topo.TabletAlias, slaveStartRequired, readWrite bool, originalType topo.TabletType) (err error) {
	var ti *topo.TabletInfo
	ti, err = wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return
	}

	var actionPath string
	actionPath, err = wr.ai.SnapshotSourceEnd(tabletAlias, &tm.SnapshotSourceEndArgs{slaveStartRequired, !readWrite})
	if err != nil {
		return
	}

	// wait for completion, and save the error
	err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
	if err != nil {
		log.Errorf("SnapshotSourceEnd failed (%v), leaving tablet type alone", err)
		return
	}

	if ti.Tablet.Parent.Uid == topo.NO_TABLET {
		ti.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(wr.ts, ti)
	} else {
		err = wr.ChangeType(ti.Alias(), originalType, false)
	}

	return err
}
Example #3
func (wr *Wrangler) slaveWasRestarted(ti *topo.TabletInfo, swrd *tm.SlaveWasRestartedData) (err error) {
	log.Infof("slaveWasRestarted(%v)", ti.Alias())
	actionPath, err := wr.ai.SlaveWasRestarted(ti.Alias(), swrd)
	if err != nil {
		return err
	}
	return wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
}
Example #4
func (wr *Wrangler) restartSlave(ti *topo.TabletInfo, rsd *tm.RestartSlaveData) (err error) {
	log.Infof("restart slave %v", ti.Alias())
	actionPath, err := wr.ai.RestartSlave(ti.Alias(), rsd)
	if err != nil {
		return err
	}
	return wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
}
Example #5
func (wr *Wrangler) getMasterPosition(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) {
	actionPath, err := wr.ai.MasterPosition(ti.Alias())
	if err != nil {
		return nil, err
	}
	result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	if err != nil {
		return nil, err
	}
	return result.(*mysqlctl.ReplicationPosition), nil
}
Example #6
func (tee *Tee) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (newVersion int, err error) {
	if newVersion, err = tee.primary.UpdateTablet(tablet, existingVersion); err != nil {
		// failed on primary, not updating secondary
		return
	}

	if _, err := tee.secondary.UpdateTablet(tablet, existingVersion); err != nil {
		// not critical enough to fail
		relog.Warning("secondary.UpdateTablet(%v) failed: %v", tablet.Alias(), err)
	}
	return
}
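Example #6 distills to a write-through pattern: fail hard if the primary write fails, log and keep going if the secondary one does. A hedged standalone sketch, using a hypothetical store interface rather than the real topo API:

package main

import (
	"errors"
	"fmt"
	"log"
)

// store is a hypothetical stand-in for a topo server's tablet API.
type store interface {
	UpdateTablet(alias string, version int) (newVersion int, err error)
}

type tee struct {
	primary, secondary store
}

// UpdateTablet mirrors the Tee above: a primary error aborts the
// call, a secondary error is only logged.
func (t *tee) UpdateTablet(alias string, version int) (int, error) {
	newVersion, err := t.primary.UpdateTablet(alias, version)
	if err != nil {
		return 0, err // failed on primary, not updating secondary
	}
	if _, err := t.secondary.UpdateTablet(alias, version); err != nil {
		log.Printf("secondary.UpdateTablet(%v) failed: %v", alias, err)
	}
	return newVersion, nil
}

type mem struct{ failWrites bool }

func (m mem) UpdateTablet(alias string, version int) (int, error) {
	if m.failWrites {
		return 0, errors.New("write failed")
	}
	return version + 1, nil
}

func main() {
	t := &tee{primary: mem{}, secondary: mem{failWrites: true}}
	v, err := t.UpdateTablet("cell-0000000001", 3)
	fmt.Println(v, err) // 4 <nil>; the secondary failure was only logged
}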
Example #7
func (wr *Wrangler) slaveWasPromoted(ti *topo.TabletInfo) error {
	log.Infof("slave was promoted %v", ti.Alias())
	actionPath, err := wr.ai.SlaveWasPromoted(ti.Alias())
	if err != nil {
		return err
	}
	err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
	if err != nil {
		return err
	}
	return nil
}
Example #8
func (wr *Wrangler) demoteMaster(ti *topo.TabletInfo) (*mysqlctl.ReplicationPosition, error) {
	log.Infof("demote master %v", ti.Alias())
	actionPath, err := wr.ai.DemoteMaster(ti.Alias())
	if err != nil {
		return nil, err
	}
	err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
	if err != nil {
		return nil, err
	}
	return wr.getMasterPosition(ti)
}
Example #9
func (wr *Wrangler) promoteSlave(ti *topo.TabletInfo) (rsd *tm.RestartSlaveData, err error) {
	log.Infof("promote slave %v", ti.Alias())
	actionPath, err := wr.ai.PromoteSlave(ti.Alias())
	if err != nil {
		return
	}
	result, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	if err != nil {
		return
	}
	rsd = result.(*tm.RestartSlaveData)
	return
}
Example #10
func (wr *Wrangler) ExecuteTabletInfoHook(ti *topo.TabletInfo, hook *hk.Hook) (hookResult *hk.HookResult, err error) {

	actionPath, err := wr.ai.ExecuteHook(ti.Alias(), hook)
	if err != nil {
		return nil, err
	}

	var hr interface{}
	if hr, err = wr.ai.WaitForCompletionReply(actionPath, 10*time.Minute); err != nil {
		return nil, err
	}
	return hr.(*hk.HookResult), nil
}
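One caveat on the return line of Example #10: hr.(*hk.HookResult) panics if the reply ever carries a different concrete type. The comma-ok form turns that into an ordinary error; a small sketch with placeholder types standing in for the hk package:

package main

import "fmt"

// hookResult is a placeholder for hk.HookResult.
type hookResult struct{ ExitStatus int }

// waitForReply stands in for wr.ai.WaitForCompletionReply.
func waitForReply() (interface{}, error) {
	return &hookResult{ExitStatus: 0}, nil
}

func main() {
	reply, err := waitForReply()
	if err != nil {
		panic(err)
	}
	// comma-ok assertion: a wrong concrete type becomes a normal
	// error path instead of a runtime panic
	hr, ok := reply.(*hookResult)
	if !ok {
		fmt.Printf("unexpected reply type %T\n", reply)
		return
	}
	fmt.Println("exit status:", hr.ExitStatus)
}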
Example #11
func (zkts *Server) UpdateTablet(tablet *topo.TabletInfo, existingVersion int) (int, error) {
	zkTabletPath := TabletPathForAlias(tablet.Alias())
	stat, err := zkts.zconn.Set(zkTabletPath, tablet.Json(), existingVersion)
	if err != nil {
		if zookeeper.IsError(err, zookeeper.ZBADVERSION) {
			err = topo.ErrBadVersion
		} else if zookeeper.IsError(err, zookeeper.ZNONODE) {
			err = topo.ErrNoNode
		}

		return 0, err
	}
	return stat.Version(), nil
}
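The Set call in Example #11 is a compare-and-set: the write only lands if existingVersion still matches, and losing a race surfaces as topo.ErrBadVersion. Callers usually wrap that in a read-modify-retry loop; a sketch of that loop against a hypothetical in-memory store (not the real topo.Server interface):

package main

import (
	"errors"
	"fmt"
)

var errBadVersion = errors.New("bad version") // stand-in for topo.ErrBadVersion

// versionedStore is a hypothetical compare-and-set API in the same
// shape as UpdateTablet above.
type versionedStore interface {
	Get(key string) (value string, version int, err error)
	Set(key, value string, version int) (newVersion int, err error)
}

// updateWithRetry re-reads and re-applies the mutation whenever a
// concurrent writer bumped the version first.
func updateWithRetry(s versionedStore, key string, mutate func(string) string) error {
	for attempt := 0; attempt < 5; attempt++ {
		value, version, err := s.Get(key)
		if err != nil {
			return err
		}
		if _, err := s.Set(key, mutate(value), version); err == nil {
			return nil
		} else if !errors.Is(err, errBadVersion) {
			return err
		}
		// lost the race: loop, read the new version, try again
	}
	return fmt.Errorf("update of %v kept losing races", key)
}

type memStore struct {
	value   string
	version int
}

func (m *memStore) Get(key string) (string, int, error) { return m.value, m.version, nil }

func (m *memStore) Set(key, value string, version int) (int, error) {
	if version != m.version {
		return 0, errBadVersion
	}
	m.value, m.version = value, m.version+1
	return m.version, nil
}

func main() {
	s := &memStore{value: "spare", version: 7}
	if err := updateWithRetry(s, "tablet", func(string) string { return "master" }); err != nil {
		panic(err)
	}
	fmt.Println(s.value, s.version) // master 8
}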
Example #12
func (wr *Wrangler) reparentShardExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {
	// we fix the new master in the replication graph
	err := wr.slaveWasPromoted(masterElectTablet)
	if err != nil {
		// This suggests that the master-elect is dead. This is bad.
		return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err)
	}

	// Once the slave is promoted, remove it from our maps
	delete(slaveTabletMap, masterElectTablet.Alias())
	delete(masterTabletMap, masterElectTablet.Alias())

	// then fix all the slaves, including the old master
	return wr.restartSlavesExternal(slaveTabletMap, masterTabletMap, masterElectTablet, scrapStragglers, acceptSuccessPercents)
}
Example #13
// ExecuteOptionalTabletInfoHook executes a hook and returns an error only
// if the hook failed, not if the hook doesn't exist.
func (wr *Wrangler) ExecuteOptionalTabletInfoHook(ti *topo.TabletInfo, hook *hk.Hook) (err error) {
	hr, err := wr.ExecuteTabletInfoHook(ti, hook)
	if err != nil {
		return err
	}

	if hr.ExitStatus == hk.HOOK_DOES_NOT_EXIST {
		log.Infof("Hook %v doesn't exist on tablet %v", hook.Name, ti.Alias())
		return nil
	}

	if hr.ExitStatus != hk.HOOK_SUCCESS {
		return fmt.Errorf("Hook %v failed(%v): %v", hook.Name, hr.ExitStatus, hr.Stderr)
	}

	return nil
}
Example #14
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
		} else {
			log.Infof("marking master-elect read-write %v", masterElect.Alias())
			actionPath, err := wr.ai.SetReadWrite(masterElect.Alias())
			if err == nil {
				err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
			}
			if err != nil {
				log.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
			}
		}
	} else {
		log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
	}

	// save the new master in the shard info
	si.MasterAlias = masterElect.Alias()
	if err := wr.ts.UpdateShard(si); err != nil {
		log.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	log.Infof("rebuilding shard serving graph data")
	return wr.rebuildShard(masterElect.Keyspace, masterElect.Shard, nil)
}
Example #15
func (wr *Wrangler) reparentShardExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {

	// we fix the new master in the replication graph
	err := wr.slaveWasPromoted(masterElectTablet)
	if err != nil {
		// This suggests that the master-elect is dead. This is bad.
		return fmt.Errorf("slaveWasPromoted(%v) failed: %v", masterElectTablet, err)
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias())

	// then fix all the slaves, including the old master
	err = wr.restartSlavesExternal(slaveTabletMap, masterTablet, masterElectTablet, scrapStragglers, acceptSuccessPercents)
	if err != nil {
		return err
	}

	// and rebuild the shard graph.  We rebuild all the cells, as
	// we may have taken tablets in and out of the graph.
	log.Infof("rebuilding shard serving graph data")
	return wr.rebuildShard(masterElectTablet.Keyspace, masterElectTablet.Shard, nil)
}
Example #16
func (wr *Wrangler) finishReparent(oldMaster, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
		} else {
			log.Infof("marking master-elect read-write %v", masterElect.Alias())
			actionPath, err := wr.ai.SetReadWrite(masterElect.Alias())
			if err == nil {
				err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
			}
			if err != nil {
				log.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
			}
		}
	} else {
		log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias())
	}

	// we can't be smart and just do the old and new master cells,
	// as we export master record everywhere.
	log.Infof("rebuilding shard serving graph data")
	return wr.rebuildShard(masterElect.Keyspace, masterElect.Shard, nil)
}
Example #17
func (wr *Wrangler) reparentShardExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool) error {

	// we fix the new master in the replication graph
	err := wr.slaveWasPromoted(masterElectTablet)
	if err != nil {
		// This suggests that the master-elect is dead. This is bad.
		return fmt.Errorf("slaveWasPromoted failed: %v", err, masterTablet.Alias())
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias())

	// then fix all the slaves, including the old master
	err = wr.restartSlavesExternal(slaveTabletMap, masterTablet, masterElectTablet, scrapStragglers)
	if err != nil {
		return err
	}

	// and rebuild the shard graph.
	// we can't be smart and just do the old and new master cells,
	// as we export master record everywhere.
	relog.Info("rebuilding shard serving graph data")
	return wr.rebuildShard(masterElectTablet.Keyspace, masterElectTablet.Shard, nil)
}
Example #18
func (wr *Wrangler) breakReplication(slaveMap map[topo.TabletAlias]*topo.TabletInfo, masterElect *topo.TabletInfo) error {
	// We are forcing a reparenting. Make sure that all slaves stop so
	// no data is accidentally replicated through before we call RestartSlave.
	log.Infof("stop slaves %v", masterElect.Alias())
	err := wr.stopSlaves(slaveMap)
	if err != nil {
		return err
	}

	// Force slaves to break, just in case they were not advertised in
	// the replication graph.
	log.Infof("break slaves %v", masterElect.Alias())
	actionPath, err := wr.ai.BreakSlaves(masterElect.Alias())
	if err == nil {
		err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
	}
	return err
}
Example #19
// TODO(alainjobart) keep a cache of rpcClient by tabletAlias
func (ai *ActionInitiator) rpcCallTablet(tablet *topo.TabletInfo, name string, args, reply interface{}, waitTime time.Duration) error {

	// create the RPC client, using waitTime as the connect
	// timeout, and starting the overall timeout as well
	timer := time.After(waitTime)
	rpcClient, err := bsonrpc.DialHTTP("tcp", tablet.Addr, waitTime)
	if err != nil {
		return fmt.Errorf("RPC error for %v: %v", tablet.Alias(), err.Error())
	}
	defer rpcClient.Close()

	// do the call in the remaining time
	call := rpcClient.Go("TabletManager."+name, args, reply, nil)
	select {
	case <-timer:
		return fmt.Errorf("Timeout waiting for TabletManager.%v to %v", name, tablet.Alias())
	case <-call.Done:
		if call.Error != nil {
			return fmt.Errorf("Remote error for %v: %v", tablet.Alias(), call.Error.Error())
		} else {
			return nil
		}
	}
}
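rpcCallTablet starts its deadline timer before dialing, so connection setup and the call itself share one time budget. The two-channel select it ends with is a standard Go timeout shape; a self-contained sketch with a fake asynchronous call in place of bsonrpc:

package main

import (
	"fmt"
	"time"
)

// callAsync stands in for rpcClient.Go: it runs the work in the
// background and delivers the result on a channel.
func callAsync(work func() error) chan error {
	done := make(chan error, 1)
	go func() { done <- work() }()
	return done
}

func callWithTimeout(work func() error, waitTime time.Duration) error {
	// start the overall timer first, as rpcCallTablet does, so any
	// setup cost counts against the same budget as the call
	timer := time.After(waitTime)
	done := callAsync(work)
	select {
	case <-timer:
		return fmt.Errorf("timeout after %v", waitTime)
	case err := <-done:
		return err
	}
}

func main() {
	slow := func() error { time.Sleep(time.Second); return nil }
	fmt.Println(callWithTimeout(slow, 100*time.Millisecond)) // timeout after 100ms
}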
Example #20
// Assume the master is dead and not coming back. Just push your way
// forward.  Force means we are reparenting to the same master
// (assuming the data has been externally synched).
func (wr *Wrangler) reparentShardBrutal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, failedMaster, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly, force bool) error {
	log.Infof("Skipping ValidateShard - not a graceful situation")

	if _, ok := slaveTabletMap[masterElectTablet.Alias()]; !ok && !force {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias(), failedMaster.Keyspace, failedMaster.Shard, mapKeys(slaveTabletMap))
	}

	// Check the master-elect and slaves are in good shape when the action
	// has not been forced.
	if !force {
		// Make sure all tablets have the right parent and reasonable positions.
		if err := wr.checkSlaveReplication(slaveTabletMap, topo.NO_TABLET); err != nil {
			return err
		}

		// Check the master-elect is fit for duty - call out for hardware checks.
		if err := wr.checkMasterElect(masterElectTablet); err != nil {
			return err
		}

		log.Infof("check slaves %v/%v", masterElectTablet.Keyspace, masterElectTablet.Shard)
		restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
		err := wr.checkSlaveConsistency(restartableSlaveTabletMap, nil)
		if err != nil {
			return err
		}
	} else {
		log.Infof("forcing reparent to same master %v", masterElectTablet.Alias())
		err := wr.breakReplication(slaveTabletMap, masterElectTablet)
		if err != nil {
			return err
		}
	}

	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v %v", err, masterElectTablet.Alias())
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias())

	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	if !force {
		log.Infof("scrap dead master %v", failedMaster.Alias())
		// The master is dead so execute the action locally instead of
		// enqueing the scrap action for an arbitrary amount of time.
		if scrapErr := tm.Scrap(wr.ts, failedMaster.Alias(), false); scrapErr != nil {
			log.Warningf("scrapping failed master failed: %v", scrapErr)
		}
	}

	err = wr.finishReparent(failedMaster, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}
Example #21
func (tee *Tee) UpdateTablet(tablet *topo.TabletInfo, existingVersion int64) (newVersion int64, err error) {
	if newVersion, err = tee.primary.UpdateTablet(tablet, existingVersion); err != nil {
		// failed on primary, not updating secondary
		return
	}

	// if we have a mapping between tablet version in first topo
	// and tablet version in second topo, replace the version number.
	// if not, this will probably fail and log.
	tee.mu.Lock()
	tvm, ok := tee.tabletVersionMapping[tablet.Alias()]
	if ok && tvm.readFromVersion == existingVersion {
		existingVersion = tvm.readFromSecondVersion
		delete(tee.tabletVersionMapping, tablet.Alias())
	}
	tee.mu.Unlock()
	if newVersion2, serr := tee.secondary.UpdateTablet(tablet, existingVersion); serr != nil {
		// not critical enough to fail
		if serr == topo.ErrNoNode {
			// the tablet doesn't exist on the secondary, let's
			// just create it
			if serr = tee.secondary.CreateTablet(tablet.Tablet); serr != nil {
				log.Warningf("secondary.CreateTablet(%v) failed (after UpdateTablet returned ErrNoNode): %v", tablet.Alias(), serr)
			} else {
				log.Infof("secondary.UpdateTablet(%v) failed with ErrNoNode, CreateTablet then worked.", tablet.Alias())
				ti, gerr := tee.secondary.GetTablet(tablet.Alias())
				if gerr != nil {
					log.Warningf("Failed to re-read tablet(%v) after creating it on secondary: %v", tablet.Alias(), gerr)
				} else {
					tee.mu.Lock()
					tee.tabletVersionMapping[tablet.Alias()] = tabletVersionMapping{
						readFromVersion:       newVersion,
						readFromSecondVersion: ti.Version(),
					}
					tee.mu.Unlock()
				}
			}
		} else {
			log.Warningf("secondary.UpdateTablet(%v) failed: %v", tablet.Alias(), serr)
		}
	} else {
		tee.mu.Lock()
		tee.tabletVersionMapping[tablet.Alias()] = tabletVersionMapping{
			readFromVersion:       newVersion,
			readFromSecondVersion: newVersion2,
		}
		tee.mu.Unlock()
	}
	return
}
Example #22
// forceMasterSnapshot: Normally a master is not a viable tablet to snapshot.
// However, there are degenerate cases where you need to override this, for
// instance the initial clone of a new master.
func (wr *Wrangler) Snapshot(tabletAlias topo.TabletAlias, forceMasterSnapshot bool, snapshotConcurrency int, serverMode bool) (manifest string, parent topo.TabletAlias, slaveStartRequired, readOnly bool, originalType topo.TabletType, err error) {
	var ti *topo.TabletInfo
	ti, err = wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return
	}

	originalType = ti.Tablet.Type

	if ti.Tablet.Type == topo.TYPE_MASTER && forceMasterSnapshot {
		// In this case, we don't bother recomputing the serving graph.
		// All queries will have to fail anyway.
		log.Infof("force change type master -> backup: %v", tabletAlias)
		// There is a legitimate reason to force in the case of a single
		// master.
		ti.Tablet.Type = topo.TYPE_BACKUP
		err = topo.UpdateTablet(wr.ts, ti)
	} else {
		err = wr.ChangeType(ti.Alias(), topo.TYPE_BACKUP, false)
	}

	if err != nil {
		return
	}

	var actionPath string
	actionPath, err = wr.ai.Snapshot(tabletAlias, &tm.SnapshotArgs{snapshotConcurrency, serverMode})
	if err != nil {
		return
	}

	// wait for completion, and save the error
	results, actionErr := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	var reply *tm.SnapshotReply
	newType := originalType
	if actionErr != nil {
		log.Errorf("snapshot failed, still restoring tablet type: %v", actionErr)
		reply = &tm.SnapshotReply{}
	} else {
		reply = results.(*tm.SnapshotReply)
		tm.BackfillAlias(reply.ZkParentPath, &reply.ParentAlias)
		if serverMode {
			log.Infof("server mode specified, switching tablet to snapshot_source mode")
			newType = topo.TYPE_SNAPSHOT_SOURCE
		}
	}

	// Go back to original type, or go to SNAPSHOT_SOURCE
	log.Infof("change type after snapshot: %v %v", tabletAlias, newType)
	if ti.Tablet.Parent.Uid == topo.NO_TABLET && forceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE {
		log.Infof("force change type backup -> master: %v", tabletAlias)
		ti.Tablet.Type = topo.TYPE_MASTER
		err = topo.UpdateTablet(wr.ts, ti)
	} else {
		err = wr.ChangeType(ti.Alias(), newType, false)
	}
	if err != nil {
		// failure in changing the topology type is probably worse,
		// so returning that (we logged actionErr anyway)
		return
	}
	return reply.ManifestPath, reply.ParentAlias, reply.SlaveStartRequired, reply.ReadOnly, originalType, actionErr
}
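The tail of Snapshot follows a run-then-restore discipline: remember the action's error, restore the tablet type no matter what, and let a restore failure outrank the original one in the return value. A minimal sketch of just that control flow (function names are illustrative):

package main

import (
	"errors"
	"fmt"
)

// runThenRestore mirrors the end of Snapshot above: state is restored
// whether or not the action succeeded, and a failed restore is
// reported in preference to the action's own error.
func runThenRestore(action, restore func() error) error {
	actionErr := action() // remembered, not returned yet
	if err := restore(); err != nil {
		return err // restoring state failed: that is the worse problem
	}
	return actionErr
}

func main() {
	fail := func() error { return errors.New("snapshot failed") }
	ok := func() error { return nil }
	fmt.Println(runThenRestore(fail, ok)) // snapshot failed
	fmt.Println(runThenRestore(ok, ok))   // <nil>
}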
Example #23
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}
	wg.Wait()

	// then do the old master if it hadn't been scrapped
	if masterTablet != nil {
		err := wr.slaveWasRestarted(masterTablet, &swrd)
		if err != nil {
			recorder.RecordError(err)
			// the old master can be annoying if left
			// around in the replication graph, so if we
			// can't restart it, we just scrap it
			log.Warningf("Old master %v is not restarting, scrapping it", masterTablet.Alias())
			if _, err := wr.Scrap(masterTablet.Alias(), true /*force*/, true /*skipRebuild*/); err != nil {
				log.Warningf("Failed to scrap old master %v: %v", masterTablet.Alias(), err)
			}
		}
	}

	// check the toplevel replication paths only contains the new master,
	// try to remove any old tablet aliases that don't make sense anymore
	toplevelAliases, err := wr.ts.GetReplicationPaths(masterElectTablet.Keyspace, masterElectTablet.Shard, "")
	if err != nil {
		log.Warningf("GetReplicationPaths() failed, cannot fix extra paths: %v", err)
	} else {
		for _, toplevelAlias := range toplevelAliases {
			if toplevelAlias == masterElectTablet.Alias() {
				continue
			}

			// if we can't read the tablet, or if it's not in the
			// replication graph, we remove the entry.
			if ti, err := wr.ts.GetTablet(toplevelAlias); err == nil && ti.Tablet.IsInReplicationGraph() {
				// we can read the entry and it belongs here,
				// keep it
				continue
			}

			log.Infof("Removing stale replication path %v", toplevelAlias.String())
			if err := wr.ts.DeleteReplicationPath(masterElectTablet.Keyspace, masterElectTablet.Shard, toplevelAlias.String()); err != nil {
				log.Warningf("DeleteReplicationPath(%v) failed: %v", toplevelAlias.String(), err)
			}
		}
	}

	if !recorder.HasErrors() {
		return nil
	}

	// report errors only above a threshold
	failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1)
	if failurePercent < 100-acceptSuccessPercents {
		log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error())
		return nil
	}

	return recorder.Error()
}
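The threshold arithmetic at the end deserves a worked example: the denominator counts every slave plus the old master, and acceptSuccessPercents says how much of that population must succeed before the caller sees no error. A small standalone sketch of the same check (names are illustrative):

package main

import "fmt"

// tolerable reports whether errCount failures out of slaveCount
// slaves plus one old master stay under the accepted threshold,
// mirroring the check in restartSlavesExternal.
func tolerable(errCount, slaveCount, acceptSuccessPercents int) bool {
	failurePercent := 100 * errCount / (slaveCount + 1)
	return failurePercent < 100-acceptSuccessPercents
}

func main() {
	// 9 slaves + 1 old master = 10 tablets, 80% success accepted:
	// 1 failure is 10% and is tolerated, 2 failures hit the 20% cap.
	fmt.Println(tolerable(1, 9, 80)) // true
	fmt.Println(tolerable(2, 9, 80)) // false
}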
Example #24
func (wr *Wrangler) reparentShardGraceful(si *topo.ShardInfo, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, leaveMasterReadOnly bool) error {
	// Validate a bunch of assumptions we make about the replication graph.
	if len(masterTabletMap) != 1 {
		aliases := make([]string, 0, len(masterTabletMap))
		for _, v := range masterTabletMap {
			aliases = append(aliases, v.String())
		}
		return fmt.Errorf("I have 0 or multiple masters / scrapped tablets in this shard replication graph, please scrap the non-master ones: %v", strings.Join(aliases, " "))
	}
	var masterTablet *topo.TabletInfo
	for _, v := range masterTabletMap {
		masterTablet = v
	}

	if masterTablet.Parent.Uid != topo.NO_TABLET {
		return fmt.Errorf("master tablet should not have a ParentUid: %v %v", masterTablet.Parent.Uid, masterTablet.Alias())
	}

	if masterTablet.Type != topo.TYPE_MASTER {
		return fmt.Errorf("master tablet should not be type: %v %v", masterTablet.Type, masterTablet.Alias())
	}

	if masterTablet.Uid == masterElectTablet.Uid {
		return fmt.Errorf("master tablet should not match master elect - this must be forced: %v", masterTablet.Alias())
	}

	if _, ok := slaveTabletMap[masterElectTablet.Alias()]; !ok {
		return fmt.Errorf("master elect tablet not in replication graph %v %v/%v %v", masterElectTablet.Alias(), masterTablet.Keyspace, masterTablet.Shard, mapKeys(slaveTabletMap))
	}

	if err := wr.ValidateShard(masterTablet.Keyspace, masterTablet.Shard, true); err != nil {
		return fmt.Errorf("ValidateShard verification failed: %v, if the master is dead, run: vtctl ScrapTablet -force %v", err, masterTablet.Alias())
	}

	// Make sure all tablets have the right parent and reasonable positions.
	err := wr.checkSlaveReplication(slaveTabletMap, masterTablet.Uid)
	if err != nil {
		return err
	}

	// Check the master-elect is fit for duty - call out for hardware checks.
	err = wr.checkMasterElect(masterElectTablet)
	if err != nil {
		return err
	}

	masterPosition, err := wr.demoteMaster(masterTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master is dead and we
		// need to take steps. We could either pop a prompt, or make
		// retrying the action painless.
		return fmt.Errorf("demote master failed: %v, if the master is dead, run: vtctl -force ScrapTablet %v", err, masterTablet.Alias())
	}

	log.Infof("check slaves %v/%v", masterTablet.Keyspace, masterTablet.Shard)
	restartableSlaveTabletMap := restartableTabletMap(slaveTabletMap)
	err = wr.checkSlaveConsistency(restartableSlaveTabletMap, masterPosition)
	if err != nil {
		return fmt.Errorf("check slave consistency failed %v, demoted master is still read only, run: vtctl SetReadWrite %v", err, masterTablet.Alias())
	}

	rsd, err := wr.promoteSlave(masterElectTablet)
	if err != nil {
		// FIXME(msolomon) This suggests that the master-elect is dead.
		// We need to classify certain errors as temporary and retry.
		return fmt.Errorf("promote slave failed: %v, demoted master is still read only: vtctl SetReadWrite %v", err, masterTablet.Alias())
	}

	// Once the slave is promoted, remove it from our map
	delete(slaveTabletMap, masterElectTablet.Alias())

	majorityRestart, restartSlaveErr := wr.restartSlaves(slaveTabletMap, rsd)

	// For now, scrap the old master regardless of how many
	// slaves restarted.
	//
	// FIXME(msolomon) We could reintroduce it and reparent it and use
	// it as new replica.
	log.Infof("scrap demoted master %v", masterTablet.Alias())
	scrapActionPath, scrapErr := wr.ai.Scrap(masterTablet.Alias())
	if scrapErr == nil {
		scrapErr = wr.ai.WaitForCompletion(scrapActionPath, wr.actionTimeout())
	}
	if scrapErr != nil {
		// The sub action is non-critical, so just warn.
		log.Warningf("scrap demoted master failed: %v", scrapErr)
	}

	err = wr.finishReparent(si, masterElectTablet, majorityRestart, leaveMasterReadOnly)
	if err != nil {
		return err
	}

	if restartSlaveErr != nil {
		// This is more of a warning at this point.
		return restartSlaveErr
	}

	return nil
}
Example #25
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// The following two blocks of actions are very likely to time
	// out for some tablets (one random guy is dead, the old
	// master is dead, ...). We execute them all in parallel until
	// we get to wr.actionTimeout(). After this, no other action
	// with a timeout is executed, so even if we got to the
	// timeout, we're still good.
	log.Infof("Making sure all tablets have the right master:")

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}

	// and do the old master and any straggler, if possible, but
	// do not record errors for these
	for _, ti := range masterTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			err := wr.slaveWasRestarted(ti, &swrd)
			if err != nil {
				// the old master can be annoying if left
				// around in the replication graph, so if we
				// can't restart it, we just scrap it.
				// We don't rebuild the Shard just yet though.
				log.Warningf("Old master %v is not restarting, scrapping it: %v", ti.Alias(), err)
				if _, err := wr.Scrap(ti.Alias(), true /*force*/, true /*skipRebuild*/); err != nil {
					log.Warningf("Failed to scrap old master %v: %v", ti.Alias(), err)
				}
			}
			wg.Done()
		}(ti)
	}
	wg.Wait()

	if !recorder.HasErrors() {
		return nil
	}

	// report errors only above a threshold
	failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1)
	if failurePercent < 100-acceptSuccessPercents {
		log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error())
		return nil
	}

	return recorder.Error()
}