Exemple #1
0
func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error {
	// Remove tablet from the replication graph if this is not already the master.
	if tablet.Parent.Uid != topo.NO_TABLET {
		if err := topo.DeleteTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNoNode {
			return err
		}
	}

	// Update tablet regardless - trend towards consistency.
	tablet.State = topo.STATE_READ_WRITE
	tablet.Type = topo.TYPE_MASTER
	tablet.Parent.Cell = ""
	tablet.Parent.Uid = topo.NO_TABLET
	err := topo.UpdateTablet(ts, tablet)
	if err != nil {
		return err
	}
	// NOTE(msolomon) A serving graph update is required, but in
	// order for the shard to be consistent the old master must be
	// scrapped first. That is externally coordinated by the
	// wrangler reparent action.

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.CreateTabletReplicationData(ts, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}

	return nil
}
Exemple #2
0
func (ta *TabletActor) SlaveWasRestarted(actionNode *ActionNode, masterAddr string) error {
	swrd := actionNode.args.(*SlaveWasRestartedData)

	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}

	// Remove tablet from the replication graph.
	if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode {
		// FIXME(alainjobart) once we don't have replication paths
		// any more, remove this extra check
		if err == topo.ErrNotEmpty {
			log.Infof("Failed to delete master replication path, will be caught later")
		} else {
			return err
		}
	}

	// now we can check the reparent actually worked
	if masterAddr == "" {
		masterAddr, err = ta.mysqlDaemon.GetMasterAddr()
		if err != nil {
			return err
		}
	}
	if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr {
		log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr)
		if swrd.ScrapStragglers {
			return Scrap(ta.ts, tablet.Alias(), false)
		} else {
			return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr)
		}
	}

	// Once this action completes, update authoritive tablet node first.
	tablet.Parent = swrd.Parent
	if tablet.Type == topo.TYPE_MASTER {
		tablet.Type = topo.TYPE_SPARE
		tablet.State = topo.STATE_READ_ONLY
	}
	err = topo.UpdateTablet(ta.ts, tablet)
	if err != nil {
		return err
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}

	return nil
}
Exemple #3
0
// Scrap will update the tablet type to 'Scrap', and remove it from
// the serving graph.
//
// 'force' means we are not on the tablet being scrapped, so it is
// probably dead. So if 'force' is true, we will also remove pending
// remote actions.  And if 'force' is false, we also run an optional
// hook.
func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// If you are already scrap, skip updating replication data. It won't
	// be there anyway.
	wasAssigned := tablet.IsAssigned()
	tablet.Type = topo.TYPE_SCRAP
	tablet.Parent = topo.TabletAlias{}
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(ts, tablet)
	if err != nil {
		return err
	}

	// Remove any pending actions. Presumably forcing a scrap
	// means you don't want the agent doing anything and the
	// machine requires manual attention.
	if force {
		err := ts.PurgeTabletActions(tabletAlias, actionnode.ActionNodeCanBePurged)
		if err != nil {
			log.Warningf("purge actions failed: %v", err)
		}
	}

	if wasAssigned {
		err = topo.DeleteTabletReplicationData(ts, tablet.Tablet)
		if err != nil {
			if err == topo.ErrNoNode {
				log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell)
				err = nil
			}
			if err != nil {
				log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err)
			}
		}
	}

	// run a hook for final cleanup, only in non-force mode.
	// (force mode executes on the vtctl side, not on the vttablet side)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		ConfigureTabletHook(hk, tablet.Alias)
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// we don't want to return an error, the server
			// is already in bad shape probably.
			log.Warningf("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}

	return nil
}
Exemple #4
0
// UnreserveForRestore switches the tablet back to its original state,
// the restore won't happen.
func (wr *Wrangler) UnreserveForRestore(dstTabletAlias topo.TabletAlias) (err error) {
	tablet, err := wr.ts.GetTablet(dstTabletAlias)
	if err != nil {
		return err
	}
	err = topo.DeleteTabletReplicationData(wr.ts, tablet.Tablet)
	if err != nil {
		return err
	}

	return wr.ChangeType(tablet.Alias, topo.TYPE_IDLE, false)
}
Exemple #5
0
// DeleteTablet removes a tablet record from the topology:
// - the replication data record if any
// - the tablet record
func DeleteTablet(ctx context.Context, ts topo.Server, tablet *topodatapb.Tablet) error {
	// try to remove replication data, no fatal if we fail
	if err := topo.DeleteTabletReplicationData(ctx, ts, tablet); err != nil {
		if err == topo.ErrNoNode {
			log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell)
			err = nil
		}
		if err != nil {
			log.Warningf("remove replication data for %v failed: %v", topoproto.TabletAliasString(tablet.Alias), err)
		}
	}

	// then delete the tablet record
	return ts.DeleteTablet(ctx, tablet.Alias)
}
Exemple #6
0
func (ta *TabletActor) slaveWasRestarted(actionNode *ActionNode) error {
	swrd := actionNode.args.(*SlaveWasRestartedData)

	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}

	// Remove tablet from the replication graph.
	if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet, tablet.ReplicationPath()); err != nil && err != topo.ErrNoNode {
		return err
	}

	// now we can check the reparent actually worked
	masterAddr, err := ta.mysqld.GetMasterAddr()
	if err != nil {
		return err
	}
	if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr {
		log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr)
		if swrd.ScrapStragglers {
			return Scrap(ta.ts, tablet.Alias(), false)
		} else {
			return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr)
		}
	}

	// Once this action completes, update authoritive tablet node first.
	tablet.Parent = swrd.Parent
	if tablet.Type == topo.TYPE_MASTER {
		tablet.Type = topo.TYPE_SPARE
		tablet.State = topo.STATE_READ_ONLY
	}
	err = topo.UpdateTablet(ta.ts, tablet)
	if err != nil {
		return err
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}

	return nil
}
Exemple #7
0
// Scrap will update the tablet type to 'Scrap', and remove it from
// the serving graph.
//
// 'force' means we are not on the tablet being scrapped, so it is
// probably dead. So if 'force' is true, we will also remove pending
// remote actions.  And if 'force' is false, we also run an optional
// hook.
func Scrap(ctx context.Context, ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}

	// If you are already scrap, skip updating replication data. It won't
	// be there anyway.
	wasAssigned := tablet.IsAssigned()
	tablet.Type = topo.TYPE_SCRAP
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(ctx, ts, tablet)
	if err != nil {
		return err
	}

	if wasAssigned {
		err = topo.DeleteTabletReplicationData(ctx, ts, tablet.Tablet)
		if err != nil {
			if err == topo.ErrNoNode {
				log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell)
				err = nil
			}
			if err != nil {
				log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err)
			}
		}
	}

	// run a hook for final cleanup, only in non-force mode.
	// (force mode executes on the vtctl side, not on the vttablet side)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		ConfigureTabletHook(hk, tablet.Alias)
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// we don't want to return an error, the server
			// is already in bad shape probably.
			log.Warningf("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}

	return nil
}
Exemple #8
0
func (ta *TabletActor) restartSlave(actionNode *actionnode.ActionNode) error {
	rsd := actionNode.Args.(*actionnode.RestartSlaveData)

	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}

	// If this check fails, we seem reparented. The only part that
	// could have failed is the insert in the replication
	// graph. Do NOT try to reparent again. That will either wedge
	// replication or corrupt data.
	if tablet.Parent != rsd.Parent {
		log.V(6).Infof("restart with new parent")
		// Remove tablet from the replication graph.
		if err = topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode {
			return err
		}

		// Move a lag slave into the orphan lag type so we can safely ignore
		// this reparenting until replication catches up.
		if tablet.Type == topo.TYPE_LAG {
			tablet.Type = topo.TYPE_LAG_ORPHAN
		} else {
			err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
			if err != nil {
				return err
			}
		}
		// Once this action completes, update authoritive tablet node first.
		tablet.Parent = rsd.Parent
		err = topo.UpdateTablet(ta.ts, tablet)
		if err != nil {
			return err
		}
	} else if rsd.Force {
		err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
		if err != nil {
			return err
		}
		// Complete the special orphan accounting.
		if tablet.Type == topo.TYPE_LAG_ORPHAN {
			tablet.Type = topo.TYPE_LAG
			err = topo.UpdateTablet(ta.ts, tablet)
			if err != nil {
				return err
			}
		}
	} else {
		// There is nothing to safely reparent, so check replication. If
		// either replication thread is not running, report an error.
		replicationPos, err := ta.mysqld.SlaveStatus()
		if err != nil {
			return fmt.Errorf("cannot verify replication for slave: %v", err)
		}
		if replicationPos.SecondsBehindMaster == myproto.InvalidLagSeconds {
			return fmt.Errorf("replication not running for slave")
		}
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}

	return nil
}
Exemple #9
0
// Make this external, since in needs to be forced from time to time.
func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// If you are already scrap, skip deleting the path. It won't
	// be correct since the Parent will be cleared already.
	wasAssigned := tablet.IsAssigned()
	replicationPath := ""
	if wasAssigned {
		replicationPath = tablet.ReplicationPath()
	}

	wasMaster := tablet.Parent.IsZero()
	tablet.Type = topo.TYPE_SCRAP
	tablet.Parent = topo.TabletAlias{}
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(ts, tablet)
	if err != nil {
		return err
	}

	// Remove any pending actions. Presumably forcing a scrap means you don't
	// want the agent doing anything and the machine requires manual attention.
	if force {
		err := ts.PurgeTabletActions(tabletAlias, ActionNodeCanBePurged)
		if err != nil {
			log.Warningf("purge actions failed: %v", err)
		}
	}

	if wasAssigned {
		err = topo.DeleteTabletReplicationData(ts, tablet.Tablet, replicationPath)
		if err != nil {
			switch err {
			case topo.ErrNoNode:
				log.V(6).Infof("no replication path: %v", replicationPath)
				err = nil
			case topo.ErrNotEmpty:
				// If you are forcing the scrapping of a master, you can't update the
				// replication graph yet, since other nodes are still under the impression
				// they are slaved to this tablet.
				// If the node was not empty, we can't do anything about it - the replication
				// graph needs to be fixed by reparenting. If the action was forced, assume
				// the user knows best and squelch the error.
				if wasMaster && force {
					err = nil
				}
			}
			if err != nil {
				log.Warningf("remove replication path failed: %v %v", replicationPath, err)
			}
		}
	}

	// run a hook for final cleanup, only in non-force mode.
	// (force mode executes on the vtctl side, not on the vttablet side)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		configureTabletHook(hk, tablet.Alias())
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// we don't want to return an error, the server
			// is already in bad shape probably.
			log.Warningf("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}

	return nil
}
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error {
	recorder := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	swrd := tm.SlaveWasRestartedData{
		Parent:               masterElectTablet.Alias(),
		ExpectedMasterAddr:   masterElectTablet.MysqlAddr,
		ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr,
		ScrapStragglers:      scrapStragglers,
	}

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			recorder.RecordError(wr.slaveWasRestarted(ti, &swrd))
			wg.Done()
		}(ti)
	}
	wg.Wait()

	// then do the old master if it hadn't been scrapped
	if masterTablet != nil {
		err := wr.slaveWasRestarted(masterTablet, &swrd)
		if err != nil {
			recorder.RecordError(err)
			// the old master can be annoying if left
			// around in the replication graph, so if we
			// can't restart it, we just scrap it
			log.Warningf("Old master %v is not restarting, scrapping it", masterTablet.Alias())
			if _, err := wr.Scrap(masterTablet.Alias(), true /*force*/, true /*skipRebuild*/); err != nil {
				log.Warningf("Failed to scrap old master %v: %v", masterTablet.Alias(), err)
			}
		}
	}

	// check the toplevel replication paths only contains the new master,
	// try to remove any old tablet aliases that don't make sense anymore
	toplevelAliases, err := wr.ts.GetReplicationPaths(masterElectTablet.Keyspace, masterElectTablet.Shard, "")
	if err != nil {
		log.Warningf("GetReplicationPaths() failed, cannot fix extra paths: %v", err)
	} else {
		for _, toplevelAlias := range toplevelAliases {
			if toplevelAlias == masterElectTablet.Alias() {
				continue
			}

			// if we can't read the tablet, or if it's not in the
			// replication graph, we remove the entry.
			if ti, err := wr.ts.GetTablet(toplevelAlias); err == nil && ti.Tablet.IsInReplicationGraph() {
				// we can read the entry and it belongs here,
				// keep it
				continue
			}

			log.Infof("Removing stale replication path %v", toplevelAlias.String())
			if err := topo.DeleteTabletReplicationData(wr.ts, masterElectTablet.Tablet, toplevelAlias.String()); err != nil {
				log.Warningf("DeleteTabletReplicationData(%v) failed: %v", toplevelAlias.String(), err)
			}
		}
	}

	if !recorder.HasErrors() {
		return nil
	}

	// report errors only above a threshold
	failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1)
	if failurePercent < 100-acceptSuccessPercents {
		log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error())
		return nil
	}

	return recorder.Error()
}