func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error { // Remove tablet from the replication graph if this is not already the master. if tablet.Parent.Uid != topo.NO_TABLET { if err := topo.DeleteTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } } // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET err := topo.UpdateTablet(ts, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (ta *TabletActor) SlaveWasRestarted(actionNode *ActionNode, masterAddr string) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { // FIXME(alainjobart) once we don't have replication paths // any more, remove this extra check if err == topo.ErrNotEmpty { log.Infof("Failed to delete master replication path, will be caught later") } else { return err } } // now we can check the reparent actually worked if masterAddr == "" { masterAddr, err = ta.mysqlDaemon.GetMasterAddr() if err != nil { return err } } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// Scrap will update the tablet type to 'Scrap', and remove it from // the serving graph. // // 'force' means we are not on the tablet being scrapped, so it is // probably dead. So if 'force' is true, we will also remove pending // remote actions. And if 'force' is false, we also run an optional // hook. func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // If you are already scrap, skip updating replication data. It won't // be there anyway. wasAssigned := tablet.IsAssigned() tablet.Type = topo.TYPE_SCRAP tablet.Parent = topo.TabletAlias{} // Update the tablet first, since that is canonical. err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Remove any pending actions. Presumably forcing a scrap // means you don't want the agent doing anything and the // machine requires manual attention. if force { err := ts.PurgeTabletActions(tabletAlias, actionnode.ActionNodeCanBePurged) if err != nil { log.Warningf("purge actions failed: %v", err) } } if wasAssigned { err = topo.DeleteTabletReplicationData(ts, tablet.Tablet) if err != nil { if err == topo.ErrNoNode { log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell) err = nil } if err != nil { log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err) } } } // run a hook for final cleanup, only in non-force mode. // (force mode executes on the vtctl side, not on the vttablet side) if !force { hk := hook.NewSimpleHook("postflight_scrap") ConfigureTabletHook(hk, tablet.Alias) if hookErr := hk.ExecuteOptional(); hookErr != nil { // we don't want to return an error, the server // is already in bad shape probably. log.Warningf("Scrap: postflight_scrap failed: %v", hookErr) } } return nil }
// UnreserveForRestore switches the tablet back to its original state, // the restore won't happen. func (wr *Wrangler) UnreserveForRestore(dstTabletAlias topo.TabletAlias) (err error) { tablet, err := wr.ts.GetTablet(dstTabletAlias) if err != nil { return err } err = topo.DeleteTabletReplicationData(wr.ts, tablet.Tablet) if err != nil { return err } return wr.ChangeType(tablet.Alias, topo.TYPE_IDLE, false) }
// DeleteTablet removes a tablet record from the topology: // - the replication data record if any // - the tablet record func DeleteTablet(ctx context.Context, ts topo.Server, tablet *topodatapb.Tablet) error { // try to remove replication data, no fatal if we fail if err := topo.DeleteTabletReplicationData(ctx, ts, tablet); err != nil { if err == topo.ErrNoNode { log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell) err = nil } if err != nil { log.Warningf("remove replication data for %v failed: %v", topoproto.TabletAliasString(tablet.Alias), err) } } // then delete the tablet record return ts.DeleteTablet(ctx, tablet.Alias) }
func (ta *TabletActor) slaveWasRestarted(actionNode *ActionNode) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet, tablet.ReplicationPath()); err != nil && err != topo.ErrNoNode { return err } // now we can check the reparent actually worked masterAddr, err := ta.mysqld.GetMasterAddr() if err != nil { return err } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// Scrap will update the tablet type to 'Scrap', and remove it from // the serving graph. // // 'force' means we are not on the tablet being scrapped, so it is // probably dead. So if 'force' is true, we will also remove pending // remote actions. And if 'force' is false, we also run an optional // hook. func Scrap(ctx context.Context, ts topo.Server, tabletAlias topo.TabletAlias, force bool) error { tablet, err := ts.GetTablet(ctx, tabletAlias) if err != nil { return err } // If you are already scrap, skip updating replication data. It won't // be there anyway. wasAssigned := tablet.IsAssigned() tablet.Type = topo.TYPE_SCRAP // Update the tablet first, since that is canonical. err = topo.UpdateTablet(ctx, ts, tablet) if err != nil { return err } if wasAssigned { err = topo.DeleteTabletReplicationData(ctx, ts, tablet.Tablet) if err != nil { if err == topo.ErrNoNode { log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell) err = nil } if err != nil { log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err) } } } // run a hook for final cleanup, only in non-force mode. // (force mode executes on the vtctl side, not on the vttablet side) if !force { hk := hook.NewSimpleHook("postflight_scrap") ConfigureTabletHook(hk, tablet.Alias) if hookErr := hk.ExecuteOptional(); hookErr != nil { // we don't want to return an error, the server // is already in bad shape probably. log.Warningf("Scrap: postflight_scrap failed: %v", hookErr) } } return nil }
// restartSlave applies a reparent (RestartSlaveData) to this slave tablet.
//
// Three cases, chosen by comparing the recorded parent with rsd.Parent:
//   - parents differ: the topology does not yet reflect the reparent, so
//     restart mysql replication (or orphan a lagging slave) and update the
//     tablet record's Parent;
//   - rsd.Force: restart mysql replication unconditionally and undo any
//     earlier lag-orphan accounting;
//   - otherwise: nothing to reparent; just verify replication is running.
//
// In every case the tablet's replication-graph entry is (re)created at the
// end; an already-existing entry (topo.ErrNodeExists) is tolerated.
func (ta *TabletActor) restartSlave(actionNode *actionnode.ActionNode) error {
	rsd := actionNode.Args.(*actionnode.RestartSlaveData)

	tablet, err := ta.ts.GetTablet(ta.tabletAlias)
	if err != nil {
		return err
	}

	// If this check fails, we seem reparented. The only part that
	// could have failed is the insert in the replication
	// graph. Do NOT try to reparent again. That will either wedge
	// replication or corrupt data.
	if tablet.Parent != rsd.Parent {
		log.V(6).Infof("restart with new parent")
		// Remove tablet from the replication graph.
		// topo.ErrNoNode (entry already absent) is tolerated.
		if err = topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode {
			return err
		}

		// Move a lag slave into the orphan lag type so we can safely ignore
		// this reparenting until replication catches up.
		if tablet.Type == topo.TYPE_LAG {
			tablet.Type = topo.TYPE_LAG_ORPHAN
		} else {
			err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
			if err != nil {
				return err
			}
		}
		// Once this action completes, update the authoritative tablet node first.
		tablet.Parent = rsd.Parent
		err = topo.UpdateTablet(ta.ts, tablet)
		if err != nil {
			return err
		}
	} else if rsd.Force {
		err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted)
		if err != nil {
			return err
		}
		// Complete the special orphan accounting.
		if tablet.Type == topo.TYPE_LAG_ORPHAN {
			tablet.Type = topo.TYPE_LAG
			err = topo.UpdateTablet(ta.ts, tablet)
			if err != nil {
				return err
			}
		}
	} else {
		// There is nothing to safely reparent, so check replication. If
		// either replication thread is not running, report an error.
		replicationPos, err := ta.mysqld.SlaveStatus()
		if err != nil {
			return fmt.Errorf("cannot verify replication for slave: %v", err)
		}
		if replicationPos.SecondsBehindMaster == myproto.InvalidLagSeconds {
			return fmt.Errorf("replication not running for slave")
		}
	}

	// Insert the new tablet location in the replication graph now that
	// we've updated the tablet.
	err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet)
	if err != nil && err != topo.ErrNodeExists {
		return err
	}
	return nil
}
// Scrap updates the tablet type to 'Scrap' and removes its replication path.
//
// Make this external, since it needs to be forced from time to time.
//
// 'force' means the caller is not on the tablet being scrapped (it is
// probably dead): pending actions are purged, certain replication-graph
// errors are squelched, and the postflight hook is skipped (it runs on the
// vtctl side instead).
func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error {
	tablet, err := ts.GetTablet(tabletAlias)
	if err != nil {
		return err
	}

	// If you are already scrap, skip deleting the path. It won't
	// be correct since the Parent will be cleared already.
	wasAssigned := tablet.IsAssigned()
	replicationPath := ""
	if wasAssigned {
		// Capture the path now, before Parent is cleared below
		// (the path won't be correct afterwards - see above).
		replicationPath = tablet.ReplicationPath()
	}
	// A master has no parent; remember that before clearing Parent.
	wasMaster := tablet.Parent.IsZero()

	tablet.Type = topo.TYPE_SCRAP
	tablet.Parent = topo.TabletAlias{}
	// Update the tablet first, since that is canonical.
	err = topo.UpdateTablet(ts, tablet)
	if err != nil {
		return err
	}

	// Remove any pending actions. Presumably forcing a scrap means you don't
	// want the agent doing anything and the machine requires manual attention.
	if force {
		err := ts.PurgeTabletActions(tabletAlias, ActionNodeCanBePurged)
		if err != nil {
			// Best effort: log and continue.
			log.Warningf("purge actions failed: %v", err)
		}
	}

	if wasAssigned {
		err = topo.DeleteTabletReplicationData(ts, tablet.Tablet, replicationPath)
		if err != nil {
			switch err {
			case topo.ErrNoNode:
				// Already gone - nothing to clean up.
				log.V(6).Infof("no replication path: %v", replicationPath)
				err = nil
			case topo.ErrNotEmpty:
				// If you are forcing the scrapping of a master, you can't update the
				// replication graph yet, since other nodes are still under the impression
				// they are slaved to this tablet.
				// If the node was not empty, we can't do anything about it - the replication
				// graph needs to be fixed by reparenting. If the action was forced, assume
				// the user knows best and squelch the error.
				if wasMaster && force {
					err = nil
				}
			}
			if err != nil {
				// Non-fatal: the scrap still proceeds, but the graph
				// will need repair (via reparent).
				log.Warningf("remove replication path failed: %v %v", replicationPath, err)
			}
		}
	}

	// run a hook for final cleanup, only in non-force mode.
	// (force mode executes on the vtctl side, not on the vttablet side)
	if !force {
		hk := hook.NewSimpleHook("postflight_scrap")
		configureTabletHook(hk, tablet.Alias())
		if hookErr := hk.ExecuteOptional(); hookErr != nil {
			// we don't want to return an error, the server
			// is already in bad shape probably.
			log.Warningf("Scrap: postflight_scrap failed: %v", hookErr)
		}
	}
	return nil
}
func (wr *Wrangler) restartSlavesExternal(slaveTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterTablet, masterElectTablet *topo.TabletInfo, scrapStragglers bool, acceptSuccessPercents int) error { recorder := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} swrd := tm.SlaveWasRestartedData{ Parent: masterElectTablet.Alias(), ExpectedMasterAddr: masterElectTablet.MysqlAddr, ExpectedMasterIpAddr: masterElectTablet.MysqlIpAddr, ScrapStragglers: scrapStragglers, } // do all the slaves for _, ti := range slaveTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { recorder.RecordError(wr.slaveWasRestarted(ti, &swrd)) wg.Done() }(ti) } wg.Wait() // then do the old master if it hadn't been scrapped if masterTablet != nil { err := wr.slaveWasRestarted(masterTablet, &swrd) if err != nil { recorder.RecordError(err) // the old master can be annoying if left // around in the replication graph, so if we // can't restart it, we just scrap it log.Warningf("Old master %v is not restarting, scrapping it", masterTablet.Alias()) if _, err := wr.Scrap(masterTablet.Alias(), true /*force*/, true /*skipRebuild*/); err != nil { log.Warningf("Failed to scrap old master %v: %v", masterTablet.Alias(), err) } } } // check the toplevel replication paths only contains the new master, // try to remove any old tablet aliases that don't make sense anymore toplevelAliases, err := wr.ts.GetReplicationPaths(masterElectTablet.Keyspace, masterElectTablet.Shard, "") if err != nil { log.Warningf("GetReplicationPaths() failed, cannot fix extra paths: %v", err) } else { for _, toplevelAlias := range toplevelAliases { if toplevelAlias == masterElectTablet.Alias() { continue } // if we can't read the tablet, or if it's not in the // replication graph, we remove the entry. 
if ti, err := wr.ts.GetTablet(toplevelAlias); err == nil && ti.Tablet.IsInReplicationGraph() { // we can read the entry and it belongs here, // keep it continue } log.Infof("Removing stale replication path %v", toplevelAlias.String()) if err := topo.DeleteTabletReplicationData(wr.ts, masterElectTablet.Tablet, toplevelAlias.String()); err != nil { log.Warningf("DeleteTabletReplicationData(%v) failed: %v", toplevelAlias.String(), err) } } } if !recorder.HasErrors() { return nil } // report errors only above a threshold failurePercent := 100 * len(recorder.Errors) / (len(slaveTabletMap) + 1) if failurePercent < 100-acceptSuccessPercents { log.Warningf("Encountered %v%% failure, we keep going. Errors: %v", failurePercent, recorder.Error()) return nil } return recorder.Error() }