// prepareToSnapshot changes the type of the tablet to backup (when // the original type is master, it will proceed only if // forceMasterSnapshot is true). It returns a function that will // restore the original state. func (wr *Wrangler) prepareToSnapshot(ti *topo.TabletInfo, forceMasterSnapshot bool) (restoreAfterSnapshot func() error, err error) { originalType := ti.Tablet.Type if ti.Tablet.Type == topo.TYPE_MASTER && forceMasterSnapshot { // In this case, we don't bother recomputing the serving graph. // All queries will have to fail anyway. log.Infof("force change type master -> backup: %v", ti.Alias) // There is a legitimate reason to force in the case of a single // master. ti.Tablet.Type = topo.TYPE_BACKUP err = topo.UpdateTablet(wr.ts, ti) } else { err = wr.ChangeType(ti.Alias, topo.TYPE_BACKUP, false) } if err != nil { return } restoreAfterSnapshot = func() (err error) { log.Infof("change type after snapshot: %v %v", ti.Alias, originalType) if ti.Tablet.Parent.Uid == topo.NO_TABLET && forceMasterSnapshot { log.Infof("force change type backup -> master: %v", ti.Alias) ti.Tablet.Type = topo.TYPE_MASTER return topo.UpdateTablet(wr.ts, ti) } return wr.ChangeType(ti.Alias, originalType, false) } return }
func (ta *TabletActor) multiRestore(actionNode *actionnode.ActionNode) (err error) { args := actionNode.Args.(*actionnode.MultiRestoreArgs) // read our current tablet, verify its state // we only support restoring to the master or spare replicas tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_MASTER && tablet.Type != topo.TYPE_SPARE && tablet.Type != topo.TYPE_REPLICA && tablet.Type != topo.TYPE_RDONLY { return fmt.Errorf("expected master, spare replica or rdonly type, not %v: %v", tablet.Type, ta.tabletAlias) } // get source tablets addresses sourceAddrs := make([]*url.URL, len(args.SrcTabletAliases)) keyRanges := make([]key.KeyRange, len(args.SrcTabletAliases)) for i, alias := range args.SrcTabletAliases { t, e := ta.ts.GetTablet(alias) if e != nil { return e } sourceAddrs[i] = &url.URL{Host: t.GetAddr(), Path: "/" + t.DbName()} keyRanges[i], e = key.KeyRangesOverlap(tablet.KeyRange, t.KeyRange) if e != nil { return e } } // change type to restore, no change to replication graph originalType := tablet.Type tablet.Type = topo.TYPE_RESTORE err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // run the action, scrap if it fails if err := ta.mysqld.MultiRestore(tablet.DbName(), keyRanges, sourceAddrs, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy); err != nil { if e := Scrap(ta.ts, ta.tabletAlias, false); e != nil { log.Errorf("Failed to Scrap after failed RestoreFromMultiSnapshot: %v", e) } return err } // restore type back tablet.Type = originalType return topo.UpdateTablet(ta.ts, tablet) }
// SnapshotSourceEnd restores the state of the server after a // Snapshot(server_mode =true) // Should be called under RpcWrapLockAction. func (agent *ActionAgent) SnapshotSourceEnd(args *actionnode.SnapshotSourceEndArgs) error { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("expected snapshot_source type, not %v", tablet.Type) } if err := agent.Mysqld.SnapshotSourceEnd(args.SlaveStartRequired, args.ReadOnly, true, agent.hookExtraEnv()); err != nil { log.Errorf("SnapshotSourceEnd failed, leaving tablet type alone: %v", err) return err } // change the type back if args.OriginalType == topo.TYPE_MASTER { // force the master update tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, args.OriginalType, make(map[string]string), true /*runHooks*/) } return err }
func (wr *Wrangler) SnapshotSourceEnd(tabletAlias topo.TabletAlias, slaveStartRequired, readWrite bool, originalType topo.TabletType) (err error) { var ti *topo.TabletInfo ti, err = wr.ts.GetTablet(tabletAlias) if err != nil { return } var actionPath string actionPath, err = wr.ai.SnapshotSourceEnd(tabletAlias, &tm.SnapshotSourceEndArgs{slaveStartRequired, !readWrite}) if err != nil { return } // wait for completion, and save the error err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout()) if err != nil { log.Errorf("SnapshotSourceEnd failed (%v), leaving tablet type alone", err) return } if ti.Tablet.Parent.Uid == topo.NO_TABLET { ti.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(wr.ts, ti) } else { err = wr.ChangeType(ti.Alias(), originalType, false) } return err }
// ChangeType changes the type of the tablet and possibly also updates // the health informaton for it. Make this external, since these // transitions need to be forced from time to time. // // - if health is nil, we don't touch the Tablet's Health record. // - if health is an empty map, we clear the Tablet's Health record. // - if health has values, we overwrite the Tablet's Health record. func ChangeType(ctx context.Context, ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, health map[string]string) error { tablet, err := ts.GetTablet(ctx, tabletAlias) if err != nil { return err } if !topo.IsTrivialTypeChange(tablet.Type, newType) { return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias) } tablet.Type = newType if newType == topo.TYPE_IDLE { tablet.Keyspace = "" tablet.Shard = "" tablet.KeyRange = key.KeyRange{} tablet.Health = health } if health != nil { if len(health) == 0 { tablet.Health = nil } else { tablet.Health = health } } return topo.UpdateTablet(ctx, ts, tablet) }
func SlaveWasRestarted(ts topo.Server, tabletAlias topo.TabletAlias, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (ta *TabletActor) updateReplicationGraphForPromotedSlave(tablet *topo.TabletInfo, actionNode *ActionNode) error { // Remove tablet from the replication graph if this is not already the master. if tablet.Parent.Uid != topo.NO_TABLET { err := ta.ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNoNode { return err } } // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET err := topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in order for the // shard to be consistent the master must be scrapped first. That is // externally coordinated by the wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = ta.ts.CreateReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func updateReplicationGraphForPromotedSlave(ts topo.Server, tablet *topo.TabletInfo) error { // Remove tablet from the replication graph if this is not already the master. if tablet.Parent.Uid != topo.NO_TABLET { if err := topo.DeleteTabletReplicationData(ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { return err } } // Update tablet regardless - trend towards consistency. tablet.State = topo.STATE_READ_WRITE tablet.Type = topo.TYPE_MASTER tablet.Parent.Cell = "" tablet.Parent.Uid = topo.NO_TABLET tablet.Health = nil err := topo.UpdateTablet(ts, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// Make this external, since these transitions need to be forced from time to time. func SetBlacklistedTables(ts topo.Server, tabletAlias topo.TabletAlias, tables []string) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } tablet.BlacklistedTables = tables return topo.UpdateTablet(ts, tablet) }
// Make this external, since these transitions need to be forced from time to time. func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, runHooks bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) { return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias) } if runHooks { // Only run the preflight_serving_type hook when // transitioning from non-serving to serving. if !topo.IsInServingGraph(tablet.Type) && topo.IsInServingGraph(newType) { if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil { return err } } } tablet.Type = newType if newType == topo.TYPE_IDLE { if tablet.Parent.IsZero() { si, err := ts.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return err } rec := concurrency.AllErrorRecorder{} wg := sync.WaitGroup{} for _, cell := range si.Cells { wg.Add(1) go func(cell string) { defer wg.Done() sri, err := ts.GetShardReplication(cell, tablet.Keyspace, tablet.Shard) if err != nil { log.Warningf("Cannot check cell %v for extra replication paths, assuming it's good", cell) return } for _, rl := range sri.ReplicationLinks { if rl.Parent == tabletAlias { rec.RecordError(fmt.Errorf("Still have a ReplicationLink in cell %v", cell)) } } }(cell) } wg.Wait() if rec.HasErrors() { return rec.Error() } } tablet.Parent = topo.TabletAlias{} tablet.Keyspace = "" tablet.Shard = "" tablet.KeyRange = key.KeyRange{} } return topo.UpdateTablet(ts, tablet) }
func (ta *TabletActor) SlaveWasRestarted(actionNode *ActionNode, masterAddr string) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. if err := topo.DeleteTabletReplicationData(ta.ts, tablet.Tablet); err != nil && err != topo.ErrNoNode { // FIXME(alainjobart) once we don't have replication paths // any more, remove this extra check if err == topo.ErrNotEmpty { log.Infof("Failed to delete master replication path, will be caught later") } else { return err } } // now we can check the reparent actually worked if masterAddr == "" { masterAddr, err = ta.mysqlDaemon.GetMasterAddr() if err != nil { return err } } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ta.ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// SetMaster sets replication master, and waits for the // reparent_journal table entry up to context timeout func (agent *ActionAgent) SetMaster(ctx context.Context, parent topo.TabletAlias, timeCreatedNS int64, forceStartSlave bool) error { ti, err := agent.TopoServer.GetTablet(ctx, parent) if err != nil { return err } // See if we were replicating at all, and should be replicating wasReplicating := false shouldbeReplicating := false rs, err := agent.MysqlDaemon.SlaveStatus() if err == nil && (rs.SlaveIORunning || rs.SlaveSQLRunning) { wasReplicating = true shouldbeReplicating = true } if forceStartSlave { shouldbeReplicating = true } // Create the list of commands to set the master cmds := []string{} if wasReplicating { cmds = append(cmds, mysqlctl.SqlStopSlave) } smc, err := agent.MysqlDaemon.SetMasterCommands(ti.Hostname, ti.Portmap["mysql"]) if err != nil { return err } cmds = append(cmds, smc...) if shouldbeReplicating { cmds = append(cmds, mysqlctl.SqlStartSlave) } if err := agent.MysqlDaemon.ExecuteSuperQueryList(cmds); err != nil { return err } // change our type to spare if we used to be the master tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.Health = nil if err := topo.UpdateTablet(ctx, agent.TopoServer, tablet); err != nil { return err } } // if needed, wait until we get the replicated row, or our // context times out if !shouldbeReplicating || timeCreatedNS == 0 { return nil } return agent.MysqlDaemon.WaitForReparentJournal(ctx, timeCreatedNS) }
// Scrap will update the tablet type to 'Scrap', and remove it from // the serving graph. // // 'force' means we are not on the tablet being scrapped, so it is // probably dead. So if 'force' is true, we will also remove pending // remote actions. And if 'force' is false, we also run an optional // hook. func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // If you are already scrap, skip updating replication data. It won't // be there anyway. wasAssigned := tablet.IsAssigned() tablet.Type = topo.TYPE_SCRAP tablet.Parent = topo.TabletAlias{} // Update the tablet first, since that is canonical. err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Remove any pending actions. Presumably forcing a scrap // means you don't want the agent doing anything and the // machine requires manual attention. if force { err := ts.PurgeTabletActions(tabletAlias, actionnode.ActionNodeCanBePurged) if err != nil { log.Warningf("purge actions failed: %v", err) } } if wasAssigned { err = topo.DeleteTabletReplicationData(ts, tablet.Tablet) if err != nil { if err == topo.ErrNoNode { log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell) err = nil } if err != nil { log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err) } } } // run a hook for final cleanup, only in non-force mode. // (force mode executes on the vtctl side, not on the vttablet side) if !force { hk := hook.NewSimpleHook("postflight_scrap") ConfigureTabletHook(hk, tablet.Alias) if hookErr := hk.ExecuteOptional(); hookErr != nil { // we don't want to return an error, the server // is already in bad shape probably. log.Warningf("Scrap: postflight_scrap failed: %v", hookErr) } } return nil }
func (ta *TabletActor) slaveWasRestarted(actionNode *ActionNode) error { swrd := actionNode.args.(*SlaveWasRestartedData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // Remove tablet from the replication graph. err = ta.ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNoNode { return err } // now we can check the reparent actually worked masterAddr, err := ta.mysqld.GetMasterAddr() if err != nil { return err } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("slaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) if swrd.ScrapStragglers { return Scrap(ta.ts, tablet.Alias(), false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, ta.tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = ta.ts.CreateReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNodeExists { return err } return nil }
func (ta *TabletActor) demoteMaster() error { _, err := ta.mysqld.DemoteMaster() if err != nil { return err } tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } tablet.State = topo.STATE_READ_ONLY // NOTE(msolomon) there is no serving graph update - the master tablet will // be replaced. Even though writes may fail, reads will succeed. It will be // less noisy to simply leave the entry until well promote the master. return topo.UpdateTablet(ta.ts, tablet) }
func (ta *TabletActor) setReadOnly(rdonly bool) error { err := ta.mysqld.SetReadOnly(rdonly) if err != nil { return err } tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } if rdonly { tablet.State = topo.STATE_READ_ONLY } else { tablet.State = topo.STATE_READ_WRITE } return topo.UpdateTablet(ta.ts, tablet) }
// SetReadOnly makes the mysql instance read-only or read-write // Should be called under RpcWrapLockAction. func (agent *ActionAgent) SetReadOnly(rdonly bool) error { err := agent.Mysqld.SetReadOnly(rdonly) if err != nil { return err } tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if rdonly { tablet.State = topo.STATE_READ_ONLY } else { tablet.State = topo.STATE_READ_WRITE } return topo.UpdateTablet(agent.TopoServer, tablet) }
func SlaveWasRestarted(ts topo.Server, mysqlDaemon mysqlctl.MysqlDaemon, tabletAlias topo.TabletAlias, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // check the reparent actually worked masterAddr, err := mysqlDaemon.GetMasterAddr() if err != nil { return err } if masterAddr != swrd.ExpectedMasterAddr && masterAddr != swrd.ExpectedMasterIpAddr { log.Errorf("SlaveWasRestarted found unexpected master %v for %v (was expecting %v or %v)", masterAddr, tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) // Disabled for now // if swrd.ContinueOnUnexpectedMaster { // log.Errorf("ContinueOnUnexpectedMaster is set, we keep going anyway") // } else if swrd.ScrapStragglers { return Scrap(ts, tablet.Alias, false) } else { return fmt.Errorf("Unexpected master %v for %v (was expecting %v or %v)", masterAddr, tabletAlias, swrd.ExpectedMasterAddr, swrd.ExpectedMasterIpAddr) } } // Once this action completes, update authoritive tablet node first. tablet.Parent = swrd.Parent if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE tablet.State = topo.STATE_READ_ONLY } err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.CreateTabletReplicationData(ts, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// Scrap will update the tablet type to 'Scrap', and remove it from // the serving graph. // // 'force' means we are not on the tablet being scrapped, so it is // probably dead. So if 'force' is true, we will also remove pending // remote actions. And if 'force' is false, we also run an optional // hook. func Scrap(ctx context.Context, ts topo.Server, tabletAlias topo.TabletAlias, force bool) error { tablet, err := ts.GetTablet(ctx, tabletAlias) if err != nil { return err } // If you are already scrap, skip updating replication data. It won't // be there anyway. wasAssigned := tablet.IsAssigned() tablet.Type = topo.TYPE_SCRAP // Update the tablet first, since that is canonical. err = topo.UpdateTablet(ctx, ts, tablet) if err != nil { return err } if wasAssigned { err = topo.DeleteTabletReplicationData(ctx, ts, tablet.Tablet) if err != nil { if err == topo.ErrNoNode { log.V(6).Infof("no ShardReplication object for cell %v", tablet.Alias.Cell) err = nil } if err != nil { log.Warningf("remove replication data for %v failed: %v", tablet.Alias, err) } } } // run a hook for final cleanup, only in non-force mode. // (force mode executes on the vtctl side, not on the vttablet side) if !force { hk := hook.NewSimpleHook("postflight_scrap") ConfigureTabletHook(hk, tablet.Alias) if hookErr := hk.ExecuteOptional(); hookErr != nil { // we don't want to return an error, the server // is already in bad shape probably. log.Warningf("Scrap: postflight_scrap failed: %v", hookErr) } } return nil }
// RestartSlavesExternal will tell all the slaves in the provided list // that they have a new master, and also tell all the masters. The // masters will be scrapped if they don't answer. // We execute all the actions in parallel. func RestartSlavesExternal(ts topo.Server, log logutil.Logger, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTabletAlias topo.TabletAlias, slaveWasRestarted func(*topo.TabletInfo, *actionnode.SlaveWasRestartedArgs) error) { wg := sync.WaitGroup{} swrd := actionnode.SlaveWasRestartedArgs{ Parent: masterElectTabletAlias, } log.Infof("Updating individual tablets with the right master...") // do all the slaves for _, ti := range slaveTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { if err := slaveWasRestarted(ti, &swrd); err != nil { log.Warningf("Slave %v had an error: %v", ti.Alias, err) } wg.Done() }(ti) } // and do the old master and any straggler, if possible. for _, ti := range masterTabletMap { wg.Add(1) go func(ti *topo.TabletInfo) { err := slaveWasRestarted(ti, &swrd) if err != nil { // the old master can be annoying if left // around in the replication graph, so if we // can't restart it, we just scrap it. // We don't rebuild the Shard just yet though. log.Warningf("Old master %v is not restarting in time, forcing it to spare: %v", ti.Alias, err) ti.Type = topo.TYPE_SPARE ti.Parent = masterElectTabletAlias if err := topo.UpdateTablet(ts, ti); err != nil { log.Warningf("Failed to change old master %v to spare: %v", ti.Alias, err) } } wg.Done() }(ti) } wg.Wait() }
// checkTabletMysqlPort will check the mysql port for the tablet is good, // and if not will try to update it. func (agent *ActionAgent) checkTabletMysqlPort(ctx context.Context, tablet *topo.TabletInfo) *topo.TabletInfo { mport, err := agent.MysqlDaemon.GetMysqlPort() if err != nil { log.Warningf("Cannot get current mysql port, not checking it: %v", err) return nil } if mport == tablet.Portmap["mysql"] { return nil } log.Warningf("MySQL port has changed from %v to %v, updating it in tablet record", tablet.Portmap["mysql"], mport) tablet.Portmap["mysql"] = mport if err := topo.UpdateTablet(ctx, agent.TopoServer, tablet); err != nil { log.Warningf("Failed to update tablet record, may use old mysql port") return nil } return tablet }
// ChecktabletMysqlPort will check the mysql port for the tablet is good, // and if not will try to update it func CheckTabletMysqlPort(ts topo.Server, mysqlDaemon mysqlctl.MysqlDaemon, tablet *topo.TabletInfo) *topo.TabletInfo { mport, err := mysqlDaemon.GetMysqlPort() if err != nil { log.Warningf("Cannot get current mysql port, not checking it: %v", err) return nil } if mport == tablet.Portmap["mysql"] { return nil } log.Warningf("MySQL port has changed from %v to %v, updating it in tablet record", tablet.Portmap["mysql"], mport) tablet.Portmap["mysql"] = mport if err := topo.UpdateTablet(ts, tablet); err != nil { log.Warningf("Failed to update tablet record, may use old mysql port") return nil } return tablet }
// Make this external, since these transitions need to be forced from time to time. func ChangeType(ts topo.Server, tabletAlias topo.TabletAlias, newType topo.TabletType, runHooks bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } if !topo.IsTrivialTypeChange(tablet.Type, newType) || !topo.IsValidTypeChange(tablet.Type, newType) { return fmt.Errorf("cannot change tablet type %v -> %v %v", tablet.Type, newType, tabletAlias) } if runHooks { // Only run the preflight_serving_type hook when // transitioning from non-serving to serving. if !topo.IsServingType(tablet.Type) && topo.IsServingType(newType) { if err := hook.NewSimpleHook("preflight_serving_type").ExecuteOptional(); err != nil { return err } } } tablet.Type = newType if newType == topo.TYPE_IDLE { if tablet.Parent.Uid == topo.NO_TABLET { // With a master the node cannot be set to idle unless we have already removed all of // the derived paths. The global replication path is a good indication that this has // been resolved. children, err := ts.GetReplicationPaths(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNoNode { return err } if err == nil && len(children) > 0 { return fmt.Errorf("cannot change tablet type %v -> %v - reparent action has not finished %v", tablet.Type, newType, tabletAlias) } } tablet.Parent = topo.TabletAlias{} tablet.Keyspace = "" tablet.Shard = "" tablet.KeyRange = key.KeyRange{} } return topo.UpdateTablet(ts, tablet) }
// change a tablet type to RESTORE and set all the other arguments. // from now on, we can go to: // - back to IDLE if we don't use the tablet at all (after for instance // a successful ReserveForRestore but a failed Snapshot) // - to SCRAP if something in the process on the target host fails // - to SPARE if the clone works func (ta *TabletActor) changeTypeToRestore(tablet, sourceTablet *topo.TabletInfo, parentAlias topo.TabletAlias, keyRange key.KeyRange) error { // run the optional preflight_assigned hook hk := hook.NewSimpleHook("preflight_assigned") configureTabletHook(hk, ta.tabletAlias) if err := hk.ExecuteOptional(); err != nil { return err } // change the type tablet.Parent = parentAlias tablet.Keyspace = sourceTablet.Keyspace tablet.Shard = sourceTablet.Shard tablet.Type = topo.TYPE_RESTORE tablet.KeyRange = keyRange tablet.DbNameOverride = sourceTablet.DbNameOverride if err := topo.UpdateTablet(ta.ts, tablet); err != nil { return err } // and create the replication graph items return topo.CreateTabletReplicationPaths(ta.ts, tablet.Tablet) }
// updateReplicationGraphForPromotedSlave makes sure the newly promoted slave // is correctly represented in the replication graph func (agent *ActionAgent) updateReplicationGraphForPromotedSlave(ctx context.Context, tablet *topo.TabletInfo) error { // Update tablet regardless - trend towards consistency. tablet.Type = topo.TYPE_MASTER tablet.Health = nil err := topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } // NOTE(msolomon) A serving graph update is required, but in // order for the shard to be consistent the old master must be // scrapped first. That is externally coordinated by the // wrangler reparent action. // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// SlaveWasRestarted updates the parent record for a tablet. // Should be called under RPCWrapLockAction. func (agent *ActionAgent) SlaveWasRestarted(ctx context.Context, swrd *actionnode.SlaveWasRestartedArgs) error { tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } // Once this action completes, update authoritative tablet node first. if tablet.Type == topo.TYPE_MASTER { tablet.Type = topo.TYPE_SPARE } err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) if err != nil { return err } // Update the new tablet location in the replication graph now that // we've updated the tablet. err = topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet.Tablet) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// CheckTablet verifies the topo server API is correct for managing tablets. func CheckTablet(ctx context.Context, t *testing.T, ts topo.Server) { cell := getLocalCell(ctx, t, ts) tablet := &topo.Tablet{ Alias: topo.TabletAlias{Cell: cell, Uid: 1}, Hostname: "localhost", IPAddr: "10.11.12.13", Portmap: map[string]int{ "vt": 3333, "mysql": 3334, }, Tags: map[string]string{"tag": "value"}, Keyspace: "test_keyspace", Type: topo.TYPE_MASTER, KeyRange: newKeyRange("-10"), } if err := ts.CreateTablet(ctx, tablet); err != nil { t.Errorf("CreateTablet: %v", err) } if err := ts.CreateTablet(ctx, tablet); err != topo.ErrNodeExists { t.Errorf("CreateTablet(again): %v", err) } if _, err := ts.GetTablet(ctx, topo.TabletAlias{Cell: cell, Uid: 666}); err != topo.ErrNoNode { t.Errorf("GetTablet(666): %v", err) } ti, err := ts.GetTablet(ctx, tablet.Alias) if err != nil { t.Errorf("GetTablet %v: %v", tablet.Alias, err) } if eq, err := tabletEqual(ti.Tablet, tablet); err != nil { t.Errorf("cannot compare tablets: %v", err) } else if !eq { t.Errorf("put and got tablets are not identical:\n%#v\n%#v", tablet, ti.Tablet) } if _, err := ts.GetTabletsByCell(ctx, "666"); err != topo.ErrNoNode { t.Errorf("GetTabletsByCell(666): %v", err) } inCell, err := ts.GetTabletsByCell(ctx, cell) if err != nil { t.Errorf("GetTabletsByCell: %v", err) } if len(inCell) != 1 || inCell[0] != tablet.Alias { t.Errorf("GetTabletsByCell: want [%v], got %v", tablet.Alias, inCell) } ti.Hostname = "remotehost" if err := topo.UpdateTablet(ctx, ts, ti); err != nil { t.Errorf("UpdateTablet: %v", err) } ti, err = ts.GetTablet(ctx, tablet.Alias) if err != nil { t.Errorf("GetTablet %v: %v", tablet.Alias, err) } if want := "remotehost"; ti.Hostname != want { t.Errorf("ti.Hostname: want %v, got %v", want, ti.Hostname) } if err := topo.UpdateTabletFields(ctx, ts, tablet.Alias, func(t *topo.Tablet) error { t.Hostname = "anotherhost" return nil }); err != nil { t.Errorf("UpdateTabletFields: %v", err) } ti, err = ts.GetTablet(ctx, tablet.Alias) if err != nil { t.Errorf("GetTablet %v: %v", tablet.Alias, err) } if want := "anotherhost"; ti.Hostname != want { t.Errorf("ti.Hostname: want %v, got %v", want, ti.Hostname) } if err := ts.DeleteTablet(ctx, tablet.Alias); err != nil { t.Errorf("DeleteTablet: %v", err) } if err := ts.DeleteTablet(ctx, tablet.Alias); err != topo.ErrNoNode { t.Errorf("DeleteTablet(again): %v", err) } if _, err := ts.GetTablet(ctx, tablet.Alias); err != topo.ErrNoNode { t.Errorf("GetTablet: expected error, tablet was deleted: %v", err) } }
// Make this external, since in needs to be forced from time to time. func Scrap(ts topo.Server, tabletAlias topo.TabletAlias, force bool) error { tablet, err := ts.GetTablet(tabletAlias) if err != nil { return err } // If you are already scrap, skip deleting the path. It won't // be correct since the Parent will be cleared already. wasAssigned := tablet.IsAssigned() replicationPath := "" if wasAssigned { replicationPath = tablet.ReplicationPath() } tablet.Type = topo.TYPE_SCRAP tablet.Parent = topo.TabletAlias{} // Update the tablet first, since that is canonical. err = topo.UpdateTablet(ts, tablet) if err != nil { return err } // Remove any pending actions. Presumably forcing a scrap means you don't // want the agent doing anything and the machine requires manual attention. if force { err := ts.PurgeTabletActions(tabletAlias, ActionNodeCanBePurged) if err != nil { log.Warningf("purge actions failed: %v", err) } } if wasAssigned { err = ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, replicationPath) if err != nil { switch err { case topo.ErrNoNode: log.V(6).Infof("no replication path: %v", replicationPath) err = nil case topo.ErrNotEmpty: // If you are forcing the scrapping of a master, you can't update the // replication graph yet, since other nodes are still under the impression // they are slaved to this tablet. // If the node was not empty, we can't do anything about it - the replication // graph needs to be fixed by reparenting. If the action was forced, assume // the user knows best and squelch the error. if tablet.Parent.Uid == topo.NO_TABLET && force { err = nil } } if err != nil { log.Warningf("remove replication path failed: %v %v", replicationPath, err) } } } // run a hook for final cleanup, only in non-force mode. // (force mode executes on the vtctl side, not on the vttablet side) if !force { hk := hook.NewSimpleHook("postflight_scrap") configureTabletHook(hk, tablet.Alias()) if hookErr := hk.ExecuteOptional(); hookErr != nil { // we don't want to return an error, the server // is already in bad shape probably. log.Warningf("Scrap: postflight_scrap failed: %v", hookErr) } } return nil }
func (ta *TabletActor) restartSlave(actionNode *ActionNode) error { rsd := actionNode.args.(*RestartSlaveData) tablet, err := ta.ts.GetTablet(ta.tabletAlias) if err != nil { return err } // If this check fails, we seem reparented. The only part that // could have failed is the insert in the replication // graph. Do NOT try to reparent again. That will either wedge // replication or corrupt data. if tablet.Parent != rsd.Parent { log.V(6).Infof("restart with new parent") // Remove tablet from the replication graph. err = ta.ts.DeleteReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNoNode { return err } // Move a lag slave into the orphan lag type so we can safely ignore // this reparenting until replication catches up. if tablet.Type == topo.TYPE_LAG { tablet.Type = topo.TYPE_LAG_ORPHAN } else { err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } } // Once this action completes, update authoritive tablet node first. tablet.Parent = rsd.Parent err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } } else if rsd.Force { err = ta.mysqld.RestartSlave(rsd.ReplicationState, rsd.WaitPosition, rsd.TimePromoted) if err != nil { return err } // Complete the special orphan accounting. if tablet.Type == topo.TYPE_LAG_ORPHAN { tablet.Type = topo.TYPE_LAG err = topo.UpdateTablet(ta.ts, tablet) if err != nil { return err } } } else { // There is nothing to safely reparent, so check replication. If // either replication thread is not running, report an error. replicationPos, err := ta.mysqld.SlaveStatus() if err != nil { return fmt.Errorf("cannot verify replication for slave: %v", err) } if replicationPos.SecondsBehindMaster == mysqlctl.InvalidLagSeconds { return fmt.Errorf("replication not running for slave") } } // Insert the new tablet location in the replication graph now that // we've updated the tablet. err = ta.ts.CreateReplicationPath(tablet.Keyspace, tablet.Shard, tablet.ReplicationPath()) if err != nil && err != topo.ErrNodeExists { return err } return nil }
// InitTablet creates or updates a tablet. If no parent is specified // in the tablet, and the tablet has a slave type, we will find the // appropriate parent. If createShardAndKeyspace is true and the // parent keyspace or shard don't exist, they will be created. If // update is true, and a tablet with the same ID exists, update it. // If Force is true, and a tablet with the same ID already exists, it // will be scrapped and deleted, and then recreated. func (wr *Wrangler) InitTablet(ctx context.Context, tablet *topo.Tablet, force, createShardAndKeyspace, update bool) error { if err := topo.TabletComplete(tablet); err != nil { return err } if topo.IsInReplicationGraph(tablet.Type) { // get the shard, possibly creating it var err error var si *topo.ShardInfo if createShardAndKeyspace { // create the parent keyspace and shard if needed si, err = topotools.GetOrCreateShard(ctx, wr.ts, tablet.Keyspace, tablet.Shard) } else { si, err = wr.ts.GetShard(ctx, tablet.Keyspace, tablet.Shard) if err == topo.ErrNoNode { return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard") } } // get the shard, checks a couple things if err != nil { return fmt.Errorf("cannot get (or create) shard %v/%v: %v", tablet.Keyspace, tablet.Shard, err) } if key.ProtoToKeyRange(si.KeyRange) != tablet.KeyRange { return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange) } if tablet.Type == topo.TYPE_MASTER && !topo.TabletAliasIsZero(si.MasterAlias) && topo.ProtoToTabletAlias(si.MasterAlias) != tablet.Alias && !force { return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, tablet.Keyspace, tablet.Shard) } // update the shard record if needed if err := wr.updateShardCellsAndMaster(ctx, si, topo.TabletAliasToProto(tablet.Alias), topo.TabletTypeToProto(tablet.Type), force); err != nil { return err } } err := topo.CreateTablet(ctx, wr.ts, tablet) if err != nil && err == topo.ErrNodeExists { // Try to update nicely, but if it fails fall back to force behavior. if update || force { oldTablet, err := wr.ts.GetTablet(ctx, tablet.Alias) if err != nil { wr.Logger().Warningf("failed reading tablet %v: %v", tablet.Alias, err) } else { if oldTablet.Keyspace == tablet.Keyspace && oldTablet.Shard == tablet.Shard { *(oldTablet.Tablet) = *tablet if err := topo.UpdateTablet(ctx, wr.ts, oldTablet); err != nil { wr.Logger().Warningf("failed updating tablet %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { if !topo.IsInReplicationGraph(tablet.Type) { return nil } if err := topo.UpdateTabletReplicationData(ctx, wr.ts, tablet); err != nil { wr.Logger().Warningf("failed updating tablet replication data for %v: %v", tablet.Alias, err) // now fall through the Scrap case } else { return nil } } } } } if force { if err = wr.Scrap(ctx, tablet.Alias, force, false); err != nil { wr.Logger().Errorf("failed scrapping tablet %v: %v", tablet.Alias, err) return err } if err := wr.ts.DeleteTablet(ctx, tablet.Alias); err != nil { // we ignore this wr.Logger().Errorf("failed deleting tablet %v: %v", tablet.Alias, err) } return topo.CreateTablet(ctx, wr.ts, tablet) } } return err }