// ExecuteVtworkerCommand is part of the vtworkerdatapb.VtworkerServer interface func (s *VtworkerServer) ExecuteVtworkerCommand(args *vtworkerdatapb.ExecuteVtworkerCommandRequest, stream vtworkerservicepb.Vtworker_ExecuteVtworkerCommandServer) (err error) { // Please note that this panic handler catches only panics occuring in the code below. // The actual execution of the vtworker command takes place in a new go routine // (started in Instance.setAndStartWorker()) which has its own panic handler. defer servenv.HandlePanic("vtworker", &err) // Stream everything back what the Wrangler is logging. logstream := logutil.NewCallbackLogger(func(e *logutilpb.Event) { // If the client disconnects, we will just fail // to send the log events, but won't interrupt // the command. stream.Send(&vtworkerdatapb.ExecuteVtworkerCommandResponse{ Event: e, }) }) // Let the Wrangler also log everything to the console (and thereby // effectively to a logfile) to make sure that any information or errors // is preserved in the logs in case the RPC or vtworker crashes. logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // create the wrangler wr := s.wi.CreateWrangler(logger) // execute the command worker, done, err := s.wi.RunCommand(args.Args, wr, false /*runFromCli*/) if err == nil && worker != nil && done != nil { err = s.wi.WaitForCommand(worker, done) } return err }
// ExecuteVtctlCommand is the server side method that will execute the query, // and stream the results. func (s *VtctlServer) ExecuteVtctlCommand(context context.Context, query *gorpcproto.ExecuteVtctlCommandArgs, sendReply func(interface{}) error) error { // create a logger, send the result back to the caller logstream := logutil.NewChannelLogger(10) logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. sendReply(&e) } wg.Done() }() // create the wrangler wr := wrangler.New(logger, s.ts, query.ActionTimeout, query.LockTimeout) // execute the command err := vtctl.RunCommand(wr, query.Args) // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
// ExecuteVtctlCommand is part of the pb.VtctlServer interface func (s *VtctlServer) ExecuteVtctlCommand(args *pb.ExecuteVtctlCommandRequest, stream pbs.Vtctl_ExecuteVtctlCommandServer) (err error) { defer servenv.HandlePanic("vtctl", &err) // create a logger, send the result back to the caller logstream := logutil.NewChannelLogger(10) logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. stream.Send(&pb.ExecuteVtctlCommandResponse{ Event: logutil.LoggerEventToProto(&e), }) } wg.Done() }() // create the wrangler wr := wrangler.New(logger, s.ts, tmclient.NewTabletManagerClient(), time.Duration(args.LockTimeout)) // execute the command err = vtctl.RunCommand(stream.Context(), wr, args.Args) // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
// MultiSnapshot takes a multi-part snapshot // Should be called under RpcWrapLockAction. func (agent *ActionAgent) MultiSnapshot(args *actionnode.MultiSnapshotArgs, logger logutil.Logger) (*actionnode.MultiSnapshotReply, error) { tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return nil, err } ki, err := agent.TopoServer.GetKeyspace(tablet.Keyspace) if err != nil { return nil, err } if tablet.Type != topo.TYPE_BACKUP { return nil, fmt.Errorf("expected backup type, not %v", tablet.Type) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) filenames, err := agent.Mysqld.CreateMultiSnapshot(l, args.KeyRanges, tablet.DbName(), ki.ShardingColumnName, ki.ShardingColumnType, tablet.Addr(), false, args.Concurrency, args.Tables, args.ExcludeTables, args.SkipSlaveRestart, args.MaximumFilesize, agent.hookExtraEnv()) if err != nil { return nil, err } sr := &actionnode.MultiSnapshotReply{ManifestPaths: filenames} if tablet.Parent.Uid == topo.NO_TABLET { // If this is a master, this will be the new parent. // FIXME(msolomon) this doens't work in hierarchical replication. sr.ParentAlias = tablet.Alias } else { sr.ParentAlias = tablet.Parent } return sr, nil }
// RestoreFromBackup deletes all local data and restores anew from the latest backup. func (agent *ActionAgent) RestoreFromBackup(ctx context.Context, logger logutil.Logger) error { if err := agent.lock(ctx); err != nil { return err } defer agent.unlock() tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot restore from backup, if you really need to do this, restart vttablet in replica mode") } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run restore err = agent.restoreDataLocked(ctx, l, true /* deleteBeforeRestore */) // re-run health check to be sure to capture any replication delay agent.runHealthCheckLocked() return err }
// ExecuteVtworkerCommand is part of the pb.VtworkerServer interface func (s *VtworkerServer) ExecuteVtworkerCommand(args *pb.ExecuteVtworkerCommandRequest, stream pbs.Vtworker_ExecuteVtworkerCommandServer) (err error) { // Please note that this panic handler catches only panics occuring in the code below. // The actual execution of the vtworker command takes place in a new go routine // (started in Instance.setAndStartWorker()) which has its own panic handler. defer servenv.HandlePanic("vtworker", &err) // create a logger, send the result back to the caller logstream := logutil.NewChannelLogger(10) logger := logutil.NewTeeLogger(logstream, logutil.NewMemoryLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. stream.Send(&pb.ExecuteVtworkerCommandResponse{ Event: &pbl.Event{ Time: &pbl.Time{ Seconds: e.Time.Unix(), Nanoseconds: int32(e.Time.Nanosecond()), }, Level: pbl.Level(e.Level), File: e.File, Line: int64(e.Line), Value: e.Value, }, }) } wg.Done() }() // create the wrangler wr := s.wi.CreateWrangler(logger) // execute the command if len(args.Args) >= 1 && args.Args[0] == "Reset" { err = s.wi.Reset() } else { // Make sure we use the global "err" variable and do not redeclare it in this scope. var worker worker.Worker var done chan struct{} worker, done, err = s.wi.RunCommand(args.Args, wr, false /*runFromCli*/) if err == nil { err = s.wi.WaitForCommand(worker, done) } } // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
// Backup takes a db backup and sends it to the BackupStorage func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error { if err := agent.lock(ctx); err != nil { return err } defer agent.unlock() // update our type to BACKUP tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode") } originalType := tablet.Type if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP); err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "before backup"); err != nil { return err } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias)) returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv()) // change our type back to the original value _, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType) if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) if returnErr != nil { l.Errorf("mysql backup command returned error: %v", returnErr) } returnErr = err } // let's update our internal state (start query service and other things) if err := agent.refreshTablet(ctx, "after backup"); err != nil { return err } // and re-run health check to be sure to capture any replication delay agent.runHealthCheckLocked() return returnErr }
// setAndStartWorker will set the current worker. // We always log to both memory logger (for display on the web) and // console logger (for records / display of command line worker). func (wi *Instance) setAndStartWorker(wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) { wi.currentWorkerMutex.Lock() defer wi.currentWorkerMutex.Unlock() if wi.currentWorker != nil { return nil, fmt.Errorf("A worker is already in progress: %v", wi.currentWorker) } wi.currentWorker = wrk wi.currentMemoryLogger = logutil.NewMemoryLogger() wi.currentContext, wi.currentCancelFunc = context.WithCancel(wi.backgroundContext) wi.lastRunError = nil done := make(chan struct{}) wranglerLogger := wr.Logger() if wr == wi.wr { // If it's the default wrangler, do not reuse its logger because it may have been set before. // Resuing it would result into an endless recursion. wranglerLogger = logutil.NewConsoleLogger() } wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger)) // one go function runs the worker, changes state when done go func() { log.Infof("Starting worker...") var err error // Catch all panics and always save the execution state at the end. defer func() { // The recovery code is a copy of servenv.HandlePanic(). if x := recover(); x != nil { err = fmt.Errorf("uncaught %v panic: %v", "vtworker", x) } wi.currentWorkerMutex.Lock() wi.currentContext = nil wi.currentCancelFunc = nil wi.lastRunError = err wi.currentWorkerMutex.Unlock() close(done) }() // run will take a long time err = wrk.Run(wi.currentContext) }() return done, nil }
// Backup takes a db backup and sends it to the BackupStorage // Should be called under RPCWrapLockAction. func (agent *ActionAgent) Backup(ctx context.Context, concurrency int, logger logutil.Logger) error { // update our type to BACKUP tablet, err := agent.TopoServer.GetTablet(ctx, agent.TabletAlias) if err != nil { return err } if tablet.Type == topodatapb.TabletType_MASTER { return fmt.Errorf("type MASTER cannot take backup, if you really need to do this, restart vttablet in replica mode") } originalType := tablet.Type if _, err := topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, topodatapb.TabletType_BACKUP, make(map[string]string)); err != nil { return err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "backup"); err != nil { return fmt.Errorf("failed to update state before backup: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard) name := fmt.Sprintf("%v.%v", time.Now().UTC().Format("2006-01-02.150405"), topoproto.TabletAliasString(tablet.Alias)) returnErr := mysqlctl.Backup(ctx, agent.MysqlDaemon, l, dir, name, concurrency, agent.hookExtraEnv()) // and change our type back to the appropriate value: // - if healthcheck is enabled, go to spare // - if not, go back to original type if agent.IsRunningHealthCheck() { originalType = topodatapb.TabletType_SPARE } _, err = topotools.ChangeType(ctx, agent.TopoServer, tablet.Alias, originalType, nil) if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) if returnErr != nil { l.Errorf("mysql backup command returned error: %v", returnErr) } returnErr = err } return returnErr }
// ExecuteVtworkerCommand is part of the vtworkerdatapb.VtworkerServer interface func (s *VtworkerServer) ExecuteVtworkerCommand(args *vtworkerdatapb.ExecuteVtworkerCommandRequest, stream vtworkerservicepb.Vtworker_ExecuteVtworkerCommandServer) (err error) { // Please note that this panic handler catches only panics occuring in the code below. // The actual execution of the vtworker command takes place in a new go routine // (started in Instance.setAndStartWorker()) which has its own panic handler. defer servenv.HandlePanic("vtworker", &err) // Stream everything back what the Wrangler is logging. logstream := logutil.NewChannelLogger(10) // Let the Wrangler also log everything to the console (and thereby // effectively to a logfile) to make sure that any information or errors // is preserved in the logs in case the RPC or vtworker crashes. logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. stream.Send(&vtworkerdatapb.ExecuteVtworkerCommandResponse{ Event: e, }) } wg.Done() }() // create the wrangler wr := s.wi.CreateWrangler(logger) // execute the command worker, done, err := s.wi.RunCommand(args.Args, wr, false /*runFromCli*/) if err == nil && worker != nil && done != nil { err = s.wi.WaitForCommand(worker, done) } // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
// ExecuteVtctlCommand is part of the vtctldatapb.VtctlServer interface func (s *VtctlServer) ExecuteVtctlCommand(args *vtctldatapb.ExecuteVtctlCommandRequest, stream vtctlservicepb.Vtctl_ExecuteVtctlCommandServer) (err error) { defer servenv.HandlePanic("vtctl", &err) // create a logger, send the result back to the caller logstream := logutil.NewCallbackLogger(func(e *logutilpb.Event) { // If the client disconnects, we will just fail // to send the log events, but won't interrupt // the command. stream.Send(&vtctldatapb.ExecuteVtctlCommandResponse{ Event: e, }) }) logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // create the wrangler wr := wrangler.New(logger, s.ts, tmclient.NewTabletManagerClient()) // execute the command return vtctl.RunCommand(stream.Context(), wr, args.Args) }
// setAndStartWorker will set the current worker. // We always log to both memory logger (for display on the web) and // console logger (for records / display of command line worker). func setAndStartWorker(wrk worker.Worker) (chan struct{}, error) { currentWorkerMutex.Lock() defer currentWorkerMutex.Unlock() if currentWorker != nil { return nil, fmt.Errorf("A worker is already in progress: %v", currentWorker) } currentWorker = wrk currentMemoryLogger = logutil.NewMemoryLogger() currentDone = make(chan struct{}) wr.SetLogger(logutil.NewTeeLogger(currentMemoryLogger, logutil.NewConsoleLogger())) // one go function runs the worker, closes 'done' when done go func() { log.Infof("Starting worker...") wrk.Run() close(currentDone) }() return currentDone, nil }
// ExecuteVtctlCommand is the server side method that will execute the query, // and stream the results. func (s *VtctlServer) ExecuteVtctlCommand(ctx context.Context, query *gorpcproto.ExecuteVtctlCommandArgs, sendReply func(interface{}) error) (err error) { defer vtctl.HandlePanic(&err) // create a logger, send the result back to the caller logstream := logutil.NewChannelLogger(10) logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. sendReply(&e) } wg.Done() }() // create the wrangler wr := wrangler.New(logger, s.ts, tmclient.NewTabletManagerClient(), query.LockTimeout) // FIXME(alainjobart) use a single context, copy the source info from it ctx, cancel := context.WithTimeout(context.TODO(), query.ActionTimeout) // execute the command err = vtctl.RunCommand(ctx, wr, query.Args) cancel() // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
// setAndStartWorker will set the current worker. // We always log to both memory logger (for display on the web) and // console logger (for records / display of command line worker). func (wi *Instance) setAndStartWorker(wrk Worker, wr *wrangler.Wrangler) (chan struct{}, error) { wi.currentWorkerMutex.Lock() defer wi.currentWorkerMutex.Unlock() if wi.currentContext != nil { return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR, fmt.Errorf("A worker job is already in progress: %v", wi.currentWorker)) } if wi.currentWorker != nil { // During the grace period, we answer with a retryable error. const gracePeriod = 1 * time.Minute gracePeriodEnd := time.Now().Add(gracePeriod) if wi.lastRunStopTime.Before(gracePeriodEnd) { return nil, vterrors.FromError(vtrpcpb.ErrorCode_TRANSIENT_ERROR, fmt.Errorf("A worker job was recently stopped (%f seconds ago): %v", time.Now().Sub(wi.lastRunStopTime).Seconds(), wi.currentWorker)) } // QUERY_NOT_SERVED = FailedPrecondition => manual resolution required. return nil, vterrors.FromError(vtrpcpb.ErrorCode_QUERY_NOT_SERVED, fmt.Errorf("The worker job was stopped %.1f minutes ago, but not reset. You have to reset it manually. Job: %v", time.Now().Sub(wi.lastRunStopTime).Minutes(), wi.currentWorker)) } wi.currentWorker = wrk wi.currentMemoryLogger = logutil.NewMemoryLogger() wi.currentContext, wi.currentCancelFunc = context.WithCancel(wi.backgroundContext) wi.lastRunError = nil wi.lastRunStopTime = time.Unix(0, 0) done := make(chan struct{}) wranglerLogger := wr.Logger() if wr == wi.wr { // If it's the default wrangler, do not reuse its logger because it may have been set before. // Resuing it would result into an endless recursion. wranglerLogger = logutil.NewConsoleLogger() } wr.SetLogger(logutil.NewTeeLogger(wi.currentMemoryLogger, wranglerLogger)) // one go function runs the worker, changes state when done go func() { log.Infof("Starting worker...") var err error // Catch all panics and always save the execution state at the end. defer func() { // The recovery code is a copy of servenv.HandlePanic(). if x := recover(); x != nil { log.Errorf("uncaught vtworker panic: %v\n%s", x, tb.Stack(4)) err = fmt.Errorf("uncaught vtworker panic: %v", x) } wi.currentWorkerMutex.Lock() wi.currentContext = nil wi.currentCancelFunc = nil wi.lastRunError = err wi.lastRunStopTime = time.Now() wi.currentWorkerMutex.Unlock() close(done) }() // run will take a long time err = wrk.Run(wi.currentContext) }() return done, nil }
// MultiRestore performs the multi-part restore. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) MultiRestore(args *actionnode.MultiRestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state // we only support restoring to the master or active replicas tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if tablet.Type != topo.TYPE_MASTER && !topo.IsSlaveType(tablet.Type) { return fmt.Errorf("expected master, or slave type, not %v", tablet.Type) } // get source tablets addresses sourceAddrs := make([]*url.URL, len(args.SrcTabletAliases)) keyRanges := make([]key.KeyRange, len(args.SrcTabletAliases)) fromStoragePaths := make([]string, len(args.SrcTabletAliases)) for i, alias := range args.SrcTabletAliases { t, e := agent.TopoServer.GetTablet(alias) if e != nil { return e } sourceAddrs[i] = &url.URL{ Host: t.Addr(), Path: "/" + t.DbName(), } keyRanges[i], e = key.KeyRangesOverlap(tablet.KeyRange, t.KeyRange) if e != nil { return e } fromStoragePaths[i] = path.Join(agent.Mysqld.SnapshotDir, "from-storage", fmt.Sprintf("from-%v-%v", keyRanges[i].Start.Hex(), keyRanges[i].End.Hex())) } // change type to restore, no change to replication graph originalType := tablet.Type tablet.Type = topo.TYPE_RESTORE err = topo.UpdateTablet(agent.TopoServer, tablet) if err != nil { return err } // first try to get the data from a remote storage wg := sync.WaitGroup{} rec := concurrency.AllErrorRecorder{} for i, alias := range args.SrcTabletAliases { wg.Add(1) go func(i int, alias topo.TabletAlias) { defer wg.Done() h := hook.NewSimpleHook("copy_snapshot_from_storage") h.ExtraEnv = make(map[string]string) for k, v := range agent.hookExtraEnv() { h.ExtraEnv[k] = v } h.ExtraEnv["KEYRANGE"] = fmt.Sprintf("%v-%v", keyRanges[i].Start.Hex(), keyRanges[i].End.Hex()) h.ExtraEnv["SNAPSHOT_PATH"] = fromStoragePaths[i] h.ExtraEnv["SOURCE_TABLET_ALIAS"] = alias.String() hr := h.Execute() if hr.ExitStatus != hook.HOOK_SUCCESS { rec.RecordError(fmt.Errorf("%v hook failed(%v): %v", h.Name, hr.ExitStatus, hr.Stderr)) } }(i, alias) } wg.Wait() // stop replication for slaves, so it doesn't interfere if topo.IsSlaveType(originalType) { if err := agent.Mysqld.StopSlave(map[string]string{"TABLET_ALIAS": tablet.Alias.String()}); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // run the action, scrap if it fails if rec.HasErrors() { log.Infof("Got errors trying to get snapshots from storage, trying to get them from original tablets: %v", rec.Error()) err = agent.Mysqld.MultiRestore(l, tablet.DbName(), keyRanges, sourceAddrs, nil, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy) } else { log.Infof("Got snapshots from storage, reading them from disk directly") err = agent.Mysqld.MultiRestore(l, tablet.DbName(), keyRanges, nil, fromStoragePaths, args.Concurrency, args.FetchConcurrency, args.InsertTableConcurrency, args.FetchRetryCount, args.Strategy) } if err != nil { if e := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); e != nil { log.Errorf("Failed to Scrap after failed RestoreFromMultiSnapshot: %v", e) } return err } // reload the schema agent.ReloadSchema() // restart replication if topo.IsSlaveType(originalType) { if err := agent.Mysqld.StartSlave(map[string]string{"TABLET_ALIAS": tablet.Alias.String()}); err != nil { return err } } // restore type back tablet.Type = originalType return topo.UpdateTablet(agent.TopoServer, tablet) }
// Snapshot takes a db snapshot // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Snapshot(args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) { // update our type to TYPE_BACKUP tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return nil, err } originalType := tablet.Type // ForceMasterSnapshot: Normally a master is not a viable tablet // to snapshot. However, there are degenerate cases where you need // to override this, for instance the initial clone of a new master. if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot { // In this case, we don't bother recomputing the serving graph. // All queries will have to fail anyway. log.Infof("force change type master -> backup") // There is a legitimate reason to force in the case of a single // master. tablet.Tablet.Type = topo.TYPE_BACKUP err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/) } if err != nil { return nil, err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet("snapshotStart"); err != nil { return nil, fmt.Errorf("failed to update state before snaphost: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv()) // and change our type to the appropriate value newType := originalType if returnErr != nil { log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr) } else { if args.ServerMode { log.Infof("server mode specified, switching tablet to snapshot_source mode") newType = topo.TYPE_SNAPSHOT_SOURCE } else { log.Infof("change type back after snapshot: %v", newType) } } if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE { log.Infof("force change type backup -> master: %v", tablet.Alias) tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/) } if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) returnErr = err } // if anything failed, don't return anything if returnErr != nil { return nil, returnErr } // it all worked, return the required information sr := &actionnode.SnapshotReply{ ManifestPath: filename, SlaveStartRequired: slaveStartRequired, ReadOnly: readOnly, } if tablet.Parent.Uid == topo.NO_TABLET { // If this is a master, this will be the new parent. // FIXME(msolomon) this doesn't work in hierarchical replication. sr.ParentAlias = tablet.Alias } else { sr.ParentAlias = tablet.Parent } return sr, nil }
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Restore(args *actionnode.RestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v", tablet.Type) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v", tablet.Type) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := agent.changeTypeToRestore(tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // do the work if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, agent.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // reload the schema agent.ReloadSchema() // change to TYPE_SPARE, we're done! return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true) }