func main() { flag.Parse() args := flag.Args() installSignalHandlers() servenv.Init() defer servenv.Close() ts := topo.GetServer() defer topo.CloseServers() // the logger will be replaced when we start a job wr = wrangler.New(logutil.NewConsoleLogger(), ts, 30*time.Second, 30*time.Second) if len(args) == 0 { // interactive mode, initialize the web UI to chose a command initInteractiveMode() } else { // single command mode, just runs it runCommand(args) } initStatusHandling() servenv.RunDefault() }
func (ar *ActionRepository) ApplyTabletAction(actionName string, tabletAlias topo.TabletAlias, r *http.Request) *ActionResult { result := &ActionResult{Name: actionName, Parameters: tabletAlias.String()} action, ok := ar.tabletActions[actionName] if !ok { result.error("Unknown tablet action") return result } // check the role if action.role != "" { if err := acl.CheckAccessHTTP(r, action.role); err != nil { result.error("Access denied") return result } } // run the action wr := wrangler.New(logutil.NewConsoleLogger(), ar.ts, *actionTimeout, *lockTimeout) output, err := action.method(wr, tabletAlias, r) if err != nil { result.error(err.Error()) return result } result.Output = output return result }
// ExecuteVtctlCommand is the server side method that will execute the query, // and stream the results. func (s *VtctlServer) ExecuteVtctlCommand(context context.Context, query *gorpcproto.ExecuteVtctlCommandArgs, sendReply func(interface{}) error) error { // create a logger, send the result back to the caller logstream := logutil.NewChannelLogger(10) logger := logutil.NewTeeLogger(logstream, logutil.NewConsoleLogger()) // send logs to the caller wg := sync.WaitGroup{} wg.Add(1) go func() { for e := range logstream { // Note we don't interrupt the loop here, as // we still need to flush and finish the // command, even if the channel to the client // has been broken. We'll just keep trying. sendReply(&e) } wg.Done() }() // create the wrangler wr := wrangler.New(logger, s.ts, query.ActionTimeout, query.LockTimeout) // execute the command err := vtctl.RunCommand(wr, query.Args) // close the log channel, and wait for them all to be sent close(logstream) wg.Wait() return err }
func init() { logger := logutil.NewConsoleLogger() flag.CommandLine.SetOutput(logutil.NewLoggerWriter(logger)) flag.Usage = func() { logger.Printf("Usage: %s [global parameters] command [command parameters]\n", os.Args[0]) logger.Printf("\nThe global optional parameters are:\n") flag.PrintDefaults() logger.Printf("\nThe commands are listed below, sorted by group. Use '%s <command> -h' for more help.\n\n", os.Args[0]) vtctl.PrintAllCommands(logger) } }
// rebuildShardIfNeeded will rebuild the serving graph if we need to func (agent *ActionAgent) rebuildShardIfNeeded(tablet *topo.TabletInfo, targetTabletType topo.TabletType) error { if topo.IsInServingGraph(targetTabletType) { // TODO: interrupted may need to be a global one closed when we exit interrupted := make(chan struct{}) // no need to take the shard lock in this case if _, err := topotools.RebuildShard(context.TODO(), logutil.NewConsoleLogger(), agent.TopoServer, tablet.Keyspace, tablet.Shard, []string{tablet.Alias.Cell}, agent.LockTimeout, interrupted); err != nil { return fmt.Errorf("topotools.RebuildShard returned an error: %v", err) } } return nil }
func snapshotCmd(mysqld *mysqlctl.Mysqld, subFlags *flag.FlagSet, args []string) { concurrency := subFlags.Int("concurrency", 4, "how many compression jobs to run simultaneously") subFlags.Parse(args) if subFlags.NArg() != 1 { log.Fatalf("Command snapshot requires <db name>") } filename, _, _, err := mysqld.CreateSnapshot(logutil.NewConsoleLogger(), subFlags.Arg(0), tabletAddr, false, *concurrency, false, nil) if err != nil { log.Fatalf("snapshot failed: %v", err) } else { log.Infof("manifest location: %v", filename) } }
func snapshotSourceStartCmd(mysqld *mysqlctl.Mysqld, subFlags *flag.FlagSet, args []string) { concurrency := subFlags.Int("concurrency", 4, "how many checksum jobs to run simultaneously") subFlags.Parse(args) if subFlags.NArg() != 1 { log.Fatalf("Command snapshotsourcestart requires <db name>") } filename, slaveStartRequired, readOnly, err := mysqld.CreateSnapshot(logutil.NewConsoleLogger(), subFlags.Arg(0), tabletAddr, false, *concurrency, true, nil) if err != nil { log.Fatalf("snapshot failed: %v", err) } else { log.Infof("manifest location: %v", filename) log.Infof("slave start required: %v", slaveStartRequired) log.Infof("read only: %v", readOnly) } }
func restoreCmd(mysqld *mysqlctl.Mysqld, subFlags *flag.FlagSet, args []string) { dontWaitForSlaveStart := subFlags.Bool("dont_wait_for_slave_start", false, "won't wait for replication to start (useful when restoring from master server)") fetchConcurrency := subFlags.Int("fetch_concurrency", 3, "how many files to fetch simultaneously") fetchRetryCount := subFlags.Int("fetch_retry_count", 3, "how many times to retry a failed transfer") subFlags.Parse(args) if subFlags.NArg() != 1 { log.Fatalf("Command restore requires <snapshot manifest file>") } rs, err := mysqlctl.ReadSnapshotManifest(subFlags.Arg(0)) if err == nil { err = mysqld.RestoreFromSnapshot(logutil.NewConsoleLogger(), rs, *fetchConcurrency, *fetchRetryCount, *dontWaitForSlaveStart, nil) } if err != nil { log.Fatalf("restore failed: %v", err) } }
func TestCopySchemaShard(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) sourceMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER, TabletKeyspaceShard(t, "ks", "-80")) sourceRdonly := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_RDONLY, TabletKeyspaceShard(t, "ks", "-80"), TabletParent(sourceMaster.Tablet.Alias)) destinationMaster := NewFakeTablet(t, wr, "cell1", 10, topo.TYPE_MASTER, TabletKeyspaceShard(t, "ks", "-40")) // one destination RdOnly, so we know that schema copies propogate from masters destinationRdonly := NewFakeTablet(t, wr, "cell1", 11, topo.TYPE_RDONLY, TabletKeyspaceShard(t, "ks", "-40"), TabletParent(destinationMaster.Tablet.Alias)) for _, ft := range []*FakeTablet{sourceMaster, sourceRdonly, destinationMaster, destinationRdonly} { ft.StartActionLoop(t, wr) defer ft.StopActionLoop(t) } sourceRdonly.FakeMysqlDaemon.Schema = &myproto.SchemaDefinition{ DatabaseSchema: "CREATE DATABASE `{{.DatabaseName}}` /*!40100 DEFAULT CHARACTER SET utf8 */", TableDefinitions: []*myproto.TableDefinition{ &myproto.TableDefinition{ Name: "table1", Schema: "CREATE TABLE `resharding1` (\n `id` bigint(20) NOT NULL AUTO_INCREMENT,\n `msg` varchar(64) DEFAULT NULL,\n `keyspace_id` bigint(20) unsigned NOT NULL,\n PRIMARY KEY (`id`),\n KEY `by_msg` (`msg`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8", Type: myproto.TABLE_BASE_TABLE, }, &myproto.TableDefinition{ Name: "view1", Schema: "CREATE TABLE `view1` (\n `id` bigint(20) NOT NULL AUTO_INCREMENT,\n `msg` varchar(64) DEFAULT NULL,\n `keyspace_id` bigint(20) unsigned NOT NULL,\n PRIMARY KEY (`id`),\n KEY `by_msg` (`msg`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8", Type: myproto.TABLE_VIEW, }, }, } destinationMaster.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t) destinationRdonly.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t) if err := wr.CopySchemaShard(sourceRdonly.Tablet.Alias, nil, nil, true, "ks", "-40"); err != nil { t.Fatalf("CopySchemaShard failed: %v", err) } }
// TestTabletExternallyReparentedWithDifferentMysqlPort makes sure // that if mysql is restarted on the master-elect tablet and has a different // port, we pick it up correctly. func TestTabletExternallyReparentedWithDifferentMysqlPort(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Now we're restarting mysql on a different port, 3301->3303 // but without updating the Tablet record in topology. // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED, so we need a MysqlDaemon // that returns no master, and the new port (as returned by mysql) newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.FakeMysqlDaemon.MysqlPort = 3303 newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED and point to the new mysql port oldMaster.FakeMysqlDaemon.MasterAddr = "101.0.0.1:3303" oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED and point to the new mysql port goodSlave.FakeMysqlDaemon.MasterAddr = "101.0.0.1:3303" goodSlave.StartActionLoop(t, wr) defer goodSlave.StopActionLoop(t) // This tests the good case, where everything works as planned t.Logf("TabletExternallyReparented(new master) expecting success") tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } }
func (ar *ActionRepository) ApplyKeyspaceAction(actionName, keyspace string, r *http.Request) *ActionResult { result := &ActionResult{Name: actionName, Parameters: keyspace} action, ok := ar.keyspaceActions[actionName] if !ok { result.error("Unknown keyspace action") return result } wr := wrangler.New(logutil.NewConsoleLogger(), ar.ts, *actionTimeout, *lockTimeout) output, err := action(wr, keyspace, r) if err != nil { result.error(err.Error()) return result } result.Output = output return result }
func main() { flag.Parse() servenv.Init() ts := topo.GetServer() scheduler, err := janitor.New(*keyspace, *shard, ts, wrangler.New(logutil.NewConsoleLogger(), ts, *actionTimeout, *lockTimeout), *sleepTime) if err != nil { log.Fatalf("janitor.New: %v", err) } if len(activeModules)+len(dryRunModules) < 1 { log.Fatal("no modules to run specified") } scheduler.Enable(activeModules) scheduler.EnableDryRun(dryRunModules) go scheduler.Run() servenv.RunDefault() }
// setAndStartWorker will set the current worker. // We always log to both memory logger (for display on the web) and // console logger (for records / display of command line worker). func setAndStartWorker(wrk worker.Worker) (chan struct{}, error) { currentWorkerMutex.Lock() defer currentWorkerMutex.Unlock() if currentWorker != nil { return nil, fmt.Errorf("A worker is already in progress: %v", currentWorker) } currentWorker = wrk currentMemoryLogger = logutil.NewMemoryLogger() currentDone = make(chan struct{}) wr.SetLogger(logutil.NewTeeLogger(currentMemoryLogger, logutil.NewConsoleLogger())) // one go function runs the worker, closes 'done' when done go func() { log.Infof("Starting worker...") wrk.Run() close(currentDone) }() return currentDone, nil }
func (ar *ActionRepository) ApplyShardAction(actionName, keyspace, shard string, r *http.Request) *ActionResult { // if the shard name contains a '-', we assume it's the // name for a ranged based shard, so we lower case it. if strings.Contains(shard, "-") { shard = strings.ToLower(shard) } result := &ActionResult{Name: actionName, Parameters: keyspace + "/" + shard} action, ok := ar.shardActions[actionName] if !ok { result.error("Unknown shard action") return result } wr := wrangler.New(logutil.NewConsoleLogger(), ar.ts, *actionTimeout, *lockTimeout) output, err := action(wr, keyspace, shard, r) if err != nil { result.error(err.Error()) return result } result.Output = output return result }
func main() { defer exit.RecoverAll() defer logutil.Flush() flag.Parse() args := flag.Args() if len(args) == 0 { flag.Usage() exit.Return(1) } action := args[0] installSignalHandlers() startMsg := fmt.Sprintf("USER=%v SUDO_USER=%v %v", os.Getenv("USER"), os.Getenv("SUDO_USER"), strings.Join(os.Args, " ")) if syslogger, err := syslog.New(syslog.LOG_INFO, "vtctl "); err == nil { syslogger.Info(startMsg) } else { log.Warningf("cannot connect to syslog: %v", err) } topoServer := topo.GetServer() defer topo.CloseServers() wr := wrangler.New(logutil.NewConsoleLogger(), topoServer, *waitTime, *lockWaitTimeout) err := vtctl.RunCommand(wr, args) switch err { case vtctl.ErrUnknownCommand: flag.Usage() exit.Return(1) case nil: // keep going default: log.Errorf("action failed: %v %v", action, err) exit.Return(255) } }
func TestTabletExternallyReparented(t *testing.T) { ctx := context.Background() ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves, one bad slave oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave1 := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave2 := NewFakeTablet(t, wr, "cell2", 3, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) badSlave := NewFakeTablet(t, wr, "cell1", 4, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Add a new Cell to the Shard, that doesn't map to any read topo cell, // to simulate a data center being unreachable. si, err := ts.GetShard("test_keyspace", "0") if err != nil { t.Fatalf("GetShard failed: %v", err) } si.Cells = append(si.Cells, "cell666") if err := topo.UpdateShard(ctx, ts, si); err != nil { t.Fatalf("UpdateShard failed: %v", err) } // Slightly unrelated test: make sure we can find the tablets // even with a datacenter being down. tabletMap, err := topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell1"}) if err != nil { t.Fatalf("GetTabletMapForShardByCell should have worked but got: %v", err) } master, err := topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } slave1, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave1.Tablet.IPAddr, "vt", goodSlave1.Tablet.Portmap["vt"]) if err != nil || slave1 != goodSlave1.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(slave1) failed: %v %v", err, master) } slave2, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave2.Tablet.IPAddr, "vt", goodSlave2.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(slave2) worked: %v %v", err, slave2) } // Make sure the master is not exported in other cells tabletMap, err = topo.GetTabletMapForShardByCell(ctx, ts, "test_keyspace", "0", []string{"cell2"}) master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != topo.ErrNoNode { t.Fatalf("FindTabletByIPAddrAndPort(master) worked in cell2: %v %v", err, master) } tabletMap, err = topo.GetTabletMapForShard(ctx, ts, "test_keyspace", "0") if err != topo.ErrPartialResult { t.Fatalf("GetTabletMapForShard should have returned ErrPartialResult but got: %v", err) } master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"]) if err != nil || master != oldMaster.Tablet.Alias { t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master) } // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. oldMaster.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() oldMaster.StartActionLoop(t, wr) defer oldMaster.StopActionLoop(t) // On the good slaves, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave1.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave1.StartActionLoop(t, wr) defer goodSlave1.StopActionLoop(t) goodSlave2.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave2.StartActionLoop(t, wr) defer goodSlave2.StopActionLoop(t) // On the bad slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED with bad data. badSlave.FakeMysqlDaemon.MasterAddr = "234.0.0.1:3301" badSlave.StartActionLoop(t, wr) defer badSlave.StopActionLoop(t) // First test: reparent to the same master, make sure it works // as expected. tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(oldMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(same master) should have worked") } // Second test: reparent to a replica, and pretend the old // master is still good to go. // This tests a bad case; the new designated master is a slave, // but we should do what we're told anyway ti, err = ts.GetTablet(goodSlave1.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(slave) error: %v", err) } // This tests the good case, where everything works as planned t.Logf("TabletExternallyReparented(new master) expecting success") ti, err = ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } }
func main() { flag.Parse() servenv.Init() defer servenv.Close() templateLoader = NewTemplateLoader(*templateDir, dummyTemplate, *debug) ts = topo.GetServer() defer topo.CloseServers() wr := wrangler.New(logutil.NewConsoleLogger(), ts, 30*time.Second, 30*time.Second) actionRepo = NewActionRepository(ts) // keyspace actions actionRepo.RegisterKeyspaceAction("ValidateKeyspace", func(wr *wrangler.Wrangler, keyspace string, r *http.Request) (string, error) { return "", wr.ValidateKeyspace(keyspace, false) }) actionRepo.RegisterKeyspaceAction("ValidateSchemaKeyspace", func(wr *wrangler.Wrangler, keyspace string, r *http.Request) (string, error) { return "", wr.ValidateSchemaKeyspace(keyspace, nil, false) }) actionRepo.RegisterKeyspaceAction("ValidateVersionKeyspace", func(wr *wrangler.Wrangler, keyspace string, r *http.Request) (string, error) { return "", wr.ValidateVersionKeyspace(keyspace) }) actionRepo.RegisterKeyspaceAction("ValidatePermissionsKeyspace", func(wr *wrangler.Wrangler, keyspace string, r *http.Request) (string, error) { return "", wr.ValidatePermissionsKeyspace(keyspace) }) // shard actions actionRepo.RegisterShardAction("ValidateShard", func(wr *wrangler.Wrangler, keyspace, shard string, r *http.Request) (string, error) { return "", wr.ValidateShard(keyspace, shard, false) }) actionRepo.RegisterShardAction("ValidateSchemaShard", func(wr *wrangler.Wrangler, keyspace, shard string, r *http.Request) (string, error) { return "", wr.ValidateSchemaShard(keyspace, shard, nil, false) }) actionRepo.RegisterShardAction("ValidateVersionShard", func(wr *wrangler.Wrangler, keyspace, shard string, r *http.Request) (string, error) { return "", wr.ValidateVersionShard(keyspace, shard) }) actionRepo.RegisterShardAction("ValidatePermissionsShard", func(wr *wrangler.Wrangler, keyspace, shard string, r *http.Request) (string, error) { return "", wr.ValidatePermissionsShard(keyspace, shard) }) // tablet actions actionRepo.RegisterTabletAction("Ping", "", func(wr *wrangler.Wrangler, tabletAlias topo.TabletAlias, r *http.Request) (string, error) { ti, err := wr.TopoServer().GetTablet(tabletAlias) if err != nil { return "", err } return "", wr.TabletManagerClient().Ping(wr.Context(), ti) }) actionRepo.RegisterTabletAction("ScrapTablet", acl.ADMIN, func(wr *wrangler.Wrangler, tabletAlias topo.TabletAlias, r *http.Request) (string, error) { // refuse to scrap tablets that are not spare ti, err := wr.TopoServer().GetTablet(tabletAlias) if err != nil { return "", err } if ti.Type != topo.TYPE_SPARE { return "", fmt.Errorf("Can only scrap spare tablets") } return "", wr.Scrap(tabletAlias, false, false) }) actionRepo.RegisterTabletAction("ScrapTabletForce", acl.ADMIN, func(wr *wrangler.Wrangler, tabletAlias topo.TabletAlias, r *http.Request) (string, error) { // refuse to scrap tablets that are not spare ti, err := wr.TopoServer().GetTablet(tabletAlias) if err != nil { return "", err } if ti.Type != topo.TYPE_SPARE { return "", fmt.Errorf("Can only scrap spare tablets") } return "", wr.Scrap(tabletAlias, true, false) }) actionRepo.RegisterTabletAction("DeleteTablet", acl.ADMIN, func(wr *wrangler.Wrangler, tabletAlias topo.TabletAlias, r *http.Request) (string, error) { return "", wr.DeleteTablet(tabletAlias) }) // toplevel index http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { templateLoader.ServeTemplate("index.html", indexContent, w, r) }) // keyspace actions http.HandleFunc("/keyspace_actions", func(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { httpError(w, "cannot parse form: %s", err) return } action := r.FormValue("action") if action == "" { http.Error(w, "no action provided", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "no keyspace provided", http.StatusBadRequest) return } result := actionRepo.ApplyKeyspaceAction(action, keyspace, r) templateLoader.ServeTemplate("action.html", result, w, r) }) // shard actions http.HandleFunc("/shard_actions", func(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { httpError(w, "cannot parse form: %s", err) return } action := r.FormValue("action") if action == "" { http.Error(w, "no action provided", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "no keyspace provided", http.StatusBadRequest) return } shard := r.FormValue("shard") if shard == "" { http.Error(w, "no shard provided", http.StatusBadRequest) return } result := actionRepo.ApplyShardAction(action, keyspace, shard, r) templateLoader.ServeTemplate("action.html", result, w, r) }) // tablet actions http.HandleFunc("/tablet_actions", func(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { httpError(w, "cannot parse form: %s", err) return } action := r.FormValue("action") if action == "" { http.Error(w, "no action provided", http.StatusBadRequest) return } alias := r.FormValue("alias") if alias == "" { http.Error(w, "no alias provided", http.StatusBadRequest) return } tabletAlias, err := topo.ParseTabletAliasString(alias) if err != nil { http.Error(w, "bad alias provided", http.StatusBadRequest) return } result := actionRepo.ApplyTabletAction(action, tabletAlias, r) templateLoader.ServeTemplate("action.html", result, w, r) }) // topology server http.HandleFunc("/dbtopo", func(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { httpError(w, "cannot parse form: %s", err) return } result := DbTopologyResult{} topology, err := topotools.DbTopology(context.TODO(), wr.TopoServer()) if err != nil { result.Error = err.Error() } else { result.Topology = topology } templateLoader.ServeTemplate("dbtopo.html", result, w, r) }) // serving graph http.HandleFunc("/serving_graph/", func(w http.ResponseWriter, r *http.Request) { parts := strings.Split(r.URL.Path, "/") cell := parts[len(parts)-1] if cell == "" { cells, err := ts.GetKnownCells() if err != nil { httpError(w, "cannot get known cells: %v", err) return } else { templateLoader.ServeTemplate("serving_graph_cells.html", cells, w, r) } return } servingGraph := topotools.DbServingGraph(wr.TopoServer(), cell) templateLoader.ServeTemplate("serving_graph.html", servingGraph, w, r) }) // redirects for explorers http.HandleFunc("/explorers/redirect", func(w http.ResponseWriter, r *http.Request) { if err := r.ParseForm(); err != nil { httpError(w, "cannot parse form: %s", err) return } var explorer Explorer switch len(explorers) { case 0: http.Error(w, "no explorer configured", http.StatusInternalServerError) return case 1: for _, ex := range explorers { explorer = ex } default: explorerName := r.FormValue("explorer") var ok bool explorer, ok = explorers[explorerName] if !ok { http.Error(w, "bad explorer name", http.StatusBadRequest) return } } var target string switch r.FormValue("type") { case "keyspace": keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetKeyspacePath(keyspace) case "shard": keyspace, shard := r.FormValue("keyspace"), r.FormValue("shard") if keyspace == "" || shard == "" { http.Error(w, "keyspace and shard are obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetShardPath(keyspace, shard) case "srv_keyspace": cell := r.FormValue("cell") if cell == "" { http.Error(w, "cell is obligatory for this redirect", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetSrvKeyspacePath(cell, keyspace) case "srv_shard": cell := r.FormValue("cell") if cell == "" { http.Error(w, "cell is obligatory for this redirect", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } shard := r.FormValue("shard") if shard == "" { http.Error(w, "shard is obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetSrvShardPath(cell, keyspace, shard) case "srv_type": cell := r.FormValue("cell") if cell == "" { http.Error(w, "cell is obligatory for this redirect", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } shard := r.FormValue("shard") if shard == "" { http.Error(w, "shard is obligatory for this redirect", http.StatusBadRequest) return } tabletType := r.FormValue("tablet_type") if tabletType == "" { http.Error(w, "tablet_type is obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetSrvTypePath(cell, keyspace, shard, topo.TabletType(tabletType)) case "tablet": aliasName := r.FormValue("alias") if aliasName == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } alias, err := topo.ParseTabletAliasString(aliasName) if err != nil { http.Error(w, "bad tablet alias", http.StatusBadRequest) return } target = explorer.GetTabletPath(alias) case "replication": cell := r.FormValue("cell") if cell == "" { http.Error(w, "cell is obligatory for this redirect", http.StatusBadRequest) return } keyspace := r.FormValue("keyspace") if keyspace == "" { http.Error(w, "keyspace is obligatory for this redirect", http.StatusBadRequest) return } shard := r.FormValue("shard") if shard == "" { http.Error(w, "shard is obligatory for this redirect", http.StatusBadRequest) return } target = explorer.GetReplicationSlaves(cell, keyspace, shard) default: http.Error(w, "bad redirect type", http.StatusBadRequest) return } http.Redirect(w, r, target, http.StatusFound) }) servenv.RunDefault() }
func TestTabletExternallyReparentedFailedOldMaster(t *testing.T) { ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) // Create an old master, a new master, two good slaves oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER) newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) goodSlave := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA, TabletParent(oldMaster.Tablet.Alias)) // Reparent to a replica, and pretend the old master is not responding // On the elected master, we will respond to // TABLET_ACTION_SLAVE_WAS_PROMOTED newMaster.FakeMysqlDaemon.MasterAddr = "" newMaster.StartActionLoop(t, wr) defer newMaster.StopActionLoop(t) // On the old master, we will only get a // TABLET_ACTION_SLAVE_WAS_RESTARTED call, let's just not // respond to it at all // On the good slave, we will respond to // TABLET_ACTION_SLAVE_WAS_RESTARTED. goodSlave.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr() goodSlave.StartActionLoop(t, wr) defer goodSlave.StopActionLoop(t) // The reparent should work as expected here t.Logf("TabletExternallyReparented(new master) expecting success") tmc := tmclient.NewTabletManagerClient() ti, err := ts.GetTablet(newMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet failed: %v", err) } if err := tmc.TabletExternallyReparented(wr.Context(), ti, ""); err != nil { t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) } // Now double-check the serving graph is good. // Should only have one good replica left. addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA) if err != nil { t.Fatalf("GetEndPoints failed at the end: %v", err) } if len(addrs.Entries) != 1 { t.Fatalf("GetEndPoints has too many entries: %v", addrs) } // check the old master was converted to spare tablet, err := ts.GetTablet(oldMaster.Tablet.Alias) if err != nil { t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) } if tablet.Type != topo.TYPE_SPARE { t.Fatalf("old master should be spare but is: %v", tablet.Type) } if tablet.Parent != newMaster.Tablet.Alias { t.Fatalf("old master has the wrong master, got %v expected %v", tablet.Parent, newMaster.Tablet.Alias) } }
func testSplitClone(t *testing.T, strategy string) { ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second) sourceMaster := testlib.NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER, testlib.TabletKeyspaceShard(t, "ks", "-80")) sourceRdonly1 := testlib.NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_RDONLY, testlib.TabletKeyspaceShard(t, "ks", "-80"), testlib.TabletParent(sourceMaster.Tablet.Alias)) sourceRdonly2 := testlib.NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_RDONLY, testlib.TabletKeyspaceShard(t, "ks", "-80"), testlib.TabletParent(sourceMaster.Tablet.Alias)) leftMaster := testlib.NewFakeTablet(t, wr, "cell1", 10, topo.TYPE_MASTER, testlib.TabletKeyspaceShard(t, "ks", "-40")) leftRdonly := testlib.NewFakeTablet(t, wr, "cell1", 11, topo.TYPE_RDONLY, testlib.TabletKeyspaceShard(t, "ks", "-40"), testlib.TabletParent(leftMaster.Tablet.Alias)) rightMaster := testlib.NewFakeTablet(t, wr, "cell1", 20, topo.TYPE_MASTER, testlib.TabletKeyspaceShard(t, "ks", "40-80")) rightRdonly := testlib.NewFakeTablet(t, wr, "cell1", 21, topo.TYPE_RDONLY, testlib.TabletKeyspaceShard(t, "ks", "40-80"), testlib.TabletParent(rightMaster.Tablet.Alias)) for _, ft := range []*testlib.FakeTablet{sourceMaster, sourceRdonly1, sourceRdonly2, leftMaster, leftRdonly, rightMaster, rightRdonly} { ft.StartActionLoop(t, wr) defer ft.StopActionLoop(t) } // add the topo and schema data we'll need if err := topo.CreateShard(ts, "ks", "80-"); err != nil { t.Fatalf("CreateShard(\"-80\") failed: %v", err) } if err := wr.SetKeyspaceShardingInfo("ks", "keyspace_id", key.KIT_UINT64, 4, false); err != nil { t.Fatalf("SetKeyspaceShardingInfo failed: %v", err) } if err := wr.RebuildKeyspaceGraph("ks", nil); err != nil { t.Fatalf("RebuildKeyspaceGraph failed: %v", err) } gwrk, err := NewSplitCloneWorker(wr, "cell1", "ks", "-80", nil, strategy, 10 /*sourceReaderCount*/, 4 /*destinationPackCount*/, 1 /*minTableSizeForSplit*/, 10 /*destinationWriterCount*/) if err != nil { t.Errorf("Worker creation failed: %v", err) } wrk := gwrk.(*SplitCloneWorker) for _, sourceRdonly := range []*testlib.FakeTablet{sourceRdonly1, sourceRdonly2} { sourceRdonly.FakeMysqlDaemon.Schema = &myproto.SchemaDefinition{ DatabaseSchema: "", TableDefinitions: []*myproto.TableDefinition{ &myproto.TableDefinition{ Name: "table1", Columns: []string{"id", "msg", "keyspace_id"}, PrimaryKeyColumns: []string{"id"}, Type: myproto.TABLE_BASE_TABLE, // This informs how many rows we can pack into a single insert DataLength: 2048, }, }, } sourceRdonly.FakeMysqlDaemon.DbaConnectionFactory = SourceRdonlyFactory(t) sourceRdonly.FakeMysqlDaemon.CurrentSlaveStatus = &myproto.ReplicationStatus{ Position: myproto.ReplicationPosition{ GTIDSet: myproto.MariadbGTID{Domain: 12, Server: 34, Sequence: 5678}, }, } sourceRdonly.RpcServer.Register(&SqlQuery{t: t}) } // We read 100 source rows. sourceReaderCount is set to 10, so // we'll have 100/10=10 rows per table chunk. // destinationPackCount is set to 4, so we take 4 source rows // at once. So we'll process 4 + 4 + 2 rows to get to 10. // That means 3 insert statements on each target (each // containing half of the rows, i.e. 2 + 2 + 1 rows). So 3 * 10 // = 30 insert statements on each destination. leftMaster.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t, 30) leftRdonly.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t, 30) rightMaster.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t, 30) rightRdonly.FakeMysqlDaemon.DbaConnectionFactory = DestinationsFactory(t, 30) wrk.Run() status := wrk.StatusAsText() t.Logf("Got status: %v", status) if wrk.err != nil || wrk.state != stateSCDone { t.Errorf("Worker run failed") } }
// Operate on restore tablet. // Check that the SnapshotManifest is valid and the master has not changed. // Shutdown mysqld. // Load the snapshot from source tablet. // Restart mysqld and replication. // Put tablet into the replication graph as a spare. // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Restore(ctx context.Context, args *actionnode.RestoreArgs, logger logutil.Logger) error { // read our current tablet, verify its state tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return err } if args.WasReserved { if tablet.Type != topo.TYPE_RESTORE { return fmt.Errorf("expected restore type, not %v", tablet.Type) } } else { if tablet.Type != topo.TYPE_IDLE { return fmt.Errorf("expected idle type, not %v", tablet.Type) } } // read the source tablet, compute args.SrcFilePath if default sourceTablet, err := agent.TopoServer.GetTablet(args.SrcTabletAlias) if err != nil { return err } if strings.ToLower(args.SrcFilePath) == "default" { args.SrcFilePath = path.Join(mysqlctl.SnapshotURLPath, mysqlctl.SnapshotManifestFile) } // read the parent tablet, verify its state parentTablet, err := agent.TopoServer.GetTablet(args.ParentAlias) if err != nil { return err } if parentTablet.Type != topo.TYPE_MASTER && parentTablet.Type != topo.TYPE_SNAPSHOT_SOURCE { return fmt.Errorf("restore expected master or snapshot_source parent: %v %v", parentTablet.Type, args.ParentAlias) } // read & unpack the manifest sm := new(mysqlctl.SnapshotManifest) if err := fetchAndParseJsonFile(sourceTablet.Addr(), args.SrcFilePath, sm); err != nil { return err } if !args.WasReserved { if err := agent.changeTypeToRestore(ctx, tablet, sourceTablet, parentTablet.Alias, sourceTablet.KeyRange); err != nil { return err } } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // do the work if err := agent.Mysqld.RestoreFromSnapshot(l, sm, args.FetchConcurrency, args.FetchRetryCount, args.DontWaitForSlaveStart, agent.hookExtraEnv()); err != nil { log.Errorf("RestoreFromSnapshot failed (%v), scrapping", err) if err := topotools.Scrap(agent.TopoServer, agent.TabletAlias, false); err != nil { log.Errorf("Failed to Scrap after failed RestoreFromSnapshot: %v", err) } return err } // reload the schema agent.ReloadSchema(ctx) // change to TYPE_SPARE, we're done! return topotools.ChangeType(agent.TopoServer, agent.TabletAlias, topo.TYPE_SPARE, nil, true) }
// Snapshot takes a db snapshot // Should be called under RpcWrapLockAction. func (agent *ActionAgent) Snapshot(ctx context.Context, args *actionnode.SnapshotArgs, logger logutil.Logger) (*actionnode.SnapshotReply, error) { // update our type to TYPE_BACKUP tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return nil, err } originalType := tablet.Type // ForceMasterSnapshot: Normally a master is not a viable tablet // to snapshot. However, there are degenerate cases where you need // to override this, for instance the initial clone of a new master. if tablet.Type == topo.TYPE_MASTER && args.ForceMasterSnapshot { // In this case, we don't bother recomputing the serving graph. // All queries will have to fail anyway. log.Infof("force change type master -> backup") // There is a legitimate reason to force in the case of a single // master. tablet.Tablet.Type = topo.TYPE_BACKUP err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, topo.TYPE_BACKUP, make(map[string]string), true /*runHooks*/) } if err != nil { return nil, err } // let's update our internal state (stop query service and other things) if err := agent.refreshTablet(ctx, "snapshotStart"); err != nil { return nil, fmt.Errorf("failed to update state before snaphost: %v", err) } // create the loggers: tee to console and source l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger) // now we can run the backup filename, slaveStartRequired, readOnly, returnErr := agent.Mysqld.CreateSnapshot(l, tablet.DbName(), tablet.Addr(), false, args.Concurrency, args.ServerMode, agent.hookExtraEnv()) // and change our type to the appropriate value newType := originalType if returnErr != nil { log.Errorf("snapshot failed, restoring tablet type back to %v: %v", newType, returnErr) } else { if args.ServerMode { log.Infof("server mode specified, switching tablet to snapshot_source mode") newType = topo.TYPE_SNAPSHOT_SOURCE } else { log.Infof("change type back after snapshot: %v", newType) } } if tablet.Parent.Uid == topo.NO_TABLET && args.ForceMasterSnapshot && newType != topo.TYPE_SNAPSHOT_SOURCE { log.Infof("force change type backup -> master: %v", tablet.Alias) tablet.Tablet.Type = topo.TYPE_MASTER err = topo.UpdateTablet(ctx, agent.TopoServer, tablet) } else { err = topotools.ChangeType(agent.TopoServer, tablet.Alias, newType, nil, true /*runHooks*/) } if err != nil { // failure in changing the topology type is probably worse, // so returning that (we logged the snapshot error anyway) returnErr = err } // if anything failed, don't return anything if returnErr != nil { return nil, returnErr } // it all worked, return the required information sr := &actionnode.SnapshotReply{ ManifestPath: filename, SlaveStartRequired: slaveStartRequired, ReadOnly: readOnly, } if tablet.Parent.Uid == topo.NO_TABLET { // If this is a master, this will be the new parent. // FIXME(msolomon) this doesn't work in hierarchical replication. sr.ParentAlias = tablet.Alias } else { sr.ParentAlias = tablet.Parent } return sr, nil }
// tabletExternallyReparentedLocked is called with the shard lock. // It returns if agent.refreshTablet should be called, and the error. // Note both are set independently (can have both true and an error). func (agent *ActionAgent) tabletExternallyReparentedLocked(ctx context.Context, externalID string, interrupted chan struct{}) (bool, error) { // re-read the tablet record to be sure we have the latest version tablet, err := agent.TopoServer.GetTablet(agent.TabletAlias) if err != nil { return false, err } // read the shard, make sure again the master is not already good. shardInfo, err := agent.TopoServer.GetShard(tablet.Keyspace, tablet.Shard) if err != nil { return false, err } if shardInfo.MasterAlias == tablet.Alias { log.Infof("TabletExternallyReparented: tablet became the master before we get the lock?") return false, nil } log.Infof("TabletExternallyReparented called and we're not the master, doing the work") // Read the tablets, make sure the master elect is known to the shard // (it's this tablet, so it better be!). // Note we will keep going with a partial tablet map, which usually // happens when a cell is not reachable. After these checks, the // guarantees we'll have are: // - global cell is reachable (we just locked and read the shard) // - the local cell that contains the new master is reachable // (as we're going to check the new master is in the list) // That should be enough. tabletMap, err := topo.GetTabletMapForShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard) switch err { case nil: // keep going case topo.ErrPartialResult: log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets") default: return false, err } masterElectTablet, ok := tabletMap[tablet.Alias] if !ok { return false, fmt.Errorf("this master-elect tablet %v not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap)) } // Create reusable Reparent event with available info ev := &events.Reparent{ ShardInfo: *shardInfo, NewMaster: *tablet.Tablet, ExternalID: externalID, } if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok { ev.OldMaster = *oldMasterTablet.Tablet } defer func() { if err != nil { event.DispatchUpdate(ev, "failed: "+err.Error()) } }() // sort the tablets, and handle them slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap) event.DispatchUpdate(ev, "starting external from tablet") // We fix the new master in the replication graph. // Note after this call, we may have changed the tablet record, // so we will always return true, so the tablet record is re-read // by the agent. event.DispatchUpdate(ev, "mark ourself as new master") err = agent.updateReplicationGraphForPromotedSlave(ctx, tablet) if err != nil { // This suggests we can't talk to topo server. This is bad. return true, fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err) } // Once this tablet is promoted, remove it from our maps delete(slaveTabletMap, tablet.Alias) delete(masterTabletMap, tablet.Alias) // Then fix all the slaves, including the old master. This // last step is very likely to time out for some tablets (one // random guy is dead, the old master is dead, ...). We // execute them all in parallel until we get to // wr.ActionTimeout(). After this, no other action with a // timeout is executed, so even if we got to the timeout, // we're still good. event.DispatchUpdate(ev, "restarting slaves") logger := logutil.NewConsoleLogger() tmc := tmclient.NewTabletManagerClient() topotools.RestartSlavesExternal(agent.TopoServer, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error { return tmc.SlaveWasRestarted(ctx, ti, swrd) }) // Compute the list of Cells we need to rebuild: old master and // all other cells if reparenting to another cell. cells := []string{shardInfo.MasterAlias.Cell} if shardInfo.MasterAlias.Cell != tablet.Alias.Cell { cells = nil } // now update the master record in the shard object event.DispatchUpdate(ev, "updating shard record") log.Infof("Updating Shard's MasterAlias record") shardInfo.MasterAlias = tablet.Alias if err = topo.UpdateShard(ctx, agent.TopoServer, shardInfo); err != nil { return true, err } // and rebuild the shard serving graph event.DispatchUpdate(ev, "rebuilding shard serving graph") log.Infof("Rebuilding shard serving graph data") if _, err = topotools.RebuildShard(ctx, logger, agent.TopoServer, tablet.Keyspace, tablet.Shard, cells, agent.LockTimeout, interrupted); err != nil { return true, err } event.DispatchUpdate(ev, "finished") return true, nil }