// processTablet looks up the replication position of a single tablet and
// records it as the new maximum if it is ahead of the current one.
func (maxPosSearch *maxReplPosSearch) processTablet(tablet *topodatapb.Tablet) {
	defer maxPosSearch.waitGroup.Done()
	maxPosSearch.wrangler.logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias))

	slaveStatusCtx, cancelSlaveStatus := context.WithTimeout(maxPosSearch.ctx, maxPosSearch.waitSlaveTimeout)
	defer cancelSlaveStatus()

	status, err := maxPosSearch.wrangler.tmc.SlaveStatus(slaveStatusCtx, tablet)
	if err != nil {
		maxPosSearch.wrangler.logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", topoproto.TabletAliasString(tablet.Alias), err)
		return
	}
	replPos, err := replication.DecodePosition(status.Position)
	if err != nil {
		maxPosSearch.wrangler.logger.Warningf("cannot decode slave %v position %v: %v", topoproto.TabletAliasString(tablet.Alias), status.Position, err)
		return
	}

	maxPosSearch.maxPosLock.Lock()
	if maxPosSearch.maxPosTablet == nil || !maxPosSearch.maxPos.AtLeast(replPos) {
		maxPosSearch.maxPos = replPos
		maxPosSearch.maxPosTablet = tablet
	}
	maxPosSearch.maxPosLock.Unlock()
}
// ReparentTablet tells a tablet to reparent to the current master, based
// on the current replication position. If there is no match, it will fail.
func (wr *Wrangler) ReparentTablet(ctx context.Context, tabletAlias *pb.TabletAlias) error {
	// The plan:
	// - Get the specified tablet.
	// - Get the current shard master tablet.
	// - Sanity check that they are in the same keyspace/shard.
	// - Issue a SetMaster to the tablet.
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}

	shardInfo, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard)
	if err != nil {
		return err
	}
	if !shardInfo.HasMaster() {
		return fmt.Errorf("no master tablet for shard %v/%v", ti.Keyspace, ti.Shard)
	}

	masterTi, err := wr.ts.GetTablet(ctx, shardInfo.MasterAlias)
	if err != nil {
		return err
	}

	// Basic sanity checking.
	if masterTi.Type != pb.TabletType_MASTER {
		return fmt.Errorf("TopologyServer has inconsistent state for shard master %v", topoproto.TabletAliasString(shardInfo.MasterAlias))
	}
	if masterTi.Keyspace != ti.Keyspace || masterTi.Shard != ti.Shard {
		return fmt.Errorf("master %v and potential slave not in same keyspace/shard", topoproto.TabletAliasString(shardInfo.MasterAlias))
	}

	// And do the remote command.
	return wr.TabletManagerClient().SetMaster(ctx, ti, shardInfo.MasterAlias, 0, false)
}
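// A minimal usage sketch (illustrative, not part of the original source):
// assuming a *Wrangler "wr" and a context, an operator-supplied alias in the
// "cell-uid" string form can be parsed with topoproto.ParseTabletAlias and
// reparented directly. The alias value below is a made-up example.
//
//	alias, err := topoproto.ParseTabletAlias("cell1-0000000101")
//	if err != nil {
//		return err
//	}
//	if err := wr.ReparentTablet(ctx, alias); err != nil {
//		return fmt.Errorf("ReparentTablet(%v) failed: %v", topoproto.TabletAliasString(alias), err)
//	}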
// getMastersPosition returns the current master replication position for
// each of the given shards, gathering them in parallel.
func (wr *Wrangler) getMastersPosition(ctx context.Context, shards []*topo.ShardInfo) (map[*topo.ShardInfo]string, error) {
	mu := sync.Mutex{}
	result := make(map[*topo.ShardInfo]string)

	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, si := range shards {
		wg.Add(1)
		go func(si *topo.ShardInfo) {
			defer wg.Done()
			wr.Logger().Infof("Gathering master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			ti, err := wr.ts.GetTablet(ctx, si.MasterAlias)
			if err != nil {
				rec.RecordError(err)
				return
			}

			pos, err := wr.tmc.MasterPosition(ctx, ti)
			if err != nil {
				rec.RecordError(err)
				return
			}

			wr.Logger().Infof("Got master position for %v", topoproto.TabletAliasString(si.MasterAlias))
			mu.Lock()
			result[si] = pos
			mu.Unlock()
		}(si)
	}
	wg.Wait()
	return result, rec.Error()
}
// rpcWrapper handles all the logic for rpc calls.
func (agent *ActionAgent) rpcWrapper(ctx context.Context, name string, args, reply interface{}, verbose bool, f func() error, lock, runAfterAction bool) (err error) {
	defer func() {
		if x := recover(); x != nil {
			log.Errorf("TabletManager.%v(%v) on %v panic: %v\n%s", name, args, topoproto.TabletAliasString(agent.TabletAlias), x, tb.Stack(4))
			err = fmt.Errorf("caught panic during %v: %v", name, x)
		}
	}()

	from := ""
	ci, ok := callinfo.FromContext(ctx)
	if ok {
		from = ci.Text()
	}

	if lock {
		beforeLock := time.Now()
		agent.actionMutex.Lock()
		defer agent.actionMutex.Unlock()
		if time.Since(beforeLock) > rpcTimeout {
			return fmt.Errorf("server timeout for %v", name)
		}
	}

	if err = f(); err != nil {
		log.Warningf("TabletManager.%v(%v)(on %v from %v) error: %v", name, args, topoproto.TabletAliasString(agent.TabletAlias), from, err.Error())
		return fmt.Errorf("TabletManager.%v on %v error: %v", name, topoproto.TabletAliasString(agent.TabletAlias), err)
	}
	if verbose {
		log.Infof("TabletManager.%v(%v)(on %v from %v): %#v", name, args, topoproto.TabletAliasString(agent.TabletAlias), from, reply)
	}
	if runAfterAction {
		err = agent.refreshTablet(ctx, "RPC("+name+")")
	}
	return
}
// createTablet creates an individual tablet, with its agent, and adds
// it to the map. If it's a master tablet, it also issues a TER
// (TabletExternallyReparented).
func createTablet(ctx context.Context, ts topo.Server, cell string, uid uint32, keyspace, shard, dbname string, tabletType topodatapb.TabletType, mysqld mysqlctl.MysqlDaemon, dbcfgs dbconfigs.DBConfigs) error {
	alias := &topodatapb.TabletAlias{
		Cell: cell,
		Uid:  uid,
	}
	log.Infof("Creating %v tablet %v for %v/%v", tabletType, topoproto.TabletAliasString(alias), keyspace, shard)
	flag.Set("debug-url-prefix", fmt.Sprintf("/debug-%d", uid))

	controller := tabletserver.NewServer()
	initTabletType := tabletType
	if tabletType == topodatapb.TabletType_MASTER {
		initTabletType = topodatapb.TabletType_REPLICA
	}
	agent := tabletmanager.NewComboActionAgent(ctx, ts, alias, int32(8000+uid), int32(9000+uid), controller, dbcfgs, mysqld, keyspace, shard, dbname, strings.ToLower(initTabletType.String()))
	if tabletType == topodatapb.TabletType_MASTER {
		if err := agent.TabletExternallyReparented(ctx, ""); err != nil {
			return fmt.Errorf("TabletExternallyReparented failed on master %v: %v", topoproto.TabletAliasString(alias), err)
		}
	}
	tabletMap[uid] = &tablet{
		keyspace:   keyspace,
		shard:      shard,
		tabletType: tabletType,
		dbname:     dbname,

		qsc:   controller,
		agent: agent,
	}
	return nil
}
// Track will pick the least used tablet from "stats", increment its usage
// by 1 and return it.
// "stats" must not be empty.
func (t *TabletTracker) Track(stats []discovery.TabletStats) *topodata.TabletAlias {
	if len(stats) == 0 {
		panic("stats must not be empty")
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	// Try to find a tablet which is not in use yet.
	for _, stat := range stats {
		key := topoproto.TabletAliasString(stat.Tablet.Alias)
		if _, ok := t.usedTablets[key]; !ok {
			t.usedTablets[key] = 1
			return stat.Tablet.Alias
		}
	}

	// If we reached this point, "stats" is a subset of "usedTablets" i.e.
	// all tablets are already in use. Take the least used one.
	for _, aliasString := range t.tabletsByUsage() {
		for _, stat := range stats {
			key := topoproto.TabletAliasString(stat.Tablet.Alias)
			if key == aliasString {
				t.usedTablets[key]++
				return stat.Tablet.Alias
			}
		}
	}

	panic("BUG: we did not add any tablet")
}
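// Usage sketch (illustrative, not part of the original source): the tracker
// is fed the healthy tablets for a shard and hands back the least used one.
// "tracker" is assumed to be a *TabletTracker with an initialized usedTablets
// map, and "tsc" a healthcheck cache as used elsewhere in this codebase.
//
//	stats := tsc.GetHealthyTabletStats(keyspace, shard, topodatapb.TabletType_RDONLY)
//	if len(stats) == 0 {
//		return fmt.Errorf("no healthy RDONLY tablet available in %v/%v", keyspace, shard)
//	}
//	alias := tracker.Track(stats) // never returns nil; panics only on empty input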
// Syslog writes a Reparent event to syslog.
func (r *Reparent) Syslog() (syslog.Priority, string) {
	return syslog.LOG_INFO, fmt.Sprintf("%s/%s [reparent %v -> %v] %s (%s)",
		r.ShardInfo.Keyspace(), r.ShardInfo.ShardName(),
		topoproto.TabletAliasString(r.OldMaster.Alias),
		topoproto.TabletAliasString(r.NewMaster.Alias),
		r.Status, r.ExternalID)
}
// FIXME(msolomon) This validate presumes the master is up and running.
// Even when that isn't true, there are validation processes that might
// be valuable.
func (wr *Wrangler) validateShard(ctx context.Context, keyspace, shard string, pingTablets bool, wg *sync.WaitGroup, results chan<- error) {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.GetShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	aliases, err := wr.ts.FindAllTabletAliasesInShard(ctx, keyspace, shard)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err)
		return
	}

	tabletMap, _ := wr.ts.GetTabletMap(ctx, aliases)

	var masterAlias *pb.TabletAlias
	for _, alias := range aliases {
		tabletInfo, ok := tabletMap[*alias]
		if !ok {
			results <- fmt.Errorf("tablet %v not found in map", topoproto.TabletAliasString(alias))
			continue
		}
		if tabletInfo.Type == pb.TabletType_MASTER {
			if masterAlias != nil {
				results <- fmt.Errorf("shard %v/%v already has master %v but found other master %v", keyspace, shard, topoproto.TabletAliasString(masterAlias), topoproto.TabletAliasString(alias))
			} else {
				masterAlias = alias
			}
		}
	}

	if masterAlias == nil {
		results <- fmt.Errorf("no master for shard %v/%v", keyspace, shard)
	} else if !topoproto.TabletAliasEqual(shardInfo.MasterAlias, masterAlias) {
		results <- fmt.Errorf("master mismatch for shard %v/%v: found %v, expected %v", keyspace, shard, topoproto.TabletAliasString(masterAlias), topoproto.TabletAliasString(shardInfo.MasterAlias))
	}

	for _, alias := range aliases {
		wg.Add(1)
		go func(alias *pb.TabletAlias) {
			defer wg.Done()
			if err := topo.Validate(ctx, wr.ts, alias); err != nil {
				results <- fmt.Errorf("Validate(%v) failed: %v", topoproto.TabletAliasString(alias), err)
			} else {
				wr.Logger().Infof("tablet %v is valid", topoproto.TabletAliasString(alias))
			}
		}(alias)
	}

	if pingTablets {
		wr.validateReplication(ctx, shardInfo, tabletMap, results)
		wr.pingTablets(ctx, tabletMap, wg, results)
	}
}
// InitTablet creates or updates a tablet. If no parent is specified
// in the tablet, and the tablet has a slave type, we will find the
// appropriate parent. If createShardAndKeyspace is true and the
// parent keyspace or shard don't exist, they will be created. If
// allowUpdate is true, and a tablet with the same ID exists, just update it.
// If a tablet is created as master, and there is already a different
// master in the shard, allowMasterOverride must be set.
func (wr *Wrangler) InitTablet(ctx context.Context, tablet *topodatapb.Tablet, allowMasterOverride, createShardAndKeyspace, allowUpdate bool) error {
	if err := topo.TabletComplete(tablet); err != nil {
		return err
	}

	// Get the shard, possibly creating it.
	var err error
	var si *topo.ShardInfo
	if createShardAndKeyspace {
		// Create the parent keyspace and shard if needed.
		si, err = wr.ts.GetOrCreateShard(ctx, tablet.Keyspace, tablet.Shard)
	} else {
		si, err = wr.ts.GetShard(ctx, tablet.Keyspace, tablet.Shard)
		if err == topo.ErrNoNode {
			return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard")
		}
	}
	if err != nil {
		return fmt.Errorf("cannot get (or create) shard %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
	}

	// Check the shard record is consistent with the tablet.
	if !key.KeyRangeEqual(si.KeyRange, tablet.KeyRange) {
		return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange)
	}
	if tablet.Type == topodatapb.TabletType_MASTER && si.HasMaster() && !topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) && !allowMasterOverride {
		return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v, use allow_master_override flag", topoproto.TabletAliasString(si.MasterAlias), tablet.Keyspace, tablet.Shard)
	}

	// Update the shard record if needed.
	if err := wr.updateShardCellsAndMaster(ctx, si, tablet.Alias, tablet.Type, allowMasterOverride); err != nil {
		return err
	}

	err = wr.ts.CreateTablet(ctx, tablet)
	if err == topo.ErrNodeExists && allowUpdate {
		// The tablet record already exists; try to update it instead.
		oldTablet, err := wr.ts.GetTablet(ctx, tablet.Alias)
		if err != nil {
			return fmt.Errorf("failed reading existing tablet %v: %v", topoproto.TabletAliasString(tablet.Alias), err)
		}

		// Check we have the same keyspace / shard; changing them on an
		// existing tablet is not allowed.
		if oldTablet.Keyspace != tablet.Keyspace || oldTablet.Shard != tablet.Shard {
			return fmt.Errorf("old tablet has shard %v/%v. Cannot override with shard %v/%v. Delete and re-add tablet if you want to change the tablet's keyspace/shard", oldTablet.Keyspace, oldTablet.Shard, tablet.Keyspace, tablet.Shard)
		}
		*(oldTablet.Tablet) = *tablet
		if err := wr.ts.UpdateTablet(ctx, oldTablet); err != nil {
			return fmt.Errorf("failed updating tablet %v: %v", topoproto.TabletAliasString(tablet.Alias), err)
		}
		return nil
	}
	// Any other CreateTablet error (or nil on success) is returned as-is.
	return err
}
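// Usage sketch (illustrative, not part of the original source): initialize a
// tablet record, creating the parent keyspace/shard if missing, while refusing
// to displace an existing master or overwrite an existing record.
//
//	err := wr.InitTablet(ctx, tablet,
//		false, // allowMasterOverride
//		true,  // createShardAndKeyspace
//		false, // allowUpdate
//	)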
// checkSemiSyncEnabled is a test helper that verifies the semi-sync
// master/slave settings on all given tablets match the expected values.
func checkSemiSyncEnabled(t *testing.T, master, slave bool, tablets ...*FakeTablet) {
	for _, tablet := range tablets {
		if got, want := tablet.FakeMysqlDaemon.SemiSyncMasterEnabled, master; got != want {
			t.Errorf("%v: SemiSyncMasterEnabled = %v, want %v", topoproto.TabletAliasString(tablet.Tablet.Alias), got, want)
		}
		if got, want := tablet.FakeMysqlDaemon.SemiSyncSlaveEnabled, slave; got != want {
			t.Errorf("%v: SemiSyncSlaveEnabled = %v, want %v", topoproto.TabletAliasString(tablet.Tablet.Alias), got, want)
		}
	}
}
// DeleteShard will do all the necessary changes in the topology server
// to entirely remove a shard.
func (wr *Wrangler) DeleteShard(ctx context.Context, keyspace, shard string, recursive bool) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}
	tabletMap, err := wr.ts.GetTabletMapForShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}
	if recursive {
		wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard)
		for tabletAlias := range tabletMap {
			// We don't care about scrapping or updating the replication graph,
			// because we're about to delete the entire replication graph.
			wr.Logger().Infof("Deleting tablet %v", topoproto.TabletAliasString(&tabletAlias))
			if err := wr.TopoServer().DeleteTablet(ctx, &tabletAlias); err != nil && err != topo.ErrNoNode {
				// Unlike the errors below in non-recursive steps, we don't want to
				// continue if a DeleteTablet fails. If we continue and delete the
				// replication graph, the tablet record will be orphaned, since we'll
				// no longer know it belongs to this shard.
				//
				// If the problem is temporary, or resolved externally, re-running
				// DeleteShard will skip over tablets that were already deleted.
				return fmt.Errorf("can't delete tablet %v: %v", topoproto.TabletAliasString(&tabletAlias), err)
			}
		}
	} else if len(tabletMap) > 0 {
		return fmt.Errorf("shard %v/%v still has %v tablets; use -recursive or remove them manually", keyspace, shard, len(tabletMap))
	}

	// Remove the replication graph and serving graph in each cell.
	for _, cell := range shardInfo.Cells {
		if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			wr.Logger().Warningf("Cannot delete ShardReplication in cell %v for %v/%v: %v", cell, keyspace, shard, err)
		}

		for _, t := range topoproto.AllTabletTypes {
			if !topo.IsInServingGraph(t) {
				continue
			}

			if err := wr.ts.DeleteEndPoints(ctx, cell, keyspace, shard, t, -1); err != nil && err != topo.ErrNoNode {
				wr.Logger().Warningf("Cannot delete EndPoints in cell %v for %v/%v/%v: %v", cell, keyspace, shard, t, err)
			}
		}

		if err := wr.ts.DeleteSrvShard(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			wr.Logger().Warningf("Cannot delete SrvShard in cell %v for %v/%v: %v", cell, keyspace, shard, err)
		}
	}

	return wr.ts.DeleteShard(ctx, keyspace, shard)
}
// diffSchema is a helper method to asynchronously diff a slave's schema
// against the master's.
func (wr *Wrangler) diffSchema(ctx context.Context, masterSchema *tabletmanagerdatapb.SchemaDefinition, masterTabletAlias, alias *topodatapb.TabletAlias, excludeTables []string, includeViews bool, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
	defer wg.Done()
	log.Infof("Gathering schema for %v", topoproto.TabletAliasString(alias))
	slaveSchema, err := wr.GetSchema(ctx, alias, nil, excludeTables, includeViews)
	if err != nil {
		er.RecordError(err)
		return
	}

	log.Infof("Diffing schema for %v", topoproto.TabletAliasString(alias))
	tmutils.DiffSchema(topoproto.TabletAliasString(masterTabletAlias), masterSchema, topoproto.TabletAliasString(alias), slaveSchema, er)
}
// diffPermissions is a helper method to asynchronously diff a slave's
// permissions against the master's.
func (wr *Wrangler) diffPermissions(ctx context.Context, masterPermissions *tabletmanagerdatapb.Permissions, masterAlias *pb.TabletAlias, alias *pb.TabletAlias, wg *sync.WaitGroup, er concurrency.ErrorRecorder) {
	defer wg.Done()
	log.Infof("Gathering permissions for %v", topoproto.TabletAliasString(alias))
	slavePermissions, err := wr.GetPermissions(ctx, alias)
	if err != nil {
		er.RecordError(err)
		return
	}

	log.Infof("Diffing permissions for %v", topoproto.TabletAliasString(alias))
	tmutils.DiffPermissions(topoproto.TabletAliasString(masterAlias), masterPermissions, topoproto.TabletAliasString(alias), slavePermissions, er)
}
// ReloadSchemaShard reloads the schema for all slave tablets in a shard,
// after they reach a given replication position (empty pos means immediate).
// In general, we don't always expect all slaves to be ready to reload,
// and the periodic schema reload makes them self-healing anyway.
// So we do this on a best-effort basis, and log warnings for any tablets
// that fail to reload within the context deadline.
// Note that this skips the master because it's assumed that the schema
// was reloaded there right after applying it.
func (wr *Wrangler) ReloadSchemaShard(ctx context.Context, keyspace, shard, replicationPos string) {
	tablets, err := wr.ts.GetTabletMapForShard(ctx, keyspace, shard)
	if err != nil {
		if err != topo.ErrPartialResult {
			// This is best-effort, so just log it and move on.
			wr.logger.Warningf("Failed to reload schema on slave tablets in %v/%v (use vtctl ReloadSchema to fix individual tablets): %v", keyspace, shard, err)
			return
		}
		// We got a partial result. Do what we can, but warn that some may be missed.
		wr.logger.Warningf("Failed to fetch all tablets for %v/%v. Some slave tablets may not have schema reloaded (use vtctl ReloadSchema to fix individual tablets)", keyspace, shard)
	}

	var wg sync.WaitGroup
	for _, ti := range tablets {
		if ti.Type == topodatapb.TabletType_MASTER {
			// We don't need to reload on the master because we assume
			// ExecuteFetchAsDba() already did that.
			continue
		}

		wg.Add(1)
		go func(tablet *topodatapb.Tablet) {
			defer wg.Done()
			if err := wr.tmc.ReloadSchema(ctx, tablet, replicationPos); err != nil {
				wr.logger.Warningf(
					"Failed to reload schema on slave tablet %v in %v/%v (use vtctl ReloadSchema to try again): %v",
					topoproto.TabletAliasString(tablet.Alias), keyspace, shard, err)
			}
		}(ti.Tablet)
	}
	wg.Wait()
}
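// Usage sketch (illustrative, not part of the original source): reload the
// schema on all slaves of one shard immediately, relying on the best-effort
// semantics described above. The keyspace and shard names are made-up.
//
//	wr.ReloadSchemaShard(ctx, "test_keyspace", "-80", "" /* replicationPos: empty = immediate */)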
// startStream assumes that getTablet() was successfully called before and now
// tries to connect to the set tablet and start the streaming query.
// If the method returns an error, the first return value specifies if it is
// okay to retry.
func (r *RestartableResultReader) startStream() (bool, error) {
	// Start the streaming query.
	r.generateQuery()
	stream, err := r.conn.StreamExecute(r.ctx, &querypb.Target{
		Keyspace:   r.tablet.Keyspace,
		Shard:      r.tablet.Shard,
		TabletType: r.tablet.Type,
	}, r.query, make(map[string]interface{}))
	if err != nil {
		return true /* retryable */, fmt.Errorf("failed to call StreamExecute() for query '%v': %v", r.query, err)
	}

	// Read the fields information.
	cols, err := stream.Recv()
	if err != nil {
		return true /* retryable */, fmt.Errorf("cannot read Fields for query '%v': %v", r.query, err)
	}
	r.fields = cols.Fields
	r.output = stream

	alias := topoproto.TabletAliasString(r.tablet.Alias)
	statsStreamingQueryCounters.Add(alias, 1)
	log.V(2).Infof("tablet=%v table=%v chunk=%v: Starting to stream rows using query '%v'.", alias, r.td.Name, r.chunk, r.query)
	return false, nil
}
// updateRate records the new state and rate in "r", remembers which replica
// is under test, and notifies the throttler if the rate actually changed.
func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int64, reason string, now time.Time, lagRecordNow replicationLagRecord, testDuration time.Duration) {
	oldRate := m.rate.Get()

	m.currentState = newState

	// Update result with the new state.
	r.NewState = newState
	r.NewRate = rate
	r.Reason = reason
	if rate > oldRate {
		r.RateChange = increasedRate
	} else if rate < oldRate {
		r.RateChange = decreasedRate
	}

	m.lastRateChange = now
	m.replicaUnderTest = &replicaUnderTest{lagRecordNow.Key, topoproto.TabletAliasString(lagRecordNow.Tablet.Alias), lagRecordNow.Target.TabletType, newState, now.Add(testDuration)}

	if rate == oldRate {
		return
	}
	m.rate.Set(rate)
	// Notify the throttler that we updated our max rate.
	m.rateUpdateChan <- struct{}{}
}
// Validate makes sure a tablet is represented correctly in the topology server.
func Validate(ctx context.Context, ts Server, tabletAlias *topodatapb.TabletAlias) error {
	// Read the tablet record, make sure it parses.
	tablet, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	if !topoproto.TabletAliasEqual(tablet.Alias, tabletAlias) {
		return fmt.Errorf("bad tablet alias data for tablet %v: %#v", topoproto.TabletAliasString(tabletAlias), tablet.Alias)
	}

	// Validate the entry in the shard replication nodes.
	if err = ts.ValidateShard(ctx, tablet.Keyspace, tablet.Shard); err != nil {
		return err
	}

	si, err := ts.GetShardReplication(ctx, tablet.Alias.Cell, tablet.Keyspace, tablet.Shard)
	if err != nil {
		return err
	}

	if _, err = si.GetShardReplicationNode(tabletAlias); err != nil {
		return fmt.Errorf("tablet %v not found in cell %v shard replication: %v", topoproto.TabletAliasString(tabletAlias), tablet.Alias.Cell, err)
	}

	return nil
}
// formatSources returns a space-separated list of the source tablet aliases.
func (scw *SplitCloneWorker) formatSources() string {
	result := ""
	for _, alias := range scw.sourceAliases {
		result += " " + topoproto.TabletAliasString(alias)
	}
	return result
}
// UpdateShardReplicationRecord is a low level function to add / update an
// entry to the ShardReplication object.
func UpdateShardReplicationRecord(ctx context.Context, ts Server, keyspace, shard string, tabletAlias *pb.TabletAlias) error {
	span := trace.NewSpanFromContext(ctx)
	span.StartClient("TopoServer.UpdateShardReplicationFields")
	span.Annotate("keyspace", keyspace)
	span.Annotate("shard", shard)
	span.Annotate("tablet", topoproto.TabletAliasString(tabletAlias))
	defer span.Finish()

	return ts.UpdateShardReplicationFields(ctx, tabletAlias.Cell, keyspace, shard, func(sr *pb.ShardReplication) error {
		// Not very efficient, but easy to read.
		nodes := make([]*pb.ShardReplication_Node, 0, len(sr.Nodes)+1)
		found := false
		for _, node := range sr.Nodes {
			// Use the nil-safe helper instead of dereferencing the protos.
			if topoproto.TabletAliasEqual(node.TabletAlias, tabletAlias) {
				if found {
					log.Warningf("Found a second ShardReplication_Node for tablet %v, deleting it", topoproto.TabletAliasString(tabletAlias))
					continue
				}
				found = true
			}
			nodes = append(nodes, node)
		}
		if !found {
			nodes = append(nodes, &pb.ShardReplication_Node{TabletAlias: tabletAlias})
		}
		sr.Nodes = nodes
		return nil
	})
}
// findDestinationMasters finds for each destination shard the current master.
func (scw *SplitCloneWorker) findDestinationMasters(ctx context.Context) error {
	scw.setState(WorkerStateFindTargets)

	// Make sure we find a master for each destination shard and log it.
	scw.wr.Logger().Infof("Finding a MASTER tablet for each destination shard...")
	for _, si := range scw.destinationShards {
		waitCtx, waitCancel := context.WithTimeout(ctx, *waitForHealthyTabletsTimeout)
		defer waitCancel()
		if err := scw.tsc.WaitForTablets(waitCtx, scw.cell, si.Keyspace(), si.ShardName(), []topodatapb.TabletType{topodatapb.TabletType_MASTER}); err != nil {
			return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v): %v", si.Keyspace(), si.ShardName(), scw.cell, err)
		}
		masters := scw.tsc.GetHealthyTabletStats(si.Keyspace(), si.ShardName(), topodatapb.TabletType_MASTER)
		if len(masters) == 0 {
			return fmt.Errorf("cannot find MASTER tablet for destination shard for %v/%v (in cell: %v) in HealthCheck: empty TabletStats list", si.Keyspace(), si.ShardName(), scw.cell)
		}
		master := masters[0]

		// Get the MySQL database name of the tablet.
		keyspaceAndShard := topoproto.KeyspaceShardString(si.Keyspace(), si.ShardName())
		scw.destinationDbNames[keyspaceAndShard] = topoproto.TabletDbName(master.Tablet)

		// TODO(mberlin): Verify on the destination master that the
		// _vt.blp_checkpoint table has the latest schema.

		scw.wr.Logger().Infof("Using tablet %v as destination master for %v/%v", topoproto.TabletAliasString(master.Tablet.Alias), si.Keyspace(), si.ShardName())
	}
	scw.wr.Logger().Infof("NOTE: The used master of a destination shard might change over the course of the copy e.g. due to a reparent. The HealthCheck module will track and log master changes and any error message will always refer to the actually used master address.")

	return nil
}
// formatTabletStats returns a readable one-line summary of a TabletStats entry.
func formatTabletStats(ts *discovery.TabletStats) string {
	webURL := "unknown http port"
	if webPort, ok := ts.Tablet.PortMap["vt"]; ok {
		webURL = fmt.Sprintf("http://%v:%d/", ts.Tablet.Hostname, webPort)
	}
	return fmt.Sprintf("%v: %v stats: %v", topoproto.TabletAliasString(ts.Tablet.Alias), webURL, ts.Stats)
}
// RecordTabletTagAction records a new TabletTagAction
// into the specified Cleaner.
func RecordTabletTagAction(cleaner *Cleaner, tabletAlias *pb.TabletAlias, name, value string) {
	cleaner.Record(TabletTagActionName, topoproto.TabletAliasString(tabletAlias), &TabletTagAction{
		TabletAlias: tabletAlias,
		Name:        name,
		Value:       value,
	})
}
// ApplyTabletAction applies the provided action to the tablet.
func (ar *ActionRepository) ApplyTabletAction(ctx context.Context, actionName string, tabletAlias *topodatapb.TabletAlias, r *http.Request) *ActionResult {
	result := &ActionResult{
		Name:       actionName,
		Parameters: topoproto.TabletAliasString(tabletAlias),
	}

	action, ok := ar.tabletActions[actionName]
	if !ok {
		result.error("Unknown tablet action")
		return result
	}

	// Check the role.
	if action.role != "" {
		if err := acl.CheckAccessHTTP(r, action.role); err != nil {
			result.error("Access denied")
			return result
		}
	}

	// Run the action.
	ctx, cancel := context.WithTimeout(ctx, *actionTimeout)
	wr := wrangler.New(logutil.NewConsoleLogger(), ar.ts, tmclient.NewTabletManagerClient())
	output, err := action.method(ctx, wr, tabletAlias, r)
	cancel()
	if err != nil {
		result.error(err.Error())
		return result
	}
	result.Output = output
	return result
}
// validateReplication cross-checks the master's view of its slaves against
// the tablets known to the topology server, reporting mismatches in both
// directions.
func (wr *Wrangler) validateReplication(ctx context.Context, shardInfo *topo.ShardInfo, tabletMap map[topodatapb.TabletAlias]*topo.TabletInfo, results chan<- error) {
	if shardInfo.MasterAlias == nil {
		results <- fmt.Errorf("no master in shard record %v/%v", shardInfo.Keyspace(), shardInfo.ShardName())
		return
	}

	masterTabletInfo, ok := tabletMap[*shardInfo.MasterAlias]
	if !ok {
		results <- fmt.Errorf("master %v not in tablet map", topoproto.TabletAliasString(shardInfo.MasterAlias))
		return
	}

	slaveList, err := wr.tmc.GetSlaves(ctx, masterTabletInfo.Tablet)
	if err != nil {
		results <- fmt.Errorf("GetSlaves(%v) failed: %v", masterTabletInfo, err)
		return
	}
	if len(slaveList) == 0 {
		results <- fmt.Errorf("no slaves of tablet %v found", topoproto.TabletAliasString(shardInfo.MasterAlias))
		return
	}

	tabletIPMap := make(map[string]*topodatapb.Tablet)
	slaveIPMap := make(map[string]bool)
	for _, tablet := range tabletMap {
		tabletIPMap[normalizeIP(tablet.Ip)] = tablet.Tablet
	}

	// See if every slave is in the replication graph.
	for _, slaveAddr := range slaveList {
		if tabletIPMap[normalizeIP(slaveAddr)] == nil {
			results <- fmt.Errorf("slave %v not in replication graph for shard %v/%v (mysql instance without vttablet?)", slaveAddr, shardInfo.Keyspace(), shardInfo.ShardName())
		}
		slaveIPMap[normalizeIP(slaveAddr)] = true
	}

	// See if every entry in the replication graph is connected to the master.
	for _, tablet := range tabletMap {
		if !tablet.IsSlaveType() {
			continue
		}

		if !slaveIPMap[normalizeIP(tablet.Ip)] {
			results <- fmt.Errorf("slave %v not replicating: %v slave list: %q", topoproto.TabletAliasString(tablet.Alias), tablet.Ip, slaveList)
		}
	}
}
// validateAllTablets validates all tablets in all discoverable cells,
// even if they are not in the replication graph.
func (wr *Wrangler) validateAllTablets(ctx context.Context, wg *sync.WaitGroup, results chan<- error) {
	cellSet := make(map[string]bool, 16)

	keyspaces, err := wr.ts.GetKeyspaces(ctx)
	if err != nil {
		results <- fmt.Errorf("TopologyServer.GetKeyspaces failed: %v", err)
		return
	}
	for _, keyspace := range keyspaces {
		shards, err := wr.ts.GetShardNames(ctx, keyspace)
		if err != nil {
			results <- fmt.Errorf("TopologyServer.GetShardNames(%v) failed: %v", keyspace, err)
			return
		}

		for _, shard := range shards {
			aliases, err := wr.ts.FindAllTabletAliasesInShard(ctx, keyspace, shard)
			if err != nil {
				results <- fmt.Errorf("TopologyServer.FindAllTabletAliasesInShard(%v, %v) failed: %v", keyspace, shard, err)
				return
			}
			for _, alias := range aliases {
				cellSet[alias.Cell] = true
			}
		}
	}

	for cell := range cellSet {
		aliases, err := wr.ts.GetTabletsByCell(ctx, cell)
		if err != nil {
			results <- fmt.Errorf("TopologyServer.GetTabletsByCell(%v) failed: %v", cell, err)
			continue
		}

		for _, alias := range aliases {
			wg.Add(1)
			go func(alias *pb.TabletAlias) {
				defer wg.Done()
				if err := topo.Validate(ctx, wr.ts, alias); err != nil {
					results <- fmt.Errorf("Validate(%v) failed: %v", topoproto.TabletAliasString(alias), err)
				} else {
					wr.Logger().Infof("tablet %v is valid", topoproto.TabletAliasString(alias))
				}
			}(alias)
		}
	}
}
// loadTablets reads all tablets from topology, converts them to endpoints,
// and updates HealthCheck.
func (tw *TopologyWatcher) loadTablets() {
	var wg sync.WaitGroup
	newEndPoints := make(map[string]*tabletEndPoint)
	tabletAliases, err := tw.getTablets(tw)
	if err != nil {
		select {
		case <-tw.ctx.Done():
			return
		default:
		}
		log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err)
		return
	}

	for _, tAlias := range tabletAliases {
		wg.Add(1)
		go func(alias *topodatapb.TabletAlias) {
			defer wg.Done()
			tw.sem <- 1 // Wait for active queue to drain.
			tablet, err := tw.topoServer.GetTablet(tw.ctx, alias)
			<-tw.sem // Done; enable next request to run.
			if err != nil {
				select {
				case <-tw.ctx.Done():
					return
				default:
				}
				log.Errorf("cannot get tablet for alias %v: %v", alias, err)
				return
			}
			endPoint, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Errorf("cannot get endpoint from tablet %v: %v", tablet, err)
				return
			}
			key := EndPointToMapKey(endPoint)
			tw.mu.Lock()
			newEndPoints[key] = &tabletEndPoint{
				alias:    topoproto.TabletAliasString(alias),
				endPoint: endPoint,
			}
			tw.mu.Unlock()
		}(tAlias)
	}

	wg.Wait()
	tw.mu.Lock()
	for key, tep := range newEndPoints {
		if _, ok := tw.endPoints[key]; !ok {
			tw.hc.AddEndPoint(tw.cell, tep.alias, tep.endPoint)
		}
	}
	for key, tep := range tw.endPoints {
		if _, ok := newEndPoints[key]; !ok {
			tw.hc.RemoveEndPoint(tep.endPoint)
		}
	}
	tw.endPoints = newEndPoints
	tw.mu.Unlock()
}
// getSourceSchema fetches the schema definition from the given source tablet.
func (scw *SplitCloneWorker) getSourceSchema(ctx context.Context, tablet *topodatapb.Tablet) (*tabletmanagerdatapb.SchemaDefinition, error) {
	// Get the source schema from the first shard.
	// TODO(alainjobart): for now, we assume the schema is compatible
	// on all source shards. Furthermore, we estimate the number of rows
	// in each source shard for each table to be about the same
	// (rowCount is used to estimate an ETA).
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	sourceSchemaDefinition, err := scw.wr.GetSchema(shortCtx, tablet.Alias, nil, scw.excludeTables, true)
	cancel()
	if err != nil {
		return nil, fmt.Errorf("cannot get schema from source %v: %v", topoproto.TabletAliasString(tablet.Alias), err)
	}
	if len(sourceSchemaDefinition.TableDefinitions) == 0 {
		return nil, fmt.Errorf("no tables matching the table filter in tablet %v", topoproto.TabletAliasString(tablet.Alias))
	}
	return sourceSchemaDefinition, nil
}
// loadTablets reads all tablets from topology, and updates TabletRecorder.
func (tw *TopologyWatcher) loadTablets() {
	var wg sync.WaitGroup
	newTablets := make(map[string]*tabletInfo)
	tabletAliases, err := tw.getTablets(tw)
	if err != nil {
		select {
		case <-tw.ctx.Done():
			return
		default:
		}
		log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err)
		return
	}

	for _, tAlias := range tabletAliases {
		wg.Add(1)
		go func(alias *topodatapb.TabletAlias) {
			defer wg.Done()
			tw.sem <- 1 // Wait for active queue to drain.
			tablet, err := tw.topoServer.GetTablet(tw.ctx, alias)
			<-tw.sem // Done; enable next request to run.
			if err != nil {
				select {
				case <-tw.ctx.Done():
					return
				default:
				}
				log.Errorf("cannot get tablet for alias %v: %v", alias, err)
				return
			}
			key := TabletToMapKey(tablet.Tablet)
			tw.mu.Lock()
			newTablets[key] = &tabletInfo{
				alias:  topoproto.TabletAliasString(alias),
				tablet: tablet.Tablet,
			}
			tw.mu.Unlock()
		}(tAlias)
	}

	wg.Wait()
	tw.mu.Lock()
	for key, nt := range newTablets {
		if _, ok := tw.tablets[key]; !ok {
			tw.tr.AddTablet(nt.tablet, nt.alias)
		}
	}
	for key, ot := range tw.tablets {
		if _, ok := newTablets[key]; !ok {
			tw.tr.RemoveTablet(ot.tablet)
		}
	}
	tw.tablets = newTablets
	if !tw.firstLoadDone {
		tw.firstLoadDone = true
		close(tw.firstLoadChan)
	}
	tw.mu.Unlock()
}
// populateLocalMetadata creates and fills the _vt.local_metadata table,
// which is a per-tablet table that is never replicated. This allows queries
// against local_metadata to return different values on different tablets,
// which is used for communicating between Vitess and MySQL-level tools like
// Orchestrator (http://github.com/outbrain/orchestrator).
func (agent *ActionAgent) populateLocalMetadata(ctx context.Context) error {
	log.Infof("Populating _vt.local_metadata table...")

	// Wait for mysqld to be ready, in case it was launched in parallel with us.
	if err := agent.MysqlDaemon.Wait(ctx); err != nil {
		return err
	}

	// Get a non-pooled DBA connection.
	conn, err := agent.MysqlDaemon.GetDbaConnection()
	if err != nil {
		return err
	}
	defer conn.Close()

	// Disable replication on this session. We close the connection after using
	// it, so there's no need to re-enable replication when we're done.
	if _, err := conn.ExecuteFetch("SET @@session.sql_log_bin = 0", 0, false); err != nil {
		return err
	}

	// Create the database and table if necessary.
	if _, err := conn.ExecuteFetch("CREATE DATABASE IF NOT EXISTS _vt", 0, false); err != nil {
		return err
	}
	if _, err := conn.ExecuteFetch("DROP TABLE IF EXISTS _vt.local_metadata", 0, false); err != nil {
		return err
	}
	if _, err := conn.ExecuteFetch(sqlCreateLocalMetadataTable, 0, false); err != nil {
		return err
	}

	// Insert values.
	tablet := agent.Tablet()
	values := map[string]string{
		"Alias":         topoproto.TabletAliasString(tablet.Alias),
		"ClusterAlias":  fmt.Sprintf("%s.%s", tablet.Keyspace, tablet.Shard),
		"DataCenter":    tablet.Alias.Cell,
		"PromotionRule": "neutral",
	}
	masterEligible, err := agent.isMasterEligible()
	if err != nil {
		return fmt.Errorf("can't determine PromotionRule while populating local_metadata: %v", err)
	}
	if !masterEligible {
		values["PromotionRule"] = "must_not"
	}

	var rows []string
	for k, v := range values {
		rows = append(rows, fmt.Sprintf("('%s','%s')", k, v))
	}
	query := fmt.Sprintf(
		"INSERT INTO _vt.local_metadata (name,value) VALUES %s",
		strings.Join(rows, ","))
	_, err = conn.ExecuteFetch(query, 0, false)
	return err
}
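// For illustration only (values are hypothetical, not from the source): for a
// master-eligible tablet "cell1-0000000101" in keyspace "ks", shard "-80",
// the resulting table would contain rows like:
//
//	name          | value
//	--------------+------------------
//	Alias         | cell1-0000000101
//	ClusterAlias  | ks.-80
//	DataCenter    | cell1
//	PromotionRule | neutral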
// getTablet returns the tablet record for the provider's fixed alias.
func (p *singleTabletProvider) getTablet() (*topodatapb.Tablet, error) {
	shortCtx, cancel := context.WithTimeout(p.ctx, *remoteActionsTimeout)
	tablet, err := p.ts.GetTablet(shortCtx, p.alias)
	cancel()
	if err != nil {
		return nil, fmt.Errorf("failed to resolve tablet alias %v: %v", topoproto.TabletAliasString(p.alias), err)
	}
	return tablet.Tablet, err
}