Example #1
0
// Does a topo lookup for a single shard, and returns the tablet record of the master tablet.
func resolveDestinationShardMaster(ctx context.Context, keyspace, shard string, wr *wrangler.Wrangler) (*topo.TabletInfo, error) {
	var ti *topo.TabletInfo
	shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout)
	si, err := topo.GetShard(shortCtx, wr.TopoServer(), keyspace, shard)
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to resolve destination shard %v/%v", keyspace, shard)
	}

	if si.MasterAlias.IsZero() {
		return ti, fmt.Errorf("no master in destination shard %v/%v", keyspace, shard)
	}

	wr.Logger().Infof("Found target master alias %v in shard %v/%v", si.MasterAlias, keyspace, shard)

	shortCtx, cancel = context.WithTimeout(ctx, *remoteActionsTimeout)
	ti, err = topo.GetTablet(shortCtx, wr.TopoServer(), si.MasterAlias)
	cancel()
	if err != nil {
		return ti, fmt.Errorf("unable to get master tablet from alias %v in shard %v/%v",
			si.MasterAlias, keyspace, shard)
	}
	return ti, nil
}
Example #2
0
// changeCallback is run after every action that might
// have changed something in the tablet record.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	allowQuery := newTablet.IsRunningQueryService()

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var tabletControl *pbt.Shard_TabletControl
	var blacklistedTables []string
	var err error
	if allowQuery {
		shardInfo, err = topo.GetShard(ctx, agent.TopoServer, newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
		} else {
			if newTablet.Type == topo.TYPE_MASTER {
				allowQuery = len(shardInfo.SourceShards) == 0
			}
			if tc := shardInfo.GetTabletControl(topo.TabletTypeToProto(newTablet.Type)); tc != nil {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
					}
					blacklistedTables = tc.BlacklistedTables
					tabletControl = tc
				}
			}
		}
	}

	// Read the keyspace on masters to get ShardingColumnType,
	// for binlog replication, only if source shards are set.
	var keyspaceInfo *topo.KeyspaceInfo
	if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 {
		keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace)
		if err != nil {
			log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err)
			keyspaceInfo = nil
		}
	}

	if allowQuery {
		// There are a few transitions when we need to restart the query service:
		switch {
		// If either InitMaster or InitSlave was called, because those calls
		// (or a prior call to ResetReplication) may have silently broken the
		// rowcache invalidator by executing RESET MASTER.
		// Note that we don't care about fixing it after ResetReplication itself
		// since that call breaks everything on purpose, and we don't expect
		// anything to start working until either InitMaster or InitSlave.
		case agent.initReplication:
			agent.initReplication = false
			agent.disallowQueries()

		// Transitioning from replica to master, so clients that were already
		// connected don't keep on using the master as replica or rdonly.
		case newTablet.Type == topo.TYPE_MASTER && oldTablet.Type != topo.TYPE_MASTER:
			agent.disallowQueries()

		// Having different parameters for the query service.
		// It needs to stop and restart with the new parameters.
		// That includes:
		//   - changing KeyRange
		//   - changing the BlacklistedTables list
		case (newTablet.KeyRange != oldTablet.KeyRange),
			!reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()):
			agent.disallowQueries()
		}

		if err := agent.allowQueries(newTablet, blacklistedTables); err != nil {
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		agent.disallowQueries()
	}

	// save the tabletControl we've been using, so the background
	// healthcheck makes the same decisions as we've been making.
	agent.setTabletControl(tabletControl)

	// update stream needs to be started or stopped too
	if agent.DBConfigs != nil {
		if topo.IsRunningUpdateStream(newTablet.Type) {
			binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.MysqlDaemon)
		} else {
			binlog.DisableUpdateStreamService()
		}
	}

	statsType.Set(string(newTablet.Type))
	statsKeyspace.Set(newTablet.Keyspace)
	statsShard.Set(newTablet.Shard)
	statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex()))
	statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex()))

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topo.TYPE_MASTER {
			agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}
	return nil
}
Example #3
0
// changeCallback is run after every action that might
// have changed something in the tablet record.
func (agent *ActionAgent) changeCallback(ctx context.Context, oldTablet, newTablet *topo.Tablet) error {
	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("ActionAgent.changeCallback")
	defer span.Finish()

	allowQuery := newTablet.IsRunningQueryService()

	// Read the shard to get SourceShards / TabletControlMap if
	// we're going to use it.
	var shardInfo *topo.ShardInfo
	var tabletControl *topo.TabletControl
	var blacklistedTables []string
	var err error
	if allowQuery {
		shardInfo, err = topo.GetShard(ctx, agent.TopoServer, newTablet.Keyspace, newTablet.Shard)
		if err != nil {
			log.Errorf("Cannot read shard for this tablet %v, might have inaccurate SourceShards and TabletControls: %v", newTablet.Alias, err)
		} else {
			if newTablet.Type == topo.TYPE_MASTER {
				allowQuery = len(shardInfo.SourceShards) == 0
			}
			if tc, ok := shardInfo.TabletControlMap[newTablet.Type]; ok {
				if topo.InCellList(newTablet.Alias.Cell, tc.Cells) {
					if tc.DisableQueryService {
						allowQuery = false
					}
					blacklistedTables = tc.BlacklistedTables
					tabletControl = tc
				}
			}
		}
	}

	// Read the keyspace on masters to get ShardingColumnType,
	// for binlog replication, only if source shards are set.
	var keyspaceInfo *topo.KeyspaceInfo
	if newTablet.Type == topo.TYPE_MASTER && shardInfo != nil && len(shardInfo.SourceShards) > 0 {
		keyspaceInfo, err = agent.TopoServer.GetKeyspace(ctx, newTablet.Keyspace)
		if err != nil {
			log.Errorf("Cannot read keyspace for this tablet %v: %v", newTablet.Alias, err)
			keyspaceInfo = nil
		}
	}

	if allowQuery {
		// There are a few transitions when we're
		// going to need to restart the query service:
		// - transitioning from replica to master, so clients
		//   that were already connected don't keep on using
		//   the master as replica or rdonly.
		// - having different parameters for the query
		//   service. It needs to stop and restart with the
		//   new parameters. That includes:
		//   - changing KeyRange
		//   - changing the BlacklistedTables list
		if (newTablet.Type == topo.TYPE_MASTER &&
			oldTablet.Type != topo.TYPE_MASTER) ||
			(newTablet.KeyRange != oldTablet.KeyRange) ||
			!reflect.DeepEqual(blacklistedTables, agent.BlacklistedTables()) {
			agent.disallowQueries()
		}
		if err := agent.allowQueries(newTablet, blacklistedTables); err != nil {
			log.Errorf("Cannot start query service: %v", err)
		}
	} else {
		agent.disallowQueries()
	}

	// save the tabletControl we've been using, so the background
	// healthcheck makes the same decisions as we've been making.
	agent.setTabletControl(tabletControl)

	// update stream needs to be started or stopped too
	if agent.DBConfigs != nil {
		if topo.IsRunningUpdateStream(newTablet.Type) {
			binlog.EnableUpdateStreamService(agent.DBConfigs.App.DbName, agent.MysqlDaemon)
		} else {
			binlog.DisableUpdateStreamService()
		}
	}

	statsType.Set(string(newTablet.Type))
	statsKeyspace.Set(newTablet.Keyspace)
	statsShard.Set(newTablet.Shard)
	statsKeyRangeStart.Set(string(newTablet.KeyRange.Start.Hex()))
	statsKeyRangeEnd.Set(string(newTablet.KeyRange.End.Hex()))

	// See if we need to start or stop any binlog player
	if agent.BinlogPlayerMap != nil {
		if newTablet.Type == topo.TYPE_MASTER {
			agent.BinlogPlayerMap.RefreshMap(agent.batchCtx, newTablet, keyspaceInfo, shardInfo)
		} else {
			agent.BinlogPlayerMap.StopAllPlayersAndReset()
		}
	}
	return nil
}
Example #4
0
// TabletExternallyReparented updates all topo records so the current
// tablet is the new master for this shard.
// Should be called under RPCWrapLock.
func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, externalID string) error {
	startTime := time.Now()

	// If there is a finalize step running, wait for it to finish or time out
	// before checking the global shard record again.
	if agent.finalizeReparentCtx != nil {
		select {
		case <-agent.finalizeReparentCtx.Done():
			agent.finalizeReparentCtx = nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	tablet := agent.Tablet()

	// Check the global shard record.
	si, err := topo.GetShard(ctx, agent.TopoServer, tablet.Keyspace, tablet.Shard)
	if err != nil {
		log.Warningf("fastTabletExternallyReparented: failed to read global shard record for %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
		return err
	}
	if si.MasterAlias == tablet.Alias {
		// We may get called on the current master even when nothing has changed.
		// If the global shard record is already updated, it means we successfully
		// finished a previous reparent to this tablet.
		return nil
	}

	// Create a reusable Reparent event with available info.
	ev := &events.Reparent{
		ShardInfo:  *si,
		NewMaster:  *tablet.Tablet,
		OldMaster:  topo.Tablet{Alias: si.MasterAlias, Type: topo.TYPE_MASTER},
		ExternalID: externalID,
	}
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()
	event.DispatchUpdate(ev, "starting external from tablet (fast)")

	// Execute state change to master by force-updating only the local copy of the
	// tablet record. The actual record in topo will be updated later.
	log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER")
	oldTablet := *tablet.Tablet
	newTablet := oldTablet
	newTablet.Type = topo.TYPE_MASTER
	newTablet.Health = nil
	agent.setTablet(topo.NewTabletInfo(&newTablet, -1))
	if err := agent.updateState(ctx, &oldTablet, "fastTabletExternallyReparented"); err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to change tablet state to MASTER: %v", err)
	}

	// Directly write the new master endpoint in the serving graph.
	// We will do a true rebuild in the background soon, but in the meantime,
	// this will be enough for clients to re-resolve the new master.
	event.DispatchUpdate(ev, "writing new master endpoint")
	log.Infof("fastTabletExternallyReparented: writing new master endpoint to serving graph")
	ep, err := tablet.EndPoint()
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to generate EndPoint for tablet %v: %v", tablet.Alias, err)
	}
	err = topo.UpdateEndPoints(ctx, agent.TopoServer, tablet.Alias.Cell,
		si.Keyspace(), si.ShardName(), topo.TYPE_MASTER,
		&topo.EndPoints{Entries: []topo.EndPoint{*ep}}, -1)
	if err != nil {
		return fmt.Errorf("fastTabletExternallyReparented: failed to update master endpoint: %v", err)
	}
	externalReparentStats.Record("NewMasterVisible", startTime)

	// Start the finalize stage with a background context, but connect the trace.
	bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout)
	bgCtx = trace.CopySpan(bgCtx, ctx)
	agent.finalizeReparentCtx = bgCtx
	go func() {
		err := agent.finalizeTabletExternallyReparented(bgCtx, si, ev)
		cancel()

		if err != nil {
			log.Warningf("finalizeTabletExternallyReparented error: %v", err)
			event.DispatchUpdate(ev, "failed: "+err.Error())
			return
		}
		externalReparentStats.Record("FullRebuild", startTime)
	}()

	return nil
}
Example #5
0
// CheckShard verifies the Shard operations work correctly
func CheckShard(ctx context.Context, t *testing.T, ts topo.Server) {
	if err := ts.CreateKeyspace(ctx, "test_keyspace", &topo.Keyspace{}); err != nil {
		t.Fatalf("CreateKeyspace: %v", err)
	}

	if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("CreateShard: %v", err)
	}
	if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != topo.ErrNodeExists {
		t.Errorf("CreateShard called second time, got: %v", err)
	}

	// Delete shard and see if we can re-create it.
	if err := ts.DeleteShard(ctx, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("DeleteShard: %v", err)
	}
	if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("CreateShard: %v", err)
	}

	// Delete ALL shards.
	if err := ts.DeleteKeyspaceShards(ctx, "test_keyspace"); err != nil {
		t.Fatalf("DeleteKeyspaceShards: %v", err)
	}
	if err := topo.CreateShard(ctx, ts, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("CreateShard: %v", err)
	}

	if _, err := topo.GetShard(ctx, ts, "test_keyspace", "666"); err != topo.ErrNoNode {
		t.Errorf("GetShard(666): %v", err)
	}

	shardInfo, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0")
	if err != nil {
		t.Errorf("GetShard: %v", err)
	}
	if want := newKeyRange("b0-c0"); shardInfo.KeyRange != want {
		t.Errorf("shardInfo.KeyRange: want %v, got %v", want, shardInfo.KeyRange)
	}
	master := topo.TabletAlias{Cell: "ny", Uid: 1}
	shardInfo.MasterAlias = master
	shardInfo.KeyRange = newKeyRange("b0-c0")
	shardInfo.ServedTypesMap = map[topo.TabletType]*topo.ShardServedType{
		topo.TYPE_MASTER:  &topo.ShardServedType{},
		topo.TYPE_REPLICA: &topo.ShardServedType{Cells: []string{"c1"}},
		topo.TYPE_RDONLY:  &topo.ShardServedType{},
	}
	shardInfo.SourceShards = []topo.SourceShard{
		topo.SourceShard{
			Uid:      1,
			Keyspace: "source_ks",
			Shard:    "b8-c0",
			KeyRange: newKeyRange("b8-c0"),
			Tables:   []string{"table1", "table2"},
		},
	}
	shardInfo.TabletControlMap = map[topo.TabletType]*topo.TabletControl{
		topo.TYPE_MASTER: &topo.TabletControl{
			Cells:             []string{"c1", "c2"},
			BlacklistedTables: []string{"black1", "black2"},
		},
		topo.TYPE_REPLICA: &topo.TabletControl{
			DisableQueryService: true,
		},
	}
	if err := topo.UpdateShard(ctx, ts, shardInfo); err != nil {
		t.Errorf("UpdateShard: %v", err)
	}

	other := topo.TabletAlias{Cell: "ny", Uid: 82873}
	_, err = topo.UpdateShardFields(ctx, ts, "test_keyspace", "b0-c0", func(shard *topo.Shard) error {
		shard.MasterAlias = other
		return nil
	})
	if err != nil {
		t.Fatalf("UpdateShardFields error: %v", err)
	}
	si, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0")
	if err != nil {
		t.Fatalf("GetShard: %v", err)
	}
	if si.MasterAlias != other {
		t.Fatalf("shard.MasterAlias = %v, want %v", si.MasterAlias, other)
	}
	_, err = topo.UpdateShardFields(ctx, ts, "test_keyspace", "b0-c0", func(shard *topo.Shard) error {
		shard.MasterAlias = master
		return nil
	})
	if err != nil {
		t.Fatalf("UpdateShardFields error: %v", err)
	}

	updatedShardInfo, err := topo.GetShard(ctx, ts, "test_keyspace", "b0-c0")
	if err != nil {
		t.Fatalf("GetShard: %v", err)
	}

	if eq, err := shardEqual(shardInfo.Shard, updatedShardInfo.Shard); err != nil {
		t.Errorf("cannot compare shards: %v", err)
	} else if !eq {
		t.Errorf("put and got shards are not identical:\n%#v\n%#v", shardInfo.Shard, updatedShardInfo.Shard)
	}

	// test GetShardNames
	shards, err := ts.GetShardNames(ctx, "test_keyspace")
	if err != nil {
		t.Errorf("GetShardNames: %v", err)
	}
	if len(shards) != 1 || shards[0] != "b0-c0" {
		t.Errorf(`GetShardNames: want [ "b0-c0" ], got %v`, shards)
	}

	if _, err := ts.GetShardNames(ctx, "test_keyspace666"); err != topo.ErrNoNode {
		t.Errorf("GetShardNames(666): %v", err)
	}

	// test ValidateShard
	if err := ts.ValidateShard(ctx, "test_keyspace", "b0-c0"); err != nil {
		t.Errorf("ValidateShard(test_keyspace, b0-c0) failed: %v", err)
	}
}