Example 1
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
		} else {
			wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias)
			if err := wr.tmc.SetReadWrite(masterElect, wr.ActionTimeout()); err != nil {
				wr.logger.Warningf("master master-elect read-write failed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
			}
		}
	} else {
		wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
	}

	// save the new master in the shard info
	si.MasterAlias = masterElect.Alias
	if err := topo.UpdateShard(wr.ts, si); err != nil {
		wr.logger.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	wr.logger.Infof("rebuilding shard serving graph data")
	return topotools.RebuildShard(wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted)
}
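
Nearly every example in this corpus follows the same read-modify-write cycle: fetch the ShardInfo from the topology server, mutate it in memory, then persist it with topo.UpdateShard. Below is a minimal, self-contained sketch of that cycle; the Server and ShardInfo types here are simplified stand-ins for illustration, not the real Vitess topo API.

package main

import (
	"errors"
	"fmt"
)

// ShardInfo is a simplified stand-in for topo.ShardInfo: the shard
// record plus enough identity to write it back.
type ShardInfo struct {
	Keyspace    string
	Shard       string
	MasterAlias string
}

// Server is a toy in-memory topology server.
type Server struct {
	shards map[string]*ShardInfo
}

func shardKey(keyspace, shard string) string { return keyspace + "/" + shard }

// GetShard returns a copy of the current record, playing the role of
// wr.ts.GetShard in the examples.
func (s *Server) GetShard(keyspace, shard string) (*ShardInfo, error) {
	si, ok := s.shards[shardKey(keyspace, shard)]
	if !ok {
		return nil, errors.New("node doesn't exist")
	}
	cp := *si
	return &cp, nil
}

// UpdateShard writes the record back, the role topo.UpdateShard plays above.
func (s *Server) UpdateShard(si *ShardInfo) error {
	s.shards[shardKey(si.Keyspace, si.Shard)] = si
	return nil
}

// setMaster is the read-modify-write cycle in miniature.
func setMaster(s *Server, keyspace, shard, masterAlias string) error {
	si, err := s.GetShard(keyspace, shard)
	if err != nil {
		return err
	}
	si.MasterAlias = masterAlias
	return s.UpdateShard(si)
}

func main() {
	s := &Server{shards: map[string]*ShardInfo{
		"test_keyspace/0": {Keyspace: "test_keyspace", Shard: "0"},
	}}
	if err := setMaster(s, "test_keyspace", "0", "cell1-0000000001"); err != nil {
		fmt.Println("update failed:", err)
		return
	}
	si, _ := s.GetShard("test_keyspace", "0")
	fmt.Println("new master:", si.MasterAlias)
}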
Example 2
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration.
func (wr *Wrangler) replicaMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType pb.TabletType, cells []string, reverse bool, tables []string, ev *events.MigrateServedFrom) error {
	// Save the destination keyspace (its ServedFrom has been changed)
	event.DispatchUpdate(ev, "updating keyspace")
	if err := topo.UpdateKeyspace(ctx, wr.ts, ki); err != nil {
		return err
	}

	// Save the source shard (its blacklisted tables field has changed)
	event.DispatchUpdate(ev, "updating source shard")
	if err := sourceShard.UpdateSourceBlacklistedTables(servedType, cells, reverse, tables); err != nil {
		return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}
	if err := topo.UpdateShard(ctx, wr.ts, sourceShard); err != nil {
		return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}

	// Now refresh the source servers so they reload their
	// blacklisted table list
	event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables")
	if err := wr.RefreshTablesByShard(ctx, sourceShard, servedType, cells); err != nil {
		return err
	}

	return nil
}
Example 3
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration.
func (wr *Wrangler) replicaMigrateServedFrom(ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType topo.TabletType, reverse bool, tables []string, ev *events.MigrateServedFrom) error {
	// Save the destination keyspace (its ServedFrom has been changed)
	event.DispatchUpdate(ev, "updating keyspace")
	if err := topo.UpdateKeyspace(wr.ts, ki); err != nil {
		return err
	}

	// Save the source shard (its blacklisted tables field has changed)
	event.DispatchUpdate(ev, "updating source shard")
	if sourceShard.BlacklistedTablesMap == nil {
		sourceShard.BlacklistedTablesMap = make(map[topo.TabletType][]string)
	}
	if reverse {
		delete(sourceShard.BlacklistedTablesMap, servedType)
	} else {
		sourceShard.BlacklistedTablesMap[servedType] = tables
	}
	if err := topo.UpdateShard(wr.ts, sourceShard); err != nil {
		return err
	}

	// Now refresh the source servers so they reload their
	// blacklisted table list
	event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables")
	if err := wr.RefreshTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType); err != nil {
		return err
	}

	return nil
}
Example 4
func (wr *Wrangler) setShardServedTypes(keyspace, shard string, servedTypes []topo.TabletType) error {
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}

	shardInfo.ServedTypes = servedTypes
	return topo.UpdateShard(wr.ts, shardInfo)
}
Example 5
func (wr *Wrangler) setShardServedTypes(ctx context.Context, keyspace, shard string, cells []string, servedType topo.TabletType, remove bool) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	if err := si.UpdateServedTypesMap(servedType, cells, remove); err != nil {
		return err
	}
	return topo.UpdateShard(ctx, wr.ts, si)
}
Example 6
// updateShardCellsAndMaster will update the 'Cells' and possibly
// MasterAlias records for the shard, if needed.
func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) error {
	// See if we need to update the Shard:
	// - add the tablet's cell to the shard's Cells if needed
	// - change the master if needed
	shardUpdateRequired := false
	if !si.HasCell(tabletAlias.Cell) {
		shardUpdateRequired = true
	}
	if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias {
		shardUpdateRequired = true
	}
	if !shardUpdateRequired {
		return nil
	}

	actionNode := actionnode.UpdateShard()
	keyspace := si.Keyspace()
	shard := si.ShardName()
	lockPath, err := wr.lockShard(ctx, keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	// re-read the shard with the lock
	si, err = wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
	}

	// update it
	wasUpdated := false
	if !si.HasCell(tabletAlias.Cell) {
		si.Cells = append(si.Cells, tabletAlias.Cell)
		wasUpdated = true
	}
	if tabletType == topo.TYPE_MASTER && si.MasterAlias != tabletAlias {
		if !si.MasterAlias.IsZero() && !force {
			return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", si.MasterAlias, keyspace, shard))
		}
		si.MasterAlias = tabletAlias
		wasUpdated = true
	}

	if wasUpdated {
		// write it back
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
		}
	}

	// and unlock
	return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
}
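
Example 6 adds a locking discipline around the same cycle: take the shard lock, re-read the record under the lock so no concurrent update is lost, mutate, write back, and release the lock on every exit path while preserving the first error. Here is a compact, self-contained sketch of that discipline; the lock helpers are toy stand-ins, not the real actionnode/wrangler API.

package main

import "fmt"

// ShardInfo and the helpers below are toy stand-ins for the topo API.
type ShardInfo struct{ MasterAlias string }

var (
	store  = map[string]*ShardInfo{"test_keyspace/0": {}}
	locked = map[string]bool{}
)

func lockShard(key string) (string, error) {
	if locked[key] {
		return "", fmt.Errorf("shard %v is already locked", key)
	}
	locked[key] = true
	return "/locks/" + key, nil
}

// unlockShard releases the lock and returns the error it was handed,
// the same "preserve the first error" convention the wrangler uses.
func unlockShard(key, lockPath string, first error) error {
	delete(locked, key)
	return first
}

func getShard(key string) (*ShardInfo, error) {
	si, ok := store[key]
	if !ok {
		return nil, fmt.Errorf("node doesn't exist")
	}
	cp := *si
	return &cp, nil
}

func updateShard(key string, si *ShardInfo) error {
	store[key] = si
	return nil
}

// updateUnderLock mirrors updateShardCellsAndMaster: lock, re-read
// under the lock, mutate, write back, always unlock.
func updateUnderLock(key string, mutate func(*ShardInfo)) error {
	lockPath, err := lockShard(key)
	if err != nil {
		return err
	}
	si, err := getShard(key)
	if err != nil {
		return unlockShard(key, lockPath, err)
	}
	mutate(si)
	if err := updateShard(key, si); err != nil {
		return unlockShard(key, lockPath, err)
	}
	return unlockShard(key, lockPath, nil)
}

func main() {
	err := updateUnderLock("test_keyspace/0", func(si *ShardInfo) {
		si.MasterAlias = "cell1-0000000002"
	})
	fmt.Println(err, store["test_keyspace/0"].MasterAlias)
}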
Example 7
func (wr *Wrangler) removeShardCell(keyspace, shard, cell string, force bool) error {
	shardInfo, err := wr.ts.GetShardCritical(keyspace, shard)
	if err != nil {
		return err
	}

	// check the cell is in the list already
	if !topo.InCellList(cell, shardInfo.Cells) {
		return fmt.Errorf("cell %v in not in shard info", cell)
	}

	// check the master alias is not in the cell
	if shardInfo.MasterAlias.Cell == cell {
		return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell)
	}

	// get the ShardReplication object in the cell
	sri, err := wr.ts.GetShardReplication(cell, keyspace, shard)
	switch err {
	case nil:
		if len(sri.ReplicationLinks) > 0 {
			return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.ReplicationLinks))
		}

		// ShardReplication object is now useless, remove it
		if err := wr.ts.DeleteShardReplication(cell, keyspace, shard); err != nil {
			return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err)
		}

		// we keep going
	case topo.ErrNoNode:
		// no ShardReplication object, we keep going
	default:
		// we can't get the object, assume topo server is down there,
		// so we look at force flag
		if !force {
			return err
		}
		log.Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell)
	}

	// now we can update the shard
	log.Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard)
	newCells := make([]string, 0, len(shardInfo.Cells)-1)
	for _, c := range shardInfo.Cells {
		if c != cell {
			newCells = append(newCells, c)
		}
	}
	shardInfo.Cells = newCells

	return topo.UpdateShard(wr.ts, shardInfo)
}
Example 8
func TestReparentTablet(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)

	// create shard and tablets
	if err := topo.CreateShard(ctx, ts, "test_keyspace", "0"); err != nil {
		t.Fatalf("CreateShard failed: %v", err)
	}
	master := NewFakeTablet(t, wr, "cell1", 1, pb.TabletType_MASTER)
	slave := NewFakeTablet(t, wr, "cell1", 2, pb.TabletType_REPLICA)

	// mark the master inside the shard
	si, err := ts.GetShard(ctx, "test_keyspace", "0")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias = master.Tablet.Alias
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// master action loop (to initialize host and port)
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// slave loop
	slave.FakeMysqlDaemon.SetMasterCommandsInput = fmt.Sprintf("%v:%v", master.Tablet.Hostname, master.Tablet.PortMap["mysql"])
	slave.FakeMysqlDaemon.SetMasterCommandsResult = []string{"set master cmd 1"}
	slave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"set master cmd 1",
	}
	slave.StartActionLoop(t, wr)
	defer slave.StopActionLoop(t)

	// run ReparentTablet
	if err := wr.ReparentTablet(ctx, slave.Tablet.Alias); err != nil {
		t.Fatalf("ReparentTablet failed: %v", err)
	}

	// check what was run
	if err := slave.FakeMysqlDaemon.CheckSuperQueryList(); err != nil {
		t.Fatalf("slave.FakeMysqlDaemon.CheckSuperQueryList failed: %v", err)
	}
}
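
The test above works by pre-loading the fake MySQL daemon with the exact commands the tablet is expected to run, then calling CheckSuperQueryList afterwards to verify nothing was missed. A stripped-down version of that expect/execute/verify pattern, with a toy fake daemon rather than the real FakeMysqlDaemon:

package main

import "fmt"

// fakeDaemon is a toy version of the expectation mechanism: the test
// pre-loads expected queries, the code under test "executes" them in
// order, and a final check confirms the list was fully consumed.
type fakeDaemon struct {
	expected []string
	next     int
}

func (f *fakeDaemon) Execute(query string) error {
	if f.next >= len(f.expected) || f.expected[f.next] != query {
		return fmt.Errorf("unexpected query: %q", query)
	}
	f.next++
	return nil
}

func (f *fakeDaemon) CheckQueryList() error {
	if f.next != len(f.expected) {
		return fmt.Errorf("%v expected queries never ran", len(f.expected)-f.next)
	}
	return nil
}

func main() {
	fd := &fakeDaemon{expected: []string{"set master cmd 1"}}
	if err := fd.Execute("set master cmd 1"); err != nil {
		fmt.Println("execute:", err)
	}
	fmt.Println("check:", fd.CheckQueryList()) // check: <nil>
}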
Example 9
func (wr *Wrangler) setShardTabletControl(ctx context.Context, keyspace, shard string, tabletType topo.TabletType, cells []string, remove, disableQueryService bool, tables []string) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	if len(tables) == 0 && !remove {
		// we are setting the DisableQueryService flag only
		if err := shardInfo.UpdateDisableQueryService(tabletType, cells, disableQueryService); err != nil {
			return fmt.Errorf("UpdateDisableQueryService(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err)
		}
	} else {
		// we are setting / removing the blacklisted tables only
		if err := shardInfo.UpdateSourceBlacklistedTables(tabletType, cells, remove, tables); err != nil {
			return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", shardInfo.Keyspace(), shardInfo.ShardName(), err)
		}
	}
	return topo.UpdateShard(ctx, wr.ts, shardInfo)
}
Example 10
// SetSourceShards is a utility function to override the SourceShards fields
// on a Shard.
func (wr *Wrangler) SetSourceShards(ctx context.Context, keyspace, shard string, sources []topo.TabletAlias, tables []string) error {
	// read the shard
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// If the shard already has sources, maybe it's already been restored,
	// so let's be safe and abort right here.
	if len(shardInfo.SourceShards) > 0 {
		return fmt.Errorf("Shard %v/%v already has SourceShards, not overwriting them", keyspace, shard)
	}

	// read the source tablets
	sourceTablets, err := topo.GetTabletMap(ctx, wr.TopoServer(), sources)
	if err != nil {
		return err
	}

	// Insert their KeyRange in the SourceShards array.
	// We use a linear 0-based id that matches what mysqlctld/split.go
	// inserts into _vt.blp_checkpoint.
	shardInfo.SourceShards = make([]*pb.Shard_SourceShard, len(sourceTablets))
	i := 0
	for _, ti := range sourceTablets {
		shardInfo.SourceShards[i] = &pb.Shard_SourceShard{
			Uid:      uint32(i),
			Keyspace: ti.Keyspace,
			Shard:    ti.Shard,
			KeyRange: key.KeyRangeToProto(ti.KeyRange),
			Tables:   tables,
		}
		i++
	}

	// and write the shard
	if err = topo.UpdateShard(ctx, wr.ts, shardInfo); err != nil {
		return err
	}

	return nil
}
Example 11
func (wr *Wrangler) sourceShardDelete(ctx context.Context, keyspace, shard string, uid uint32) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}
	newSourceShards := make([]topo.SourceShard, 0, len(si.SourceShards))
	for _, ss := range si.SourceShards {
		if ss.Uid != uid {
			newSourceShards = append(newSourceShards, ss)
		}
	}
	if len(newSourceShards) == len(si.SourceShards) {
		return fmt.Errorf("no SourceShard with uid %v", uid)
	}
	if len(newSourceShards) == 0 {
		newSourceShards = nil
	}
	si.SourceShards = newSourceShards
	return topo.UpdateShard(ctx, wr.ts, si)
}
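
sourceShardDelete removes an entry by rebuilding the slice without the matching uid, then detects "not found" by comparing lengths. The same filter idiom in isolation, as a small runnable sketch:

package main

import "fmt"

// removeUid returns s without the element equal to uid, plus whether
// anything was removed, using the same length comparison as
// sourceShardDelete above.
func removeUid(s []uint32, uid uint32) ([]uint32, bool) {
	out := make([]uint32, 0, len(s))
	for _, v := range s {
		if v != uid {
			out = append(out, v)
		}
	}
	return out, len(out) != len(s)
}

func main() {
	s, found := removeUid([]uint32{1, 2, 3}, 2)
	fmt.Println(s, found) // [1 3] true
}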
Example 12
func (wr *Wrangler) sourceShardAdd(ctx context.Context, keyspace, shard string, uid uint32, skeyspace, sshard string, keyRange key.KeyRange, tables []string) error {
	si, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// check the uid is not used already
	for _, ss := range si.SourceShards {
		if ss.Uid == uid {
			return fmt.Errorf("uid %v is already in use", uid)
		}
	}

	si.SourceShards = append(si.SourceShards, topo.SourceShard{
		Uid:      uid,
		Keyspace: skeyspace,
		Shard:    sshard,
		KeyRange: keyRange,
		Tables:   tables,
	})
	return topo.UpdateShard(ctx, wr.ts, si)
}
Example 13
func (wr *Wrangler) setShardBlacklistedTables(keyspace, shard string, tabletType topo.TabletType, tables []string) error {
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}

	if len(tables) == 0 {
		// it's a removal
		if shardInfo.BlacklistedTablesMap != nil {
			delete(shardInfo.BlacklistedTablesMap, tabletType)
			if len(shardInfo.BlacklistedTablesMap) == 0 {
				shardInfo.BlacklistedTablesMap = nil
			}
		}
	} else {
		// it's an addition
		if shardInfo.BlacklistedTablesMap == nil {
			shardInfo.BlacklistedTablesMap = make(map[topo.TabletType][]string)
		}
		shardInfo.BlacklistedTablesMap[tabletType] = tables
	}
	return topo.UpdateShard(wr.ts, shardInfo)
}
Example 14
// migrateServedTypes operates with all concerned shards locked.
func (wr *Wrangler) migrateServedTypes(keyspace string, sourceShards, destinationShards []*topo.ShardInfo, servedType topo.TabletType, reverse bool, shardCache map[string]*topo.ShardInfo) (err error) {

	// re-read all the shards so we are up to date
	for i, si := range sourceShards {
		if sourceShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
		shardCache[si.ShardName()] = sourceShards[i]
	}
	for i, si := range destinationShards {
		if destinationShards[i], err = wr.ts.GetShard(si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
		shardCache[si.ShardName()] = destinationShards[i]
	}

	ev := &events.MigrateServedTypes{
		Keyspace:          *topo.NewKeyspaceInfo(keyspace, nil, -1),
		SourceShards:      sourceShards,
		DestinationShards: destinationShards,
		ServedType:        servedType,
		Reverse:           reverse,
	}
	event.DispatchUpdate(ev, "start")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// check and update all shard records, in memory only
	for _, si := range sourceShards {
		if reverse {
			// need to add to source
			if topo.IsTypeInList(servedType, si.ServedTypes) {
				return fmt.Errorf("Source shard %v/%v is already serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
			si.ServedTypes = append(si.ServedTypes, servedType)
		} else {
			// need to remove from source
			var found bool
			if si.ServedTypes, found = removeType(servedType, si.ServedTypes); !found {
				return fmt.Errorf("Source shard %v/%v is not serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
		}
	}
	for _, si := range destinationShards {
		if reverse {
			// need to remove from destination
			var found bool
			if si.ServedTypes, found = removeType(servedType, si.ServedTypes); !found {
				return fmt.Errorf("Destination shard %v/%v is not serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
		} else {
			// need to add to destination
			if topo.IsTypeInList(servedType, si.ServedTypes) {
				return fmt.Errorf("Destination shard %v/%v is already serving type %v", si.Keyspace(), si.ShardName(), servedType)
			}
			si.ServedTypes = append(si.ServedTypes, servedType)
		}
	}

	// For master type migration, need to:
	// - switch the source shards to read-only
	// - gather all replication points
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	if servedType == topo.TYPE_MASTER {
		event.DispatchUpdate(ev, "setting all source masters read-only")
		err := wr.makeMastersReadOnly(sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "getting positions of source masters")
		masterPositions, err := wr.getMastersPosition(sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "waiting for destination masters to catch up")
		if err := wr.waitForFilteredReplication(masterPositions, destinationShards); err != nil {
			return err
		}

		for _, si := range destinationShards {
			si.SourceShards = nil
		}
	}

	// All is good, we can save the shards now
	event.DispatchUpdate(ev, "updating source shards")
	for _, si := range sourceShards {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
		shardCache[si.ShardName()] = si
	}
	event.DispatchUpdate(ev, "updating destination shards")
	for _, si := range destinationShards {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
		shardCache[si.ShardName()] = si
	}

	// And tell the new shards' masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	if servedType == topo.TYPE_MASTER {
		event.DispatchUpdate(ev, "setting destination masters read-write")
		if err := wr.makeMastersReadWrite(destinationShards); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
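
migrateServedTypes, and the other migrate functions below, rely on a Go idiom worth calling out: err is a named return value, so the deferred closure registered after the "start" event sees the final error of any return path and can dispatch a "failed" event exactly once. A self-contained sketch of that idiom, with a stand-in dispatch function:

package main

import "fmt"

// dispatchUpdate stands in for event.DispatchUpdate.
func dispatchUpdate(status string) { fmt.Println("event:", status) }

// migrate shows the named-return + defer pattern: because err is a
// named result, the deferred closure observes whatever value the
// function ultimately returns, on every return path.
func migrate(fail bool) (err error) {
	dispatchUpdate("start")
	defer func() {
		if err != nil {
			dispatchUpdate("failed: " + err.Error())
		}
	}()
	if fail {
		return fmt.Errorf("filtered replication timed out")
	}
	dispatchUpdate("finished")
	return nil
}

func main() {
	_ = migrate(true)  // event: start / event: failed: ...
	_ = migrate(false) // event: start / event: finished
}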
Example 15
func (wr *Wrangler) migrateServedFrom(ki *topo.KeyspaceInfo, si *topo.ShardInfo, servedType topo.TabletType, reverse bool) (err error) {

	// re-read and update keyspace info record
	ki, err = wr.ts.GetKeyspace(ki.KeyspaceName())
	if err != nil {
		return err
	}
	if reverse {
		if _, ok := ki.ServedFrom[servedType]; ok {
			return fmt.Errorf("Destination Keyspace %s is not serving type %v", ki.KeyspaceName(), servedType)
		}
		ki.ServedFrom[servedType] = si.SourceShards[0].Keyspace
	} else {
		if _, ok := ki.ServedFrom[servedType]; !ok {
			return fmt.Errorf("Destination Keyspace %s is already serving type %v", ki.KeyspaceName(), servedType)
		}
		delete(ki.ServedFrom, servedType)
	}

	// re-read and check the destination shard
	si, err = wr.ts.GetShard(si.Keyspace(), si.ShardName())
	if err != nil {
		return err
	}
	if len(si.SourceShards) != 1 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", si.Keyspace(), si.ShardName())
	}
	tables := si.SourceShards[0].Tables

	// read the source shard, we'll need its master
	sourceShard, err := wr.ts.GetShard(si.SourceShards[0].Keyspace, si.SourceShards[0].Shard)
	if err != nil {
		return err
	}

	ev := &events.MigrateServedFrom{
		Keyspace:         *ki,
		SourceShard:      *sourceShard,
		DestinationShard: *si,
		ServedType:       servedType,
		Reverse:          reverse,
	}
	event.DispatchUpdate(ev, "start")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// For master type migration, need to:
	// - switch the source shard to read-only
	// - gather the replication point
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	var sourceMasterTabletInfo *topo.TabletInfo
	if servedType == topo.TYPE_MASTER {
		// set master to read-only
		event.DispatchUpdate(ev, "setting source shard master to read-only")
		actionPath, err := wr.ai.SetReadOnly(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		if err := wr.WaitForCompletion(actionPath); err != nil {
			return err
		}

		// get the position
		event.DispatchUpdate(ev, "getting master position")
		sourceMasterTabletInfo, err = wr.ts.GetTablet(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		masterPosition, err := wr.ai.MasterPosition(sourceMasterTabletInfo, wr.ActionTimeout())
		if err != nil {
			return err
		}

		// wait for it
		event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
		if err := wr.ai.WaitBlpPosition(si.MasterAlias, blproto.BlpPosition{
			Uid:      0,
			Position: masterPosition,
		}, wr.ActionTimeout()); err != nil {
			return err
		}

		// and clear the shard record
		si.SourceShards = nil
	}

	// All is good, we can save the keyspace and shard (if needed) now
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(wr.ts, ki); err != nil {
		return err
	}
	event.DispatchUpdate(ev, "updating destination shard")
	if servedType == topo.TYPE_MASTER {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
	}

	// Tell the new shards' masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if servedType == topo.TYPE_MASTER {
		if err := wr.makeMastersReadWrite([]*topo.ShardInfo{si}); err != nil {
			return err
		}
	}

	// Now blacklist the table list on the right servers
	event.DispatchUpdate(ev, "setting blacklisted tables on source shard")
	if servedType == topo.TYPE_MASTER {
		if err := wr.ai.SetBlacklistedTables(sourceMasterTabletInfo, tables, wr.ActionTimeout()); err != nil {
			return err
		}
	} else {
		// We use the list of tables that are replicating
		// for the blacklist. In case of a reverse move, we clear the
		// blacklist.
		if reverse {
			tables = nil
		}
		if err := wr.SetBlacklistedTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType, tables); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
Example 16
// TestInitMasterShardOneSlaveFails makes sure that if one slave fails to
// proceed, the action completes anyway
func TestInitMasterShardOneSlaveFails(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient(), time.Second)

	// Create a master, a couple slaves
	master := NewFakeTablet(t, wr, "cell1", 0, pb.TabletType_MASTER)
	goodSlave := NewFakeTablet(t, wr, "cell1", 1, pb.TabletType_REPLICA)
	badSlave := NewFakeTablet(t, wr, "cell2", 2, pb.TabletType_REPLICA)

	// Master: set a plausible ReplicationPosition to return,
	// and expect to add entry in _vt.reparent_journal
	master.FakeMysqlDaemon.CurrentMasterPosition = myproto.ReplicationPosition{
		GTIDSet: myproto.MariadbGTID{
			Domain:   5,
			Server:   456,
			Sequence: 890,
		},
	}
	master.FakeMysqlDaemon.ReadOnly = true
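	// Note: expectations prefixed with "SUB" are matched loosely by the
	// fake daemon (against the start of the actual query) rather than as
	// exact strings, so the long INSERT below only needs to match its prefix.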
	master.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"CREATE DATABASE IF NOT EXISTS _vt",
		"SUBCREATE TABLE IF NOT EXISTS _vt.reparent_journal",
		"SUBINSERT INTO _vt.reparent_journal (time_created_ns, action_name, master_alias, replication_position) VALUES",
	}
	master.StartActionLoop(t, wr)
	defer master.StopActionLoop(t)

	// goodSlave: expect to be re-parented
	goodSlave.FakeMysqlDaemon.ReadOnly = true
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         master.Tablet.Hostname,
		MasterPort:         int(master.Tablet.PortMap["mysql"]),
		MasterConnectRetry: 10,
	}
	goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult = []string{"cmd1"}
	goodSlave.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = goodSlave.FakeMysqlDaemon.StartReplicationCommandsResult
	goodSlave.StartActionLoop(t, wr)
	defer goodSlave.StopActionLoop(t)

	// badSlave: insert an error by failing the ReplicationStatus input
	// on purpose
	badSlave.FakeMysqlDaemon.ReadOnly = true
	badSlave.FakeMysqlDaemon.StartReplicationCommandsStatus = &myproto.ReplicationStatus{
		Position:           master.FakeMysqlDaemon.CurrentMasterPosition,
		MasterHost:         "",
		MasterPort:         0,
		MasterConnectRetry: 10,
	}
	badSlave.StartActionLoop(t, wr)
	defer badSlave.StopActionLoop(t)

	// also change the master alias in the Shard object, to make sure it
	// is set back.
	si, err := ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias.Uid++
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// run InitShardMaster without force, it fails because master is
	// changing.
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, false /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "is not the shard master") {
		t.Errorf("InitShardMaster with mismatched new master returned wrong error: %v", err)
	}

	// run InitShardMaster
	if err := wr.InitShardMaster(ctx, master.Tablet.Keyspace, master.Tablet.Shard, master.Tablet.Alias, true /*force*/, 10*time.Second); err == nil || !strings.Contains(err.Error(), "wrong status for StartReplicationCommands") {
		t.Errorf("InitShardMaster with one failed slave returned wrong error: %v", err)
	}

	// check what was run: master should still be good
	if master.FakeMysqlDaemon.ReadOnly {
		t.Errorf("master was not turned read-write")
	}
	si, err = ts.GetShard(ctx, master.Tablet.Keyspace, master.Tablet.Shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	if !topo.TabletAliasEqual(si.MasterAlias, master.Tablet.Alias) {
		t.Errorf("unexpected shard master alias, got %v expected %v", si.MasterAlias, master.Tablet.Alias)
	}
}
Example 17
// TestTabletControl verifies the shard's TabletControl record can disable
// query service in a tablet.
func TestTabletControl(t *testing.T) {
	ctx := context.Background()
	agent := createTestAgent(ctx, t)
	targetTabletType := topo.TYPE_REPLICA

	// first health check, should change us to replica
	before := time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err := agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("First health check failed to go to replica: %v", ti.Type)
	}
	if !agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// now update the shard
	si, err := agent.TopoServer.GetShard(ctx, keyspace, shard)
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.TabletControls = []*pb.Shard_TabletControl{
		&pb.Shard_TabletControl{
			TabletType:          topo.TabletTypeToProto(targetTabletType),
			DisableQueryService: true,
		},
	}
	if err := topo.UpdateShard(ctx, agent.TopoServer, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// now refresh the tablet state, as the resharding process would do
	agent.RPCWrapLockAction(ctx, actionnode.TabletActionRefreshState, "", "", true, func() error {
		agent.RefreshState(ctx)
		return nil
	})

	// check we shutdown query service
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}

	// check running a health check will not start it again
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("Health check failed to go to replica: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// go unhealthy, check we go to spare and QS is not running
	agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy")
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_SPARE {
		t.Errorf("Unhealthy health check should go to spare: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}

	// go back healthy, check QS is still not running
	agent.HealthReporter.(*fakeHealthCheck).reportError = nil
	before = time.Now()
	agent.runHealthCheck(targetTabletType)
	ti, err = agent.TopoServer.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != targetTabletType {
		t.Errorf("Healthy health check should go to replica: %v", ti.Type)
	}
	if agent.QueryServiceControl.IsServing() {
		t.Errorf("Query service should not be running")
	}
	if agent._healthyTime.Sub(before) < 0 {
		t.Errorf("runHealthCheck did not update agent._healthyTime")
	}
}
Example 18
// migrateServedTypes operates with all concerned shards locked.
func (wr *Wrangler) migrateServedTypes(ctx context.Context, keyspace string, sourceShards, destinationShards []*topo.ShardInfo, cells []string, servedType pb.TabletType, reverse bool, filteredReplicationWaitTime time.Duration) (err error) {

	// re-read all the shards so we are up to date
	wr.Logger().Infof("Re-reading all shards")
	for i, si := range sourceShards {
		if sourceShards[i], err = wr.ts.GetShard(ctx, si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
	}
	for i, si := range destinationShards {
		if destinationShards[i], err = wr.ts.GetShard(ctx, si.Keyspace(), si.ShardName()); err != nil {
			return err
		}
	}

	ev := &events.MigrateServedTypes{
		Keyspace:          *topo.NewKeyspaceInfo(keyspace, nil, -1),
		SourceShards:      sourceShards,
		DestinationShards: destinationShards,
		ServedType:        servedType,
		Reverse:           reverse,
	}
	event.DispatchUpdate(ev, "start")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// For master type migration, need to:
	// - switch the source shards to read-only by disabling query service
	// - gather all replication points
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	if servedType == pb.TabletType_MASTER {
		event.DispatchUpdate(ev, "disabling query service on all source masters")
		for _, si := range sourceShards {
			if err := si.UpdateDisableQueryService(pb.TabletType_MASTER, nil, true); err != nil {
				return err
			}
			if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
				return err
			}
		}
		if err := wr.refreshMasters(ctx, sourceShards); err != nil {
			return err
		}

		event.DispatchUpdate(ev, "getting positions of source masters")
		masterPositions, err := wr.getMastersPosition(ctx, sourceShards)
		if err != nil {
			return err
		}

		event.DispatchUpdate(ev, "waiting for destination masters to catch up")
		if err := wr.waitForFilteredReplication(ctx, masterPositions, destinationShards, filteredReplicationWaitTime); err != nil {
			return err
		}

		for _, si := range destinationShards {
			si.SourceShards = nil
		}
	}

	// Check and update all shard records, in memory only.
	// We remember if we need to refresh the state of the source tablets
	// so their query service is enabled again, for reverse migration.
	needToRefreshSourceTablets := false
	for _, si := range sourceShards {
		if err := si.UpdateServedTypesMap(servedType, cells, !reverse); err != nil {
			return err
		}
		if tc := si.GetTabletControl(servedType); reverse && tc != nil && tc.DisableQueryService {
			// this is a backward migration, where the
			// source tablets were disabled previously, so
			// we need to refresh them
			if err := si.UpdateDisableQueryService(servedType, cells, false); err != nil {
				return err
			}
			needToRefreshSourceTablets = true
		}
		if !reverse && servedType != pb.TabletType_MASTER {
			// this is a forward migration, we need to disable
			// query service on the source shards.
			// (this was already done for masters earlier)
			if err := si.UpdateDisableQueryService(servedType, cells, true); err != nil {
				return err
			}
		}
	}
	// We remember if we need to refresh the state of the destination tablets
	// so their query service will be enabled.
	needToRefreshDestinationTablets := false
	for _, si := range destinationShards {
		if err := si.UpdateServedTypesMap(servedType, cells, reverse); err != nil {
			return err
		}
		if tc := si.GetTabletControl(servedType); !reverse && tc != nil && tc.DisableQueryService {
			// This is a forwards migration, and the destination query service was already in a disabled state.
			// We need to enable and force a refresh, otherwise it's possible that both the source and destination
			// will have query service disabled at the same time, and queries would have nowhere to go.
			if err := si.UpdateDisableQueryService(servedType, cells, false); err != nil {
				return err
			}
			needToRefreshDestinationTablets = true
		}
		if reverse && servedType != pb.TabletType_MASTER {
			// this is a backwards migration, we need to disable
			// query service on the destination shards.
			// (we're not allowed to reverse a master migration)
			if err := si.UpdateDisableQueryService(servedType, cells, true); err != nil {
				return err
			}
		}
	}

	// All is good, we can save the shards now
	event.DispatchUpdate(ev, "updating source shards")
	for _, si := range sourceShards {
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return err
		}
	}
	if needToRefreshSourceTablets {
		event.DispatchUpdate(ev, "refreshing source shard tablets so they restart their query service")
		for _, si := range sourceShards {
			wr.RefreshTablesByShard(ctx, si, servedType, cells)
		}
	}
	event.DispatchUpdate(ev, "updating destination shards")
	for _, si := range destinationShards {
		if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
			return err
		}
	}
	if needToRefreshDestinationTablets {
		event.DispatchUpdate(ev, "refreshing destination shard tablets so they restart their query service")
		for _, si := range destinationShards {
			wr.RefreshTablesByShard(ctx, si, servedType, cells)
		}
	}

	// And tell the new shards' masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	if servedType == pb.TabletType_MASTER {
		event.DispatchUpdate(ev, "setting destination masters read-write")
		if err := wr.refreshMasters(ctx, destinationShards); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
Example 19
// TestInitTablet will test the InitTablet code creates / updates the
// tablet node correctly. Note we modify global parameters (the flags)
// so this has to be in one test.
func TestInitTablet(t *testing.T) {
	ctx := context.Background()
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	tabletAlias := topo.TabletAlias{
		Cell: "cell1",
		Uid:  1,
	}

	// start with idle, and a tablet record that doesn't exist
	port := 1234
	gRPCPort := 3456
	mysqlDaemon := mysqlctl.NewFakeMysqlDaemon()
	agent := &ActionAgent{
		TopoServer:         ts,
		TabletAlias:        tabletAlias,
		MysqlDaemon:        mysqlDaemon,
		DBConfigs:          nil,
		SchemaOverrides:    nil,
		BinlogPlayerMap:    nil,
		LockTimeout:        10 * time.Second,
		batchCtx:           ctx,
		History:            history.New(historyLength),
		lastHealthMapCount: new(stats.Int),
		_healthy:           fmt.Errorf("healthcheck not run yet"),
	}
	*initTabletType = "idle"
	*tabletHostname = "localhost"
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("NewTestActionAgent(idle) failed: %v", err)
	}
	ti, err := ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_IDLE {
		t.Errorf("wrong type for tablet: %v", ti.Type)
	}
	if ti.Hostname != "localhost" {
		t.Errorf("wrong hostname for tablet: %v", ti.Hostname)
	}
	if ti.Portmap["vt"] != port {
		t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"])
	}
	if ti.Portmap["grpc"] != gRPCPort {
		t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"])
	}

	// try again now that the node exists
	port = 3456
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("NewTestActionAgent(idle again) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Portmap["vt"] != port {
		t.Errorf("wrong port for tablet: %v", ti.Portmap["vt"])
	}
	if ti.Portmap["grpc"] != gRPCPort {
		t.Errorf("wrong gRPC port for tablet: %v", ti.Portmap["grpc"])
	}

	// try with a keyspace and shard on the previously idle tablet,
	// should fail
	*initTabletType = "replica"
	*initKeyspace = "test_keyspace"
	*initShard = "-80"
	if err := agent.InitTablet(port, gRPCPort); err == nil || !strings.Contains(err.Error(), "InitTablet failed because existing tablet keyspace and shard / differ from the provided ones test_keyspace/-80") {
		t.Fatalf("InitTablet(type over idle) didn't fail correctly: %v", err)
	}

	// now let's use a different real tablet in a shard, that will create
	// the keyspace and shard.
	tabletAlias = topo.TabletAlias{
		Cell: "cell1",
		Uid:  2,
	}
	agent.TabletAlias = tabletAlias
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type) failed: %v", err)
	}
	si, err := ts.GetShard(ctx, "test_keyspace", "-80")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	if len(si.Cells) != 1 || si.Cells[0] != "cell1" {
		t.Errorf("shard.Cells not updated properly: %v", si)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_REPLICA {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// try to init again, this time with health check on
	*initTabletType = ""
	*targetTabletType = "replica"
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_SPARE {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// update shard's master to our alias, then try to init again
	si, err = ts.GetShard(ctx, "test_keyspace", "-80")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.MasterAlias = topo.TabletAliasToProto(tabletAlias)
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_MASTER {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}

	// init again with the tablet_type set, no healthcheck
	// (also check db name override and tags here)
	*initTabletType = "replica"
	*targetTabletType = ""
	*initDbNameOverride = "DBNAME"
	initTags.Set("aaa:bbb")
	if err := agent.InitTablet(port, gRPCPort); err != nil {
		t.Fatalf("InitTablet(type, healthcheck) failed: %v", err)
	}
	ti, err = ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		t.Fatalf("GetTablet failed: %v", err)
	}
	if ti.Type != topo.TYPE_MASTER {
		t.Errorf("wrong tablet type: %v", ti.Type)
	}
	if ti.DbNameOverride != "DBNAME" {
		t.Errorf("wrong tablet DbNameOverride: %v", ti.DbNameOverride)
	}
	if len(ti.Tags) != 1 || ti.Tags["aaa"] != "bbb" {
		t.Errorf("wrong tablet tags: %v", ti.Tags)
	}
}
Example 20
// masterMigrateServedFrom handles the master migration. The ordering is
// a bit different than for rdonly / replica to guarantee a smooth transition.
//
// The order is as follows:
// - Add BlacklistedTables on the source shard map for master
// - Refresh the source master, so it stops writing on the tables
// - Get the source master position, wait until destination master reaches it
// - Clear SourceShard on the destination Shard
// - Refresh the destination master, so it stops its filtered
//   replication and starts accepting writes
func (wr *Wrangler) masterMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, tables []string, ev *events.MigrateServedFrom, filteredReplicationWaitTime time.Duration) error {
	// Read the data we need
	sourceMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(sourceShard.MasterAlias))
	if err != nil {
		return err
	}
	destinationMasterTabletInfo, err := wr.ts.GetTablet(ctx, topo.ProtoToTabletAlias(destinationShard.MasterAlias))
	if err != nil {
		return err
	}

	// Update source shard (more blacklisted tables)
	event.DispatchUpdate(ev, "updating source shard")
	if err := sourceShard.UpdateSourceBlacklistedTables(pb.TabletType_MASTER, nil, false, tables); err != nil {
		return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}
	if err := topo.UpdateShard(ctx, wr.ts, sourceShard); err != nil {
		return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), err)
	}

	// Now refresh the blacklisted table list on the source master
	event.DispatchUpdate(ev, "refreshing source master so it updates its blacklisted tables")
	if err := wr.tmc.RefreshState(ctx, sourceMasterTabletInfo); err != nil {
		return err
	}

	// get the position
	event.DispatchUpdate(ev, "getting master position")
	masterPosition, err := wr.tmc.MasterPosition(ctx, sourceMasterTabletInfo)
	if err != nil {
		return err
	}

	// wait for it
	event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
	if err := wr.tmc.WaitBlpPosition(ctx, destinationMasterTabletInfo, blproto.BlpPosition{
		Uid:      0,
		Position: masterPosition,
	}, filteredReplicationWaitTime); err != nil {
		return err
	}

	// Update the destination keyspace (its ServedFrom has changed)
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(ctx, wr.ts, ki); err != nil {
		return err
	}

	// Update the destination shard (no more source shard)
	event.DispatchUpdate(ev, "updating destination shard")
	destinationShard.SourceShards = nil
	if err := topo.UpdateShard(ctx, wr.ts, destinationShard); err != nil {
		return err
	}

	// Tell the new shards' masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if err := wr.refreshMasters(ctx, []*topo.ShardInfo{destinationShard}); err != nil {
		return err
	}

	return nil
}
Example 21
func TestRebuildShard(t *testing.T) {
	ctx := context.Background()
	cells := []string{"test_cell"}
	logger := logutil.NewMemoryLogger()

	// Set up topology.
	ts := zktopo.NewTestServer(t, cells)
	si, err := GetOrCreateShard(ctx, ts, testKeyspace, testShard)
	if err != nil {
		t.Fatalf("GetOrCreateShard: %v", err)
	}
	si.Cells = append(si.Cells, cells[0])
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard: %v", err)
	}

	masterInfo := addTablet(ctx, t, ts, 1, cells[0], topo.TYPE_MASTER)
	replicaInfo := addTablet(ctx, t, ts, 2, cells[0], topo.TYPE_REPLICA)

	// Do an initial rebuild.
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Check initial state.
	ep, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}
	ep, _, err = ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_REPLICA)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if got, want := len(ep.Entries), 1; got != want {
		t.Fatalf("len(Entries) = %v, want %v", got, want)
	}

	// Make a change.
	masterInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, masterInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Make another change.
	replicaInfo.Type = topo.TYPE_SPARE
	if err := topo.UpdateTablet(ctx, ts, replicaInfo); err != nil {
		t.Fatalf("UpdateTablet: %v", err)
	}
	if _, err := RebuildShard(ctx, logger, ts, testKeyspace, testShard, cells, time.Minute); err != nil {
		t.Fatalf("RebuildShard: %v", err)
	}

	// Check that the rebuild picked up both changes.
	if _, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_MASTER); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("first change wasn't picked up by second rebuild")
	}
	if _, _, err := ts.GetEndPoints(ctx, cells[0], testKeyspace, testShard, topo.TYPE_REPLICA); err == nil || !strings.Contains(err.Error(), "node doesn't exist") {
		t.Errorf("second change was overwritten by first rebuild finishing late")
	}
}
Example 22
func TestShardExternallyReparented(t *testing.T) {
	ts := zktopo.NewTestServer(t, []string{"cell1", "cell2"})
	wr := wrangler.New(logutil.NewConsoleLogger(), ts, time.Minute, time.Second)
	wr.UseRPCs = false

	// Create an old master, a new master, two good slaves, one bad slave
	oldMaster := NewFakeTablet(t, wr, "cell1", 0, topo.TYPE_MASTER)
	newMaster := NewFakeTablet(t, wr, "cell1", 1, topo.TYPE_REPLICA,
		TabletParent(oldMaster.Tablet.Alias))
	goodSlave1 := NewFakeTablet(t, wr, "cell1", 2, topo.TYPE_REPLICA,
		TabletParent(oldMaster.Tablet.Alias))
	goodSlave2 := NewFakeTablet(t, wr, "cell2", 3, topo.TYPE_REPLICA,
		TabletParent(oldMaster.Tablet.Alias))
	badSlave := NewFakeTablet(t, wr, "cell1", 4, topo.TYPE_REPLICA,
		TabletParent(oldMaster.Tablet.Alias))

	// Add a new Cell to the Shard, one that doesn't map to any real topo cell,
	// to simulate a data center being unreachable.
	si, err := ts.GetShard("test_keyspace", "0")
	if err != nil {
		t.Fatalf("GetShard failed: %v", err)
	}
	si.Cells = append(si.Cells, "cell666")
	if err := topo.UpdateShard(ts, si); err != nil {
		t.Fatalf("UpdateShard failed: %v", err)
	}

	// Slightly unrelated test: make sure we can find the tablets
	// even with a datacenter being down.
	tabletMap, err := topo.GetTabletMapForShardByCell(ts, "test_keyspace", "0", []string{"cell1"})
	if err != nil {
		t.Fatalf("GetTabletMapForShardByCell should have worked but got: %v", err)
	}
	master, err := topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"])
	if err != nil || master != oldMaster.Tablet.Alias {
		t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master)
	}
	slave1, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave1.Tablet.IPAddr, "vt", goodSlave1.Tablet.Portmap["vt"])
	if err != nil || slave1 != goodSlave1.Tablet.Alias {
		t.Fatalf("FindTabletByIPAddrAndPort(slave1) failed: %v %v", err, master)
	}
	slave2, err := topotools.FindTabletByIPAddrAndPort(tabletMap, goodSlave2.Tablet.IPAddr, "vt", goodSlave2.Tablet.Portmap["vt"])
	if err != topo.ErrNoNode {
		t.Fatalf("FindTabletByIPAddrAndPort(slave2) worked: %v %v", err, slave2)
	}

	// Make sure the master is not exported in other cells
	tabletMap, err = topo.GetTabletMapForShardByCell(ts, "test_keyspace", "0", []string{"cell2"})
	master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"])
	if err != topo.ErrNoNode {
		t.Fatalf("FindTabletByIPAddrAndPort(master) worked in cell2: %v %v", err, master)
	}

	tabletMap, err = topo.GetTabletMapForShard(ts, "test_keyspace", "0")
	if err != topo.ErrPartialResult {
		t.Fatalf("GetTabletMapForShard should have returned ErrPartialResult but got: %v", err)
	}
	master, err = topotools.FindTabletByIPAddrAndPort(tabletMap, oldMaster.Tablet.IPAddr, "vt", oldMaster.Tablet.Portmap["vt"])
	if err != nil || master != oldMaster.Tablet.Alias {
		t.Fatalf("FindTabletByIPAddrAndPort(master) failed: %v %v", err, master)
	}

	// First test: reparent to the same master, make sure it works
	// as expected.
	if err := wr.ShardExternallyReparented("test_keyspace", "0", oldMaster.Tablet.Alias); err == nil {
		t.Fatalf("ShardExternallyReparented(same master) should have failed")
	} else {
		if !strings.Contains(err.Error(), "already master") {
			t.Fatalf("ShardExternallyReparented(same master) should have failed with an error that contains 'already master' but got: %v", err)
		}
	}

	// Second test: reparent to the replica, and pretend the old
	// master is still good to go.

	// On the elected master, we will respond to
	// TABLET_ACTION_SLAVE_WAS_PROMOTED
	newMaster.FakeMysqlDaemon.MasterAddr = ""
	newMaster.StartActionLoop(t, wr)
	defer newMaster.StopActionLoop(t)

	// On the old master, we will only respond to
	// TABLET_ACTION_SLAVE_WAS_RESTARTED.
	oldMaster.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr()
	oldMaster.StartActionLoop(t, wr)
	defer oldMaster.StopActionLoop(t)

	// On the good slaves, we will respond to
	// TABLET_ACTION_SLAVE_WAS_RESTARTED.
	goodSlave1.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr()
	goodSlave1.StartActionLoop(t, wr)
	defer goodSlave1.StopActionLoop(t)

	goodSlave2.FakeMysqlDaemon.MasterAddr = newMaster.Tablet.MysqlIpAddr()
	goodSlave2.StartActionLoop(t, wr)
	defer goodSlave2.StopActionLoop(t)

	// On the bad slave, we will respond to
	// TABLET_ACTION_SLAVE_WAS_RESTARTED with bad data.
	badSlave.FakeMysqlDaemon.MasterAddr = "234.0.0.1:3301"
	badSlave.StartActionLoop(t, wr)
	defer badSlave.StopActionLoop(t)

	// This tests a bad case; the new designated master is a slave,
	// but we should do what we're told anyway
	if err := wr.ShardExternallyReparented("test_keyspace", "0", goodSlave1.Tablet.Alias); err != nil {
		t.Fatalf("ShardExternallyReparented(slave) error: %v", err)
	}

	// This tests the good case, where everything works as planned
	t.Logf("ShardExternallyReparented(new master) expecting success")
	if err := wr.ShardExternallyReparented("test_keyspace", "0", newMaster.Tablet.Alias); err != nil {
		t.Fatalf("ShardExternallyReparented(replica) failed: %v", err)
	}

	// Now double-check the serving graph is good.
	// Should only have one good replica left.
	addrs, err := ts.GetEndPoints("cell1", "test_keyspace", "0", topo.TYPE_REPLICA)
	if err != nil {
		t.Fatalf("GetEndPoints failed at the end: %v", err)
	}
	if len(addrs.Entries) != 1 {
		t.Fatalf("GetEndPoints has too many entries: %v", addrs)
	}
}
Example 23
// Scrap a tablet. If force is used, we write to topo.Server
// directly and don't remote-execute the command.
//
// If we scrap the master for a shard, we will clear its record
// from the Shard object (only if that was the right master)
func (wr *Wrangler) Scrap(tabletAlias topo.TabletAlias, force, skipRebuild bool) (actionPath string, err error) {
	// load the tablet, see if we'll need to rebuild
	ti, err := wr.ts.GetTablet(tabletAlias)
	if err != nil {
		return "", err
	}
	rebuildRequired := ti.Tablet.IsInServingGraph()
	wasMaster := ti.Type == topo.TYPE_MASTER

	if force {
		err = topotools.Scrap(wr.ts, ti.Alias, force)
	} else {
		actionPath, err = wr.ai.Scrap(ti.Alias)
	}
	if err != nil {
		return "", err
	}

	if !rebuildRequired {
		log.Infof("Rebuild not required")
		return
	}
	if skipRebuild {
		log.Warningf("Rebuild required, but skipping it")
		return
	}

	// wait for the remote Scrap if necessary
	if actionPath != "" {
		err = wr.WaitForCompletion(actionPath)
		if err != nil {
			return "", err
		}
	}

	// update the Shard object if the master was scrapped
	if wasMaster {
		actionNode := actionnode.UpdateShard()
		lockPath, err := wr.lockShard(ti.Keyspace, ti.Shard, actionNode)
		if err != nil {
			return "", err
		}

		// read the shard with the lock
		si, err := wr.ts.GetShard(ti.Keyspace, ti.Shard)
		if err != nil {
			return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err)
		}

		// update it if the right alias is there
		if si.MasterAlias == tabletAlias {
			si.MasterAlias = topo.TabletAlias{}

			// write it back
			if err := topo.UpdateShard(wr.ts, si); err != nil {
				return "", wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err)
			}
		} else {
			log.Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias)
		}

		// and unlock
		if err := wr.unlockShard(ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil {
			return "", err
		}
	}

	// and rebuild the original shard / keyspace
	return "", wr.RebuildShardGraph(ti.Keyspace, ti.Shard, []string{ti.Alias.Cell})
}
Example 24
func CheckShard(t *testing.T, ts topo.Server) {
	if err := ts.CreateKeyspace("test_keyspace", &topo.Keyspace{}); err != nil {
		t.Fatalf("CreateKeyspace: %v", err)
	}

	if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != nil {
		t.Fatalf("CreateShard: %v", err)
	}
	if err := topo.CreateShard(ts, "test_keyspace", "b0-c0"); err != topo.ErrNodeExists {
		t.Errorf("CreateShard called second time, got: %v", err)
	}

	if _, err := ts.GetShard("test_keyspace", "666"); err != topo.ErrNoNode {
		t.Errorf("GetShard(666): %v", err)
	}

	shardInfo, err := ts.GetShard("test_keyspace", "b0-c0")
	if err != nil {
		t.Errorf("GetShard: %v", err)
	}
	if want := newKeyRange("b0-c0"); shardInfo.KeyRange != want {
		t.Errorf("shardInfo.KeyRange: want %v, got %v", want, shardInfo.KeyRange)
	}
	master := topo.TabletAlias{Cell: "ny", Uid: 1}
	shardInfo.MasterAlias = master
	shardInfo.KeyRange = newKeyRange("b0-c0")
	shardInfo.ServedTypes = []topo.TabletType{topo.TYPE_MASTER, topo.TYPE_REPLICA, topo.TYPE_RDONLY}
	shardInfo.SourceShards = []topo.SourceShard{
		topo.SourceShard{
			Uid:      1,
			Keyspace: "source_ks",
			Shard:    "b8-c0",
			KeyRange: newKeyRange("b8-c0"),
			Tables:   []string{"table1", "table2"},
		},
	}

	if err := topo.UpdateShard(ts, shardInfo); err != nil {
		t.Errorf("UpdateShard: %v", err)
	}

	shardInfo, err = ts.GetShard("test_keyspace", "b0-c0")
	if err != nil {
		t.Errorf("GetShard: %v", err)
	}
	if shardInfo.MasterAlias != master {
		t.Errorf("after UpdateShard: shardInfo.MasterAlias got %v", shardInfo.MasterAlias)
	}
	if shardInfo.KeyRange != newKeyRange("b0-c0") {
		t.Errorf("after UpdateShard: shardInfo.KeyRange got %v", shardInfo.KeyRange)
	}
	if len(shardInfo.ServedTypes) != 3 || shardInfo.ServedTypes[0] != topo.TYPE_MASTER || shardInfo.ServedTypes[1] != topo.TYPE_REPLICA || shardInfo.ServedTypes[2] != topo.TYPE_RDONLY {
		t.Errorf("after UpdateShard: shardInfo.ServedTypes got %v", shardInfo.ServedTypes)
	}
	if len(shardInfo.SourceShards) != 1 ||
		shardInfo.SourceShards[0].Uid != 1 ||
		shardInfo.SourceShards[0].Keyspace != "source_ks" ||
		shardInfo.SourceShards[0].Shard != "b8-c0" ||
		shardInfo.SourceShards[0].KeyRange != newKeyRange("b8-c0") ||
		len(shardInfo.SourceShards[0].Tables) != 2 ||
		shardInfo.SourceShards[0].Tables[0] != "table1" ||
		shardInfo.SourceShards[0].Tables[1] != "table2" {
		t.Errorf("after UpdateShard: shardInfo.SourceShards got %v", shardInfo.SourceShards)
	}

	shards, err := ts.GetShardNames("test_keyspace")
	if err != nil {
		t.Errorf("GetShardNames: %v", err)
	}
	if len(shards) != 1 || shards[0] != "b0-c0" {
		t.Errorf(`GetShardNames: want [ "b0-c0" ], got %v`, shards)
	}

	if _, err := ts.GetShardNames("test_keyspace666"); err != topo.ErrNoNode {
		t.Errorf("GetShardNames(666): %v", err)
	}

}
Example 25
func tabletExternallyReparentedLocked(ts topo.Server, tablet *topo.TabletInfo, actionTimeout, lockTimeout time.Duration, interrupted chan struct{}) (err error) {
	// read the shard, make sure this tablet is not already the master.
	// critical read, we want up to date info (and the shard is locked).
	shardInfo, err := ts.GetShardCritical(tablet.Keyspace, tablet.Shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == tablet.Alias {
		return fmt.Errorf("this tablet is already the master")
	}

	// Read the tablets, make sure the master elect is known to the shard
	// (it's this tablet, so it better be!).
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(ts, tablet.Keyspace, tablet.Shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		log.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[tablet.Alias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v (this tablet) not found in replication graph %v/%v %v", tablet.Alias, tablet.Keyspace, tablet.Shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *tablet.Tablet,
	}

	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}

	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	event.DispatchUpdate(ev, "starting external reparent from tablet")

	// we fix the new master in the replication graph
	event.DispatchUpdate(ev, "mark ourselves as new master")
	err = updateReplicationGraphForPromotedSlave(ts, tablet)
	if err != nil {
		// This suggests we can't talk to topo server. This is bad.
		return fmt.Errorf("updateReplicationGraphForPromotedSlave failed: %v", err)
	}

	// Once this tablet is promoted, remove it from our maps
	delete(slaveTabletMap, tablet.Alias)
	delete(masterTabletMap, tablet.Alias)

	// Then fix all the slaves, including the old master. This
	// last step is very likely to time out for some tablets (a
	// random tablet may be dead, the old master may be dead, ...).
	// We execute them all in parallel until we hit actionTimeout.
	// After this, no other action with a timeout is executed, so
	// even if we hit the timeout, we're still good.
	event.DispatchUpdate(ev, "restarting slaves")
	logger := logutil.NewConsoleLogger()
	ai := initiator.NewActionInitiator(ts)
	topotools.RestartSlavesExternal(ts, logger, slaveTabletMap, masterTabletMap, masterElectTablet.Alias, func(ti *topo.TabletInfo, swrd *actionnode.SlaveWasRestartedArgs) error {
		return ai.RpcSlaveWasRestarted(ti, swrd, actionTimeout)
	})

	// Compute the list of cells we need to rebuild: just the old
	// master's cell, or all cells (nil) if we reparented across cells.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != tablet.Alias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	log.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = tablet.Alias
	if err = topo.UpdateShard(ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	log.Infof("Rebuilding shard serving graph data")
	if err = topotools.RebuildShard(logger, ts, tablet.Keyspace, tablet.Shard, cells, lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
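The cell computation above is easy to misread: we rebuild only the old master's cell when the new master stays in that cell, and pass nil (meaning all cells) when the reparent crosses cells. A standalone sketch of just that rule, with hypothetical names, not part of the code above:

// cellsToRebuild returns the serving-graph cells to rebuild after a
// reparent: only the old master's cell for a same-cell reparent, or
// nil (interpreted as "all cells") when the master moved cells.
func cellsToRebuild(oldMasterCell, newMasterCell string) []string {
	if oldMasterCell != newMasterCell {
		return nil // cross-cell reparent: rebuild every cell
	}
	return []string{oldMasterCell}
}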
Example #26
func (wr *Wrangler) shardExternallyReparentedLocked(keyspace, shard string, masterElectTabletAlias topo.TabletAlias) (err error) {
	// read the shard and make sure the master-elect is not already the master.
	shardInfo, err := wr.ts.GetShard(keyspace, shard)
	if err != nil {
		return err
	}
	if shardInfo.MasterAlias == masterElectTabletAlias {
		return fmt.Errorf("master-elect tablet %v is already master", masterElectTabletAlias)
	}

	// Read the tablets, make sure the master elect is known to us.
	// Note we will keep going with a partial tablet map, which usually
	// happens when a cell is not reachable. After these checks, the
	// guarantees we'll have are:
	// - global cell is reachable (we just locked and read the shard)
	// - the local cell that contains the new master is reachable
	//   (as we're going to check the new master is in the list)
	// That should be enough.
	tabletMap, err := topo.GetTabletMapForShard(wr.ts, keyspace, shard)
	switch err {
	case nil:
		// keep going
	case topo.ErrPartialResult:
		wr.logger.Warningf("Got topo.ErrPartialResult from GetTabletMapForShard, may need to re-init some tablets")
	default:
		return err
	}
	masterElectTablet, ok := tabletMap[masterElectTabletAlias]
	if !ok {
		return fmt.Errorf("master-elect tablet %v not found in replication graph %v/%v %v", masterElectTabletAlias, keyspace, shard, topotools.MapKeys(tabletMap))
	}

	// Create reusable Reparent event with available info
	ev := &events.Reparent{
		ShardInfo: *shardInfo,
		NewMaster: *masterElectTablet.Tablet,
	}

	if oldMasterTablet, ok := tabletMap[shardInfo.MasterAlias]; ok {
		ev.OldMaster = *oldMasterTablet.Tablet
	}

	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// sort the tablets, and handle them
	slaveTabletMap, masterTabletMap := topotools.SortedTabletMap(tabletMap)
	err = wr.reparentShardExternal(ev, slaveTabletMap, masterTabletMap, masterElectTablet)
	if err != nil {
		wr.logger.Infof("Skipping shard rebuild with failed reparent")
		return err
	}

	// Compute the list of cells we need to rebuild: just the old
	// master's cell, or all cells (nil) if we reparented across cells.
	cells := []string{shardInfo.MasterAlias.Cell}
	if shardInfo.MasterAlias.Cell != masterElectTabletAlias.Cell {
		cells = nil
	}

	// now update the master record in the shard object
	event.DispatchUpdate(ev, "updating shard record")
	wr.logger.Infof("Updating Shard's MasterAlias record")
	shardInfo.MasterAlias = masterElectTabletAlias
	if err = topo.UpdateShard(wr.ts, shardInfo); err != nil {
		return err
	}

	// and rebuild the shard serving graph
	event.DispatchUpdate(ev, "rebuilding shard serving graph")
	wr.logger.Infof("Rebuilding shard serving graph data")
	if _, err = topotools.RebuildShard(wr.logger, wr.ts, masterElectTablet.Keyspace, masterElectTablet.Shard, cells, wr.lockTimeout, interrupted); err != nil {
		return err
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
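Both reparent variants lean on the same idiom: a named error return plus a deferred closure, so every failure path dispatches a single "failed" event without sprinkling DispatchUpdate calls through the body. A minimal sketch of the idiom, with hypothetical notify and step callbacks:

// reparentWithEvents shows the named-return + defer idiom: the
// deferred closure observes the final value of err, whichever return
// statement set it. notify stands in for event.DispatchUpdate.
func reparentWithEvents(notify func(status string), step func() error) (err error) {
	defer func() {
		if err != nil {
			notify("failed: " + err.Error())
		}
	}()

	notify("updating shard record")
	if err = step(); err != nil {
		return err // the deferred closure sees this err
	}

	notify("finished")
	return nil
}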
Example #27
// Scrap scraps a tablet. If force is used, we write to topo.Server
// directly and don't remote-execute the command.
//
// If we scrap the master for a shard, we also clear its record
// from the Shard object (but only if it was the recorded master).
func (wr *Wrangler) Scrap(ctx context.Context, tabletAlias topo.TabletAlias, force, skipRebuild bool) error {
	// load the tablet, see if we'll need to rebuild
	ti, err := wr.ts.GetTablet(ctx, tabletAlias)
	if err != nil {
		return err
	}
	rebuildRequired := ti.IsInServingGraph()
	wasMaster := ti.Type == topo.TYPE_MASTER

	if force {
		err = topotools.Scrap(ctx, wr.ts, ti.Alias, force)
	} else {
		err = wr.tmc.Scrap(ctx, ti)
	}
	if err != nil {
		return err
	}

	if !rebuildRequired {
		wr.Logger().Infof("Rebuild not required")
		return nil
	}
	if skipRebuild {
		wr.Logger().Warningf("Rebuild required, but skipping it")
		return nil
	}

	// update the Shard object if the master was scrapped
	if wasMaster {
		actionNode := actionnode.UpdateShard()
		lockPath, err := wr.lockShard(ctx, ti.Keyspace, ti.Shard, actionNode)
		if err != nil {
			return err
		}

		// read the shard with the lock
		si, err := wr.ts.GetShard(ctx, ti.Keyspace, ti.Shard)
		if err != nil {
			return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err)
		}

		// update it if the right alias is there
		if topo.TabletAliasEqual(si.MasterAlias, topo.TabletAliasToProto(tabletAlias)) {
			si.MasterAlias = nil

			// write it back
			if err := topo.UpdateShard(ctx, wr.ts, si); err != nil {
				return wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err)
			}
		} else {
			wr.Logger().Warningf("Scrapping master %v from shard %v/%v but master in Shard object was %v", tabletAlias, ti.Keyspace, ti.Shard, si.MasterAlias)
		}

		// and unlock
		if err := wr.unlockShard(ctx, ti.Keyspace, ti.Shard, actionNode, lockPath, err); err != nil {
			return err
		}
	}

	// and rebuild the original shard
	_, err = wr.RebuildShardGraph(ctx, ti.Keyspace, ti.Shard, []string{ti.Alias.Cell})
	return err
}
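The master-scrap branch follows the usual shard-lock dance: lock, re-read under the lock, write, then unlock, threading any mid-flight error through the unlock call so the lock record reflects how the action ended. A minimal sketch of that shape, assuming hypothetical lock/unlock/apply callbacks rather than the real actionnode API:

// withShardLock sketches the lock / read-under-lock / write / unlock
// sequence used in Scrap above. All three callbacks are stand-ins.
func withShardLock(
	lock func() (lockPath string, err error),
	unlock func(lockPath string, result error) error,
	apply func() error,
) error {
	lockPath, err := lock()
	if err != nil {
		return err
	}
	// Do the real work while holding the lock, then hand its error
	// to unlock so the lock path records why it was released.
	workErr := apply()
	return unlock(lockPath, workErr)
}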
Example #28
func (wr *Wrangler) removeShardCell(ctx context.Context, keyspace, shard, cell string, force, recursive bool) error {
	shardInfo, err := wr.ts.GetShard(ctx, keyspace, shard)
	if err != nil {
		return err
	}

	// check the cell is in the list already
	if !topo.InCellList(cell, shardInfo.Cells) {
		return fmt.Errorf("cell %v is not in the shard info", cell)
	}

	// check the master alias is not in the cell
	if shardInfo.MasterAlias.Cell == cell {
		return fmt.Errorf("master %v is in the cell '%v' we want to remove", shardInfo.MasterAlias, cell)
	}

	// get the ShardReplication object in the cell
	sri, err := wr.ts.GetShardReplication(ctx, cell, keyspace, shard)
	switch err {
	case nil:
		if recursive {
			wr.Logger().Infof("Deleting all tablets in shard %v/%v", keyspace, shard)
			for _, node := range sri.Nodes {
				// We don't care about scrapping or updating the replication graph,
				// because we're about to delete the entire replication graph.
				wr.Logger().Infof("Deleting tablet %v", node.TabletAlias)
				if err := wr.TopoServer().DeleteTablet(ctx, topo.ProtoToTabletAlias(node.TabletAlias)); err != nil && err != topo.ErrNoNode {
					return fmt.Errorf("can't delete tablet %v: %v", node.TabletAlias, err)
				}
			}
		} else if len(sri.Nodes) > 0 {
			return fmt.Errorf("cell %v has %v possible tablets in replication graph", cell, len(sri.Nodes))
		}

		// ShardReplication object is now useless, remove it
		if err := wr.ts.DeleteShardReplication(ctx, cell, keyspace, shard); err != nil && err != topo.ErrNoNode {
			return fmt.Errorf("error deleting ShardReplication object in cell %v: %v", cell, err)
		}

		// Rebuild the shard serving graph to reflect the tablets we deleted.
		// This must be done before removing the cell from the global shard record,
		// since this cell will be skipped by all future rebuilds.
		if _, err := wr.RebuildShardGraph(ctx, keyspace, shard, []string{cell}); err != nil {
			return fmt.Errorf("can't rebuild serving graph for shard %v/%v in cell %v: %v", keyspace, shard, cell, err)
		}

		// we keep going
	case topo.ErrNoNode:
		// no ShardReplication object, we keep going
	default:
		// we can't get the object, assume topo server is down there,
		// so we look at force flag
		if !force {
			return err
		}
		wr.Logger().Warningf("Cannot get ShardReplication from cell %v, assuming cell topo server is down, and forcing the removal", cell)
	}

	// now we can update the shard
	wr.Logger().Infof("Removing cell %v from shard %v/%v", cell, keyspace, shard)
	newCells := make([]string, 0, len(shardInfo.Cells)-1)
	for _, c := range shardInfo.Cells {
		if c != cell {
			newCells = append(newCells, c)
		}
	}
	shardInfo.Cells = newCells

	return topo.UpdateShard(ctx, wr.ts, shardInfo)
}
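The switch on the GetShardReplication error is the interesting part: nil means clean up and keep going, topo.ErrNoNode means there is nothing to clean, and any other error is tolerated only under force. A standalone sketch of that triage (errNoNode is a hypothetical stand-in for topo.ErrNoNode):

// classifyTopoErr sketches the three-way error triage used above when
// reading the cell-local ShardReplication object.
func classifyTopoErr(err, errNoNode error, force bool) (cleanup bool, fatal error) {
	switch err {
	case nil:
		return true, nil // object exists: delete tablets and the object
	case errNoNode:
		return false, nil // nothing in this cell: keep going
	default:
		if !force {
			return false, err // cell topo unreachable and no force flag
		}
		return false, nil // forced removal despite unreachable cell
	}
}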
Example #29
// InitTablet initializes the tablet record if necessary.
func (agent *ActionAgent) InitTablet(port, securePort, gRPCPort int) error {
	// This is only enabled if init_tablet_type (when healthcheck
	// is disabled) or init_keyspace (when healthcheck is enabled)
	// is passed in; the remaining parameters are checked below.
	if *initTabletType == "" && *initKeyspace == "" {
		return nil
	}

	// figure out our default target type
	var tabletType topo.TabletType
	if *initTabletType != "" {
		if *targetTabletType != "" {
			log.Fatalf("cannot specify both target_tablet_type and init_tablet_type parameters (as they might conflict)")
		}

		// use the type specified on the command line
		tabletType = topo.TabletType(*initTabletType)
		if !topo.IsTypeInList(tabletType, topo.AllTabletTypes) {
			log.Fatalf("InitTablet encountered unknown init_tablet_type '%v'", *initTabletType)
		}
		if tabletType == topo.TYPE_MASTER || tabletType == topo.TYPE_SCRAP {
			// We disallow TYPE_MASTER, so we don't have to change
			// shard.MasterAlias, and deal with the corner cases.
			// We also disallow TYPE_SCRAP, obviously.
			log.Fatalf("init_tablet_type cannot be %v", tabletType)
		}

	} else if *targetTabletType != "" {
		if tabletType := topo.TabletType(*targetTabletType); tabletType == topo.TYPE_MASTER {
			log.Fatalf("target_tablet_type cannot be '%v'. Use '%v' instead.", tabletType, topo.TYPE_REPLICA)
		}

		// use spare, the healthcheck will turn us into what
		// we need to be eventually
		tabletType = topo.TYPE_SPARE

	} else {
		log.Fatalf("if init tablet is enabled, one of init_tablet_type or target_tablet_type needs to be specified")
	}

	// create a context for this whole operation
	ctx, cancel := context.WithTimeout(agent.batchCtx, *initTimeout)
	defer cancel()

	// if we're assigned to a shard, make sure it exists, see if
	// we are its master, and update its cells list if necessary
	if tabletType != topo.TYPE_IDLE {
		if *initKeyspace == "" || *initShard == "" {
			log.Fatalf("if init tablet is enabled and the target type is not idle, init_keyspace and init_shard also need to be specified")
		}
		shard, _, err := topo.ValidateShardName(*initShard)
		if err != nil {
			log.Fatalf("cannot validate shard name: %v", err)
		}

		log.Infof("Reading shard record %v/%v", *initKeyspace, shard)

		// read the shard, create it if necessary
		si, err := topotools.GetOrCreateShard(ctx, agent.TopoServer, *initKeyspace, shard)
		if err != nil {
			return fmt.Errorf("InitTablet cannot GetOrCreateShard: %v", err)
		}
		if si.MasterAlias == agent.TabletAlias {
			// we are the current master for this shard (probably
			// means the master tablet process was just restarted),
			// so InitTablet as master.
			tabletType = topo.TYPE_MASTER
		}

		// See if we need to add the tablet's cell to the shard's cell
		// list.  If we do, it has to be under the shard lock.
		if !si.HasCell(agent.TabletAlias.Cell) {
			actionNode := actionnode.UpdateShard()
			lockPath, err := actionNode.LockShard(ctx, agent.TopoServer, *initKeyspace, shard)
			if err != nil {
				return fmt.Errorf("LockShard(%v/%v) failed: %v", *initKeyspace, shard, err)
			}

			// re-read the shard with the lock
			si, err = agent.TopoServer.GetShard(ctx, *initKeyspace, shard)
			if err != nil {
				return actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, err)
			}

			// see if we really need to update it now
			if !si.HasCell(agent.TabletAlias.Cell) {
				si.Cells = append(si.Cells, agent.TabletAlias.Cell)

				// write it back
				if err := topo.UpdateShard(ctx, agent.TopoServer, si); err != nil {
					return actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, err)
				}
			}

			// and unlock
			if err := actionNode.UnlockShard(ctx, agent.TopoServer, *initKeyspace, shard, lockPath, nil); err != nil {
				return err
			}
		}
	}
	log.Infof("Initializing the tablet for type %v", tabletType)

	// figure out the hostname
	hostname := *tabletHostname
	if hostname == "" {
		var err error
		hostname, err = netutil.FullyQualifiedHostname()
		if err != nil {
			return err
		}
	}

	// create and populate tablet record
	tablet := &topo.Tablet{
		Alias:          agent.TabletAlias,
		Hostname:       hostname,
		Portmap:        make(map[string]int),
		Keyspace:       *initKeyspace,
		Shard:          *initShard,
		Type:           tabletType,
		DbNameOverride: *initDbNameOverride,
		Tags:           initTags,
	}
	if port != 0 {
		tablet.Portmap["vt"] = port
	}
	if securePort != 0 {
		tablet.Portmap["vts"] = securePort
	}
	if gRPCPort != 0 {
		tablet.Portmap["grpc"] = gRPCPort
	}
	if err := tablet.Complete(); err != nil {
		return fmt.Errorf("InitTablet tablet.Complete failed: %v", err)
	}

	// now try to create the record
	err := topo.CreateTablet(ctx, agent.TopoServer, tablet)
	switch err {
	case nil:
		// it worked, we're good, can update the replication graph
		if tablet.IsInReplicationGraph() {
			if err := topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet); err != nil {
				return fmt.Errorf("UpdateTabletReplicationData failed: %v", err)
			}
		}

	case topo.ErrNodeExists:
		// The node already exists; we'll update it instead,
		// so we read the existing record first.
		oldTablet, err := agent.TopoServer.GetTablet(ctx, tablet.Alias)
		if err != nil {
			return fmt.Errorf("InitTablet failed to read existing tablet record: %v", err)
		}

		// Sanity check the keyspace and shard
		if oldTablet.Keyspace != tablet.Keyspace || oldTablet.Shard != tablet.Shard {
			return fmt.Errorf("InitTablet failed because existing tablet keyspace and shard %v/%v differ from the provided ones %v/%v", oldTablet.Keyspace, oldTablet.Shard, tablet.Keyspace, tablet.Shard)
		}

		// And overwrite the rest
		*(oldTablet.Tablet) = *tablet
		if err := topo.UpdateTablet(ctx, agent.TopoServer, oldTablet); err != nil {
			return fmt.Errorf("UpdateTablet failed: %v", err)
		}

		// Note we don't need to UpdateTabletReplicationData
		// as the tablet already existed with the right data
		// in the replication graph
	default:
		return fmt.Errorf("CreateTablet failed: %v", err)
	}

	// and now update the serving graph. Note we do that in any case,
	// to clean any inaccurate record from any part of the serving graph.
	if tabletType != topo.TYPE_IDLE {
		if err := topotools.UpdateTabletEndpoints(ctx, agent.TopoServer, tablet); err != nil {
			return fmt.Errorf("UpdateTabletEndpoints failed: %v", err)
		}
	}

	return nil
}
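The record creation at the end of InitTablet is an upsert: try to create, and on topo.ErrNodeExists fall back to read, validate, overwrite. A compact sketch of that pattern, assuming hypothetical create/get/update callbacks (fmt is the only import it needs):

// upsertRecord sketches InitTablet's create-or-update flow; the three
// callbacks stand in for topo.CreateTablet, GetTablet, and UpdateTablet.
func upsertRecord(
	create func() error,
	get func() (existing interface{}, err error),
	update func(existing interface{}) error,
	errNodeExists error,
) error {
	switch err := create(); err {
	case nil:
		return nil // fresh record created
	case errNodeExists:
		// Record already there: read it, let update validate and overwrite.
		existing, err := get()
		if err != nil {
			return fmt.Errorf("failed to read existing record: %v", err)
		}
		return update(existing)
	default:
		return fmt.Errorf("create failed: %v", err)
	}
}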
Example #30
func TestUpdateTabletEndpoints(t *testing.T) {
	ctx := context.Background()
	cell := "test_cell"

	// Set up topology.
	ts := zktopo.NewTestServer(t, []string{cell})
	si, err := GetOrCreateShard(ctx, ts, testKeyspace, testShard)
	if err != nil {
		t.Fatalf("GetOrCreateShard: %v", err)
	}
	si.Cells = append(si.Cells, cell)
	if err := topo.UpdateShard(ctx, ts, si); err != nil {
		t.Fatalf("UpdateShard: %v", err)
	}

	tablet1 := addTablet(ctx, t, ts, 1, cell, topo.TYPE_MASTER).Tablet
	tablet2 := addTablet(ctx, t, ts, 2, cell, topo.TYPE_REPLICA).Tablet

	update := func(tablet *topo.Tablet) {
		if err := UpdateTabletEndpoints(ctx, ts, tablet); err != nil {
			t.Fatalf("UpdateTabletEndpoints(%v): %v", tablet, err)
		}
	}
	expect := func(tabletType topo.TabletType, want int) {
		eps, _, err := ts.GetEndPoints(ctx, cell, testKeyspace, testShard, tabletType)
		if err != nil && err != topo.ErrNoNode {
			t.Errorf("GetEndPoints(%v): %v", tabletType, err)
			return
		}
		var got int
		if err == nil {
			got = len(eps.Entries)
			if got == 0 {
				t.Errorf("len(EndPoints) = 0, expected ErrNoNode instead")
			}
		}
		if got != want {
			t.Errorf("len(GetEndPoints(%v)) = %v, want %v. EndPoints = %v", tabletType, got, want, eps)
		}
	}

	// Update tablets. This should create the serving graph dirs too.
	update(tablet1)
	expect(topo.TYPE_MASTER, 1)
	update(tablet2)
	expect(topo.TYPE_REPLICA, 1)

	// Re-update an identical tablet.
	update(tablet1)
	expect(topo.TYPE_MASTER, 1)

	// Change a tablet, but keep it the same type.
	tablet2.Hostname += "extra"
	update(tablet2)
	expect(topo.TYPE_REPLICA, 1)

	// Move the master to replica.
	tablet1.Type = topo.TYPE_REPLICA
	update(tablet1)
	expect(topo.TYPE_MASTER, 0)
	expect(topo.TYPE_REPLICA, 2)

	// Take a replica out of serving.
	tablet1.Type = topo.TYPE_SPARE
	update(tablet1)
	expect(topo.TYPE_MASTER, 0)
	expect(topo.TYPE_REPLICA, 1)

	// Put it back to serving.
	tablet1.Type = topo.TYPE_REPLICA
	update(tablet1)
	expect(topo.TYPE_MASTER, 0)
	expect(topo.TYPE_REPLICA, 2)

	// Move a replica to master.
	tablet2.Type = topo.TYPE_MASTER
	update(tablet2)
	expect(topo.TYPE_MASTER, 1)
	expect(topo.TYPE_REPLICA, 1)
}
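The update and expect closures keep each step of the scenario to a single line while capturing ctx, ts, and t from the enclosing test. The same trick in miniature, against a hypothetical counter instead of a topology server:

// TestClosureHelpers shows the closure-helper pattern used above:
// small named helpers capture the test's fixtures so the scenario
// reads as a sequence of one-liners.
func TestClosureHelpers(t *testing.T) {
	counter := 0
	bump := func() { counter++ }
	expect := func(want int) {
		t.Helper() // report failures at the caller's line
		if counter != want {
			t.Errorf("counter = %v, want %v", counter, want)
		}
	}

	bump()
	expect(1)
	bump()
	bump()
	expect(3)
}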