Exemple #1
0
// validateReplication compares the slaves reported by the shard's master
// with the tablets in tabletMap and sends one vresult on results for every
// problem found. It never closes results.
//
// Two checks are made once the slave list has been fetched:
//   - every slave address reported by the master must match the host part
//     of some tablet's MysqlIpAddr;
//   - every slave-type tablet in tabletMap must appear in the slave list.
//
// Fetching the slave list is fatal for the validation: if the master is not
// in tabletMap, the action fails, the reply has an unexpected type, or the
// list is empty, a single result is sent and the function returns.
func (wr *Wrangler) validateReplication(shardInfo *topo.ShardInfo, tabletMap map[topo.TabletAlias]*topo.TabletInfo, results chan<- vresult) {
	if _, ok := tabletMap[shardInfo.MasterAlias]; !ok {
		results <- vresult{shardInfo.MasterAlias.String(), fmt.Errorf("master not in tablet map")}
		return
	}

	actionPath, err := wr.ai.GetSlaves(shardInfo.MasterAlias)
	if err != nil {
		results <- vresult{shardInfo.MasterAlias.String(), err}
		return
	}
	sa, err := wr.ai.WaitForCompletionReply(actionPath, wr.actionTimeout())
	if err != nil {
		results <- vresult{shardInfo.MasterAlias.String(), err}
		return
	}

	// Use a checked assertion: an unexpected (or nil) reply is reported as a
	// validation error instead of panicking the whole validation run.
	slaveList, ok := sa.(*tm.SlaveList)
	if !ok || slaveList == nil {
		results <- vresult{shardInfo.MasterAlias.String(), fmt.Errorf("unexpected GetSlaves reply of type %T", sa)}
		return
	}
	slaveAddrs := slaveList.Addrs
	if len(slaveAddrs) == 0 {
		results <- vresult{shardInfo.MasterAlias.String(), fmt.Errorf("no slaves found")}
		return
	}

	// Some addresses don't resolve in all locations, so tablets are matched
	// on the IP part of their mysql address only.
	tabletIPMap := make(map[string]*topo.Tablet)
	for _, tablet := range tabletMap {
		ipAddr, _, err := net.SplitHostPort(tablet.MysqlIpAddr)
		if err != nil {
			results <- vresult{tablet.Alias().String(), fmt.Errorf("bad mysql addr: %v %v", tablet.MysqlIpAddr, err)}
			continue
		}
		tabletIPMap[ipAddr] = tablet.Tablet
	}

	// See if every slave is in the replication graph.
	for _, slaveAddr := range slaveAddrs {
		if tabletIPMap[slaveAddr] == nil {
			results <- vresult{shardInfo.Keyspace() + "/" + shardInfo.ShardName(), fmt.Errorf("slave not in replication graph: %v (mysql instance without vttablet?)", slaveAddr)}
		}
	}

	// See if every entry in the replication graph is connected to the master.
	for _, tablet := range tabletMap {
		if !tablet.IsSlaveType() {
			continue
		}

		ipAddr, _, err := net.SplitHostPort(tablet.MysqlIpAddr)
		if err != nil {
			results <- vresult{tablet.Alias().String(), fmt.Errorf("bad mysql addr: %v", err)}
		} else if !strInList(slaveAddrs, ipAddr) {
			results <- vresult{tablet.Alias().String(), fmt.Errorf("slave not replicating: %v %q", ipAddr, slaveAddrs)}
		}
	}
}
Exemple #2
0
// RefreshTablesByShard calls RefreshState, in parallel, on every tablet of
// the given type found for the shard in the given cells, and waits for all
// calls to return. It would work for the master, but the discovery wouldn't
// be very efficient.
//
// Individual RefreshState failures are only logged. The sole error returned
// is a failure to read the tablet map other than topo.ErrPartialResult; a
// partial result is logged and the tablets that were found are refreshed.
func (wr *Wrangler) RefreshTablesByShard(si *topo.ShardInfo, tabletType topo.TabletType, cells []string) error {
	tabletMap, err := topo.GetTabletMapForShardByCell(wr.ts, si.Keyspace(), si.ShardName(), cells)
	if err != nil {
		if err != topo.ErrPartialResult {
			return err
		}
		wr.Logger().Warningf("RefreshTablesByShard: got partial result for shard %v/%v, may not refresh all tablets everywhere", si.Keyspace(), si.ShardName())
	}

	// Errors in this phase are logged and otherwise ignored.
	var pending sync.WaitGroup
	for _, candidate := range tabletMap {
		if candidate.Type == tabletType {
			pending.Add(1)
			go func(target *topo.TabletInfo) {
				defer pending.Done()
				if refreshErr := wr.tmc.RefreshState(target, wr.ActionTimeout()); refreshErr != nil {
					wr.Logger().Warningf("RefreshTablesByShard: failed to refresh %v: %v", target.Alias, refreshErr)
				}
			}(candidate)
		}
	}
	pending.Wait()

	return nil
}
Exemple #3
0
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration.
//
// It runs three steps, dispatching a progress update on ev before each one,
// and stops at the first error:
//  1. write the keyspace record ki;
//  2. set (or, when reverse is true, remove) the blacklisted tables entry
//     for servedType on the source shard and write the shard;
//  3. refresh the source tablets so they reload their blacklisted tables.
//
// destinationShard is accepted but not read by this function.
func (wr *Wrangler) replicaMigrateServedFrom(ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType topo.TabletType, reverse bool, tables []string, ev *events.MigrateServedFrom) error {
	// Step 1: the destination keyspace (its ServedFrom has been changed).
	event.DispatchUpdate(ev, "updating keyspace")
	if saveErr := topo.UpdateKeyspace(wr.ts, ki); saveErr != nil {
		return saveErr
	}

	// Step 2: the source shard's blacklisted tables.
	event.DispatchUpdate(ev, "updating source shard")
	if sourceShard.BlacklistedTablesMap == nil {
		sourceShard.BlacklistedTablesMap = make(map[topo.TabletType][]string)
	}
	switch {
	case reverse:
		delete(sourceShard.BlacklistedTablesMap, servedType)
	default:
		sourceShard.BlacklistedTablesMap[servedType] = tables
	}
	if saveErr := topo.UpdateShard(wr.ts, sourceShard); saveErr != nil {
		return saveErr
	}

	// Step 3: have the source servers reload their blacklisted table list.
	event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables")
	return wr.RefreshTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType)
}
Exemple #4
0
// InitTablet creates or updates a tablet record.
//
// The tablet must pass topo.TabletComplete. Its shard is then looked up:
// if createShardAndKeyspace is true the parent keyspace and shard are
// created when missing, otherwise a missing shard is an error. The shard's
// KeyRange must equal the tablet's.
//
// If the tablet is a master and the shard already has a different master,
// allowMasterOverride must be set. The shard record is updated (cells and
// master) as needed before the tablet is written.
//
// If a tablet with the same alias already exists and allowUpdate is true,
// the existing record is overwritten, provided it belongs to the same
// keyspace and shard. Any other CreateTablet failure, including
// topo.ErrNodeExists when allowUpdate is false, is returned to the caller
// unchanged.
func (wr *Wrangler) InitTablet(ctx context.Context, tablet *topodatapb.Tablet, allowMasterOverride, createShardAndKeyspace, allowUpdate bool) error {
	if err := topo.TabletComplete(tablet); err != nil {
		return err
	}

	// get the shard, possibly creating it
	var err error
	var si *topo.ShardInfo

	if createShardAndKeyspace {
		// create the parent keyspace and shard if needed
		si, err = wr.ts.GetOrCreateShard(ctx, tablet.Keyspace, tablet.Shard)
	} else {
		si, err = wr.ts.GetShard(ctx, tablet.Keyspace, tablet.Shard)
		if err == topo.ErrNoNode {
			return fmt.Errorf("missing parent shard, use -parent option to create it, or CreateKeyspace / CreateShard")
		}
	}

	// we have the shard (or an error), check a couple things
	if err != nil {
		return fmt.Errorf("cannot get (or create) shard %v/%v: %v", tablet.Keyspace, tablet.Shard, err)
	}
	if !key.KeyRangeEqual(si.KeyRange, tablet.KeyRange) {
		return fmt.Errorf("shard %v/%v has a different KeyRange: %v != %v", tablet.Keyspace, tablet.Shard, si.KeyRange, tablet.KeyRange)
	}
	if tablet.Type == topodatapb.TabletType_MASTER && si.HasMaster() && !topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) && !allowMasterOverride {
		return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v, use allow_master_override flag", topoproto.TabletAliasString(si.MasterAlias), tablet.Keyspace, tablet.Shard)
	}

	// update the shard record if needed
	if err := wr.updateShardCellsAndMaster(ctx, si, tablet.Alias, tablet.Type, allowMasterOverride); err != nil {
		return err
	}

	err = wr.ts.CreateTablet(ctx, tablet)
	if err == topo.ErrNodeExists && allowUpdate {
		// Try to update then
		oldTablet, err := wr.ts.GetTablet(ctx, tablet.Alias)
		if err != nil {
			return fmt.Errorf("failed reading existing tablet %v: %v", topoproto.TabletAliasString(tablet.Alias), err)
		}

		// An existing tablet can only be updated in place: moving it to
		// another keyspace / shard requires deleting and re-adding it.
		if oldTablet.Keyspace != tablet.Keyspace || oldTablet.Shard != tablet.Shard {
			return fmt.Errorf("old tablet has shard %v/%v. Cannot override with shard %v/%v. Delete and re-add tablet if you want to change the tablet's keyspace/shard", oldTablet.Keyspace, oldTablet.Shard, tablet.Keyspace, tablet.Shard)
		}

		*(oldTablet.Tablet) = *tablet
		if err := wr.ts.UpdateTablet(ctx, oldTablet); err != nil {
			return fmt.Errorf("failed updating tablet %v: %v", topoproto.TabletAliasString(tablet.Alias), err)
		}
		return nil
	}

	// Either the tablet was created (err is nil) or creation failed for a
	// reason we cannot recover from here; returned unwrapped so callers can
	// still compare against the topo sentinel errors.
	return err
}
Exemple #5
0
// UpdateShard writes the JSON form of si.Shard to the shard's node under
// globalKeyspacesPath, passing -1 as the version argument to Set. A ZNONODE
// failure is reported as topo.ErrNoNode; any other error is returned as is.
func (zkts *Server) UpdateShard(si *topo.ShardInfo) error {
	nodePath := path.Join(globalKeyspacesPath, si.Keyspace(), "shards", si.ShardName())
	_, setErr := zkts.zconn.Set(nodePath, jscfg.ToJson(si.Shard), -1)
	if setErr != nil && zookeeper.IsError(setErr, zookeeper.ZNONODE) {
		return topo.ErrNoNode
	}
	return setErr
}
Exemple #6
0
// shardReplicationPositions loads the tablet map of the shard and returns
// its tablets together with what tabletReplicationPositions reports for
// them. If the tablet map cannot be read, both slices are nil. An error from
// tabletReplicationPositions is returned alongside the tablets and whatever
// positions slice that call produced.
func (wr *Wrangler) shardReplicationPositions(shardInfo *topo.ShardInfo) ([]*topo.TabletInfo, []*mysqlctl.ReplicationPosition, error) {
	// FIXME(msolomon) this assumes no hierarchical replication, which is currently the case.
	byAlias, mapErr := GetTabletMapForShard(wr.ts, shardInfo.Keyspace(), shardInfo.ShardName())
	if mapErr != nil {
		return nil, nil, mapErr
	}

	// Flatten the map into a slice of its values.
	flattened := CopyMapValues(byAlias, []*topo.TabletInfo{})
	tabletList := flattened.([]*topo.TabletInfo)

	positionList, posErr := wr.tabletReplicationPositions(tabletList)
	return tabletList, positionList, posErr
}
Exemple #7
0
// shardReplicationStatuses loads the tablet map of the shard and returns its
// tablets together with what tabletReplicationStatuses reports for them. If
// the tablet map cannot be read, both slices are nil. An error from
// tabletReplicationStatuses is returned alongside the tablets and whatever
// statuses slice that call produced.
func (wr *Wrangler) shardReplicationStatuses(ctx context.Context, shardInfo *topo.ShardInfo) ([]*topo.TabletInfo, []*myproto.ReplicationStatus, error) {
	// FIXME(msolomon) this assumes no hierarchical replication, which is currently the case.
	byAlias, mapErr := wr.ts.GetTabletMapForShard(ctx, shardInfo.Keyspace(), shardInfo.ShardName())
	if mapErr != nil {
		return nil, nil, mapErr
	}

	// Flatten the map into a slice of its values.
	flattened := topotools.CopyMapValues(byAlias, []*topo.TabletInfo{})
	tabletList := flattened.([]*topo.TabletInfo)

	statusList, statusErr := wr.tabletReplicationStatuses(ctx, tabletList)
	return tabletList, statusList, statusErr
}
Exemple #8
0
// updateShardCellsAndMaster adds the tablet's cell to the shard's Cells and,
// for a master tablet, records it as MasterAlias, when either is needed.
//
// The passed-in si is only used for a first, unlocked check; if nothing has
// to change the function returns nil without taking the lock. Otherwise the
// shard is locked, re-read, modified and written back, and the result of
// that work is handed to unlockShard, whose return value is returned.
// Replacing an existing, different master requires force.
func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias topo.TabletAlias, tabletType topo.TabletType, force bool) error {
	wantsMaster := tabletType == topo.TYPE_MASTER

	// Cheap pre-check on the caller's copy of the shard.
	if si.HasCell(tabletAlias.Cell) && !(wantsMaster && si.MasterAlias != tabletAlias) {
		return nil
	}

	actionNode := actionnode.UpdateShard()
	keyspace, shard := si.Keyspace(), si.ShardName()
	lockPath, err := wr.lockShard(ctx, keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	// Everything below runs with the shard lock held; its outcome is passed
	// to unlockShard.
	applyUpdate := func() error {
		// re-read the shard with the lock
		locked, err := wr.ts.GetShard(ctx, keyspace, shard)
		if err != nil {
			return err
		}

		dirty := false
		if !locked.HasCell(tabletAlias.Cell) {
			locked.Cells = append(locked.Cells, tabletAlias.Cell)
			dirty = true
		}
		if wantsMaster && locked.MasterAlias != tabletAlias {
			if !locked.MasterAlias.IsZero() && !force {
				return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", locked.MasterAlias, keyspace, shard)
			}
			locked.MasterAlias = tabletAlias
			dirty = true
		}

		if !dirty {
			return nil
		}
		// write it back
		return topo.UpdateShard(ctx, wr.ts, locked)
	}

	return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, applyUpdate())
}
Exemple #9
0
// UpdateShard updates the shard on the primary and then on the secondary.
// A primary failure is returned and the secondary is left untouched; a
// secondary failure is only logged and the call still succeeds.
func (tee *Tee) UpdateShard(si *topo.ShardInfo) error {
	primaryErr := tee.primary.UpdateShard(si)
	if primaryErr != nil {
		// failed on primary, not updating secondary
		return primaryErr
	}

	secondaryErr := tee.secondary.UpdateShard(si)
	if secondaryErr == nil {
		return nil
	}

	// not critical enough to fail
	log.Warningf("secondary.UpdateShard(%v,%v) failed: %v", si.Keyspace(), si.ShardName(), secondaryErr)
	return nil
}
Exemple #10
0
// replicaMigrateServedFrom handles the slave (replica, rdonly) migration.
//
// It runs three steps, dispatching a progress update on ev before each one,
// and stops at the first error:
//  1. write the keyspace record ki;
//  2. apply UpdateSourceBlacklistedTables to the source shard and write it;
//  3. refresh the source tablets of servedType in cells so they reload
//     their blacklisted tables.
//
// destinationShard is accepted but not read by this function.
func (wr *Wrangler) replicaMigrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, sourceShard *topo.ShardInfo, destinationShard *topo.ShardInfo, servedType topodatapb.TabletType, cells []string, reverse bool, tables []string, ev *events.MigrateServedFrom) error {
	// Step 1: the destination keyspace (its ServedFrom has been changed).
	event.DispatchUpdate(ev, "updating keyspace")
	if saveErr := wr.ts.UpdateKeyspace(ctx, ki); saveErr != nil {
		return saveErr
	}

	// Step 2: the source shard (its blacklisted tables field changes).
	event.DispatchUpdate(ev, "updating source shard")
	if blacklistErr := sourceShard.UpdateSourceBlacklistedTables(servedType, cells, reverse, tables); blacklistErr != nil {
		return fmt.Errorf("UpdateSourceBlacklistedTables(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), blacklistErr)
	}
	if saveErr := wr.ts.UpdateShard(ctx, sourceShard); saveErr != nil {
		return fmt.Errorf("UpdateShard(%v/%v) failed: %v", sourceShard.Keyspace(), sourceShard.ShardName(), saveErr)
	}

	// Step 3: have the source servers reload their blacklisted table list.
	event.DispatchUpdate(ev, "refreshing sources tablets state so they update their blacklisted tables")
	return wr.RefreshTablesByShard(ctx, sourceShard, servedType, cells)
}
Exemple #11
0
// validateReplication compares the slaves reported by the shard's master
// with the tablets in tabletMap and sends one vresult on results for every
// problem found. It never closes results.
//
// If the master is not in tabletMap, GetSlaves fails, or no slave is
// reported, a single result is sent and the function returns. Otherwise two
// checks are made, matching on normalizeIP of the addresses:
//   - every reported slave must belong to a tablet in tabletMap;
//   - every slave-type tablet in tabletMap must be in the reported list.
func (wr *Wrangler) validateReplication(shardInfo *topo.ShardInfo, tabletMap map[topo.TabletAlias]*topo.TabletInfo, results chan<- vresult) {
	masterTablet, ok := tabletMap[shardInfo.MasterAlias]
	if !ok {
		results <- vresult{shardInfo.MasterAlias.String(), fmt.Errorf("master not in tablet map")}
		return
	}

	slaveList, err := wr.ai.GetSlaves(masterTablet, wr.ActionTimeout())
	if err != nil {
		results <- vresult{shardInfo.MasterAlias.String(), err}
		return
	}
	if len(slaveList) == 0 {
		results <- vresult{shardInfo.MasterAlias.String(), fmt.Errorf("no slaves found")}
		return
	}

	// Some addresses don't resolve in all locations, just use IP address.
	tabletIPMap := make(map[string]*topo.Tablet)
	slaveIPMap := make(map[string]bool)
	for _, tablet := range tabletMap {
		tabletIPMap[normalizeIP(tablet.IPAddr)] = tablet.Tablet
	}

	// See if every slave is in the replication graph.
	for _, slaveAddr := range slaveList {
		if tabletIPMap[normalizeIP(slaveAddr)] == nil {
			results <- vresult{shardInfo.Keyspace() + "/" + shardInfo.ShardName(), fmt.Errorf("slave not in replication graph: %v (mysql instance without vttablet?)", slaveAddr)}
		}
		slaveIPMap[normalizeIP(slaveAddr)] = true
	}

	// See if every entry in the replication graph is connected to the master.
	for _, tablet := range tabletMap {
		if !tablet.IsSlaveType() {
			continue
		}

		if !slaveIPMap[normalizeIP(tablet.IPAddr)] {
			results <- vresult{tablet.Alias.String(), fmt.Errorf("slave not replicating: %v %q", tablet.IPAddr, slaveList)}
		}
	}
}
Exemple #12
0
// UpdateShard writes the JSON form of si.Shard to the shard's node under
// globalKeyspacesPath, passing -1 as the version argument to Set. A ZNONODE
// failure is reported as topo.ErrNoNode; any other error is returned as is.
// On success an events.ShardChange with status "updated" is dispatched.
func (zkts *Server) UpdateShard(si *topo.ShardInfo) error {
	nodePath := path.Join(globalKeyspacesPath, si.Keyspace(), "shards", si.ShardName())
	if _, setErr := zkts.zconn.Set(nodePath, jscfg.ToJson(si.Shard), -1); setErr != nil {
		if zookeeper.IsError(setErr, zookeeper.ZNONODE) {
			return topo.ErrNoNode
		}
		return setErr
	}

	change := events.ShardChange{
		ShardInfo: *si,
		Status:    "updated",
	}
	event.Dispatch(&change)
	return nil
}
Exemple #13
0
// finishReparent completes a reparent once the slaves have been restarted.
//
// When majorityRestart is true and leaveMasterReadOnly is false, the
// master-elect is switched to read-write; a failure to do so is logged (with
// its cause) but does not abort the reparent. In every other case the
// master-elect is left read-only and a warning explains how to change that.
//
// The master-elect is then recorded as the shard's MasterAlias, and the
// shard serving graph is rebuilt. The error returned is either the shard
// update failure or the result of the rebuild.
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			wr.logger.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
		} else {
			wr.logger.Infof("marking master-elect read-write %v", masterElect.Alias)
			if err := wr.tmc.SetReadWrite(masterElect, wr.ActionTimeout()); err != nil {
				// Best effort: keep going, but say why the switch failed.
				wr.logger.Warningf("marking master-elect read-write failed (%v), leaving master-elect read-only, change with: vtctl SetReadWrite %v", err, masterElect.Alias)
			}
		}
	} else {
		wr.logger.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
	}

	// save the new master in the shard info
	si.MasterAlias = masterElect.Alias
	if err := topo.UpdateShard(wr.ts, si); err != nil {
		wr.logger.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	wr.logger.Infof("rebuilding shard serving graph data")
	_, err := topotools.RebuildShard(wr.logger, wr.ts, masterElect.Keyspace, masterElect.Shard, nil, wr.lockTimeout, interrupted)
	return err
}
Exemple #14
0
// finishReparent completes a reparent once the slaves have been restarted.
//
// When majorityRestart is true and leaveMasterReadOnly is false, a
// SetReadWrite action is started on the master-elect and waited for; a
// failure of either step is logged (with its cause) but does not abort the
// reparent. In every other case the master-elect is left read-only and a
// warning explains how to change that.
//
// The master-elect is then recorded as the shard's MasterAlias, and the
// shard serving graph is rebuilt. The error returned is either the shard
// update failure or the result of the rebuild.
func (wr *Wrangler) finishReparent(si *topo.ShardInfo, masterElect *topo.TabletInfo, majorityRestart, leaveMasterReadOnly bool) error {
	// If the majority of slaves restarted, move ahead.
	if majorityRestart {
		if leaveMasterReadOnly {
			log.Warningf("leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
		} else {
			log.Infof("marking master-elect read-write %v", masterElect.Alias)
			actionPath, err := wr.ai.SetReadWrite(masterElect.Alias)
			if err == nil {
				err = wr.ai.WaitForCompletion(actionPath, wr.actionTimeout())
			}
			if err != nil {
				// Best effort: keep going, but say why the switch failed.
				log.Warningf("marking master-elect read-write failed (%v), leaving master-elect read-only, change with: vtctl SetReadWrite %v", err, masterElect.Alias)
			}
		}
	} else {
		log.Warningf("minority reparent, manual fixes are needed, leaving master-elect read-only, change with: vtctl SetReadWrite %v", masterElect.Alias)
	}

	// save the new master in the shard info
	si.MasterAlias = masterElect.Alias
	if err := wr.ts.UpdateShard(si); err != nil {
		log.Errorf("Failed to save new master into shard: %v", err)
		return err
	}

	// We rebuild all the cells, as we may have taken tablets in and
	// out of the graph.
	log.Infof("rebuilding shard serving graph data")
	return topotools.RebuildShard(wr.ts, masterElect.Keyspace, masterElect.Shard, topotools.RebuildShardOptions{IgnorePartialResult: false}, wr.lockTimeout, interrupted)
}
Exemple #15
0
// UpdateShard is part of the topo.Server interface.
//
// It writes the JSON form of si.Shard to the shard's node under
// globalKeyspacesPath, passing existingVersion as the version argument to
// Set. On failure it returns -1 and the error, with ZNONODE reported as
// topo.ErrNoNode. On success it dispatches an events.ShardChange with status
// "updated" and returns the version reported by the Set result.
func (zkts *Server) UpdateShard(ctx context.Context, si *topo.ShardInfo, existingVersion int64) (int64, error) {
	nodePath := path.Join(globalKeyspacesPath, si.Keyspace(), "shards", si.ShardName())
	stat, setErr := zkts.zconn.Set(nodePath, jscfg.ToJSON(si.Shard), int(existingVersion))
	switch {
	case setErr == nil:
		// success, handled below
	case zookeeper.IsError(setErr, zookeeper.ZNONODE):
		return -1, topo.ErrNoNode
	default:
		return -1, setErr
	}

	change := events.ShardChange{
		ShardInfo: *si,
		Status:    "updated",
	}
	event.Dispatch(&change)
	return int64(stat.Version()), nil
}
Exemple #16
0
// validateReplication compares the slaves reported by the shard's master
// with the tablets in tabletMap and sends one error on results for every
// problem found. It never closes results.
//
// If the shard record has no master, the master is not in tabletMap,
// GetSlaves fails, or no slave is reported, a single error is sent and the
// function returns. Otherwise two checks are made, matching on normalizeIP
// of the addresses:
//   - every reported slave must belong to a tablet in tabletMap;
//   - every slave-type tablet in tabletMap must be in the reported list.
func (wr *Wrangler) validateReplication(ctx context.Context, shardInfo *topo.ShardInfo, tabletMap map[topodatapb.TabletAlias]*topo.TabletInfo, results chan<- error) {
	masterAlias := shardInfo.MasterAlias
	if masterAlias == nil {
		results <- fmt.Errorf("no master in shard record %v/%v", shardInfo.Keyspace(), shardInfo.ShardName())
		return
	}

	master, found := tabletMap[*masterAlias]
	if !found {
		results <- fmt.Errorf("master %v not in tablet map", topoproto.TabletAliasString(masterAlias))
		return
	}

	reported, err := wr.tmc.GetSlaves(ctx, master.Tablet)
	if err != nil {
		results <- fmt.Errorf("GetSlaves(%v) failed: %v", master, err)
		return
	}
	if len(reported) == 0 {
		results <- fmt.Errorf("no slaves of tablet %v found", topoproto.TabletAliasString(masterAlias))
		return
	}

	// Index the known tablets by normalized IP.
	knownByIP := make(map[string]*topodatapb.Tablet)
	for _, ti := range tabletMap {
		knownByIP[normalizeIP(ti.Ip)] = ti.Tablet
	}

	// Every reported slave must be a known tablet; remember which
	// addresses are replicating for the check below.
	replicating := make(map[string]bool)
	for _, addr := range reported {
		if knownByIP[normalizeIP(addr)] == nil {
			results <- fmt.Errorf("slave %v not in replication graph for shard %v/%v (mysql instance without vttablet?)", addr, shardInfo.Keyspace(), shardInfo.ShardName())
		}
		replicating[normalizeIP(addr)] = true
	}

	// Every slave-type tablet must be connected to the master.
	for _, ti := range tabletMap {
		if ti.IsSlaveType() && !replicating[normalizeIP(ti.Ip)] {
			results <- fmt.Errorf("slave %v not replicating: %v slave list: %q", topoproto.TabletAliasString(ti.Alias), ti.Ip, reported)
		}
	}
}
Exemple #17
0
// rebuildShard updates the shard serving graph with new master, replicas,
// etc.
//
// It re-reads the shard and its tablets from the TopologyServer to make sure
// it is using the side effects of all actions, then hands the tablets that
// are in the replication graph to rebuildShardSrvGraph.
//
// options.Critical selects GetShardCritical over GetShard for reading the
// shard record. options.Cells is passed to the tablet map lookup and to the
// serving graph rebuild. With options.IgnorePartialResult, a
// topo.ErrPartialResult from the tablet map lookup is logged and the rebuild
// continues with the tablets that were found.
//
// A tablet listed for this shard that belongs to another keyspace/shard
// aborts the rebuild with an error. A tablet that is not in the replication
// graph is left out of the rebuild.
//
// This function should only be used with an action lock on the shard
// - otherwise the consistency of the serving graph data can't be
// guaranteed.
func (wr *Wrangler) rebuildShard(keyspace, shard string, options rebuildShardOptions) error {
	log.Infof("rebuildShard %v/%v", keyspace, shard)

	// read the existing shard info. It has to exist.
	var (
		shardInfo *topo.ShardInfo
		err       error
	)
	if options.Critical {
		shardInfo, err = wr.ts.GetShardCritical(keyspace, shard)
	} else {
		shardInfo, err = wr.ts.GetShard(keyspace, shard)
	}
	if err != nil {
		return err
	}

	tabletMap, err := GetTabletMapForShardByCell(wr.ts, keyspace, shard, options.Cells)
	if err != nil {
		if !options.IgnorePartialResult || err != topo.ErrPartialResult {
			return err
		}
		log.Warningf("rebuildShard: got topo.ErrPartialResult from GetTabletMapForShard, but skipping error as it was expected")
	}

	tablets := make([]*topo.TabletInfo, 0, len(tabletMap))
	for _, ti := range tabletMap {
		if ti.Keyspace != shardInfo.Keyspace() || ti.Shard != shardInfo.ShardName() {
			return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v (maybe remove its replication path in shard %v/%v)", ti.Alias, keyspace, shard, ti.Keyspace, ti.Shard, keyspace, shard)
		}
		if !ti.IsInReplicationGraph() {
			// only valid case is a scrapped master in the
			// catastrophic reparent case
			if ti.Parent.Uid != topo.NO_TABLET {
				log.Warningf("Tablet %v should not be in the replication graph, please investigate (it will be ignored in the rebuild)", ti.Alias)
			}
			// Either way the tablet must not end up in the serving graph.
			continue
		}
		tablets = append(tablets, ti)
	}

	return wr.rebuildShardSrvGraph(shardInfo, tablets, options.Cells)
}
Exemple #18
0
// UpdateShard implements topo.Server.
//
// It swaps the shard file for the JSON form of si.Shard with a
// CompareAndSwap on the global store, passing existingVersion as the last
// argument. On failure it returns -1 and the converted error, or
// ErrBadResponse when the response carries no node. On success it dispatches
// an events.ShardChange with status "updated" and returns the node's
// ModifiedIndex.
func (s *Server) UpdateShard(ctx context.Context, si *topo.ShardInfo, existingVersion int64) (int64, error) {
	payload := jscfg.ToJSON(si.Shard)

	resp, casErr := s.getGlobal().CompareAndSwap(
		shardFilePath(si.Keyspace(), si.ShardName()),
		payload,
		0,  /* ttl */
		"", /* prevValue */
		uint64(existingVersion),
	)
	switch {
	case casErr != nil:
		return -1, convertError(casErr)
	case resp.Node == nil:
		return -1, ErrBadResponse
	}

	change := events.ShardChange{
		ShardInfo: *si,
		Status:    "updated",
	}
	event.Dispatch(&change)
	return int64(resp.Node.ModifiedIndex), nil
}
Exemple #19
0
// applySchemaShard applies a schema change to a shard.
//
// It lists every tablet alias in the shard, reads the tablet record of each
// one except the master (identified by masterTabletAlias), and fetches the
// current schema of those tablets in parallel. If any tablet record or
// schema cannot be read the call fails before anything is changed.
//
// The change itself is delegated to applySchemaShardSimple when simple is
// true, and to applySchemaShardComplex otherwise; preflight, change, force,
// newParentTabletAlias and waitSlaveTimeout are passed through to them.
func (wr *Wrangler) applySchemaShard(ctx context.Context, shardInfo *topo.ShardInfo, preflight *myproto.SchemaChangeResult, masterTabletAlias *pb.TabletAlias, change string, newParentTabletAlias *pb.TabletAlias, simple, force bool, waitSlaveTimeout time.Duration) (*myproto.SchemaChangeResult, error) {

	// find all the tablets we need to handle
	aliases, err := topo.FindAllTabletAliasesInShard(ctx, wr.ts, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		return nil, err
	}

	// build the array of tabletStatus we're going to use. The capacity is
	// an upper bound: len(aliases)-1 would be negative (and panic) for a
	// shard with no tablets.
	statusArray := make([]*tabletStatus, 0, len(aliases))
	for _, alias := range aliases {
		// The aliases were just read from the topology, so the master has
		// to be recognized by value, not only by pointer identity.
		isMaster := alias == masterTabletAlias ||
			(alias != nil && masterTabletAlias != nil &&
				alias.Cell == masterTabletAlias.Cell && alias.Uid == masterTabletAlias.Uid)
		if isMaster {
			// we skip the master
			continue
		}

		ti, err := wr.ts.GetTablet(ctx, alias)
		if err != nil {
			return nil, err
		}
		statusArray = append(statusArray, &tabletStatus{ti: ti})
	}

	// get schema on all tablets.
	log.Infof("Getting schema on all tablets for shard %v/%v", shardInfo.Keyspace(), shardInfo.ShardName())
	wg := &sync.WaitGroup{}
	for _, status := range statusArray {
		wg.Add(1)
		go func(status *tabletStatus) {
			defer wg.Done()
			status.beforeSchema, status.lastError = wr.tmc.GetSchema(ctx, status.ti, nil, nil, false)
		}(status)
	}
	wg.Wait()

	// quick check for errors
	for _, status := range statusArray {
		if status.lastError != nil {
			return nil, fmt.Errorf("Error getting schema on tablet %v: %v", status.ti.Alias, status.lastError)
		}
	}

	// simple or complex?
	if simple {
		return wr.applySchemaShardSimple(ctx, statusArray, preflight, masterTabletAlias, change, force)
	}

	return wr.applySchemaShardComplex(ctx, statusArray, shardInfo, preflight, masterTabletAlias, change, newParentTabletAlias, force, waitSlaveTimeout)
}
Exemple #20
0
// migrateServedFrom migrates (or, with reverse, un-migrates) servedType for
// a vertical split whose target is destinationShard.
//
// The keyspace record is re-read and its ServedFrom map updated in memory:
// a forward migration removes the servedType entry and fails if there is
// none; a reverse migration adds it back, pointing at the keyspace of the
// destination shard's first source shard, and fails if the entry already
// exists. The destination shard is then re-read and must have exactly one
// source shard; that source shard is read as well.
//
// The actual work is done by masterMigrateServedFrom for the master type
// and replicaMigrateServedFrom for every other type. Progress is published
// through an events.MigrateServedFrom: "start", then "finished", and a
// "failed: ..." update from a deferred function if an error is returned
// after the event was created.
func (wr *Wrangler) migrateServedFrom(ki *topo.KeyspaceInfo, destinationShard *topo.ShardInfo, servedType topo.TabletType, reverse bool) (err error) {

	// re-read and update keyspace info record
	ki, err = wr.ts.GetKeyspace(ki.KeyspaceName())
	if err != nil {
		return err
	}
	if reverse {
		if _, ok := ki.ServedFrom[servedType]; ok {
			return fmt.Errorf("Destination Keyspace %s is not serving type %v", ki.KeyspaceName(), servedType)
		}
		// The caller's shard record has not been validated yet: make sure
		// there is a source shard before indexing it.
		if len(destinationShard.SourceShards) == 0 {
			return fmt.Errorf("Destination shard %v/%v is not a vertical split target", destinationShard.Keyspace(), destinationShard.ShardName())
		}
		ki.ServedFrom[servedType] = destinationShard.SourceShards[0].Keyspace
	} else {
		if _, ok := ki.ServedFrom[servedType]; !ok {
			return fmt.Errorf("Destination Keyspace %s is already serving type %v", ki.KeyspaceName(), servedType)
		}
		delete(ki.ServedFrom, servedType)
	}

	// re-read and check the destination shard
	destinationShard, err = wr.ts.GetShard(destinationShard.Keyspace(), destinationShard.ShardName())
	if err != nil {
		return err
	}
	if len(destinationShard.SourceShards) != 1 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", destinationShard.Keyspace(), destinationShard.ShardName())
	}
	tables := destinationShard.SourceShards[0].Tables

	// read the source shard, we'll need its master, and we'll need to
	// update the blacklisted tables.
	var sourceShard *topo.ShardInfo
	sourceShard, err = wr.ts.GetShard(destinationShard.SourceShards[0].Keyspace, destinationShard.SourceShards[0].Shard)
	if err != nil {
		return err
	}

	ev := &events.MigrateServedFrom{
		Keyspace:         *ki,
		SourceShard:      *sourceShard,
		DestinationShard: *destinationShard,
		ServedType:       servedType,
		Reverse:          reverse,
	}
	event.DispatchUpdate(ev, "start")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	if servedType == topo.TYPE_MASTER {
		err = wr.masterMigrateServedFrom(ki, sourceShard, destinationShard, servedType, tables, ev)
	} else {
		err = wr.replicaMigrateServedFrom(ki, sourceShard, destinationShard, servedType, reverse, tables, ev)
	}
	// NOTE(review): "finished" is dispatched even when err is non-nil, and
	// is then followed by the deferred "failed" update - confirm intended.
	event.DispatchUpdate(ev, "finished")
	return
}
Exemple #21
0
// UpdateSrvShard builds a topo.SrvShard from the global ShardInfo (shard
// name, key range and the cell of the master alias) and writes it to the
// given cell through ts.
func UpdateSrvShard(ctx context.Context, ts topo.Server, cell string, si *topo.ShardInfo) error {
	var record topo.SrvShard
	record.Name = si.ShardName()
	record.KeyRange = si.KeyRange
	record.MasterCell = si.MasterAlias.Cell

	return ts.UpdateSrvShard(ctx, cell, si.Keyspace(), si.ShardName(), &record)
}
Exemple #22
0
// migrateServedFrom migrates (or, with reverse, un-migrates) servedType in
// the given cells for a vertical split whose target is destinationShard.
//
// The keyspace record is re-read and its ServedFrom map updated in memory
// through UpdateServedFromMap, using the keyspace of the destination shard's
// first source shard. The destination shard is then re-read and must have
// exactly one source shard; that source shard is read as well.
//
// The actual work is done by masterMigrateServedFrom for the master type
// (which also receives filteredReplicationWaitTime) and by
// replicaMigrateServedFrom for every other type. Progress is published
// through an events.MigrateServedFrom: "start", then "finished", and a
// "failed: ..." update from a deferred function if an error is returned
// after the event was created.
func (wr *Wrangler) migrateServedFrom(ctx context.Context, ki *topo.KeyspaceInfo, destinationShard *topo.ShardInfo, servedType topodatapb.TabletType, cells []string, reverse bool, filteredReplicationWaitTime time.Duration) (err error) {

	// re-read and update keyspace info record
	ki, err = wr.ts.GetKeyspace(ctx, ki.KeyspaceName())
	if err != nil {
		return err
	}
	// The caller's shard record has not been validated yet: make sure there
	// is a source shard before indexing it.
	if len(destinationShard.SourceShards) == 0 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", destinationShard.Keyspace(), destinationShard.ShardName())
	}
	if reverse {
		ki.UpdateServedFromMap(servedType, cells, destinationShard.SourceShards[0].Keyspace, false, nil)
	} else {
		ki.UpdateServedFromMap(servedType, cells, destinationShard.SourceShards[0].Keyspace, true, destinationShard.Cells)
	}

	// re-read and check the destination shard
	destinationShard, err = wr.ts.GetShard(ctx, destinationShard.Keyspace(), destinationShard.ShardName())
	if err != nil {
		return err
	}
	if len(destinationShard.SourceShards) != 1 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", destinationShard.Keyspace(), destinationShard.ShardName())
	}
	tables := destinationShard.SourceShards[0].Tables

	// read the source shard, we'll need its master, and we'll need to
	// update the blacklisted tables.
	var sourceShard *topo.ShardInfo
	sourceShard, err = wr.ts.GetShard(ctx, destinationShard.SourceShards[0].Keyspace, destinationShard.SourceShards[0].Shard)
	if err != nil {
		return err
	}

	ev := &events.MigrateServedFrom{
		KeyspaceName:     ki.KeyspaceName(),
		SourceShard:      *sourceShard,
		DestinationShard: *destinationShard,
		ServedType:       servedType,
		Reverse:          reverse,
	}
	event.DispatchUpdate(ev, "start")
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	if servedType == topodatapb.TabletType_MASTER {
		err = wr.masterMigrateServedFrom(ctx, ki, sourceShard, destinationShard, tables, ev, filteredReplicationWaitTime)
	} else {
		err = wr.replicaMigrateServedFrom(ctx, ki, sourceShard, destinationShard, servedType, cells, reverse, tables, ev)
	}
	// NOTE(review): "finished" is dispatched even when err is non-nil, and
	// is then followed by the deferred "failed" update - confirm intended.
	event.DispatchUpdate(ev, "finished")
	return
}
Exemple #23
0
// updateShardCellsAndMaster adds the tablet's cell to the shard's Cells and,
// for a master tablet, records it as MasterAlias, when either is needed.
//
// The passed-in si is only used for a first, unlocked check; if nothing has
// to change the function returns nil without taking the lock. Otherwise the
// shard is locked and modified through UpdateShardFields, and the result of
// that call is handed to unlockShard, whose return value is returned.
// Replacing an existing, different master requires allowMasterOverride.
func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType, allowMasterOverride bool) error {
	masterRequested := tabletType == topodatapb.TabletType_MASTER

	// Cheap pre-check on the caller's copy of the shard.
	if si.HasCell(tabletAlias.Cell) && (!masterRequested || topoproto.TabletAliasEqual(si.MasterAlias, tabletAlias)) {
		return nil
	}

	// we do need to update the shard, lock it to not interfere with
	// reparenting operations.
	actionNode := actionnode.UpdateShard()
	keyspace, shard := si.Keyspace(), si.ShardName()
	lockPath, err := wr.lockShard(ctx, keyspace, shard, actionNode)
	if err != nil {
		return err
	}

	// mutate is applied by UpdateShardFields to the shard record; it
	// reports topo.ErrNoUpdateNeeded when it changed nothing.
	mutate := func(s *topodatapb.Shard) error {
		changed := false
		if !topoproto.ShardHasCell(s, tabletAlias.Cell) {
			s.Cells = append(s.Cells, tabletAlias.Cell)
			changed = true
		}

		if masterRequested && !topoproto.TabletAliasEqual(s.MasterAlias, tabletAlias) {
			if !topoproto.TabletAliasIsZero(s.MasterAlias) && !allowMasterOverride {
				return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", topoproto.TabletAliasString(s.MasterAlias), keyspace, shard)
			}
			s.MasterAlias = tabletAlias
			changed = true
		}

		if changed {
			return nil
		}
		return topo.ErrNoUpdateNeeded
	}

	_, err = wr.ts.UpdateShardFields(ctx, keyspace, shard, mutate)
	return wr.unlockShard(ctx, keyspace, shard, actionNode, lockPath, err)
}
Exemple #24
0
// RefreshTablesByShard calls RefreshState, in parallel, on every tablet of
// the given type found for the shard in the given cells, and waits for all
// calls to return. It would work for the master, but the discovery wouldn't
// be very efficient.
//
// Individual RefreshState failures are only logged. The sole error returned
// is a failure to read the tablet map other than topo.ErrPartialResult; a
// partial result is logged and the tablets that were found are refreshed.
func (wr *Wrangler) RefreshTablesByShard(ctx context.Context, si *topo.ShardInfo, tabletType topodatapb.TabletType, cells []string) error {
	wr.Logger().Infof("RefreshTablesByShard called on shard %v/%v", si.Keyspace(), si.ShardName())
	tabletMap, err := wr.ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), cells)
	if err != nil {
		if err != topo.ErrPartialResult {
			return err
		}
		wr.Logger().Warningf("RefreshTablesByShard: got partial result for shard %v/%v, may not refresh all tablets everywhere", si.Keyspace(), si.ShardName())
	}

	// Errors in this phase are logged and otherwise ignored.
	var pending sync.WaitGroup
	for _, candidate := range tabletMap {
		if candidate.Type != tabletType {
			continue
		}

		pending.Add(1)
		go func(target *topo.TabletInfo) {
			defer pending.Done()
			wr.Logger().Infof("Calling RefreshState on tablet %v", target.AliasString())

			// Setting an upper bound timeout to fail faster in case of an error.
			// Using 60 seconds because RefreshState should not take more than 30 seconds.
			// (RefreshState will restart the tablet's QueryService and most time will be spent on the shutdown, i.e. waiting up to 30 seconds on transactions (see Config.TransactionTimeout)).
			refreshCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
			defer cancel()

			if refreshErr := wr.tmc.RefreshState(refreshCtx, target); refreshErr != nil {
				wr.Logger().Warningf("RefreshTablesByShard: failed to refresh %v: %v", target.AliasString(), refreshErr)
			}
		}(candidate)
	}
	pending.Wait()

	return nil
}
Exemple #25
0
// updateShardCellsAndMaster adds the tablet's cell to the shard's Cells and,
// for a master tablet, records it as MasterAlias, when either is needed.
//
// The passed-in si is only used for a first check; if nothing has to change
// the function returns nil without touching the topology. Otherwise the
// change is applied through UpdateShardFields and that call's error is
// returned. Replacing an existing, different master requires
// allowMasterOverride.
func (wr *Wrangler) updateShardCellsAndMaster(ctx context.Context, si *topo.ShardInfo, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType, allowMasterOverride bool) error {
	masterRequested := tabletType == topodatapb.TabletType_MASTER

	// Cheap pre-check on the caller's copy of the shard.
	if si.HasCell(tabletAlias.Cell) && (!masterRequested || topoproto.TabletAliasEqual(si.MasterAlias, tabletAlias)) {
		return nil
	}

	// mutate is applied by UpdateShardFields to the shard record; it
	// reports topo.ErrNoUpdateNeeded when it changed nothing.
	mutate := func(s *topo.ShardInfo) error {
		changed := false
		if !s.HasCell(tabletAlias.Cell) {
			s.Cells = append(s.Cells, tabletAlias.Cell)
			changed = true
		}

		if masterRequested && !topoproto.TabletAliasEqual(s.MasterAlias, tabletAlias) {
			if !topoproto.TabletAliasIsZero(s.MasterAlias) && !allowMasterOverride {
				return fmt.Errorf("creating this tablet would override old master %v in shard %v/%v", topoproto.TabletAliasString(s.MasterAlias), si.Keyspace(), si.ShardName())
			}
			s.MasterAlias = tabletAlias
			changed = true
		}

		if changed {
			return nil
		}
		return topo.ErrNoUpdateNeeded
	}

	_, err := wr.ts.UpdateShardFields(ctx, si.Keyspace(), si.ShardName(), mutate)
	return err
}
Exemple #26
0
// applySchemaShardComplex applies a schema change to the tablets in
// statusArray one at a time. A tablet whose schema already equals
// preflight.AfterSchema is skipped; a tablet whose schema differs from
// preflight.BeforeSchema aborts the run unless force is set. Tablets in
// the serving graph are switched to SCHEMA_UPGRADE while the change is
// applied and switched back to their previous type afterwards.
//
// When newParentTabletAlias is non-zero the shard is then reparented to
// it and the old master is scrapped (a scrap failure is only logged).
// The returned result carries the preflight before/after schemas.
func (wr *Wrangler) applySchemaShardComplex(ctx context.Context, statusArray []*tabletStatus, shardInfo *topo.ShardInfo, preflight *myproto.SchemaChangeResult, masterTabletAlias *pb.TabletAlias, change string, newParentTabletAlias *pb.TabletAlias, force bool, waitSlaveTimeout time.Duration) (*myproto.SchemaChangeResult, error) {
	// Walk the replica / slave tablets and upgrade each of them.
	for _, st := range statusArray {
		alias := st.ti.Alias
		aliasStr := topo.TabletAliasString(alias)

		// Nothing to do if the tablet already has the target schema.
		if len(myproto.DiffSchemaToArray("after", preflight.AfterSchema, aliasStr, st.beforeSchema)) == 0 {
			log.Infof("Tablet %v already has the AfterSchema, skipping", alias)
			continue
		}

		// The starting schema has to match the preflight, unless forced.
		mismatches := myproto.DiffSchemaToArray("master", preflight.BeforeSchema, aliasStr, st.beforeSchema)
		if len(mismatches) > 0 {
			if !force {
				return nil, fmt.Errorf("Tablet %v has inconsistent schema: %v", alias, strings.Join(mismatches, "\n"))
			}
			log.Warningf("Tablet %v has inconsistent schema, ignoring: %v", alias, strings.Join(mismatches, "\n"))
		}

		// Pull the tablet out of the serving graph if it is in there.
		ti, err := wr.ts.GetTablet(ctx, alias)
		if err != nil {
			return nil, err
		}
		wasServing := ti.IsInServingGraph()
		if wasServing {
			// note we want to update the serving graph there
			if err := wr.changeTypeInternal(ctx, ti.Alias, pb.TabletType_SCHEMA_UPGRADE); err != nil {
				return nil, err
			}
		}

		// Run the actual schema change.
		log.Infof("Applying schema change to slave %v in complex mode", alias)
		sc := &myproto.SchemaChange{
			Sql:              change,
			Force:            force,
			AllowReplication: false,
			BeforeSchema:     preflight.BeforeSchema,
			AfterSchema:      preflight.AfterSchema,
		}
		if _, err := wr.ApplySchema(ctx, alias, sc); err != nil {
			return nil, err
		}

		// Restore the tablet's previous type.
		if wasServing {
			if err := wr.changeTypeInternal(ctx, ti.Alias, ti.Tablet.Type); err != nil {
				return nil, err
			}
		}
	}

	// A non-zero newParentTabletAlias requests a reparent to that tablet.
	if !topo.TabletAliasIsZero(newParentTabletAlias) {
		log.Infof("Reparenting with new master set to %v", newParentTabletAlias)
		previousMaster := shardInfo.MasterAlias

		// Reusable Reparent event, filled in by the reparent itself.
		reparentEvent := &events.Reparent{}

		err := wr.plannedReparentShardLocked(ctx, reparentEvent, shardInfo.Keyspace(), shardInfo.ShardName(), newParentTabletAlias, waitSlaveTimeout)
		if err != nil {
			return nil, err
		}

		// Here we would apply the schema change to the old
		// master, but we just scrap it, to be consistent
		// with the previous implementation of the reparent.
		// (this code will be refactored at some point anyway)
		if scrapErr := wr.Scrap(ctx, previousMaster, false, false); scrapErr != nil {
			wr.Logger().Warningf("Scrapping old master %v from shard %v/%v failed: %v", previousMaster, shardInfo.Keyspace(), shardInfo.ShardName(), scrapErr)
		}
	}

	return &myproto.SchemaChangeResult{BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema}, nil
}
Exemple #27
0
// migrateServedFrom moves the serving of servedType for a vertical
// split between the source keyspace and the destination keyspace ki,
// using the destination shard si (or moves it back when reverse is set).
//
// Both ki and si are only used to identify the records: they are
// re-read from the topology before anything is changed. The steps are:
//   - validate that the destination shard has exactly one source shard,
//   - update the keyspace ServedFrom map (add the entry on reverse,
//     remove it otherwise),
//   - for TYPE_MASTER: set the source master read-only, wait for the
//     destination master to reach the source master position, and clear
//     the destination shard's SourceShards,
//   - save the keyspace (and the shard for TYPE_MASTER),
//   - for TYPE_MASTER: make the destination master read-write,
//   - set the blacklisted tables on the source side.
//
// Progress is reported through a MigrateServedFrom event; when the
// function returns a non-nil error a "failed: ..." update is dispatched.
func (wr *Wrangler) migrateServedFrom(ki *topo.KeyspaceInfo, si *topo.ShardInfo, servedType topo.TabletType, reverse bool) (err error) {

	// re-read the keyspace info record
	ki, err = wr.ts.GetKeyspace(ki.KeyspaceName())
	if err != nil {
		return err
	}

	// re-read and check the destination shard. This must happen before
	// si.SourceShards[0] is used below: indexing the caller-supplied
	// record without the length check would panic on a shard that has
	// no source shard, and could act on stale data.
	si, err = wr.ts.GetShard(si.Keyspace(), si.ShardName())
	if err != nil {
		return err
	}
	if len(si.SourceShards) != 1 {
		return fmt.Errorf("Destination shard %v/%v is not a vertical split target", si.Keyspace(), si.ShardName())
	}
	tables := si.SourceShards[0].Tables

	// update the keyspace info record (in memory only, it is saved later)
	if reverse {
		if _, ok := ki.ServedFrom[servedType]; ok {
			return fmt.Errorf("Destination Keyspace %s is not serving type %v", ki.KeyspaceName(), servedType)
		}
		ki.ServedFrom[servedType] = si.SourceShards[0].Keyspace
	} else {
		if _, ok := ki.ServedFrom[servedType]; !ok {
			return fmt.Errorf("Destination Keyspace %s is already serving type %v", ki.KeyspaceName(), servedType)
		}
		delete(ki.ServedFrom, servedType)
	}

	// read the source shard, we'll need its master
	sourceShard, err := wr.ts.GetShard(si.SourceShards[0].Keyspace, si.SourceShards[0].Shard)
	if err != nil {
		return err
	}

	ev := &events.MigrateServedFrom{
		Keyspace:         *ki,
		SourceShard:      *sourceShard,
		DestinationShard: *si,
		ServedType:       servedType,
		Reverse:          reverse,
	}
	event.DispatchUpdate(ev, "start")
	// err is the named result, so this sees the error of every return
	// statement below.
	defer func() {
		if err != nil {
			event.DispatchUpdate(ev, "failed: "+err.Error())
		}
	}()

	// For master type migration, need to:
	// - switch the source shard to read-only
	// - gather the replication point
	// - wait for filtered replication to catch up before we continue
	// - disable filtered replication after the fact
	var sourceMasterTabletInfo *topo.TabletInfo
	if servedType == topo.TYPE_MASTER {
		// set master to read-only
		event.DispatchUpdate(ev, "setting source shard master to read-only")
		actionPath, err := wr.ai.SetReadOnly(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		if err := wr.WaitForCompletion(actionPath); err != nil {
			return err
		}

		// get the position
		event.DispatchUpdate(ev, "getting master position")
		sourceMasterTabletInfo, err = wr.ts.GetTablet(sourceShard.MasterAlias)
		if err != nil {
			return err
		}
		masterPosition, err := wr.ai.MasterPosition(sourceMasterTabletInfo, wr.ActionTimeout())
		if err != nil {
			return err
		}

		// wait for it
		event.DispatchUpdate(ev, "waiting for destination master to catch up to source master")
		if err := wr.ai.WaitBlpPosition(si.MasterAlias, blproto.BlpPosition{
			Uid:      0,
			Position: masterPosition,
		}, wr.ActionTimeout()); err != nil {
			return err
		}

		// and clear the shard record
		si.SourceShards = nil
	}

	// All is good, we can save the keyspace and shard (if needed) now
	event.DispatchUpdate(ev, "updating keyspace")
	if err = topo.UpdateKeyspace(wr.ts, ki); err != nil {
		return err
	}
	event.DispatchUpdate(ev, "updating destination shard")
	if servedType == topo.TYPE_MASTER {
		if err := topo.UpdateShard(wr.ts, si); err != nil {
			return err
		}
	}

	// Tell the new shards masters they can now be read-write.
	// Invoking a remote action will also make the tablet stop filtered
	// replication.
	event.DispatchUpdate(ev, "setting destination shard masters read-write")
	if servedType == topo.TYPE_MASTER {
		if err := wr.makeMastersReadWrite([]*topo.ShardInfo{si}); err != nil {
			return err
		}
	}

	// Now blacklist the table list on the right servers
	event.DispatchUpdate(ev, "setting blacklisted tables on source shard")
	if servedType == topo.TYPE_MASTER {
		if err := wr.ai.SetBlacklistedTables(sourceMasterTabletInfo, tables, wr.ActionTimeout()); err != nil {
			return err
		}
	} else {
		// We use the list of tables that are replicating
		// for the blacklist. In case of a reverse move, we clear the
		// blacklist.
		if reverse {
			tables = nil
		}
		if err := wr.SetBlacklistedTablesByShard(sourceShard.Keyspace(), sourceShard.ShardName(), servedType, tables); err != nil {
			return err
		}
	}

	event.DispatchUpdate(ev, "finished")
	return nil
}
Exemple #28
0
// rebuildCellSrvShard computes and writes the serving graph data to a
// single cell.
//
// Each attempt reads the current EndPoints versions, reads the tablets
// of the shard in that cell, builds the EndPoints per tablet type from
// the tablets that are in the serving graph, and then concurrently
// creates/updates the nodes that should exist, deletes the ones that
// should not, and updates the SrvShard object. Writes are conditional
// on the versions read at the start: if one of them loses a race
// (ErrNodeExists on create, ErrBadVersion/ErrNoNode on update,
// ErrBadVersion on delete) the whole attempt is retried. Any other
// write error is fatal and returned. The loop stops with ctx.Err() once
// ctx is done.
func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, si *topo.ShardInfo, cell string) (err error) {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", si.Keyspace(), si.ShardName(), cell)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Read existing EndPoints node versions, so we know if any
		// changes sneak in after we read the tablets.
		versions, err := getEndPointsVersions(ctx, ts, cell, si.Keyspace(), si.ShardName())
		if err != nil {
			// Without the versions we cannot tell existing nodes from
			// missing ones, so do not go on with a bogus version map.
			return err
		}

		// Get all tablets in this cell/shard.
		tablets, err := ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), []string{cell})
		if err != nil {
			if err != topo.ErrPartialResult {
				return err
			}
			log.Warningf("Got ErrPartialResult from topo.GetTabletMapForShardByCell(%v), some tablets may not be added properly to serving graph", cell)
		}

		// Build up the serving graph from scratch.
		serving := make(map[pb.TabletType]*pb.EndPoints)
		for _, tablet := range tablets {
			// Only add serving types.
			if !tablet.IsInServingGraph() {
				continue
			}

			// Check the Keyspace and Shard for the tablet are right.
			if tablet.Keyspace != si.Keyspace() || tablet.Shard != si.ShardName() {
				return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, si.Keyspace(), si.ShardName(), tablet.Keyspace, tablet.Shard)
			}

			// Add the tablet to the list.
			endpoints, ok := serving[tablet.Type]
			if !ok {
				endpoints = topo.NewEndPoints()
				serving[tablet.Type] = endpoints
			}
			entry, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				// A tablet without a usable end point is left out of
				// the graph rather than failing the rebuild.
				log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
				continue
			}
			endpoints.Entries = append(endpoints.Entries, entry)
		}

		wg := sync.WaitGroup{}
		fatalErrs := concurrency.AllErrorRecorder{}
		retryErrs := concurrency.AllErrorRecorder{}

		// Write nodes that should exist.
		for tabletType, endpoints := range serving {
			wg.Add(1)
			go func(tabletType pb.TabletType, endpoints *pb.EndPoints) {
				defer wg.Done()

				log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, si.Keyspace(), si.ShardName(), tabletType)

				version, ok := versions[tabletType]
				if !ok {
					// This type didn't exist when we first checked.
					// Try to create, but only if it still doesn't exist.
					if err := ts.CreateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints); err != nil {
						log.Warningf("CreateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNodeExists:
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
					return
				}

				// Update only if the version matches.
				if err := ts.UpdateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints, version); err != nil {
					log.Warningf("UpdateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
					switch err {
					case topo.ErrBadVersion, topo.ErrNoNode:
						retryErrs.RecordError(err)
					default:
						fatalErrs.RecordError(err)
					}
				}
			}(tabletType, endpoints)
		}

		// Delete nodes that shouldn't exist.
		for tabletType, version := range versions {
			if _, ok := serving[tabletType]; !ok {
				wg.Add(1)
				go func(tabletType pb.TabletType, version int64) {
					defer wg.Done()
					log.Infof("removing stale db type from serving graph: %v", tabletType)
					err := ts.DeleteEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, version)
					if err == nil || err == topo.ErrNoNode {
						// Deleted, or someone else deleted it, which is fine.
						return
					}
					log.Warningf("DeleteEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
					switch err {
					case topo.ErrBadVersion:
						retryErrs.RecordError(err)
					default:
						fatalErrs.RecordError(err)
					}
				}(tabletType, version)
			}
		}

		// Update srvShard object
		wg.Add(1)
		go func() {
			defer wg.Done()
			log.Infof("updating shard serving graph in cell %v for %v/%v", cell, si.Keyspace(), si.ShardName())
			if err := UpdateSrvShard(ctx, ts, cell, si); err != nil {
				fatalErrs.RecordError(err)
				log.Warningf("writing serving data in cell %v for %v/%v failed: %v", cell, si.Keyspace(), si.ShardName(), err)
			}
		}()

		wg.Wait()

		// If there are any fatal errors, give up.
		if fatalErrs.HasErrors() {
			return fatalErrs.Error()
		}
		// If there are any retry errors, try again.
		if retryErrs.HasErrors() {
			continue
		}
		// Otherwise, success!
		return nil
	}
}
Exemple #29
0
// applySchemaShardComplex applies a schema change to the tablets in
// statusArray one after the other. A tablet whose schema already equals
// preflight.AfterSchema is skipped; one whose schema differs from
// preflight.BeforeSchema stops the run unless force is set. Tablets of
// a serving type are moved to TYPE_SCHEMA_UPGRADE for the duration of
// the change and moved back to their previous type afterwards.
//
// When newParentTabletAlias is not the zero alias, the shard is then
// gracefully reparented to that tablet; the schema change is not
// applied to the old master. The returned result carries the preflight
// before/after schemas.
func (wr *Wrangler) applySchemaShardComplex(statusArray []*TabletStatus, shardInfo *topo.ShardInfo, preflight *mysqlctl.SchemaChangeResult, masterTabletAlias topo.TabletAlias, change string, newParentTabletAlias topo.TabletAlias, force bool) (*mysqlctl.SchemaChangeResult, error) {
	// Walk the replica / slave tablets and upgrade each of them.
	for _, st := range statusArray {
		alias := st.ti.Alias()

		// Nothing to do if the tablet already has the target schema.
		if len(mysqlctl.DiffSchemaToArray("after", preflight.AfterSchema, alias.String(), st.beforeSchema)) == 0 {
			relog.Info("Tablet %v already has the AfterSchema, skipping", alias)
			continue
		}

		// The starting schema has to match the preflight, unless forced.
		mismatches := mysqlctl.DiffSchemaToArray("master", preflight.BeforeSchema, alias.String(), st.beforeSchema)
		if len(mismatches) > 0 {
			if !force {
				return nil, fmt.Errorf("Tablet %v has inconsistent schema: %v", alias, strings.Join(mismatches, "\n"))
			}
			relog.Warning("Tablet %v has inconsistent schema, ignoring: %v", alias, strings.Join(mismatches, "\n"))
		}

		// Pull the tablet out of the serving graph if it is in there.
		ti, err := wr.ts.GetTablet(alias)
		if err != nil {
			return nil, err
		}
		wasServing := ti.Tablet.IsServingType()
		if wasServing {
			// note we want to update the serving graph there
			if err := wr.changeTypeInternal(ti.Alias(), topo.TYPE_SCHEMA_UPGRADE); err != nil {
				return nil, err
			}
		}

		// Run the actual schema change.
		relog.Info("Applying schema change to slave %v in complex mode", alias)
		sc := &mysqlctl.SchemaChange{
			Sql:              change,
			Force:            force,
			AllowReplication: false,
			BeforeSchema:     preflight.BeforeSchema,
			AfterSchema:      preflight.AfterSchema,
		}
		if _, err := wr.ApplySchema(alias, sc); err != nil {
			return nil, err
		}

		// Restore the tablet's previous type.
		if wasServing {
			if err := wr.changeTypeInternal(ti.Alias(), ti.Tablet.Type); err != nil {
				return nil, err
			}
		}
	}

	// A non-zero newParentTabletAlias requests a reparent to that tablet.
	if newParentTabletAlias != (topo.TabletAlias{}) {
		relog.Info("Reparenting with new master set to %v", newParentTabletAlias)

		shardTablets, err := GetTabletMapForShard(wr.ts, shardInfo.Keyspace(), shardInfo.ShardName())
		if err != nil {
			return nil, err
		}

		slaves, foundMaster, err := slaveTabletMap(shardTablets)
		if err != nil {
			return nil, err
		}

		newMaster, err := wr.ts.GetTablet(newParentTabletAlias)
		if err != nil {
			return nil, err
		}

		if err := wr.reparentShardGraceful(slaves, foundMaster, newMaster /*leaveMasterReadOnly*/, false); err != nil {
			return nil, err
		}

		// Here we would apply the schema change to the old
		// master, but after a reparent it's in Scrap state,
		// so no need to.  When/if reparent leaves the
		// original master in a different state (like replica
		// or rdonly), then we should apply the schema there
		// too.
		relog.Info("Skipping schema change on old master %v in complex mode, it's been Scrapped", masterTabletAlias)
	}

	return &mysqlctl.SchemaChangeResult{BeforeSchema: preflight.BeforeSchema, AfterSchema: preflight.AfterSchema}, nil
}
Exemple #30
0
// applySchemaShard applies a schema change to every tablet of a shard
// except the master and the LAG tablets.
//
// It collects the tablets of the shard, fetches their current schema
// concurrently, fails if any of those fetches failed, and then hands
// over to applySchemaShardSimple (simple == true) or
// applySchemaShardComplex, whose result and error it returns.
func (wr *Wrangler) applySchemaShard(shardInfo *topo.ShardInfo, preflight *mysqlctl.SchemaChangeResult, masterTabletAlias topo.TabletAlias, change string, newParentTabletAlias topo.TabletAlias, simple, force bool) (*mysqlctl.SchemaChangeResult, error) {

	// find all the tablets we need to handle
	aliases, err := topo.FindAllTabletAliasesInShard(wr.ts, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		return nil, err
	}

	// build the array of TabletStatus we're going to use.
	// The capacity is only a hint: len(aliases) is used rather than
	// len(aliases)-1 because a negative capacity makes make() panic
	// when the shard has no tablet at all.
	statusArray := make([]*TabletStatus, 0, len(aliases))
	for _, alias := range aliases {
		if alias == masterTabletAlias {
			// we skip the master
			continue
		}

		ti, err := wr.ts.GetTablet(alias)
		if err != nil {
			return nil, err
		}
		if ti.Type == topo.TYPE_LAG {
			// lag tablets are usually behind, not replicating,
			// and a general pain. So let's just skip them
			// all together.
			// TODO(alainjobart) figure out other types to skip:
			// ValidateSchemaShard only does the serving types.
			// We do everything in the replication graph
			// but LAG. This seems fine for now.
			relog.Info("Skipping tablet %v as it is LAG", ti.Alias())
			continue
		}

		statusArray = append(statusArray, &TabletStatus{ti: ti})
	}

	// get schema on all tablets.
	relog.Info("Getting schema on all tablets for shard %v/%v", shardInfo.Keyspace(), shardInfo.ShardName())
	wg := &sync.WaitGroup{}
	for _, status := range statusArray {
		wg.Add(1)
		// Each goroutine only writes to the TabletStatus it was given.
		go func(status *TabletStatus) {
			defer wg.Done()
			status.beforeSchema, status.lastError = wr.GetSchemaTablet(status.ti, nil, false)
		}(status)
	}
	wg.Wait()

	// quick check for errors
	for _, status := range statusArray {
		if status.lastError != nil {
			return nil, fmt.Errorf("Error getting schema on tablet %v: %v", status.ti.Alias(), status.lastError)
		}
	}

	// simple or complex?
	if simple {
		return wr.applySchemaShardSimple(statusArray, preflight, masterTabletAlias, change, force)
	}

	return wr.applySchemaShardComplex(statusArray, shardInfo, preflight, masterTabletAlias, change, newParentTabletAlias, force)
}