Ejemplo n.º 1
0
// FixShardReplication will fix the first problem it encounters within
// a ShardReplication object
func FixShardReplication(ts Server, logger logutil.Logger, cell, keyspace, shard string) error {
	sri, err := ts.GetShardReplication(cell, keyspace, shard)
	if err != nil {
		return err
	}

	for _, rl := range sri.ReplicationLinks {
		ti, err := ts.GetTablet(rl.TabletAlias)
		if err == ErrNoNode {
			logger.Warningf("Tablet %v is in the replication graph, but does not exist, removing it", rl.TabletAlias)
			return RemoveShardReplicationRecord(ts, cell, keyspace, shard, rl.TabletAlias)
		}
		if err != nil {
			// unknown error, we probably don't want to continue
			return err
		}

		if ti.Type == TYPE_SCRAP {
			logger.Warningf("Tablet %v is in the replication graph, but is scrapped, removing it", rl.TabletAlias)
			return RemoveShardReplicationRecord(ts, cell, keyspace, shard, rl.TabletAlias)
		}

		logger.Infof("Keeping tablet %v in the replication graph", rl.TabletAlias)
	}

	logger.Infof("All entries in replication graph are valid")
	return nil
}
Ejemplo n.º 2
0
// RestartSlavesExternal will tell all the slaves in the provided list
// that they have a new master, and also tell all the masters. The
// masters will be scrapped if they don't answer.
// We execute all the actions in parallel.
func RestartSlavesExternal(ts topo.Server, log logutil.Logger, slaveTabletMap, masterTabletMap map[topo.TabletAlias]*topo.TabletInfo, masterElectTabletAlias topo.TabletAlias, slaveWasRestarted func(*topo.TabletInfo, *actionnode.SlaveWasRestartedArgs) error) {
	wg := sync.WaitGroup{}

	swrd := actionnode.SlaveWasRestartedArgs{
		Parent: masterElectTabletAlias,
	}

	log.Infof("Updating individual tablets with the right master...")

	// do all the slaves
	for _, ti := range slaveTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			if err := slaveWasRestarted(ti, &swrd); err != nil {
				log.Warningf("Slave %v had an error: %v", ti.Alias, err)
			}
			wg.Done()
		}(ti)
	}

	// and do the old master and any straggler, if possible.
	for _, ti := range masterTabletMap {
		wg.Add(1)
		go func(ti *topo.TabletInfo) {
			err := slaveWasRestarted(ti, &swrd)
			if err != nil {
				// the old master can be annoying if left
				// around in the replication graph, so if we
				// can't restart it, we just scrap it.
				// We don't rebuild the Shard just yet though.
				log.Warningf("Old master %v is not restarting in time, forcing it to spare: %v", ti.Alias, err)

				ti.Type = topo.TYPE_SPARE
				ti.Parent = masterElectTabletAlias
				if err := topo.UpdateTablet(context.TODO(), ts, ti); err != nil {
					log.Warningf("Failed to change old master %v to spare: %v", ti.Alias, err)
				}
			}
			wg.Done()
		}(ti)
	}
	wg.Wait()
}
Ejemplo n.º 3
0
// Update shard file with new master, replicas, etc.
//
// Re-read from TopologyServer to make sure we are using the side
// effects of all actions.
//
// This function locks individual SvrShard paths, so it doesn't need a lock
// on the shard.
func RebuildShard(ctx context.Context, log logutil.Logger, ts topo.Server, keyspace, shard string, cells []string, timeout time.Duration, interrupted chan struct{}) (*topo.ShardInfo, error) {
	log.Infof("RebuildShard %v/%v", keyspace, shard)

	span := trace.NewSpanFromContext(ctx)
	span.StartLocal("topotools.RebuildShard")
	defer span.Finish()
	ctx = trace.NewContext(ctx, span)

	// read the existing shard info. It has to exist.
	shardInfo, err := ts.GetShard(keyspace, shard)
	if err != nil {
		return nil, err
	}

	// rebuild all cells in parallel
	wg := sync.WaitGroup{}
	rec := concurrency.AllErrorRecorder{}
	for _, cell := range shardInfo.Cells {
		// skip this cell if we shouldn't rebuild it
		if !topo.InCellList(cell, cells) {
			continue
		}

		// start with the master if it's in the current cell
		tabletsAsMap := make(map[topo.TabletAlias]bool)
		if shardInfo.MasterAlias.Cell == cell {
			tabletsAsMap[shardInfo.MasterAlias] = true
		}

		wg.Add(1)
		go func(cell string) {
			defer wg.Done()

			// Lock the SrvShard so we don't race with other rebuilds of the same
			// shard in the same cell (e.g. from our peer tablets).
			actionNode := actionnode.RebuildSrvShard()
			lockPath, err := actionNode.LockSrvShard(ctx, ts, cell, keyspace, shard, timeout, interrupted)
			if err != nil {
				rec.RecordError(err)
				return
			}

			// read the ShardReplication object to find tablets
			sri, err := ts.GetShardReplication(cell, keyspace, shard)
			if err != nil {
				rec.RecordError(fmt.Errorf("GetShardReplication(%v, %v, %v) failed: %v", cell, keyspace, shard, err))
				return
			}

			// add all relevant tablets to the map
			for _, rl := range sri.ReplicationLinks {
				tabletsAsMap[rl.TabletAlias] = true
				if rl.Parent.Cell == cell {
					tabletsAsMap[rl.Parent] = true
				}
			}

			// convert the map to a list
			aliases := make([]topo.TabletAlias, 0, len(tabletsAsMap))
			for a := range tabletsAsMap {
				aliases = append(aliases, a)
			}

			// read all the Tablet records
			tablets, err := topo.GetTabletMap(ctx, ts, aliases)
			switch err {
			case nil:
				// keep going, we're good
			case topo.ErrPartialResult:
				log.Warningf("Got ErrPartialResult from topo.GetTabletMap in cell %v, some tablets may not be added properly to serving graph", cell)
			default:
				rec.RecordError(fmt.Errorf("GetTabletMap in cell %v failed: %v", cell, err))
				return
			}

			// write the data we need to
			rebuildErr := rebuildCellSrvShard(ctx, log, ts, shardInfo, cell, tablets)

			// and unlock
			if err := actionNode.UnlockSrvShard(ctx, ts, cell, keyspace, shard, lockPath, rebuildErr); err != nil {
				rec.RecordError(err)
			}
		}(cell)
	}
	wg.Wait()

	return shardInfo, rec.Error()
}
Ejemplo n.º 4
0
// rebuildCellSrvShard computes and writes the serving graph data to a
// single cell
func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, shardInfo *topo.ShardInfo, cell string, tablets map[topo.TabletAlias]*topo.TabletInfo) error {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", shardInfo.Keyspace(), shardInfo.ShardName(), cell)

	// Get all existing db types so they can be removed if nothing
	// had been edited.
	existingTabletTypes, err := ts.GetSrvTabletTypesPerShard(cell, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		if err != topo.ErrNoNode {
			return err
		}
	}

	// Update db type addresses in the serving graph
	//
	// locationAddrsMap is a map:
	//   key: tabletType
	//   value: EndPoints (list of server records)
	locationAddrsMap := make(map[topo.TabletType]*topo.EndPoints)
	for _, tablet := range tablets {
		if !tablet.IsInReplicationGraph() {
			// only valid case is a scrapped master in the
			// catastrophic reparent case
			if tablet.Parent.Uid != topo.NO_TABLET {
				log.Warningf("Tablet %v should not be in the replication graph, please investigate (it is being ignored in the rebuild)", tablet.Alias)
			}
			continue
		}

		// Check IsInServingGraph, we don't want to add tablets that
		// are not serving
		if !tablet.IsInServingGraph() {
			continue
		}

		// Check the Keyspace and Shard for the tablet are right
		if tablet.Keyspace != shardInfo.Keyspace() || tablet.Shard != shardInfo.ShardName() {
			return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, shardInfo.Keyspace(), shardInfo.ShardName(), tablet.Keyspace, tablet.Shard)
		}

		// Add the tablet to the list
		addrs, ok := locationAddrsMap[tablet.Type]
		if !ok {
			addrs = topo.NewEndPoints()
			locationAddrsMap[tablet.Type] = addrs
		}
		entry, err := tablet.Tablet.EndPoint()
		if err != nil {
			log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
			continue
		}
		addrs.Entries = append(addrs.Entries, *entry)
	}

	// we're gonna parallelize a lot here:
	// - writing all the tabletTypes records
	// - removing the unused records
	// - writing SrvShard
	rec := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	// write all the EndPoints nodes everywhere we want them
	for tabletType, addrs := range locationAddrsMap {
		wg.Add(1)
		go func(tabletType topo.TabletType, addrs *topo.EndPoints) {
			log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType)
			span := trace.NewSpanFromContext(ctx)
			span.StartClient("TopoServer.UpdateEndPoints")
			span.Annotate("tablet_type", string(tabletType))
			if err := ts.UpdateEndPoints(cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType, addrs); err != nil {
				rec.RecordError(fmt.Errorf("writing endpoints for cell %v shard %v/%v tabletType %v failed: %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType, err))
			}
			span.Finish()
			wg.Done()
		}(tabletType, addrs)
	}

	// Delete any pre-existing paths that were not updated by this process.
	// That's the existingTabletTypes - locationAddrsMap
	for _, tabletType := range existingTabletTypes {
		if _, ok := locationAddrsMap[tabletType]; !ok {
			wg.Add(1)
			go func(tabletType topo.TabletType) {
				log.Infof("removing stale db type from serving graph: %v", tabletType)
				span := trace.NewSpanFromContext(ctx)
				span.StartClient("TopoServer.DeleteEndPoints")
				span.Annotate("tablet_type", string(tabletType))
				if err := ts.DeleteEndPoints(cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType); err != nil {
					log.Warningf("unable to remove stale db type %v from serving graph: %v", tabletType, err)
				}
				span.Finish()
				wg.Done()
			}(tabletType)
		}
	}

	// Update srvShard object
	wg.Add(1)
	go func() {
		log.Infof("updating shard serving graph in cell %v for %v/%v", cell, shardInfo.Keyspace(), shardInfo.ShardName())
		srvShard := &topo.SrvShard{
			Name:        shardInfo.ShardName(),
			KeyRange:    shardInfo.KeyRange,
			ServedTypes: shardInfo.GetServedTypesPerCell(cell),
			MasterCell:  shardInfo.MasterAlias.Cell,
			TabletTypes: make([]topo.TabletType, 0, len(locationAddrsMap)),
		}
		for tabletType := range locationAddrsMap {
			srvShard.TabletTypes = append(srvShard.TabletTypes, tabletType)
		}

		span := trace.NewSpanFromContext(ctx)
		span.StartClient("TopoServer.UpdateSrvShard")
		span.Annotate("keyspace", shardInfo.Keyspace())
		span.Annotate("shard", shardInfo.ShardName())
		span.Annotate("cell", cell)
		if err := ts.UpdateSrvShard(cell, shardInfo.Keyspace(), shardInfo.ShardName(), srvShard); err != nil {
			rec.RecordError(fmt.Errorf("writing serving data in cell %v for %v/%v failed: %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), err))
		}
		span.Finish()
		wg.Done()
	}()

	wg.Wait()
	return rec.Error()
}