示例#1
0
func (zkts *Server) updateTabletEndpoint(oldValue string, oldStat zk.Stat, addr *topodatapb.EndPoint) (newValue string, err error) {
	if oldStat == nil {
		// The incoming object doesn't exist - we haven't been placed in the serving
		// graph yet, so don't update. Assume the next process that rebuilds the graph
		// will get the updated tablet location.
		return "", errSkipUpdate
	}

	var addrs *topodatapb.EndPoints
	if oldValue != "" {
		addrs = &topodatapb.EndPoints{}
		if len(oldValue) > 0 {
			if err := json.Unmarshal([]byte(oldValue), addrs); err != nil {
				return "", fmt.Errorf("EndPoints unmarshal failed: %v %v", oldValue, err)
			}
		}

		foundTablet := false
		for i, entry := range addrs.Entries {
			if entry.Uid == addr.Uid {
				foundTablet = true
				if !topo.EndPointEquality(entry, addr) {
					addrs.Entries[i] = addr
				}
				break
			}
		}

		if !foundTablet {
			addrs.Entries = append(addrs.Entries, addr)
		}
	} else {
		addrs = topo.NewEndPoints()
		addrs.Entries = append(addrs.Entries, addr)
	}
	data, err := json.MarshalIndent(addrs, "", "  ")
	if err != nil {
		return "", err
	}
	return string(data), nil
}
示例#2
0
文件: rebuild.go 项目: rn2dy/vitess
// Write serving graph data to the cells
func (wr *Wrangler) rebuildShardSrvGraph(shardInfo *topo.ShardInfo, tablets []*topo.TabletInfo, cells []string) error {
	log.Infof("rebuildShardSrvGraph %v/%v", shardInfo.Keyspace(), shardInfo.ShardName())

	// Get all existing db types so they can be removed if nothing
	// had been editted.  This applies to all cells, which can't
	// be determined until you walk through all the tablets.
	//
	// existingDbTypeLocations is a map:
	//   key: {cell,keyspace,shard,tabletType}
	//   value: true
	existingDbTypeLocations := make(map[cellKeyspaceShardType]bool)

	// Update db type addresses in the serving graph
	//
	// locationAddrsMap is a map:
	//   key: {cell,keyspace,shard,tabletType}
	//   value: topo.EndPoints (list of server records)
	locationAddrsMap := make(map[cellKeyspaceShardType]*topo.EndPoints)

	// we keep track of the existingDbTypeLocations we've already looked at
	knownShardLocations := make(map[cellKeyspaceShard]bool)

	for _, tablet := range tablets {
		// only look at tablets in the cells we want to rebuild
		if !topo.InCellList(tablet.Tablet.Alias.Cell, cells) {
			continue
		}

		// this is {cell,keyspace,shard}
		// we'll get the children to find the existing types
		shardLocation := cellKeyspaceShard{tablet.Tablet.Alias.Cell, tablet.Tablet.Keyspace, tablet.Shard}
		// only need to do this once per cell
		if !knownShardLocations[shardLocation] {
			log.Infof("Getting tablet types on cell %v for %v/%v", tablet.Tablet.Alias.Cell, tablet.Tablet.Keyspace, tablet.Shard)
			tabletTypes, err := wr.ts.GetSrvTabletTypesPerShard(tablet.Tablet.Alias.Cell, tablet.Tablet.Keyspace, tablet.Shard)
			if err != nil {
				if err != topo.ErrNoNode {
					return err
				}
			} else {
				for _, tabletType := range tabletTypes {
					existingDbTypeLocations[cellKeyspaceShardType{tablet.Tablet.Alias.Cell, tablet.Tablet.Keyspace, tablet.Shard, tabletType}] = true
				}
			}
			knownShardLocations[shardLocation] = true
		}

		// Check IsInServingGraph after we have populated
		// existingDbTypeLocations so we properly prune data
		// if the definition of serving type changes.
		if !tablet.IsInServingGraph() {
			continue
		}

		location := cellKeyspaceShardType{tablet.Tablet.Alias.Cell, tablet.Keyspace, tablet.Shard, tablet.Type}
		addrs, ok := locationAddrsMap[location]
		if !ok {
			addrs = topo.NewEndPoints()
			locationAddrsMap[location] = addrs
		}

		entry, err := tabletmanager.EndPointForTablet(tablet.Tablet)
		if err != nil {
			log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
			continue
		}
		addrs.Entries = append(addrs.Entries, *entry)
	}

	// we're gonna parallelize a lot here
	rec := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	// write all the {cell,keyspace,shard,type}
	// nodes everywhere we want them
	for location, addrs := range locationAddrsMap {
		wg.Add(1)
		go func(location cellKeyspaceShardType, addrs *topo.EndPoints) {
			log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", location.cell, location.keyspace, location.shard, location.tabletType)
			if err := wr.ts.UpdateEndPoints(location.cell, location.keyspace, location.shard, location.tabletType, addrs); err != nil {
				rec.RecordError(fmt.Errorf("writing endpoints for cell %v shard %v/%v tabletType %v failed: %v", location.cell, location.keyspace, location.shard, location.tabletType, err))
			}
			wg.Done()
		}(location, addrs)
	}

	// Delete any pre-existing paths that were not updated by this process.
	// That's the existingDbTypeLocations - locationAddrsMap
	for dbTypeLocation := range existingDbTypeLocations {
		if _, ok := locationAddrsMap[dbTypeLocation]; !ok {
			cell := dbTypeLocation.cell
			if !topo.InCellList(cell, cells) {
				continue
			}

			wg.Add(1)
			go func(dbTypeLocation cellKeyspaceShardType) {
				log.Infof("removing stale db type from serving graph: %v", dbTypeLocation)
				if err := wr.ts.DeleteSrvTabletType(dbTypeLocation.cell, dbTypeLocation.keyspace, dbTypeLocation.shard, dbTypeLocation.tabletType); err != nil {
					log.Warningf("unable to remove stale db type %v from serving graph: %v", dbTypeLocation, err)
				}
				wg.Done()
			}(dbTypeLocation)
		}
	}

	// wait until we're done with the background stuff to do the rest
	// FIXME(alainjobart) this wouldn't be necessary if UpdateSrvShard
	// below was creating the zookeeper nodes recursively.
	wg.Wait()
	if err := rec.Error(); err != nil {
		return err
	}

	// Update per-shard information per cell-specific serving path.
	//
	// srvShardByPath is a map:
	//   key: {cell,keyspace,shard}
	//   value: topo.SrvShard
	// this will create all the SrvShard objects
	srvShardByPath := make(map[cellKeyspaceShard]*topo.SrvShard)
	for location := range locationAddrsMap {
		// location will be {cell,keyspace,shard,type}
		srvShardPath := cellKeyspaceShard{location.cell, location.keyspace, location.shard}
		srvShard, ok := srvShardByPath[srvShardPath]
		if !ok {
			srvShard = &topo.SrvShard{
				KeyRange:    shardInfo.KeyRange,
				ServedTypes: shardInfo.ServedTypes,
				TabletTypes: make([]topo.TabletType, 0, 2),
			}
			srvShardByPath[srvShardPath] = srvShard
		}
		foundType := false
		for _, t := range srvShard.TabletTypes {
			if t == location.tabletType {
				foundType = true
			}
		}
		if !foundType {
			srvShard.TabletTypes = append(srvShard.TabletTypes, location.tabletType)
		}
	}

	// Save the shard entries
	for cks, srvShard := range srvShardByPath {
		wg.Add(1)
		go func(cks cellKeyspaceShard, srvShard *topo.SrvShard) {
			log.Infof("updating shard serving graph in cell %v for %v/%v", cks.cell, cks.keyspace, cks.shard)
			if err := wr.ts.UpdateSrvShard(cks.cell, cks.keyspace, cks.shard, srvShard); err != nil {
				rec.RecordError(fmt.Errorf("writing serving data in cell %v for %v/%v failed: %v", cks.cell, cks.keyspace, cks.shard, err))
			}
			wg.Done()
		}(cks, srvShard)
	}
	wg.Wait()
	return rec.Error()
}
示例#3
0
// rebuildCellSrvShard computes and writes the serving graph data to a
// single cell
func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, si *topo.ShardInfo, cell string) (err error) {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", si.Keyspace(), si.ShardName(), cell)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Read existing EndPoints node versions, so we know if any
		// changes sneak in after we read the tablets.
		versions, err := getEndPointsVersions(ctx, ts, cell, si.Keyspace(), si.ShardName())

		// Get all tablets in this cell/shard.
		tablets, err := ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), []string{cell})
		if err != nil {
			if err != topo.ErrPartialResult {
				return err
			}
			log.Warningf("Got ErrPartialResult from topo.GetTabletMapForShardByCell(%v), some tablets may not be added properly to serving graph", cell)
		}

		// Build up the serving graph from scratch.
		serving := make(map[pb.TabletType]*pb.EndPoints)
		for _, tablet := range tablets {
			// Only add serving types.
			if !tablet.IsInServingGraph() {
				continue
			}

			// Check the Keyspace and Shard for the tablet are right.
			if tablet.Keyspace != si.Keyspace() || tablet.Shard != si.ShardName() {
				return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, si.Keyspace(), si.ShardName(), tablet.Keyspace, tablet.Shard)
			}

			// Add the tablet to the list.
			endpoints, ok := serving[tablet.Type]
			if !ok {
				endpoints = topo.NewEndPoints()
				serving[tablet.Type] = endpoints
			}
			entry, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
				continue
			}
			endpoints.Entries = append(endpoints.Entries, entry)
		}

		wg := sync.WaitGroup{}
		fatalErrs := concurrency.AllErrorRecorder{}
		retryErrs := concurrency.AllErrorRecorder{}

		// Write nodes that should exist.
		for tabletType, endpoints := range serving {
			wg.Add(1)
			go func(tabletType pb.TabletType, endpoints *pb.EndPoints) {
				defer wg.Done()

				log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, si.Keyspace(), si.ShardName(), tabletType)

				version, ok := versions[tabletType]
				if !ok {
					// This type didn't exist when we first checked.
					// Try to create, but only if it still doesn't exist.
					if err := ts.CreateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints); err != nil {
						log.Warningf("CreateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNodeExists:
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
					return
				}

				// Update only if the version matches.
				if err := ts.UpdateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints, version); err != nil {
					log.Warningf("UpdateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
					switch err {
					case topo.ErrBadVersion, topo.ErrNoNode:
						retryErrs.RecordError(err)
					default:
						fatalErrs.RecordError(err)
					}
				}
			}(tabletType, endpoints)
		}

		// Delete nodes that shouldn't exist.
		for tabletType, version := range versions {
			if _, ok := serving[tabletType]; !ok {
				wg.Add(1)
				go func(tabletType pb.TabletType, version int64) {
					defer wg.Done()
					log.Infof("removing stale db type from serving graph: %v", tabletType)
					if err := ts.DeleteEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, version); err != nil && err != topo.ErrNoNode {
						log.Warningf("DeleteEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNoNode:
							// Someone else deleted it, which is fine.
						case topo.ErrBadVersion:
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
				}(tabletType, version)
			}
		}

		// Update srvShard object
		wg.Add(1)
		go func() {
			defer wg.Done()
			log.Infof("updating shard serving graph in cell %v for %v/%v", cell, si.Keyspace(), si.ShardName())
			if err := UpdateSrvShard(ctx, ts, cell, si); err != nil {
				fatalErrs.RecordError(err)
				log.Warningf("writing serving data in cell %v for %v/%v failed: %v", cell, si.Keyspace(), si.ShardName(), err)
			}
		}()

		wg.Wait()

		// If there are any fatal errors, give up.
		if fatalErrs.HasErrors() {
			return fatalErrs.Error()
		}
		// If there are any retry errors, try again.
		if retryErrs.HasErrors() {
			continue
		}
		// Otherwise, success!
		return nil
	}
}
示例#4
0
// rebuildCellSrvShard computes and writes the serving graph data to a
// single cell
func rebuildCellSrvShard(log logutil.Logger, ts topo.Server, shardInfo *topo.ShardInfo, cell string, tablets map[topo.TabletAlias]*topo.TabletInfo) error {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", shardInfo.Keyspace(), shardInfo.ShardName(), cell)

	// Get all existing db types so they can be removed if nothing
	// had been edited.
	existingTabletTypes, err := ts.GetSrvTabletTypesPerShard(cell, shardInfo.Keyspace(), shardInfo.ShardName())
	if err != nil {
		if err != topo.ErrNoNode {
			return err
		}
	}

	// Update db type addresses in the serving graph
	//
	// locationAddrsMap is a map:
	//   key: tabletType
	//   value: EndPoints (list of server records)
	locationAddrsMap := make(map[topo.TabletType]*topo.EndPoints)
	for _, tablet := range tablets {
		if !tablet.IsInReplicationGraph() {
			// only valid case is a scrapped master in the
			// catastrophic reparent case
			if tablet.Parent.Uid != topo.NO_TABLET {
				log.Warningf("Tablet %v should not be in the replication graph, please investigate (it is being ignored in the rebuild)", tablet.Alias)
			}
			continue
		}

		// Check IsInServingGraph, we don't want to add tablets that
		// are not serving
		if !tablet.IsInServingGraph() {
			continue
		}

		// Check the Keyspace and Shard for the tablet are right
		if tablet.Keyspace != shardInfo.Keyspace() || tablet.Shard != shardInfo.ShardName() {
			return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, shardInfo.Keyspace(), shardInfo.ShardName(), tablet.Keyspace, tablet.Shard)
		}

		// Add the tablet to the list
		addrs, ok := locationAddrsMap[tablet.Type]
		if !ok {
			addrs = topo.NewEndPoints()
			locationAddrsMap[tablet.Type] = addrs
		}
		entry, err := tablet.Tablet.EndPoint()
		if err != nil {
			log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
			continue
		}
		addrs.Entries = append(addrs.Entries, *entry)
	}

	// we're gonna parallelize a lot here:
	// - writing all the tabletTypes records
	// - removing the unused records
	// - writing SrvShard
	rec := concurrency.AllErrorRecorder{}
	wg := sync.WaitGroup{}

	// write all the EndPoints nodes everywhere we want them
	for tabletType, addrs := range locationAddrsMap {
		wg.Add(1)
		go func(tabletType topo.TabletType, addrs *topo.EndPoints) {
			log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType)
			if err := ts.UpdateEndPoints(cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType, addrs); err != nil {
				rec.RecordError(fmt.Errorf("writing endpoints for cell %v shard %v/%v tabletType %v failed: %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType, err))
			}
			wg.Done()
		}(tabletType, addrs)
	}

	// Delete any pre-existing paths that were not updated by this process.
	// That's the existingTabletTypes - locationAddrsMap
	for _, tabletType := range existingTabletTypes {
		if _, ok := locationAddrsMap[tabletType]; !ok {
			wg.Add(1)
			go func(tabletType topo.TabletType) {
				log.Infof("removing stale db type from serving graph: %v", tabletType)
				if err := ts.DeleteEndPoints(cell, shardInfo.Keyspace(), shardInfo.ShardName(), tabletType); err != nil {
					log.Warningf("unable to remove stale db type %v from serving graph: %v", tabletType, err)
				}
				wg.Done()
			}(tabletType)
		}
	}

	// Update srvShard object
	wg.Add(1)
	go func() {
		log.Infof("updating shard serving graph in cell %v for %v/%v", cell, shardInfo.Keyspace(), shardInfo.ShardName())
		srvShard := &topo.SrvShard{
			Name:        shardInfo.ShardName(),
			KeyRange:    shardInfo.KeyRange,
			ServedTypes: shardInfo.ServedTypes,
			MasterCell:  shardInfo.MasterAlias.Cell,
			TabletTypes: make([]topo.TabletType, 0, len(locationAddrsMap)),
		}
		for tabletType := range locationAddrsMap {
			srvShard.TabletTypes = append(srvShard.TabletTypes, tabletType)
		}

		if err := ts.UpdateSrvShard(cell, shardInfo.Keyspace(), shardInfo.ShardName(), srvShard); err != nil {
			rec.RecordError(fmt.Errorf("writing serving data in cell %v for %v/%v failed: %v", cell, shardInfo.Keyspace(), shardInfo.ShardName(), err))
		}
		wg.Done()
	}()

	wg.Wait()
	return rec.Error()
}