Ejemplo n.º 1
0
// retryUpdateEndpoints runs an optimistic read-modify-write loop on the
// EndPoints list for the given cell/keyspace/shard/tabletType.
//
// On each attempt the current list is read and handed to updateFunc, which
// edits it in place and reports whether it changed anything. If updateFunc
// reports no change, the function returns nil without writing. Otherwise the
// result is written back conditionally on the version that was read; if the
// list is left with no entries it is deleted instead of being written.
//
// When create is true, a missing list is created (starting from an empty
// EndPoints) instead of being reported as topo.ErrNoNode.
//
// The loop retries whenever a concurrent writer is detected and stops with
// ctx.Err() once ctx is done. Any other error is returned as-is.
func retryUpdateEndpoints(ctx context.Context, ts topo.Server, cell, keyspace, shard string, tabletType pb.TabletType, create bool, updateFunc func(*pb.EndPoints) bool) error {
	for {
		// Give up as soon as the context has been cancelled or has expired.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Fetch the current list together with the version we will write against.
		current, version, err := ts.GetEndPoints(ctx, cell, keyspace, shard, tabletType)
		switch {
		case err == topo.ErrNoNode && create:
			// No list yet: build one from scratch and create it.
			fresh := &pb.EndPoints{}
			if changed := updateFunc(fresh); !changed {
				// Nothing to write.
				return nil
			}
			createErr := ts.CreateEndPoints(ctx, cell, keyspace, shard, tabletType, fresh)
			if createErr != topo.ErrNodeExists {
				return createErr
			}
			// A concurrent writer created the list first; start over.
			continue
		case err != nil:
			return err
		}

		// The list exists; apply the change to it.
		if changed := updateFunc(current); !changed {
			// Nothing to write.
			return nil
		}

		// An empty list is removed rather than stored.
		if len(current.Entries) == 0 {
			err = ts.DeleteEndPoints(ctx, cell, keyspace, shard, tabletType, version)
			if err == topo.ErrBadVersion {
				// The list changed under us; start over.
				continue
			}
			if err == topo.ErrNoNode {
				// Already gone, which is the outcome we wanted.
				return nil
			}
			return err
		}

		err = ts.UpdateEndPoints(ctx, cell, keyspace, shard, tabletType, current, version)
		// Retry if the list was modified, or deleted while we are allowed to
		// re-create it, between our read and our write.
		retry := err == topo.ErrBadVersion || (err == topo.ErrNoNode && create)
		if !retry {
			return err
		}
	}
}
Ejemplo n.º 2
0
// rebuildCellSrvShard computes the serving graph data for shard si and writes
// it to a single cell.
//
// Each attempt reads the versions of the existing EndPoints nodes, reads the
// tablets of the shard in this cell, and rebuilds the per-tablet-type
// EndPoints lists from scratch. It then concurrently:
//   - creates or updates every list that should exist,
//   - deletes every list that existed but no longer has serving tablets,
//   - updates the SrvShard record.
//
// EndPoints writes are conditional on the versions read at the start of the
// attempt. If any of them loses a race with another writer, the whole attempt
// is repeated. Any other write error aborts the rebuild and is returned. The
// loop also stops with ctx.Err() once ctx is done.
func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, si *topo.ShardInfo, cell string) (err error) {
	log.Infof("rebuildCellSrvShard %v/%v in cell %v", si.Keyspace(), si.ShardName(), cell)

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Read existing EndPoints node versions, so we know if any
		// changes sneak in after we read the tablets.
		versions, err := getEndPointsVersions(ctx, ts, cell, si.Keyspace(), si.ShardName())
		if err != nil && err != topo.ErrNoNode {
			// Without reliable versions every write would take the create
			// path and stale lists would never be deleted, so give up.
			// NOTE(review): ErrNoNode is tolerated on the assumption that it
			// can only mean "no EndPoints exist yet" — confirm against
			// getEndPointsVersions.
			return err
		}

		// Get all tablets in this cell/shard.
		tablets, err := ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), []string{cell})
		if err != nil {
			if err != topo.ErrPartialResult {
				return err
			}
			// A partial result is still usable; rebuild with what we got.
			log.Warningf("Got ErrPartialResult from topo.GetTabletMapForShardByCell(%v), some tablets may not be added properly to serving graph", cell)
		}

		// Build up the serving graph from scratch.
		serving := make(map[pb.TabletType]*pb.EndPoints)
		for _, tablet := range tablets {
			// Only add serving types.
			if !tablet.IsInServingGraph() {
				continue
			}

			// Check the Keyspace and Shard for the tablet are right.
			if tablet.Keyspace != si.Keyspace() || tablet.Shard != si.ShardName() {
				return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, si.Keyspace(), si.ShardName(), tablet.Keyspace, tablet.Shard)
			}

			// Add the tablet to the list. The list for a type is registered
			// even if the tablet's own entry cannot be built below.
			endpoints, ok := serving[tablet.Type]
			if !ok {
				endpoints = topo.NewEndPoints()
				serving[tablet.Type] = endpoints
			}
			entry, err := topo.TabletEndPoint(tablet.Tablet)
			if err != nil {
				log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err)
				continue
			}
			endpoints.Entries = append(endpoints.Entries, entry)
		}

		var wg sync.WaitGroup
		// fatalErrs abort the rebuild; retryErrs trigger another attempt.
		fatalErrs := concurrency.AllErrorRecorder{}
		retryErrs := concurrency.AllErrorRecorder{}

		// Write nodes that should exist.
		for tabletType, endpoints := range serving {
			wg.Add(1)
			go func(tabletType pb.TabletType, endpoints *pb.EndPoints) {
				defer wg.Done()

				log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, si.Keyspace(), si.ShardName(), tabletType)

				version, ok := versions[tabletType]
				if !ok {
					// This type didn't exist when we first checked.
					// Try to create, but only if it still doesn't exist.
					if err := ts.CreateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints); err != nil {
						log.Warningf("CreateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
						switch err {
						case topo.ErrNodeExists:
							// Someone created it in the meantime.
							retryErrs.RecordError(err)
						default:
							fatalErrs.RecordError(err)
						}
					}
					return
				}

				// Update only if the version matches.
				if err := ts.UpdateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints, version); err != nil {
					log.Warningf("UpdateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
					switch err {
					case topo.ErrBadVersion, topo.ErrNoNode:
						// Someone updated or deleted it in the meantime.
						retryErrs.RecordError(err)
					default:
						fatalErrs.RecordError(err)
					}
				}
			}(tabletType, endpoints)
		}

		// Delete nodes that shouldn't exist.
		for tabletType, version := range versions {
			if _, ok := serving[tabletType]; ok {
				continue
			}
			wg.Add(1)
			go func(tabletType pb.TabletType, version int64) {
				defer wg.Done()
				log.Infof("removing stale db type from serving graph: %v", tabletType)
				err := ts.DeleteEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, version)
				if err == nil || err == topo.ErrNoNode {
					// Deleted, or someone else deleted it, which is fine.
					return
				}
				log.Warningf("DeleteEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err)
				switch err {
				case topo.ErrBadVersion:
					// Someone updated it in the meantime.
					retryErrs.RecordError(err)
				default:
					fatalErrs.RecordError(err)
				}
			}(tabletType, version)
		}

		// Update srvShard object
		wg.Add(1)
		go func() {
			defer wg.Done()
			log.Infof("updating shard serving graph in cell %v for %v/%v", cell, si.Keyspace(), si.ShardName())
			if err := UpdateSrvShard(ctx, ts, cell, si); err != nil {
				fatalErrs.RecordError(err)
				log.Warningf("writing serving data in cell %v for %v/%v failed: %v", cell, si.Keyspace(), si.ShardName(), err)
			}
		}()

		wg.Wait()

		// If there are any fatal errors, give up.
		if fatalErrs.HasErrors() {
			return fatalErrs.Error()
		}
		// If there are any retry errors, try again.
		if retryErrs.HasErrors() {
			continue
		}
		// Otherwise, success!
		return nil
	}
}
Ejemplo n.º 3
0
// CheckServingGraph makes sure the serving graph functions work properly.
//
// It is a shared conformance check meant to be run against a topo.Server
// implementation. Using the local cell, it exercises in order:
//   - EndPoints: create, duplicate create, versioned and unconditional
//     update and delete, and reads of missing nodes;
//   - SrvShard: update, get and delete;
//   - SrvKeyspace: update, get, list names and delete.
//
// Failures that would make later steps meaningless, or that would make the
// test dereference a missing result, are reported with t.Fatalf; independent
// mismatches are reported with t.Errorf so the remaining checks still run.
func CheckServingGraph(ctx context.Context, t *testing.T, ts topo.Server) {
	cell := getLocalCell(ctx, t, ts)

	// test individual cell/keyspace/shard/type entries
	if _, err := ts.GetSrvTabletTypesPerShard(ctx, cell, "test_keyspace", "-10"); err != topo.ErrNoNode {
		t.Errorf("GetSrvTabletTypesPerShard(invalid): %v", err)
	}
	if _, _, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER); err != topo.ErrNoNode {
		t.Errorf("GetEndPoints(invalid): %v", err)
	}

	endPoints := &pb.EndPoints{
		Entries: []*pb.EndPoint{
			&pb.EndPoint{
				Uid:  1,
				Host: "host1",
				PortMap: map[string]int32{
					"vt":    1234,
					"mysql": 1235,
					"grpc":  1236,
				},
			},
		},
	}

	if err := ts.CreateEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints); err != nil {
		t.Fatalf("CreateEndPoints(master): %v", err)
	}
	// Try to create again.
	if err := ts.CreateEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints); err != topo.ErrNodeExists {
		t.Fatalf("CreateEndPoints(master): err = %v, want topo.ErrNodeExists", err)
	}

	// Get version.
	_, version, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints(master): %v", err)
	}
	// Make a change, so that the version read above becomes stale.
	tmp := endPoints.Entries[0].Uid
	endPoints.Entries[0].Uid = tmp + 1
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil {
		t.Fatalf("UpdateEndPoints(master): %v", err)
	}
	// Restore the in-memory copy; later checks expect Uid 1.
	endPoints.Entries[0].Uid = tmp
	// Try to delete with the wrong version.
	if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, version); err != topo.ErrBadVersion {
		t.Fatalf("DeleteEndPoints: err = %v, want topo.ErrBadVersion", err)
	}
	// Delete with the correct version.
	_, version, err = ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints(master): %v", err)
	}
	if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, version); err != nil {
		t.Fatalf("DeleteEndPoints: %v", err)
	}
	// Recreate it with an unconditional update.
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil {
		t.Fatalf("UpdateEndPoints(master): %v", err)
	}

	if types, err := ts.GetSrvTabletTypesPerShard(ctx, cell, "test_keyspace", "-10"); err != nil || len(types) != 1 || types[0] != topo.TYPE_MASTER {
		t.Errorf("GetSrvTabletTypesPerShard(1): %v %v", err, types)
	}

	// Delete it unconditionally.
	if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, -1); err != nil {
		t.Fatalf("DeleteEndPoints: %v", err)
	}

	// Delete the SrvShard.
	if err := ts.DeleteSrvShard(ctx, cell, "test_keyspace", "-10"); err != nil {
		t.Fatalf("DeleteSrvShard: %v", err)
	}
	if _, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "-10"); err != topo.ErrNoNode {
		t.Errorf("GetSrvShard(deleted) got %v, want ErrNoNode", err)
	}

	// Re-add endpoints.
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil {
		t.Fatalf("UpdateEndPoints(master): %v", err)
	}

	// The checks below dereference addrs and its first entry, so a failed
	// read or an unexpected entry count must stop the test here instead of
	// panicking further down.
	addrs, version, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER)
	if err != nil {
		t.Fatalf("GetEndPoints: %v", err)
	}
	if len(addrs.Entries) != 1 || addrs.Entries[0].Uid != 1 {
		t.Fatalf("GetEndPoints(1): %v", addrs)
	}
	if pm := addrs.Entries[0].PortMap; pm["vt"] != 1234 || pm["mysql"] != 1235 || pm["grpc"] != 1236 {
		t.Errorf("GetEndPoints(1).PortMap: want %v, got %v", endPoints.Entries[0].PortMap, pm)
	}

	// Update with the wrong version.
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, version+1); err != topo.ErrBadVersion {
		t.Fatalf("UpdateEndPoints(master): err = %v, want topo.ErrBadVersion", err)
	}
	// Update with the right version.
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, version); err != nil {
		t.Fatalf("UpdateEndPoints(master): %v", err)
	}
	// Update existing EndPoints unconditionally.
	if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil {
		t.Fatalf("UpdateEndPoints(master): %v", err)
	}

	// Deleting a type that was never written must report ErrNoNode.
	if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_REPLICA, -1); err != topo.ErrNoNode {
		t.Errorf("DeleteEndPoints(unknown): %v", err)
	}
	if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, -1); err != nil {
		t.Errorf("DeleteEndPoints(master): %v", err)
	}

	// test cell/keyspace/shard entries (SrvShard)
	srvShard := &pb.SrvShard{
		Name:       "-10",
		KeyRange:   newKeyRange3("-10"),
		MasterCell: "test",
	}
	if err := ts.UpdateSrvShard(ctx, cell, "test_keyspace", "-10", srvShard); err != nil {
		t.Fatalf("UpdateSrvShard(1): %v", err)
	}
	if _, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "666"); err != topo.ErrNoNode {
		t.Errorf("GetSrvShard(invalid): %v", err)
	}
	if s, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "-10"); err != nil ||
		s.Name != "-10" ||
		!key.KeyRangeEqual(s.KeyRange, newKeyRange3("-10")) ||
		s.MasterCell != "test" {
		t.Errorf("GetSrvShard(valid): %v %v", err, s)
	}

	// test cell/keyspace entries (SrvKeyspace)
	srvKeyspace := topo.SrvKeyspace{
		Partitions: map[topo.TabletType]*topo.KeyspacePartition{
			topo.TYPE_MASTER: &topo.KeyspacePartition{
				ShardReferences: []topo.ShardReference{
					topo.ShardReference{
						Name:     "-80",
						KeyRange: newKeyRange("-80"),
					},
				},
			},
		},
		ShardingColumnName: "video_id",
		ShardingColumnType: key.KIT_UINT64,
		ServedFrom: map[topo.TabletType]string{
			topo.TYPE_REPLICA: "other_keyspace",
		},
	}
	if err := ts.UpdateSrvKeyspace(ctx, cell, "test_keyspace", &srvKeyspace); err != nil {
		t.Errorf("UpdateSrvKeyspace(1): %v", err)
	}
	if _, err := ts.GetSrvKeyspace(ctx, cell, "test_keyspace666"); err != topo.ErrNoNode {
		t.Errorf("GetSrvKeyspace(invalid): %v", err)
	}
	if k, err := ts.GetSrvKeyspace(ctx, cell, "test_keyspace"); err != nil ||
		len(k.Partitions) != 1 ||
		len(k.Partitions[topo.TYPE_MASTER].ShardReferences) != 1 ||
		k.Partitions[topo.TYPE_MASTER].ShardReferences[0].Name != "-80" ||
		k.Partitions[topo.TYPE_MASTER].ShardReferences[0].KeyRange != newKeyRange("-80") ||
		k.ShardingColumnName != "video_id" ||
		k.ShardingColumnType != key.KIT_UINT64 ||
		k.ServedFrom[topo.TYPE_REPLICA] != "other_keyspace" {
		t.Errorf("GetSrvKeyspace(valid): %v %v", err, k)
	}
	if k, err := ts.GetSrvKeyspaceNames(ctx, cell); err != nil || len(k) != 1 || k[0] != "test_keyspace" {
		t.Errorf("GetSrvKeyspaceNames(): %v %v", err, k)
	}

	// check that updating a SrvKeyspace out of the blue works
	if err := ts.UpdateSrvKeyspace(ctx, cell, "unknown_keyspace_so_far", &srvKeyspace); err != nil {
		t.Fatalf("UpdateSrvKeyspace(2): %v", err)
	}
	if k, err := ts.GetSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != nil ||
		len(k.Partitions) != 1 ||
		len(k.Partitions[topo.TYPE_MASTER].ShardReferences) != 1 ||
		k.Partitions[topo.TYPE_MASTER].ShardReferences[0].Name != "-80" ||
		k.Partitions[topo.TYPE_MASTER].ShardReferences[0].KeyRange != newKeyRange("-80") ||
		k.ShardingColumnName != "video_id" ||
		k.ShardingColumnType != key.KIT_UINT64 ||
		k.ServedFrom[topo.TYPE_REPLICA] != "other_keyspace" {
		// Print k itself rather than *k: k may be nil when err is set.
		t.Errorf("GetSrvKeyspace(out of the blue): %v %v", err, k)
	}

	// Delete the SrvKeyspace.
	if err := ts.DeleteSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != nil {
		t.Fatalf("DeleteSrvKeyspace: %v", err)
	}
	if _, err := ts.GetSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != topo.ErrNoNode {
		t.Errorf("GetSrvKeyspace(deleted) got %v, want ErrNoNode", err)
	}
}