func retryUpdateEndpoints(ctx context.Context, ts topo.Server, cell, keyspace, shard string, tabletType pb.TabletType, create bool, updateFunc func(*pb.EndPoints) bool) error { for { select { case <-ctx.Done(): return ctx.Err() default: } // Get or create EndPoints list. endpoints, version, err := ts.GetEndPoints(ctx, cell, keyspace, shard, tabletType) if err == topo.ErrNoNode && create { // Create instead of updating. endpoints = &pb.EndPoints{} if !updateFunc(endpoints) { // Nothing changed. return nil } err = ts.CreateEndPoints(ctx, cell, keyspace, shard, tabletType, endpoints) if err == topo.ErrNodeExists { // Someone else beat us to it. Try again. continue } return err } if err != nil { return err } // We got an existing EndPoints list. Try to update. if !updateFunc(endpoints) { // Nothing changed. return nil } // If there's nothing left, we should delete the list entirely. if len(endpoints.Entries) == 0 { err = ts.DeleteEndPoints(ctx, cell, keyspace, shard, tabletType, version) switch err { case topo.ErrNoNode: // Someone beat us to it, which is fine. return nil case topo.ErrBadVersion: // Someone else updated the list. Try again. continue } return err } err = ts.UpdateEndPoints(ctx, cell, keyspace, shard, tabletType, endpoints, version) if err == topo.ErrBadVersion || (err == topo.ErrNoNode && create) { // Someone else updated or deleted the list in the meantime. Try again. continue } return err } }
// rebuildCellSrvShard computes and writes the serving graph data to a // single cell func rebuildCellSrvShard(ctx context.Context, log logutil.Logger, ts topo.Server, si *topo.ShardInfo, cell string) (err error) { log.Infof("rebuildCellSrvShard %v/%v in cell %v", si.Keyspace(), si.ShardName(), cell) for { select { case <-ctx.Done(): return ctx.Err() default: } // Read existing EndPoints node versions, so we know if any // changes sneak in after we read the tablets. versions, err := getEndPointsVersions(ctx, ts, cell, si.Keyspace(), si.ShardName()) // Get all tablets in this cell/shard. tablets, err := ts.GetTabletMapForShardByCell(ctx, si.Keyspace(), si.ShardName(), []string{cell}) if err != nil { if err != topo.ErrPartialResult { return err } log.Warningf("Got ErrPartialResult from topo.GetTabletMapForShardByCell(%v), some tablets may not be added properly to serving graph", cell) } // Build up the serving graph from scratch. serving := make(map[pb.TabletType]*pb.EndPoints) for _, tablet := range tablets { // Only add serving types. if !tablet.IsInServingGraph() { continue } // Check the Keyspace and Shard for the tablet are right. if tablet.Keyspace != si.Keyspace() || tablet.Shard != si.ShardName() { return fmt.Errorf("CRITICAL: tablet %v is in replication graph for shard %v/%v but belongs to shard %v:%v", tablet.Alias, si.Keyspace(), si.ShardName(), tablet.Keyspace, tablet.Shard) } // Add the tablet to the list. endpoints, ok := serving[tablet.Type] if !ok { endpoints = topo.NewEndPoints() serving[tablet.Type] = endpoints } entry, err := topo.TabletEndPoint(tablet.Tablet) if err != nil { log.Warningf("EndPointForTablet failed for tablet %v: %v", tablet.Alias, err) continue } endpoints.Entries = append(endpoints.Entries, entry) } wg := sync.WaitGroup{} fatalErrs := concurrency.AllErrorRecorder{} retryErrs := concurrency.AllErrorRecorder{} // Write nodes that should exist. for tabletType, endpoints := range serving { wg.Add(1) go func(tabletType pb.TabletType, endpoints *pb.EndPoints) { defer wg.Done() log.Infof("saving serving graph for cell %v shard %v/%v tabletType %v", cell, si.Keyspace(), si.ShardName(), tabletType) version, ok := versions[tabletType] if !ok { // This type didn't exist when we first checked. // Try to create, but only if it still doesn't exist. if err := ts.CreateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints); err != nil { log.Warningf("CreateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err) switch err { case topo.ErrNodeExists: retryErrs.RecordError(err) default: fatalErrs.RecordError(err) } } return } // Update only if the version matches. if err := ts.UpdateEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, endpoints, version); err != nil { log.Warningf("UpdateEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err) switch err { case topo.ErrBadVersion, topo.ErrNoNode: retryErrs.RecordError(err) default: fatalErrs.RecordError(err) } } }(tabletType, endpoints) } // Delete nodes that shouldn't exist. for tabletType, version := range versions { if _, ok := serving[tabletType]; !ok { wg.Add(1) go func(tabletType pb.TabletType, version int64) { defer wg.Done() log.Infof("removing stale db type from serving graph: %v", tabletType) if err := ts.DeleteEndPoints(ctx, cell, si.Keyspace(), si.ShardName(), tabletType, version); err != nil && err != topo.ErrNoNode { log.Warningf("DeleteEndPoints(%v, %v, %v) failed during rebuild: %v", cell, si, tabletType, err) switch err { case topo.ErrNoNode: // Someone else deleted it, which is fine. case topo.ErrBadVersion: retryErrs.RecordError(err) default: fatalErrs.RecordError(err) } } }(tabletType, version) } } // Update srvShard object wg.Add(1) go func() { defer wg.Done() log.Infof("updating shard serving graph in cell %v for %v/%v", cell, si.Keyspace(), si.ShardName()) if err := UpdateSrvShard(ctx, ts, cell, si); err != nil { fatalErrs.RecordError(err) log.Warningf("writing serving data in cell %v for %v/%v failed: %v", cell, si.Keyspace(), si.ShardName(), err) } }() wg.Wait() // If there are any fatal errors, give up. if fatalErrs.HasErrors() { return fatalErrs.Error() } // If there are any retry errors, try again. if retryErrs.HasErrors() { continue } // Otherwise, success! return nil } }
// CheckServingGraph makes sure the serving graph functions work properly. func CheckServingGraph(ctx context.Context, t *testing.T, ts topo.Server) { cell := getLocalCell(ctx, t, ts) // test individual cell/keyspace/shard/type entries if _, err := ts.GetSrvTabletTypesPerShard(ctx, cell, "test_keyspace", "-10"); err != topo.ErrNoNode { t.Errorf("GetSrvTabletTypesPerShard(invalid): %v", err) } if _, _, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER); err != topo.ErrNoNode { t.Errorf("GetEndPoints(invalid): %v", err) } endPoints := &pb.EndPoints{ Entries: []*pb.EndPoint{ &pb.EndPoint{ Uid: 1, Host: "host1", PortMap: map[string]int32{ "vt": 1234, "mysql": 1235, "grpc": 1236, }, }, }, } if err := ts.CreateEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints); err != nil { t.Fatalf("CreateEndPoints(master): %v", err) } // Try to create again. if err := ts.CreateEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints); err != topo.ErrNodeExists { t.Fatalf("CreateEndPoints(master): err = %v, want topo.ErrNodeExists", err) } // Get version. _, version, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER) if err != nil { t.Fatalf("GetEndPoints(master): %v", err) } // Make a change. tmp := endPoints.Entries[0].Uid endPoints.Entries[0].Uid = tmp + 1 if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil { t.Fatalf("UpdateEndPoints(master): %v", err) } endPoints.Entries[0].Uid = tmp // Try to delete with the wrong version. if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, version); err != topo.ErrBadVersion { t.Fatalf("DeleteEndPoints: err = %v, want topo.ErrBadVersion", err) } // Delete with the correct version. _, version, err = ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER) if err != nil { t.Fatalf("GetEndPoints(master): %v", err) } if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, version); err != nil { t.Fatalf("DeleteEndPoints: %v", err) } // Recreate it with an unconditional update. if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil { t.Fatalf("UpdateEndPoints(master): %v", err) } if types, err := ts.GetSrvTabletTypesPerShard(ctx, cell, "test_keyspace", "-10"); err != nil || len(types) != 1 || types[0] != topo.TYPE_MASTER { t.Errorf("GetSrvTabletTypesPerShard(1): %v %v", err, types) } // Delete it unconditionally. if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, -1); err != nil { t.Fatalf("DeleteEndPoints: %v", err) } // Delete the SrvShard. if err := ts.DeleteSrvShard(ctx, cell, "test_keyspace", "-10"); err != nil { t.Fatalf("DeleteSrvShard: %v", err) } if _, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "-10"); err != topo.ErrNoNode { t.Errorf("GetSrvShard(deleted) got %v, want ErrNoNode", err) } // Re-add endpoints. if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil { t.Fatalf("UpdateEndPoints(master): %v", err) } addrs, version, err := ts.GetEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER) if err != nil { t.Errorf("GetEndPoints: %v", err) } if len(addrs.Entries) != 1 || addrs.Entries[0].Uid != 1 { t.Errorf("GetEndPoints(1): %v", addrs) } if pm := addrs.Entries[0].PortMap; pm["vt"] != 1234 || pm["mysql"] != 1235 || pm["grpc"] != 1236 { t.Errorf("GetSrcTabletType(1).PortMap: want %v, got %v", endPoints.Entries[0].PortMap, pm) } // Update with the wrong version. if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, version+1); err != topo.ErrBadVersion { t.Fatalf("UpdateEndPoints(master): err = %v, want topo.ErrBadVersion", err) } // Update with the right version. if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, version); err != nil { t.Fatalf("UpdateEndPoints(master): %v", err) } // Update existing EndPoints unconditionally. if err := topo.UpdateEndPoints(ctx, ts, cell, "test_keyspace", "-10", topo.TYPE_MASTER, endPoints, -1); err != nil { t.Fatalf("UpdateEndPoints(master): %v", err) } if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_REPLICA, -1); err != topo.ErrNoNode { t.Errorf("DeleteEndPoints(unknown): %v", err) } if err := ts.DeleteEndPoints(ctx, cell, "test_keyspace", "-10", topo.TYPE_MASTER, -1); err != nil { t.Errorf("DeleteEndPoints(master): %v", err) } // test cell/keyspace/shard entries (SrvShard) srvShard := &pb.SrvShard{ Name: "-10", KeyRange: newKeyRange3("-10"), MasterCell: "test", } if err := ts.UpdateSrvShard(ctx, cell, "test_keyspace", "-10", srvShard); err != nil { t.Fatalf("UpdateSrvShard(1): %v", err) } if _, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "666"); err != topo.ErrNoNode { t.Errorf("GetSrvShard(invalid): %v", err) } if s, err := ts.GetSrvShard(ctx, cell, "test_keyspace", "-10"); err != nil || s.Name != "-10" || !key.KeyRangeEqual(s.KeyRange, newKeyRange3("-10")) || s.MasterCell != "test" { t.Errorf("GetSrvShard(valid): %v", err) } // test cell/keyspace entries (SrvKeyspace) srvKeyspace := topo.SrvKeyspace{ Partitions: map[topo.TabletType]*topo.KeyspacePartition{ topo.TYPE_MASTER: &topo.KeyspacePartition{ ShardReferences: []topo.ShardReference{ topo.ShardReference{ Name: "-80", KeyRange: newKeyRange("-80"), }, }, }, }, ShardingColumnName: "video_id", ShardingColumnType: key.KIT_UINT64, ServedFrom: map[topo.TabletType]string{ topo.TYPE_REPLICA: "other_keyspace", }, } if err := ts.UpdateSrvKeyspace(ctx, cell, "test_keyspace", &srvKeyspace); err != nil { t.Errorf("UpdateSrvKeyspace(1): %v", err) } if _, err := ts.GetSrvKeyspace(ctx, cell, "test_keyspace666"); err != topo.ErrNoNode { t.Errorf("GetSrvKeyspace(invalid): %v", err) } if k, err := ts.GetSrvKeyspace(ctx, cell, "test_keyspace"); err != nil || len(k.Partitions) != 1 || len(k.Partitions[topo.TYPE_MASTER].ShardReferences) != 1 || k.Partitions[topo.TYPE_MASTER].ShardReferences[0].Name != "-80" || k.Partitions[topo.TYPE_MASTER].ShardReferences[0].KeyRange != newKeyRange("-80") || k.ShardingColumnName != "video_id" || k.ShardingColumnType != key.KIT_UINT64 || k.ServedFrom[topo.TYPE_REPLICA] != "other_keyspace" { t.Errorf("GetSrvKeyspace(valid): %v %v", err, k) } if k, err := ts.GetSrvKeyspaceNames(ctx, cell); err != nil || len(k) != 1 || k[0] != "test_keyspace" { t.Errorf("GetSrvKeyspaceNames(): %v", err) } // check that updating a SrvKeyspace out of the blue works if err := ts.UpdateSrvKeyspace(ctx, cell, "unknown_keyspace_so_far", &srvKeyspace); err != nil { t.Fatalf("UpdateSrvKeyspace(2): %v", err) } if k, err := ts.GetSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != nil || len(k.Partitions) != 1 || len(k.Partitions[topo.TYPE_MASTER].ShardReferences) != 1 || k.Partitions[topo.TYPE_MASTER].ShardReferences[0].Name != "-80" || k.Partitions[topo.TYPE_MASTER].ShardReferences[0].KeyRange != newKeyRange("-80") || k.ShardingColumnName != "video_id" || k.ShardingColumnType != key.KIT_UINT64 || k.ServedFrom[topo.TYPE_REPLICA] != "other_keyspace" { t.Errorf("GetSrvKeyspace(out of the blue): %v %v", err, *k) } // Delete the SrvKeyspace. if err := ts.DeleteSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != nil { t.Fatalf("DeleteSrvShard: %v", err) } if _, err := ts.GetSrvKeyspace(ctx, cell, "unknown_keyspace_so_far"); err != topo.ErrNoNode { t.Errorf("GetSrvKeyspace(deleted) got %v, want ErrNoNode", err) } }