// NewStorePool creates a StorePool and registers the store updating callback // with gossip. func NewStorePool( g *gossip.Gossip, clock *hlc.Clock, rpcContext *rpc.Context, reservationsEnabled bool, timeUntilStoreDead time.Duration, stopper *stop.Stopper, ) *StorePool { sp := &StorePool{ clock: clock, timeUntilStoreDead: timeUntilStoreDead, rpcContext: rpcContext, reservationsEnabled: reservationsEnabled, failedReservationsTimeout: envutil.EnvOrDefaultDuration("failed_reservation_timeout", defaultFailedReservationsTimeout), declinedReservationsTimeout: envutil.EnvOrDefaultDuration("declined_reservation_timeout", defaultDeclinedReservationsTimeout), reserveRPCTimeout: envutil.EnvOrDefaultDuration("reserve_rpc_timeout", defaultReserveRPCTimeout), resolver: GossipAddressResolver(g), } sp.mu.stores = make(map[roachpb.StoreID]*storeDetail) heap.Init(&sp.mu.queue) storeRegex := gossip.MakePrefixPattern(gossip.KeyStorePrefix) g.RegisterCallback(storeRegex, sp.storeGossipUpdate) deadReplicasRegex := gossip.MakePrefixPattern(gossip.KeyDeadReplicasPrefix) g.RegisterCallback(deadReplicasRegex, sp.deadReplicasGossipUpdate) sp.start(stopper) return sp }
// waitForStores waits for all of the store descriptors to be gossiped. Servers // other than the first "bootstrap" their stores asynchronously, but we'd like // to wait for all of the stores to be initialized before returning the // TestCluster. func (tc *TestCluster) waitForStores(t testing.TB) { // Register a gossip callback for the store descriptors. g := tc.Servers[0].Gossip() var storesMu sync.Mutex stores := map[roachpb.StoreID]struct{}{} storesDone := make(chan error) unregister := g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, content roachpb.Value) { var desc roachpb.StoreDescriptor if err := content.GetProto(&desc); err != nil { storesDone <- err return } storesMu.Lock() stores[desc.StoreID] = struct{}{} if len(stores) == len(tc.Servers) { close(storesDone) } storesMu.Unlock() }) defer unregister() // Wait for the store descriptors to be gossiped. if err := <-storesDone; err != nil { t.Fatal(err) } }
// TestStoreRangeReplicate verifies that the replication queue will notice // under-replicated ranges and replicate them. func TestStoreRangeReplicate(t *testing.T) { defer leaktest.AfterTest(t) mtc := multiTestContext{} mtc.Start(t, 3) defer mtc.Stop() // Initialize the gossip network. var wg sync.WaitGroup wg.Add(len(mtc.stores)) key := gossip.MakePrefixPattern(gossip.KeyCapacityPrefix) mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ bool) { wg.Done() }) for _, s := range mtc.stores { s.GossipCapacity() } wg.Wait() // Once we know our peers, trigger a scan. mtc.stores[0].ForceReplicationScan(t) // The range should become available on every node. if err := util.IsTrueWithin(func() bool { for _, s := range mtc.stores { r := s.LookupRange(proto.Key("a"), proto.Key("b")) if r == nil { return false } } return true }, 1*time.Second); err != nil { t.Fatal(err) } }
// TestStoreRangeUpReplicate verifies that the replication queue will notice // under-replicated ranges and replicate them. func TestStoreRangeUpReplicate(t *testing.T) { defer leaktest.AfterTest(t)() mtc := startMultiTestContext(t, 3) defer mtc.Stop() // Initialize the gossip network. var wg sync.WaitGroup wg.Add(len(mtc.stores)) key := gossip.MakePrefixPattern(gossip.KeyStorePrefix) mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ roachpb.Value) { wg.Done() }) for _, s := range mtc.stores { s.GossipStore() } wg.Wait() // Once we know our peers, trigger a scan. mtc.stores[0].ForceReplicationScanAndProcess() // The range should become available on every node. util.SucceedsSoon(t, func() error { for _, s := range mtc.stores { r := s.LookupReplica(roachpb.RKey("a"), roachpb.RKey("b")) if r == nil { return util.Errorf("expected replica for keys \"a\" - \"b\"") } } return nil }) }
// TestStoreRangeUpReplicate verifies that the replication queue will notice // under-replicated ranges and replicate them. func TestStoreRangeUpReplicate(t *testing.T) { defer leaktest.AfterTest(t) mtc := startMultiTestContext(t, 3) defer mtc.Stop() // Initialize the gossip network. var wg sync.WaitGroup wg.Add(len(mtc.stores)) key := gossip.MakePrefixPattern(gossip.KeyStorePrefix) mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ roachpb.Value) { wg.Done() }) for _, s := range mtc.stores { s.GossipStore() } wg.Wait() // Once we know our peers, trigger a scan. mtc.stores[0].ForceReplicationScanAndProcess() // The range should become available on every node. if err := util.IsTrueWithin(func() bool { for _, s := range mtc.stores { r := s.LookupReplica(roachpb.RKey("a"), roachpb.RKey("b")) if r == nil { return false } } return true }, replicationTimeout); err != nil { t.Fatal(err) } }
func newStoreGossiper(g *gossip.Gossip) *storeGossiper { sg := &storeGossiper{ g: g, } g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ []byte) { sg.wg.Done() }) return sg }
// newReplicateQueue returns a new instance of replicateQueue. func newReplicateQueue(store *Store, g *gossip.Gossip, allocator Allocator, clock *hlc.Clock, options AllocatorOptions) *replicateQueue { rq := &replicateQueue{ allocator: allocator, clock: clock, updateChan: make(chan struct{}, 1), } rq.baseQueue = makeBaseQueue("replicate", rq, store, g, queueConfig{ maxSize: replicateQueueMaxSize, needsLease: true, acceptsUnsplitRanges: false, }) if g != nil { // gossip is nil for some unittests // Register a gossip callback to signal queue that replicas in // purgatory might be retried due to new store gossip. g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ roachpb.Value) { select { case rq.updateChan <- struct{}{}: default: } }) } return rq }
// newAllocator creates a new allocator using the specified gossip. func newAllocator(g *gossip.Gossip) *allocator { a := &allocator{ gossip: g, randGen: rand.New(rand.NewSource(rand.Int63())), } // Callback triggers on any capacity gossip updates. if a.gossip != nil { capacityRegex := gossip.MakePrefixPattern(gossip.KeyCapacityPrefix) a.gossip.RegisterCallback(capacityRegex, a.capacityGossipUpdate) } return a }
// newStoreGossiper creates a store gossiper for use by tests. It adds the // callback to gossip. func newStoreGossiper(g *gossip.Gossip) *storeGossiper { sg := &storeGossiper{ g: g, storeKeyMap: make(map[string]struct{}), } g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(key string, _ []byte) { sg.mu.Lock() defer sg.mu.Unlock() if _, ok := sg.storeKeyMap[key]; ok { sg.wg.Done() } }) return sg }
// NewStorePool creates a StorePool and registers the store updating callback // with gossip. func NewStorePool(g *gossip.Gossip, timeUntilStoreDead time.Duration, stopper *stop.Stopper) *StorePool { sp := &StorePool{ timeUntilStoreDead: timeUntilStoreDead, stores: make(map[roachpb.StoreID]*storeDetail), } heap.Init(&sp.queue) storeRegex := gossip.MakePrefixPattern(gossip.KeyStorePrefix) g.RegisterCallback(storeRegex, sp.storeGossipUpdate) sp.start(stopper) return sp }
// NewStoreGossiper creates a store gossiper for use by tests. It adds the // callback to gossip. func NewStoreGossiper(g *gossip.Gossip) *StoreGossiper { sg := &StoreGossiper{ g: g, storeKeyMap: make(map[string]struct{}), } sg.cond = sync.NewCond(&sg.mu) g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(key string, _ roachpb.Value) { sg.mu.Lock() defer sg.mu.Unlock() delete(sg.storeKeyMap, key) sg.cond.Broadcast() }) return sg }
func gossipStores(g *gossip.Gossip, stores []*proto.StoreDescriptor, t *testing.T) { var wg sync.WaitGroup wg.Add(len(stores)) g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyCapacityPrefix), func(_ string, _ bool) { wg.Done() }) for _, s := range stores { keyMaxCapacity := gossip.MakeCapacityKey(s.Node.NodeID, s.StoreID) // Gossip store descriptor. err := g.AddInfo(keyMaxCapacity, *s, 0) if err != nil { t.Fatal(err) } } // Wait for all gossip callbacks to be invoked. wg.Wait() }
// Example_rebalancing simulates 20 test stores over 100 generations. In each
// generation every store that already has at least one range receives a
// pretend range of random size, all store descriptors are gossiped, and the
// allocator is asked whether each store should rebalance and to which target.
// Every second generation the relative usage of each store is printed, and the
// final totals are printed at the end. The printed text is compared against
// the trailing "Output:" comment by `go test`.
func Example_rebalancing() {
	stopper := stop.NewStopper()
	defer stopper.Stop()

	// Model a set of stores in a cluster,
	// randomly adding / removing stores and adding bytes.
	g := gossip.New(nil, nil, stopper)
	// Have to call g.SetNodeID before call g.AddInfo
	g.SetNodeID(roachpb.NodeID(1))
	sp := NewStorePool(
		g,
		hlc.NewClock(hlc.UnixNano),
		nil,
		/* reservationsEnabled */ true,
		TestTimeUntilStoreDeadOff,
		stopper,
	)
	alloc := MakeAllocator(sp, AllocatorOptions{AllowRebalance: true, Deterministic: true})

	// wg is incremented by len(testStores) before each round of gossip and
	// decremented once per store-prefix callback, so wg.Wait() below returns
	// after one callback per store has fired.
	var wg sync.WaitGroup
	g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ roachpb.Value) { wg.Done() })

	const generations = 100
	const nodes = 20

	// Initialize testStores.
	var testStores [nodes]testStore
	for i := 0; i < len(testStores); i++ {
		testStores[i].StoreID = roachpb.StoreID(i)
		testStores[i].Node = roachpb.NodeDescriptor{NodeID: roachpb.NodeID(i)}
		testStores[i].Capacity = roachpb.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30}
	}

	// Initialize the cluster with a single range.
	testStores[0].add(alloc.randGen.Int63n(1 << 20))

	for i := 0; i < generations; i++ {
		// First loop through test stores and add data.
		wg.Add(len(testStores))
		for j := 0; j < len(testStores); j++ {
			// Add a pretend range to the testStore if there's already one.
			if testStores[j].Capacity.RangeCount > 0 {
				testStores[j].add(alloc.randGen.Int63n(1 << 20))
			}
			if err := g.AddInfoProto(gossip.MakeStoreKey(roachpb.StoreID(j)), &testStores[j].StoreDescriptor, 0); err != nil {
				panic(err)
			}
		}
		wg.Wait()

		// Next loop through test stores and maybe rebalance.
		for j := 0; j < len(testStores); j++ {
			ts := &testStores[j]
			if alloc.ShouldRebalance(ts.StoreID) {
				target := alloc.RebalanceTarget(ts.StoreID, roachpb.Attributes{}, []roachpb.ReplicaDescriptor{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}})
				if target != nil {
					// The target's StoreID doubles as its index in testStores.
					testStores[j].rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20))
				}
			}
		}

		// Every generations/50 (= 2) generations, output each store's used
		// bytes as a zero-padded 3-digit decimal value, scaled so that the
		// fullest store prints as 999.
		if i%(generations/50) == 0 {
			var maxBytes int64
			for j := 0; j < len(testStores); j++ {
				bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
				if bytes > maxBytes {
					maxBytes = bytes
				}
			}
			// Skip the row entirely while no store holds any bytes; this also
			// avoids dividing by zero below.
			if maxBytes > 0 {
				for j := 0; j < len(testStores); j++ {
					// Space-separate values, with no trailing space on the
					// last column.
					endStr := " "
					if j == len(testStores)-1 {
						endStr = ""
					}
					bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
					fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr)
				}
				fmt.Printf("\n")
			}
		}
	}

	// Sum used bytes and range counts across all stores.
	var totBytes int64
	var totRanges int32
	for i := 0; i < len(testStores); i++ {
		totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available
		totRanges += testStores[i].Capacity.RangeCount
	}
	fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges)

	// Output:
	// 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 000 000 000 000 000 000 045 140 000 000 000 000 000 105 000 000
	// 999 014 143 000 000 000 000 039 017 000 112 071 000 088 009 000 097 134 000 151
	// 999 196 213 000 000 000 143 098 210 039 262 260 077 139 078 087 237 316 281 267
	// 999 394 368 391 000 393 316 356 364 263 474 262 214 321 345 374 403 445 574 220
	// 999 337 426 577 023 525 459 426 229 315 495 327 310 370 363 423 390 473 587 308
	// 999 481 529 533 132 563 519 496 396 363 636 337 414 408 425 533 445 605 559 405
	// 999 572 585 507 256 609 570 586 513 341 660 347 544 443 488 525 446 596 556 462
	// 999 580 575 603 325 636 590 549 495 337 698 386 663 526 518 511 517 572 546 533
	// 999 576 601 637 374 629 573 558 520 391 684 446 692 555 510 461 552 593 568 564
	// 999 573 636 671 441 643 619 629 628 452 705 525 795 590 542 525 589 658 589 655
	// 999 585 625 651 467 686 606 662 611 508 654 516 746 594 542 528 591 646 569 642
	// 999 636 690 728 501 704 638 700 619 539 688 555 738 592 556 568 659 669 602 649
	// 999 655 749 773 519 790 713 781 698 604 758 601 755 634 580 661 716 735 607 660
	// 999 648 716 726 549 813 748 766 693 606 784 568 749 655 579 642 692 711 587 632
	// 999 688 734 731 553 805 736 779 701 575 763 562 722 647 599 631 691 732 598 608
	// 999 679 770 719 590 815 754 799 687 613 748 540 715 664 590 638 703 720 621 588
	// 999 736 775 724 614 813 771 829 703 679 782 560 754 692 624 658 756 763 636 643
	// 999 759 792 737 688 847 782 872 761 695 841 617 756 730 607 664 762 807 677 666
	// 999 793 837 754 704 876 803 897 753 742 880 639 758 766 653 684 785 850 720 670
	// 999 815 864 778 735 921 843 927 778 752 896 696 775 796 698 681 775 859 730 693
	// 999 827 876 759 759 911 838 938 781 798 920 708 778 794 698 711 804 870 732 710
	// 999 815 893 733 790 924 849 940 755 777 901 720 794 832 704 721 834 851 722 748
	// 999 820 905 772 807 941 884 938 781 788 888 738 835 849 735 742 865 884 743 791
	// 999 828 889 768 828 939 865 936 789 805 913 751 841 860 751 759 895 889 730 814
	// 999 829 893 794 840 933 883 943 805 830 929 735 842 871 778 788 886 912 746 845
	// 999 848 892 820 824 963 913 978 832 828 952 755 860 890 784 814 905 905 755 855
	// 999 847 880 846 847 963 939 984 851 835 958 777 862 880 799 829 912 895 772 870
	// 999 850 886 859 871 950 921 998 847 823 925 759 877 861 787 810 908 915 798 840
	// 982 854 891 854 900 956 945 999 833 804 929 767 896 861 781 797 911 932 791 855
	// 961 849 884 846 881 949 928 999 829 796 906 768 868 858 797 804 883 897 774 834
	// 965 863 924 874 903 988 953 999 864 831 924 786 876 886 821 804 903 940 799 843
	// 963 873 936 880 915 997 966 999 885 832 935 799 891 919 854 801 916 953 802 866
	// 951 886 938 873 900 990 972 999 898 822 915 795 871 917 853 798 928 953 779 850
	// 932 880 939 866 897 999 948 970 884 837 912 805 877 893 866 807 922 933 791 846
	// 925 896 935 885 899 999 963 965 886 858 897 820 894 876 876 811 918 921 793 856
	// 926 881 933 876 896 999 952 942 857 859 878 812 898 884 883 791 920 894 783 853
	// 951 890 947 898 919 999 959 952 863 871 895 845 902 898 893 816 934 920 790 881
	// 962 895 959 919 921 999 982 951 883 877 901 860 911 910 899 835 949 923 803 883
	// 957 886 970 905 915 999 970 974 888 894 924 879 938 930 909 847 955 937 830 899
	// 941 881 958 889 914 999 957 953 885 890 900 870 946 919 885 822 950 927 832 875
	// 937 888 962 897 934 999 963 950 902 900 905 890 952 920 895 831 963 930 852 872
	// 916 888 967 881 924 999 970 946 912 890 901 889 958 910 911 830 966 928 834 866
	// 900 859 959 877 895 999 955 931 893 868 894 881 929 893 885 813 937 909 819 849
	// 902 857 960 875 896 999 944 929 911 867 911 895 946 897 897 812 926 921 815 859
	// 902 855 951 867 893 999 949 938 901 867 911 892 949 898 903 803 935 930 809 868
	// Total bytes=909881714, ranges=1745
}
// Example_rebalancing simulates 20 test stores over 100 generations using an
// allocator whose random generator is seeded with 0 and whose deterministic
// flag is set. In each generation every store that already has at least one
// range receives a pretend range of random size, all store descriptors are
// gossiped under their store keys, and the allocator is asked whether each
// store should rebalance and to which target. Every second generation the
// relative usage of each store is printed, followed by final totals. The
// printed text is compared against the trailing "Output:" comment by
// `go test`.
func Example_rebalancing() {
	// Model a set of stores in a cluster,
	// randomly adding / removing stores and adding bytes.
	g := gossip.New(nil, 0, nil)
	alloc := newAllocator(g)
	alloc.randGen = rand.New(rand.NewSource(0))
	alloc.deterministic = true

	// wg is incremented by len(testStores) before each round of gossip and
	// decremented once per store-prefix callback, so wg.Wait() below returns
	// after one callback per store has fired.
	var wg sync.WaitGroup
	g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ []byte) { wg.Done() })

	const generations = 100
	const nodes = 20

	// Initialize testStores.
	var testStores [nodes]testStore
	for i := 0; i < len(testStores); i++ {
		testStores[i].StoreID = proto.StoreID(i)
		testStores[i].Node = proto.NodeDescriptor{NodeID: proto.NodeID(i)}
		testStores[i].Capacity = proto.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30}
	}

	// Initialize the cluster with a single range.
	testStores[0].Add(alloc.randGen.Int63n(1 << 20))

	for i := 0; i < generations; i++ {
		// First loop through test stores and add data.
		wg.Add(len(testStores))
		for j := 0; j < len(testStores); j++ {
			// Add a pretend range to the testStore if there's already one.
			if testStores[j].Capacity.RangeCount > 0 {
				testStores[j].Add(alloc.randGen.Int63n(1 << 20))
			}
			key := gossip.MakeStoreKey(proto.StoreID(j))
			if err := g.AddInfoProto(key, &testStores[j].StoreDescriptor, 0); err != nil {
				panic(err)
			}
		}
		wg.Wait()

		// Next loop through test stores and maybe rebalance.
		for j := 0; j < len(testStores); j++ {
			ts := &testStores[j]
			if alloc.ShouldRebalance(&testStores[j].StoreDescriptor) {
				target := alloc.RebalanceTarget(proto.Attributes{}, []proto.Replica{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}})
				if target != nil {
					// The target's StoreID doubles as its index in testStores.
					testStores[j].Rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20))
				}
			}
		}

		// Every generations/50 (= 2) generations, output each store's used
		// bytes as a zero-padded 3-digit decimal value, scaled so that the
		// fullest store prints as 999.
		if i%(generations/50) == 0 {
			var maxBytes int64
			for j := 0; j < len(testStores); j++ {
				bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
				if bytes > maxBytes {
					maxBytes = bytes
				}
			}
			// Skip the row entirely while no store holds any bytes; this also
			// avoids dividing by zero below.
			if maxBytes > 0 {
				for j := 0; j < len(testStores); j++ {
					// Space-separate values, with no trailing space on the
					// last column.
					endStr := " "
					if j == len(testStores)-1 {
						endStr = ""
					}
					bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
					fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr)
				}
				fmt.Printf("\n")
			}
		}
	}

	// Sum used bytes and range counts across all stores.
	var totBytes int64
	var totRanges int32
	for i := 0; i < len(testStores); i++ {
		totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available
		totRanges += testStores[i].Capacity.RangeCount
	}
	fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges)

	// Output:
	// 999 000 000 000 000 000 000 739 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 107 000 000 204 000 000 375 000 000 000 000 000 000 000 000 000 000 536 000
	// 999 310 000 262 872 000 000 208 000 705 000 526 000 000 439 000 000 607 933 000
	// 812 258 000 220 999 673 402 480 000 430 516 374 000 431 318 000 551 714 917 000
	// 582 625 185 334 720 589 647 619 000 300 483 352 279 502 208 665 816 684 999 374
	// 751 617 771 542 738 676 665 525 309 435 612 449 457 616 306 837 993 754 999 445
	// 759 659 828 478 693 622 594 591 349 458 630 538 526 613 462 827 879 787 999 550
	// 861 658 828 559 801 660 681 560 487 529 652 686 642 716 575 999 989 875 989 581
	// 775 647 724 557 779 662 670 494 535 502 681 676 624 695 561 961 999 772 888 592
	// 856 712 753 661 767 658 717 606 529 615 755 699 672 700 576 955 999 755 861 671
	// 882 735 776 685 844 643 740 578 610 688 787 741 661 767 587 999 955 809 803 731
	// 958 716 789 719 861 689 821 608 634 724 800 782 694 799 619 994 999 851 812 818
	// 949 726 788 664 873 633 749 599 680 714 790 728 663 842 628 999 978 816 823 791
	// 923 698 792 712 816 605 774 651 661 728 802 718 670 819 714 999 966 801 829 791
	// 962 779 847 737 900 675 811 691 745 778 835 812 680 894 790 999 989 872 923 799
	// 967 812 826 772 891 685 828 683 761 808 864 820 643 873 783 969 999 873 910 781
	// 923 813 837 739 867 672 792 664 773 772 879 803 610 845 740 957 999 867 912 732
	// 952 803 866 759 881 655 765 668 803 772 929 762 601 844 751 973 999 892 864 731
	// 970 777 867 800 859 639 774 662 787 760 906 751 595 854 732 989 999 853 859 762
	// 943 776 872 787 861 686 780 663 789 793 926 784 612 832 733 999 968 868 827 767
	// 914 801 912 802 878 704 800 685 818 808 939 759 627 844 717 999 976 872 828 757
	// 935 806 911 797 887 710 798 711 826 824 938 775 614 870 716 999 986 886 803 767
	// 991 851 898 856 872 795 828 782 826 852 963 797 710 868 775 994 999 923 896 794
	// 999 924 866 877 884 883 886 836 846 869 953 851 762 887 858 985 949 900 917 836
	// 999 910 887 878 897 890 906 868 906 903 983 947 801 895 913 976 924 890 904 898
	// 955 884 888 916 886 879 901 872 898 883 999 874 829 888 892 937 918 889 891 862
	// 974 952 957 990 950 976 945 946 980 961 999 975 942 926 957 994 965 946 960 960
	// 949 929 952 999 929 961 943 946 993 918 984 961 952 919 953 950 952 941 949 934
	// 907 999 916 935 903 903 909 907 960 939 973 912 901 885 916 910 941 911 906 913
	// 939 999 948 948 945 962 951 954 952 964 996 942 975 962 962 956 971 969 975 969
	// 940 974 964 947 971 975 949 954 953 970 992 971 981 973 948 962 999 969 978 975
	// 950 971 953 938 962 967 930 964 953 978 999 945 974 972 951 950 998 951 949 962
	// 934 946 943 936 942 949 929 956 928 970 989 944 945 923 987 927 999 942 931 944
	// 939 957 942 958 951 970 937 946 930 950 940 959 963 937 973 943 999 931 949 940
	// 933 935 945 929 933 960 937 935 919 918 930 931 950 924 969 935 999 943 949 926
	// 959 941 948 952 948 957 936 937 943 930 955 962 953 949 980 948 999 934 980 942
	// 950 973 954 962 949 964 935 949 925 936 951 962 979 962 999 942 990 948 969 959
	// 937 993 958 949 960 960 942 954 969 950 951 952 974 970 999 927 979 964 975 944
	// 981 986 971 968 964 984 954 959 985 979 966 963 994 963 999 970 991 971 988 965
	// 967 997 961 957 959 985 956 940 955 955 957 955 970 952 979 964 999 951 960 968
	// 937 969 931 950 945 954 932 925 954 946 944 926 955 938 957 949 999 934 947 938
	// 958 967 954 955 971 973 946 934 979 947 944 958 954 954 960 948 999 936 960 951
	// 950 948 940 958 937 955 928 927 953 923 935 939 934 921 934 934 999 922 940 938
	// 960 960 929 962 955 955 926 935 957 928 939 941 938 926 941 924 999 923 957 942
	// 979 958 947 987 980 972 945 943 984 939 951 943 944 946 942 942 999 928 970 943
	// 981 941 931 961 969 962 927 935 985 925 964 945 946 939 946 938 999 933 964 928
	// 980 944 929 970 973 955 942 937 977 920 955 929 937 946 935 933 999 947 956 926
	// 980 948 926 981 938 939 936 936 963 949 965 935 943 946 933 933 999 947 955 943
	// 968 959 945 941 929 926 924 941 970 951 959 941 924 952 931 943 999 941 951 950
	// 961 946 930 923 933 932 953 937 954 940 964 944 931 952 939 935 999 936 945 948
	// Total bytes=996294324, ranges=1897
}
// ExampleAllocatorRebalancing models a set of stores in a cluster,
// randomly adding / removing stores and adding bytes.
//
// It simulates 20 test stores over 100 generations using an allocator whose
// random generator is seeded with 0 and whose deterministic flag is set. In
// each generation every store that already has at least one range receives a
// pretend range of random size, all store descriptors are gossiped under their
// capacity keys, and the allocator is asked whether each store should
// rebalance and to which target. Every second generation the relative usage of
// each store is printed, followed by final totals. The printed text is
// compared against the trailing "Output:" comment by `go test`.
func ExampleAllocatorRebalancing() {
	g := gossip.New(nil, 0, nil)
	alloc := newAllocator(g)
	alloc.randGen = rand.New(rand.NewSource(0))
	alloc.deterministic = true

	// wg is incremented by len(testStores) before each round of gossip and
	// decremented once per capacity-prefix callback, so wg.Wait() below
	// returns after one callback per store has fired.
	var wg sync.WaitGroup
	g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyCapacityPrefix), func(_ string, _ bool) { wg.Done() })

	const generations = 100
	const nodes = 20

	// Initialize testStores.
	var testStores [nodes]testStore
	for i := 0; i < len(testStores); i++ {
		testStores[i].StoreID = proto.StoreID(i)
		testStores[i].Node = proto.NodeDescriptor{NodeID: proto.NodeID(i)}
		testStores[i].Capacity = proto.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30}
	}

	// Initialize the cluster with a single range.
	testStores[0].Add(alloc.randGen.Int63n(1 << 20))

	for i := 0; i < generations; i++ {
		// First loop through test stores and add data.
		wg.Add(len(testStores))
		for j := 0; j < len(testStores); j++ {
			// Add a pretend range to the testStore if there's already one.
			if testStores[j].Capacity.RangeCount > 0 {
				testStores[j].Add(alloc.randGen.Int63n(1 << 20))
			}
			key := gossip.MakeCapacityKey(proto.NodeID(j), proto.StoreID(j))
			if err := g.AddInfo(key, testStores[j].StoreDescriptor, 0); err != nil {
				panic(err)
			}
		}
		wg.Wait()

		// Next loop through test stores and maybe rebalance.
		for j := 0; j < len(testStores); j++ {
			ts := &testStores[j]
			if alloc.ShouldRebalance(&testStores[j].StoreDescriptor) {
				target := alloc.RebalanceTarget(proto.Attributes{}, []proto.Replica{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}})
				if target != nil {
					// The target's StoreID doubles as its index in testStores.
					testStores[j].Rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20))
				}
			}
		}

		// Every generations/50 (= 2) generations, output each store's used
		// bytes as a zero-padded 3-digit decimal value, scaled so that the
		// fullest store prints as 999.
		if i%(generations/50) == 0 {
			var maxBytes int64
			for j := 0; j < len(testStores); j++ {
				bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
				if bytes > maxBytes {
					maxBytes = bytes
				}
			}
			// Skip the row entirely while no store holds any bytes; this also
			// avoids dividing by zero below.
			if maxBytes > 0 {
				for j := 0; j < len(testStores); j++ {
					// Space-separate values, with no trailing space on the
					// last column.
					endStr := " "
					if j == len(testStores)-1 {
						endStr = ""
					}
					bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available
					fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr)
				}
				fmt.Printf("\n")
			}
		}
	}

	// Sum used bytes and range counts across all stores.
	var totBytes int64
	var totRanges int32
	for i := 0; i < len(testStores); i++ {
		totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available
		totRanges += testStores[i].Capacity.RangeCount
	}
	fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges)

	// Output:
	// 999 000 000 000 000 000 000 739 000 000 000 000 000 000 000 000 000 000 000 000
	// 999 000 000 000 204 000 000 375 000 000 107 000 000 000 000 000 000 000 000 536
	// 942 000 000 463 140 000 000 646 000 288 288 000 442 000 058 647 000 000 316 999
	// 880 000 412 630 365 745 445 565 122 407 380 570 276 000 271 709 000 718 299 999
	// 925 000 667 600 555 975 704 552 272 491 773 890 584 000 407 974 000 930 476 999
	// 990 967 793 579 493 999 698 453 616 608 777 755 709 425 455 984 483 698 267 931
	// 965 999 869 606 635 908 630 585 567 577 818 870 740 621 550 868 805 790 411 913
	// 953 995 990 624 617 947 562 609 670 658 909 952 835 851 641 958 924 999 526 987
	// 999 923 901 571 687 915 636 636 674 685 831 881 847 820 702 905 897 983 509 981
	// 999 884 809 585 691 826 640 572 748 641 754 887 758 848 643 927 865 897 541 956
	// 999 856 891 594 691 745 602 615 766 663 814 834 719 886 733 925 882 911 593 926
	// 999 890 900 653 707 759 642 697 771 732 851 858 748 869 842 953 903 928 655 923
	// 999 924 909 696 748 797 693 689 806 766 841 902 705 897 874 914 913 916 730 892
	// 999 948 892 704 740 821 685 656 859 772 893 911 690 878 824 935 928 941 741 860
	// 999 948 931 697 770 782 697 666 893 761 944 869 658 902 816 925 923 983 742 831
	// 999 878 901 736 750 737 677 647 869 731 930 825 631 880 775 947 949 930 687 810
	// 999 890 910 764 778 757 709 663 849 777 964 837 672 891 814 978 944 946 721 868
	// 985 895 968 806 791 791 720 694 883 819 999 847 652 888 790 995 950 947 692 843
	// 960 903 956 794 815 779 746 706 891 824 958 830 665 886 757 999 931 969 701 861
	// 999 928 954 805 807 822 764 734 910 829 952 827 678 927 785 980 936 962 677 836
	// 999 903 924 800 769 822 776 730 886 815 935 781 668 890 805 948 929 965 676 837
	// 999 926 935 836 782 836 809 756 897 835 937 781 690 894 804 979 951 978 667 832
	// 999 937 936 875 843 872 854 793 908 873 950 808 714 901 860 981 975 962 693 866
	// 988 957 938 898 922 912 916 886 905 912 964 867 764 915 911 992 999 985 776 896
	// 945 959 922 910 937 913 938 944 957 921 993 916 898 957 928 999 976 997 855 957
	// 980 986 944 956 963 920 966 967 999 966 991 956 981 973 955 998 990 954 994 981
	// 956 985 942 945 950 900 933 949 981 969 946 935 963 951 931 999 936 941 972 963
	// 940 999 964 949 941 974 967 937 970 975 965 951 976 968 949 993 944 949 977 964
	// 926 999 973 932 944 952 933 944 963 965 927 940 964 960 938 995 932 935 968 951
	// 907 999 919 957 941 958 934 935 930 941 940 926 966 933 920 973 937 923 938 946
	// 924 999 914 963 976 945 911 936 929 951 930 930 972 935 941 977 932 960 939 958
	// 942 999 950 961 987 942 928 945 938 941 939 936 985 937 969 985 952 958 957 948
	// 956 999 950 947 943 939 949 934 929 935 940 942 943 957 988 974 933 936 938 951
	// 967 990 950 949 964 952 951 922 943 940 954 956 962 946 982 999 945 949 940 954
	// 970 999 952 959 970 955 957 974 937 965 968 947 950 958 947 993 953 938 958 950
	// 945 964 954 963 965 959 967 961 925 978 954 944 968 937 960 999 947 947 961 960
	// 930 957 938 974 956 944 968 930 944 972 930 946 958 974 940 999 961 945 953 947
	// 966 980 954 989 979 960 969 995 961 986 954 980 980 971 968 999 968 977 979 972
	// 963 953 958 986 990 947 973 955 955 983 974 981 961 964 977 999 984 982 966 964
	// 964 968 975 993 999 955 965 958 972 995 978 981 956 966 981 987 978 976 985 966
	// 967 957 954 999 963 940 968 966 941 966 971 969 957 961 949 940 968 963 988 947
	// 951 939 952 980 937 948 964 970 941 965 979 966 941 940 952 938 973 955 999 934
	// 939 958 941 998 942 951 962 942 962 951 972 978 946 935 958 935 950 947 999 953
	// 959 952 938 999 936 957 961 950 937 954 975 971 958 930 938 930 944 939 978 950
	// 957 943 963 999 947 965 953 937 966 953 978 972 963 937 933 945 944 937 979 952
	// 945 951 956 999 926 948 958 923 947 934 951 961 955 941 949 936 945 929 960 947
	// 956 960 975 999 945 977 956 934 954 943 961 956 956 954 960 954 958 929 969 938
	// 947 966 993 999 944 963 942 939 963 935 952 957 968 947 962 946 962 947 959 942
	// 940 961 999 992 935 946 938 932 968 939 957 938 970 949 964 934 948 957 952 939
	// 944 955 999 978 940 932 937 944 957 936 957 945 958 955 947 933 956 948 947 942
	// Total bytes=1003302292, ranges=1899
}
// TestStoreRangeDownReplicate verifies that the replication queue will notice
// over-replicated ranges and remove replicas from them.
//
// The test starts five stores, splits a range off at key "m", replicates the
// new range to all five stores, and then polls every 10ms (for at most 10
// seconds) until exactly three replicas remain and every view of the range
// descriptor agrees. It fails immediately if fewer than three replicas are
// ever found.
func TestStoreRangeDownReplicate(t *testing.T) {
	defer leaktest.AfterTest(t)
	mtc := startMultiTestContext(t, 5)
	defer mtc.Stop()
	store0 := mtc.stores[0]

	// Split off a range from the initial range for testing; there are
	// complications if the metadata ranges are removed from store 1, this
	// simplifies the test.
	splitKey := roachpb.RKey("m")
	rightKey := roachpb.RKey("z")
	{
		// Replicate the initial range to three stores before splitting it.
		replica := store0.LookupReplica(roachpb.RKeyMin, nil)
		mtc.replicateRange(replica.Desc().RangeID, 0, 1, 2)
		desc := replica.Desc()
		splitArgs := adminSplitArgs(splitKey, splitKey)
		if _, err := replica.AdminSplit(splitArgs, desc); err != nil {
			t.Fatal(err)
		}
	}
	// Replicate the new range to all five stores.
	replica := store0.LookupReplica(rightKey, nil)
	desc := replica.Desc()
	mtc.replicateRange(desc.RangeID, 0, 3, 4)

	// Initialize the gossip network. The wait group is released once the
	// store-prefix callback has fired len(mtc.stores) times.
	var wg sync.WaitGroup
	wg.Add(len(mtc.stores))
	key := gossip.MakePrefixPattern(gossip.KeyStorePrefix)
	mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ []byte) { wg.Done() })
	for _, s := range mtc.stores {
		s.GossipStore()
	}
	wg.Wait()

	// storeIDset is used to compare the replica sets from different views (i.e.
	// local range descriptors)
	type storeIDset map[roachpb.StoreID]struct{}
	makeStoreIDset := func(replicas []roachpb.ReplicaDescriptor) storeIDset {
		idSet := make(storeIDset)
		for _, r := range replicas {
			idSet[r.StoreID] = struct{}{}
		}
		return idSet
	}

	// Function to see if the replication level of the new range has reached the
	// expected equilibrium. If equilibrium has not been reached, this function
	// returns the list of stores that *should* have a replica for the range.
	checkReplication := func() (bool, storeIDset) {
		// Query each store for a replica of the range, generating a real map of
		// the replicas.
		foundIDset := make(storeIDset)
		foundLocalRangeDescs := make([]*roachpb.RangeDescriptor, 0, len(mtc.stores))
		for _, s := range mtc.stores {
			r := s.LookupReplica(splitKey, nil)
			if r != nil {
				foundLocalRangeDescs = append(foundLocalRangeDescs, r.Desc())
				foundIDset[s.StoreID()] = struct{}{}
			}
		}

		// Fail immediately if there are less than three replicas.
		replicaCount := len(foundIDset)
		if replicaCount < 3 {
			t.Fatalf("Removed too many replicas; expected at least three replicas, found %d", replicaCount)
		}

		// Look up the official range descriptor, make sure it agrees with the
		// found replicas.
		realRangeDesc := getRangeMetadata(rightKey, mtc, t)
		realIDset := makeStoreIDset(realRangeDesc.Replicas)
		if !reflect.DeepEqual(realIDset, foundIDset) {
			return false, realIDset
		}

		// Ensure the local range descriptors everywhere agree with reality.
		for _, desc := range foundLocalRangeDescs {
			localIDset := makeStoreIDset(desc.Replicas)
			if !reflect.DeepEqual(localIDset, foundIDset) {
				return false, realIDset
			}
		}

		// If we have only three replicas, exit the loop.
		if replicaCount == 3 {
			return true, nil
		}
		// All views agree but the range is still over-replicated.
		return false, foundIDset
	}

	// maxTimeout is created once so the 10 second budget spans all iterations;
	// the 10ms timer is recreated on every pass through the loop.
	maxTimeout := time.After(10 * time.Second)
	succeeded := false
	for !succeeded {
		select {
		case <-maxTimeout:
			t.Fatalf("Failed to achieve proper replication within 10 seconds")
		case <-time.After(10 * time.Millisecond):
			mtc.expireLeaderLeases()
			// If our replication level matches the target, we have succeeded.
			var idSet storeIDset
			succeeded, idSet = checkReplication()
			if succeeded {
				// This break only leaves the select; the loop then ends
				// because its condition (!succeeded) is now false.
				break
			}

			// Kick off a manual ReplicaGC Scan on any store which is not part of the
			// current replica set. Kick off a Replication scan on *one* store which
			// is part of the replica set.
			kickedOffReplicationQueue := false
			for _, store := range mtc.stores {
				if _, ok := idSet[store.StoreID()]; !ok {
					store.ForceReplicaGCScan(t)
				} else if !kickedOffReplicationQueue {
					store.ForceReplicationScan(t)
					kickedOffReplicationQueue = true
				}
			}
		}
	}

	// Expire leader leases one more time, so that any remaining resolutions can
	// get a leader lease.
	mtc.expireLeaderLeases()
}
func Example_rebalancing() { stopper := stop.NewStopper() defer stopper.Stop() // Model a set of stores in a cluster, // randomly adding / removing stores and adding bytes. rpcContext := rpc.NewContext(&base.Context{Insecure: true}, nil, stopper) server := rpc.NewServer(rpcContext) // never started g := gossip.New(context.Background(), rpcContext, server, nil, stopper, metric.NewRegistry()) // Have to call g.SetNodeID before call g.AddInfo g.SetNodeID(roachpb.NodeID(1)) sp := NewStorePool( g, hlc.NewClock(hlc.UnixNano), nil, /* reservationsEnabled */ true, TestTimeUntilStoreDeadOff, stopper, ) alloc := MakeAllocator(sp, AllocatorOptions{AllowRebalance: true, Deterministic: true}) var wg sync.WaitGroup g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ roachpb.Value) { wg.Done() }) const generations = 100 const nodes = 20 // Initialize testStores. var testStores [nodes]testStore for i := 0; i < len(testStores); i++ { testStores[i].StoreID = roachpb.StoreID(i) testStores[i].Node = roachpb.NodeDescriptor{NodeID: roachpb.NodeID(i)} testStores[i].Capacity = roachpb.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30} } // Initialize the cluster with a single range. testStores[0].add(alloc.randGen.Int63n(1 << 20)) for i := 0; i < generations; i++ { // First loop through test stores and add data. wg.Add(len(testStores)) for j := 0; j < len(testStores); j++ { // Add a pretend range to the testStore if there's already one. if testStores[j].Capacity.RangeCount > 0 { testStores[j].add(alloc.randGen.Int63n(1 << 20)) } if err := g.AddInfoProto(gossip.MakeStoreKey(roachpb.StoreID(j)), &testStores[j].StoreDescriptor, 0); err != nil { panic(err) } } wg.Wait() // Next loop through test stores and maybe rebalance. 
for j := 0; j < len(testStores); j++ { ts := &testStores[j] target := alloc.RebalanceTarget( roachpb.Attributes{}, []roachpb.ReplicaDescriptor{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}}, -1) if target != nil { testStores[j].rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20)) } } // Output store capacities as hexadecimal 2-character values. if i%(generations/50) == 0 { var maxBytes int64 for j := 0; j < len(testStores); j++ { bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available if bytes > maxBytes { maxBytes = bytes } } if maxBytes > 0 { for j := 0; j < len(testStores); j++ { endStr := " " if j == len(testStores)-1 { endStr = "" } bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr) } fmt.Printf("\n") } } } var totBytes int64 var totRanges int32 for i := 0; i < len(testStores); i++ { totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available totRanges += testStores[i].Capacity.RangeCount } fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges) // Output: // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 // 999 000 000 000 014 000 000 118 000 000 000 000 111 000 000 000 000 000 000 000 // 999 113 095 000 073 064 000 221 003 000 020 178 182 000 057 000 027 000 055 000 // 999 398 222 000 299 366 000 525 239 135 263 385 424 261 261 000 260 194 207 322 // 999 307 170 335 380 357 336 539 233 373 218 444 539 336 258 479 267 352 384 387 // 999 558 318 587 623 602 453 719 409 506 414 617 638 411 359 486 502 507 454 673 // 999 588 404 854 616 642 559 705 482 622 505 554 
673 489 410 390 524 607 535 671 // 999 651 498 922 668 696 612 809 619 691 682 674 682 584 533 449 619 724 646 702 // 999 710 505 832 645 732 719 867 605 794 743 693 717 645 602 503 683 733 686 776 // 999 773 688 810 658 761 812 957 663 875 856 797 871 670 733 602 839 781 736 909 // 959 778 750 879 685 797 786 999 751 944 870 786 882 670 828 611 880 817 714 883 // 946 843 781 892 726 887 876 993 717 999 940 802 879 672 842 634 862 818 736 906 // 924 826 754 859 742 878 836 927 721 999 893 762 874 672 882 684 918 818 745 897 // 910 872 789 858 752 878 824 976 715 999 860 739 848 684 890 699 968 846 751 833 // 938 892 789 879 754 916 861 997 774 983 887 805 827 690 912 751 999 927 800 893 // 895 887 792 845 784 920 800 999 770 961 890 747 871 701 907 733 970 893 811 858 // 887 843 742 839 792 938 826 999 778 971 859 792 857 731 889 777 979 900 779 833 // 891 861 802 819 802 966 826 999 776 946 843 792 836 769 914 792 968 879 775 874 // 923 840 830 842 778 969 820 999 791 950 843 820 838 767 893 794 995 915 789 885 // 932 816 783 830 805 926 783 999 790 977 824 856 838 789 866 787 992 892 760 896 // 917 799 781 813 800 901 759 999 776 983 795 861 813 799 852 776 944 891 739 883 // 895 759 757 827 799 894 741 999 772 955 779 864 823 812 835 785 956 882 746 865 // 906 762 773 867 848 874 747 999 763 992 766 866 831 812 839 820 973 906 765 885 // 915 795 781 884 854 899 782 983 756 999 744 890 840 791 848 806 992 934 774 904 // 935 768 813 893 859 881 788 948 758 999 748 892 828 803 857 834 989 940 798 900 // 953 752 816 919 852 882 806 966 771 976 733 877 804 802 854 822 999 957 800 898 // 909 732 804 882 874 885 814 956 758 937 703 877 805 783 849 833 999 955 796 903 // 885 744 788 859 851 881 802 929 732 905 702 843 801 774 847 810 999 936 778 880 // 856 741 790 827 842 897 771 922 732 873 719 849 771 789 845 828 999 914 764 859 // 880 787 825 841 867 941 782 962 752 918 749 886 797 819 899 862 999 935 792 891 // 902 829 841 857 903 979 786 979 760 935 767 903 816 839 
907 880 999 963 827 927 // 873 809 831 837 906 964 786 952 772 928 780 904 810 817 914 878 999 974 827 914 // 879 810 855 843 936 977 806 956 799 930 801 931 823 835 928 895 997 999 864 935 // 885 806 858 825 921 971 791 965 784 930 809 936 813 829 904 893 999 974 858 902 // 865 776 855 811 903 966 771 958 770 906 809 923 810 825 896 901 999 964 841 895 // 880 789 876 816 918 987 772 972 776 912 814 935 836 833 913 901 999 978 863 903 // 866 779 861 824 926 986 773 958 776 920 810 936 836 855 894 899 999 989 859 904 // 880 798 862 826 910 997 795 948 767 910 798 923 838 835 872 911 999 975 856 894 // 885 785 845 807 906 973 783 943 782 918 789 920 832 838 861 894 999 965 849 877 // 889 793 855 802 918 985 786 948 793 920 800 941 818 849 846 899 999 982 851 886 // 866 796 854 801 911 969 782 958 791 907 788 940 800 844 843 890 999 977 851 873 // 849 794 855 815 912 970 790 942 792 898 789 938 794 850 843 884 999 964 854 886 // 856 806 867 837 930 980 787 944 789 903 804 947 800 863 840 891 999 977 860 874 // 847 796 852 849 925 980 777 948 786 905 792 922 798 853 835 887 999 968 868 866 // 851 801 866 846 936 999 795 945 774 909 793 931 794 860 846 908 985 976 882 854 // 861 815 861 845 934 999 808 958 784 913 780 924 800 860 844 912 986 974 897 844 // Total bytes=941960698, ranges=1750 }
// TestStoreRangeDownReplicate verifies that the replication queue will notice // over-replicated ranges and remove replicas from them. func TestStoreRangeDownReplicate(t *testing.T) { defer leaktest.AfterTest(t) mtc := startMultiTestContext(t, 5) defer mtc.Stop() store0 := mtc.stores[0] // Split off a range from the initial range for testing; there are // complications if the metadata ranges are removed from store 1, this // simplifies the test. splitKey := proto.Key("m") rightKey := proto.Key("z") { replica := store0.LookupReplica(proto.KeyMin, nil) mtc.replicateRange(replica.Desc().RangeID, 0, 1, 2) desc := replica.Desc() splitArgs := adminSplitArgs(splitKey, splitKey, desc.RangeID, store0.StoreID()) if _, err := replica.AdminSplit(splitArgs, desc); err != nil { t.Fatal(err) } } // Replicate the new range to all five stores. replica := store0.LookupReplica(rightKey, nil) desc := replica.Desc() mtc.replicateRange(desc.RangeID, 0, 3, 4) // Initialize the gossip network. var wg sync.WaitGroup wg.Add(len(mtc.stores)) key := gossip.MakePrefixPattern(gossip.KeyStorePrefix) mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ []byte) { wg.Done() }) for _, s := range mtc.stores { s.GossipStore() } wg.Wait() // storeIDset is used to compare the replica sets from different views (i.e. // local range descriptors) type storeIDset map[proto.StoreID]struct{} makeStoreIDset := func(replicas []proto.Replica) storeIDset { idSet := make(storeIDset) for _, r := range replicas { idSet[r.StoreID] = struct{}{} } return idSet } // Function to get the current range descriptor for the target range. getRangeMetadata := func() proto.RangeDescriptor { // Calls to RangeLookup typically use inconsistent reads, but we // want to do a consistent read here. This is important when we are // considering one of the metadata ranges: we must not do an // inconsistent lookup in our own copy of the range. 
reply := proto.RangeLookupResponse{} b := &client.Batch{} b.InternalAddCall(proto.Call{ Args: &proto.RangeLookupRequest{ RequestHeader: proto.RequestHeader{ Key: keys.RangeMetaKey(rightKey), }, MaxRanges: 1, }, Reply: &reply, }) if err := mtc.db.Run(b); err != nil { t.Fatalf("error getting range metadata: %s", err.Error()) } if len(reply.Ranges) != 1 { t.Fatalf("expected 1 range descriptor, go %d", len(reply.Ranges)) } return reply.Ranges[0] } // Function to see if the replication level of the new range has reached the // expected equilibrium. If equilibrium has not been reached, this function // returns the list of stores that *should* have a replica for the range. checkReplication := func() (bool, storeIDset) { // Query each store for a replica of the range, generating a real map of // the replicas. foundIDset := make(storeIDset) foundLocalRangeDescs := make([]*proto.RangeDescriptor, 0, len(mtc.stores)) for _, s := range mtc.stores { r := s.LookupReplica(splitKey, nil) if r != nil { foundLocalRangeDescs = append(foundLocalRangeDescs, r.Desc()) foundIDset[s.StoreID()] = struct{}{} } } // Fail immediately if there are less than three replicas. replicaCount := len(foundIDset) if replicaCount < 3 { t.Fatalf("Removed too many replicas; expected at least three replicas, found %d", replicaCount) } // Look up the official range descriptor, make sure it agrees with the // found replicas. realRangeDesc := getRangeMetadata() realIDset := makeStoreIDset(realRangeDesc.Replicas) if !reflect.DeepEqual(realIDset, foundIDset) { return false, realIDset } // Ensure the local range descriptors everywhere agree with reality. for _, desc := range foundLocalRangeDescs { localIDset := makeStoreIDset(desc.Replicas) if !reflect.DeepEqual(localIDset, foundIDset) { return false, realIDset } } // If we have only three replicas, exit the loop. 
if replicaCount == 3 { return true, nil } return false, foundIDset } maxTimeout := time.After(5 * time.Second) succeeded := false for !succeeded { select { case <-maxTimeout: t.Fatalf("Failed to achieve proper replication within 5 seconds") case <-time.After(10 * time.Millisecond): mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration)) // If our replication level matches the target, we have succeeded. var idSet storeIDset succeeded, idSet = checkReplication() if succeeded { break } // Kick off a manual RangeGC Scan on any store which is not part of the // current replica set. Kick off a Replication scan on *one* store which // is part of the replica set. kickedOffReplicationQueue := false for _, store := range mtc.stores { if _, ok := idSet[store.StoreID()]; !ok { store.ForceRangeGCScan(t) } else if !kickedOffReplicationQueue { store.ForceReplicationScan(t) kickedOffReplicationQueue = true } } } } // Increment the manual clock one more time, so that any remaining intent // resolutions can get a leader lease. mtc.manualClock.Increment(int64(storage.DefaultLeaderLeaseDuration)) }
func Example_rebalancing() { // Model a set of stores in a cluster, // randomly adding / removing stores and adding bytes. g := gossip.New(nil, 0, nil) stopper := stop.NewStopper() defer stopper.Stop() sp := NewStorePool(g, TestTimeUntilStoreDeadOff, stopper) alloc := MakeAllocator(sp, RebalancingOptions{AllowRebalance: true, Deterministic: true}) alloc.randGen = rand.New(rand.NewSource(0)) var wg sync.WaitGroup g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ []byte) { wg.Done() }) const generations = 100 const nodes = 20 // Initialize testStores. var testStores [nodes]testStore for i := 0; i < len(testStores); i++ { testStores[i].StoreID = roachpb.StoreID(i) testStores[i].Node = roachpb.NodeDescriptor{NodeID: roachpb.NodeID(i)} testStores[i].Capacity = roachpb.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30} } // Initialize the cluster with a single range. testStores[0].add(alloc.randGen.Int63n(1 << 20)) for i := 0; i < generations; i++ { // First loop through test stores and add data. wg.Add(len(testStores)) for j := 0; j < len(testStores); j++ { // Add a pretend range to the testStore if there's already one. if testStores[j].Capacity.RangeCount > 0 { testStores[j].add(alloc.randGen.Int63n(1 << 20)) } key := gossip.MakeStoreKey(roachpb.StoreID(j)) if err := g.AddInfoProto(key, &testStores[j].StoreDescriptor, 0); err != nil { panic(err) } } wg.Wait() // Next loop through test stores and maybe rebalance. for j := 0; j < len(testStores); j++ { ts := &testStores[j] if alloc.ShouldRebalance(ts.StoreID) { target := alloc.RebalanceTarget(ts.StoreID, roachpb.Attributes{}, []roachpb.ReplicaDescriptor{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}}) if target != nil { testStores[j].rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20)) } } } // Output store capacities as hexidecimal 2-character values. 
if i%(generations/50) == 0 { var maxBytes int64 for j := 0; j < len(testStores); j++ { bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available if bytes > maxBytes { maxBytes = bytes } } if maxBytes > 0 { for j := 0; j < len(testStores); j++ { endStr := " " if j == len(testStores)-1 { endStr = "" } bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr) } fmt.Printf("\n") } } } var totBytes int64 var totRanges int32 for i := 0; i < len(testStores); i++ { totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available totRanges += testStores[i].Capacity.RangeCount } fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges) // Output: // 138 000 000 000 000 000 000 999 000 000 000 000 000 000 000 000 000 000 000 000 // 922 319 000 000 000 239 000 999 000 000 000 000 000 214 073 000 000 000 190 000 // 999 505 480 000 634 352 421 644 212 331 396 144 000 242 419 275 000 000 727 028 // 999 678 908 705 350 558 549 714 651 824 895 694 000 373 610 490 372 106 492 796 // 932 701 763 999 660 706 571 702 787 945 848 678 062 692 762 413 603 252 513 882 // 937 656 875 984 734 717 676 685 910 895 847 841 349 754 864 463 722 377 655 999 // 885 701 805 999 647 744 802 659 778 834 830 725 569 761 922 587 684 458 693 935 // 813 650 709 931 583 733 843 619 793 881 768 658 565 713 956 598 733 594 656 999 // 873 727 721 999 544 812 848 666 817 943 831 658 556 769 927 554 799 733 670 869 // 937 765 827 999 543 875 907 670 929 997 913 768 621 853 922 618 878 832 733 937 // 902 819 744 988 547 904 922 688 879 999 812 710 554 789 890 591 808 865 658 932 // 870 873 846 997 596 937 899 765 864 969 855 751 577 824 951 579 858 908 653 999 // 880 833 856 999 640 918 932 774 920 930 869 739 686 784 853 553 885 941 685 986 // 874 797 808 999 645 925 928 781 920 956 859 762 678 761 819 627 899 941 725 959 // 886 801 835 999 638 984 927 825 968 958 860 760 813 716 800 638 908 908 798 945 // 
860 840 836 973 634 999 944 834 977 923 848 769 846 728 836 605 865 915 781 896 // 859 864 891 993 633 961 999 863 951 885 857 791 864 731 871 656 868 920 790 879 // 866 845 896 999 688 966 998 904 942 864 861 815 867 756 879 704 919 940 804 888 // 825 850 876 983 712 945 999 885 943 870 854 838 848 771 825 701 939 940 809 885 // 821 872 915 999 711 927 968 928 963 898 846 865 863 814 858 719 935 951 818 877 // 829 868 940 999 729 919 938 911 957 905 846 872 860 844 872 724 920 941 844 832 // 825 848 901 999 736 882 911 926 937 935 876 901 824 870 892 714 902 927 844 846 // 837 861 921 999 872 890 875 911 894 939 868 921 871 894 887 740 905 948 881 879 // 876 879 956 999 893 889 875 910 910 953 880 898 900 895 905 736 920 965 918 916 // 921 897 909 999 924 907 895 936 955 974 901 902 933 937 929 763 909 997 914 944 // 930 882 892 995 925 910 907 942 911 952 915 922 936 911 944 819 891 999 906 953 // 904 867 889 989 913 890 877 932 931 937 936 927 939 915 936 843 901 999 937 915 // 936 916 872 937 920 901 900 917 928 972 949 936 917 923 934 897 896 999 913 907 // 979 963 909 954 923 950 953 942 969 999 975 970 925 942 971 948 933 993 958 952 // 978 950 903 942 905 949 937 927 948 991 980 948 921 929 932 946 920 999 942 937 // 962 958 937 947 910 967 949 934 945 983 966 951 917 918 957 965 923 999 961 957 // 972 953 942 921 891 955 949 921 964 977 969 951 927 930 953 953 928 999 950 936 // 965 978 953 928 896 963 964 928 983 986 948 972 963 921 953 970 928 999 937 941 // 961 964 936 930 930 969 964 930 964 973 965 983 974 928 958 969 949 999 941 961 // 977 963 931 955 959 979 970 942 941 965 963 980 951 948 966 971 954 999 962 950 // 988 966 932 960 951 973 978 942 953 964 967 989 941 959 986 956 950 999 962 954 // 989 950 955 951 940 985 964 958 941 982 970 981 946 968 999 965 959 984 979 939 // 976 959 962 956 959 975 955 974 966 988 980 991 956 967 992 953 961 999 959 925 // 983 979 962 973 970 983 962 970 997 999 987 997 970 969 997 973 972 996 981 940 // 980 959 
958 968 954 983 952 966 973 974 978 981 975 947 995 977 956 999 972 932 // 960 973 955 982 956 970 944 947 979 978 985 977 962 939 999 986 945 995 965 922 // 956 983 941 982 957 958 945 950 985 973 987 986 944 947 984 999 948 959 961 914 // 970 972 946 968 968 978 951 945 980 973 988 981 956 957 967 999 955 945 966 913 // 967 976 943 974 973 969 951 948 994 963 986 984 949 958 984 999 951 945 980 926 // 960 955 933 954 973 964 943 929 973 943 988 991 948 959 981 999 950 951 978 927 // 954 948 912 953 960 968 937 931 977 936 979 978 945 950 970 999 945 949 971 925 // 953 965 922 947 963 966 933 919 992 943 973 991 933 953 975 999 949 947 970 907 // 951 983 923 958 979 972 945 936 999 941 982 987 946 960 973 975 959 940 970 925 // 954 982 909 948 973 967 945 938 999 950 994 976 957 960 970 968 964 952 966 932 // 956 964 896 944 963 953 924 940 999 936 981 961 939 956 965 951 937 937 959 908 // Total bytes=1042395713, ranges=1919 }
// TestStoreRangeDownReplicate verifies that the replication queue will notice // over-replicated ranges and remove replicas from them. func TestStoreRangeDownReplicate(t *testing.T) { defer leaktest.AfterTest(t) mtc := startMultiTestContext(t, 5) defer mtc.Stop() store0 := mtc.stores[0] // Split off a range from the initial range for testing; there are // complications if the metadata ranges are removed from store 1, this // simplifies the test. splitKey := roachpb.Key("m") rightKey := roachpb.Key("z") { replica := store0.LookupReplica(roachpb.RKeyMin, nil) mtc.replicateRange(replica.RangeID, 1, 2) desc := replica.Desc() splitArgs := adminSplitArgs(splitKey, splitKey) if _, err := replica.AdminSplit(splitArgs, desc); err != nil { t.Fatal(err) } } // Replicate the new range to all five stores. replica := store0.LookupReplica(keys.Addr(rightKey), nil) desc := replica.Desc() mtc.replicateRange(desc.RangeID, 3, 4) // Initialize the gossip network. var wg sync.WaitGroup wg.Add(len(mtc.stores)) key := gossip.MakePrefixPattern(gossip.KeyStorePrefix) mtc.stores[0].Gossip().RegisterCallback(key, func(_ string, _ roachpb.Value) { wg.Done() }) for _, s := range mtc.stores { s.GossipStore() } wg.Wait() maxTimeout := time.After(10 * time.Second) succeeded := false for !succeeded { select { case <-maxTimeout: t.Fatalf("Failed to achieve proper replication within 10 seconds") case <-time.After(10 * time.Millisecond): mtc.expireLeaderLeases() rangeDesc := getRangeMetadata(keys.Addr(rightKey), mtc, t) if count := len(rangeDesc.Replicas); count < 3 { t.Fatalf("Removed too many replicas; expected at least 3 replicas, found %d", count) } else if count == 3 { succeeded = true break } // Run replication scans on every store; only the store with the // leader lease will actually do anything. If we did not wait // for the scan to complete here it could be interrupted by the // next call to expireLeaderLeases. 
for _, store := range mtc.stores { store.ForceReplicationScanAndProcess() } } } // Expire leader leases one more time, so that any remaining resolutions can // get a leader lease. // TODO(bdarnell): understand why some tests need this. mtc.expireLeaderLeases() }
func Example_rebalancing() { // Model a set of stores in a cluster, // randomly adding / removing stores and adding bytes. g := gossip.New(nil, 0, nil) stopper := stop.NewStopper() defer stopper.Stop() sp := NewStorePool(g, TestTimeUntilStoreDeadOff, stopper) alloc := makeAllocator(sp) alloc.randGen = rand.New(rand.NewSource(0)) alloc.deterministic = true var wg sync.WaitGroup g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(_ string, _ []byte) { wg.Done() }) const generations = 100 const nodes = 20 // Initialize testStores. var testStores [nodes]testStore for i := 0; i < len(testStores); i++ { testStores[i].StoreID = proto.StoreID(i) testStores[i].Node = proto.NodeDescriptor{NodeID: proto.NodeID(i)} testStores[i].Capacity = proto.StoreCapacity{Capacity: 1 << 30, Available: 1 << 30} } // Initialize the cluster with a single range. testStores[0].add(alloc.randGen.Int63n(1 << 20)) for i := 0; i < generations; i++ { // First loop through test stores and add data. wg.Add(len(testStores)) for j := 0; j < len(testStores); j++ { // Add a pretend range to the testStore if there's already one. if testStores[j].Capacity.RangeCount > 0 { testStores[j].add(alloc.randGen.Int63n(1 << 20)) } key := gossip.MakeStoreKey(proto.StoreID(j)) if err := g.AddInfoProto(key, &testStores[j].StoreDescriptor, 0); err != nil { panic(err) } } wg.Wait() // Next loop through test stores and maybe rebalance. for j := 0; j < len(testStores); j++ { ts := &testStores[j] if alloc.shouldRebalance(&testStores[j].StoreDescriptor) { target := alloc.rebalanceTarget(proto.Attributes{}, []proto.Replica{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}}) if target != nil { testStores[j].rebalance(&testStores[int(target.StoreID)], alloc.randGen.Int63n(1<<20)) } } } // Output store capacities as hexidecimal 2-character values. 
if i%(generations/50) == 0 { var maxBytes int64 for j := 0; j < len(testStores); j++ { bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available if bytes > maxBytes { maxBytes = bytes } } if maxBytes > 0 { for j := 0; j < len(testStores); j++ { endStr := " " if j == len(testStores)-1 { endStr = "" } bytes := testStores[j].Capacity.Capacity - testStores[j].Capacity.Available fmt.Printf("%03d%s", (999*bytes)/maxBytes, endStr) } fmt.Printf("\n") } } } var totBytes int64 var totRanges int32 for i := 0; i < len(testStores); i++ { totBytes += testStores[i].Capacity.Capacity - testStores[i].Capacity.Available totRanges += testStores[i].Capacity.RangeCount } fmt.Printf("Total bytes=%d, ranges=%d\n", totBytes, totRanges) // Output: // 999 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 739 000 000 // 999 107 000 000 000 000 000 000 000 000 177 000 000 000 204 000 000 734 000 000 // 929 288 000 168 000 057 623 000 114 272 471 000 000 565 385 000 000 999 000 284 // 683 367 133 087 000 527 381 607 379 380 502 000 188 824 490 295 420 999 000 490 // 540 443 380 319 000 438 382 534 599 579 602 000 268 859 601 374 450 999 000 532 // 412 428 539 429 170 332 424 696 505 439 503 691 327 752 427 437 451 999 076 441 // 496 583 662 586 280 431 499 714 564 578 540 661 431 784 548 516 547 999 329 589 // 502 563 646 541 430 428 576 693 633 578 537 577 455 803 573 596 528 999 402 639 // 603 641 764 638 764 521 650 764 713 683 648 652 579 860 610 731 665 999 463 749 // 615 642 779 688 813 459 650 791 728 702 743 614 526 829 600 767 760 999 497 700 // 677 677 879 787 867 518 700 852 775 801 793 666 526 820 601 843 767 999 544 772 // 723 696 866 838 853 589 730 882 800 768 782 695 567 776 656 836 832 999 613 832 // 830 764 936 879 976 673 824 974 864 825 835 761 703 874 700 909 888 999 635 957 // 832 766 949 842 995 730 839 965 870 843 790 765 693 931 706 936 936 999 683 948 // 866 787 990 851 999 780 867 968 892 847 783 787 708 912 768 963 951 954 681 942 // 
844 768 995 866 999 811 836 952 931 849 764 763 716 923 758 980 907 921 705 927 // 866 780 999 897 979 844 858 975 968 897 762 772 739 939 731 984 909 912 752 931 // 895 818 999 918 991 884 916 936 942 899 790 780 775 930 779 976 963 893 752 920 // 832 782 999 865 978 854 908 902 887 886 760 785 747 903 762 924 911 830 750 876 // 864 804 999 856 987 855 939 911 905 909 735 818 808 909 757 941 956 807 761 875 // 892 839 999 901 977 871 933 930 931 910 766 812 831 908 768 969 935 861 793 867 // 908 858 982 911 994 872 947 951 960 935 798 832 835 895 786 999 951 873 790 863 // 881 863 963 908 991 845 919 941 959 939 801 815 832 876 783 999 959 877 798 822 // 907 929 962 930 942 904 901 925 955 940 871 985 921 902 873 999 991 894 908 926 // 892 923 956 939 976 961 904 933 958 942 914 979 925 907 940 999 982 906 929 922 // 913 928 940 921 930 913 913 929 925 918 920 963 923 923 940 999 928 909 923 948 // 924 950 910 947 926 918 917 933 944 920 907 917 911 920 914 999 921 909 915 939 // 908 933 920 946 940 936 935 931 957 924 914 914 915 936 929 999 929 934 924 906 // 946 943 954 968 961 966 966 935 951 940 935 959 949 950 938 996 941 981 955 999 // 940 926 938 941 930 935 934 935 958 897 941 955 931 943 944 971 949 941 972 999 // 950 952 944 957 935 953 951 938 972 932 957 988 943 950 970 961 947 968 970 999 // 966 959 957 945 956 964 973 947 991 946 966 985 966 980 986 962 967 958 959 999 // 941 964 956 979 959 983 992 937 999 948 976 961 950 996 987 944 968 949 961 968 // 982 975 932 950 960 969 955 980 977 944 956 944 920 943 999 940 947 938 937 951 // 977 970 935 953 960 965 980 957 991 943 938 956 945 971 999 936 939 952 948 944 // 993 969 987 951 988 979 999 966 983 957 948 971 971 956 990 958 967 955 979 983 // 969 935 955 952 936 961 979 943 975 954 938 957 967 959 950 977 940 999 956 964 // 964 953 975 956 923 972 978 952 974 962 939 946 952 947 952 984 944 999 947 974 // 957 966 969 951 950 977 984 935 940 954 962 956 957 953 952 965 943 999 957 975 // 974 984 
982 953 947 968 999 953 940 956 972 946 973 954 948 958 953 987 970 961 // 976 980 976 948 946 955 999 963 952 962 962 941 955 946 926 944 952 951 957 944 // 988 975 951 938 957 972 999 963 973 963 968 948 958 948 945 954 963 943 945 959 // 999 977 958 930 955 957 993 954 983 955 952 945 953 947 923 963 945 939 972 953 // 999 970 928 975 945 938 983 956 954 933 939 938 940 921 923 958 935 933 962 938 // 999 970 915 961 935 923 993 932 929 926 935 925 927 931 917 947 938 929 959 921 // 999 949 953 933 929 952 984 922 935 943 931 942 932 951 934 957 935 926 947 920 // 999 957 948 937 943 951 992 926 927 938 930 934 943 953 946 945 938 935 953 941 // 999 958 962 932 934 938 989 935 930 921 927 935 932 957 926 945 935 921 949 932 // 989 962 950 942 927 941 999 943 926 936 931 930 926 959 923 943 931 928 951 945 // 999 960 952 934 933 935 995 939 928 938 924 929 971 946 927 950 932 933 943 947 // Total bytes=1001793961, ranges=1898 }