func TestRepairEmpty(t *testing.T) {
	_, cidr, _ := net.ParseCIDR("192.168.1.0/24")
	previous := ipallocator.NewCIDRRange(cidr)
	previous.Allocate(net.ParseIP("192.168.1.10"))

	var dst api.RangeAllocation
	err := previous.Snapshot(&dst)
	if err != nil {
		t.Fatal(err)
	}

	fakeClient := fake.NewSimpleClientset()
	ipregistry := &mockRangeRegistry{
		item: &api.RangeAllocation{
			ObjectMeta: api.ObjectMeta{
				ResourceVersion: "1",
			},
			Range: dst.Range,
			Data:  dst.Data,
		},
	}
	r := NewRepair(0, fakeClient.Core(), cidr, ipregistry)
	if err := r.RunOnce(); err != nil {
		t.Fatal(err)
	}
	after := ipallocator.NewCIDRRange(cidr)
	if err := after.Restore(cidr, ipregistry.updated.Data); err != nil {
		t.Fatal(err)
	}
	if after.Has(net.ParseIP("192.168.1.10")) {
		t.Errorf("unexpected ipallocator state: %#v", after)
	}
}
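These tests rely on a mockRangeRegistry test double that is not shown in this section. Below is a minimal sketch of what it could look like, with the interface shape inferred from how the tests use it (Get feeds the repair loop, CreateOrUpdate captures the result into updated); the field names and layout are illustrative assumptions, not the canonical fixture.

// mockRangeRegistry is a sketch of the test double assumed by these tests.
// It hands back a canned RangeAllocation from Get and records whatever
// CreateOrUpdate persists so assertions can inspect it via `updated`.
type mockRangeRegistry struct {
	item *api.RangeAllocation // returned by Get
	err  error                // optional error returned by Get

	updated   *api.RangeAllocation // captured by CreateOrUpdate
	updateErr error                // optional error returned by CreateOrUpdate
}

func (r *mockRangeRegistry) Get() (*api.RangeAllocation, error) {
	return r.item, r.err
}

func (r *mockRangeRegistry) CreateOrUpdate(alloc *api.RangeAllocation) error {
	r.updated = alloc
	return r.updateErr
}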
func TestRepairWithExisting(t *testing.T) {
	_, cidr, _ := net.ParseCIDR("192.168.1.0/24")
	previous := ipallocator.NewCIDRRange(cidr)

	var dst api.RangeAllocation
	err := previous.Snapshot(&dst)
	if err != nil {
		t.Fatal(err)
	}

	registry := registrytest.NewServiceRegistry()
	registry.List = api.ServiceList{
		Items: []api.Service{
			{
				Spec: api.ServiceSpec{ClusterIP: "192.168.1.1"},
			},
			{
				Spec: api.ServiceSpec{ClusterIP: "192.168.1.100"},
			},
			{ // outside CIDR, will be dropped
				Spec: api.ServiceSpec{ClusterIP: "192.168.0.1"},
			},
			{ // empty, ignored
				Spec: api.ServiceSpec{ClusterIP: ""},
			},
			{ // duplicate, dropped
				Spec: api.ServiceSpec{ClusterIP: "192.168.1.1"},
			},
			{ // headless
				Spec: api.ServiceSpec{ClusterIP: "None"},
			},
		},
	}

	ipregistry := &mockRangeRegistry{
		item: &api.RangeAllocation{
			ObjectMeta: api.ObjectMeta{
				ResourceVersion: "1",
			},
			Range: dst.Range,
			Data:  dst.Data,
		},
	}
	r := NewRepair(0, registry, cidr, ipregistry)
	if err := r.RunOnce(); err != nil {
		t.Fatal(err)
	}
	after := ipallocator.NewCIDRRange(cidr)
	if err := after.Restore(cidr, ipregistry.updated.Data); err != nil {
		t.Fatal(err)
	}
	if !after.Has(net.ParseIP("192.168.1.1")) || !after.Has(net.ParseIP("192.168.1.100")) {
		t.Errorf("unexpected ipallocator state: %#v", after)
	}
	if after.Free() != 252 {
		t.Errorf("unexpected ipallocator state: %#v", after)
	}
}
func NewTestREST(t *testing.T, endpoints *api.EndpointsList) (*REST, *registrytest.ServiceRegistry) {
	registry := registrytest.NewServiceRegistry()
	endpointRegistry := &registrytest.EndpointRegistry{
		Endpoints: endpoints,
	}
	r := ipallocator.NewCIDRRange(makeIPNet(t))

	portRange := utilnet.PortRange{Base: 30000, Size: 1000}
	portAllocator := portallocator.NewPortAllocator(portRange)

	storage := NewStorage(registry, endpointRegistry, r, portAllocator, nil)

	return storage.Service, registry
}
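NewTestREST calls a makeIPNet helper that is not included in this section. A minimal sketch, under the assumption that it simply parses a fixed test CIDR; the exact range used by the real helper is not shown here.

// makeIPNet is a hypothetical stand-in for the helper referenced above; it
// parses a fixed CIDR for the test allocator and fails the test on error.
func makeIPNet(t *testing.T) *net.IPNet {
	_, ipnet, err := net.ParseCIDR("1.2.3.0/24") // assumed test range
	if err != nil {
		t.Fatal(err)
	}
	return ipnet
}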
func TestRepairLeak(t *testing.T) {
	_, cidr, _ := net.ParseCIDR("192.168.1.0/24")
	previous := ipallocator.NewCIDRRange(cidr)
	previous.Allocate(net.ParseIP("192.168.1.10"))

	var dst api.RangeAllocation
	err := previous.Snapshot(&dst)
	if err != nil {
		t.Fatal(err)
	}

	fakeClient := fake.NewSimpleClientset()
	ipregistry := &mockRangeRegistry{
		item: &api.RangeAllocation{
			ObjectMeta: metav1.ObjectMeta{
				ResourceVersion: "1",
			},
			Range: dst.Range,
			Data:  dst.Data,
		},
	}

	r := NewRepair(0, fakeClient.Core(), cidr, ipregistry)
	// Run through the "leak detection holdoff" loops.
	for i := 0; i < (numRepairsBeforeLeakCleanup - 1); i++ {
		if err := r.RunOnce(); err != nil {
			t.Fatal(err)
		}
		after, err := ipallocator.NewFromSnapshot(ipregistry.updated)
		if err != nil {
			t.Fatal(err)
		}
		if !after.Has(net.ParseIP("192.168.1.10")) {
			t.Errorf("expected ipallocator to still have leaked IP")
		}
	}
	// Run one more time to actually remove the leak.
	if err := r.RunOnce(); err != nil {
		t.Fatal(err)
	}
	after, err := ipallocator.NewFromSnapshot(ipregistry.updated)
	if err != nil {
		t.Fatal(err)
	}
	if after.Has(net.ParseIP("192.168.1.10")) {
		t.Errorf("expected ipallocator to not have leaked IP")
	}
}
// runOnce verifies the state of the cluster IP allocations and returns an error if an unrecoverable problem occurs.
func (c *Repair) runOnce() error {
	// TODO: (per smarterclayton) if Get() or ListServices() is a weak consistency read,
	// or if they are executed against different leaders,
	// the ordering guarantee required to ensure no IP is allocated twice is violated.
	// ListServices must return a ResourceVersion higher than the etcd index Get triggers,
	// and the release code must not release services that have had IPs allocated but not yet been created.
	// See #8295

	// If the etcd server is not running, we should wait for some time and fail only then. This is particularly
	// important when we start the apiserver and etcd at the same time.
	var latest *api.RangeAllocation
	var err error
	err = wait.PollImmediate(time.Second, 10*time.Second, func() (bool, error) {
		latest, err = c.alloc.Get()
		return err == nil, err
	})
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}

	ctx := api.WithNamespace(api.NewDefaultContext(), api.NamespaceAll)
	// We explicitly send no resource version, since the resource version
	// of 'latest' is from a different collection, it's not comparable to
	// the service collection. The caching layer keeps per-collection RVs,
	// and this is proper, since in theory the collections could be hosted
	// in separate etcd (or even non-etcd) instances.
	list, err := c.registry.ListServices(ctx, nil)
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}

	r := ipallocator.NewCIDRRange(c.network)
	for _, svc := range list.Items {
		if !api.IsServiceIPSet(&svc) {
			continue
		}
		ip := net.ParseIP(svc.Spec.ClusterIP)
		if ip == nil {
			// cluster IP is broken, reallocate
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s is not a valid IP; please recreate", svc.Spec.ClusterIP, svc.Name, svc.Namespace))
			continue
		}
		switch err := r.Allocate(ip); err {
		case nil:
		case ipallocator.ErrAllocated:
			// TODO: send event
			// cluster IP is broken, reallocate
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s was assigned to multiple services; please recreate", ip, svc.Name, svc.Namespace))
		case ipallocator.ErrNotInRange:
			// TODO: send event
			// cluster IP is broken, reallocate
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s is not within the service CIDR %s; please recreate", ip, svc.Name, svc.Namespace, c.network))
		case ipallocator.ErrFull:
			// TODO: send event
			return fmt.Errorf("the service CIDR %v is full; you must widen the CIDR in order to create new services", r)
		default:
			return fmt.Errorf("unable to allocate cluster IP %s for service %s/%s due to an unknown error, exiting: %v", ip, svc.Name, svc.Namespace, err)
		}
	}

	if err := r.Snapshot(latest); err != nil {
		return fmt.Errorf("unable to snapshot the updated service IP allocations: %v", err)
	}

	if err := c.alloc.CreateOrUpdate(latest); err != nil {
		if errors.IsConflict(err) {
			return err
		}
		return fmt.Errorf("unable to persist the updated service IP allocations: %v", err)
	}
	return nil
}
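The tests call the exported RunOnce, while the functions shown here are the unexported runOnce. A minimal sketch of how the exported entry points could wrap it, assuming a conflict-retry helper in the style of client-go's retry.RetryOnConflict and an interval field on Repair; this is illustrative, not the canonical wrapper.

// RunOnce is a sketch of the exported wrapper assumed by the tests: it retries
// runOnce when persisting the allocation hits a resource-version conflict.
func (c *Repair) RunOnce() error {
	return retry.RetryOnConflict(retry.DefaultBackoff, c.runOnce)
}

// RunUntil is a sketch of a periodic driver: it invokes RunOnce every
// c.interval until the stop channel is closed, reporting errors through the
// shared handler instead of aborting the loop.
func (c *Repair) RunUntil(stopCh chan struct{}) {
	wait.Until(func() {
		if err := c.RunOnce(); err != nil {
			runtime.HandleError(err)
		}
	}, c.interval, stopCh)
}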
func TestRepairWithExisting(t *testing.T) {
	_, cidr, _ := net.ParseCIDR("192.168.1.0/24")
	previous := ipallocator.NewCIDRRange(cidr)

	var dst api.RangeAllocation
	err := previous.Snapshot(&dst)
	if err != nil {
		t.Fatal(err)
	}

	fakeClient := fake.NewSimpleClientset(
		&api.Service{
			ObjectMeta: api.ObjectMeta{Namespace: "one", Name: "one"},
			Spec:       api.ServiceSpec{ClusterIP: "192.168.1.1"},
		},
		&api.Service{
			ObjectMeta: api.ObjectMeta{Namespace: "two", Name: "two"},
			Spec:       api.ServiceSpec{ClusterIP: "192.168.1.100"},
		},
		&api.Service{ // outside CIDR, will be dropped
			ObjectMeta: api.ObjectMeta{Namespace: "three", Name: "three"},
			Spec:       api.ServiceSpec{ClusterIP: "192.168.0.1"},
		},
		&api.Service{ // empty, ignored
			ObjectMeta: api.ObjectMeta{Namespace: "four", Name: "four"},
			Spec:       api.ServiceSpec{ClusterIP: ""},
		},
		&api.Service{ // duplicate, dropped
			ObjectMeta: api.ObjectMeta{Namespace: "five", Name: "five"},
			Spec:       api.ServiceSpec{ClusterIP: "192.168.1.1"},
		},
		&api.Service{ // headless
			ObjectMeta: api.ObjectMeta{Namespace: "six", Name: "six"},
			Spec:       api.ServiceSpec{ClusterIP: "None"},
		},
	)

	ipregistry := &mockRangeRegistry{
		item: &api.RangeAllocation{
			ObjectMeta: api.ObjectMeta{
				ResourceVersion: "1",
			},
			Range: dst.Range,
			Data:  dst.Data,
		},
	}
	r := NewRepair(0, fakeClient.Core(), cidr, ipregistry)
	if err := r.RunOnce(); err != nil {
		t.Fatal(err)
	}
	after := ipallocator.NewCIDRRange(cidr)
	if err := after.Restore(cidr, ipregistry.updated.Data); err != nil {
		t.Fatal(err)
	}
	if !after.Has(net.ParseIP("192.168.1.1")) || !after.Has(net.ParseIP("192.168.1.100")) {
		t.Errorf("unexpected ipallocator state: %#v", after)
	}
	if after.Free() != 252 {
		t.Errorf("unexpected ipallocator state: %#v", after)
	}
}
// runOnce verifies the state of the cluster IP allocations and returns an error if an unrecoverable problem occurs.
func (c *Repair) runOnce() error {
	// TODO: (per smarterclayton) if Get() or ListServices() is a weak consistency read,
	// or if they are executed against different leaders,
	// the ordering guarantee required to ensure no IP is allocated twice is violated.
	// ListServices must return a ResourceVersion higher than the etcd index Get triggers,
	// and the release code must not release services that have had IPs allocated but not yet been created.
	// See #8295

	// If the etcd server is not running, we should wait for some time and fail only then. This is particularly
	// important when we start the apiserver and etcd at the same time.
	var snapshot *api.RangeAllocation
	err := wait.PollImmediate(time.Second, 10*time.Second, func() (bool, error) {
		var err error
		snapshot, err = c.alloc.Get()
		return err == nil, err
	})
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}
	// If not yet initialized.
	if snapshot.Range == "" {
		snapshot.Range = c.network.String()
	}
	// Create an allocator because it is easy to use.
	stored, err := ipallocator.NewFromSnapshot(snapshot)
	if err != nil {
		return fmt.Errorf("unable to rebuild allocator from snapshot: %v", err)
	}

	// We explicitly send no resource version, since the resource version
	// of 'snapshot' is from a different collection, it's not comparable to
	// the service collection. The caching layer keeps per-collection RVs,
	// and this is proper, since in theory the collections could be hosted
	// in separate etcd (or even non-etcd) instances.
	list, err := c.serviceClient.Services(api.NamespaceAll).List(api.ListOptions{})
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}

	rebuilt := ipallocator.NewCIDRRange(c.network)
	// Check every Service's ClusterIP, and rebuild the state as we think it should be.
	for _, svc := range list.Items {
		if !api.IsServiceIPSet(&svc) {
			// didn't need a cluster IP
			continue
		}
		ip := net.ParseIP(svc.Spec.ClusterIP)
		if ip == nil {
			// cluster IP is corrupt
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s is not a valid IP; please recreate", svc.Spec.ClusterIP, svc.Name, svc.Namespace))
			continue
		}
		// mark it as in-use
		switch err := rebuilt.Allocate(ip); err {
		case nil:
			if stored.Has(ip) {
				// remove it from the old set, so we can find leaks
				stored.Release(ip)
			} else {
				// cluster IP doesn't seem to be allocated
				runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s is not allocated; repairing", svc.Spec.ClusterIP, svc.Name, svc.Namespace))
			}
			delete(c.leaks, ip.String()) // it is used, so it can't be leaked
		case ipallocator.ErrAllocated:
			// TODO: send event
			// cluster IP is duplicate
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s was assigned to multiple services; please recreate", ip, svc.Name, svc.Namespace))
		case ipallocator.ErrNotInRange:
			// TODO: send event
			// cluster IP is out of range
			runtime.HandleError(fmt.Errorf("the cluster IP %s for service %s/%s is not within the service CIDR %s; please recreate", ip, svc.Name, svc.Namespace, c.network))
		case ipallocator.ErrFull:
			// TODO: send event
			// somehow we are out of IPs
			return fmt.Errorf("the service CIDR %v is full; you must widen the CIDR in order to create new services", rebuilt)
		default:
			return fmt.Errorf("unable to allocate cluster IP %s for service %s/%s due to an unknown error, exiting: %v", ip, svc.Name, svc.Namespace, err)
		}
	}

	// Check for IPs that are left in the old set. They appear to have been leaked.
	stored.ForEach(func(ip net.IP) {
		count, found := c.leaks[ip.String()]
		switch {
		case !found:
			// flag it to be cleaned up after any races (hopefully) are gone
			runtime.HandleError(fmt.Errorf("the cluster IP %s may have leaked: flagging for later clean up", ip))
			count = numRepairsBeforeLeakCleanup - 1
			fallthrough
		case count > 0:
			// pretend it is still in use until count expires
			c.leaks[ip.String()] = count - 1
			if err := rebuilt.Allocate(ip); err != nil {
				runtime.HandleError(fmt.Errorf("the cluster IP %s may have leaked, but can not be allocated: %v", ip, err))
			}
		default:
			// do not add it to the rebuilt set, which means it will be available for reuse
			runtime.HandleError(fmt.Errorf("the cluster IP %s appears to have leaked: cleaning up", ip))
		}
	})

	// Blast the rebuilt state into storage.
	if err := rebuilt.Snapshot(snapshot); err != nil {
		return fmt.Errorf("unable to snapshot the updated service IP allocations: %v", err)
	}
	if err := c.alloc.CreateOrUpdate(snapshot); err != nil {
		if errors.IsConflict(err) {
			return err
		}
		return fmt.Errorf("unable to persist the updated service IP allocations: %v", err)
	}
	return nil
}
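The leak holdoff above only releases an IP after it has stayed unreferenced across several consecutive repair passes. Below is a small, self-contained sketch of that countdown in isolation, assuming numRepairsBeforeLeakCleanup is 3; the real constant is defined alongside the Repair type and may differ.

package main

import "fmt"

// Assumed value for illustration only; the real constant lives with the Repair type.
const numRepairsBeforeLeakCleanup = 3

func main() {
	leaks := map[string]int{} // IP -> remaining passes it is still treated as in use
	ip := "192.168.1.10"
	for pass := 1; pass <= numRepairsBeforeLeakCleanup; pass++ {
		held := true
		count, found := leaks[ip]
		switch {
		case !found:
			// first time we notice the leak: start the countdown
			count = numRepairsBeforeLeakCleanup - 1
			fallthrough
		case count > 0:
			// keep pretending the IP is allocated
			leaks[ip] = count - 1
		default:
			// countdown expired: the IP is finally released for reuse
			held = false
		}
		fmt.Printf("pass %d: held=%v\n", pass, held)
		// prints: pass 1: held=true, pass 2: held=true, pass 3: held=false
	}
}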