func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) { By("getting list of nodes") nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything()) expectNoError(err) var errors []error retries := maxRetries for { errors = []error{} for _, node := range nodeList.Items { // cadvisor is not accessible directly unless its port (4194 by default) is exposed. // Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally. statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name) By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource)) _, err = c.Get().AbsPath(statsResource).Timeout(timeout).Do().Raw() if err != nil { errors = append(errors, err) } } if len(errors) == 0 { return } if retries--; retries <= 0 { break } Logf("failed to retrieve kubelet stats -\n %v", errors) time.Sleep(sleepDuration) } Failf("Failed after retrying %d times for cadvisor to be healthy on all nodes. Errors:\n%v", maxRetries, errors) }
func pickNode(c *client.Client) (string, error) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return "", err } if len(nodes.Items) == 0 { return "", fmt.Errorf("no nodes exist, can't test node proxy") } return nodes.Items[0].Name, nil }
func getAllNodesInCluster(c *client.Client) ([]string, error) { nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return nil, err } result := []string{} for _, node := range nodeList.Items { result = append(result, node.Name) } return result, nil }
func getNodePublicIps(c *client.Client) ([]string, error) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return nil, err } ips := collectAddresses(nodes, api.NodeExternalIP) if len(ips) == 0 { ips = collectAddresses(nodes, api.NodeLegacyHostIP) } return ips, nil }
func clearDaemonSetNodeLabels(c *client.Client) error { nodeClient := c.Nodes() nodeList, err := nodeClient.List(labels.Everything(), fields.Everything()) if err != nil { return err } for _, node := range nodeList.Items { _, err := setDaemonSetNodeLabels(c, node.Name, map[string]string{}) if err != nil { return err } } return nil }
func setDaemonSetNodeLabels(c *client.Client, nodeName string, labels map[string]string) (*api.Node, error) { nodeClient := c.Nodes() var newNode *api.Node var newLabels map[string]string err := wait.Poll(updateRetryPeriod, updateRetryTimeout, func() (bool, error) { node, err := nodeClient.Get(nodeName) if err != nil { return false, err } // remove all labels this test is creating daemonSetLabels, otherLabels := separateDaemonSetNodeLabels(node.Labels) if reflect.DeepEqual(daemonSetLabels, labels) { newNode = node return true, nil } node.Labels = otherLabels for k, v := range labels { node.Labels[k] = v } newNode, err = nodeClient.Update(node) if err == nil { newLabels, _ = separateDaemonSetNodeLabels(newNode.Labels) return true, err } if se, ok := err.(*apierrs.StatusError); ok && se.ErrStatus.Reason == unversioned.StatusReasonConflict { Logf("failed to update node due to resource version conflict") return false, nil } return false, err }) if err != nil { return nil, err } else if len(newLabels) != len(labels) { return nil, fmt.Errorf("Could not set daemon set test labels as expected.") } return newNode, nil }
} var _ = Describe("SchedulerPredicates", func() { var c *client.Client var nodeList *api.NodeList var nodeCount int var totalPodCapacity int64 var RCName string var ns string var uuid string BeforeEach(func() { var err error c, err = loadClient() expectNoError(err) nodeList, err = c.Nodes().List(labels.Everything(), fields.Everything()) expectNoError(err) nodeCount = len(nodeList.Items) Expect(nodeCount).NotTo(BeZero()) err = checkTestingNSDeletedExcept(c, "") expectNoError(err) nsForTesting, err := createTestingNS("sched-pred", c) ns = nsForTesting.Name expectNoError(err) uuid = string(util.NewUUID()) }) AfterEach(func() { rc, err := c.ReplicationControllers(ns).Get(RCName)
func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore cache.Store) { goodCondition := api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionTrue, Reason: fmt.Sprintf("schedulable condition"), LastHeartbeatTime: unversioned.Time{time.Now()}, } badCondition := api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionUnknown, Reason: fmt.Sprintf("unschedulable condition"), LastHeartbeatTime: unversioned.Time{time.Now()}, } // Create a new schedulable node, since we're first going to apply // the unschedulable condition and verify that pods aren't scheduled. node := &api.Node{ ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-node"}, Spec: api.NodeSpec{Unschedulable: false}, Status: api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{goodCondition}, }, } nodeKey, err := cache.MetaNamespaceKeyFunc(node) if err != nil { t.Fatalf("Couldn't retrieve key for node %v", node.Name) } // The test does the following for each nodeStateManager in this list: // 1. Create a new node // 2. Apply the makeUnSchedulable function // 3. Create a new pod // 4. Check that the pod doesn't get assigned to the node // 5. Apply the schedulable function // 6. Check that the pod *does* get assigned to the node // 7. Delete the pod and node. nodeModifications := []nodeStateManager{ // Test node.Spec.Unschedulable=true/false { makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Spec.Unschedulable = true if _, err := c.Nodes().Update(n); err != nil { t.Fatalf("Failed to update node with unschedulable=true: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { // An unschedulable node should get deleted from the store return node == nil }) if err != nil { t.Fatalf("Failed to observe reflected update for setting unschedulable=true: %v", err) } }, makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Spec.Unschedulable = false if _, err := c.Nodes().Update(n); err != nil { t.Fatalf("Failed to update node with unschedulable=false: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { return node != nil && node.(*api.Node).Spec.Unschedulable == false }) if err != nil { t.Fatalf("Failed to observe reflected update for setting unschedulable=false: %v", err) } }, }, // Test node.Status.Conditions=ConditionTrue/Unknown { makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Status = api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{badCondition}, } if _, err = c.Nodes().UpdateStatus(n); err != nil { t.Fatalf("Failed to update node with bad status condition: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { return node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionUnknown }) if err != nil { t.Fatalf("Failed to observe reflected update for status condition update: %v", err) } }, makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Status = api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{goodCondition}, } if _, err = c.Nodes().UpdateStatus(n); err != nil { t.Fatalf("Failed to update node with healthy status condition: %v", err) } waitForReflection(s, nodeKey, func(node interface{}) bool { return node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionTrue }) if err != nil { t.Fatalf("Failed to observe reflected update for status condition update: %v", err) } }, }, } for i, mod := range nodeModifications { unSchedNode, err := restClient.Nodes().Create(node) if err != nil { t.Fatalf("Failed to create node: %v", err) } // Apply the unschedulable modification to the node, and wait for the reflection mod.makeUnSchedulable(t, unSchedNode, nodeStore, restClient) // Create the new pod, note that this needs to happen post unschedulable // modification or we have a race in the test. pod := &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-pod"}, Spec: api.PodSpec{ Containers: []api.Container{{Name: "container", Image: "kubernetes/pause:go"}}, }, } myPod, err := restClient.Pods(api.NamespaceDefault).Create(pod) if err != nil { t.Fatalf("Failed to create pod: %v", err) } // There are no schedulable nodes - the pod shouldn't be scheduled. err = wait.Poll(time.Second, util.ForeverTestTimeout, podScheduled(restClient, myPod.Namespace, myPod.Name)) if err == nil { t.Errorf("Pod scheduled successfully on unschedulable nodes") } if err != wait.ErrWaitTimeout { t.Errorf("Test %d: failed while trying to confirm the pod does not get scheduled on the node: %v", i, err) } else { t.Logf("Test %d: Pod did not get scheduled on an unschedulable node", i) } // Apply the schedulable modification to the node, and wait for the reflection schedNode, err := restClient.Nodes().Get(unSchedNode.Name) if err != nil { t.Fatalf("Failed to get node: %v", err) } mod.makeSchedulable(t, schedNode, nodeStore, restClient) // Wait until the pod is scheduled. err = wait.Poll(time.Second, util.ForeverTestTimeout, podScheduled(restClient, myPod.Namespace, myPod.Name)) if err != nil { t.Errorf("Test %d: failed to schedule a pod: %v", i, err) } else { t.Logf("Test %d: Pod got scheduled on a schedulable node", i) } err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, api.NewDeleteOptions(0)) if err != nil { t.Errorf("Failed to delete pod: %v", err) } err = restClient.Nodes().Delete(schedNode.Name) if err != nil { t.Errorf("Failed to delete node: %v", err) } } }
// rebootNode takes node name on provider through the following steps using c: // - ensures the node is ready // - ensures all pods on the node are running and ready // - reboots the node (by executing rebootCmd over ssh) // - ensures the node reaches some non-ready state // - ensures the node becomes ready again // - ensures all pods on the node become running and ready again // // It returns true through result only if all of the steps pass; at the first // failed step, it will return false through result and not run the rest. func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { // Setup ns := api.NamespaceSystem ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) defer ps.Stop() // Get the node initially. Logf("Getting %s", name) node, err := c.Nodes().Get(name) if err != nil { Logf("Couldn't get node %s", name) result <- false return } // Node sanity check: ensure it is "ready". if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) { result <- false return } // Get all the pods on the node that don't have liveness probe set. // Liveness probe may cause restart of a pod during node reboot, and the pod may not be running. pods := ps.List() podNames := []string{} for _, p := range pods { probe := false for _, c := range p.Spec.Containers { if c.LivenessProbe != nil { probe = true break } } if !probe { podNames = append(podNames, p.ObjectMeta.Name) } } Logf("Node %s has %d pods: %v", name, len(podNames), podNames) // For each pod, we do a sanity check to ensure it's running / healthy // now, as that's what we'll be checking later. if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) { result <- false return } // Reboot the node. if err = issueSSHCommand(node, provider, rebootCmd); err != nil { Logf("Error while issuing ssh command: %v", err) result <- false return } // Wait for some kind of "not ready" status. if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) { result <- false return } // Wait for some kind of "ready" status. if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) { result <- false return } // Ensure all of the pods that we found on this node before the reboot are // running / healthy. if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) { result <- false return } Logf("Reboot successful on node %s", name) result <- true }