// checkNodesReady waits up to nt for expect nodes accessed by c to be ready,
// returning an error if this doesn't happen in time. It returns the names of
// nodes it finds.
func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) {
	// First, keep getting all of the nodes until we get the number we expect.
	var nodeList *api.NodeList
	var errLast error
	start := time.Now()
	found := wait.Poll(framework.Poll, nt, func() (bool, error) {
		// A rolling-update (the GCE/GKE implementation of restart) can complete
		// before the apiserver knows about all of the nodes. Thus, we retry the
		// list-nodes call until we get the expected number of nodes.
		nodeList, errLast = c.Nodes().List(api.ListOptions{
			FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector()})
		if errLast != nil {
			return false, nil
		}
		if len(nodeList.Items) != expect {
			errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
				expect, len(nodeList.Items), time.Since(start))
			framework.Logf("%v", errLast)
			return false, nil
		}
		return true, nil
	}) == nil
	nodeNames := make([]string, len(nodeList.Items))
	for i, n := range nodeList.Items {
		nodeNames[i] = n.ObjectMeta.Name
	}
	if !found {
		return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
			expect, nt, errLast)
	}
	framework.Logf("Successfully found %d nodes", expect)

	// Next, ensure in parallel that all the nodes are ready. We subtract the
	// time we spent waiting above.
	timeout := nt - time.Since(start)
	result := make(chan bool, len(nodeList.Items))
	for _, n := range nodeNames {
		n := n
		go func() { result <- framework.WaitForNodeToBeReady(c, n, timeout) }()
	}
	failed := false
	// TODO(mbforbes): Change to `for range` syntax once we support only Go
	// >= 1.4.
	for _ = range nodeList.Items {
		if !<-result {
			failed = true
		}
	}
	if failed {
		return nodeNames, fmt.Errorf("at least one node failed to be ready")
	}
	return nodeNames, nil
}
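// Usage sketch (illustrative, not part of the original file): after a cluster
// restart has been triggered, checkNodesReady could be driven roughly like
// this. The waitForClusterBackUp name and the restartPollTimeout constant are
// assumptions introduced for the example.
func waitForClusterBackUp(c *client.Client, numNodes int) ([]string, error) {
	const restartPollTimeout = 10 * time.Minute // assumed budget for the whole check
	nodeNames, err := checkNodesReady(c, restartPollTimeout, numNodes)
	if err != nil {
		return nil, fmt.Errorf("cluster did not come back up: %v", err)
	}
	framework.Logf("All %d nodes are ready: %v", numNodes, nodeNames)
	return nodeNames, nil
}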
// kubeletCommand performs `start`, `restart`, or `stop` on the kubelet running
// on the node of the target pod. The allowed kubeletOpts are `kStart`, `kStop`,
// and `kRestart`.
func kubeletCommand(kOp kubeletOpt, c clientset.Interface, pod *v1.Pod) {
	nodeIP, err := framework.GetHostExternalAddress(c, pod)
	Expect(err).NotTo(HaveOccurred())
	nodeIP = nodeIP + ":22"
	sshResult, err := framework.SSH("sudo /etc/init.d/kubelet "+string(kOp), nodeIP, framework.TestContext.Provider)
	Expect(err).NotTo(HaveOccurred())
	framework.LogSSHResult(sshResult)

	// On stop and restart, wait for the node to report NotReady first. This
	// avoids a race in which the node takes a few moments to leave the Ready
	// state, which would otherwise let WaitForNodeToBeReady() return
	// immediately against the stale Ready condition.
	if kOp == kStop || kOp == kRestart {
		if ok := framework.WaitForNodeToBeNotReady(c, pod.Spec.NodeName, NodeStateTimeout); !ok {
			framework.Failf("Node %s failed to enter NotReady state", pod.Spec.NodeName)
		}
	}
	if kOp == kStart || kOp == kRestart {
		if ok := framework.WaitForNodeToBeReady(c, pod.Spec.NodeName, NodeStateTimeout); !ok {
			framework.Failf("Node %s failed to enter Ready state", pod.Spec.NodeName)
		}
	}
}
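// Usage sketch (illustrative, not from the original file): bounce the kubelet
// under a running pod, optionally doing some verification while the node is
// down. The withKubeletStopped helper and its checkWhileDown parameter are
// hypothetical names introduced for this example.
func withKubeletStopped(c clientset.Interface, pod *v1.Pod, checkWhileDown func()) {
	kubeletCommand(kStop, c, pod)        // blocks until the node reports NotReady
	defer kubeletCommand(kStart, c, pod) // blocks until the node is Ready again
	checkWhileDown()
}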
// rebootNode takes the named node on the given provider through the following
// steps using c:
//  - ensures the node is ready
//  - ensures all pods on the node are running and ready
//  - reboots the node (by executing rebootCmd over ssh)
//  - ensures the node reaches some non-ready state
//  - ensures the node becomes ready again
//  - ensures all pods on the node become running and ready again
//
// It returns true only if all of the steps pass; at the first failed step, it
// returns false and does not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
	// Setup
	ns := api.NamespaceSystem
	ps := framework.NewPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(api.PodHostField, name))
	defer ps.Stop()

	// Get the node initially.
	framework.Logf("Getting %s", name)
	node, err := c.Nodes().Get(name)
	if err != nil {
		framework.Logf("Couldn't get node %s", name)
		return false
	}

	// Node sanity check: ensure it is "ready".
	if !framework.WaitForNodeToBeReady(c, name, framework.NodeReadyInitialTimeout) {
		return false
	}

	// Get all the pods on the node that don't have a liveness probe set.
	// A liveness probe may restart a pod during the node reboot, in which case
	// the pod may not be running afterwards.
	pods := ps.List()
	podNames := []string{}
	for _, p := range pods {
		probe := false
		for _, c := range p.Spec.Containers {
			if c.LivenessProbe != nil {
				probe = true
				break
			}
		}
		if !probe {
			podNames = append(podNames, p.ObjectMeta.Name)
		}
	}
	framework.Logf("Node %s has %d assigned pods with no liveness probes: %v", name, len(podNames), podNames)

	// For each pod, do a sanity check now to ensure it's running / healthy or
	// succeeded, as that's what we'll be checking again later.
	if !framework.CheckPodsRunningReadyOrSucceeded(c, ns, podNames, framework.PodReadyBeforeTimeout) {
		printStatusAndLogsForNotReadyPods(c, ns, podNames, pods)
		return false
	}

	// Reboot the node.
	if err = framework.IssueSSHCommand(rebootCmd, provider, node); err != nil {
		framework.Logf("Error while issuing ssh command: %v", err)
		return false
	}

	// Wait for some kind of "not ready" status.
	if !framework.WaitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
		return false
	}

	// Wait for some kind of "ready" status.
	if !framework.WaitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
		return false
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy, or succeeded.
	if !framework.CheckPodsRunningReadyOrSucceeded(c, ns, podNames, rebootPodReadyAgainTimeout) {
		newPods := ps.List()
		printStatusAndLogsForNotReadyPods(c, ns, podNames, newPods)
		return false
	}

	framework.Logf("Reboot successful on node %s", name)
	return true
}
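// Parallel-driver sketch (illustrative; the rebootAllNodes name is an
// assumption, not part of the original test): run rebootNode against a set of
// nodes concurrently and report whether every node survived its reboot.
func rebootAllNodes(c *client.Client, provider string, nodeNames []string, rebootCmd string) bool {
	results := make(chan bool, len(nodeNames))
	for _, name := range nodeNames {
		name := name
		go func() { results <- rebootNode(c, provider, name, rebootCmd) }()
	}
	allOK := true
	for range nodeNames {
		if !<-results {
			allOK = false
		}
	}
	return allOK
}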
diskName, err := createPDWithRetry()
framework.ExpectNoError(err, "Error creating a pd")
host0Pod := testPDPod([]string{diskName}, host0Name, false, 1)
originalCount := len(nodes.Items)
containerName := "mycontainer"
nodeToDelete := &nodes.Items[0]
defer func() {
	By("Cleaning up PD-RW test env")
	detachAndDeletePDs(diskName, []types.NodeName{host0Name})
	// Need to reset the resource version or else the Create() fails.
	nodeToDelete.ObjectMeta.SetResourceVersion("0")
	_, err := nodeClient.Create(nodeToDelete)
	framework.ExpectNoError(err, "Unable to re-create the deleted node")
	framework.ExpectNoError(WaitForGroupSize(framework.TestContext.CloudConfig.NodeInstanceGroup, int32(initialGroupSize)),
		"Unable to get the node group back to the original size")
	framework.WaitForNodeToBeReady(f.ClientSet, nodeToDelete.Name, nodeStatusTimeout)
	if len(nodes.Items) != originalCount {
		framework.Failf("The node count is not back to the original count")
	}
}()

By("submitting host0Pod to kubernetes")
_, err = podClient.Create(host0Pod)
framework.ExpectNoError(err, fmt.Sprintf("Failed to create host0pod: %v", err))
framework.ExpectNoError(f.WaitForPodRunningSlow(host0Pod.Name))

testFile := "/testpd1/tracker"
testFileContents := fmt.Sprintf("%v", mathrand.Int())
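// Continuation sketch (illustrative, not part of the snippet above): write a
// marker file onto the mounted PD so its contents can be verified after the
// node holding the disk is deleted and re-created. The f.WriteFileViaContainer
// helper and its signature are assumed from the surrounding e2e framework.
framework.ExpectNoError(f.WriteFileViaContainer(host0Pod.Name, containerName, testFile, testFileContents))
framework.Logf("Wrote %q to %s on PD %q via pod %q", testFileContents, testFile, diskName, host0Pod.Name)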
Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding") By("choose a node with at least one pod - we will block some network traffic on this node") label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) options := api.ListOptions{LabelSelector: label} pods, err := c.Pods(ns).List(options) // list pods after all have been scheduled Expect(err).NotTo(HaveOccurred()) nodeName := pods.Items[0].Spec.NodeName node, err := c.Nodes().Get(nodeName) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("block network traffic from node %s", node.Name)) performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node) framework.Logf("Waiting %v for node %s to be ready once temporary network failure ends", resizeNodeReadyTimeout, node.Name) if !framework.WaitForNodeToBeReady(c, node.Name, resizeNodeReadyTimeout) { framework.Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout) } // sleep a bit, to allow Watch in NodeController to catch up. time.Sleep(5 * time.Second) By("verify whether new pods can be created on the re-attached node") // increasing the RC size is not a valid way to test this // since we have no guarantees the pod will be scheduled on our node. additionalPod := "additionalpod" err = newPodOnNode(c, ns, additionalPod, node.Name) Expect(err).NotTo(HaveOccurred()) err = framework.VerifyPods(c, ns, additionalPod, true, 1) Expect(err).NotTo(HaveOccurred())