func waitForClusterSize(c *client.Client, size int) error { timeout := 4 * time.Minute if providerIs("aws") { // AWS is not as fast as gce/gke at having nodes come online timeout = 10 * time.Minute } for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { Logf("Failed to list nodes: %v", err) continue } // Filter out not-ready nodes. filterNodes(nodes, func(node api.Node) bool { return isNodeReadySetAsExpected(&node, true) }) if len(nodes.Items) == size { Logf("Cluster has reached the desired size %d", size) return nil } Logf("Waiting for cluster size %d, current size %d", size, len(nodes.Items)) } return fmt.Errorf("timeout waiting for cluster size to be %d", size) }
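// filterNodes and isNodeReadySetAsExpected are used above but not shown in this excerpt. A minimal sketch of what they might look like, assuming the same api.Node/api.NodeList types used throughout this file (an illustration, not the actual upstream implementation):
func filterNodes(nodeList *api.NodeList, fn func(node api.Node) bool) {
	// Keep only the nodes for which fn returns true.
	var filtered []api.Node
	for _, node := range nodeList.Items {
		if fn(node) {
			filtered = append(filtered, node)
		}
	}
	nodeList.Items = filtered
}

func isNodeReadySetAsExpected(node *api.Node, wantReady bool) bool {
	// Find the NodeReady condition and compare it against the expectation.
	for _, cond := range node.Status.Conditions {
		if cond.Type == api.NodeReady {
			return (cond.Status == api.ConditionTrue) == wantReady
		}
	}
	// No NodeReady condition reported; treat the node as not ready.
	return !wantReady
}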
func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) { By("getting list of nodes") nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything()) expectNoError(err) var errors []error retries := maxRetries for { errors = []error{} for _, node := range nodeList.Items { // cadvisor is not accessible directly unless its port (4194 by default) is exposed. // Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally. statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name) By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource)) _, err = c.Get().AbsPath(statsResource).Timeout(timeout).Do().Raw() if err != nil { errors = append(errors, err) } } if len(errors) == 0 { return } if retries--; retries <= 0 { break } Logf("failed to retrieve kubelet stats -\n %v", errors) time.Sleep(sleepDuration) } Failf("Failed after retrying %d times for cadvisor to be healthy on all nodes. Errors:\n%v", maxRetries, errors) }
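// A hypothetical Ginkgo spec showing how CheckCadvisorHealthOnAllNodes might be wired up; the "Cadvisor" description and the 5-minute timeout are assumptions, while loadClient/expectNoError mirror the other specs in this file:
var _ = Describe("Cadvisor", func() {
	var c *client.Client
	BeforeEach(func() {
		var err error
		c, err = loadClient()
		expectNoError(err)
	})
	It("should be healthy on every node.", func() {
		CheckCadvisorHealthOnAllNodes(c, 5*time.Minute)
	})
})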
func watchNodes(client *client.Client) { nodeList, err := client.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { log.Fatal(err) } nodes := nodeList.Items writeNodeTargetsFile(nodes) watcher, err := client.Nodes().Watch(labels.Everything(), fields.Everything(), nodeList.ResourceVersion) if err != nil { log.Fatal(err) } for event := range watcher.ResultChan() { switch event.Type { case watch.Added: switch obj := event.Object.(type) { case *api.Node: nodes = append(nodes, *obj) } writeNodeTargetsFile(nodes) case watch.Deleted: switch obj := event.Object.(type) { case *api.Node: index := findNodeIndexInSlice(nodes, obj) nodes = append(nodes[:index], nodes[index+1:]...) } writeNodeTargetsFile(nodes) } } }
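// findNodeIndexInSlice is referenced above but not defined here. A sketch of a straightforward implementation that matches nodes by name (an assumption); note that watchNodes above does not guard against a node missing from the slice, so a -1 return would need handling there as well.
func findNodeIndexInSlice(nodes []api.Node, node *api.Node) int {
	for i := range nodes {
		if nodes[i].Name == node.Name {
			return i
		}
	}
	// Not found.
	return -1
}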
// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error // if it can't find an external IP for every node, though it still returns all // hosts that it found in that case. func NodeSSHHosts(c *client.Client) ([]string, error) { var hosts []string nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return hosts, fmt.Errorf("error getting nodes: %v", err) } for _, n := range nodelist.Items { for _, addr := range n.Status.Addresses { // Use the first external IP address we find on the node, and // use at most one per node. // TODO(mbforbes): Use the "preferred" address for the node, once // such a thing is defined (#2462). if addr.Type == api.NodeExternalIP { hosts = append(hosts, addr.Address+":22") break } } } // Error if any node didn't have an external IP. if len(hosts) != len(nodelist.Items) { return hosts, fmt.Errorf( "only found %d external IPs on nodes, but found %d nodes. Nodelist: %v", len(hosts), len(nodelist.Items), nodelist) } return hosts, nil }
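// A usage sketch for NodeSSHHosts: run a command against every SSH-able node. The runSSHCommand helper is hypothetical and stands in for whatever SSH plumbing the caller has; it is not part of the function above.
func runOnAllNodes(c *client.Client, cmd string) error {
	hosts, err := NodeSSHHosts(c)
	if err != nil {
		return err
	}
	for _, host := range hosts {
		// host is of the form "1.2.3.4:22".
		if out, err := runSSHCommand(cmd, host); err != nil {
			return fmt.Errorf("command %q failed on %s: %v (output: %q)", cmd, host, err, out)
		}
	}
	return nil
}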
func pickNode(c *client.Client) (string, error) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return "", err } if len(nodes.Items) == 0 { return "", fmt.Errorf("no nodes exist, can't test node proxy") } return nodes.Items[0].Name, nil }
func getAllNodesInCluster(c *client.Client) ([]string, error) { nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return nil, err } result := []string{} for _, node := range nodeList.Items { result = append(result, node.Name) } return result, nil }
func getMinionPublicIps(c *client.Client) ([]string, error) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { return nil, err } ips := collectAddresses(nodes, api.NodeExternalIP) if len(ips) == 0 { ips = collectAddresses(nodes, api.NodeLegacyHostIP) } return ips, nil }
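// collectAddresses is used above but not shown. A minimal sketch, assuming it simply flattens the addresses of the requested type across all nodes in the list:
func collectAddresses(nodes *api.NodeList, addressType api.NodeAddressType) []string {
	ips := []string{}
	for _, node := range nodes.Items {
		for _, addr := range node.Status.Addresses {
			if addr.Type == addressType {
				ips = append(ips, addr.Address)
			}
		}
	}
	return ips
}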
func waitForClusterSize(c *client.Client, size int) error { for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(20 * time.Second) { nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { Logf("Failed to list nodes: %v", err) continue } if len(nodes.Items) == size { Logf("Cluster has reached the desired size %d", size) return nil } Logf("Waiting for cluster size %d, current size %d", size, len(nodes.Items)) } return fmt.Errorf("timeout waiting for cluster size to be %d", size) }
func DoTestUnschedulableNodes(t *testing.T, client *client.Client) { node := &api.Node{ ObjectMeta: api.ObjectMeta{Name: "node"}, Spec: api.NodeSpec{Unschedulable: true}, } if _, err := client.Nodes().Create(node); err != nil { t.Fatalf("Failed to create node: %v", err) } pod := &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "my-pod"}, Spec: api.PodSpec{ Containers: []api.Container{{Name: "container", Image: "kubernetes/pause:go"}}, }, } myPod, err := client.Pods(api.NamespaceDefault).Create(pod) if err != nil { t.Fatalf("Failed to create pod: %v", err) } // There are no schedulable nodes - the pod shouldn't be scheduled. err = wait.Poll(time.Second, time.Second*10, podScheduled(client, myPod.Namespace, myPod.Name)) if err == nil { t.Errorf("Pod scheduled successfully on unschedulable nodes") } if err != wait.ErrWaitTimeout { t.Errorf("Failed while waiting for scheduled pod: %v", err) } // Make the node schedulable and wait until the pod is scheduled. newNode, err := client.Nodes().Get(node.Name) if err != nil { t.Fatalf("Failed to get node: %v", err) } newNode.Spec.Unschedulable = false if _, err = client.Nodes().Update(newNode); err != nil { t.Fatalf("Failed to update node: %v", err) } err = wait.Poll(time.Second, time.Second*10, podScheduled(client, myPod.Namespace, myPod.Name)) if err != nil { t.Errorf("Failed to schedule a pod: %v", err) } err = client.Pods(api.NamespaceDefault).Delete(myPod.Name) if err != nil { t.Errorf("Failed to delete pod: %v", err) } }
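// podScheduled is the wait.Poll condition used above but is not defined in this excerpt. A sketch under the assumption that it reports success once the pod has been bound to a node, i.e. Spec.NodeName is non-empty:
func podScheduled(c *client.Client, podNamespace, podName string) wait.ConditionFunc {
	return func() (bool, error) {
		pod, err := c.Pods(podNamespace).Get(podName)
		if err != nil {
			// Treat lookup errors as "not scheduled yet" and keep polling.
			return false, nil
		}
		return pod.Spec.NodeName != "", nil
	}
}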
func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore cache.Store) { goodCondition := api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionTrue, Reason: "schedulable condition", LastHeartbeatTime: util.Time{time.Now()}, } badCondition := api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionUnknown, Reason: "unschedulable condition", LastHeartbeatTime: util.Time{time.Now()}, } // Create a new schedulable node, since we're first going to apply // the unschedulable condition and verify that pods aren't scheduled. node := &api.Node{ ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-node"}, Spec: api.NodeSpec{Unschedulable: false}, Status: api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{goodCondition}, }, } nodeKey, err := cache.MetaNamespaceKeyFunc(node) if err != nil { t.Fatalf("Couldn't retrieve key for node %v", node.Name) } // The test does the following for each nodeStateManager in this list: // 1. Create a new node // 2. Apply the makeUnSchedulable function // 3. Create a new pod // 4. Check that the pod doesn't get assigned to the node // 5. Apply the schedulable function // 6. Check that the pod *does* get assigned to the node // 7. Delete the pod and node. nodeModifications := []nodeStateManager{ // Test node.Spec.Unschedulable=true/false { makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Spec.Unschedulable = true if _, err := c.Nodes().Update(n); err != nil { t.Fatalf("Failed to update node with unschedulable=true: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { // An unschedulable node should get deleted from the store return node == nil }) if err != nil { t.Fatalf("Failed to observe reflected update for setting unschedulable=true: %v", err) } }, makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Spec.Unschedulable = false if _, err := c.Nodes().Update(n); err != nil { t.Fatalf("Failed to update node with unschedulable=false: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { return node != nil && node.(*api.Node).Spec.Unschedulable == false }) if err != nil { t.Fatalf("Failed to observe reflected update for setting unschedulable=false: %v", err) } }, }, // Test node.Status.Conditions=ConditionTrue/Unknown { makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Status = api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{badCondition}, } if _, err = c.Nodes().UpdateStatus(n); err != nil { t.Fatalf("Failed to update node with bad status condition: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { return node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionUnknown }) if err != nil { t.Fatalf("Failed to observe reflected update for status condition update: %v", err) } }, makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) { n.Status = api.NodeStatus{ Capacity: api.ResourceList{ api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI), }, Conditions: []api.NodeCondition{goodCondition}, } if _, err = c.Nodes().UpdateStatus(n); err != nil { t.Fatalf("Failed to update node with healthy status condition: %v", err) } err = waitForReflection(s, nodeKey, func(node interface{}) bool { return 
node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionTrue }) if err != nil { t.Fatalf("Failed to observe reflected update for status condition update: %v", err) } }, }, } for i, mod := range nodeModifications { unSchedNode, err := restClient.Nodes().Create(node) if err != nil { t.Fatalf("Failed to create node: %v", err) } // Apply the unschedulable modification to the node, and wait for the reflection mod.makeUnSchedulable(t, unSchedNode, nodeStore, restClient) // Create the new pod, note that this needs to happen post unschedulable // modification or we have a race in the test. pod := &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-pod"}, Spec: api.PodSpec{ Containers: []api.Container{{Name: "container", Image: "kubernetes/pause:go"}}, }, } myPod, err := restClient.Pods(api.NamespaceDefault).Create(pod) if err != nil { t.Fatalf("Failed to create pod: %v", err) } // There are no schedulable nodes - the pod shouldn't be scheduled. err = wait.Poll(time.Second, time.Second*10, podScheduled(restClient, myPod.Namespace, myPod.Name)) if err == nil { t.Errorf("Pod scheduled successfully on unschedulable nodes") } if err != wait.ErrWaitTimeout { t.Errorf("Test %d: failed while trying to confirm the pod does not get scheduled on the node: %v", i, err) } else { t.Logf("Test %d: Pod did not get scheduled on an unschedulable node", i) } // Apply the schedulable modification to the node, and wait for the reflection schedNode, err := restClient.Nodes().Get(unSchedNode.Name) if err != nil { t.Fatalf("Failed to get node: %v", err) } mod.makeSchedulable(t, schedNode, nodeStore, restClient) // Wait until the pod is scheduled. err = wait.Poll(time.Second, time.Second*10, podScheduled(restClient, myPod.Namespace, myPod.Name)) if err != nil { t.Errorf("Test %d: failed to schedule a pod: %v", i, err) } else { t.Logf("Test %d: Pod got scheduled on a schedulable node", i) } err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, nil) if err != nil { t.Errorf("Failed to delete pod: %v", err) } err = restClient.Nodes().Delete(schedNode.Name) if err != nil { t.Errorf("Failed to delete node: %v", err) } } }
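// waitForReflection is used by the nodeStateManager functions above but is not shown here. A sketch, assuming it polls the scheduler's node store until the supplied predicate passes (the predicate receives nil when the key is absent from the store, which is how the unschedulable case above is detected); the 10ms/20s intervals are illustrative:
func waitForReflection(s cache.Store, key string, passFunc func(n interface{}) bool) error {
	return wait.Poll(10*time.Millisecond, 20*time.Second, func() (bool, error) {
		n, _, err := s.GetByKey(key)
		if err != nil {
			// Keep polling on transient store errors.
			return false, nil
		}
		return passFunc(n), nil
	})
}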
. "github.com/onsi/gomega" ) var _ = Describe("MaxPods", func() { var c *client.Client var nodeCount int var totalPodCapacity int64 var RCName string var ns string var uuid string BeforeEach(func() { var err error c, err = loadClient() expectNoError(err) nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) expectNoError(err) nodeCount = len(nodes.Items) Expect(nodeCount).NotTo(BeZero()) totalPodCapacity = 0 for _, node := range nodes.Items { podCapacity, found := node.Status.Capacity["pods"] Expect(found).To(Equal(true)) totalPodCapacity += podCapacity.Value() } err = deleteTestingNS(c) expectNoError(err) nsForTesting, err := createTestingNS("maxp", c)
// Create a replication controller for a service that serves its hostname. // The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname name := "my-hostname-net" newSVCByName(c, ns, name) replicas := testContext.CloudConfig.NumNodes newRCByName(c, ns, name, replicas) err := verifyPods(c, ns, name, true, replicas) Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding") By("choose a node with at least one pod - we will block some network traffic on this node") label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) pods, err := c.Pods(ns).List(label, fields.Everything()) // list pods after all have been scheduled Expect(err).NotTo(HaveOccurred()) nodeName := pods.Items[0].Spec.NodeName node, err := c.Nodes().Get(nodeName) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("block network traffic from node %s", node.Name)) performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node) Logf("Waiting for node %s to be ready", node.Name) waitForNodeToBe(c, node.Name, true, 2*time.Minute) By("verify whether new pods can be created on the re-attached node") // increasing the RC size is not a valid way to test this // since we have no guarantees the pod will be scheduled on our node. additionalPod := "additionalpod" err = newPodOnNode(c, ns, additionalPod, node.Name) Expect(err).NotTo(HaveOccurred()) err = verifyPods(c, ns, additionalPod, true, 1) Expect(err).NotTo(HaveOccurred())
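// waitForNodeToBe is called above with (client, name, wantReady, timeout) but is not defined in this excerpt. A plausible sketch that polls the node until its NodeReady condition matches the expectation, reusing the isNodeReadySetAsExpected helper sketched earlier; the 5-second poll interval is an assumption:
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
		node, err := c.Nodes().Get(name)
		if err != nil {
			Logf("Couldn't get node %s: %v", name, err)
			continue
		}
		if isNodeReadySetAsExpected(node, wantReady) {
			return true
		}
	}
	Logf("Node %s didn't reach readiness %t within %v", name, wantReady, timeout)
	return false
}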
// This test suite can take a long time to run, so by default it is disabled // by being marked as Pending. To enable this suite, remove the P from the // front of PDescribe (PDescribe->Describe) and then all tests will // be available var _ = PDescribe("Density", func() { var c *client.Client var minionCount int var RCName string var ns string BeforeEach(func() { var err error c, err = loadClient() expectNoError(err) minions, err := c.Nodes().List() expectNoError(err) minionCount = len(minions.Items) Expect(minionCount).NotTo(BeZero()) ns = api.NamespaceDefault }) AfterEach(func() { // Remove any remaining pods from this test if the // replication controller still exists and the replica count // isn't 0. This means the controller wasn't cleaned up // during the test so clean it up here rc, err := c.ReplicationControllers(ns).Get(RCName) if err == nil && rc.Spec.Replicas != 0 { DeleteRC(c, ns, RCName) }
// rebootNode takes node name on provider through the following steps using c: // - ensures the node is ready // - ensures all pods on the node are running and ready // - reboots the node (by executing rebootCmd over ssh) // - ensures the node reaches some non-ready state // - ensures the node becomes ready again // - ensures all pods on the node become running and ready again // // It returns true through result only if all of the steps pass; at the first // failed step, it will return false through result and not run the rest. func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { // Setup ns := api.NamespaceDefault ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) defer ps.Stop() // Get the node initially. Logf("Getting %s", name) node, err := c.Nodes().Get(name) if err != nil { Logf("Couldn't get node %s", name) result <- false return } // Node sanity check: ensure it is "ready". if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) { result <- false return } // Get all the pods on the node. pods := ps.List() podNames := make([]string, len(pods)) for i, p := range pods { podNames[i] = p.ObjectMeta.Name } Logf("Node %s has %d pods: %v", name, len(podNames), podNames) // For each pod, we do a sanity check to ensure it's running / healthy // now, as that's what we'll be checking later. if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) { result <- false return } // Reboot the node. if err = issueSSHCommand(node, provider, rebootCmd); err != nil { Logf("Error while issuing ssh command: %v", err) result <- false return } // Wait for some kind of "not ready" status. if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) { result <- false return } // Wait for some kind of "ready" status. if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) { result <- false return } // Ensure all of the pods that we found on this node before the reboot are // running / healthy. if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) { result <- false return } Logf("Reboot successful on node %s", name) result <- true }
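// waitForNodeToBeReady and waitForNodeToBeNotReady are used above but not defined in this excerpt. Assuming they are thin wrappers around the waitForNodeToBe helper sketched earlier, they might look like this:
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
	return waitForNodeToBe(c, name, true, timeout)
}

func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
	return waitForNodeToBe(c, name, false, timeout)
}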
// ClusterLevelLoggingWithElasticsearch is an end to end test for cluster level logging. func ClusterLevelLoggingWithElasticsearch(c *client.Client) { // TODO: For now assume we are only testing cluster logging with Elasticsearch // on GCE. Once we are sure that Elasticsearch cluster level logging // works for other providers we should widen the scope of this test. if !providerIs("gce") { Logf("Skipping cluster level logging test for provider %s", testContext.Provider) return } // Check for the existence of the Elasticsearch service. By("Checking the Elasticsearch service exists.") s := c.Services(api.NamespaceDefault) // Make a few attempts to connect. This makes the test robust against // being run as the first e2e test just after the e2e cluster has been created. var err error const graceTime = 10 * time.Minute for start := time.Now(); time.Since(start) < graceTime; time.Sleep(5 * time.Second) { if _, err = s.Get("elasticsearch-logging"); err == nil { break } Logf("Attempt to check for the existence of the Elasticsearch service failed after %v", time.Since(start)) } Expect(err).NotTo(HaveOccurred()) // Wait for the Elasticsearch pods to enter the running state. By("Checking to make sure the Elasticsearch pods are running") label := labels.SelectorFromSet(labels.Set(map[string]string{"name": "elasticsearch-logging"})) pods, err := c.Pods(api.NamespaceDefault).List(label, fields.Everything()) Expect(err).NotTo(HaveOccurred()) for _, pod := range pods.Items { err = waitForPodRunning(c, pod.Name) Expect(err).NotTo(HaveOccurred()) } By("Checking to make sure we are talking to an Elasticsearch service.") // Perform a few checks to make sure this looks like an Elasticsearch cluster. var statusCode float64 var esResponse map[string]interface{} err = nil for start := time.Now(); time.Since(start) < graceTime; time.Sleep(5 * time.Second) { // Query against the root URL for Elasticsearch. body, err := c.Get(). Namespace(api.NamespaceDefault). Prefix("proxy"). Resource("services"). Name("elasticsearch-logging"). DoRaw() if err != nil { Logf("After %v proxy call to elasticsearch-logging failed: %v", time.Since(start), err) continue } esResponse, err = bodyToJSON(body) if err != nil { Logf("After %v failed to convert Elasticsearch JSON response %v to map[string]interface{}: %v", time.Since(start), string(body), err) continue } statusIntf, ok := esResponse["status"] if !ok { Logf("After %v Elasticsearch response has no status field: %v", time.Since(start), esResponse) continue } statusCode, ok = statusIntf.(float64) if !ok { // Assume this is a string returning Failure. Retry. Logf("After %v expected status to be a float64 but got %v of type %T", time.Since(start), statusIntf, statusIntf) continue } break } Expect(err).NotTo(HaveOccurred()) if int(statusCode) != 200 { Failf("Elasticsearch cluster has a bad status: %v", statusCode) } // Check to see if we have a cluster_name field. clusterName, ok := esResponse["cluster_name"] if !ok { Failf("No cluster_name field in Elasticsearch response: %v", esResponse) } if clusterName != "kubernetes_logging" { Failf("Connected to wrong cluster %q (expecting kubernetes_logging)", clusterName) } // Now assume we really are talking to an Elasticsearch instance. // Check the cluster health. By("Checking health of Elasticsearch service.") body, err := c.Get(). Namespace(api.NamespaceDefault). Prefix("proxy"). Resource("services"). Name("elasticsearch-logging"). Suffix("_cluster/health"). Param("health", "pretty"). 
DoRaw() Expect(err).NotTo(HaveOccurred()) health, err := bodyToJSON(body) Expect(err).NotTo(HaveOccurred()) statusIntf, ok := health["status"] if !ok { Failf("No status field found in cluster health response: %v", health) } status := statusIntf.(string) if status != "green" && status != "yellow" { Failf("Cluster health has bad status: %s", status) } // Obtain a list of nodes so we can place one synthetic logger on each node. nodes, err := c.Nodes().List(labels.Everything(), fields.Everything()) if err != nil { Failf("Failed to list nodes: %v", err) } nodeCount := len(nodes.Items) if nodeCount == 0 { Failf("Failed to find any nodes") } // Create a unique root name for the resources in this test to permit // parallel executions of this test. // Use a unique namespace for the resources created in this test. ns := "es-logging-" + randomSuffix() name := "synthlogger" // Form a unique name to taint log lines to be collected. // Replace '-' characters with '_' to prevent the analyzer from breaking apart names. taintName := strings.Replace(ns+name, "-", "_", -1) // podNames records the names of the synthetic logging pods that are created in the // loop below. var podNames []string // countTo is the number of log lines emitted (and checked) for each synthetic logging pod. const countTo = 100 // Instantiate a synthetic logger pod on each node. for i, node := range nodes.Items { podName := fmt.Sprintf("%s-%d", name, i) _, err := c.Pods(ns).Create(&api.Pod{ ObjectMeta: api.ObjectMeta{ Name: podName, Labels: map[string]string{"name": name}, }, Spec: api.PodSpec{ Containers: []api.Container{ { Name: "synth-logger", Image: "gcr.io/google_containers/ubuntu:14.04", Command: []string{"bash", "-c", fmt.Sprintf("i=0; while ((i < %d)); do echo \"%d %s $i %s\"; i=$(($i+1)); done", countTo, i, taintName, podName)}, }, }, Host: node.Name, RestartPolicy: api.RestartPolicyNever, }, }) Expect(err).NotTo(HaveOccurred()) podNames = append(podNames, podName) } // Cleanup the pods when we are done. defer func() { for _, pod := range podNames { if err = c.Pods(ns).Delete(pod); err != nil { Logf("Failed to delete pod %s: %v", pod, err) } } }() // Wait for the synthetic logging pods to finish. By("Waiting for the pods to succeed.") for _, pod := range podNames { err = waitForPodSuccessInNamespace(c, pod, "synth-logger", ns) Expect(err).NotTo(HaveOccurred()) } // Wait a bit for the log information to make it into Elasticsearch. time.Sleep(30 * time.Second) // Make several attempts to observe the logs ingested into Elasticsearch. By("Checking all the log lines were ingested into Elasticsearch") missing := 0 expected := nodeCount * countTo for start := time.Now(); time.Since(start) < graceTime; time.Sleep(10 * time.Second) { // Ask Elasticsearch to return all the log lines that were tagged with the underscore // version of the name. Ask for twice as many log lines as we expect to check for // duplication bugs. body, err = c.Get(). Namespace(api.NamespaceDefault). Prefix("proxy"). Resource("services"). Name("elasticsearch-logging"). Suffix("_search"). Param("q", fmt.Sprintf("log:%s", taintName)). Param("size", strconv.Itoa(2*expected)). 
DoRaw() if err != nil { Logf("After %v failed to make proxy call to elasticsearch-logging: %v", time.Since(start), err) continue } response, err := bodyToJSON(body) if err != nil { Logf("After %v failed to unmarshal response: %v", time.Since(start), err) continue } hits, ok := response["hits"].(map[string]interface{}) if !ok { Failf("response[hits] not of the expected type: %T", response["hits"]) } totalF, ok := hits["total"].(float64) if !ok { Logf("After %v hits[total] not of the expected type: %T", time.Since(start), hits["total"]) continue } total := int(totalF) if total < expected { Logf("After %v expecting to find %d log lines but saw only %d", time.Since(start), expected, total) continue } h, ok := hits["hits"].([]interface{}) if !ok { Logf("After %v hits not of the expected type: %T", time.Since(start), hits["hits"]) continue } // Initialize data-structure for observing counts. observed := make([][]int, nodeCount) for i := range observed { observed[i] = make([]int, countTo) } // Iterate over the hits and populate the observed array. for _, e := range h { l, ok := e.(map[string]interface{}) if !ok { Failf("element of hit not of expected type: %T", e) } source, ok := l["_source"].(map[string]interface{}) if !ok { Failf("_source not of the expected type: %T", l["_source"]) } msg, ok := source["log"].(string) if !ok { Failf("log not of the expected type: %T", source["log"]) } words := strings.Split(msg, " ") if len(words) < 4 { Failf("Malformed log line: %s", msg) } n, err := strconv.ParseUint(words[0], 10, 0) if err != nil { Failf("Expecting numer of node as first field of %s", msg) } if n < 0 || int(n) >= nodeCount { Failf("Node count index out of range: %d", nodeCount) } index, err := strconv.ParseUint(words[2], 10, 0) if err != nil { Failf("Expecting number as third field of %s", msg) } if index < 0 || index >= countTo { Failf("Index value out of range: %d", index) } // Record the observation of a log line from node n at the given index. observed[n][index]++ } // Make sure we correctly observed the expected log lines from each node. missing = 0 for n := range observed { for i, c := range observed[n] { if c == 0 { missing++ } if c < 0 || c > 1 { Failf("Got incorrect count for node %d index %d: %d", n, i, c) } } } if missing != 0 { Logf("After %v still missing %d log lines", time.Since(start), missing) continue } Logf("After %s found all %d log lines", time.Since(start), expected) return } Failf("Failed to find all %d log lines", expected) }
// rebootNode takes node name on provider through the following steps using c: // - ensures the node is ready // - ensures all pods on the node are running and ready // - reboots the node (by executing rebootCmd over ssh) // - ensures the node reaches some non-ready state // - ensures the node becomes ready again // - ensures all pods on the node become running and ready again // // It returns true through result only if all of the steps pass; at the first // failed step, it will return false through result and not run the rest. func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { // Setup ns := api.NamespaceSystem ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) defer ps.Stop() // Get the node initially. Logf("Getting %s", name) node, err := c.Nodes().Get(name) if err != nil { Logf("Couldn't get node %s", name) result <- false return } // Node sanity check: ensure it is "ready". if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) { result <- false return } // Get all the pods on the node that don't have liveness probe set. // Liveness probe may cause restart of a pod during node reboot, and the pod may not be running. pods := ps.List() podNames := []string{} for _, p := range pods { probe := false for _, c := range p.Spec.Containers { if c.LivenessProbe != nil { probe = true break } } if !probe { podNames = append(podNames, p.ObjectMeta.Name) } } Logf("Node %s has %d pods: %v", name, len(podNames), podNames) // For each pod, we do a sanity check to ensure it's running / healthy // now, as that's what we'll be checking later. if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) { result <- false return } // Reboot the node. if err = issueSSHCommand(node, provider, rebootCmd); err != nil { Logf("Error while issuing ssh command: %v", err) result <- false return } // Wait for some kind of "not ready" status. if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) { result <- false return } // Wait for some kind of "ready" status. if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) { result <- false return } // Ensure all of the pods that we found on this node before the reboot are // running / healthy. if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) { result <- false return } Logf("Reboot successful on node %s", name) result <- true }
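// checkPodsRunningReady is used by rebootNode above but not shown here. A sketch of a plausible implementation: keep polling the named pods until each one is in the Running phase with a Ready condition of true, or the timeout expires. The podReady helper below is introduced here for illustration; the 5-second poll interval is an assumption.
func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeout time.Duration) bool {
	remaining := make(map[string]bool, len(podNames))
	for _, name := range podNames {
		remaining[name] = true
	}
	for start := time.Now(); time.Since(start) < timeout && len(remaining) > 0; time.Sleep(5 * time.Second) {
		for name := range remaining {
			pod, err := c.Pods(ns).Get(name)
			if err != nil {
				Logf("Error getting pod %s in namespace %s: %v", name, ns, err)
				continue
			}
			if pod.Status.Phase == api.PodRunning && podReady(pod) {
				delete(remaining, name)
			}
		}
	}
	if len(remaining) > 0 {
		Logf("Pods in namespace %s not running and ready within %v: %v", ns, timeout, remaining)
		return false
	}
	return true
}

func podReady(pod *api.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type == api.PodReady {
			return cond.Status == api.ConditionTrue
		}
	}
	return false
}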