Example #1
func waitForClusterSize(c *client.Client, size int) error {
	timeout := 4 * time.Minute
	if providerIs("aws") {
		// AWS is not as fast as gce/gke at having nodes come online
		timeout = 10 * time.Minute
	}

	for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
		nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
		if err != nil {
			Logf("Failed to list nodes: %v", err)
			continue
		}
		// Filter out not-ready nodes.
		filterNodes(nodes, func(node api.Node) bool {
			return isNodeReadySetAsExpected(&node, true)
		})

		if len(nodes.Items) == size {
			Logf("Cluster has reached the desired size %d", size)
			return nil
		}
		Logf("Waiting for cluster size %d, current size %d", size, len(nodes.Items))
	}
	return fmt.Errorf("timeout waiting for cluster size to be %d", size)
}
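// Note: filterNodes and isNodeReadySetAsExpected are helpers defined elsewhere in
// the test utilities, not in this snippet. The versions below are minimal sketches
// of what they are assumed to do: filterNodes keeps only the nodes accepted by fn
// (mutating the list in place), and isNodeReadySetAsExpected compares the NodeReady
// condition against the expected readiness.
func filterNodes(nodeList *api.NodeList, fn func(node api.Node) bool) {
	var filtered []api.Node
	for _, node := range nodeList.Items {
		if fn(node) {
			filtered = append(filtered, node)
		}
	}
	nodeList.Items = filtered
}

func isNodeReadySetAsExpected(node *api.Node, wantReady bool) bool {
	for _, cond := range node.Status.Conditions {
		if cond.Type == api.NodeReady {
			return (cond.Status == api.ConditionTrue) == wantReady
		}
	}
	// No NodeReady condition was reported, so the node is certainly not ready.
	return !wantReady
}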
Example #2
func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) {
	By("getting list of nodes")
	nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything())
	expectNoError(err)
	var errors []error
	retries := maxRetries
	for {
		errors = []error{}
		for _, node := range nodeList.Items {
			// cadvisor is not accessible directly unless its port (4194 by default) is exposed.
			// Here, we access '/stats/' REST endpoint on the kubelet which polls cadvisor internally.
			statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name)
			By(fmt.Sprintf("Querying stats from node %s using url %s", node.Name, statsResource))
			_, err = c.Get().AbsPath(statsResource).Timeout(timeout).Do().Raw()
			if err != nil {
				errors = append(errors, err)
			}
		}
		if len(errors) == 0 {
			return
		}
		if retries--; retries <= 0 {
			break
		}
		Logf("failed to retrieve kubelet stats -\n %v", errors)
		time.Sleep(sleepDuration)
	}
	Failf("Failed after retrying %d times for cadvisor to be healthy on all nodes. Errors:\n%v", maxRetries, errors)
}
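// maxRetries and sleepDuration are package-level values defined outside this
// snippet; the constants below are assumed values, included only to make the
// retry loop above self-contained.
const (
	maxRetries    = 6                // assumption: number of retry rounds
	sleepDuration = 10 * time.Second // assumption: pause between rounds
)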
Example #3
func watchNodes(client *client.Client) {
	nodeList, err := client.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		log.Fatal(err)
	}
	nodes := nodeList.Items
	writeNodeTargetsFile(nodes)
	watcher, err := client.Nodes().Watch(labels.Everything(), fields.Everything(), nodeList.ResourceVersion)
	if err != nil {
		log.Fatal(err)
	}

	for event := range watcher.ResultChan() {
		switch event.Type {
		case watch.Added:
			switch obj := event.Object.(type) {
			case *api.Node:
				nodes = append(nodes, *obj)
			}
			writeNodeTargetsFile(nodes)
		case watch.Deleted:
			switch obj := event.Object.(type) {
			case *api.Node:
				index := findNodeIndexInSlice(nodes, obj)
				nodes = append(nodes[:index], nodes[index+1:]...)
			}
			writeNodeTargetsFile(nodes)
		}
	}
}
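// findNodeIndexInSlice and writeNodeTargetsFile are not part of this snippet. A
// plausible sketch of the former is shown below; note that the deletion branch
// above re-slices with the returned index directly, so it implicitly assumes the
// node is always found. A defensive caller might check for a negative index first.
func findNodeIndexInSlice(nodes []api.Node, node *api.Node) int {
	for i := range nodes {
		if nodes[i].Name == node.Name {
			return i
		}
	}
	return -1
}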
Example #4
// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error
// if it can't find an external IP for every node, though it still returns all
// hosts that it found in that case.
func NodeSSHHosts(c *client.Client) ([]string, error) {
	var hosts []string
	nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return hosts, fmt.Errorf("error getting nodes: %v", err)
	}
	for _, n := range nodelist.Items {
		for _, addr := range n.Status.Addresses {
			// Use the first external IP address we find on the node, and
			// use at most one per node.
			// TODO(mbforbes): Use the "preferred" address for the node, once
			// such a thing is defined (#2462).
			if addr.Type == api.NodeExternalIP {
				hosts = append(hosts, addr.Address+":22")
				break
			}
		}
	}

	// Error if any node didn't have an external IP.
	if len(hosts) != len(nodelist.Items) {
		return hosts, fmt.Errorf(
			"only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
			len(hosts), len(nodelist.Items), nodelist)
	}
	return hosts, nil
}
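// Hypothetical caller of NodeSSHHosts (logSSHHosts is not from the original code),
// illustrating the contract in the doc comment: even when an error is returned,
// the partial host list is still usable.
func logSSHHosts(c *client.Client) {
	hosts, err := NodeSSHHosts(c)
	if err != nil {
		Logf("NodeSSHHosts returned an error, continuing with %d host(s): %v", len(hosts), err)
	}
	for _, host := range hosts {
		// Each entry is "<external-ip>:22", ready to hand to an SSH helper.
		Logf("SSH-able host: %s", host)
	}
}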
Example #5
func pickNode(c *client.Client) (string, error) {
	nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return "", err
	}
	if len(nodes.Items) == 0 {
		return "", fmt.Errorf("no nodes exist, can't test node proxy")
	}
	return nodes.Items[0].Name, nil
}
Example #6
func getAllNodesInCluster(c *client.Client) ([]string, error) {
	nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return nil, err
	}
	result := []string{}
	for _, node := range nodeList.Items {
		result = append(result, node.Name)
	}
	return result, nil
}
Example #7
func getMinionPublicIps(c *client.Client) ([]string, error) {
	nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return nil, err
	}

	ips := collectAddresses(nodes, api.NodeExternalIP)
	if len(ips) == 0 {
		ips = collectAddresses(nodes, api.NodeLegacyHostIP)
	}
	return ips, nil
}
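// collectAddresses is not shown in this snippet; a minimal sketch of what it is
// assumed to do: gather every node address of the requested type, in node order.
func collectAddresses(nodes *api.NodeList, addressType api.NodeAddressType) []string {
	ips := []string{}
	for _, node := range nodes.Items {
		for _, addr := range node.Status.Addresses {
			if addr.Type == addressType {
				ips = append(ips, addr.Address)
			}
		}
	}
	return ips
}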
Example #8
func waitForClusterSize(c *client.Client, size int) error {
	for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(20 * time.Second) {
		nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
		if err != nil {
			Logf("Failed to list nodes: %v", err)
			continue
		}
		if len(nodes.Items) == size {
			Logf("Cluster has reached the desired size %d", size)
			return nil
		}
		Logf("Waiting for cluster size %d, current size %d", size, len(nodes.Items))
	}
	return fmt.Errorf("timeout waiting for cluster size to be %d", size)
}
Example #9
func DoTestUnschedulableNodes(t *testing.T, client *client.Client) {
	node := &api.Node{
		ObjectMeta: api.ObjectMeta{Name: "node"},
		Spec:       api.NodeSpec{Unschedulable: true},
	}
	if _, err := client.Nodes().Create(node); err != nil {
		t.Fatalf("Failed to create node: %v", err)
	}

	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{Name: "my-pod"},
		Spec: api.PodSpec{
			Containers: []api.Container{{Name: "container", Image: "kubernetes/pause:go"}},
		},
	}
	myPod, err := client.Pods(api.NamespaceDefault).Create(pod)
	if err != nil {
		t.Fatalf("Failed to create pod: %v", err)
	}
	// There are no schedulable nodes - the pod shouldn't be scheduled.
	err = wait.Poll(time.Second, time.Second*10, podScheduled(client, myPod.Namespace, myPod.Name))
	if err == nil {
		t.Errorf("Pod scheduled successfully on unschedulable nodes")
	}
	if err != wait.ErrWaitTimeout {
		t.Errorf("Failed while waiting for scheduled pod: %v", err)
	}

	// Make the node schedulable and wait until the pod is scheduled.
	newNode, err := client.Nodes().Get(node.Name)
	if err != nil {
		t.Fatalf("Failed to get node: %v", err)
	}
	newNode.Spec.Unschedulable = false
	if _, err = client.Nodes().Update(newNode); err != nil {
		t.Fatalf("Failed to update node: %v", err)
	}
	err = wait.Poll(time.Second, time.Second*10, podScheduled(client, myPod.Namespace, myPod.Name))
	if err != nil {
		t.Errorf("Failed to schedule a pod: %v", err)
	}

	err = client.Pods(api.NamespaceDefault).Delete(myPod.Name)
	if err != nil {
		t.Errorf("Failed to delete pod: %v", err)
	}
}
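// podScheduled is defined elsewhere in the test code. The sketch below assumes it
// returns a wait.ConditionFunc that reports true once the pod has been bound to a
// node (Spec.NodeName set; older clients used Spec.Host), and that it swallows
// transient Get errors so the poll keeps going.
func podScheduled(c *client.Client, podNamespace, podName string) wait.ConditionFunc {
	return func() (bool, error) {
		pod, err := c.Pods(podNamespace).Get(podName)
		if err != nil {
			// Treat lookup errors as "not scheduled yet" and keep polling.
			return false, nil
		}
		return pod.Spec.NodeName != "", nil
	}
}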
Example #10
func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore cache.Store) {
	goodCondition := api.NodeCondition{
		Type:              api.NodeReady,
		Status:            api.ConditionTrue,
		Reason:            "schedulable condition",
		LastHeartbeatTime: util.Time{time.Now()},
	}
	badCondition := api.NodeCondition{
		Type:              api.NodeReady,
		Status:            api.ConditionUnknown,
		Reason:            "unschedulable condition",
		LastHeartbeatTime: util.Time{time.Now()},
	}
	// Create a new schedulable node, since we're first going to apply
	// the unschedulable condition and verify that pods aren't scheduled.
	node := &api.Node{
		ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-node"},
		Spec:       api.NodeSpec{Unschedulable: false},
		Status: api.NodeStatus{
			Capacity: api.ResourceList{
				api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
			},
			Conditions: []api.NodeCondition{goodCondition},
		},
	}
	nodeKey, err := cache.MetaNamespaceKeyFunc(node)
	if err != nil {
		t.Fatalf("Couldn't retrieve key for node %v", node.Name)
	}

	// The test does the following for each nodeStateManager in this list:
	//	1. Create a new node
	//	2. Apply the makeUnSchedulable function
	//	3. Create a new pod
	//  4. Check that the pod doesn't get assigned to the node
	//  5. Apply the schedulable function
	//  6. Check that the pod *does* get assigned to the node
	//  7. Delete the pod and node.

	nodeModifications := []nodeStateManager{
		// Test node.Spec.Unschedulable=true/false
		{
			makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) {
				n.Spec.Unschedulable = true
				if _, err := c.Nodes().Update(n); err != nil {
					t.Fatalf("Failed to update node with unschedulable=true: %v", err)
				}
				err = waitForReflection(s, nodeKey, func(node interface{}) bool {
					// An unschedulable node should get deleted from the store
					return node == nil
				})
				if err != nil {
					t.Fatalf("Failed to observe reflected update for setting unschedulable=true: %v", err)
				}
			},
			makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) {
				n.Spec.Unschedulable = false
				if _, err := c.Nodes().Update(n); err != nil {
					t.Fatalf("Failed to update node with unschedulable=false: %v", err)
				}
				err = waitForReflection(s, nodeKey, func(node interface{}) bool {
					return node != nil && node.(*api.Node).Spec.Unschedulable == false
				})
				if err != nil {
					t.Fatalf("Failed to observe reflected update for setting unschedulable=false: %v", err)
				}
			},
		},
		// Test node.Status.Conditions=ConditionTrue/Unknown
		{
			makeUnSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) {
				n.Status = api.NodeStatus{
					Capacity: api.ResourceList{
						api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
					},
					Conditions: []api.NodeCondition{badCondition},
				}
				if _, err = c.Nodes().UpdateStatus(n); err != nil {
					t.Fatalf("Failed to update node with bad status condition: %v", err)
				}
				err = waitForReflection(s, nodeKey, func(node interface{}) bool {
					return node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionUnknown
				})
				if err != nil {
					t.Fatalf("Failed to observe reflected update for status condition update: %v", err)
				}
			},
			makeSchedulable: func(t *testing.T, n *api.Node, s cache.Store, c *client.Client) {
				n.Status = api.NodeStatus{
					Capacity: api.ResourceList{
						api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
					},
					Conditions: []api.NodeCondition{goodCondition},
				}
				if _, err = c.Nodes().UpdateStatus(n); err != nil {
					t.Fatalf("Failed to update node with healthy status condition: %v", err)
				}
				err = waitForReflection(s, nodeKey, func(node interface{}) bool {
					return node != nil && node.(*api.Node).Status.Conditions[0].Status == api.ConditionTrue
				})
				if err != nil {
					t.Fatalf("Failed to observe reflected update for status condition update: %v", err)
				}
			},
		},
	}

	for i, mod := range nodeModifications {
		unSchedNode, err := restClient.Nodes().Create(node)
		if err != nil {
			t.Fatalf("Failed to create node: %v", err)
		}

		// Apply the unschedulable modification to the node, and wait for the reflection
		mod.makeUnSchedulable(t, unSchedNode, nodeStore, restClient)

		// Create the new pod. Note that this needs to happen after the
		// unschedulable modification, or we have a race in the test.
		pod := &api.Pod{
			ObjectMeta: api.ObjectMeta{Name: "node-scheduling-test-pod"},
			Spec: api.PodSpec{
				Containers: []api.Container{{Name: "container", Image: "kubernetes/pause:go"}},
			},
		}
		myPod, err := restClient.Pods(api.NamespaceDefault).Create(pod)
		if err != nil {
			t.Fatalf("Failed to create pod: %v", err)
		}

		// There are no schedulable nodes - the pod shouldn't be scheduled.
		err = wait.Poll(time.Second, time.Second*10, podScheduled(restClient, myPod.Namespace, myPod.Name))
		if err == nil {
			t.Errorf("Pod scheduled successfully on unschedulable nodes")
		}
		if err != wait.ErrWaitTimeout {
			t.Errorf("Test %d: failed while trying to confirm the pod does not get scheduled on the node: %v", i, err)
		} else {
			t.Logf("Test %d: Pod did not get scheduled on an unschedulable node", i)
		}

		// Apply the schedulable modification to the node, and wait for the reflection
		schedNode, err := restClient.Nodes().Get(unSchedNode.Name)
		if err != nil {
			t.Fatalf("Failed to get node: %v", err)
		}
		mod.makeSchedulable(t, schedNode, nodeStore, restClient)

		// Wait until the pod is scheduled.
		err = wait.Poll(time.Second, time.Second*10, podScheduled(restClient, myPod.Namespace, myPod.Name))
		if err != nil {
			t.Errorf("Test %d: failed to schedule a pod: %v", i, err)
		} else {
			t.Logf("Test %d: Pod got scheduled on a schedulable node", i)
		}

		err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, nil)
		if err != nil {
			t.Errorf("Failed to delete pod: %v", err)
		}
		err = restClient.Nodes().Delete(schedNode.Name)
		if err != nil {
			t.Errorf("Failed to delete node: %v", err)
		}
	}
}
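// waitForReflection is not included in this snippet. The sketch below assumes it
// polls the scheduler's node store until passFunc accepts the cached object (or
// nil, once the node has been dropped from the store); the poll interval and
// timeout are illustrative values.
func waitForReflection(s cache.Store, key string, passFunc func(n interface{}) bool) error {
	return wait.Poll(100*time.Millisecond, 30*time.Second, func() (bool, error) {
		n, _, err := s.GetByKey(key)
		if err != nil {
			return false, err
		}
		return passFunc(n), nil
	})
}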
Example #11
	. "github.com/onsi/gomega"
)

var _ = Describe("MaxPods", func() {
	var c *client.Client
	var nodeCount int
	var totalPodCapacity int64
	var RCName string
	var ns string
	var uuid string

	BeforeEach(func() {
		var err error
		c, err = loadClient()
		expectNoError(err)
		nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
		expectNoError(err)
		nodeCount = len(nodes.Items)
		Expect(nodeCount).NotTo(BeZero())

		totalPodCapacity = 0
		for _, node := range nodes.Items {
			podCapacity, found := node.Status.Capacity["pods"]
			Expect(found).To(Equal(true))
			totalPodCapacity += podCapacity.Value()
		}

		err = deleteTestingNS(c)
		expectNoError(err)

		nsForTesting, err := createTestingNS("maxp", c)
Example #12
				// Create a replication controller for a service that serves its hostname.
				// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
				name := "my-hostname-net"
				newSVCByName(c, ns, name)
				replicas := testContext.CloudConfig.NumNodes
				newRCByName(c, ns, name, replicas)
				err := verifyPods(c, ns, name, true, replicas)
				Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding")

				By("choose a node with at least one pod - we will block some network traffic on this node")
				label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
				pods, err := c.Pods(ns).List(label, fields.Everything()) // list pods after all have been scheduled
				Expect(err).NotTo(HaveOccurred())
				nodeName := pods.Items[0].Spec.NodeName

				node, err := c.Nodes().Get(nodeName)
				Expect(err).NotTo(HaveOccurred())

				By(fmt.Sprintf("block network traffic from node %s", node.Name))
				performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node)
				Logf("Waiting for node %s to be ready", node.Name)
				waitForNodeToBe(c, node.Name, true, 2*time.Minute)

				By("verify whether new pods can be created on the re-attached node")
				// increasing the RC size is not a valid way to test this
				// since we have no guarantees the pod will be scheduled on our node.
				additionalPod := "additionalpod"
				err = newPodOnNode(c, ns, additionalPod, node.Name)
				Expect(err).NotTo(HaveOccurred())
				err = verifyPods(c, ns, additionalPod, true, 1)
				Expect(err).NotTo(HaveOccurred())
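// waitForNodeToBe and waitForNodeToBeReady (used in the snippets above and below)
// come from the shared e2e utilities. The rough sketch below assumes "ready" means
// the NodeReady condition reports ConditionTrue; the real helpers may differ.
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
		node, err := c.Nodes().Get(name)
		if err != nil {
			Logf("Couldn't get node %s: %v", name, err)
			continue
		}
		ready := false
		for _, cond := range node.Status.Conditions {
			if cond.Type == api.NodeReady && cond.Status == api.ConditionTrue {
				ready = true
				break
			}
		}
		if ready == wantReady {
			return true
		}
	}
	return false
}

func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
	return waitForNodeToBe(c, name, true, timeout)
}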
Example #13
// This test suite can take a long time to run, so by default it is disabled
// by being marked as Pending. To enable this suite, remove the P from the
// front of PDescribe (PDescribe -> Describe) and all of its tests will
// become available.
var _ = PDescribe("Density", func() {
	var c *client.Client
	var minionCount int
	var RCName string
	var ns string

	BeforeEach(func() {
		var err error
		c, err = loadClient()
		expectNoError(err)
		minions, err := c.Nodes().List()
		expectNoError(err)
		minionCount = len(minions.Items)
		Expect(minionCount).NotTo(BeZero())
		ns = api.NamespaceDefault
	})

	AfterEach(func() {
		// Remove any remaining pods from this test if the
		// replication controller still exists and the replica count
		// isn't 0. This means the controller wasn't cleaned up
		// during the test, so clean it up here.
		rc, err := c.ReplicationControllers(ns).Get(RCName)
		if err == nil && rc.Spec.Replicas != 0 {
			DeleteRC(c, ns, RCName)
		}
Example #14
// rebootNode takes the named node on the given provider through the following steps, using client c:
//  - ensures the node is ready
//  - ensures all pods on the node are running and ready
//  - reboots the node (by executing rebootCmd over ssh)
//  - ensures the node reaches some non-ready state
//  - ensures the node becomes ready again
//  - ensures all pods on the node become running and ready again
//
// It returns true through result only if all of the steps pass; at the first
// failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
	// Setup
	ns := api.NamespaceDefault
	ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
	defer ps.Stop()

	// Get the node initially.
	Logf("Getting %s", name)
	node, err := c.Nodes().Get(name)
	if err != nil {
		Logf("Couldn't get node %s", name)
		result <- false
		return
	}

	// Node sanity check: ensure it is "ready".
	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
		result <- false
		return
	}

	// Get all the pods on the node.
	pods := ps.List()
	podNames := make([]string, len(pods))
	for i, p := range pods {
		podNames[i] = p.ObjectMeta.Name
	}
	Logf("Node %s has %d pods: %v", name, len(podNames), podNames)

	// For each pod, we do a sanity check to ensure it's running / healthy
	// now, as that's what we'll be checking later.
	if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
		result <- false
		return
	}

	// Reboot the node.
	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
		Logf("Error while issuing ssh command: %v", err)
		result <- false
		return
	}

	// Wait for some kind of "not ready" status.
	if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
		result <- false
		return
	}

	// Wait for some kind of "ready" status.
	if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
		result <- false
		return
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy.
	if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
		result <- false
		return
	}

	Logf("Reboot successful on node %s", name)
	result <- true
}
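// checkPodsRunningReady is another shared helper that is not part of these
// snippets. The sketch below assumes it polls each named pod until it is Running
// with a Ready condition of ConditionTrue, and gives up on the first pod that
// misses the timeout.
func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeout time.Duration) bool {
	for _, name := range podNames {
		ok := false
		for start := time.Now(); time.Since(start) < timeout; time.Sleep(2 * time.Second) {
			pod, err := c.Pods(ns).Get(name)
			if err != nil {
				continue
			}
			if pod.Status.Phase != api.PodRunning {
				continue
			}
			for _, cond := range pod.Status.Conditions {
				if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
					ok = true
					break
				}
			}
			if ok {
				break
			}
		}
		if !ok {
			Logf("Pod %s in namespace %s never became running and ready", name, ns)
			return false
		}
	}
	return true
}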
Example #15
// ClusterLevelLoggingWithElasticsearch is an end-to-end test for cluster-level logging.
func ClusterLevelLoggingWithElasticsearch(c *client.Client) {
	// TODO: For now assume we are only testing cluster logging with Elasticsearch
	// on GCE. Once we are sure that Elasticsearch cluster level logging
	// works for other providers, we should widen the scope of this test.
	if !providerIs("gce") {
		Logf("Skipping cluster level logging test for provider %s", testContext.Provider)
		return
	}

	// Check for the existence of the Elasticsearch service.
	By("Checking the Elasticsearch service exists.")
	s := c.Services(api.NamespaceDefault)
	// Make a few attempts to connect. This makes the test robust against
	// being run as the first e2e test just after the e2e cluster has been created.
	var err error
	const graceTime = 10 * time.Minute
	for start := time.Now(); time.Since(start) < graceTime; time.Sleep(5 * time.Second) {
		if _, err = s.Get("elasticsearch-logging"); err == nil {
			break
		}
		Logf("Attempt to check for the existence of the Elasticsearch service failed after %v", time.Since(start))
	}
	Expect(err).NotTo(HaveOccurred())

	// Wait for the Elasticsearch pods to enter the running state.
	By("Checking to make sure the Elasticsearch pods are running")
	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": "elasticsearch-logging"}))
	pods, err := c.Pods(api.NamespaceDefault).List(label, fields.Everything())
	Expect(err).NotTo(HaveOccurred())
	for _, pod := range pods.Items {
		err = waitForPodRunning(c, pod.Name)
		Expect(err).NotTo(HaveOccurred())
	}

	By("Checking to make sure we are talking to an Elasticsearch service.")
	// Perform a few checks to make sure this looks like an Elasticsearch cluster.
	var statusCode float64
	var esResponse map[string]interface{}
	err = nil
	for start := time.Now(); time.Since(start) < graceTime; time.Sleep(5 * time.Second) {
		// Query against the root URL for Elasticsearch.
		body, err := c.Get().
			Namespace(api.NamespaceDefault).
			Prefix("proxy").
			Resource("services").
			Name("elasticsearch-logging").
			DoRaw()
		if err != nil {
			Logf("After %v proxy call to elasticsearch-loigging failed: %v", time.Since(start), err)
			continue
		}
		esResponse, err = bodyToJSON(body)
		if err != nil {
			Logf("After %v failed to convert Elasticsearch JSON response %v to map[string]interface{}: %v", time.Since(start), string(body), err)
			continue
		}
		statusIntf, ok := esResponse["status"]
		if !ok {
			Logf("After %v Elasticsearch response has no status field: %v", time.Since(start), esResponse)
			continue
		}
		statusCode, ok = statusIntf.(float64)
		if !ok {
			// Assume the status is a string describing a failure and retry.
			Logf("After %v expected status to be a float64 but got %v of type %T", time.Since(start), statusIntf, statusIntf)
			continue
		}
		break
	}
	Expect(err).NotTo(HaveOccurred())
	if int(statusCode) != 200 {
		Failf("Elasticsearch cluster has a bad status: %v", statusCode)
	}
	// Check to see if we have a cluster_name field.
	clusterName, ok := esResponse["cluster_name"]
	if !ok {
		Failf("No cluster_name field in Elasticsearch response: %v", esResponse)
	}
	if clusterName != "kubernetes_logging" {
		Failf("Connected to wrong cluster %q (expecting kubernetes_logging)", clusterName)
	}

	// Now assume we really are talking to an Elasticsearch instance.
	// Check the cluster health.
	By("Checking health of Elasticsearch service.")
	body, err := c.Get().
		Namespace(api.NamespaceDefault).
		Prefix("proxy").
		Resource("services").
		Name("elasticsearch-logging").
		Suffix("_cluster/health").
		Param("health", "pretty").
		DoRaw()
	Expect(err).NotTo(HaveOccurred())

	health, err := bodyToJSON(body)
	Expect(err).NotTo(HaveOccurred())
	statusIntf, ok := health["status"]
	if !ok {
		Failf("No status field found in cluster health response: %v", health)
	}
	status := statusIntf.(string)
	if status != "green" && status != "yellow" {
		Failf("Cluster health has bad status: %s", status)
	}

	// Obtain a list of nodes so we can place one synthetic logger on each node.
	nodes, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		Failf("Failed to list nodes: %v", err)
	}
	nodeCount := len(nodes.Items)
	if nodeCount == 0 {
		Failf("Failed to find any nodes")
	}

	// Create a unique root name for the resources in this test to permit
	// parallel executions of this test.
	// Use a unique namespace for the resources created in this test.
	ns := "es-logging-" + randomSuffix()
	name := "synthlogger"
	// Form a unique name to taint log lines to be collected.
	// Replace '-' characters with '_' to prevent the analyzer from breaking apart names.
	taintName := strings.Replace(ns+name, "-", "_", -1)

	// podNames records the names of the synthetic logging pods that are created in the
	// loop below.
	var podNames []string
	// countTo is the number of log lines emitted (and checked) for each synthetic logging pod.
	const countTo = 100
	// Instantiate a synthetic logger pod on each node.
	for i, node := range nodes.Items {
		podName := fmt.Sprintf("%s-%d", name, i)
		_, err := c.Pods(ns).Create(&api.Pod{
			ObjectMeta: api.ObjectMeta{
				Name:   podName,
				Labels: map[string]string{"name": name},
			},
			Spec: api.PodSpec{
				Containers: []api.Container{
					{
						Name:    "synth-logger",
						Image:   "gcr.io/google_containers/ubuntu:14.04",
						Command: []string{"bash", "-c", fmt.Sprintf("i=0; while ((i < %d)); do echo \"%d %s $i %s\"; i=$(($i+1)); done", countTo, i, taintName, podName)},
					},
				},
				Host:          node.Name,
				RestartPolicy: api.RestartPolicyNever,
			},
		})
		Expect(err).NotTo(HaveOccurred())
		podNames = append(podNames, podName)
	}

	// Cleanup the pods when we are done.
	defer func() {
		for _, pod := range podNames {
			if err = c.Pods(ns).Delete(pod); err != nil {
				Logf("Failed to delete pod %s: %v", pod, err)
			}
		}
	}()

	// Wait for the synthetic logging pods to finish.
	By("Waiting for the pods to succeed.")
	for _, pod := range podNames {
		err = waitForPodSuccessInNamespace(c, pod, "synth-logger", ns)
		Expect(err).NotTo(HaveOccurred())
	}

	// Wait a bit for the log information to make it into Elasticsearch.
	time.Sleep(30 * time.Second)

	// Make several attempts to observe the logs ingested into Elasticsearch.
	By("Checking all the log lines were ingested into Elasticsearch")
	missing := 0
	expected := nodeCount * countTo
	for start := time.Now(); time.Since(start) < graceTime; time.Sleep(10 * time.Second) {
		// Ask Elasticsearch to return all the log lines that were tagged with the underscore
		// version of the name. Ask for twice as many log lines as we expect to check for
		// duplication bugs.
		body, err = c.Get().
			Namespace(api.NamespaceDefault).
			Prefix("proxy").
			Resource("services").
			Name("elasticsearch-logging").
			Suffix("_search").
			Param("q", fmt.Sprintf("log:%s", taintName)).
			Param("size", strconv.Itoa(2*expected)).
			DoRaw()
		if err != nil {
			Logf("After %v failed to make proxy call to elasticsearch-logging: %v", time.Since(start), err)
			continue
		}

		response, err := bodyToJSON(body)
		if err != nil {
			Logf("After %v failed to unmarshal response: %v", time.Since(start), err)
			continue
		}
		hits, ok := response["hits"].(map[string]interface{})
		if !ok {
			Failf("response[hits] not of the expected type: %T", response["hits"])
		}
		totalF, ok := hits["total"].(float64)
		if !ok {
			Logf("After %v hits[total] not of the expected type: %T", time.Since(start), hits["total"])
			continue
		}
		total := int(totalF)
		if total < expected {
			Logf("After %v expecting to find %d log lines but saw only %d", time.Since(start), expected, total)
			continue
		}
		h, ok := hits["hits"].([]interface{})
		if !ok {
			Logf("After %v hits not of the expected type: %T", time.Since(start), hits["hits"])
			continue
		}
		// Initialize data-structure for observing counts.
		observed := make([][]int, nodeCount)
		for i := range observed {
			observed[i] = make([]int, countTo)
		}
		// Iterate over the hits and populate the observed array.
		for _, e := range h {
			l, ok := e.(map[string]interface{})
			if !ok {
				Failf("element of hit not of expected type: %T", e)
			}
			source, ok := l["_source"].(map[string]interface{})
			if !ok {
				Failf("_source not of the expected type: %T", l["_source"])
			}
			msg, ok := source["log"].(string)
			if !ok {
				Failf("log not of the expected type: %T", source["log"])
			}
			words := strings.Split(msg, " ")
			if len(words) < 4 {
				Failf("Malformed log line: %s", msg)
			}
			n, err := strconv.ParseUint(words[0], 10, 0)
			if err != nil {
				Failf("Expecting numer of node as first field of %s", msg)
			}
			if n < 0 || int(n) >= nodeCount {
				Failf("Node count index out of range: %d", nodeCount)
			}
			index, err := strconv.ParseUint(words[2], 10, 0)
			if err != nil {
				Failf("Expecting number as third field of %s", msg)
			}
			if index < 0 || index >= countTo {
				Failf("Index value out of range: %d", index)
			}
			// Record the observation of a log line from node n at the given index.
			observed[n][index]++
		}
		// Make sure we correctly observed the expected log lines from each node.
		missing = 0
		for n := range observed {
			for i, c := range observed[n] {
				if c == 0 {
					missing++
				}
				if c < 0 || c > 1 {
					Failf("Got incorrect count for node %d index %d: %d", n, i, c)
				}
			}
		}
		if missing != 0 {
			Logf("After %v still missing %d log lines", time.Since(start), missing)
			continue
		}
		Logf("After %s found all %d log lines", time.Since(start), expected)
		return
	}
	Failf("Failed to find all %d log lines", expected)
}
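// bodyToJSON is not part of this snippet; a minimal sketch (it would additionally
// need the "encoding/json" import) that unmarshals the raw proxy response into a
// generic map for the field checks above:
func bodyToJSON(body []byte) (map[string]interface{}, error) {
	var r map[string]interface{}
	if err := json.Unmarshal(body, &r); err != nil {
		return nil, err
	}
	return r, nil
}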
Example #16
// rebootNode takes the named node on the given provider through the following steps, using client c:
//  - ensures the node is ready
//  - ensures all pods on the node are running and ready
//  - reboots the node (by executing rebootCmd over ssh)
//  - ensures the node reaches some non-ready state
//  - ensures the node becomes ready again
//  - ensures all pods on the node become running and ready again
//
// It returns true through result only if all of the steps pass; at the first
// failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
	// Setup
	ns := api.NamespaceSystem
	ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
	defer ps.Stop()

	// Get the node initially.
	Logf("Getting %s", name)
	node, err := c.Nodes().Get(name)
	if err != nil {
		Logf("Couldn't get node %s", name)
		result <- false
		return
	}

	// Node sanity check: ensure it is "ready".
	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
		result <- false
		return
	}

	// Get all the pods on the node that don't have liveness probe set.
	// Liveness probe may cause restart of a pod during node reboot, and the pod may not be running.
	pods := ps.List()
	podNames := []string{}
	for _, p := range pods {
		probe := false
		for _, c := range p.Spec.Containers {
			if c.LivenessProbe != nil {
				probe = true
				break
			}
		}
		if !probe {
			podNames = append(podNames, p.ObjectMeta.Name)
		}
	}
	Logf("Node %s has %d pods: %v", name, len(podNames), podNames)

	// For each pod, we do a sanity check to ensure it's running / healthy
	// now, as that's what we'll be checking later.
	if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
		result <- false
		return
	}

	// Reboot the node.
	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
		Logf("Error while issuing ssh command: %v", err)
		result <- false
		return
	}

	// Wait for some kind of "not ready" status.
	if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
		result <- false
		return
	}

	// Wait for some kind of "ready" status.
	if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
		result <- false
		return
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy.
	if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
		result <- false
		return
	}

	Logf("Reboot successful on node %s", name)
	result <- true
}
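// Hypothetical driver for rebootNode (not from the original code): reboot every
// node in parallel and collect the per-node results over the channel. The reboot
// command is a placeholder for illustration.
func rebootAllNodes(c *client.Client, nodelist *api.NodeList) {
	result := make(chan bool, len(nodelist.Items))
	for _, n := range nodelist.Items {
		go rebootNode(c, testContext.Provider, n.Name, "sudo reboot", result)
	}
	failed := 0
	for range nodelist.Items {
		if !<-result {
			failed++
		}
	}
	if failed > 0 {
		Failf("%d node(s) failed to reboot and recover", failed)
	}
}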