Example #1
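// getRegistryPods lists the pods backing the registry service in the default namespace,
// reports missing pods, and flags multiple pods backed by EmptyDir storage as inconsistent;
// it returns only the pods that are running, checking each of their logs along the way.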
func (d *ClusterRegistry) getRegistryPods(service *kapi.Service, r types.DiagnosticResult) []*kapi.Pod {
	runningPods := []*kapi.Pod{}
	pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(labels.SelectorFromSet(service.Spec.Selector), fields.Everything())
	if err != nil {
		r.Error("DClu1005", err, fmt.Sprintf("Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err))
		return runningPods
	} else if len(pods.Items) < 1 {
		r.Error("DClu1006", nil, fmt.Sprintf(clRegNoPods, registryName))
		return runningPods
	} else if len(pods.Items) > 1 {
		// multiple registry pods using EmptyDir will be inconsistent
		for _, volume := range pods.Items[0].Spec.Volumes {
			if volume.Name == registryVolume && volume.EmptyDir != nil {
				r.Error("DClu1007", nil, fmt.Sprintf(clRegMultiPods, registryName))
				break
			}
		}
	}
	for i := range pods.Items {
		pod := &pods.Items[i] // address the slice element, not the loop variable, so each stored pointer stays distinct
		r.Debug("DClu1008", fmt.Sprintf("Found %s pod with name %s", registryName, pod.ObjectMeta.Name))
		if pod.Status.Phase != kapi.PodRunning {
			r.Warn("DClu1009", nil, fmt.Sprintf(clRegPodDown, pod.ObjectMeta.Name, registryName))
		} else {
			runningPods = append(runningPods, pod)
			// Check the logs for that pod for common issues (credentials, DNS resolution failure)
			d.checkRegistryLogs(pod, r)
		}
	}
	return runningPods
}
Example #2
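// checkRegistryLogs streams the registry pod's container log, keeping the most recent
// "client error" line and collecting any other error-level lines, then reports them.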
func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticResult) {
	// pull out logs from the pod
	readCloser, err := d.KubeClient.RESTClient.Get().
		Namespace("default").Name(pod.ObjectMeta.Name).
		Resource("pods").SubResource("log").
		Param("follow", "false").
		Param("container", pod.Spec.Containers[0].Name).
		Stream()
	if err != nil {
		r.Warn("DClu1010", nil, fmt.Sprintf(clRegPodLog, pod.ObjectMeta.Name, registryName, fmt.Sprintf("(%T) %[1]v", err)))
		return
	}
	defer readCloser.Close()

	clientError := ""
	registryError := ""
	scanner := bufio.NewScanner(readCloser)
	for scanner.Scan() {
		logLine := scanner.Text()
		// TODO: once the logging API gets "since" and "tail" and "limit", limit to more recent log entries
		// https://github.com/kubernetes/kubernetes/issues/12447
		if strings.Contains(logLine, `level=error msg="client error:`) {
			clientError = logLine // end up showing only the most recent client error
		} else if strings.Contains(logLine, "level=error msg=") {
			registryError += "\n" + logLine // gather generic errors
		}
	}
	if clientError != "" {
		r.Error("DClu1011", nil, fmt.Sprintf(clRegPodConn, pod.ObjectMeta.Name, registryName, clientError))
	}
	if registryError != "" {
		r.Warn("DClu1012", nil, fmt.Sprintf(clRegPodErr, pod.ObjectMeta.Name, registryName, registryError))
	}

}
Example #3
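// verifyRegistryImageStream creates (and later deletes) a test ImageStream and verifies
// that the registry host recorded in its status matches the registry service's ClusterIP and port.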
func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r types.DiagnosticResult) {
	if d.PreventModification {
		r.Info("DClu1021", "Skipping creating an ImageStream to test registry service address, because you requested no API modifications.")
		return
	}
	imgStream, err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Create(&osapi.ImageStream{ObjectMeta: kapi.ObjectMeta{GenerateName: "diagnostic-test"}})
	if err != nil {
		r.Error("DClu1015", err, fmt.Sprintf("Creating test ImageStream failed. Error: (%T) %[1]v", err))
		return
	}
	defer func() { // delete what we created, or notify that we couldn't
		if err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Delete(imgStream.ObjectMeta.Name); err != nil {
			r.Warn("DClu1016", err, fmt.Sprintf(clRegISDelFail, imgStream.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
		}
	}()
	imgStream, err = d.OsClient.ImageStreams(kapi.NamespaceDefault).Get(imgStream.ObjectMeta.Name) // status is filled in post-create
	if err != nil {
		r.Error("DClu1017", err, fmt.Sprintf("Getting created test ImageStream failed. Error: (%T) %[1]v", err))
		return
	}
	r.Debug("DClu1018", fmt.Sprintf("Created test ImageStream: %[1]v", imgStream))
	cacheHost := strings.SplitN(imgStream.Status.DockerImageRepository, "/", 2)[0]
	serviceHost := fmt.Sprintf("%s:%d", service.Spec.ClusterIP, service.Spec.Ports[0].Port)
	if cacheHost != serviceHost {
		r.Error("DClu1019", nil, fmt.Sprintf(clRegISMismatch, registryName, serviceHost, cacheHost))
	}
}
Example #4
// authenticateToMaster tests whether we can use the serviceaccount token
// to reach the master and authenticate
func (d PodCheckAuth) authenticateToMaster(token string, r types.DiagnosticResult) {
	clientConfig := &clientcmd.Config{
		MasterAddr:     flagtypes.Addr{Value: d.MasterUrl}.Default(),
		KubernetesAddr: flagtypes.Addr{Value: d.MasterUrl}.Default(),
		CommonConfig: restclient.Config{
			TLSClientConfig: restclient.TLSClientConfig{CAFile: d.MasterCaPath},
			BearerToken:     token,
		},
	}
	oclient, _, _, err := clientConfig.Clients()
	if err != nil {
		r.Error("DP1002", err, fmt.Sprintf("could not create API clients from the service account client config: %v", err))
		return
	}
	rchan := make(chan error, 1) // for concurrency with timeout
	go func() {
		_, err := oclient.Users().Get("~")
		rchan <- err
	}()

	select {
	case <-time.After(time.Second * 4): // timeout per query
		r.Warn("DP1005", nil, "A request to the master timed out.\nThis could be temporary but could also indicate network or DNS problems.")
	case err := <-rchan:
		if err != nil {
			r.Error("DP1003", err, fmt.Sprintf("Could not authenticate to the master with the service account credentials: %v", err))
		} else {
			r.Info("DP1004", "Service account token successfully authenticated to master")
		}
	}
}
Example #5
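// authenticateToRegistry resolves the integrated registry's hostname via the pod's first
// nameserver and then makes an authenticated /v2/ request against it, trying TLS with the
// master CA first and falling back to a cleartext request if the registry is not secured.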
func (d PodCheckAuth) authenticateToRegistry(token string, r types.DiagnosticResult) {
	resolvConf, err := getResolvConf(r)
	if err != nil {
		return // any errors have been reported via "r", env is very borked, test cannot proceed
	}
	msg := new(dns.Msg)
	msg.SetQuestion(registryHostname+".", dns.TypeA)
	msg.RecursionDesired = false
	result, completed := dnsQueryWithTimeout(msg, resolvConf.Servers[0], 2)
	switch {
	case !completed:
		r.Error("DP1006", nil, fmt.Sprintf("DNS resolution for registry address %s timed out; this could indicate problems with DNS resolution or networking", registryHostname))
		return
	case result.err != nil:
		r.Error("DP1016", nil, fmt.Sprintf("DNS resolution for registry address %s returned an error; container DNS is likely incorrect. The error was: %v", registryHostname, result.err))
		return
	case result.in == nil, len(result.in.Answer) == 0:
		r.Warn("DP1007", nil, fmt.Sprintf("DNS resolution for registry address %s returned no results; either the integrated registry is not deployed, or container DNS configuration is incorrect.", registryHostname))
		return
	}

	// first try the secure connection in case they followed directions to secure the registry
	// (https://docs.openshift.org/latest/install_config/install/docker_registry.html#securing-the-registry)
	cacert, err := ioutil.ReadFile(d.MasterCaPath) // TODO: we assume same CA as master - better choice?
	if err != nil {
		r.Error("DP1008", err, fmt.Sprintf("Failed to read CA cert file %s:\n%v", d.MasterCaPath, err))
		return
	}
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(cacert) {
		r.Error("DP1009", err, fmt.Sprintf("Could not use cert from CA cert file %s:\n%v", d.MasterCaPath, err))
		return
	}
	noSecClient := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return fmt.Errorf("no redirect expected")
		},
		Timeout: time.Second * 2,
	}
	secClient := noSecClient
	secClient.Transport = knet.SetTransportDefaults(&http.Transport{TLSClientConfig: &tls.Config{RootCAs: pool}})
	secError := processRegistryRequest(&secClient, fmt.Sprintf("https://%s:%s/v2/", registryHostname, registryPort), token, r)
	if secError == nil {
		return // made the request successfully enough to diagnose
	}
	switch {
	case strings.Contains(secError.Error(), "tls: oversized record received"),
		strings.Contains(secError.Error(), "server gave HTTP response to HTTPS"):
		r.Debug("DP1015", "docker-registry not secured; falling back to cleartext connection")
		if nosecError := processRegistryRequest(&noSecClient, fmt.Sprintf("http://%s:%s/v2/", registryHostname, registryPort), token, r); nosecError != nil {
			r.Error("DP1013", nosecError, fmt.Sprintf("Unexpected error authenticating to the integrated registry:\n(%T) %[1]v", nosecError))
		}
	default:
		r.Error("DP1013", secError, fmt.Sprintf("Unexpected error authenticating to the integrated registry:\n(%T) %[1]v", secError))
	}
}
Example #6
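// getRegistryService looks up the registry service in the default namespace, warning if it
// is absent and reporting an error for any other failure.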
func (d *ClusterRegistry) getRegistryService(r types.DiagnosticResult) *kapi.Service {
	service, err := d.KubeClient.Services(kapi.NamespaceDefault).Get(registryName)
	if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
		r.Warn("DClu1002", err, fmt.Sprintf(clGetRegNone, registryName, kapi.NamespaceDefault))
		return nil
	} else if err != nil {
		r.Error("DClu1003", err, fmt.Sprintf(clGetRegFailed, err))
		return nil
	}
	r.Debug("DClu1004", fmt.Sprintf("Found %s service with ports %v", registryName, service.Spec.Ports))
	return service
}
Example #7
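// getRouterDC looks up the default router DeploymentConfig, warning if it does not exist
// and reporting an error for any other failure.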
func (d *ClusterRouter) getRouterDC(r types.DiagnosticResult) *osapi.DeploymentConfig {
	dc, err := d.OsClient.DeploymentConfigs(kapi.NamespaceDefault).Get(routerName)
	if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
		r.Warn("DClu2001", err, fmt.Sprintf(clGetRtNone, routerName))
		return nil
	} else if err != nil {
		r.Error("DClu2002", err, fmt.Sprintf(clGetRtFailed, routerName, err))
		return nil
	}
	r.Debug("DClu2003", fmt.Sprintf("Found default router DC"))
	return dc
}
Example #8
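// getRegistryPods lists the pods backing the registry service and returns the running ones.
// When several pods share EmptyDir-backed registry storage and no custom registry
// configuration appears to be mounted, they are flagged as inconsistent.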
func (d *ClusterRegistry) getRegistryPods(service *kapi.Service, r types.DiagnosticResult) []*kapi.Pod {
	runningPods := []*kapi.Pod{}
	pods, err := d.KubeClient.Core().Pods(kapi.NamespaceDefault).List(kapi.ListOptions{LabelSelector: labels.SelectorFromSet(service.Spec.Selector)})
	if err != nil {
		r.Error("DClu1005", err, fmt.Sprintf("Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err))
		return runningPods
	} else if len(pods.Items) < 1 {
		r.Error("DClu1006", nil, fmt.Sprintf(clRegNoPods, registryName))
		return runningPods
	} else if len(pods.Items) > 1 {
		emptyDir := false     // multiple registry pods using EmptyDir will be inconsistent
		customConfig := false // ... unless the user has configured them for e.g. S3
		configPath := "/config.yml"
		// look through the pod volumes to see if that might be a problem
		podSpec := pods.Items[0].Spec
		container := podSpec.Containers[0]
		for _, volume := range podSpec.Volumes {
			if volume.Name == registryVolume && volume.EmptyDir != nil {
				emptyDir = true
			}
		}
		for _, env := range container.Env {
			if env.Name == "REGISTRY_CONFIGURATION_PATH" {
				configPath = env.Value // look for custom config here
			}
		}
		for _, vmount := range container.VolumeMounts {
			if strings.HasPrefix(configPath, vmount.MountPath) {
				customConfig = true // if something's mounted there, assume custom config.
			}
		}
		if emptyDir {
			if customConfig { // assume they know what they're doing
				r.Info("DClu1020", fmt.Sprintf(clRegMultiCustomCfg, registryName, configPath))
			} else { // assume they scaled up with ephemeral storage
				r.Error("DClu1007", nil, fmt.Sprintf(clRegMultiPods, registryName))
			}
		}
	}
	for i := range pods.Items {
		pod := &pods.Items[i] // address the slice element, not the loop variable, so each stored pointer stays distinct
		r.Debug("DClu1008", fmt.Sprintf("Found %s pod with name %s", registryName, pod.ObjectMeta.Name))
		if pod.Status.Phase != kapi.PodRunning {
			r.Warn("DClu1009", nil, fmt.Sprintf(clRegPodDown, pod.ObjectMeta.Name, registryName))
		} else {
			runningPods = append(runningPods, pod)
			// Check the logs for that pod for common issues (credentials, DNS resolution failure)
			d.checkRegistryLogs(pod, r)
		}
	}
	return runningPods
}
Example #9
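// checkRegistryLogs streams the registry pod's container log and reports client errors,
// SELinux "permission denied" write failures (cleared if a later successful push shows the
// problem was fixed), and any other error-level lines.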
func (d *ClusterRegistry) checkRegistryLogs(pod *kapi.Pod, r types.DiagnosticResult) {
	// pull out logs from the pod
	readCloser, err := d.KubeClient.RESTClient.Get().
		Namespace("default").Name(pod.ObjectMeta.Name).
		Resource("pods").SubResource("log").
		Param("follow", "false").
		Param("container", pod.Spec.Containers[0].Name).
		Stream()
	if err != nil {
		r.Warn("DClu1010", nil, fmt.Sprintf(clRegPodLog, pod.ObjectMeta.Name, registryName, fmt.Sprintf("(%T) %[1]v", err)))
		return
	}
	defer readCloser.Close()

	// Indicator that selinux is blocking the registry from writing to disk:
	selinuxErrorRegex := regexp.MustCompile(".*level=error.*mkdir.*permission denied.*")
	// If seen after the above error regex, we know the problem has since been fixed:
	selinuxSuccessRegex := regexp.MustCompile(".*level=info.*response completed.*http.request.method=PUT.*")

	clientError := ""
	registryError := ""
	selinuxError := ""

	scanner := bufio.NewScanner(readCloser)
	for scanner.Scan() {
		logLine := scanner.Text()
		// TODO: once the logging API gets "since" and "tail" and "limit", limit to more recent log entries
		// https://github.com/kubernetes/kubernetes/issues/12447
		if strings.Contains(logLine, `level=error msg="client error:`) {
			clientError = logLine // end up showing only the most recent client error
		} else if selinuxErrorRegex.MatchString(logLine) {
			selinuxError = logLine
		} else if selinuxSuccessRegex.MatchString(logLine) {
			// Check for a successful registry push, if this occurs after a selinux error
			// we can safely clear it, the problem has already been fixed.
			selinuxError = ""
		} else if strings.Contains(logLine, "level=error msg=") {
			registryError += "\n" + logLine // gather generic errors
		}
	}
	if clientError != "" {
		r.Error("DClu1011", nil, fmt.Sprintf(clRegPodConn, pod.ObjectMeta.Name, registryName, clientError))
	}
	if selinuxError != "" {
		r.Error("DClu1020", nil, fmt.Sprintf(clRegSelinuxErr, pod.ObjectMeta.Name, registryName, selinuxError))
	}
	if registryError != "" {
		r.Warn("DClu1012", nil, fmt.Sprintf(clRegPodErr, pod.ObjectMeta.Name, registryName, registryError))
	}
}
Example #10
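// checkRegistryEndpoints verifies that the registry service exposes one endpoint address
// per running registry pod, warning on any mismatch.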
func (d *ClusterRegistry) checkRegistryEndpoints(pods []*kapi.Pod, r types.DiagnosticResult) bool {
	endPoint, err := d.KubeClient.Endpoints(kapi.NamespaceDefault).Get(registryName)
	if err != nil {
		r.Error("DClu1013", err, fmt.Sprintf(`Finding endpoints for "%s" service failed. This should never happen. Error: (%[2]T) %[2]v`, registryName, err))
		return false
	}
	numEP := 0
	for _, subs := range endPoint.Subsets {
		numEP += len(subs.Addresses)
	}
	if numEP != len(pods) {
		r.Warn("DClu1014", nil, fmt.Sprintf(clRegNoEP, registryName, len(pods), numEP))
		return false
	}
	return true
}
Example #11
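// retrieveLoggingProject locates the project hosting aggregated logging by matching the
// master config's loggingPublicURL against the hosts of logging routes, and warns unless
// the project carries an explicitly empty node-selector annotation.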
func retrieveLoggingProject(r types.DiagnosticResult, masterCfg *configapi.MasterConfig, osClient *client.Client) string {
	r.Debug("AGL0010", fmt.Sprintf("masterConfig.AssetConfig.LoggingPublicURL: '%s'", masterCfg.AssetConfig.LoggingPublicURL))
	projectName := ""
	if len(masterCfg.AssetConfig.LoggingPublicURL) == 0 {
		r.Debug("AGL0017", "masterConfig.AssetConfig.LoggingPublicURL is empty")
		return projectName
	}

	loggingUrl, err := url.Parse(masterCfg.AssetConfig.LoggingPublicURL)
	if err != nil {
		r.Error("AGL0011", err, fmt.Sprintf("Unable to parse the loggingPublicURL from the masterConfig '%s'", masterCfg.AssetConfig.LoggingPublicURL))
		return projectName
	}

	routeList, err := osClient.Routes(kapi.NamespaceAll).List(kapi.ListOptions{LabelSelector: loggingSelector.AsSelector()})
	if err != nil {
		r.Error("AGL0012", err, fmt.Sprintf("There was an error while trying to find the route associated with '%s' which is probably transient: %s", loggingUrl, err))
		return projectName
	}

	for _, route := range routeList.Items {
		r.Debug("AGL0013", fmt.Sprintf("Comparing URL to route.Spec.Host: %s", route.Spec.Host))
		if loggingUrl.Host == route.Spec.Host {
			if len(projectName) == 0 {
				projectName = route.ObjectMeta.Namespace
				r.Info("AGL0015", fmt.Sprintf("Found route '%s' matching logging URL '%s' in project: '%s'", route.ObjectMeta.Name, loggingUrl.Host, projectName))
			} else {
				r.Warn("AGL0019", nil, fmt.Sprintf("Found additional route '%s' matching logging URL '%s' in project: '%s'.  This could mean you have multiple logging deployments.", route.ObjectMeta.Name, loggingUrl.Host, projectName))
			}
		}
	}
	if len(projectName) == 0 {
		message := fmt.Sprintf("Unable to find a route matching the loggingPublicURL defined in the master config '%s'. Check that the URL is correct and aggregated logging is deployed.", loggingUrl)
		r.Error("AGL0014", errors.New(message), message)
		return ""
	}
	project, err := osClient.Projects().Get(projectName)
	if err != nil {
		r.Error("AGL0018", err, fmt.Sprintf("There was an error retrieving project '%s' which is most likely a transient error: %s", projectName, err))
		return ""
	}
	nodeSelector, ok := project.ObjectMeta.Annotations["openshift.io/node-selector"]
	if !ok || len(nodeSelector) != 0 {
		r.Warn("AGL0030", nil, fmt.Sprintf(projectNodeSelectorWarning, projectName))
	}
	return projectName
}
Example #12
// getResolvConf reads a clientConfig from resolv.conf and complains about any errors
func getResolvConf(r types.DiagnosticResult) (*dns.ClientConfig, error) {
	resolvConf, err := dns.ClientConfigFromFile("/etc/resolv.conf")
	if err != nil {
		r.Error("DP3001", err, fmt.Sprintf("could not load/parse resolver file /etc/resolv.conf: %v", err))
		return nil, err
	}
	if len(resolvConf.Servers) == 0 {
		r.Error("DP3002", nil, "could not find any nameservers defined in /etc/resolv.conf")
		// return a non-nil error so callers do not proceed with a config that has no nameservers
		return nil, errors.New("no nameservers defined in /etc/resolv.conf")
	}
	if len(resolvConf.Search) == 0 {
		r.Warn("DP3011", nil, "could not find any search domains defined in /etc/resolv.conf")
		resolvConf.Search = nil
	}
	r.Debug("DP3012", fmt.Sprintf("Pod /etc/resolv.conf contains:\nnameservers: %v\nsearch domains: %v", resolvConf.Servers, resolvConf.Search))
	return resolvConf, nil
}
Example #13
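// resolveSearch queries every configured nameserver for a random wildcard name under each
// search domain (which should not resolve), reporting timeouts, query errors, and unexpected
// answers, and errors if svc.cluster.local is missing from the search domains.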
func resolveSearch(resolvConf *dns.ClientConfig, r types.DiagnosticResult) {
	foundDomain := false
	randomString := func() string {
		b := make([]byte, 20)
		for i := range b {
			b[i] = letterBytes[rand.Intn(len(letterBytes))]
		}
		return string(b)
	}()
	seenDP2014 := sets.String{}
	seenDP2015 := sets.String{}
	for _, domain := range resolvConf.Search {
		if domain == "svc.cluster.local" {
			foundDomain = true // this will make kubernetes.default work
		}
		// put together a DNS query to configured nameservers for each search domain
		msg := new(dns.Msg)
		msg.SetQuestion("wildcard."+randomString+"."+domain+".", dns.TypeA)
		msg.RecursionDesired = true // otherwise we just get the authority section for the TLD
		for _, server := range resolvConf.Servers {
			result, completed := dnsQueryWithTimeout(msg, server, 2)
			switch {
			case !completed:
				if !seenDP2014.Has(server) {
					r.Warn("DP2014", nil, fmt.Sprintf("A request to the nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.", server))
					seenDP2014.Insert(server) // no need to keep warning about the same server for every domain
				}
			case result.err != nil:
				if !seenDP2015.Has(server) {
					r.Warn("DP2015", result.err, fmt.Sprintf("Error querying nameserver %s:\n  %v\nThis may indicate a problem with DNS.", server, result.err))
					seenDP2015.Insert(server) // don't repeat the error for the same nameserver; chances are it's the same error
				}
			case result.in.Answer == nil, len(result.in.Answer) == 0:
				r.Debug("DP2017", fmt.Sprintf("Nameserver %s responded to wildcard with no answer, which is expected.\n%v", server, result.in))
			default: // the random domain is not supposed to resolve
				r.Error("DP2016", nil, fmt.Sprintf(txtDP2016, server, domain, result.in))
			}
		}
	}
	if !foundDomain {
		r.Error("DP2019", nil, "Did not find svc.cluster.local among the configured search domains in /etc/resolv.conf.\nThis is likely to cause problems with certain components that expect to use partial cluster addresses.")
	}
}
Example #14
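// checkRouterLogs scans the router pod's log for "Failed to list *api.Route" entries and
// reports an error if one occurred within the last 30 seconds.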
func (d *ClusterRouter) checkRouterLogs(pod *kapi.Pod, r types.DiagnosticResult) {
	scanner, err := d.getPodLogScanner(pod)
	if err != nil {
		r.Warn("DClu2008", err, fmt.Sprintf(clRtPodLog, pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]v", err)))
		return
	}
	defer scanner.Close()

	failedListRegex := regexp.MustCompile(`^(\S+).*Failed to list \*api.Route: (.*)`) // compile once, outside the scan loop
	for scanner.Scan() {
		matches := failedListRegex.FindStringSubmatch(scanner.Text())
		if len(matches) > 0 {
			stamp, err := time.Parse(referenceTimestampLayout, matches[1])
			// router checks every second. error only if failure is recent.
			// of course... we cannot always trust the local clock.
			if err == nil && time.Since(stamp).Seconds() < 30.0 {
				r.Error("DClu2009", nil, fmt.Sprintf(clRtPodConn, pod.ObjectMeta.Name, matches[2], matches[1]))
				break
			}
		}
	}
}
Example #15
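// connectAndResolve asks each nameserver in resolv.conf to resolve
// kubernetes.default.svc.cluster.local., treating the first nameserver (the master's SkyDNS)
// as critical and the remaining ones as general connectivity checks.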
func connectAndResolve(resolvConf *dns.ClientConfig, r types.DiagnosticResult) {
	for serverIndex, server := range resolvConf.Servers {
		// put together a DNS query to configured nameservers for kubernetes.default
		msg := new(dns.Msg)
		msg.SetQuestion("kubernetes.default.svc.cluster.local.", dns.TypeA)
		msg.RecursionDesired = false
		if result, completed := dnsQueryWithTimeout(msg, server, 2); !completed {
			if serverIndex == 0 { // in a pod, master (SkyDNS) IP is injected as first nameserver
				r.Warn("DP2009", nil, fmt.Sprintf("A request to the master (SkyDNS) nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.\nThis nameserver is critical for resolving cluster DNS names.", server))
			} else {
				r.Warn("DP2010", nil, fmt.Sprintf("A request to the nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.", server))
			}
		} else {
			in, err := result.in, result.err
			if serverIndex == 0 { // in a pod, master (SkyDNS) IP is injected as first nameserver
				if err != nil {
					r.Error("DP2003", err, fmt.Sprintf("The first /etc/resolv.conf nameserver %s\ncould not resolve kubernetes.default.svc.cluster.local.\nError: %v\nThis nameserver points to the master's SkyDNS which is critical for\nresolving cluster names, e.g. for Services.", server, err))
				} else if len(in.Answer) == 0 {
					r.Error("DP2006", err, fmt.Sprintf("The first /etc/resolv.conf nameserver %s\ncould not resolve kubernetes.default.svc.cluster.local.\nReturn code: %v\nThis nameserver points to the master's SkyDNS which is critical for\nresolving cluster names, e.g. for Services.", server, dns.RcodeToString[in.MsgHdr.Rcode]))
				} else {
					r.Debug("DP2007", fmt.Sprintf("The first /etc/resolv.conf nameserver %s\nresolved kubernetes.default.svc.cluster.local. to:\n  %s", server, in.Answer[0]))
				}
			} else if err != nil {
				r.Warn("DP2004", err, fmt.Sprintf("Error querying nameserver %s:\n  %v\nThis may indicate a problem with non-cluster DNS.", server, err))
			} else {
				rcode := in.MsgHdr.Rcode
				switch rcode {
				case dns.RcodeSuccess, dns.RcodeNameError: // aka NXDOMAIN
					r.Debug("DP2005", fmt.Sprintf("Successful query to nameserver %s", server))
				default:
					r.Warn("DP2008", nil, fmt.Sprintf("Received unexpected return code '%s' from nameserver %s:\nThis may indicate a problem with non-cluster DNS.", dns.RcodeToString[rcode], server))
				}
			}
		}
	}
}
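
// runDiagnosticPod launches a short-lived diagnostic pod, waits for its logs to become
// available, and re-reports the errors and warnings that the pod's own diagnostics emit.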
func (d *DiagnosticPod) runDiagnosticPod(service *kapi.Service, r types.DiagnosticResult) {
	loglevel := d.Level
	if loglevel > 2 {
		loglevel = 2 // need to show summary at least
	}
	imageName := d.ImageTemplate.ExpandOrDie("deployer")
	pod, err := d.KubeClient.Core().Pods(d.Namespace).Create(&kapi.Pod{
		ObjectMeta: kapi.ObjectMeta{GenerateName: "pod-diagnostic-test-"},
		Spec: kapi.PodSpec{
			RestartPolicy: kapi.RestartPolicyNever,
			Containers: []kapi.Container{
				{
					Name:    "pod-diagnostics",
					Image:   imageName,
					Command: []string{"openshift", "infra", "diagnostic-pod", "-l", strconv.Itoa(loglevel)},
				},
			},
		},
	})
	if err != nil {
		r.Error("DCli2001", err, fmt.Sprintf("Creating diagnostic pod with image %s failed. Error: (%[2]T) %[2]v", imageName, err))
		return
	}
	defer func() { // delete what we created, or notify that we couldn't
		zero := int64(0)
		delOpts := kapi.DeleteOptions{TypeMeta: pod.TypeMeta, GracePeriodSeconds: &zero}
		if err := d.KubeClient.Core().Pods(d.Namespace).Delete(pod.ObjectMeta.Name, &delOpts); err != nil {
			r.Error("DCl2002", err, fmt.Sprintf("Deleting diagnostic pod '%s' failed. Error: %s", pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
		}
	}()
	pod, err = d.KubeClient.Core().Pods(d.Namespace).Get(pod.ObjectMeta.Name) // status is filled in post-create
	if err != nil {
		r.Error("DCli2003", err, fmt.Sprintf("Retrieving the diagnostic pod definition failed. Error: (%T) %[1]v", err))
		return
	}
	r.Debug("DCli2004", fmt.Sprintf("Created diagnostic pod named %v running image %s.", pod.ObjectMeta.Name, imageName))

	bytelim := int64(1024000)
	podLogsOpts := &kapi.PodLogOptions{
		TypeMeta:   pod.TypeMeta,
		Container:  "pod-diagnostics",
		Follow:     true,
		LimitBytes: &bytelim,
	}
	req, err := d.Factory.LogsForObject(pod, podLogsOpts)
	if err != nil {
		r.Error("DCli2005", err, fmt.Sprintf("The request for diagnostic pod logs failed unexpectedly. Error: (%T) %[1]v", err))
		return
	}

	// wait for pod to be started and logs available
	var scanner *bufio.Scanner
	var lastError error
	for times := 1; true; times++ {
		if times <= 25 {
			readCloser, err := req.Stream()
			if err != nil {
				lastError = err
				r.Debug("DCli2010", fmt.Sprintf("Could not get diagnostic pod logs (loop %d): (%T[2]) %[2]v", times, err))
				time.Sleep(time.Duration(times*100) * time.Millisecond)
				continue
			}
			defer readCloser.Close()
			// make sure we can actually get something from the stream before going on.
			// it seems the creation of docker logs can trail the container start a bit.
			lineScanner := bufio.NewScanner(readCloser)
			if lineScanner.Scan() {
				scanner = lineScanner
				break // success - drop down to reading the logs.
			}
			// no luck - try, try again
			lastError = fmt.Errorf("Diagnostics pod is ready but not its logs (loop %d). Retry.", times)
			r.Debug("DCli2010", lastError.Error())
			time.Sleep(time.Duration(times*100) * time.Millisecond)
			continue
		}
		// after 25 times trying:
		r.Warn("DCli2006", err, fmt.Sprintf("Timed out preparing diagnostic pod logs for streaming, so this diagnostic cannot run.\nIt is likely that the image '%s' was not pulled and running yet.\nLast error: (%T[2]) %[2]v", pod.Spec.Containers[0].Image, lastError))
		return
	}
	// then watch logs and wait until it exits
	podLogs, warnings, errors := "", 0, 0
	errorRegex := regexp.MustCompile(`^\[Note\]\s+Errors\s+seen:\s+(\d+)`)
	warnRegex := regexp.MustCompile(`^\[Note\]\s+Warnings\s+seen:\s+(\d+)`)
	// keep in mind one test line was already scanned, so scan after the loop runs once
	for scanned := true; scanned; scanned = scanner.Scan() {
		line := scanner.Text()
		podLogs += line + "\n"
		if matches := errorRegex.FindStringSubmatch(line); matches != nil {
			errors, _ = strconv.Atoi(matches[1])
		} else if matches := warnRegex.FindStringSubmatch(line); matches != nil {
			warnings, _ = strconv.Atoi(matches[1])
		}
	}
	if err := scanner.Err(); err != nil { // Scan terminated abnormally
		r.Error("DCli2009", err, fmt.Sprintf("Unexpected error reading diagnostic pod logs: (%T) %[1]v\nLogs are:\n%[2]s", err, podLogs))
	} else {
		if errors > 0 {
			r.Error("DCli2012", nil, "See the errors below in the output from the diagnostic pod:\n"+podLogs)
		} else if warnings > 0 {
			r.Warn("DCli2013", nil, "See the warnings below in the output from the diagnostic pod:\n"+podLogs)
		} else {
			r.Info("DCli2008", fmt.Sprintf("Output from the diagnostic pod (image %s):\n", imageName)+podLogs)
		}
	}
}