func (d *ClusterRegistry) verifyRegistryImageStream(service *kapi.Service, r types.DiagnosticResult) {
	if d.PreventModification {
		r.Info("DClu1021", "Skipping creating an ImageStream to test registry service address, because you requested no API modifications.")
		return
	}
	imgStream, err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Create(&osapi.ImageStream{ObjectMeta: kapi.ObjectMeta{GenerateName: "diagnostic-test"}})
	if err != nil {
		r.Error("DClu1015", err, fmt.Sprintf("Creating test ImageStream failed. Error: (%T) %[1]v", err))
		return
	}
	defer func() { // delete what we created, or notify that we couldn't
		if err := d.OsClient.ImageStreams(kapi.NamespaceDefault).Delete(imgStream.ObjectMeta.Name); err != nil {
			r.Warn("DClu1016", err, fmt.Sprintf(clRegISDelFail, imgStream.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
		}
	}()
	imgStream, err = d.OsClient.ImageStreams(kapi.NamespaceDefault).Get(imgStream.ObjectMeta.Name) // status is filled in post-create
	if err != nil {
		r.Error("DClu1017", err, fmt.Sprintf("Getting created test ImageStream failed. Error: (%T) %[1]v", err))
		return
	}
	r.Debug("DClu1018", fmt.Sprintf("Created test ImageStream: %[1]v", imgStream))
	cacheHost := strings.SplitN(imgStream.Status.DockerImageRepository, "/", 2)[0]
	serviceHost := fmt.Sprintf("%s:%d", service.Spec.ClusterIP, service.Spec.Ports[0].Port)
	if cacheHost != serviceHost {
		r.Error("DClu1019", nil, fmt.Sprintf(clRegISMismatch, registryName, serviceHost, cacheHost))
	}
}
func (d PodCheckAuth) authenticateToRegistry(token string, r types.DiagnosticResult) {
	resolvConf, err := getResolvConf(r)
	if err != nil {
		return // any errors have been reported via "r", env is very borked, test cannot proceed
	}
	msg := new(dns.Msg)
	msg.SetQuestion(registryHostname+".", dns.TypeA)
	msg.RecursionDesired = false
	result, completed := dnsQueryWithTimeout(msg, resolvConf.Servers[0], 2)
	switch {
	case !completed:
		r.Error("DP1006", nil, fmt.Sprintf("DNS resolution for registry address %s timed out; this could indicate problems with DNS resolution or networking", registryHostname))
		return
	case result.err != nil:
		r.Error("DP1016", nil, fmt.Sprintf("DNS resolution for registry address %s returned an error; container DNS is likely incorrect. The error was: %v", registryHostname, result.err))
		return
	case result.in == nil, len(result.in.Answer) == 0:
		r.Warn("DP1007", nil, fmt.Sprintf("DNS resolution for registry address %s returned no results; either the integrated registry is not deployed, or container DNS configuration is incorrect.", registryHostname))
		return
	}

	// first try the secure connection in case they followed directions to secure the registry
	// (https://docs.openshift.org/latest/install_config/install/docker_registry.html#securing-the-registry)
	cacert, err := ioutil.ReadFile(d.MasterCaPath) // TODO: we assume same CA as master - better choice?
	if err != nil {
		r.Error("DP1008", err, fmt.Sprintf("Failed to read CA cert file %s:\n%v", d.MasterCaPath, err))
		return
	}
	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(cacert) {
		// note: err is nil here; AppendCertsFromPEM only reports success or failure
		r.Error("DP1009", nil, fmt.Sprintf("Could not use cert from CA cert file %s: no PEM certificate could be parsed from it.", d.MasterCaPath))
		return
	}
	noSecClient := http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return fmt.Errorf("no redirect expected")
		},
		Timeout: time.Second * 2,
	}
	secClient := noSecClient
	secClient.Transport = knet.SetTransportDefaults(&http.Transport{TLSClientConfig: &tls.Config{RootCAs: pool}})
	secError := processRegistryRequest(&secClient, fmt.Sprintf("https://%s:%s/v2/", registryHostname, registryPort), token, r)
	if secError == nil {
		return // made the request successfully enough to diagnose
	}
	switch {
	case strings.Contains(secError.Error(), "tls: oversized record received"),
		strings.Contains(secError.Error(), "server gave HTTP response to HTTPS"):
		r.Debug("DP1015", "docker-registry not secured; falling back to cleartext connection")
		if nosecError := processRegistryRequest(&noSecClient, fmt.Sprintf("http://%s:%s/v2/", registryHostname, registryPort), token, r); nosecError != nil {
			r.Error("DP1013", nosecError, fmt.Sprintf("Unexpected error authenticating to the integrated registry:\n(%T) %[1]v", nosecError))
		}
	default:
		r.Error("DP1013", secError, fmt.Sprintf("Unexpected error authenticating to the integrated registry:\n(%T) %[1]v", secError))
	}
}
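// processRegistryRequest is called above but defined elsewhere in the package.
// The following is a minimal sketch of the assumed behavior, not the actual
// implementation: GET the registry's /v2/ endpoint with the bearer token,
// report the authentication outcome via r (the message IDs below are
// hypothetical), and return transport-level errors so the caller can decide
// whether to retry over cleartext.
func processRegistryRequest(client *http.Client, url string, token string, r types.DiagnosticResult) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("Authorization", "Bearer "+token)
	resp, err := client.Do(req)
	if err != nil {
		return err // caller inspects this for TLS-vs-cleartext mismatches
	}
	defer resp.Body.Close()
	switch resp.StatusCode {
	case http.StatusOK:
		r.Debug("DP1011", "Successfully authenticated to the integrated registry.") // hypothetical ID
	case http.StatusUnauthorized:
		r.Error("DP1012", nil, "The integrated registry rejected the service account token; registry auth configuration may be broken.") // hypothetical ID
	default:
		r.Warn("DP1014", nil, fmt.Sprintf("Unexpected status %d from the integrated registry at %s.", resp.StatusCode, url)) // hypothetical ID
	}
	return nil
}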
func (d *ClusterRegistry) getRegistryService(r types.DiagnosticResult) *kapi.Service {
	service, err := d.KubeClient.Services(kapi.NamespaceDefault).Get(registryName)
	// a StatusError from the API (e.g. not-found) means the service simply isn't there;
	// any other error type indicates a problem reaching the API at all
	if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
		r.Warn("DClu1002", err, fmt.Sprintf(clGetRegNone, registryName, kapi.NamespaceDefault))
		return nil
	} else if err != nil {
		r.Error("DClu1003", err, fmt.Sprintf(clGetRegFailed, err))
		return nil
	}
	r.Debug("DClu1004", fmt.Sprintf("Found %s service with ports %v", registryName, service.Spec.Ports))
	return service
}
func (d *ClusterRouter) getRouterDC(r types.DiagnosticResult) *osapi.DeploymentConfig {
	dc, err := d.OsClient.DeploymentConfigs(kapi.NamespaceDefault).Get(routerName)
	if err != nil && reflect.TypeOf(err) == reflect.TypeOf(&kerrs.StatusError{}) {
		r.Warn("DClu2001", err, fmt.Sprintf(clGetRtNone, routerName))
		return nil
	} else if err != nil {
		r.Error("DClu2002", err, fmt.Sprintf(clGetRtFailed, routerName, err))
		return nil
	}
	r.Debug("DClu2003", "Found default router DC")
	return dc
}
func (d *ClusterRegistry) getRegistryPods(service *kapi.Service, r types.DiagnosticResult) []*kapi.Pod {
	runningPods := []*kapi.Pod{}
	pods, err := d.KubeClient.Core().Pods(kapi.NamespaceDefault).List(kapi.ListOptions{LabelSelector: labels.SelectorFromSet(service.Spec.Selector)})
	if err != nil {
		r.Error("DClu1005", err, fmt.Sprintf("Finding pods for '%s' service failed. This should never happen. Error: (%T) %[2]v", registryName, err))
		return runningPods
	} else if len(pods.Items) < 1 {
		r.Error("DClu1006", nil, fmt.Sprintf(clRegNoPods, registryName))
		return runningPods
	} else if len(pods.Items) > 1 {
		emptyDir := false     // multiple registry pods using EmptyDir will be inconsistent
		customConfig := false // ... unless the user has configured them for e.g. S3
		configPath := "/config.yml"
		// look through the pod volumes to see if that might be a problem
		podSpec := pods.Items[0].Spec
		container := podSpec.Containers[0]
		for _, volume := range podSpec.Volumes {
			if volume.Name == registryVolume && volume.EmptyDir != nil {
				emptyDir = true
			}
		}
		for _, env := range container.Env {
			if env.Name == "REGISTRY_CONFIGURATION_PATH" {
				configPath = env.Value // look for custom config here
			}
		}
		for _, vmount := range container.VolumeMounts {
			if strings.HasPrefix(configPath, vmount.MountPath) {
				customConfig = true // if something's mounted there, assume custom config.
			}
		}
		if emptyDir {
			if customConfig { // assume they know what they're doing
				r.Info("DClu1020", fmt.Sprintf(clRegMultiCustomCfg, registryName, configPath))
			} else { // assume they scaled up with ephemeral storage
				r.Error("DClu1007", nil, fmt.Sprintf(clRegMultiPods, registryName))
			}
		}
	}
	for _, pod := range pods.Items {
		pod := pod // copy; taking &pod on the loop variable directly would alias every entry to the same address
		r.Debug("DClu1008", fmt.Sprintf("Found %s pod with name %s", registryName, pod.ObjectMeta.Name))
		if pod.Status.Phase != kapi.PodRunning {
			r.Warn("DClu1009", nil, fmt.Sprintf(clRegPodDown, pod.ObjectMeta.Name, registryName))
		} else {
			runningPods = append(runningPods, &pod)
			// Check the logs for that pod for common issues (credentials, DNS resolution failure)
			d.checkRegistryLogs(&pod, r)
		}
	}
	return runningPods
}
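// Several identifiers used by the registry checks above are declared elsewhere
// in the package. Assumed declarations, inferred from usage; the values are
// illustrative, not authoritative:
const (
	registryName     = "docker-registry"                            // name of the integrated registry service/DC
	registryVolume   = "registry-storage"                           // volume backing registry storage
	registryHostname = "docker-registry.default.svc.cluster.local"  // cluster DNS name of the registry
	registryPort     = "5000"                                       // default registry service port
)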
func GetMasterConfig(r types.DiagnosticResult, masterConfigFile string) (*configapi.MasterConfig, error) {
	if masterConfigLoaded { // no need to do this more than once
		return masterConfig, masterConfigLoadError
	}
	r.Debug("DH0001", fmt.Sprintf("Looking for master config file at '%s'", masterConfigFile))
	masterConfigLoaded = true
	masterConfig, masterConfigLoadError = configapilatest.ReadAndResolveMasterConfig(masterConfigFile)
	if masterConfigLoadError != nil {
		r.Error("DH0002", masterConfigLoadError, fmt.Sprintf("Could not read master config file '%s':\n(%T) %[2]v", masterConfigFile, masterConfigLoadError))
	} else {
		r.Debug("DH0003", fmt.Sprintf("Found a master config file: %[1]s", masterConfigFile))
	}
	return masterConfig, masterConfigLoadError
}
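// GetMasterConfig relies on package-level state that is not shown in this
// file. A sketch of the assumed declarations, with names taken directly from
// the usage above:
var (
	masterConfigLoaded    bool                    // set once the first load attempt completes
	masterConfig          *configapi.MasterConfig // cached result of the load, possibly nil
	masterConfigLoadError error                   // cached error from the first load attempt
)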
// getResolvConf reads a clientConfig from resolv.conf and complains about any errors
func getResolvConf(r types.DiagnosticResult) (*dns.ClientConfig, error) {
	resolvConf, err := dns.ClientConfigFromFile("/etc/resolv.conf")
	if err != nil {
		r.Error("DP3001", err, fmt.Sprintf("could not load/parse resolver file /etc/resolv.conf: %v", err))
		return nil, err
	}
	if len(resolvConf.Servers) == 0 {
		// return a non-nil error here; err is nil at this point, and callers
		// that only check the error would otherwise dereference a nil config
		err = fmt.Errorf("could not find any nameservers defined in /etc/resolv.conf")
		r.Error("DP3002", nil, err.Error())
		return nil, err
	}
	if len(resolvConf.Search) == 0 {
		r.Warn("DP3011", nil, "could not find any search domains defined in /etc/resolv.conf")
		resolvConf.Search = nil
	}
	r.Debug("DP3012", fmt.Sprintf("Pod /etc/resolv.conf contains:\nnameservers: %v\nsearch domains: %v", resolvConf.Servers, resolvConf.Search))
	return resolvConf, nil
}
func resolveSearch(resolvConf *dns.ClientConfig, r types.DiagnosticResult) {
	foundDomain := false
	randomString := func() string {
		b := make([]byte, 20)
		for i := range b {
			b[i] = letterBytes[rand.Intn(len(letterBytes))]
		}
		return string(b)
	}()
	seenDP2014 := sets.String{}
	seenDP2015 := sets.String{}
	for _, domain := range resolvConf.Search {
		if domain == "svc.cluster.local" {
			foundDomain = true // this will make kubernetes.default work
		}
		// put together a DNS query to configured nameservers for each search domain
		msg := new(dns.Msg)
		msg.SetQuestion("wildcard."+randomString+"."+domain+".", dns.TypeA)
		msg.RecursionDesired = true // otherwise we just get the authority section for the TLD
		for _, server := range resolvConf.Servers {
			result, completed := dnsQueryWithTimeout(msg, server, 2)
			switch {
			case !completed:
				if !seenDP2014.Has(server) {
					r.Warn("DP2014", nil, fmt.Sprintf("A request to the nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.", server))
					seenDP2014.Insert(server) // no need to keep warning about the same server for every domain
				}
			case result.err != nil:
				if !seenDP2015.Has(server) {
					r.Warn("DP2015", result.err, fmt.Sprintf("Error querying nameserver %s:\n %v\nThis may indicate a problem with DNS.", server, result.err))
					seenDP2015.Insert(server) // don't repeat the error for the same nameserver; chances are it's the same error
				}
			case result.in.Answer == nil, len(result.in.Answer) == 0:
				r.Debug("DP2017", fmt.Sprintf("Nameserver %s responded to wildcard with no answer, which is expected.\n%v", server, result.in))
			default: // the random domain is not supposed to resolve
				r.Error("DP2016", nil, fmt.Sprintf(txtDP2016, server, domain, result.in))
			}
		}
	}
	if !foundDomain {
		r.Error("DP2019", nil, "Did not find svc.cluster.local among the configured search domains in /etc/resolv.conf.\nThis is likely to cause problems with certain components that expect to use partial cluster addresses.")
	}
}
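// letterBytes, used by resolveSearch's random-label generator above, is
// declared elsewhere in the package. A plausible assumed declaration (the
// exact alphabet is a guess):
const letterBytes = "abcdefghijklmnopqrstuvwxyz0123456789"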
// checkKibanaRoutesInOauthClient verifies the client contains the correct redirect URIs
func checkKibanaRoutesInOauthClient(r types.DiagnosticResult, osClient *client.Client, project string, oauthclient *oauthapi.OAuthClient) {
	r.Debug("AGL0141", "Checking oauthclient redirectURIs for the logging routes...")
	routeList, err := osClient.Routes(project).List(kapi.ListOptions{LabelSelector: loggingSelector.AsSelector()})
	if err != nil {
		r.Error("AGL0143", err, fmt.Sprintf("Error retrieving the logging routes: %s", err))
		return
	}
	redirectUris, err := parseRedirectUris(oauthclient.RedirectURIs)
	if err != nil {
		r.Error("AGL0145", err, "Error parsing the OAuthClient.RedirectURIs")
		return
	}
	for _, route := range routeList.Items {
		if !redirectUris.Has(route.Spec.Host) {
			message := fmt.Sprintf("OauthClient '%s' does not include a redirectURI for route '%s' which is '%s'", oauthclient.ObjectMeta.Name, route.ObjectMeta.Name, route.Spec.Host)
			r.Error("AGL0147", errors.New(message), message)
		}
	}
}
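// parseRedirectUris is called above but defined elsewhere. A minimal sketch of
// the assumed behavior, inferred from its use against route.Spec.Host: reduce
// each redirect URI to its host component so it can be compared against route
// hosts. The url package import is assumed.
func parseRedirectUris(uris []string) (sets.String, error) {
	hosts := sets.String{}
	for _, uri := range uris {
		parsed, err := url.Parse(uri)
		if err != nil {
			return hosts, err
		}
		hosts.Insert(parsed.Host) // e.g. "https://kibana.example.com/auth" -> "kibana.example.com"
	}
	return hosts, nil
}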
func (d *ClusterRouter) getRouterPods(dc *osapi.DeploymentConfig, r types.DiagnosticResult) *kapi.PodList {
	pods, err := d.KubeClient.Pods(kapi.NamespaceDefault).List(kapi.ListOptions{LabelSelector: labels.SelectorFromSet(dc.Spec.Selector)})
	if err != nil {
		r.Error("DClu2004", err, fmt.Sprintf("Finding pods for '%s' DeploymentConfig failed. This should never happen. Error: (%[2]T) %[2]v", routerName, err))
		return nil
	}
	running := []kapi.Pod{}
	for _, pod := range pods.Items {
		if pod.Status.Phase != kapi.PodRunning {
			r.Debug("DClu2005", fmt.Sprintf("router pod with name %s is not running", pod.ObjectMeta.Name))
		} else {
			running = append(running, pod)
			r.Debug("DClu2006", fmt.Sprintf("Found running router pod with name %s", pod.ObjectMeta.Name))
		}
	}
	pods.Items = running
	if len(running) == 0 {
		r.Error("DClu2007", nil, fmt.Sprintf(clRtNoPods, routerName))
		return nil
	}
	return pods
}
func connectAndResolve(resolvConf *dns.ClientConfig, r types.DiagnosticResult) {
	for serverIndex, server := range resolvConf.Servers {
		// put together a DNS query to configured nameservers for kubernetes.default
		msg := new(dns.Msg)
		msg.SetQuestion("kubernetes.default.svc.cluster.local.", dns.TypeA)
		msg.RecursionDesired = false
		if result, completed := dnsQueryWithTimeout(msg, server, 2); !completed {
			if serverIndex == 0 { // in a pod, master (SkyDNS) IP is injected as first nameserver
				r.Warn("DP2009", nil, fmt.Sprintf("A request to the master (SkyDNS) nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.\nThis nameserver is critical for resolving cluster DNS names.", server))
			} else {
				r.Warn("DP2010", nil, fmt.Sprintf("A request to the nameserver %s timed out.\nThis could be temporary but could also indicate network or DNS problems.", server))
			}
		} else {
			in, err := result.in, result.err
			if serverIndex == 0 { // in a pod, master (SkyDNS) IP is injected as first nameserver
				if err != nil {
					r.Error("DP2003", err, fmt.Sprintf("The first /etc/resolv.conf nameserver %s\ncould not resolve kubernetes.default.svc.cluster.local.\nError: %v\nThis nameserver points to the master's SkyDNS which is critical for\nresolving cluster names, e.g. for Services.", server, err))
				} else if len(in.Answer) == 0 {
					// err is nil in this branch, so pass nil rather than a meaningless error
					r.Error("DP2006", nil, fmt.Sprintf("The first /etc/resolv.conf nameserver %s\ncould not resolve kubernetes.default.svc.cluster.local.\nReturn code: %v\nThis nameserver points to the master's SkyDNS which is critical for\nresolving cluster names, e.g. for Services.", server, dns.RcodeToString[in.MsgHdr.Rcode]))
				} else {
					r.Debug("DP2007", fmt.Sprintf("The first /etc/resolv.conf nameserver %s\nresolved kubernetes.default.svc.cluster.local. to:\n %s", server, in.Answer[0]))
				}
			} else if err != nil {
				r.Warn("DP2004", err, fmt.Sprintf("Error querying nameserver %s:\n %v\nThis may indicate a problem with non-cluster DNS.", server, err))
			} else {
				rcode := in.MsgHdr.Rcode
				switch rcode {
				case dns.RcodeSuccess, dns.RcodeNameError: // aka NXDOMAIN
					r.Debug("DP2005", fmt.Sprintf("Successful query to nameserver %s", server))
				default:
					r.Warn("DP2008", nil, fmt.Sprintf("Received unexpected return code '%s' from nameserver %s:\nThis may indicate a problem with non-cluster DNS.", dns.RcodeToString[rcode], server))
				}
			}
		}
	}
}
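// dnsQueryWithTimeout and its result type are used by several checks above but
// defined elsewhere. A minimal sketch under assumptions (the type and field
// names are inferred from usage; the ":53" port suffix is assumed since
// resolv.conf lists bare addresses): run the exchange with a deadline so a
// dead nameserver cannot hang a diagnostic.
type dnsResult struct {
	in  *dns.Msg // response message, nil on error
	err error    // transport or protocol error, if any
}

func dnsQueryWithTimeout(msg *dns.Msg, server string, seconds int) (dnsResult, bool) {
	ch := make(chan dnsResult, 1)
	go func() {
		client := dns.Client{Timeout: time.Duration(seconds) * time.Second}
		in, _, err := client.Exchange(msg, server+":53")
		ch <- dnsResult{in, err}
	}()
	select {
	case result := <-ch:
		return result, true
	case <-time.After(time.Duration(seconds) * time.Second):
		return dnsResult{}, false // timed out; callers see completed == false
	}
}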
func retrieveLoggingProject(r types.DiagnosticResult, masterCfg *configapi.MasterConfig, osClient *client.Client) string {
	r.Debug("AGL0010", fmt.Sprintf("masterConfig.AssetConfig.LoggingPublicURL: '%s'", masterCfg.AssetConfig.LoggingPublicURL))
	projectName := ""
	if len(masterCfg.AssetConfig.LoggingPublicURL) == 0 {
		r.Debug("AGL0017", "masterConfig.AssetConfig.LoggingPublicURL is empty")
		return projectName
	}
	loggingUrl, err := url.Parse(masterCfg.AssetConfig.LoggingPublicURL)
	if err != nil {
		r.Error("AGL0011", err, fmt.Sprintf("Unable to parse the loggingPublicURL from the masterConfig '%s'", masterCfg.AssetConfig.LoggingPublicURL))
		return projectName
	}
	routeList, err := osClient.Routes(kapi.NamespaceAll).List(kapi.ListOptions{LabelSelector: loggingSelector.AsSelector()})
	if err != nil {
		r.Error("AGL0012", err, fmt.Sprintf("There was an error while trying to find the route associated with '%s' which is probably transient: %s", loggingUrl, err))
		return projectName
	}
	for _, route := range routeList.Items {
		r.Debug("AGL0013", fmt.Sprintf("Comparing URL to route.Spec.Host: %s", route.Spec.Host))
		if loggingUrl.Host == route.Spec.Host {
			if len(projectName) == 0 {
				projectName = route.ObjectMeta.Namespace
				r.Info("AGL0015", fmt.Sprintf("Found route '%s' matching logging URL '%s' in project: '%s'", route.ObjectMeta.Name, loggingUrl.Host, projectName))
			} else {
				r.Warn("AGL0019", nil, fmt.Sprintf("Found additional route '%s' matching logging URL '%s' in project: '%s'. This could mean you have multiple logging deployments.", route.ObjectMeta.Name, loggingUrl.Host, projectName))
			}
		}
	}
	if len(projectName) == 0 {
		message := fmt.Sprintf("Unable to find a route matching the loggingPublicURL defined in the master config '%s'. Check that the URL is correct and aggregated logging is deployed.", loggingUrl)
		r.Error("AGL0014", errors.New(message), message)
		return ""
	}
	project, err := osClient.Projects().Get(projectName)
	if err != nil {
		r.Error("AGL0018", err, fmt.Sprintf("There was an error retrieving project '%s' which is most likely a transient error: %s", projectName, err))
		return ""
	}
	nodeSelector, ok := project.ObjectMeta.Annotations["openshift.io/node-selector"]
	if !ok || len(nodeSelector) != 0 {
		r.Warn("AGL0030", nil, fmt.Sprintf(projectNodeSelectorWarning, projectName))
	}
	return projectName
}
// checkKibanaSecret confirms the secret used by Kibana matches that configured in the OAuth client
func checkKibanaSecret(r types.DiagnosticResult, osClient *client.Client, kClient *kclient.Client, project string, oauthclient *oauthapi.OAuthClient) {
	r.Debug("AGL0100", "Checking oauthclient secrets...")
	secret, err := kClient.Secrets(project).Get(kibanaProxySecretName)
	if err != nil {
		r.Error("AGL0105", err, fmt.Sprintf("Error retrieving the secret '%s': %s", kibanaProxySecretName, err))
		return
	}
	decoded, err := decodeSecret(secret, oauthSecretKeyName)
	if err != nil {
		r.Error("AGL0110", err, fmt.Sprintf("Unable to decode Kibana Secret: %s", err))
		return
	}
	if decoded != oauthclient.Secret {
		r.Debug("AGL0120", fmt.Sprintf("OauthClient Secret: '%s'", oauthclient.Secret))
		r.Debug("AGL0125", fmt.Sprintf("Decoded Kibana Secret: '%s'", decoded))
		message := fmt.Sprintf("The %s OauthClient.Secret does not match the decoded oauth secret in '%s'", kibanaProxyOauthClientName, kibanaProxySecretName)
		r.Error("AGL0130", errors.New(message), message)
	}
}
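// decodeSecret is used above but lives elsewhere in the package. A sketch of
// the assumed behavior: look up the named key in the secret and return its
// value as a string (the Kubernetes client already base64-decodes Secret.Data,
// so no further decoding should be needed).
func decodeSecret(secret *kapi.Secret, key string) (string, error) {
	if value, ok := secret.Data[key]; ok {
		return string(value), nil
	}
	return "", fmt.Errorf("secret '%s' does not contain key '%s'", secret.ObjectMeta.Name, key)
}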
// ----------------------------------------------------------
// Attempt to open file at path as client config
// If there is a problem and errmsg is set, log an error
func (d ConfigLoading) canOpenConfigFile(path string, errmsg string, r types.DiagnosticResult) bool {
	var file *os.File
	var err error
	if path == "" { // empty param/envvar
		return false
	} else if file, err = os.Open(path); err == nil {
		r.Debug("DCli1004", fmt.Sprintf("Reading client config at %s", path))
	} else if errmsg == "" {
		r.Debug("DCli1005", fmt.Sprintf("Could not read client config at %s:\n%#v", path, err))
	} else if os.IsNotExist(err) {
		r.Debug("DCli1006", errmsg+"but that file does not exist.")
	} else if os.IsPermission(err) {
		r.Error("DCli1007", err, errmsg+"but lack permission to read that file.")
	} else {
		r.Error("DCli1008", err, fmt.Sprintf("%sbut there was an error opening it:\n%#v", errmsg, err))
	}
	if file != nil { // it is open for reading
		defer file.Close()
		if buffer, err := ioutil.ReadAll(file); err != nil {
			r.Error("DCli1009", err, fmt.Sprintf("Unexpected error while reading client config file (%s): %v", path, err))
		} else if _, err := clientcmd.Load(buffer); err != nil {
			r.Error("DCli1010", err, fmt.Sprintf(`
Error reading YAML from client config file (%s):
  %v
This file may have been truncated or mis-edited.
Please fix, remove, or obtain a new client config`, file.Name(), err))
		} else {
			r.Info("DCli1011", fmt.Sprintf("Successfully read a client config file at '%s'", path))
			/* Note, we're not going to use this config file directly.
			 * Instead, we'll defer to the openshift client code to assimilate
			 * flags, env vars, and the potential hierarchy of config files
			 * into an actual configuration that the client uses.
			 * However, for diagnostic purposes, record the files we find.
			 */
			return true
		}
	}
	return false
}
func (d *DiagnosticPod) runDiagnosticPod(service *kapi.Service, r types.DiagnosticResult) {
	loglevel := d.Level
	if loglevel > 2 {
		loglevel = 2 // need to show summary at least
	}
	imageName := d.ImageTemplate.ExpandOrDie("deployer")
	pod, err := d.KubeClient.Core().Pods(d.Namespace).Create(&kapi.Pod{
		ObjectMeta: kapi.ObjectMeta{GenerateName: "pod-diagnostic-test-"},
		Spec: kapi.PodSpec{
			RestartPolicy: kapi.RestartPolicyNever,
			Containers: []kapi.Container{
				{
					Name:    "pod-diagnostics",
					Image:   imageName,
					Command: []string{"openshift", "infra", "diagnostic-pod", "-l", strconv.Itoa(loglevel)},
				},
			},
		},
	})
	if err != nil {
		r.Error("DCli2001", err, fmt.Sprintf("Creating diagnostic pod with image %s failed. Error: (%[2]T) %[2]v", imageName, err))
		return
	}
	defer func() { // delete what we created, or notify that we couldn't
		zero := int64(0)
		delOpts := kapi.DeleteOptions{TypeMeta: pod.TypeMeta, GracePeriodSeconds: &zero}
		if err := d.KubeClient.Core().Pods(d.Namespace).Delete(pod.ObjectMeta.Name, &delOpts); err != nil {
			r.Error("DCli2002", err, fmt.Sprintf("Deleting diagnostic pod '%s' failed. Error: %s", pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
		}
	}()
	pod, err = d.KubeClient.Core().Pods(d.Namespace).Get(pod.ObjectMeta.Name) // status is filled in post-create
	if err != nil {
		r.Error("DCli2003", err, fmt.Sprintf("Retrieving the diagnostic pod definition failed. Error: (%T) %[1]v", err))
		return
	}
	r.Debug("DCli2004", fmt.Sprintf("Created diagnostic pod named %v running image %s.", pod.ObjectMeta.Name, imageName))

	bytelim := int64(1024000)
	podLogsOpts := &kapi.PodLogOptions{
		TypeMeta:   pod.TypeMeta,
		Container:  "pod-diagnostics",
		Follow:     true,
		LimitBytes: &bytelim,
	}
	req, err := d.Factory.LogsForObject(pod, podLogsOpts)
	if err != nil {
		r.Error("DCli2005", err, fmt.Sprintf("The request for diagnostic pod logs failed unexpectedly. Error: (%T) %[1]v", err))
		return
	}

	// wait for pod to be started and logs available
	var scanner *bufio.Scanner
	var lastError error
	for times := 1; true; times++ {
		if times <= 25 {
			readCloser, err := req.Stream()
			if err != nil {
				lastError = err
				r.Debug("DCli2010", fmt.Sprintf("Could not get diagnostic pod logs (loop %d): (%[2]T) %[2]v", times, err))
				time.Sleep(time.Duration(times*100) * time.Millisecond)
				continue
			}
			defer readCloser.Close()
			// make sure we can actually get something from the stream before going on.
			// it seems the creation of docker logs can trail the container start a bit.
			lineScanner := bufio.NewScanner(readCloser)
			if lineScanner.Scan() {
				scanner = lineScanner
				break // success - drop down to reading the logs.
			}
			// no luck - try, try again
			lastError = fmt.Errorf("Diagnostics pod is ready but not its logs (loop %d). Retry.", times)
			r.Debug("DCli2010", lastError.Error())
			time.Sleep(time.Duration(times*100) * time.Millisecond)
			continue
		}
		// after 25 times trying:
		r.Warn("DCli2006", lastError, fmt.Sprintf("Timed out preparing diagnostic pod logs for streaming, so this diagnostic cannot run.\nIt is likely that the image '%s' was not pulled and running yet.\nLast error: (%[2]T) %[2]v", pod.Spec.Containers[0].Image, lastError))
		return
	}

	// then watch logs and wait until it exits
	podLogs, warnings, errors := "", 0, 0
	errorRegex := regexp.MustCompile(`^\[Note\]\s+Errors\s+seen:\s+(\d+)`)
	warnRegex := regexp.MustCompile(`^\[Note\]\s+Warnings\s+seen:\s+(\d+)`)
	// keep in mind one test line was already scanned, so scan after the loop runs once
	for scanned := true; scanned; scanned = scanner.Scan() {
		line := scanner.Text()
		podLogs += line + "\n"
		if matches := errorRegex.FindStringSubmatch(line); matches != nil {
			errors, _ = strconv.Atoi(matches[1])
		} else if matches := warnRegex.FindStringSubmatch(line); matches != nil {
			warnings, _ = strconv.Atoi(matches[1])
		}
	}
	if err := scanner.Err(); err != nil { // Scan terminated abnormally
		r.Error("DCli2009", err, fmt.Sprintf("Unexpected error reading diagnostic pod logs: (%T) %[1]v\nLogs are:\n%[2]s", err, podLogs))
	} else {
		if errors > 0 {
			r.Error("DCli2012", nil, "See the errors below in the output from the diagnostic pod:\n"+podLogs)
		} else if warnings > 0 {
			r.Warn("DCli2013", nil, "See the warnings below in the output from the diagnostic pod:\n"+podLogs)
		} else {
			r.Info("DCli2008", fmt.Sprintf("Output from the diagnostic pod (image %s):\n", imageName)+podLogs)
		}
	}
}