func (d *MetricsApiProxy) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(MetricsApiProxyName)

	// see if it has any active endpoints
	if endpoints, err := d.KubeClient.Core().Endpoints(MetricsApiProxyProject).Get(MetricsApiProxyService); err != nil {
		r.Error("DClu4001", err, fmt.Sprintf("Unexpected error while retrieving %[1]s service endpoints: (%[2]T) %[2]v", MetricsApiProxyService, err))
		return r
	} else {
		active := false
		if endpoints.Subsets != nil {
			for _, endpoint := range endpoints.Subsets {
				if len(endpoint.Addresses) > 0 {
					active = true
					break
				}
			}
		}
		if !active {
			r.Error("DClu4002", nil, fmt.Sprintf(errMsgNoHeapsterEndpoints, MetricsApiProxyService, MetricsApiProxyProject))
			return r
		}
	}

	// the service should respond; see if we can reach it via API proxy
	uri := fmt.Sprintf("/api/v1/proxy/namespaces/%[1]s/services/https:%[2]s:/api/v1/model/metrics", MetricsApiProxyProject, MetricsApiProxyService)
	// note in above, project and service name are already URL-safe
	result := d.KubeClient.CoreClient.RESTClient.Get().RequestURI(uri).Do()
	if err := result.Error(); err != nil {
		r.Error("DClu4003", err, fmt.Sprintf(errMsgApiProxyAccess, uri, err))
	}
	return r
}
func (d NodeConfigCheck) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(NodeConfigCheckName)
	r.Debug("DH1001", fmt.Sprintf("Looking for node config file at '%s'", d.NodeConfigFile))
	nodeConfig, err := configapilatest.ReadAndResolveNodeConfig(d.NodeConfigFile)
	if err != nil {
		r.Error("DH1002", err, fmt.Sprintf("Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err))
		return r
	}

	r.Info("DH1003", fmt.Sprintf("Found a node config file: %[1]s", d.NodeConfigFile))

	results := configvalidation.ValidateNodeConfig(nodeConfig, nil)
	if len(results.Errors) > 0 {
		errText := fmt.Sprintf("Validation of node config file '%s' failed:\n", d.NodeConfigFile)
		for _, err := range results.Errors {
			errText += fmt.Sprintf("%v\n", err)
		}
		r.Error("DH1004", nil, errText)
	}
	if len(results.Warnings) > 0 {
		warnText := fmt.Sprintf("Validation of node config file '%s' warned:\n", d.NodeConfigFile)
		for _, warn := range results.Warnings {
			warnText += fmt.Sprintf("%v\n", warn)
		}
		r.Warn("DH1005", nil, warnText)
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d *NetworkDiagnostic) Check() types.DiagnosticResult {
	d.res = types.NewDiagnosticResult(NetworkDiagnosticName)

	var err error
	var ok bool
	d.pluginName, ok, err = util.GetOpenShiftNetworkPlugin(d.OSClient)
	if err != nil {
		d.res.Error("DNet2001", err, fmt.Sprintf("Checking network plugin failed. Error: %s", err))
		return d.res
	}
	if !ok {
		d.res.Warn("DNet2002", nil, "Skipping network diagnostics check. Reason: Not using openshift network plugin.")
		return d.res
	}

	d.nodes, err = util.GetSchedulableNodes(d.KubeClient)
	if err != nil {
		d.res.Error("DNet2003", err, fmt.Sprintf("Fetching schedulable nodes failed. Error: %s", err))
		return d.res
	}
	if len(d.nodes) == 0 {
		d.res.Warn("DNet2004", nil, "Skipping network checks. Reason: No schedulable/ready nodes found.")
		return d.res
	}

	if len(d.LogDir) == 0 {
		d.LogDir = util.NetworkDiagDefaultLogDir
	}
	d.runNetworkDiagnostic()
	return d.res
}
func (d *ClusterRoleBindings) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ClusterRoleBindingsName)

	reconcileOptions := &policycmd.ReconcileClusterRoleBindingsOptions{
		Confirmed:         false,
		Union:             false,
		Out:               ioutil.Discard,
		RoleBindingClient: d.ClusterRoleBindingsClient.ClusterRoleBindings(),
	}

	changedClusterRoleBindings, err := reconcileOptions.ChangedClusterRoleBindings()
	if policycmd.IsClusterRoleBindingLookupError(err) {
		// we got a partial match, so we log the error that stopped us from getting a full match
		// but continue to interpret the partial results that we did get
		r.Warn("CRBD1008", err, fmt.Sprintf("Error finding ClusterRoleBindings: %v", err))
	} else if err != nil {
		r.Error("CRBD1000", err, fmt.Sprintf("Error inspecting ClusterRoleBindings: %v", err))
		return r
	}

	// success
	if len(changedClusterRoleBindings) == 0 {
		return r
	}

	for _, changedClusterRoleBinding := range changedClusterRoleBindings {
		actualClusterRole, err := d.ClusterRoleBindingsClient.ClusterRoleBindings().Get(changedClusterRoleBinding.Name)
		if kerrs.IsNotFound(err) {
			r.Error("CRBD1001", nil, fmt.Sprintf("clusterrolebinding/%s is missing.\n\nUse the `oadm policy reconcile-cluster-role-bindings` command to create the role binding.", changedClusterRoleBinding.Name))
			continue
		}
		if err != nil {
			r.Error("CRBD1002", err, fmt.Sprintf("Unable to get clusterrolebinding/%s: %v", changedClusterRoleBinding.Name, err))
		}

		missingSubjects, extraSubjects := policycmd.DiffObjectReferenceLists(changedClusterRoleBinding.Subjects, actualClusterRole.Subjects)
		switch {
		case len(missingSubjects) > 0:
			// Only a warning, because they can remove things like self-provisioner role from system:unauthenticated, and it's not an error
			r.Warn("CRBD1003", nil, fmt.Sprintf("clusterrolebinding/%s is missing expected subjects.\n\nUse the `oadm policy reconcile-cluster-role-bindings` command to update the role binding to include expected subjects.", changedClusterRoleBinding.Name))
		case len(extraSubjects) > 0:
			// Only info, because it is normal to use policy to grant cluster roles to users
			r.Info("CRBD1004", fmt.Sprintf("clusterrolebinding/%s has more subjects than expected.\n\nUse the `oadm policy reconcile-cluster-role-bindings` command to update the role binding to remove extra subjects.", changedClusterRoleBinding.Name))
		}

		for _, missingSubject := range missingSubjects {
			r.Info("CRBD1005", fmt.Sprintf("clusterrolebinding/%s is missing subject %v.", changedClusterRoleBinding.Name, missingSubject))
		}
		for _, extraSubject := range extraSubjects {
			r.Info("CRBD1006", fmt.Sprintf("clusterrolebinding/%s has extra subject %v.", changedClusterRoleBinding.Name, extraSubject))
		}

		r.Debug("CRBD1007", fmt.Sprintf("clusterrolebinding/%s is now %v.", changedClusterRoleBinding.Name, changedClusterRoleBinding))
	}

	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d PodCheckDns) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(PodCheckDnsName)

	if resolvConf, err := getResolvConf(r); err == nil {
		connectAndResolve(resolvConf, r)
		resolveSearch(resolvConf, r)
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d CheckServiceNetwork) Check() types.DiagnosticResult {
	d.res = types.NewDiagnosticResult(CheckServiceNetworkName)

	pluginName, ok, err := util.GetOpenShiftNetworkPlugin(d.OSClient)
	if err != nil {
		d.res.Error("DSvcNet1001", err, fmt.Sprintf("Checking network plugin failed. Error: %s", err))
		return d.res
	}
	if !ok {
		d.res.Warn("DSvcNet1002", nil, "Skipping service connectivity test. Reason: Not using openshift network plugin.")
		return d.res
	}

	services, err := getAllServices(d.KubeClient)
	if err != nil {
		d.res.Error("DSvcNet1003", err, fmt.Sprintf("Getting all services failed. Error: %s", err))
		return d.res
	}
	if len(services) == 0 {
		d.res.Warn("DSvcNet1004", nil, "Skipping service connectivity test. Reason: No services found.")
		return d.res
	}

	localPods, _, err := util.GetLocalAndNonLocalDiagnosticPods(d.KubeClient)
	if err != nil {
		d.res.Error("DSvcNet1005", err, fmt.Sprintf("Getting local and nonlocal pods failed. Error: %s", err))
		return d.res
	}

	if sdnapi.IsOpenShiftMultitenantNetworkPlugin(pluginName) {
		netnsList, err := d.OSClient.NetNamespaces().List(kapi.ListOptions{})
		if err != nil {
			d.res.Error("DSvcNet1006", err, fmt.Sprintf("Getting all network namespaces failed. Error: %s", err))
			return d.res
		}

		d.vnidMap = map[string]uint32{}
		for _, netns := range netnsList.Items {
			d.vnidMap[netns.NetName] = netns.NetID
		}
	}

	localGlobalPods, localNonGlobalPods := util.GetGlobalAndNonGlobalPods(localPods, d.vnidMap)

	// Applicable to flat and multitenant networks
	if len(localGlobalPods) > 0 {
		d.checkConnection(localGlobalPods, services, "Skipping service connectivity test for global projects. Reason: Couldn't find a global pod.")
	}

	// Applicable to multitenant network
	isMultitenant := (d.vnidMap != nil)
	if isMultitenant {
		d.checkConnection(localNonGlobalPods, services, "Skipping service connectivity test for non-global projects. Reason: Couldn't find a non-global pod.")
	}
	return d.res
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d CheckExternalNetwork) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(CheckExternalNetworkName)

	externalAddress := "www.redhat.com"
	kexecer := kexec.New()
	if _, err := kexecer.Command("ping", "-c1", "-W2", externalAddress).CombinedOutput(); err != nil {
		// Admin may intentionally block access to the external network. If this check fails
		// it doesn't necessarily mean that something is wrong. So just warn in this case.
		r.Warn("DExtNet1001", nil, fmt.Sprintf("Pinging external address %q failed. Check if the admin intentionally blocked access to the external network. Error: %s", externalAddress, err))
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d PodCheckAuth) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(PodCheckAuthName)
	token, err := ioutil.ReadFile(d.TokenPath)
	if err != nil {
		r.Error("DP1001", err, fmt.Sprintf("could not read the service account token: %v", err))
		return r
	}
	d.authenticateToMaster(string(token), r)
	d.authenticateToRegistry(string(token), r)
	return r
}
func (d *ClusterRouter) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ClusterRouterName)
	if dc := d.getRouterDC(r); dc != nil {
		// Check that it actually has running pod(s) selected
		if podList := d.getRouterPods(dc, r); podList != nil {
			for _, pod := range podList.Items {
				// Check the logs for that pod for common issues (credentials, DNS resolution failure)
				d.checkRouterLogs(&pod, r)
			}
		}
	}
	return r
}
func (d *ServiceExternalIPs) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ServiceExternalIPsName)

	masterConfig, err := hostdiag.GetMasterConfig(r, d.MasterConfigFile)
	if err != nil {
		r.Info("DH2004", "Unreadable master config; skipping this diagnostic.")
		return r
	}

	admit, reject := []*net.IPNet{}, []*net.IPNet{}
	if cidrs := masterConfig.NetworkConfig.ExternalIPNetworkCIDRs; cidrs != nil {
		reject, admit, err = admission.ParseCIDRRules(cidrs)
		if err != nil {
			r.Error("DH2007", err, fmt.Sprintf("Could not parse master config NetworkConfig.ExternalIPNetworkCIDRs: (%[1]T) %[1]v", err))
			return r
		}
	}

	services, err := d.KclusterClient.Services("").List(kapi.ListOptions{})
	if err != nil {
		r.Error("DH2005", err, fmt.Sprintf("Error while listing cluster services: (%[1]T) %[1]v", err))
		return r
	}

	errList := []string{}
	for _, service := range services.Items {
		if len(service.Spec.ExternalIPs) == 0 {
			continue
		}
		if len(admit) == 0 {
			errList = append(errList, fmt.Sprintf("Service %s.%s specifies ExternalIPs %v, but none are permitted.", service.Namespace, service.Name, service.Spec.ExternalIPs))
			continue
		}
		for _, ipString := range service.Spec.ExternalIPs {
			ip := net.ParseIP(ipString)
			if ip == nil {
				continue // we don't really care for the purposes of this diagnostic
			}
			if admission.NetworkSlice(reject).Contains(ip) || !admission.NetworkSlice(admit).Contains(ip) {
				errList = append(errList, fmt.Sprintf("Service %s.%s specifies ExternalIP %s that is not permitted by the master ExternalIPNetworkCIDRs setting.", service.Namespace, service.Name, ipString))
			}
		}
	}
	if len(errList) > 0 {
		r.Error("DH2006", nil, `The following problems were found with service ExternalIPs in the cluster.
These services were created before the master ExternalIPNetworkCIDRs setting changed to exclude them.
The default ExternalIPNetworkCIDRs now excludes all ExternalIPs on services.
`+strings.Join(errList, "\n"))
	}

	return r
}
func (d *ClusterRoles) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ClusterRolesName)

	reconcileOptions := &policycmd.ReconcileClusterRolesOptions{
		Confirmed:  false,
		Union:      false,
		Out:        ioutil.Discard,
		RoleClient: d.ClusterRolesClient.ClusterRoles(),
	}

	changedClusterRoles, err := reconcileOptions.ChangedClusterRoles()
	if err != nil {
		r.Error("CRD1000", err, fmt.Sprintf("Error inspecting ClusterRoles: %v", err))
		return r
	}

	// success
	if len(changedClusterRoles) == 0 {
		return r
	}

	for _, changedClusterRole := range changedClusterRoles {
		actualClusterRole, err := d.ClusterRolesClient.ClusterRoles().Get(changedClusterRole.Name)
		if kerrs.IsNotFound(err) {
			r.Error("CRD1002", nil, fmt.Sprintf("clusterrole/%s is missing.\n\nUse the `oadm policy reconcile-cluster-roles` command to create the role.", changedClusterRole.Name))
			continue
		}
		if err != nil {
			r.Error("CRD1001", err, fmt.Sprintf("Unable to get clusterrole/%s: %v", changedClusterRole.Name, err))
		}

		_, missingRules := rulevalidation.Covers(actualClusterRole.Rules, changedClusterRole.Rules)
		if len(missingRules) == 0 {
			r.Warn("CRD1003", nil, fmt.Sprintf("clusterrole/%s has changed, but the existing role has more permissions than the new role.\n\nUse the `oadm policy reconcile-cluster-roles` command to update the role to reduce permissions.", changedClusterRole.Name))
			_, extraRules := rulevalidation.Covers(changedClusterRole.Rules, actualClusterRole.Rules)
			for _, extraRule := range extraRules {
				r.Info("CRD1008", fmt.Sprintf("clusterrole/%s has extra permission %v.", changedClusterRole.Name, extraRule))
			}
			continue
		}

		r.Error("CRD1005", nil, fmt.Sprintf("clusterrole/%s has changed and the existing role does not have enough permissions.\n\nUse the `oadm policy reconcile-cluster-roles` command to update the role.", changedClusterRole.Name))
		for _, missingRule := range missingRules {
			r.Info("CRD1007", fmt.Sprintf("clusterrole/%s is missing permission %v.", changedClusterRole.Name, missingRule))
		}
		r.Debug("CRD1006", fmt.Sprintf("clusterrole/%s is now %v.", changedClusterRole.Name, changedClusterRole))
	}

	return r
}
func (d *ClusterRoles) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ClusterRolesName)

	reconcileOptions := &policycmd.ReconcileClusterRolesOptions{
		Confirmed:  false,
		Union:      false,
		Out:        ioutil.Discard,
		RoleClient: d.ClusterRolesClient.ClusterRoles(),
	}

	changedClusterRoles, _, err := reconcileOptions.ChangedClusterRoles()
	if err != nil {
		r.Error("CRD1000", err, fmt.Sprintf("Error inspecting ClusterRoles: %v", err))
		return r
	}

	// success
	if len(changedClusterRoles) == 0 {
		return r
	}

	for _, changedClusterRole := range changedClusterRoles {
		actualClusterRole, err := d.ClusterRolesClient.ClusterRoles().Get(changedClusterRole.Name)
		if kerrs.IsNotFound(err) {
			r.Error("CRD1002", nil, fmt.Sprintf(clusterRoleMissing, changedClusterRole.Name))
			continue
		}
		if err != nil {
			r.Error("CRD1001", err, fmt.Sprintf("Unable to get clusterrole/%s: %v", changedClusterRole.Name, err))
		}

		_, missingRules := rulevalidation.Covers(actualClusterRole.Rules, changedClusterRole.Rules)
		if len(missingRules) == 0 {
			r.Info("CRD1003", fmt.Sprintf(clusterRoleReduced, changedClusterRole.Name))
			_, extraRules := rulevalidation.Covers(changedClusterRole.Rules, actualClusterRole.Rules)
			for _, extraRule := range extraRules {
				r.Info("CRD1008", fmt.Sprintf("clusterrole/%s has extra permission %v.", changedClusterRole.Name, extraRule))
			}
			continue
		}

		r.Error("CRD1005", nil, fmt.Sprintf(clusterRoleChanged, changedClusterRole.Name))
		for _, missingRule := range missingRules {
			r.Info("CRD1007", fmt.Sprintf("clusterrole/%s is missing permission %v.", changedClusterRole.Name, missingRule))
		}
		r.Debug("CRD1006", fmt.Sprintf("clusterrole/%s is now %v.", changedClusterRole.Name, changedClusterRole))
	}

	return r
}
func (d *ClusterRegistry) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ClusterRegistryName)
	if service := d.getRegistryService(r); service != nil {
		// Check that it actually has pod(s) selected and running
		if runningPods := d.getRegistryPods(service, r); len(runningPods) == 0 {
			r.Error("DClu1001", nil, fmt.Sprintf(clRegNoRunningPods, registryName))
			return r
		} else if d.checkRegistryEndpoints(runningPods, r) { // Check that matching endpoint exists on the service
			// attempt to create an imagestream and see if it gets the same registry service IP from the service cache
			d.verifyRegistryImageStream(service, r)
		}
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d CollectNetworkInfo) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(CollectNetworkInfoName)

	nodeName, _, err := util.GetLocalNode(d.KubeClient)
	if err != nil {
		r.Error("DColNet1001", err, fmt.Sprintf("Fetching local node info failed: %s", err))
		return r
	}

	l := util.LogInterface{
		Result: r,
		Logdir: filepath.Join(util.NetworkDiagDefaultLogDir, util.NetworkDiagNodeLogDirPrefix, nodeName),
	}
	l.LogNode(d.KubeClient)
	return r
}
func (d NodeConfigCheck) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(NodeConfigCheckName)
	r.Debug("DH1001", fmt.Sprintf("Looking for node config file at '%s'", d.NodeConfigFile))
	nodeConfig, err := configapilatest.ReadAndResolveNodeConfig(d.NodeConfigFile)
	if err != nil {
		r.Error("DH1002", err, fmt.Sprintf("Could not read node config file '%s':\n(%T) %[2]v", d.NodeConfigFile, err))
		return r
	}

	r.Info("DH1003", fmt.Sprintf("Found a node config file: %[1]s", d.NodeConfigFile))

	for _, err := range configvalidation.ValidateNodeConfig(nodeConfig) {
		r.Error("DH1004", err, fmt.Sprintf("Validation of node config file '%s' failed:\n(%T) %[2]v", d.NodeConfigFile, err))
	}
	return r
}
func (d *MasterNode) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(MasterNodeName)

	nodes, err := d.KubeClient.Nodes().List(kapi.ListOptions{})
	if err != nil {
		r.Error("DClu3002", err, fmt.Sprintf(clientErrorGettingNodes, err))
		return r
	}

	// Provide the actual net.LookupHost as the DNS resolver:
	serverIps, err := resolveServerIP(d.ServerUrl, net.LookupHost)
	if err != nil {
		r.Error("DClu3007", err, "Error resolving the server's IP")
		return r
	}

	return searchNodesForIP(nodes.Items, serverIps)
}
func (d UnitStatus) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(UnitStatusName)

	unitRequiresUnit(r, d.SystemdUnits["openshift-node"], d.SystemdUnits["iptables"], nodeRequiresIPTables)
	unitRequiresUnit(r, d.SystemdUnits["openshift-node"], d.SystemdUnits["docker"], `Nodes use Docker to run containers.`)
	unitRequiresUnit(r, d.SystemdUnits["openshift-node"], d.SystemdUnits["openvswitch"], sdUnitSDNreqOVS)
	unitRequiresUnit(r, d.SystemdUnits["openshift-master"], d.SystemdUnits["openvswitch"], `Masters use openvswitch for access to cluster SDN networking`)
	// all-in-one networking *could* be simpler, so fewer checks
	unitRequiresUnit(r, d.SystemdUnits["openshift"], d.SystemdUnits["docker"], `Nodes use Docker to run containers.`)

	// Anything that is enabled but not running deserves notice
	for name, unit := range d.SystemdUnits {
		if unit.Enabled && !unit.Active {
			r.Error("DS3001", nil, fmt.Sprintf(sdUnitInactive, name))
		}
	}
	return r
}
func (d *NodeDefinitions) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult("NodeDefinition")

	nodes, err := d.KubeClient.Core().Nodes().List(kapi.ListOptions{})
	if err != nil {
		r.Error("DClu0001", err, fmt.Sprintf(clientErrorGettingNodes, err))
		return r
	}

	anyNodesAvail := false
	for _, node := range nodes.Items {
		var ready *kapi.NodeCondition
		for i, condition := range node.Status.Conditions {
			switch condition.Type {
			// Each condition appears only once. Currently there's only one... used to be more
			case kapi.NodeReady:
				ready = &node.Status.Conditions[i]
			}
		}

		if ready == nil || ready.Status != kapi.ConditionTrue {
			templateData := log.Hash{"node": node.Name}
			if ready == nil {
				templateData["status"] = "None"
				templateData["reason"] = "There is no readiness record."
			} else {
				templateData["status"] = ready.Status
				templateData["reason"] = ready.Reason
			}

			r.Warn("DClu0002", nil, log.EvalTemplate("DClu0002", nodeNotReady, templateData))
		} else if node.Spec.Unschedulable {
			r.Warn("DClu0003", nil, log.EvalTemplate("DClu0003", nodeNotSched, log.Hash{"node": node.Name}))
		} else {
			anyNodesAvail = true
		}
	}
	if !anyNodesAvail {
		r.Error("DClu0004", nil, "There were no nodes available to use. No new pods can be scheduled.")
	}

	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d CheckNodeNetwork) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(CheckNodeNetworkName)

	_, localIP, err := util.GetLocalNode(d.KubeClient)
	if err != nil {
		r.Error("DNodeNet1001", err, err.Error())
		return r
	}

	localPods, _, err := util.GetLocalAndNonLocalDiagnosticPods(d.KubeClient)
	if err != nil {
		r.Error("DNodeNet1002", err, fmt.Sprintf("Getting local and nonlocal pods failed. Error: %s", err))
		return r
	}

	for _, pod := range localPods {
		checkNodeConnection(&pod, localIP, r)
	}
	return r
}
func (d UnitStatus) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(UnitStatusName)

	unitRequiresUnit(r, d.SystemdUnits["atomic-openshift-node"], d.SystemdUnits["iptables"], nodeRequiresIPTables)
	unitRequiresUnit(r, d.SystemdUnits["atomic-openshift-node"], d.SystemdUnits["docker"], `Nodes use Docker to run containers.`)
	unitRequiresUnit(r, d.SystemdUnits["atomic-openshift-node"], d.SystemdUnits["openvswitch"], fmt.Sprintf(sdUnitSDNreqOVS, "atomic-openshift-node"))
	unitRequiresUnit(r, d.SystemdUnits["atomic-openshift-master"], d.SystemdUnits["atomic-openshift-node"], `Masters must currently also be nodes for access to cluster SDN networking`)

	unitRequiresUnit(r, d.SystemdUnits["origin-node"], d.SystemdUnits["iptables"], nodeRequiresIPTables)
	unitRequiresUnit(r, d.SystemdUnits["origin-node"], d.SystemdUnits["docker"], `Nodes use Docker to run containers.`)
	unitRequiresUnit(r, d.SystemdUnits["origin-node"], d.SystemdUnits["openvswitch"], fmt.Sprintf(sdUnitSDNreqOVS, "origin-node"))
	unitRequiresUnit(r, d.SystemdUnits["origin-master"], d.SystemdUnits["origin-node"], `Masters must currently also be nodes for access to cluster SDN networking`)

	// Anything that is enabled but not running deserves notice
	for name, unit := range d.SystemdUnits {
		if unit.Enabled && !unit.Active {
			r.Error("DS3001", nil, fmt.Sprintf(sdUnitInactive, name))
		}
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d CheckPodNetwork) Check() types.DiagnosticResult {
	d.res = types.NewDiagnosticResult(CheckPodNetworkName)

	pluginName, ok, err := util.GetOpenShiftNetworkPlugin(d.OSClient)
	if err != nil {
		d.res.Error("DPodNet1001", err, fmt.Sprintf("Checking network plugin failed. Error: %s", err))
		return d.res
	}
	if !ok {
		d.res.Warn("DPodNet1002", nil, "Skipping pod connectivity test. Reason: Not using openshift network plugin.")
		return d.res
	}

	localPods, nonlocalPods, err := util.GetLocalAndNonLocalDiagnosticPods(d.KubeClient)
	if err != nil {
		d.res.Error("DPodNet1003", err, fmt.Sprintf("Getting local and nonlocal pods failed. Error: %s", err))
		return d.res
	}

	if sdnapi.IsOpenShiftMultitenantNetworkPlugin(pluginName) {
		netnsList, err := d.OSClient.NetNamespaces().List(kapi.ListOptions{})
		if err != nil {
			d.res.Error("DPodNet1004", err, fmt.Sprintf("Getting all network namespaces failed. Error: %s", err))
			return d.res
		}

		d.vnidMap = map[string]uint32{}
		for _, netns := range netnsList.Items {
			d.vnidMap[netns.NetName] = netns.NetID
		}
	}

	localGlobalPods, localNonGlobalPods := util.GetGlobalAndNonGlobalPods(localPods, d.vnidMap)
	nonlocalGlobalPods, nonlocalNonGlobalPods := util.GetGlobalAndNonGlobalPods(nonlocalPods, d.vnidMap)

	d.checkSameNodePodToPodConnection(localGlobalPods, localNonGlobalPods)
	d.checkDifferentNodePodToPodConnection(localGlobalPods, localNonGlobalPods, nonlocalGlobalPods, nonlocalNonGlobalPods)

	return d.res
}
func searchNodesForIP(nodes []kapi.Node, ips []string) types.DiagnosticResult {
	r := types.NewDiagnosticResult(MasterNodeName)
	r.Debug("DClu3005", fmt.Sprintf("Searching for a node with master IP: %s", ips))

	// Loops = # of nodes * number of IPs per node (2 commonly) * # of IPs the
	// server hostname resolves to. (should usually be 1)
	for _, node := range nodes {
		for _, address := range node.Status.Addresses {
			for _, ipAddress := range ips {
				r.Debug("DClu3006", fmt.Sprintf("Checking node %s address %s", node.ObjectMeta.Name, address.Address))
				if address.Address == ipAddress {
					r.Info("DClu3003", fmt.Sprintf("Found a node with same IP as master: %s", node.ObjectMeta.Name))
					return r
				}
			}
		}
	}
	r.Warn("DClu3004", nil, masterNotRunningAsANode)
	return r
}
func (d MasterConfigCheck) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(MasterConfigCheckName)

	masterConfig, err := GetMasterConfig(r, d.MasterConfigFile)
	if err != nil {
		return r
	}

	results := configvalidation.ValidateMasterConfig(masterConfig, nil)
	if len(results.Errors) > 0 {
		errText := fmt.Sprintf("Validation of master config file '%s' failed:\n", d.MasterConfigFile)
		for _, err := range results.Errors {
			errText += fmt.Sprintf("%v\n", err)
		}
		r.Error("DH0004", nil, errText)
	}
	if len(results.Warnings) > 0 {
		warnText := fmt.Sprintf("Validation of master config file '%s' warned:\n", d.MasterConfigFile)
		for _, warn := range results.Warnings {
			warnText += fmt.Sprintf("%v\n", warn)
		}
		r.Warn("DH0005", nil, warnText)
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d *ConfigLoading) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult("ConfigLoading")
	confFlagValue := d.ClientFlags.Lookup(d.ConfFlagName).Value.String()

	var foundPath string
	rules := config.NewOpenShiftClientConfigLoadingRules()
	paths := append([]string{confFlagValue}, rules.Precedence...)
	for index, path := range paths {
		errmsg := ""
		switch index {
		case 0:
			errmsg = fmt.Sprintf("--%s specified that client config should be at %s\n", d.ConfFlagName, path)
		case len(paths) - 1: // config in ~/.kube
			// no error message indicated if it is not there... user didn't say it would be
		default: // can be multiple paths from the env var in theory; all cases should go here
			if len(os.Getenv(config.OpenShiftConfigPathEnvVar)) != 0 {
				errmsg = fmt.Sprintf("Env var %s specified that client config could be at %s\n", config.OpenShiftConfigPathEnvVar, path)
			}
		}

		if d.canOpenConfigFile(path, errmsg, r) && foundPath == "" {
			d.successfulLoad = true
			foundPath = path
		}
	}
	if foundPath != "" {
		if confFlagValue != "" && confFlagValue != foundPath {
			// found config but not where --config said
			r.Error("DCli1001", nil, fmt.Sprintf(`
The client configuration file was not found where the --%s flag indicated:
  %s
A config file was found at the following location:
  %s
If you wish to use this file for client configuration, you can specify it with the --%[1]s flag, or just not specify the flag.
`, d.ConfFlagName, confFlagValue, foundPath))
		}
	} else { // not found, check for master-generated ones to recommend
		if confFlagValue != "" {
			r.Error("DCli1002", nil, fmt.Sprintf("Did not find config file where --%s=%s indicated", d.ConfFlagName, confFlagValue))
		}
		adminWarningF := `
No client config file was available; however, one exists at
    %[2]s
which may have been generated automatically by the master.
If you want to use this config, you should copy it to the standard location (%[3]s),
or you can set the environment variable %[1]s:
    export %[1]s=%[2]s
If not, obtain a config file and place it in the standard location for use by the client and diagnostics.
`
		// look for it in auto-generated locations when not found properly
		for _, path := range util.AdminKubeConfigPaths {
			msg := fmt.Sprintf("Looking for a possible client config at %s\n", path)
			if d.canOpenConfigFile(path, msg, r) {
				r.Warn("DCli1003", nil, fmt.Sprintf(adminWarningF, config.OpenShiftConfigPathEnvVar, path, config.RecommendedHomeFile))
				break
			}
		}
	}
	return r
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d ConfigContext) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(ConfigContextsName)

	isDefaultContext := d.RawConfig.CurrentContext == d.ContextName

	// prepare bad news message
	errorKey := "DCli0001"
	unusableLine := fmt.Sprintf("The client config context '%s' is unusable", d.ContextName)
	if isDefaultContext {
		errorKey = "DCli0002"
		unusableLine = fmt.Sprintf("The current client config context '%s' is unusable", d.ContextName)
	}

	// check that the context and its constituents are defined in the kubeconfig
	context, exists := d.RawConfig.Contexts[d.ContextName]
	if !exists {
		r.Error(errorKey, nil, fmt.Sprintf("%s:\n Client config context '%s' is not defined.", unusableLine, d.ContextName))
		return r
	}
	clusterName := context.Cluster
	cluster, exists := d.RawConfig.Clusters[clusterName]
	if !exists {
		r.Error(errorKey, nil, fmt.Sprintf("%s:\n Client config context '%s' has a cluster '%s' which is not defined.", unusableLine, d.ContextName, clusterName))
		return r
	}
	authName := context.AuthInfo
	if _, exists := d.RawConfig.AuthInfos[authName]; !exists {
		r.Error(errorKey, nil, fmt.Sprintf("%s:\n Client config context '%s' has a user '%s' which is not defined.", unusableLine, d.ContextName, authName))
		return r
	}

	// we found a fully-defined context
	project := context.Namespace
	if project == "" {
		project = kapi.NamespaceDefault // k8s fills this in anyway if missing from the context
	}
	msgText := contextDesc
	if isDefaultContext {
		msgText = currContextDesc
	}
	msgText = fmt.Sprintf(msgText, d.ContextName, cluster.Server, authName, project)

	// Actually send a request to see if context has connectivity.
	// Note: we cannot reuse factories as they cache the clients, so build new factory for each context.
	osClient, _, _, err := osclientcmd.NewFactory(kclientcmd.NewDefaultClientConfig(*d.RawConfig, &kclientcmd.ConfigOverrides{Context: *context})).Clients()
	// client create now *fails* if cannot connect to server; so, address connectivity errors below
	if err == nil {
		if projects, projerr := osClient.Projects().List(kapi.ListOptions{}); projerr != nil {
			err = projerr
		} else { // success!
			list := []string{}
			for i, project := range projects.Items {
				if i > 9 {
					list = append(list, "...")
					break
				}
				list = append(list, project.Name)
			}
			if len(list) == 0 {
				r.Info("DCli0003", msgText+"Successfully requested project list, but it is empty, so user has no access to anything.")
			} else {
				r.Info("DCli0004", msgText+fmt.Sprintf("Successfully requested project list; has access to project(s):\n %v", list))
			}
			return r
		}
	}

	// something went wrong; couldn't create client or get project list.
	// interpret the terse error messages with helpful info.
	errMsg := err.Error()
	errFull := fmt.Sprintf("(%T) %[1]v\n", err)
	var reason, errId string
	switch {
	case regexp.MustCompile("dial tcp: lookup (\\S+): no such host").MatchString(errMsg):
		errId, reason = "DCli0005", clientNoResolve
	case strings.Contains(errMsg, "x509: certificate signed by unknown authority"):
		errId, reason = "DCli0006", clientUnknownCa
	case strings.Contains(errMsg, "specifying a root certificates file with the insecure flag is not allowed"):
		errId, reason = "DCli0007", clientUnneededCa
	case invalidCertNameRx.MatchString(errMsg):
		match := invalidCertNameRx.FindStringSubmatch(errMsg)
		serverHost := match[len(match)-1]
		errId, reason = "DCli0008", fmt.Sprintf(clientInvCertName, serverHost)
	case regexp.MustCompile("dial tcp (\\S+): connection refused").MatchString(errMsg):
		errId, reason = "DCli0009", clientConnRefused
	case regexp.MustCompile("dial tcp (\\S+): (?:connection timed out|i/o timeout|no route to host)").MatchString(errMsg):
		errId, reason = "DCli0010", clientConnTimeout
	case strings.Contains(errMsg, "malformed HTTP response"):
		errId, reason = "DCli0011", clientMalformedHTTP
	case strings.Contains(errMsg, "tls: oversized record received with length"):
		errId, reason = "DCli0012", clientMalformedTLS
	case strings.Contains(errMsg, `User "system:anonymous" cannot`):
		errId, reason = "DCli0013", clientUnauthn
	case strings.Contains(errMsg, "provide credentials"):
		errId, reason = "DCli0014", clientUnauthz
	default:
		errId, reason = "DCli0015", `Diagnostics does not have an explanation for what this means. Please report this error so one can be added.`
	}
	r.Error(errId, err, msgText+errFull+reason)
	return r
}
// NewAggregatedLogging returns the AggregatedLogging Diagnostic
func NewAggregatedLogging(masterConfigFile string, kclient *kclient.Client, osclient *client.Client) *AggregatedLogging {
	return &AggregatedLogging{nil, masterConfigFile, osclient, kclient, types.NewDiagnosticResult(AggregatedLoggingName)}
}
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d *DiagnosticPod) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult("DiagnosticPod")
	d.runDiagnosticPod(nil, r)
	return r
}
func (d AnalyzeLogs) Check() types.DiagnosticResult {
	r := types.NewDiagnosticResult(AnalyzeLogsName)

	for _, unit := range unitLogSpecs {
		if svc := d.SystemdUnits[unit.Name]; svc.Enabled || svc.Active {
			r.Info("DS0001", fmt.Sprintf("Checking journalctl logs for '%s' service", unit.Name))

			cmd := exec.Command("journalctl", "-ru", unit.Name, "--output=json")
			// JSON comes out of journalctl one line per record
			lineReader, reader, err := func(cmd *exec.Cmd) (*bufio.Scanner, io.ReadCloser, error) {
				stdout, err := cmd.StdoutPipe()
				if err == nil {
					lineReader := bufio.NewScanner(stdout)
					if err = cmd.Start(); err == nil {
						return lineReader, stdout, nil
					}
				}
				return nil, nil, err
			}(cmd)

			if err != nil {
				r.Error("DS0002", err, fmt.Sprintf(sdLogReadErr, unit.Name, errStr(err)))
				return r
			}
			defer func() { // close out pipe once done reading
				reader.Close()
				cmd.Wait()
			}()

			timeLimit := time.Now().Add(-time.Hour)                     // if it didn't happen in the last hour, probably not too relevant
			matchCopy := append([]logMatcher(nil), unit.LogMatchers...) // make a copy, will remove matchers after they match something
			lineCount := 0                                              // each log entry is a line
			for lineReader.Scan() {
				lineCount += 1
				if len(matchCopy) == 0 { // if no rules remain to match
					break // don't waste time reading more log entries
				}
				bytes, entry := lineReader.Bytes(), logEntry{}
				if err := json.Unmarshal(bytes, &entry); err != nil {
					r.Debug("DS0003", fmt.Sprintf("Couldn't read the JSON for this log message:\n%s\nGot error %s", string(bytes), errStr(err)))
				} else {
					if lineCount > 500 && stampTooOld(entry.TimeStamp, timeLimit) {
						r.Debug("DS0004", fmt.Sprintf("Stopped reading %s log: timestamp %s too old", unit.Name, entry.TimeStamp))
						break // if we've analyzed at least 500 entries, stop when age limit reached (don't scan days of logs)
					}
					if unit.StartMatch.MatchString(entry.Message) {
						break // saw log message for unit startup; don't analyze previous logs
					}
					for index, match := range matchCopy { // match log message against provided matchers
						if strings := match.Regexp.FindStringSubmatch(entry.Message); strings != nil {
							// if matches: print interpretation, remove from matchCopy, and go on to next log entry
							keep := match.KeepAfterMatch // generic keep logic
							if match.Interpret != nil {  // apply custom match logic
								currKeep := match.Interpret(&entry, strings, r)
								keep = currKeep
							} else { // apply generic match processing
								text := fmt.Sprintf("Found '%s' journald log message:\n %s\n%s", unit.Name, entry.Message, match.Interpretation)
								switch match.Level {
								case log.DebugLevel:
									r.Debug(match.Id, text)
								case log.InfoLevel:
									r.Info(match.Id, text)
								case log.WarnLevel:
									r.Warn(match.Id, nil, text)
								case log.ErrorLevel:
									r.Error(match.Id, nil, text)
								}
							}

							if !keep { // remove matcher once seen
								matchCopy = append(matchCopy[:index], matchCopy[index+1:]...)
							}
							break
						}
					}
				}
			}
		}
	}
	return r
}