// getChainLines parses a table's iptables-save data to find chains in the table. // It returns a map of iptables.Chain to string where the string is the chain line from the save (with counters etc). func getChainLines(table utiliptables.Table, save []byte) map[utiliptables.Chain]string { // get lines lines := strings.Split(string(save), "\n") chainsMap := make(map[utiliptables.Chain]string) tablePrefix := "*" + string(table) lineNum := 0 // find beginning of table for ; lineNum < len(lines); lineNum++ { if strings.HasPrefix(strings.TrimSpace(lines[lineNum]), tablePrefix) { lineNum++ break } } // parse table lines for ; lineNum < len(lines); lineNum++ { line := strings.TrimSpace(lines[lineNum]) if strings.HasPrefix(line, "COMMIT") || strings.HasPrefix(line, "*") { break } else if len(line) == 0 || strings.HasPrefix(line, "#") { continue } else if strings.HasPrefix(line, ":") && len(line) > 1 { chain := utiliptables.Chain(strings.SplitN(line[1:], " ", 2)[0]) chainsMap[chain] = lines[lineNum] } } return chainsMap }
// This is the same as servicePortChainName but with the endpoint included. func servicePortEndpointChainName(s proxy.ServicePortName, protocol string, endpoint string) utiliptables.Chain { hash := sha256.Sum256([]byte(s.String() + protocol + endpoint)) encoded := base32.StdEncoding.EncodeToString(hash[:]) return utiliptables.Chain("KUBE-SEP-" + encoded[:16]) }
// This is where all of the iptables-save/restore calls happen. // The only other iptables rules are those that are setup in iptablesInit() // assumes proxier.mu is held func (proxier *Proxier) syncProxyRules() { // don't sync rules till we've received services and endpoints if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate { glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master") return } glog.V(3).Infof("Syncing iptables rules") // Ensure main chains and rules are installed. inputChains := []utiliptables.Chain{utiliptables.ChainOutput, utiliptables.ChainPrerouting} // Link the services chain. for _, chain := range inputChains { if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesServicesChain); err != nil { glog.Errorf("Failed to ensure that chain %s exists: %v", iptablesServicesChain, err) return } comment := "kubernetes service portals" args := []string{"-m", "comment", "--comment", comment, "-j", string(iptablesServicesChain)} if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, chain, args...); err != nil { glog.Errorf("Failed to ensure that chain %s jumps to %s: %v", chain, iptablesServicesChain, err) return } } // Link the output rules. { comment := "kubernetes service traffic requiring SNAT" args := []string{"-m", "comment", "--comment", comment, "-m", "mark", "--mark", iptablesMasqueradeMark, "-j", "MASQUERADE"} if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil { glog.Errorf("Failed to ensure that chain %s obeys MASQUERADE mark: %v", utiliptables.ChainPostrouting, err) return } } // Get iptables-save output so we can check for existing chains and rules. // This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore existingChains := make(map[utiliptables.Chain]string) iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableNAT) if err != nil { // if we failed to get any rules glog.Errorf("Failed to execute iptable-save, syncing all rules. %s", err.Error()) } else { // otherwise parse the output existingChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw) } chainsLines := bytes.NewBuffer(nil) rulesLines := bytes.NewBuffer(nil) // Write table header. writeLine(chainsLines, "*nat") // Make sure we keep stats for the top-level chains, if they existed // (which they should have because we created them above). if chain, ok := existingChains[iptablesServicesChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(iptablesServicesChain)) } if chain, ok := existingChains[iptablesNodePortsChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(iptablesNodePortsChain)) } // Accumulate chains to keep. activeChains := map[utiliptables.Chain]bool{} // use a map as a set // Accumulate new local ports that we have opened. newLocalPorts := map[localPort]closeable{} // Build rules for each service. for svcName, svcInfo := range proxier.serviceMap { protocol := strings.ToLower(string(svcInfo.protocol)) // Create the per-service chain, retaining counters if possible. svcChain := servicePortChainName(svcName, protocol) if chain, ok := existingChains[svcChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(svcChain)) } activeChains[svcChain] = true // Capture the clusterIP. args := []string{ "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s cluster IP\"", svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()), "--dport", fmt.Sprintf("%d", svcInfo.port), } if proxier.masqueradeAll { writeLine(rulesLines, append(args, "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) } writeLine(rulesLines, append(args, "-j", string(svcChain))...) // Capture externalIPs. for _, externalIP := range svcInfo.externalIPs { // If the "external" IP happens to be an IP that is local to this // machine, hold the local port open so no other process can open it // (because the socket might open but it would never work). if local, err := isLocalIP(externalIP); err != nil { glog.Errorf("can't determine if IP is local, assuming not: %v", err) } else if local { lp := localPort{ desc: "externalIP for " + svcName.String(), ip: externalIP, port: svcInfo.port, protocol: protocol, } if proxier.portsMap[lp] != nil { newLocalPorts[lp] = proxier.portsMap[lp] } else { socket, err := openLocalPort(&lp) if err != nil { glog.Errorf("can't open %s, skipping this externalIP: %v", lp.String(), err) continue } newLocalPorts[lp] = socket } } // We're holding the port, so it's OK to install iptables rules. args := []string{ "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s external IP\"", svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", externalIP), "--dport", fmt.Sprintf("%d", svcInfo.port), } // We have to SNAT packets to external IPs. writeLine(rulesLines, append(args, "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container) // nor from a local process to be forwarded to the service. // This rule roughly translates to "all traffic from off-machine". // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later. externalTrafficOnlyArgs := append(args, "-m", "physdev", "!", "--physdev-is-in", "-m", "addrtype", "!", "--src-type", "LOCAL") writeLine(rulesLines, append(externalTrafficOnlyArgs, "-j", string(svcChain))...) dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL") // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local. // This covers cases like GCE load-balancers which get added to the local routing table. writeLine(rulesLines, append(dstLocalOnlyArgs, "-j", string(svcChain))...) } // Capture load-balancer ingress. for _, ingress := range svcInfo.loadBalancerStatus.Ingress { if ingress.IP != "" { args := []string{ "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s loadbalancer IP\"", svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", ingress.IP), "--dport", fmt.Sprintf("%d", svcInfo.port), } // We have to SNAT packets from external IPs. writeLine(rulesLines, append(args, "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) writeLine(rulesLines, append(args, "-j", string(svcChain))...) } } // Capture nodeports. If we had more than 2 rules it might be // worthwhile to make a new per-service chain for nodeport rules, but // with just 2 rules it ends up being a waste and a cognitive burden. if svcInfo.nodePort != 0 { // Hold the local port open so no other process can open it // (because the socket might open but it would never work). lp := localPort{ desc: "nodePort for " + svcName.String(), ip: "", port: svcInfo.nodePort, protocol: protocol, } if proxier.portsMap[lp] != nil { newLocalPorts[lp] = proxier.portsMap[lp] } else { socket, err := openLocalPort(&lp) if err != nil { glog.Errorf("can't open %s, skipping this nodePort: %v", lp.String(), err) continue } newLocalPorts[lp] = socket } // We're holding the port, so it's OK to install iptables rules. // Nodeports need SNAT. writeLine(rulesLines, "-A", string(iptablesNodePortsChain), "-m", "comment", "--comment", svcName.String(), "-m", protocol, "-p", protocol, "--dport", fmt.Sprintf("%d", svcInfo.nodePort), "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark)) // Jump to the service chain. writeLine(rulesLines, "-A", string(iptablesNodePortsChain), "-m", "comment", "--comment", svcName.String(), "-m", protocol, "-p", protocol, "--dport", fmt.Sprintf("%d", svcInfo.nodePort), "-j", string(svcChain)) } // Generate the per-endpoint chains. We do this in multiple passes so we // can group rules together. endpoints := make([]string, 0) endpointChains := make([]utiliptables.Chain, 0) for _, ep := range proxier.endpointsMap[svcName] { endpoints = append(endpoints, ep) endpointChain := servicePortEndpointChainName(svcName, protocol, ep) endpointChains = append(endpointChains, endpointChain) // Create the endpoint chain, retaining counters if possible. if chain, ok := existingChains[utiliptables.Chain(endpointChain)]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(endpointChain)) } activeChains[endpointChain] = true } // First write session affinity rules, if applicable. if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP { for _, endpointChain := range endpointChains { writeLine(rulesLines, "-A", string(svcChain), "-m", "comment", "--comment", svcName.String(), "-m", "recent", "--name", string(endpointChain), "--rcheck", "--seconds", fmt.Sprintf("%d", svcInfo.stickyMaxAgeSeconds), "--reap", "-j", string(endpointChain)) } } // Now write loadbalancing & DNAT rules. n := len(endpointChains) for i, endpointChain := range endpointChains { // Balancing rules in the per-service chain. args := []string{ "-A", string(svcChain), "-m", "comment", "--comment", svcName.String(), } if i < (n - 1) { // Each rule is a probabilistic match. args = append(args, "-m", "statistic", "--mode", "random", "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i))) } // The final (or only if n == 1) rule is a guaranteed match. args = append(args, "-j", string(endpointChain)) writeLine(rulesLines, args...) // Rules in the per-endpoint chain. args = []string{ "-A", string(endpointChain), "-m", "comment", "--comment", svcName.String(), } // Handle traffic that loops back to the originator with SNAT. // Technically we only need to do this if the endpoint is on this // host, but we don't have that information, so we just do this for // all endpoints. // TODO: if we grow logic to get this node's pod CIDR, we can use it. writeLine(rulesLines, append(args, "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]), "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) // Update client-affinity lists. if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP { args = append(args, "-m", "recent", "--name", string(endpointChain), "--set") } // DNAT to final destination. args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i]) writeLine(rulesLines, args...) } } // Delete chains no longer in use. for chain := range existingChains { if !activeChains[chain] { chainString := string(chain) if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") { // Ignore chains that aren't ours. continue } // We must (as per iptables) write a chain-line for it, which has // the nice effect of flushing the chain. Then we can remove the // chain. writeLine(chainsLines, existingChains[chain]) writeLine(rulesLines, "-X", chainString) } } // Finally, tail-call to the nodeports chain. This needs to be after all // other service portal rules. writeLine(rulesLines, "-A", string(iptablesServicesChain), "-m", "comment", "--comment", "\"kubernetes service nodeports; NOTE: this must be the last rule in this chain\"", "-m", "addrtype", "--dst-type", "LOCAL", "-j", string(iptablesNodePortsChain)) // Write the end-of-table marker. writeLine(rulesLines, "COMMIT") // Sync rules. // NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table. lines := append(chainsLines.Bytes(), rulesLines.Bytes()...) glog.V(3).Infof("Syncing rules: %s", lines) err = proxier.iptables.Restore(utiliptables.TableNAT, lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters) if err != nil { glog.Errorf("Failed to sync iptables rules: %v", err) // Revert new local ports. for k, v := range newLocalPorts { glog.Errorf("Closing local port %s", k.String()) v.Close() } } else { // Close old local ports and save new ones. for k, v := range proxier.portsMap { if newLocalPorts[k] == nil { v.Close() } } proxier.portsMap = newLocalPorts } }
func TestgetChainLinesMultipleTables(t *testing.T) { iptables_save := `# Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015 *nat :PREROUTING ACCEPT [2:138] :INPUT ACCEPT [0:0] :OUTPUT ACCEPT [0:0] :POSTROUTING ACCEPT [0:0] :DOCKER - [0:0] :KUBE-NODEPORT-CONTAINER - [0:0] :KUBE-NODEPORT-HOST - [0:0] :KUBE-PORTALS-CONTAINER - [0:0] :KUBE-PORTALS-HOST - [0:0] :KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U - [0:0] :KUBE-SVC-PknUqKuv-LNZiCKRqGm - [0:0] :KUBE-SVC-RWEx6uDf8yWGww1OQ8E - [0:0] :KUBE-SVC-UvIpe7oTYVlacW1-G4C - [0:0] :KUBE-SVC-g_TrwxBdTXDbEtecmNo - [0:0] :KUBE-SVC-gvTMDzeC8lcXUan15iP - [0:0] -A PREROUTING -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-CONTAINER -A PREROUTING -m addrtype --dst-type LOCAL -j DOCKER -A PREROUTING -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-CONTAINER -A OUTPUT -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-HOST -A OUTPUT ! -d 127.0.0.0/8 -m addrtype --dst-type LOCAL -j DOCKER -A OUTPUT -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-HOST -A POSTROUTING -s 10.246.1.0/24 ! -o cbr0 -j MASQUERADE -A POSTROUTING -s 10.0.2.15/32 -d 10.0.2.15/32 -m comment --comment "handle pod connecting to self" -j MASQUERADE -A KUBE-PORTALS-CONTAINER -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-g_TrwxBdTXDbEtecmNo -A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-gvTMDzeC8lcXUan15iP -A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-PknUqKuv-LNZiCKRqGm -A KUBE-PORTALS-HOST -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-g_TrwxBdTXDbEtecmNo -A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-gvTMDzeC8lcXUan15iP -A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-PknUqKuv-LNZiCKRqGm -A KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U -p udp -m comment --comment "kube-system/kube-dns:dns" -m recent --set --name KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53 -A KUBE-SVC-PknUqKuv-LNZiCKRqGm -m comment --comment "kube-system/kube-dns:dns-tcp" -j KUBE-SVC-RWEx6uDf8yWGww1OQ8E -A KUBE-SVC-RWEx6uDf8yWGww1OQ8E -p tcp -m comment --comment "kube-system/kube-dns:dns-tcp" -m recent --set --name KUBE-SVC-RWEx6uDf8yWGww1OQ8E --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53 -A KUBE-SVC-UvIpe7oTYVlacW1-G4C -p tcp -m comment --comment "default/kubernetes:" -m recent --set --name KUBE-SVC-UvIpe7oTYVlacW1-G4C --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.245.1.2:443 -A KUBE-SVC-g_TrwxBdTXDbEtecmNo -m comment --comment "default/kubernetes:" -j KUBE-SVC-UvIpe7oTYVlacW1-G4C -A KUBE-SVC-gvTMDzeC8lcXUan15iP -m comment --comment "kube-system/kube-dns:dns" -j KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U COMMIT # Completed on Fri Aug 7 14:47:37 2015 # Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015 *filter :INPUT ACCEPT [17514:83115836] :FORWARD ACCEPT [0:0] :OUTPUT ACCEPT [8909:688225] :DOCKER - [0:0] -A FORWARD -o cbr0 -j DOCKER -A FORWARD -o cbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT -A FORWARD -i cbr0 ! -o cbr0 -j ACCEPT -A FORWARD -i cbr0 -o cbr0 -j ACCEPT COMMIT ` expected := map[utiliptables.Chain]string{ utiliptables.ChainPrerouting: ":PREROUTING ACCEPT [2:138]", utiliptables.Chain("INPUT"): ":INPUT ACCEPT [0:0]", utiliptables.Chain("OUTPUT"): ":OUTPUT ACCEPT [0:0]", utiliptables.ChainPostrouting: ":POSTROUTING ACCEPT [0:0]", utiliptables.Chain("DOCKER"): ":DOCKER - [0:0]", utiliptables.Chain("KUBE-NODEPORT-CONTAINER"): ":KUBE-NODEPORT-CONTAINER - [0:0]", utiliptables.Chain("KUBE-NODEPORT-HOST"): ":KUBE-NODEPORT-HOST - [0:0]", utiliptables.Chain("KUBE-PORTALS-CONTAINER"): ":KUBE-PORTALS-CONTAINER - [0:0]", utiliptables.Chain("KUBE-PORTALS-HOST"): ":KUBE-PORTALS-HOST - [0:0]", utiliptables.Chain("KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U"): ":KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U - [0:0]", utiliptables.Chain("KUBE-SVC-PknUqKuv-LNZiCKRqGm"): ":KUBE-SVC-PknUqKuv-LNZiCKRqGm - [0:0]", utiliptables.Chain("KUBE-SVC-RWEx6uDf8yWGww1OQ8E"): ":KUBE-SVC-RWEx6uDf8yWGww1OQ8E - [0:0]", utiliptables.Chain("KUBE-SVC-UvIpe7oTYVlacW1-G4C"): ":KUBE-SVC-UvIpe7oTYVlacW1-G4C - [0:0]", utiliptables.Chain("KUBE-SVC-g_TrwxBdTXDbEtecmNo"): ":KUBE-SVC-g_TrwxBdTXDbEtecmNo - [0:0]", utiliptables.Chain("KUBE-SVC-gvTMDzeC8lcXUan15iP"): ":KUBE-SVC-gvTMDzeC8lcXUan15iP - [0:0]", } checkAllLines(t, utiliptables.TableNAT, []byte(iptables_save), expected) }