func (f *fakeIptables) restore(restoreTableName utiliptables.Table, data []byte, flush utiliptables.FlushFlag) error {
	buf := bytes.NewBuffer(data)
	var tableName utiliptables.Table
	for {
		line, err := buf.ReadString('\n')
		if err != nil {
			break
		}
		if line[0] == '#' {
			continue
		}

		line = strings.TrimSuffix(line, "\n")
		if strings.HasPrefix(line, "*") {
			tableName = utiliptables.Table(line[1:])
		}
		if tableName != "" {
			if restoreTableName != "" && restoreTableName != tableName {
				continue
			}
			if strings.HasPrefix(line, ":") {
				chainName := utiliptables.Chain(strings.Split(line[1:], " ")[0])
				if flush == utiliptables.FlushTables {
					table, chain, _ := f.getChain(tableName, chainName)
					if chain != nil {
						delete(table.chains, string(chainName))
					}
				}
				_, _ = f.ensureChain(tableName, chainName)
			} else if strings.HasPrefix(line, "-A") {
				parts := strings.Split(line, " ")
				if len(parts) < 3 {
					return fmt.Errorf("Invalid iptables rule '%s'", line)
				}
				chainName := utiliptables.Chain(parts[1])
				rule := strings.TrimPrefix(line, fmt.Sprintf("-A %s ", chainName))
				_, err := f.ensureRule(utiliptables.Append, tableName, chainName, rule)
				if err != nil {
					return err
				}
			} else if strings.HasPrefix(line, "-X") {
				parts := strings.Split(line, " ")
				if len(parts) < 3 {
					return fmt.Errorf("Invalid iptables rule '%s'", line)
				}
				if err := f.DeleteChain(tableName, utiliptables.Chain(parts[1])); err != nil {
					return err
				}
			} else if line == "COMMIT" {
				if restoreTableName == tableName {
					return nil
				}
				tableName = ""
			}
		}
	}

	return nil
}
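// The backing types that restore manipulates (getChain, ensureChain, ensureRule,
// DeleteChain, table.chains) are not part of this excerpt. A minimal sketch of the
// in-memory model it assumes -- hypothetical field layout, the real test double may
// differ -- is: a fake iptables holds tables by name, each table holds chains by
// name, and each chain keeps its rules as raw strings.
type fakeChain struct {
	name  utiliptables.Chain
	rules []string
}

type fakeTable struct {
	name   utiliptables.Table
	chains map[string]*fakeChain
}

type fakeIptables struct {
	tables map[string]*fakeTable
}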
// getChainLines parses a table's iptables-save data to find chains in the table.
// It returns a map of iptables.Chain to string where the string is the chain line
// from the save (with counters etc).
func getChainLines(table utiliptables.Table, save []byte) map[utiliptables.Chain]string {
	// get lines
	lines := strings.Split(string(save), "\n")
	chainsMap := make(map[utiliptables.Chain]string)
	tablePrefix := "*" + string(table)
	lineNum := 0
	// find beginning of table
	for ; lineNum < len(lines); lineNum++ {
		if strings.HasPrefix(strings.TrimSpace(lines[lineNum]), tablePrefix) {
			lineNum++
			break
		}
	}
	// parse table lines
	for ; lineNum < len(lines); lineNum++ {
		line := strings.TrimSpace(lines[lineNum])
		if strings.HasPrefix(line, "COMMIT") || strings.HasPrefix(line, "*") {
			break
		} else if len(line) == 0 || strings.HasPrefix(line, "#") {
			continue
		} else if strings.HasPrefix(line, ":") && len(line) > 1 {
			chain := utiliptables.Chain(strings.SplitN(line[1:], " ", 2)[0])
			chainsMap[chain] = lines[lineNum]
		}
	}
	return chainsMap
}
// getChainLines parses a table's iptables-save data to find chains in the table.
// It returns a map of iptables.Chain to string where the string is the chain line
// from the save (with counters etc).
func getChainLines(table utiliptables.Table, save []byte) map[utiliptables.Chain]string {
	chainsMap := make(map[utiliptables.Chain]string)
	tablePrefix := "*" + string(table)
	readIndex := 0
	// find beginning of table
	for readIndex < len(save) {
		line, n := readLine(readIndex, save)
		readIndex = n
		if strings.HasPrefix(line, tablePrefix) {
			break
		}
	}
	// parse table lines
	for readIndex < len(save) {
		line, n := readLine(readIndex, save)
		readIndex = n
		if len(line) == 0 {
			continue
		}
		if strings.HasPrefix(line, "COMMIT") || strings.HasPrefix(line, "*") {
			break
		} else if strings.HasPrefix(line, "#") {
			continue
		} else if strings.HasPrefix(line, ":") && len(line) > 1 {
			chain := utiliptables.Chain(strings.SplitN(line[1:], " ", 2)[0])
			chainsMap[chain] = line
		}
	}
	return chainsMap
}
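// readLine is referenced above but not included in this excerpt. A minimal sketch
// consistent with how it is called -- return the next whitespace-trimmed line
// starting at readIndex, plus the offset just past that line -- could look like
// the following; the real implementation may differ.
func readLine(readIndex int, data []byte) (string, int) {
	end := readIndex
	// scan forward to the end of the current line
	for end < len(data) && data[end] != '\n' {
		end++
	}
	line := strings.TrimSpace(string(data[readIndex:end]))
	if end < len(data) {
		end++ // step past the newline itself
	}
	return line, end
}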
// Start starts a keepalived process in foreground.
// In case of any error it will terminate the execution with a fatal error.
func (k *keepalived) Start() {
	ae, err := k.ipt.EnsureChain(iptables.TableFilter, iptables.Chain(iptablesChain))
	if err != nil {
		glog.Fatalf("unexpected error: %v", err)
	}
	if ae {
		glog.V(2).Infof("chain %v already existed", iptablesChain)
	}

	k.cmd = exec.Command("keepalived", "--dont-fork", "--log-console", "--release-vips", "--pid", "/keepalived.pid")
	k.cmd.Stdout = os.Stdout
	k.cmd.Stderr = os.Stderr

	k.started = true

	if err := k.cmd.Start(); err != nil {
		glog.Errorf("keepalived error: %v", err)
	}

	if err := k.cmd.Wait(); err != nil {
		glog.Fatalf("keepalived error: %v", err)
	}
}
// Start starts a keepalived process in foreground.
// In case of any error it will terminate the execution with a fatal error.
func (k *keepalived) Start() {
	ae, err := k.ipt.EnsureChain(iptables.TableFilter, iptables.Chain(iptablesChain))
	if err != nil {
		glog.Fatalf("unexpected error: %v", err)
	}
	if ae {
		glog.V(2).Infof("chain %v already existed", iptablesChain)
	}

	k.cmd = exec.Command("keepalived", "--dont-fork", "--log-console", "--release-vips", "--pid", "/keepalived.pid")
	k.cmd.Stdout = os.Stdout
	k.cmd.Stderr = os.Stderr

	k.started = true

	// in case the pod is terminated we need to make sure the vips are removed
	c := make(chan os.Signal, 2)
	signal.Notify(c, syscall.SIGTERM)
	go func() {
		for range c {
			glog.Warning("TERM signal received. removing vips")
			for _, vip := range k.vips {
				k.removeVIP(vip)
			}

			err := k.ipt.FlushChain(iptables.TableFilter, iptables.Chain(iptablesChain))
			if err != nil {
				glog.V(2).Infof("unexpected error flushing iptables chain %v: %v", iptablesChain, err)
			}
		}
	}()

	if err := k.cmd.Start(); err != nil {
		glog.Errorf("keepalived error: %v", err)
	}

	if err := k.cmd.Wait(); err != nil {
		glog.Fatalf("keepalived error: %v", err)
	}
}
// Stop stops the keepalived process.
func (k *keepalived) Stop() {
	for _, vip := range k.vips {
		k.removeVIP(vip)
	}

	err := k.ipt.FlushChain(iptables.TableFilter, iptables.Chain(iptablesChain))
	if err != nil {
		glog.V(2).Infof("unexpected error flushing iptables chain %v: %v", iptablesChain, err)
	}

	if err := syscall.Kill(k.cmd.Process.Pid, syscall.SIGTERM); err != nil {
		glog.Errorf("error stopping keepalived: %v", err)
	}
}
func SetupIptables(ipt iptables.Interface, clusterNetworkCIDR string) error {
	rules := []FirewallRule{
		{"nat", "POSTROUTING", []string{"-s", clusterNetworkCIDR, "!", "-d", clusterNetworkCIDR, "-j", "MASQUERADE"}},
		{"filter", "INPUT", []string{"-p", "udp", "-m", "multiport", "--dports", "4789", "-m", "comment", "--comment", "001 vxlan incoming", "-j", "ACCEPT"}},
		{"filter", "INPUT", []string{"-i", "tun0", "-m", "comment", "--comment", "traffic from docker for internet", "-j", "ACCEPT"}},
		{"filter", "FORWARD", []string{"-d", clusterNetworkCIDR, "-j", "ACCEPT"}},
		{"filter", "FORWARD", []string{"-s", clusterNetworkCIDR, "-j", "ACCEPT"}},
	}

	for _, rule := range rules {
		_, err := ipt.EnsureRule(iptables.Prepend, iptables.Table(rule.table), iptables.Chain(rule.chain), rule.args...)
		if err != nil {
			return err
		}
	}

	return nil
}
// syncIPTableRules syncs the cluster network CIDR iptables rules.
// Called from SyncLoop() or firewalld reload().
func (n *NodeIPTables) syncIPTableRules() error {
	n.mu.Lock()
	defer n.mu.Unlock()

	start := time.Now()
	defer func() {
		glog.V(4).Infof("syncIPTableRules took %v", time.Since(start))
	}()
	glog.V(3).Infof("Syncing openshift iptables rules")

	rules := n.getStaticNodeIPTablesRules()
	for _, rule := range rules {
		_, err := n.ipt.EnsureRule(iptables.Prepend, iptables.Table(rule.table), iptables.Chain(rule.chain), rule.args...)
		if err != nil {
			return fmt.Errorf("failed to ensure rule %v exists: %v", rule, err)
		}
	}
	return nil
}
// This is where all of the iptables-save/restore calls happen.
// The only other iptables rules are those that are setup in iptablesInit()
// assumes proxier.mu is held
func (proxier *Proxier) syncProxyRules() error {
	// don't sync rules till we've received services and endpoints
	if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate {
		glog.V(2).Info("not syncing iptables until Services and Endpoints have been received from master")
		return nil
	}
	glog.V(4).Infof("Syncing iptables rules.")

	// ensure main chain and rules connecting it to OUTPUT and PREROUTING
	args := []string{"-j", string(iptablesServicesChain)}
	if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesServicesChain); err != nil {
		return err
	}
	if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainOutput, args...); err != nil {
		return err
	}
	if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPrerouting, args...); err != nil {
		return err
	}

	// Get iptables-save output so we can check for existing chains and rules.
	// This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore
	existingChains := make(map[utiliptables.Chain]string)
	// run iptables-save
	iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableNAT)
	if err != nil {
		// if we failed to get any rules
		glog.Errorf("Failed to execute iptables-save, syncing all rules. %s", err.Error())
	} else {
		// otherwise parse the output
		existingChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw)
	}

	// for first line and chains
	var chainsLines bytes.Buffer
	// for the actual rules (which should be after the list of chains)
	var rulesLines bytes.Buffer

	// write table header
	chainsLines.WriteString("*nat\n")

	if chain, ok := existingChains[iptablesServicesChain]; ok {
		chainsLines.WriteString(fmt.Sprintf("%s\n", chain))
	} else {
		chainsLines.WriteString(makeChainLine(iptablesServicesChain))
	}

	newHostChains := []utiliptables.Chain{}
	newServiceChains := []utiliptables.Chain{}

	// Build rules for services
	for name, info := range proxier.serviceMap {
		protocol := strings.ToLower((string)(info.portal.protocol))
		// get chain name
		svcChain := servicePortToServiceChain(name)
		// Create chain
		if chain, ok := existingChains[svcChain]; ok {
			chainsLines.WriteString(fmt.Sprintf("%s\n", chain))
		} else {
			chainsLines.WriteString(makeChainLine(svcChain))
		}
		// get hosts and host-chains
		hosts := make([]string, 0)
		hostChains := make([]utiliptables.Chain, 0)
		for _, ep := range info.endpoints {
			hosts = append(hosts, ep)
			hostChains = append(hostChains, servicePortAndEndpointToServiceChain(name, ep))
		}

		// Ensure we know what chains to flush/remove next time we generate the rules
		newHostChains = append(newHostChains, hostChains...)
		newServiceChains = append(newServiceChains, svcChain)

		// write chain and sticky session rule
		for _, hostChain := range hostChains {
			// Create chain
			if chain, ok := existingChains[utiliptables.Chain(hostChain)]; ok {
				chainsLines.WriteString(fmt.Sprintf("%s\n", chain))
			} else {
				chainsLines.WriteString(makeChainLine(hostChain))
			}

			// Sticky session
			if info.sessionAffinityType == api.ServiceAffinityClientIP {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"%s\" -m recent --name %s --rcheck --seconds %d --reap -j %s\n", svcChain, name.String(), hostChain, info.stickyMaxAgeMinutes*60, hostChain))
			}
		}

		// write proxy/loadbalancing rules
		n := len(hostChains)
		for i, hostChain := range hostChains {
			// Roughly round robin statistically if we have more than one host
			if i < (n - 1) {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"%s\" -m statistic --mode random --probability %f -j %s\n", svcChain, name.String(), 1.0/float64(n-i), hostChain))
			} else {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"%s\" -j %s\n", svcChain, name.String(), hostChain))
			}
			// proxy
			if info.sessionAffinityType == api.ServiceAffinityClientIP {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"%s\" -m recent --name %s --set -j DNAT -p %s --to-destination %s\n", hostChain, name.String(), hostChain, protocol, hosts[i]))
			} else {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"%s\" -j DNAT -p %s --to-destination %s\n", hostChain, name.String(), protocol, hosts[i]))
			}
		}

		// proxy
		rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"portal for %s\" -d %s/32 -m state --state NEW -p %s -m %s --dport %d -j %s\n", iptablesServicesChain, name.String(), info.portal.ip.String(), protocol, protocol, info.portal.port, svcChain))

		for _, publicIP := range info.deprecatedPublicIPs {
			rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"deprecated-PublicIP portal for %s\" -d %s/32 -m state --state NEW -p %s -m %s --dport %d -j %s\n", iptablesServicesChain, name.String(), publicIP, protocol, protocol, info.portal.port, svcChain))
		}

		for _, ingress := range info.loadBalancerStatus.Ingress {
			if ingress.IP != "" {
				rulesLines.WriteString(fmt.Sprintf("-A %s -m comment --comment \"load-balancer portal for %s\" -d %s/32 -m state --state NEW -p %s -m %s --dport %d -j %s\n", iptablesServicesChain, name.String(), ingress.IP, protocol, protocol, info.portal.port, svcChain))
			}
		}
	}

	// Delete chains no longer in use:
	activeChains := make(map[utiliptables.Chain]bool) // use a map as a set
	for _, chain := range newHostChains {
		activeChains[chain] = true
	}
	for _, chain := range newServiceChains {
		activeChains[chain] = true
	}
	for chain := range existingChains {
		if !activeChains[chain] {
			chainString := string(chain)
			// Ignore chains that aren't ours.
			if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") {
				continue
			}
			rulesLines.WriteString(fmt.Sprintf("-F %s\n-X %s\n", chain, chain))
		}
	}

	// write end of table
	rulesLines.WriteString("COMMIT\n")

	// combine parts
	lines := append(chainsLines.Bytes(), rulesLines.Bytes()...)

	// sync rules and return error
	glog.V(3).Infof("Syncing rules: %s", lines)
	// NOTE: flush=false is used so we don't flush non-kubernetes chains in the table.
	err = proxier.iptables.Restore(utiliptables.TableNAT, lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters)
	return err
}
func TestGetChainLinesMultipleTables(t *testing.T) {
	iptables_save := `# Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015
*nat
:PREROUTING ACCEPT [2:138]
:INPUT ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
:DOCKER - [0:0]
:KUBE-NODEPORT-CONTAINER - [0:0]
:KUBE-NODEPORT-HOST - [0:0]
:KUBE-PORTALS-CONTAINER - [0:0]
:KUBE-PORTALS-HOST - [0:0]
:KUBE-SVC-1111111111111111 - [0:0]
:KUBE-SVC-2222222222222222 - [0:0]
:KUBE-SVC-3333333333333333 - [0:0]
:KUBE-SVC-4444444444444444 - [0:0]
:KUBE-SVC-5555555555555555 - [0:0]
:KUBE-SVC-6666666666666666 - [0:0]
-A PREROUTING -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-CONTAINER
-A PREROUTING -m addrtype --dst-type LOCAL -j DOCKER
-A PREROUTING -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-CONTAINER
-A OUTPUT -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-HOST
-A OUTPUT ! -d 127.0.0.0/8 -m addrtype --dst-type LOCAL -j DOCKER
-A OUTPUT -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-HOST
-A POSTROUTING -s 10.246.1.0/24 ! -o cbr0 -j MASQUERADE
-A POSTROUTING -s 10.0.2.15/32 -d 10.0.2.15/32 -m comment --comment "handle pod connecting to self" -j MASQUERADE
-A KUBE-PORTALS-CONTAINER -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-5555555555555555
-A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-6666666666666666
-A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-2222222222222222
-A KUBE-PORTALS-HOST -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-5555555555555555
-A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-6666666666666666
-A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-2222222222222222
-A KUBE-SVC-1111111111111111 -p udp -m comment --comment "kube-system/kube-dns:dns" -m recent --set --name KUBE-SVC-1111111111111111 --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53
-A KUBE-SVC-2222222222222222 -m comment --comment "kube-system/kube-dns:dns-tcp" -j KUBE-SVC-3333333333333333
-A KUBE-SVC-3333333333333333 -p tcp -m comment --comment "kube-system/kube-dns:dns-tcp" -m recent --set --name KUBE-SVC-3333333333333333 --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53
-A KUBE-SVC-4444444444444444 -p tcp -m comment --comment "default/kubernetes:" -m recent --set --name KUBE-SVC-4444444444444444 --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.245.1.2:443
-A KUBE-SVC-5555555555555555 -m comment --comment "default/kubernetes:" -j KUBE-SVC-4444444444444444
-A KUBE-SVC-6666666666666666 -m comment --comment "kube-system/kube-dns:dns" -j KUBE-SVC-1111111111111111
COMMIT
# Completed on Fri Aug 7 14:47:37 2015
# Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015
*filter
:INPUT ACCEPT [17514:83115836]
:FORWARD ACCEPT [0:0]
:OUTPUT ACCEPT [8909:688225]
:DOCKER - [0:0]
-A FORWARD -o cbr0 -j DOCKER
-A FORWARD -o cbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
-A FORWARD -i cbr0 ! -o cbr0 -j ACCEPT
-A FORWARD -i cbr0 -o cbr0 -j ACCEPT
COMMIT
`
	expected := map[utiliptables.Chain]string{
		utiliptables.ChainPrerouting:                    ":PREROUTING ACCEPT [2:138]",
		utiliptables.Chain("INPUT"):                     ":INPUT ACCEPT [0:0]",
		utiliptables.Chain("OUTPUT"):                    ":OUTPUT ACCEPT [0:0]",
		utiliptables.ChainPostrouting:                   ":POSTROUTING ACCEPT [0:0]",
		utiliptables.Chain("DOCKER"):                    ":DOCKER - [0:0]",
		utiliptables.Chain("KUBE-NODEPORT-CONTAINER"):   ":KUBE-NODEPORT-CONTAINER - [0:0]",
		utiliptables.Chain("KUBE-NODEPORT-HOST"):        ":KUBE-NODEPORT-HOST - [0:0]",
		utiliptables.Chain("KUBE-PORTALS-CONTAINER"):    ":KUBE-PORTALS-CONTAINER - [0:0]",
		utiliptables.Chain("KUBE-PORTALS-HOST"):         ":KUBE-PORTALS-HOST - [0:0]",
		utiliptables.Chain("KUBE-SVC-1111111111111111"): ":KUBE-SVC-1111111111111111 - [0:0]",
		utiliptables.Chain("KUBE-SVC-2222222222222222"): ":KUBE-SVC-2222222222222222 - [0:0]",
		utiliptables.Chain("KUBE-SVC-3333333333333333"): ":KUBE-SVC-3333333333333333 - [0:0]",
		utiliptables.Chain("KUBE-SVC-4444444444444444"): ":KUBE-SVC-4444444444444444 - [0:0]",
		utiliptables.Chain("KUBE-SVC-5555555555555555"): ":KUBE-SVC-5555555555555555 - [0:0]",
		utiliptables.Chain("KUBE-SVC-6666666666666666"): ":KUBE-SVC-6666666666666666 - [0:0]",
	}
	checkAllLines(t, utiliptables.TableNAT, []byte(iptables_save), expected)
}
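// checkAllLines is not included in this excerpt. A plausible sketch of the test
// helper, assuming it simply compares getChainLines output against the expected
// map, is shown below; the real helper may report differences differently.
func checkAllLines(t *testing.T, table utiliptables.Table, save []byte, expectedLines map[utiliptables.Chain]string) {
	chainLines := getChainLines(table, save)
	for chain, line := range chainLines {
		expected, exists := expectedLines[chain]
		if !exists {
			t.Errorf("unexpected chain %q in table %q", chain, table)
			continue
		}
		if expected != line {
			t.Errorf("chain %q: expected %q, got %q", chain, expected, line)
		}
	}
	if len(chainLines) != len(expectedLines) {
		t.Errorf("expected %d chains, got %d", len(expectedLines), len(chainLines))
	}
}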
// This is the same as servicePortChainName but with the endpoint included.
func servicePortEndpointChainName(s proxy.ServicePortName, protocol string, endpoint string) utiliptables.Chain {
	hash := sha256.Sum256([]byte(s.String() + protocol + endpoint))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	return utiliptables.Chain("KUBE-SEP-" + encoded[:16])
}
// This is where all of the iptables-save/restore calls happen. // The only other iptables rules are those that are setup in iptablesInit() // assumes proxier.mu is held func (proxier *Proxier) syncProxyRules() { start := time.Now() defer func() { glog.V(4).Infof("syncProxyRules took %v", time.Since(start)) }() // don't sync rules till we've received services and endpoints if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate { glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master") return } glog.V(3).Infof("Syncing iptables rules") // Create and link the kube services chain. { tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT} for _, table := range tablesNeedServicesChain { if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil { glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err) return } } tableChainsNeedJumpServices := []struct { table utiliptables.Table chain utiliptables.Chain }{ {utiliptables.TableFilter, utiliptables.ChainOutput}, {utiliptables.TableNAT, utiliptables.ChainOutput}, {utiliptables.TableNAT, utiliptables.ChainPrerouting}, } comment := "kubernetes service portals" args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)} for _, tc := range tableChainsNeedJumpServices { if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil { glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err) return } } } // Create and link the kube postrouting chain. { if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil { glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err) return } comment := "kubernetes postrouting rules" args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)} if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil { glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err) return } } // Get iptables-save output so we can check for existing chains and rules. // This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore existingFilterChains := make(map[utiliptables.Chain]string) iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableFilter) if err != nil { // if we failed to get any rules glog.Errorf("Failed to execute iptables-save, syncing all rules: %v", err) } else { // otherwise parse the output existingFilterChains = getChainLines(utiliptables.TableFilter, iptablesSaveRaw) } existingNATChains := make(map[utiliptables.Chain]string) iptablesSaveRaw, err = proxier.iptables.Save(utiliptables.TableNAT) if err != nil { // if we failed to get any rules glog.Errorf("Failed to execute iptables-save, syncing all rules: %v", err) } else { // otherwise parse the output existingNATChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw) } filterChains := bytes.NewBuffer(nil) filterRules := bytes.NewBuffer(nil) natChains := bytes.NewBuffer(nil) natRules := bytes.NewBuffer(nil) // Write table headers. 
writeLine(filterChains, "*filter") writeLine(natChains, "*nat") // Make sure we keep stats for the top-level chains, if they existed // (which most should have because we created them above). if chain, ok := existingFilterChains[kubeServicesChain]; ok { writeLine(filterChains, chain) } else { writeLine(filterChains, makeChainLine(kubeServicesChain)) } if chain, ok := existingNATChains[kubeServicesChain]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(kubeServicesChain)) } if chain, ok := existingNATChains[kubeNodePortsChain]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(kubeNodePortsChain)) } if chain, ok := existingNATChains[kubePostroutingChain]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(kubePostroutingChain)) } if chain, ok := existingNATChains[kubeMarkMasqChain]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(kubeMarkMasqChain)) } // Install the kubernetes-specific postrouting rules. We use a whole chain for // this so that it is easier to flush and change, for example if the mark // value should ever change. writeLine(natRules, []string{ "-A", string(kubePostroutingChain), "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`, "-m", "mark", "--mark", proxier.masqueradeMark, "-j", "MASQUERADE", }...) // Install the kubernetes-specific masquerade mark rule. We use a whole chain for // this so that it is easier to flush and change, for example if the mark // value should ever change. writeLine(natRules, []string{ "-A", string(kubeMarkMasqChain), "-j", "MARK", "--set-xmark", proxier.masqueradeMark, }...) // Accumulate NAT chains to keep. activeNATChains := map[utiliptables.Chain]bool{} // use a map as a set // Accumulate the set of local ports that we will be holding open once this update is complete replacementPortsMap := map[localPort]closeable{} // Build rules for each service. for svcName, svcInfo := range proxier.serviceMap { protocol := strings.ToLower(string(svcInfo.protocol)) // Create the per-service chain, retaining counters if possible. svcChain := servicePortChainName(svcName, protocol) if chain, ok := existingNATChains[svcChain]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(svcChain)) } activeNATChains[svcChain] = true // Capture the clusterIP. args := []string{ "-A", string(kubeServicesChain), "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()), "--dport", fmt.Sprintf("%d", svcInfo.port), } if proxier.masqueradeAll { writeLine(natRules, append(args, "-j", string(kubeMarkMasqChain))...) } if len(proxier.clusterCIDR) > 0 { writeLine(natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(kubeMarkMasqChain))...) } writeLine(natRules, append(args, "-j", string(svcChain))...) // Capture externalIPs. for _, externalIP := range svcInfo.externalIPs { // If the "external" IP happens to be an IP that is local to this // machine, hold the local port open so no other process can open it // (because the socket might open but it would never work). 
if local, err := isLocalIP(externalIP); err != nil { glog.Errorf("can't determine if IP is local, assuming not: %v", err) } else if local { lp := localPort{ desc: "externalIP for " + svcName.String(), ip: externalIP, port: svcInfo.port, protocol: protocol, } if proxier.portsMap[lp] != nil { glog.V(4).Infof("Port %s was open before and is still needed", lp.String()) replacementPortsMap[lp] = proxier.portsMap[lp] } else { socket, err := openLocalPort(&lp) if err != nil { glog.Errorf("can't open %s, skipping this externalIP: %v", lp.String(), err) continue } replacementPortsMap[lp] = socket } } // We're holding the port, so it's OK to install iptables rules. args := []string{ "-A", string(kubeServicesChain), "-m", "comment", "--comment", fmt.Sprintf(`"%s external IP"`, svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", externalIP), "--dport", fmt.Sprintf("%d", svcInfo.port), } // We have to SNAT packets to external IPs. writeLine(natRules, append(args, "-j", string(kubeMarkMasqChain))...) // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container) // nor from a local process to be forwarded to the service. // This rule roughly translates to "all traffic from off-machine". // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later. externalTrafficOnlyArgs := append(args, "-m", "physdev", "!", "--physdev-is-in", "-m", "addrtype", "!", "--src-type", "LOCAL") writeLine(natRules, append(externalTrafficOnlyArgs, "-j", string(svcChain))...) dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL") // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local. // This covers cases like GCE load-balancers which get added to the local routing table. writeLine(natRules, append(dstLocalOnlyArgs, "-j", string(svcChain))...) } // Capture load-balancer ingress. for _, ingress := range svcInfo.loadBalancerStatus.Ingress { if ingress.IP != "" { args := []string{ "-A", string(kubeServicesChain), "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", ingress.IP), "--dport", fmt.Sprintf("%d", svcInfo.port), } // We have to SNAT packets from external IPs. writeLine(natRules, append(args, "-j", string(kubeMarkMasqChain))...) writeLine(natRules, append(args, "-j", string(svcChain))...) } } // Capture nodeports. If we had more than 2 rules it might be // worthwhile to make a new per-service chain for nodeport rules, but // with just 2 rules it ends up being a waste and a cognitive burden. if svcInfo.nodePort != 0 { // Hold the local port open so no other process can open it // (because the socket might open but it would never work). lp := localPort{ desc: "nodePort for " + svcName.String(), ip: "", port: svcInfo.nodePort, protocol: protocol, } if proxier.portsMap[lp] != nil { glog.V(4).Infof("Port %s was open before and is still needed", lp.String()) replacementPortsMap[lp] = proxier.portsMap[lp] } else { socket, err := openLocalPort(&lp) if err != nil { glog.Errorf("can't open %s, skipping this nodePort: %v", lp.String(), err) continue } replacementPortsMap[lp] = socket } // We're holding the port, so it's OK to install iptables rules. args := []string{ "-A", string(kubeNodePortsChain), "-m", "comment", "--comment", svcName.String(), "-m", protocol, "-p", protocol, "--dport", fmt.Sprintf("%d", svcInfo.nodePort), } // Nodeports need SNAT. 
writeLine(natRules, append(args, "-j", string(kubeMarkMasqChain))...) // Jump to the service chain. writeLine(natRules, append(args, "-j", string(svcChain))...) } // If the service has no endpoints then reject packets. if len(proxier.endpointsMap[svcName]) == 0 { writeLine(filterRules, "-A", string(kubeServicesChain), "-m", "comment", "--comment", fmt.Sprintf(`"%s has no endpoints"`, svcName.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()), "--dport", fmt.Sprintf("%d", svcInfo.port), "-j", "REJECT", ) continue } // Generate the per-endpoint chains. We do this in multiple passes so we // can group rules together. endpoints := make([]string, 0) endpointChains := make([]utiliptables.Chain, 0) for _, ep := range proxier.endpointsMap[svcName] { endpoints = append(endpoints, ep) endpointChain := servicePortEndpointChainName(svcName, protocol, ep) endpointChains = append(endpointChains, endpointChain) // Create the endpoint chain, retaining counters if possible. if chain, ok := existingNATChains[utiliptables.Chain(endpointChain)]; ok { writeLine(natChains, chain) } else { writeLine(natChains, makeChainLine(endpointChain)) } activeNATChains[endpointChain] = true } // First write session affinity rules, if applicable. if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP { for _, endpointChain := range endpointChains { writeLine(natRules, "-A", string(svcChain), "-m", "comment", "--comment", svcName.String(), "-m", "recent", "--name", string(endpointChain), "--rcheck", "--seconds", fmt.Sprintf("%d", svcInfo.stickyMaxAgeSeconds), "--reap", "-j", string(endpointChain)) } } // Now write loadbalancing & DNAT rules. n := len(endpointChains) for i, endpointChain := range endpointChains { // Balancing rules in the per-service chain. args := []string{ "-A", string(svcChain), "-m", "comment", "--comment", svcName.String(), } if i < (n - 1) { // Each rule is a probabilistic match. args = append(args, "-m", "statistic", "--mode", "random", "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i))) } // The final (or only if n == 1) rule is a guaranteed match. args = append(args, "-j", string(endpointChain)) writeLine(natRules, args...) // Rules in the per-endpoint chain. args = []string{ "-A", string(endpointChain), "-m", "comment", "--comment", svcName.String(), } // Handle traffic that loops back to the originator with SNAT. // Technically we only need to do this if the endpoint is on this // host, but we don't have that information, so we just do this for // all endpoints. // TODO: if we grow logic to get this node's pod CIDR, we can use it. writeLine(natRules, append(args, "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]), "-j", string(kubeMarkMasqChain))...) // Update client-affinity lists. if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP { args = append(args, "-m", "recent", "--name", string(endpointChain), "--set") } // DNAT to final destination. args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i]) writeLine(natRules, args...) } } // Delete chains no longer in use. for chain := range existingNATChains { if !activeNATChains[chain] { chainString := string(chain) if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") { // Ignore chains that aren't ours. continue } // We must (as per iptables) write a chain-line for it, which has // the nice effect of flushing the chain. Then we can remove the // chain. 
writeLine(natChains, existingNATChains[chain]) writeLine(natRules, "-X", chainString) } } // Finally, tail-call to the nodeports chain. This needs to be after all // other service portal rules. writeLine(natRules, "-A", string(kubeServicesChain), "-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`, "-m", "addrtype", "--dst-type", "LOCAL", "-j", string(kubeNodePortsChain)) // Write the end-of-table markers. writeLine(filterRules, "COMMIT") writeLine(natRules, "COMMIT") // Sync rules. // NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table. filterLines := append(filterChains.Bytes(), filterRules.Bytes()...) natLines := append(natChains.Bytes(), natRules.Bytes()...) lines := append(filterLines, natLines...) glog.V(3).Infof("Restoring iptables rules: %s", lines) err = proxier.iptables.RestoreAll(lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters) if err != nil { glog.Errorf("Failed to execute iptables-restore: %v", err) // Revert new local ports. revertPorts(replacementPortsMap, proxier.portsMap) return } // Close old local ports and save new ones. for k, v := range proxier.portsMap { if replacementPortsMap[k] == nil { v.Close() } } proxier.portsMap = replacementPortsMap // Clean up the older SNAT rule which was directly in POSTROUTING. // TODO(thockin): Remove this for v1.3 or v1.4. args := []string{ "-m", "comment", "--comment", "kubernetes service traffic requiring SNAT", "-m", "mark", "--mark", oldIptablesMasqueradeMark, "-j", "MASQUERADE", } if err := proxier.iptables.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil { if !utiliptables.IsNotFoundError(err) { glog.Errorf("Error removing old-style SNAT rule: %v", err) } } }
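// The writeLine and makeChainLine helpers used throughout these sync functions are
// not part of this excerpt. Sketches consistent with how the function above calls
// them follow: writeLine joins its arguments into one iptables-restore line, and
// makeChainLine emits a chain declaration with zeroed counters. Note that the older
// syncProxyRules variant earlier in this file writes makeChainLine's result via
// WriteString directly, so that version would need the trailing newline included in
// makeChainLine itself.
func writeLine(buf *bytes.Buffer, words ...string) {
	// Join all words with spaces, terminate with newline, and write to buf.
	buf.WriteString(strings.Join(words, " ") + "\n")
}

// makeChainLine returns an iptables-save/restore style chain declaration with
// zeroed packet/byte counters, e.g. ":KUBE-SERVICES - [0:0]".
func makeChainLine(chain utiliptables.Chain) string {
	return fmt.Sprintf(":%s - [0:0]", chain)
}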
// serviceLBChainName takes the ServicePortName for a service and
// returns the associated iptables chain. This is computed by hashing (sha256)
// then encoding to base32 and truncating with the prefix "KUBE-XLB-". We do
// this because iptables chain names must be <= 28 chars long, and the longer
// they are the harder they are to read.
func serviceLBChainName(s proxy.ServicePortName, protocol string) utiliptables.Chain {
	return utiliptables.Chain("KUBE-XLB-" + portProtoHash(s, protocol))
}
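// portProtoHash is referenced above but not defined in this excerpt. A sketch that
// follows the same pattern as the other chain-name helpers in this file (sha256
// the service port name plus protocol, base32-encode, keep a short slice) might be:
func portProtoHash(s proxy.ServicePortName, protocol string) string {
	hash := sha256.Sum256([]byte(s.String() + protocol))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	return encoded[:16]
}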
func TestgetChainLinesMultipleTables(t *testing.T) {
	iptables_save := `# Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015
*nat
:PREROUTING ACCEPT [2:138]
:INPUT ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
:DOCKER - [0:0]
:KUBE-NODEPORT-CONTAINER - [0:0]
:KUBE-NODEPORT-HOST - [0:0]
:KUBE-PORTALS-CONTAINER - [0:0]
:KUBE-PORTALS-HOST - [0:0]
:KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U - [0:0]
:KUBE-SVC-PknUqKuv-LNZiCKRqGm - [0:0]
:KUBE-SVC-RWEx6uDf8yWGww1OQ8E - [0:0]
:KUBE-SVC-UvIpe7oTYVlacW1-G4C - [0:0]
:KUBE-SVC-g_TrwxBdTXDbEtecmNo - [0:0]
:KUBE-SVC-gvTMDzeC8lcXUan15iP - [0:0]
-A PREROUTING -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-CONTAINER
-A PREROUTING -m addrtype --dst-type LOCAL -j DOCKER
-A PREROUTING -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-CONTAINER
-A OUTPUT -m comment --comment "handle ClusterIPs; NOTE: this must be before the NodePort rules" -j KUBE-PORTALS-HOST
-A OUTPUT ! -d 127.0.0.0/8 -m addrtype --dst-type LOCAL -j DOCKER
-A OUTPUT -m addrtype --dst-type LOCAL -m comment --comment "handle service NodePorts; NOTE: this must be the last rule in the chain" -j KUBE-NODEPORT-HOST
-A POSTROUTING -s 10.246.1.0/24 ! -o cbr0 -j MASQUERADE
-A POSTROUTING -s 10.0.2.15/32 -d 10.0.2.15/32 -m comment --comment "handle pod connecting to self" -j MASQUERADE
-A KUBE-PORTALS-CONTAINER -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-g_TrwxBdTXDbEtecmNo
-A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-gvTMDzeC8lcXUan15iP
-A KUBE-PORTALS-CONTAINER -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-PknUqKuv-LNZiCKRqGm
-A KUBE-PORTALS-HOST -d 10.247.0.1/32 -p tcp -m comment --comment "portal for default/kubernetes:" -m state --state NEW -m tcp --dport 443 -j KUBE-SVC-g_TrwxBdTXDbEtecmNo
-A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p udp -m comment --comment "portal for kube-system/kube-dns:dns" -m state --state NEW -m udp --dport 53 -j KUBE-SVC-gvTMDzeC8lcXUan15iP
-A KUBE-PORTALS-HOST -d 10.247.0.10/32 -p tcp -m comment --comment "portal for kube-system/kube-dns:dns-tcp" -m state --state NEW -m tcp --dport 53 -j KUBE-SVC-PknUqKuv-LNZiCKRqGm
-A KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U -p udp -m comment --comment "kube-system/kube-dns:dns" -m recent --set --name KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53
-A KUBE-SVC-PknUqKuv-LNZiCKRqGm -m comment --comment "kube-system/kube-dns:dns-tcp" -j KUBE-SVC-RWEx6uDf8yWGww1OQ8E
-A KUBE-SVC-RWEx6uDf8yWGww1OQ8E -p tcp -m comment --comment "kube-system/kube-dns:dns-tcp" -m recent --set --name KUBE-SVC-RWEx6uDf8yWGww1OQ8E --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.246.1.2:53
-A KUBE-SVC-UvIpe7oTYVlacW1-G4C -p tcp -m comment --comment "default/kubernetes:" -m recent --set --name KUBE-SVC-UvIpe7oTYVlacW1-G4C --mask 255.255.255.255 --rsource -j DNAT --to-destination 10.245.1.2:443
-A KUBE-SVC-g_TrwxBdTXDbEtecmNo -m comment --comment "default/kubernetes:" -j KUBE-SVC-UvIpe7oTYVlacW1-G4C
-A KUBE-SVC-gvTMDzeC8lcXUan15iP -m comment --comment "kube-system/kube-dns:dns" -j KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U
COMMIT
# Completed on Fri Aug 7 14:47:37 2015
# Generated by iptables-save v1.4.21 on Fri Aug 7 14:47:37 2015
*filter
:INPUT ACCEPT [17514:83115836]
:FORWARD ACCEPT [0:0]
:OUTPUT ACCEPT [8909:688225]
:DOCKER - [0:0]
-A FORWARD -o cbr0 -j DOCKER
-A FORWARD -o cbr0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
-A FORWARD -i cbr0 ! -o cbr0 -j ACCEPT
-A FORWARD -i cbr0 -o cbr0 -j ACCEPT
COMMIT
`
	expected := map[utiliptables.Chain]string{
		utiliptables.ChainPrerouting:                       ":PREROUTING ACCEPT [2:138]",
		utiliptables.Chain("INPUT"):                        ":INPUT ACCEPT [0:0]",
		utiliptables.Chain("OUTPUT"):                       ":OUTPUT ACCEPT [0:0]",
		utiliptables.ChainPostrouting:                      ":POSTROUTING ACCEPT [0:0]",
		utiliptables.Chain("DOCKER"):                       ":DOCKER - [0:0]",
		utiliptables.Chain("KUBE-NODEPORT-CONTAINER"):      ":KUBE-NODEPORT-CONTAINER - [0:0]",
		utiliptables.Chain("KUBE-NODEPORT-HOST"):           ":KUBE-NODEPORT-HOST - [0:0]",
		utiliptables.Chain("KUBE-PORTALS-CONTAINER"):       ":KUBE-PORTALS-CONTAINER - [0:0]",
		utiliptables.Chain("KUBE-PORTALS-HOST"):            ":KUBE-PORTALS-HOST - [0:0]",
		utiliptables.Chain("KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U"): ":KUBE-SVC-Dgkr8H9s4LQ2mn-Py5U - [0:0]",
		utiliptables.Chain("KUBE-SVC-PknUqKuv-LNZiCKRqGm"): ":KUBE-SVC-PknUqKuv-LNZiCKRqGm - [0:0]",
		utiliptables.Chain("KUBE-SVC-RWEx6uDf8yWGww1OQ8E"): ":KUBE-SVC-RWEx6uDf8yWGww1OQ8E - [0:0]",
		utiliptables.Chain("KUBE-SVC-UvIpe7oTYVlacW1-G4C"): ":KUBE-SVC-UvIpe7oTYVlacW1-G4C - [0:0]",
		utiliptables.Chain("KUBE-SVC-g_TrwxBdTXDbEtecmNo"): ":KUBE-SVC-g_TrwxBdTXDbEtecmNo - [0:0]",
		utiliptables.Chain("KUBE-SVC-gvTMDzeC8lcXUan15iP"): ":KUBE-SVC-gvTMDzeC8lcXUan15iP - [0:0]",
	}
	checkAllLines(t, utiliptables.TableNAT, []byte(iptables_save), expected)
}
// servicePortToServiceChain takes the ServicePortName for a
// service and returns the associated iptables chain.
// This is computed by hashing (sha256) then encoding to base32 and
// truncating with the prefix "KUBE-SVC-".
// We do this because iptables chain names must be <= 28 chars long.
func servicePortToServiceChain(s proxy.ServicePortName) utiliptables.Chain {
	hash := sha256.Sum256([]byte(s.String()))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	return utiliptables.Chain("KUBE-SVC-" + encoded[:19])
}
// hostportChainName takes the ContainerPort for a pod and returns the associated
// iptables chain. This is computed by hashing (sha256), then encoding to base32
// and truncating, prefixed with kubeHostportChainPrefix. We do this because
// iptables chain names must be <= 28 chars long, and the longer they are the
// harder they are to read.
func hostportChainName(cp api.ContainerPort, podFullName string) utiliptables.Chain {
	hash := sha256.Sum256([]byte(string(cp.HostPort) + string(cp.Protocol) + podFullName))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	return utiliptables.Chain(kubeHostportChainPrefix + encoded[:16])
}
// This is the same as servicePortToServiceChain but with the endpoint included.
func servicePortAndEndpointToServiceChain(s proxy.ServicePortName, endpoint string) utiliptables.Chain {
	hash := sha256.Sum256([]byte(s.String() + "_" + endpoint))
	encoded := base32.StdEncoding.EncodeToString(hash[:])
	return utiliptables.Chain("KUBE-SEP-" + encoded[:19])
}
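// For illustration, a small test sketch showing that these helpers produce stable,
// bounded-length chain names. The ServicePortName literal and its field layout
// (embedded NamespacedName plus Port) are assumptions for this example, not part
// of the excerpt above.
func ExampleChainNames() {
	svc := proxy.ServicePortName{
		NamespacedName: types.NamespacedName{Namespace: "default", Name: "kubernetes"},
		Port:           "https",
	}
	// "KUBE-SVC-" (9 chars) plus 19 hash chars and "KUBE-SEP-" plus 19 hash chars
	// both land exactly at iptables' 28-character chain-name limit.
	svcChain := servicePortToServiceChain(svc)
	sepChain := servicePortAndEndpointToServiceChain(svc, "10.245.1.2:443")
	fmt.Println(len(svcChain) <= 28, len(sepChain) <= 28)
	// Output: true true
}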
func TestOpenPodHostports(t *testing.T) { fakeIPTables := NewFakeIPTables() h := &handler{ hostPortMap: make(map[hostport]closeable), iptables: fakeIPTables, portOpener: openFakeSocket, } tests := []struct { pod *v1.Pod ip string matches []*ruleMatch }{ // New pod that we are going to add { &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "test-pod", Namespace: v1.NamespaceDefault, }, Spec: v1.PodSpec{ Containers: []v1.Container{{ Ports: []v1.ContainerPort{{ HostPort: 4567, ContainerPort: 80, Protocol: v1.ProtocolTCP, }, { HostPort: 5678, ContainerPort: 81, Protocol: v1.ProtocolUDP, }}, }}, }, }, "10.1.1.2", []*ruleMatch{ { -1, "KUBE-HOSTPORTS", "-m comment --comment \"test-pod_default hostport 4567\" -m tcp -p tcp --dport 4567", }, { 4567, "", "-m comment --comment \"test-pod_default hostport 4567\" -s 10.1.1.2/32 -j KUBE-MARK-MASQ", }, { 4567, "", "-m comment --comment \"test-pod_default hostport 4567\" -m tcp -p tcp -j DNAT --to-destination 10.1.1.2:80", }, { -1, "KUBE-HOSTPORTS", "-m comment --comment \"test-pod_default hostport 5678\" -m udp -p udp --dport 5678", }, { 5678, "", "-m comment --comment \"test-pod_default hostport 5678\" -s 10.1.1.2/32 -j KUBE-MARK-MASQ", }, { 5678, "", "-m comment --comment \"test-pod_default hostport 5678\" -m udp -p udp -j DNAT --to-destination 10.1.1.2:81", }, }, }, // Already running pod { &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "another-test-pod", Namespace: v1.NamespaceDefault, }, Spec: v1.PodSpec{ Containers: []v1.Container{{ Ports: []v1.ContainerPort{{ HostPort: 123, ContainerPort: 654, Protocol: v1.ProtocolTCP, }}, }}, }, }, "10.1.1.5", []*ruleMatch{ { -1, "KUBE-HOSTPORTS", "-m comment --comment \"another-test-pod_default hostport 123\" -m tcp -p tcp --dport 123", }, { 123, "", "-m comment --comment \"another-test-pod_default hostport 123\" -s 10.1.1.5/32 -j KUBE-MARK-MASQ", }, { 123, "", "-m comment --comment \"another-test-pod_default hostport 123\" -m tcp -p tcp -j DNAT --to-destination 10.1.1.5:654", }, }, }, } activePods := make([]*ActivePod, 0) // Fill in any match rules missing chain names for _, test := range tests { for _, match := range test.matches { if match.hostport >= 0 { found := false for _, c := range test.pod.Spec.Containers { for _, cp := range c.Ports { if int(cp.HostPort) == match.hostport { match.chain = string(hostportChainName(cp, kubecontainer.GetPodFullName(test.pod))) found = true break } } } if !found { t.Fatalf("Failed to find ContainerPort for match %d/'%s'", match.hostport, match.match) } } } activePods = append(activePods, &ActivePod{ Pod: test.pod, IP: net.ParseIP(test.ip), }) } // Already running pod's host port hp := hostport{ tests[1].pod.Spec.Containers[0].Ports[0].HostPort, strings.ToLower(string(tests[1].pod.Spec.Containers[0].Ports[0].Protocol)), } h.hostPortMap[hp] = &fakeSocket{ tests[1].pod.Spec.Containers[0].Ports[0].HostPort, strings.ToLower(string(tests[1].pod.Spec.Containers[0].Ports[0].Protocol)), false, } err := h.OpenPodHostportsAndSync(&ActivePod{Pod: tests[0].pod, IP: net.ParseIP(tests[0].ip)}, "br0", activePods) if err != nil { t.Fatalf("Failed to OpenPodHostportsAndSync: %v", err) } // Generic rules genericRules := []*ruleMatch{ {-1, "POSTROUTING", "-m comment --comment \"SNAT for localhost access to hostports\" -o br0 -s 127.0.0.0/8 -j MASQUERADE"}, {-1, "PREROUTING", "-m comment --comment \"kube hostport portals\" -m addrtype --dst-type LOCAL -j KUBE-HOSTPORTS"}, {-1, "OUTPUT", "-m comment --comment \"kube hostport portals\" -m addrtype --dst-type LOCAL -j KUBE-HOSTPORTS"}, } for _, rule 
:= range genericRules { _, chain, err := fakeIPTables.getChain(utiliptables.TableNAT, utiliptables.Chain(rule.chain)) if err != nil { t.Fatalf("Expected NAT chain %s did not exist", rule.chain) } if !matchRule(chain, rule.match) { t.Fatalf("Expected %s chain rule match '%s' not found", rule.chain, rule.match) } } // Pod rules for _, test := range tests { for _, match := range test.matches { // Ensure chain exists _, chain, err := fakeIPTables.getChain(utiliptables.TableNAT, utiliptables.Chain(match.chain)) if err != nil { t.Fatalf("Expected NAT chain %s did not exist", match.chain) } if !matchRule(chain, match.match) { t.Fatalf("Expected NAT chain %s rule containing '%s' not found", match.chain, match.match) } } } // Socket hostPortMap := map[hostport]closeable{ hostport{123, "tcp"}: &fakeSocket{123, "tcp", false}, hostport{4567, "tcp"}: &fakeSocket{4567, "tcp", false}, hostport{5678, "udp"}: &fakeSocket{5678, "udp", false}, } if !reflect.DeepEqual(hostPortMap, h.hostPortMap) { t.Fatalf("Mismatch in expected hostPortMap. Expected '%v', got '%v'", hostPortMap, h.hostPortMap) } }
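// The ruleMatch type and matchRule helper used by TestOpenPodHostports are not
// shown in this excerpt. Plausible sketches, assuming the fake chain stores its
// rules as raw strings (see the fakeChain sketch earlier in this section), are:
// a ruleMatch pairs an expected hostport and chain with a substring that must
// appear in one of that chain's rules (hostport -1 means the chain is known up
// front), and matchRule reports whether any rule in the chain contains the
// expected substring.
type ruleMatch struct {
	hostport int
	chain    string
	match    string
}

func matchRule(chain *fakeChain, match string) bool {
	for _, rule := range chain.rules {
		if strings.Contains(rule, match) {
			return true
		}
	}
	return false
}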
// This is where all of the iptables-save/restore calls happen. // The only other iptables rules are those that are setup in iptablesInit() // assumes proxier.mu is held func (proxier *Proxier) syncProxyRules() error { // don't sync rules till we've received services and endpoints if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate { glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master") return nil } glog.V(3).Infof("Syncing iptables rules") // Ensure main chains and rules are installed. inputChains := []utiliptables.Chain{utiliptables.ChainOutput, utiliptables.ChainPrerouting} // Link the services chain. for _, chain := range inputChains { if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesServicesChain); err != nil { return err } comment := "kubernetes service portals; must be before nodeports" args := []string{"-m", "comment", "--comment", comment, "-j", string(iptablesServicesChain)} if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, chain, args...); err != nil { return err } } // Link the nodeports chain. for _, chain := range inputChains { if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesNodePortsChain); err != nil { return err } comment := "kubernetes service nodeports; must be after portals" args := []string{"-m", "comment", "--comment", comment, "-m", "addrtype", "--dst-type", "LOCAL", "-j", string(iptablesNodePortsChain)} if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, chain, args...); err != nil { return err } } // Link the output rules. { comment := "kubernetes service traffic requiring SNAT" args := []string{"-m", "comment", "--comment", comment, "-m", "mark", "--mark", iptablesMasqueradeMark, "-j", "MASQUERADE"} if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil { return err } } // Get iptables-save output so we can check for existing chains and rules. // This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore existingChains := make(map[utiliptables.Chain]string) iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableNAT) if err != nil { // if we failed to get any rules glog.Errorf("Failed to execute iptable-save, syncing all rules. %s", err.Error()) } else { // otherwise parse the output existingChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw) } chainsLines := bytes.NewBuffer(nil) rulesLines := bytes.NewBuffer(nil) // Write table header. writeLine(chainsLines, "*nat") // Make sure we keep stats for the top-level chains, if they existed // (which they should have because we created them above). if chain, ok := existingChains[iptablesServicesChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(iptablesServicesChain)) } if chain, ok := existingChains[iptablesNodePortsChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(iptablesNodePortsChain)) } // Accumulate chains to keep. activeChains := make(map[utiliptables.Chain]bool) // use a map as a set // Build rules for each service. for name, info := range proxier.serviceMap { protocol := strings.ToLower(string(info.protocol)) // Create the per-service chain, retaining counters if possible. 
svcChain := servicePortToServiceChain(name, protocol) if chain, ok := existingChains[svcChain]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(svcChain)) } activeChains[svcChain] = true // Capture the clusterIP. writeLine(rulesLines, "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s cluster IP\"", name.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", info.clusterIP.String()), "--dport", fmt.Sprintf("%d", info.port), "-j", string(svcChain)) // Capture externalIPs. for _, externalIP := range info.externalIPs { args := []string{ "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s external IP\"", name.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", externalIP), "--dport", fmt.Sprintf("%d", info.port), } // We have to SNAT packets to external IPs. writeLine(rulesLines, append(args, "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container) // nor from a local process to be forwarded to the service. // This rule roughly translates to "all traffic from off-machine". // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later. externalTrafficOnlyArgs := append(args, "-m", "physdev", "!", "--physdev-is-in", "-m", "addrtype", "!", "--src-type", "LOCAL") writeLine(rulesLines, append(externalTrafficOnlyArgs, "-j", string(svcChain))...) dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL") // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local. // This covers cases like GCE load-balancers which get added to the local routing table. writeLine(rulesLines, append(dstLocalOnlyArgs, "-j", string(svcChain))...) } // Capture load-balancer ingress. for _, ingress := range info.loadBalancerStatus.Ingress { if ingress.IP != "" { args := []string{ "-A", string(iptablesServicesChain), "-m", "comment", "--comment", fmt.Sprintf("\"%s loadbalancer IP\"", name.String()), "-m", protocol, "-p", protocol, "-d", fmt.Sprintf("%s/32", ingress.IP), "--dport", fmt.Sprintf("%d", info.port), } // We have to SNAT packets from external IPs. writeLine(rulesLines, append(args, "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) writeLine(rulesLines, append(args, "-j", string(svcChain))...) } } // Capture nodeports. If we had more than 2 rules it might be // worthwhile to make a new per-service chain for nodeport rules, but // with just 2 rules it ends up being a waste and a cognitive burden. if info.nodePort != 0 { // Nodeports need SNAT. writeLine(rulesLines, "-A", string(iptablesNodePortsChain), "-m", "comment", "--comment", name.String(), "-m", protocol, "-p", protocol, "--dport", fmt.Sprintf("%d", info.nodePort), "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark)) // Jump to the service chain. writeLine(rulesLines, "-A", string(iptablesNodePortsChain), "-m", "comment", "--comment", name.String(), "-m", protocol, "-p", protocol, "--dport", fmt.Sprintf("%d", info.nodePort), "-j", string(svcChain)) } // Generate the per-endpoint chains. We do this in multiple passes so we // can group rules together. 
endpoints := make([]string, 0) endpointChains := make([]utiliptables.Chain, 0) for _, ep := range info.endpoints { endpoints = append(endpoints, ep) endpointChain := servicePortAndEndpointToServiceChain(name, protocol, ep) endpointChains = append(endpointChains, endpointChain) // Create the endpoint chain, retaining counters if possible. if chain, ok := existingChains[utiliptables.Chain(endpointChain)]; ok { writeLine(chainsLines, chain) } else { writeLine(chainsLines, makeChainLine(endpointChain)) } activeChains[endpointChain] = true } // First write session affinity rules, if applicable. if info.sessionAffinityType == api.ServiceAffinityClientIP { for _, endpointChain := range endpointChains { writeLine(rulesLines, "-A", string(svcChain), "-m", "comment", "--comment", name.String(), "-m", "recent", "--name", string(endpointChain), "--rcheck", "--seconds", fmt.Sprintf("%d", info.stickyMaxAgeSeconds), "--reap", "-j", string(endpointChain)) } } // Now write loadbalancing & DNAT rules. n := len(endpointChains) for i, endpointChain := range endpointChains { // Balancing rules in the per-service chain. args := []string{ "-A", string(svcChain), "-m", "comment", "--comment", name.String(), } if i < (n - 1) { // Each rule is a probabilistic match. args = append(args, "-m", "statistic", "--mode", "random", "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i))) } // The final (or only if n == 1) rule is a guaranteed match. args = append(args, "-j", string(endpointChain)) writeLine(rulesLines, args...) // Rules in the per-endpoint chain. args = []string{ "-A", string(endpointChain), "-m", "comment", "--comment", name.String(), } // Handle traffic that loops back to the originator with SNAT. // Technically we only need to do this if the endpoint is on this // host, but we don't have that information, so we just do this for // all endpoints. // TODO: if we grow logic to get this node's pod CIDR, we can use it. writeLine(rulesLines, append(args, "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]), "-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...) // Update client-affinity lists. if info.sessionAffinityType == api.ServiceAffinityClientIP { args = append(args, "-m", "recent", "--name", string(endpointChain), "--set") } // DNAT to final destination. args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i]) writeLine(rulesLines, args...) } } // Delete chains no longer in use. for chain := range existingChains { if !activeChains[chain] { chainString := string(chain) if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") { // Ignore chains that aren't ours. continue } // We must (as per iptables) write a chain-line for it, which has // the nice effect of flushing the chain. Then we can remove the // chain. writeLine(chainsLines, existingChains[chain]) writeLine(rulesLines, "-X", chainString) } } // Write the end-of-table marker. writeLine(rulesLines, "COMMIT") // Sync rules. // NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table. lines := append(chainsLines.Bytes(), rulesLines.Bytes()...) glog.V(3).Infof("Syncing rules: %s", lines) return proxier.iptables.Restore(utiliptables.TableNAT, lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters) }