All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next] bonding: symmetric ICMP transmit
@ 2019-11-15 11:10 Matteo Croce
  2019-11-16 21:03 ` David Miller
  0 siblings, 1 reply; 2+ messages in thread
From: Matteo Croce @ 2019-11-15 11:10 UTC (permalink / raw)
  To: netdev
  Cc: Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, David S. Miller,
	linux-kernel

A bonding with layer2+3 or layer3+4 hashing uses the IP addresses and the ports
to balance packets between slaves. With some network errors, we receive an ICMP
error packet by the remote host or a router. If sent by a router, the source IP
can differ from the remote host one. Additionally the ICMP protocol has no port
numbers, so a layer3+4 bonding will get a different hash than the previous one.
These two conditions could let the packet go through a different interface than
the other packets of the same flow:

    # tcpdump -qltnni veth0 |sed 's/^/0: /' &
    # tcpdump -qltnni veth1 |sed 's/^/1: /' &
    # hping3 -2 192.168.0.2 -p 9
    0: IP 192.168.0.1.2251 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    1: IP 192.168.0.1.2252 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    1: IP 192.168.0.1.2253 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    0: IP 192.168.0.1.2254 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36

An ICMP error packet contains the header of the packet which caused the network
error, so inspect it and match the flow against it, so we can send the ICMP via
the same interface of the previous packet in the flow.
Move the IP and port dissect code into a generic function bond_flow_ip() and if
we are dissecting an ICMP error packet, call it again with the adjusted offset.

    # hping3 -2 192.168.0.2 -p 9
    1: IP 192.168.0.1.1224 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    1: IP 192.168.0.1.1225 > 192.168.0.2.9: UDP, length 0
    1: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    0: IP 192.168.0.1.1226 > 192.168.0.2.9: UDP, length 0
    0: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36
    0: IP 192.168.0.1.1227 > 192.168.0.2.9: UDP, length 0
    0: IP 192.168.0.2 > 192.168.0.1: ICMP 192.168.0.2 udp port 9 unreachable, length 36

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 drivers/net/bonding/bond_main.c | 83 ++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 26 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 08b2b0d855af..fcb7c2f7f001 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -41,6 +41,8 @@
 #include <linux/in.h>
 #include <net/ip.h>
 #include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/slab.h>
@@ -3297,12 +3299,42 @@ static inline u32 bond_eth_hash(struct sk_buff *skb)
 	return 0;
 }
 
+static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
+			 int *noff, int *proto, bool l34)
+{
+	const struct ipv6hdr *iph6;
+	const struct iphdr *iph;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*iph))))
+			return false;
+		iph = (const struct iphdr *)(skb->data + *noff);
+		iph_to_flow_copy_v4addrs(fk, iph);
+		*noff += iph->ihl << 2;
+		if (!ip_is_fragment(iph))
+			*proto = iph->protocol;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (unlikely(!pskb_may_pull(skb, *noff + sizeof(*iph6))))
+			return false;
+		iph6 = (const struct ipv6hdr *)(skb->data + *noff);
+		iph_to_flow_copy_v6addrs(fk, iph6);
+		*noff += sizeof(*iph6);
+		*proto = iph6->nexthdr;
+	} else {
+		return false;
+	}
+
+	if (l34 && *proto >= 0)
+		fk->ports.ports = skb_flow_get_ports(skb, *noff, *proto);
+
+	return true;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
 {
-	const struct ipv6hdr *iph6;
-	const struct iphdr *iph;
+	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
@@ -3314,31 +3346,30 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	fk->ports.ports = 0;
 	memset(&fk->icmp, 0, sizeof(fk->icmp));
 	noff = skb_network_offset(skb);
-	if (skb->protocol == htons(ETH_P_IP)) {
-		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
-			return false;
-		iph = ip_hdr(skb);
-		iph_to_flow_copy_v4addrs(fk, iph);
-		noff += iph->ihl << 2;
-		if (!ip_is_fragment(iph))
-			proto = iph->protocol;
-	} else if (skb->protocol == htons(ETH_P_IPV6)) {
-		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6))))
-			return false;
-		iph6 = ipv6_hdr(skb);
-		iph_to_flow_copy_v6addrs(fk, iph6);
-		noff += sizeof(*iph6);
-		proto = iph6->nexthdr;
-	} else {
+	if (!bond_flow_ip(skb, fk, &noff, &proto, l34))
 		return false;
-	}
-	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) {
-		if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
-			skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
-					      skb_transport_offset(skb),
-					      skb_headlen(skb));
-		else
-			fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
+
+	/* ICMP error packets contains at least 8 bytes of the header
+	 * of the packet which generated the error. Use this information
+	 * to correlate ICMP error packets within the same flow which
+	 * generated the error.
+	 */
+	if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) {
+		skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
+				      skb_transport_offset(skb),
+				      skb_headlen(skb));
+		if (proto == IPPROTO_ICMP) {
+			if (!icmp_is_err(fk->icmp.type))
+				return true;
+
+			noff += sizeof(struct icmphdr);
+		} else if (proto == IPPROTO_ICMPV6) {
+			if (!icmpv6_is_err(fk->icmp.type))
+				return true;
+
+			noff += sizeof(struct icmp6hdr);
+		}
+		return bond_flow_ip(skb, fk, &noff, &proto, l34);
 	}
 
 	return true;
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH net-next] bonding: symmetric ICMP transmit
  2019-11-15 11:10 [PATCH net-next] bonding: symmetric ICMP transmit Matteo Croce
@ 2019-11-16 21:03 ` David Miller
  0 siblings, 0 replies; 2+ messages in thread
From: David Miller @ 2019-11-16 21:03 UTC (permalink / raw)
  To: mcroce; +Cc: netdev, j.vosburgh, vfalico, andy, linux-kernel

From: Matteo Croce <mcroce@redhat.com>
Date: Fri, 15 Nov 2019 12:10:37 +0100

> A bonding with layer2+3 or layer3+4 hashing uses the IP addresses and the ports
> to balance packets between slaves. With some network errors, we receive an ICMP
> error packet by the remote host or a router. If sent by a router, the source IP
> can differ from the remote host one. Additionally the ICMP protocol has no port
> numbers, so a layer3+4 bonding will get a different hash than the previous one.
> These two conditions could let the packet go through a different interface than
> the other packets of the same flow:
 ...
> An ICMP error packet contains the header of the packet which caused the network
> error, so inspect it and match the flow against it, so we can send the ICMP via
> the same interface of the previous packet in the flow.
> Move the IP and port dissect code into a generic function bond_flow_ip() and if
> we are dissecting an ICMP error packet, call it again with the adjusted offset.
 ...
> Signed-off-by: Matteo Croce <mcroce@redhat.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-11-16 21:03 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-15 11:10 [PATCH net-next] bonding: symmetric ICMP transmit Matteo Croce
2019-11-16 21:03 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.