[PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface

* [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
@ 2020-09-09  6:26 mtk81216
  2020-09-15  7:30 ` Steffen Klassert
  0 siblings, 1 reply; 6+ messages in thread
From: mtk81216 @ 2020-09-09  6:26 UTC (permalink / raw)
  To: David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Kicinski, Steffen Klassert, Herbert Xu, Matthias Brugger
  Cc: netdev, linux-kernel, linux-arm-kernel, linux-mediatek, mtk81216

In ESP tunnel mode, if the inner interface is IPv4 and the outer is IPv4,
one big packet which travels through the tunnel will be fragmented with the
outer interface's MTU. The peer server will remove the tunnelled ESP header
and reassemble the fragments into one big packet. After such a packet is
forwarded to the next endpoint, it will either be dropped for exceeding the
MTU or an ICMP (packet-too-big) error will be returned.
When the inner interface is IPv4 and the outer is IPv6, the flag of the xfrm
state in tunnel mode is AF_UNSPEC, and things are different. One big packet
going through the tunnel will be fragmented with the outer interface's MTU
minus the tunnel header, then two or more smaller fragments will be tunneled
and transmitted on the outer interface; that is what xfrm6_output does. If
the peer server receives such packets, it will forward them successfully to
the next endpoint because their length is valid.

This patch follows xfrm6_output's logic and includes two changes. One is
choosing a suitable MTU value, which considers the inner/outer interface's
MTU and the dst path. The other is, if the packet is too big, calling
ip_fragment first, then tunnelling the fragmented packets on the outer
interface and finally transmitting them.

Signed-off-by: mtk81216 <lina.wang@mediatek.com>
---
 include/net/ip.h        |  3 +++
 net/ipv4/ip_output.c    | 10 +++-------
 net/ipv4/xfrm4_output.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index b09c48d862cc..05f9c6454ff5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -163,6 +163,9 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct net *, struct sock *, struct sk_buff *));
+int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		unsigned int mtu,
+		int (*output)(struct net *, struct sock *, struct sk_buff *));
 
 struct ip_fraglist_iter {
 	struct sk_buff	*frag;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 61f802d5350c..f99249132a76 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,10 +82,6 @@
 #include <linux/netlink.h>
 #include <linux/tcp.h>
 
-static int
-ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
-	    unsigned int mtu,
-	    int (*output)(struct net *, struct sock *, struct sk_buff *));
 
 /* Generate a checksum for an outgoing IP datagram. */
 void ip_send_check(struct iphdr *iph)
@@ -569,9 +565,9 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
-static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
-		       unsigned int mtu,
-		       int (*output)(struct net *, struct sock *, struct sk_buff *))
+int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		unsigned int mtu,
+		int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct iphdr *iph = ip_hdr(skb);
 
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 3cff51ba72bb..1488b79186ad 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -14,8 +14,27 @@
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
+/* ip_fragment() output callback: pass each skb on to xfrm_output(). */
+static int __xfrm4_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	return xfrm_output(sk, skb);
+}
+
+/* Device MTU if the owning socket probes PMTU, otherwise the dst path MTU. */
+static inline int ip4_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct sock *sk = dev_recursion_level() ? NULL : skb->sk;
+
+	return (sk && inet_sk(sk)->pmtudisc >= IP_PMTUDISC_PROBE) ?
+		skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+}
+
 static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	int mtu;
+	bool toobig;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
 #ifdef CONFIG_NETFILTER
 	struct xfrm_state *x = skb_dst(skb)->xfrm;
 
@@ -25,6 +44,24 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 #endif
 
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		goto skip_frag;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		mtu = ip4_skb_dst_mtu(skb);
+	else
+		goto skip_frag;
+
+	toobig = skb->len > mtu && !skb_is_gso(skb);
+	if (!skb->ignore_df && toobig && skb->sk) {
+		xfrm_local_error(skb, mtu);
+		return -EMSGSIZE;
+	}
+
+	if (toobig || dst_allfrag(skb_dst(skb)))
+		return ip_fragment(net, sk, skb, mtu, __xfrm4_output_finish);
+
+skip_frag:
 	return xfrm_output(sk, skb);
 }
 
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 6+ messages in thread