linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
@ 2020-09-09  6:26 mtk81216
  2020-09-15  7:30 ` Steffen Klassert
  0 siblings, 1 reply; 6+ messages in thread
From: mtk81216 @ 2020-09-09  6:26 UTC (permalink / raw)
  To: David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Kicinski, Steffen Klassert, Herbert Xu, Matthias Brugger
  Cc: netdev, linux-kernel, linux-arm-kernel, linux-mediatek, mtk81216

In esp's tunnel mode,if inner interface is ipv4,outer is ipv4,one big 
packet which travels through tunnel will be fragmented with outer 
interface's mtu,peer server will remove tunnelled esp header and assemble
them in big packet.After forwarding such packet to next endpoint,it will 
be dropped because of exceeding mtu or be returned ICMP(packet-too-big).
When inner interface is ipv4,outer is ipv6,the flag of xfrm state in tunnel
mode is af-unspec, thing is different.One big packet through tunnel will be
fragmented with outer interface's mtu minus tunneled header, then two or 
more less fragmented packets will be tunneled and transmitted in outer 
interface,that is what xfrm6_output has done. If peer server receives such
packets, it will forward successfully to next because length is valid.

This patch has followed up xfrm6_output's logic,which includes two changes,
one is choosing suitable mtu value which considering innner/outer 
interface's mtu and dst path, the other is if packet is too big, calling 
ip_fragment first,then tunnelling fragmented packets in outer interface and
transmitting finally.

Signed-off-by: mtk81216 <lina.wang@mediatek.com>
---
 include/net/ip.h        |  3 +++
 net/ipv4/ip_output.c    | 10 +++-------
 net/ipv4/xfrm4_output.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index b09c48d862cc..05f9c6454ff5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -163,6 +163,9 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		   int (*output)(struct net *, struct sock *, struct sk_buff *));
+int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		unsigned int mtu,
+		int (*output)(struct net *, struct sock *, struct sk_buff *));
 
 struct ip_fraglist_iter {
 	struct sk_buff	*frag;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 61f802d5350c..f99249132a76 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,10 +82,6 @@
 #include <linux/netlink.h>
 #include <linux/tcp.h>
 
-static int
-ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
-	    unsigned int mtu,
-	    int (*output)(struct net *, struct sock *, struct sk_buff *));
 
 /* Generate a checksum for an outgoing IP datagram. */
 void ip_send_check(struct iphdr *iph)
@@ -569,9 +565,9 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
-static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
-		       unsigned int mtu,
-		       int (*output)(struct net *, struct sock *, struct sk_buff *))
+int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		unsigned int mtu,
+		int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct iphdr *iph = ip_hdr(skb);
 
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 3cff51ba72bb..1488b79186ad 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -14,8 +14,27 @@
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
+static int __xfrm4_output_finish(struct net *net, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	return xfrm_output(sk, skb);
+}
+
+static inline int ip4_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct inet_sock *np = skb->sk && !dev_recursion_level() ?
+				inet_sk(skb->sk) : NULL;
+
+	return (np & np->pmtudisc >= IP_PMTUDISC_PROBE) ?
+		skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+}
+
 static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
+	int mtu;
+	bool toobig;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
 #ifdef CONFIG_NETFILTER
 	struct xfrm_state *x = skb_dst(skb)->xfrm;
 
@@ -25,6 +44,24 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 #endif
 
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		goto skip_frag;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		mtu = ip4_skb_dst_mtu(skb);
+	else
+		goto skip_frag;
+
+	toobig = skb->len > mtu && !skb_is_gso(skb);
+	if (!skb->ignore_df && toobig && skb->sk) {
+		xfrm_local_error(skb, mtu);
+		return -EMSGSIZE;
+	}
+
+	if (toobig || dst_allfrag(skb_dst(skb)))
+		return ip_fragment(net, sk, skb, mtu, __xfrm4_output_finish);
+
+skip_frag:
 	return xfrm_output(sk, skb);
 }
 
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
  2020-09-09  6:26 [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface mtk81216
@ 2020-09-15  7:30 ` Steffen Klassert
       [not found]   ` <1600160722.5295.15.camel@mbjsdccf07>
  2020-11-05  4:52   ` Lorenzo Colitti
  0 siblings, 2 replies; 6+ messages in thread
From: Steffen Klassert @ 2020-09-15  7:30 UTC (permalink / raw)
  To: mtk81216
  Cc: David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Kicinski, Herbert Xu, Matthias Brugger, netdev,
	linux-kernel, linux-arm-kernel, linux-mediatek

On Wed, Sep 09, 2020 at 02:26:13PM +0800, mtk81216 wrote:
> In esp's tunnel mode,if inner interface is ipv4,outer is ipv4,one big 
> packet which travels through tunnel will be fragmented with outer 
> interface's mtu,peer server will remove tunnelled esp header and assemble
> them in big packet.After forwarding such packet to next endpoint,it will 
> be dropped because of exceeding mtu or be returned ICMP(packet-too-big).

What is the exact case where packets are dropped? Given that the packet
was fragmented (and reassembled), I'd assume the DF bit was not set. So
every router along the path is allowed to fragment again if needed.

> When inner interface is ipv4,outer is ipv6,the flag of xfrm state in tunnel
> mode is af-unspec, thing is different.One big packet through tunnel will be
> fragmented with outer interface's mtu minus tunneled header, then two or 
> more less fragmented packets will be tunneled and transmitted in outer 
> interface,that is what xfrm6_output has done. If peer server receives such
> packets, it will forward successfully to next because length is valid.
> 
> This patch has followed up xfrm6_output's logic,which includes two changes,
> one is choosing suitable mtu value which considering innner/outer 
> interface's mtu and dst path, the other is if packet is too big, calling 
> ip_fragment first,then tunnelling fragmented packets in outer interface and
> transmitting finally.
> 
> Signed-off-by: mtk81216 <lina.wang@mediatek.com>

Please use your real name to sign off.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
       [not found]             ` <1604547381.23648.14.camel@mbjsdccf07>
@ 2020-11-05  4:41               ` Maciej Żenczykowski
  0 siblings, 0 replies; 6+ messages in thread
From: Maciej Żenczykowski @ 2020-11-05  4:41 UTC (permalink / raw)
  To: lina.wang
  Cc: Steffen Klassert, David S . Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Jakub Kicinski, Herbert Xu, Matthias Brugger,
	Linux NetDev, Kernel hackers, linux-arm-kernel, linux-mediatek,
	Lorenzo Colitti, Greg Kroah-Hartman

On Wed, Nov 4, 2020 at 7:53 PM lina.wang <lina.wang@mediatek.com> wrote:
>
> Besides several operators donot fragment packets even DF bit is not set,
> and instead just drop packets which they think big, maybe they have a
> consideration--- fragmentation is expensive for both the router that
> fragments and for the host that reassembles.
>
> BTW, ipv6 has the different behaviour when it meets such scenary, which
> is what we expected,ipv4 should follow the same thing. otherwise,
> packets are drop, it is a serious thing, and there is no hints. What we
> do is just fragmenting smaller packets, or is it possible to give us
> some flag or a sysctl to allow us to change the behaviour?
>
> On Thu, 2020-09-17 at 19:19 +0800, lina.wang wrote:
> > But it is not just one operator's broken router or misconfigured
> > router.In the whole network, it is common to meet that router will drop
> > bigger packet silently.I think we should make code more compatible.and
> > the scenary is wifi calling, which mostly used udp,you know, udp
> > wouldnot retransmit.It is serious when packet is dropped
> >
> > On Thu, 2020-09-17 at 09:46 +0200, Steffen Klassert wrote:
> > > On Tue, Sep 15, 2020 at 08:17:40PM +0800, lina.wang wrote:
> > > > We didnot get the router's log, which is some operator's.Actually, after
> > > > we patched, there is no such issue. Sometimes,router will return
> > > > packet-too-big, most of times nothing returned,dropped silently
> > >
> > > This looks like a broken router, we can't fix that in the code.
> > >
> > > > On Tue, 2020-09-15 at 11:32 +0200, Steffen Klassert wrote:
> > > > > On Tue, Sep 15, 2020 at 05:05:22PM +0800, lina.wang wrote:
> > > > > >
> > > > > > Yes, DF bit is not set.
> > > > >
> > > > > ...
> > > > >
> > > > > > Those two packets are fragmented to one big udp packet, which payload is 1516B.
> > > > > > After getting rid of tunnel header, it is also a udp packet, which payload is
> > > > > > 1466 bytes.It didnot get any response for this packet.We guess it is dropped
> > > > > > by router. Because if we reduced the length, it got response.
> > > > >
> > > > > If the DF bit is not set, the router should fragment the packet. If it
> > > > > does not do so, it is misconfigured. Do you have access to that router?

I'm afraid I don't really understand the exact scenario from the
description of the patch.

However... a few questions come to mind.
(a) why not set the DF flag, does the protocol require > mtu udp frames
(b) what is the mtu of the inner tunnel device, and what is the mtu on
the route through the device?
- shouldn't we be udp fragmenting to the route/device mtu?
maybe stuff is simply misconfigured...

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
  2020-09-15  7:30 ` Steffen Klassert
       [not found]   ` <1600160722.5295.15.camel@mbjsdccf07>
@ 2020-11-05  4:52   ` Lorenzo Colitti
  2020-11-09  9:58     ` Steffen Klassert
  1 sibling, 1 reply; 6+ messages in thread
From: Lorenzo Colitti @ 2020-11-05  4:52 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: mtk81216, David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Kicinski, Herbert Xu, Matthias Brugger, Linux NetDev, lkml,
	linux-arm-kernel, linux-mediatek, Maciej Żenczykowski,
	Greg Kroah-Hartman

On Tue, Sep 15, 2020 at 4:30 PM Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> > In esp's tunnel mode,if inner interface is ipv4,outer is ipv4,one big
> > packet which travels through tunnel will be fragmented with outer
> > interface's mtu,peer server will remove tunnelled esp header and assemble
> > them in big packet.After forwarding such packet to next endpoint,it will
> > be dropped because of exceeding mtu or be returned ICMP(packet-too-big).
>
> What is the exact case where packets are dropped? Given that the packet
> was fragmented (and reassembled), I'd assume the DF bit was not set. So
> every router along the path is allowed to fragment again if needed.

In general, isn't it just suboptimal to rely on fragmentation if the
sender already knows the packet is too big? That's why we have things
like path MTU discovery (RFC 1191). Fragmentation is generally
expensive, increases the chance of packet loss, and has historically
caused lots of security vulnerabilities. Also, in real world networks,
fragments sometimes just don't work, either because intermediate
routers don't fragment, or because firewalls drop the fragments due to
security reasons.

While it's possible in theory to ask these operators to configure
their routers to fragment packets, that may not result in the network
being fixed, due to hardware constraints, security policy or other
reasons. Those operators may also be in a position to place
requirements on devices that have to use their network. If the Linux
stack does not work as is on these networks, then those devices will
have to meet those requirements by making out-of-tree changes. It
would be good to avoid that if there's a better solution (e.g., make
this configurable via sysctl).

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
  2020-11-05  4:52   ` Lorenzo Colitti
@ 2020-11-09  9:58     ` Steffen Klassert
  2020-11-09 19:38       ` Maciej Żenczykowski
  0 siblings, 1 reply; 6+ messages in thread
From: Steffen Klassert @ 2020-11-09  9:58 UTC (permalink / raw)
  To: Lorenzo Colitti
  Cc: mtk81216, David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Kicinski, Herbert Xu, Matthias Brugger, Linux NetDev, lkml,
	linux-arm-kernel, linux-mediatek, Maciej Żenczykowski,
	Greg Kroah-Hartman

On Thu, Nov 05, 2020 at 01:52:01PM +0900, Lorenzo Colitti wrote:
> On Tue, Sep 15, 2020 at 4:30 PM Steffen Klassert
> <steffen.klassert@secunet.com> wrote:
> > > In esp's tunnel mode,if inner interface is ipv4,outer is ipv4,one big
> > > packet which travels through tunnel will be fragmented with outer
> > > interface's mtu,peer server will remove tunnelled esp header and assemble
> > > them in big packet.After forwarding such packet to next endpoint,it will
> > > be dropped because of exceeding mtu or be returned ICMP(packet-too-big).
> >
> > What is the exact case where packets are dropped? Given that the packet
> > was fragmented (and reassembled), I'd assume the DF bit was not set. So
> > every router along the path is allowed to fragment again if needed.
> 
> In general, isn't it just suboptimal to rely on fragmentation if the
> sender already knows the packet is too big? That's why we have things
> like path MTU discovery (RFC 1191).

When we setup packets that are sent from a local socket, we take
MTU/PMTU info we have into account. So we don't create fragments in
that case.

When forwarding packets it is different. The router that can not
TX the packet because it exceeds the MTU of the sending interface
is responsible to either fragment (if DF is not set), or send a
PMTU notification (if DF is set). So if we are able to transmit
the packet, we do it.

> Fragmentation is generally
> expensive, increases the chance of packet loss, and has historically
> caused lots of security vulnerabilities. Also, in real world networks,
> fragments sometimes just don't work, either because intermediate
> routers don't fragment, or because firewalls drop the fragments due to
> security reasons.
> 
> While it's possible in theory to ask these operators to configure
> their routers to fragment packets, that may not result in the network
> being fixed, due to hardware constraints, security policy or other
> reasons.

We can not really do anything here. If a flow has no DF bit set
on the packets, we can not rely on PMTU information. If we have PMTU
info on the route, then we have it because some other flow (that has
DF bit set on the packets) triggered PMTU discovery. That means that
the PMTU information is reset when this flow (with DF set) stops
sending packets. So the other flow (with DF not set) will send
big packets again.

> Those operators may also be in a position to place
> requirements on devices that have to use their network. If the Linux
> stack does not work as is on these networks, then those devices will
> have to meet those requirements by making out-of-tree changes. It
> would be good to avoid that if there's a better solution (e.g., make
> this configurable via sysctl).

We should not try to workaround broken configurations, there are just
too many possibilities to configure a broken network.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface
  2020-11-09  9:58     ` Steffen Klassert
@ 2020-11-09 19:38       ` Maciej Żenczykowski
  0 siblings, 0 replies; 6+ messages in thread
From: Maciej Żenczykowski @ 2020-11-09 19:38 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Lorenzo Colitti, mtk81216, David S . Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Jakub Kicinski, Herbert Xu, Matthias Brugger,
	Linux NetDev, lkml, linux-arm-kernel, linux-mediatek,
	Greg Kroah-Hartman

On Mon, Nov 9, 2020 at 1:58 AM Steffen Klassert
<steffen.klassert@secunet.com> wrote:
>
> On Thu, Nov 05, 2020 at 01:52:01PM +0900, Lorenzo Colitti wrote:
> > On Tue, Sep 15, 2020 at 4:30 PM Steffen Klassert
> > <steffen.klassert@secunet.com> wrote:
> > > > In esp's tunnel mode,if inner interface is ipv4,outer is ipv4,one big
> > > > packet which travels through tunnel will be fragmented with outer
> > > > interface's mtu,peer server will remove tunnelled esp header and assemble
> > > > them in big packet.After forwarding such packet to next endpoint,it will
> > > > be dropped because of exceeding mtu or be returned ICMP(packet-too-big).
> > >
> > > What is the exact case where packets are dropped? Given that the packet
> > > was fragmented (and reassembled), I'd assume the DF bit was not set. So
> > > every router along the path is allowed to fragment again if needed.
> >
> > In general, isn't it just suboptimal to rely on fragmentation if the
> > sender already knows the packet is too big? That's why we have things
> > like path MTU discovery (RFC 1191).
>
> When we setup packets that are sent from a local socket, we take
> MTU/PMTU info we have into account. So we don't create fragments in
> that case.
>
> When forwarding packets it is different. The router that can not
> TX the packet because it exceeds the MTU of the sending interface
> is responsible to either fragment (if DF is not set), or send a
> PMTU notification (if DF is set). So if we are able to transmit
> the packet, we do it.
>
> > Fragmentation is generally
> > expensive, increases the chance of packet loss, and has historically
> > caused lots of security vulnerabilities. Also, in real world networks,
> > fragments sometimes just don't work, either because intermediate
> > routers don't fragment, or because firewalls drop the fragments due to
> > security reasons.
> >
> > While it's possible in theory to ask these operators to configure
> > their routers to fragment packets, that may not result in the network
> > being fixed, due to hardware constraints, security policy or other
> > reasons.
>
> We can not really do anything here. If a flow has no DF bit set
> on the packets, we can not rely on PMTU information. If we have PMTU
> info on the route, then we have it because some other flow (that has
> DF bit set on the packets) triggered PMTU discovery. That means that
> the PMTU information is reset when this flow (with DF set) stops
> sending packets. So the other flow (with DF not set) will send
> big packets again.

PMTU is by default ignored by forwarding - because it's spoofable.

That said I wonder if my recent changes to honour route mtu (for ipv4)
haven't fixed this particular issue in the presence of correctly
configured device/route mtus...

I don't understand if the problem here is locally generated packets,
or forwarded packets.

It does seem like there is (or was) a bug somewhere... but it might
already be fixed (see above) or might be caused by a misconfiguration
of device mtu or routing rules.

I don't really understand the example.

>
> > Those operators may also be in a position to place
> > requirements on devices that have to use their network. If the Linux
> > stack does not work as is on these networks, then those devices will
> > have to meet those requirements by making out-of-tree changes. It
> > would be good to avoid that if there's a better solution (e.g., make
> > this configurable via sysctl).
>
> We should not try to workaround broken configurations, there are just
> too many possibilities to configure a broken network.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-11-09 19:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-09  6:26 [PATCH] xfrm:fragmented ipv4 tunnel packets in inner interface mtk81216
2020-09-15  7:30 ` Steffen Klassert
     [not found]   ` <1600160722.5295.15.camel@mbjsdccf07>
     [not found]     ` <20200915093230.GS20687@gauss3.secunet.de>
     [not found]       ` <1600172260.2494.2.camel@mbjsdccf07>
     [not found]         ` <20200917074637.GV20687@gauss3.secunet.de>
     [not found]           ` <1600341549.32639.5.camel@mbjsdccf07>
     [not found]             ` <1604547381.23648.14.camel@mbjsdccf07>
2020-11-05  4:41               ` Maciej Żenczykowski
2020-11-05  4:52   ` Lorenzo Colitti
2020-11-09  9:58     ` Steffen Klassert
2020-11-09 19:38       ` Maciej Żenczykowski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).