All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
@ 2012-06-11  9:29 David Miller
  2012-06-11 11:16 ` Steffen Klassert
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-11  9:29 UTC (permalink / raw)
  To: netdev


There is zero point to this function.

It's only real substance is to perform an extremely outdated BSD4.2
ICMP check, which we can safely remove.  If you really have a MTU
limited link being routed by a BSD4.2 derived system, here's a nickel
go buy yourself a real router.

The other actions of ip_rt_frag_needed(), checking and conditionally
updating the peer, are done by the per-protocol handlers of the ICMP
event.

TCP, UDP, et al. have a handler which will receive this event and
transmit it back into the associated route via dst_ops->update_pmtu().

This simplification is important, because it eliminates the one place
where we do not have a proper route context in which to make an
inetpeer lookup.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h  |    2 --
 net/ipv4/icmp.c      |    4 +---
 net/ipv4/route.c     |   61 --------------------------------------------------
 net/rxrpc/ar-error.c |    4 ----
 4 files changed, 1 insertion(+), 70 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 6340c37..cc693a5 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -215,8 +215,6 @@ static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 s
 	return ip_route_input_common(skb, dst, src, tos, devin, true);
 }
 
-extern unsigned short	ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
-					  unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 
 extern unsigned int		inet_addr_type(struct net *net, __be32 addr);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0c78ef1..e1caa1a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -673,9 +673,7 @@ static void icmp_unreach(struct sk_buff *skb)
 				LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
 					       &iph->daddr);
 			} else {
-				info = ip_rt_frag_needed(net, iph,
-							 ntohs(icmph->un.frag.mtu),
-							 skb->dev);
+				info = ntohs(icmph->un.frag.mtu);
 				if (!info)
 					goto out;
 			}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 03e5b61..4f5834c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1664,67 +1664,6 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-/*
- *	The last two values are not from the RFC but
- *	are needed for AMPRnet AX.25 paths.
- */
-
-static const unsigned short mtu_plateau[] =
-{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
-
-static inline unsigned short guess_mtu(unsigned short old_mtu)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
-		if (old_mtu > mtu_plateau[i])
-			return mtu_plateau[i];
-	return 68;
-}
-
-unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
-				 unsigned short new_mtu,
-				 struct net_device *dev)
-{
-	unsigned short old_mtu = ntohs(iph->tot_len);
-	unsigned short est_mtu = 0;
-	struct inet_peer *peer;
-
-	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
-	if (peer) {
-		unsigned short mtu = new_mtu;
-
-		if (new_mtu < 68 || new_mtu >= old_mtu) {
-			/* BSD 4.2 derived systems incorrectly adjust
-			 * tot_len by the IP header length, and report
-			 * a zero MTU in the ICMP message.
-			 */
-			if (mtu == 0 &&
-			    old_mtu >= 68 + (iph->ihl << 2))
-				old_mtu -= iph->ihl << 2;
-			mtu = guess_mtu(old_mtu);
-		}
-
-		if (mtu < ip_rt_min_pmtu)
-			mtu = ip_rt_min_pmtu;
-		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
-			unsigned long pmtu_expires;
-
-			pmtu_expires = jiffies + ip_rt_mtu_expires;
-			if (!pmtu_expires)
-				pmtu_expires = 1UL;
-
-			est_mtu = mtu;
-			peer->pmtu_learned = mtu;
-			peer->pmtu_expires = pmtu_expires;
-			atomic_inc(&__rt_peer_genid);
-		}
-
-		inet_putpeer(peer);
-	}
-	return est_mtu ? : new_mtu;
-}
-
 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
 {
 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 5d6b572..a920608 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -81,10 +81,6 @@ void rxrpc_UDP_error_report(struct sock *sk)
 			_net("I/F MTU %u", mtu);
 		}
 
-		/* ip_rt_frag_needed() may have eaten the info */
-		if (mtu == 0)
-			mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
-
 		if (mtu == 0) {
 			/* they didn't give us a size, estimate one */
 			if (mtu > 1500) {
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11  9:29 [PATCH 2/5] ipv4: Kill ip_rt_frag_needed() David Miller
@ 2012-06-11 11:16 ` Steffen Klassert
  2012-06-11 11:20   ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-11 11:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, Jun 11, 2012 at 02:29:11AM -0700, David Miller wrote:
> 
> -unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
> -				 unsigned short new_mtu,
> -				 struct net_device *dev)
> -{
> -	unsigned short old_mtu = ntohs(iph->tot_len);
> -	unsigned short est_mtu = 0;
> -	struct inet_peer *peer;
> -
> -	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
> -	if (peer) {
> -		unsigned short mtu = new_mtu;
> -
> -		if (new_mtu < 68 || new_mtu >= old_mtu) {
> -			/* BSD 4.2 derived systems incorrectly adjust
> -			 * tot_len by the IP header length, and report
> -			 * a zero MTU in the ICMP message.
> -			 */
> -			if (mtu == 0 &&
> -			    old_mtu >= 68 + (iph->ihl << 2))
> -				old_mtu -= iph->ihl << 2;
> -			mtu = guess_mtu(old_mtu);
> -		}
> -
> -		if (mtu < ip_rt_min_pmtu)
> -			mtu = ip_rt_min_pmtu;
> -		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
> -			unsigned long pmtu_expires;
> -
> -			pmtu_expires = jiffies + ip_rt_mtu_expires;
> -			if (!pmtu_expires)
> -				pmtu_expires = 1UL;
> -
> -			est_mtu = mtu;
> -			peer->pmtu_learned = mtu;
> -			peer->pmtu_expires = pmtu_expires;
> -			atomic_inc(&__rt_peer_genid);
> -		}
> -
> -		inet_putpeer(peer);
> -	}
> -	return est_mtu ? : new_mtu;
> -}
> -

It seems that we don't cache the learned pmtu informations
in some cases with ip_rt_frag_needed() removed. 

At least when doing a simple ping test on a network that has
a router with mtu 1300 along the path, the following happens:

bash-3.00# ping -c 4 -s 1400 192.168.40.2                                       
PING 192.168.40.2 (192.168.40.2) 1400(1428) bytes of data.                      
>From 10.2.2.2 icmp_seq=1 Frag needed and DF set (mtu = 1300)                    
>From 10.2.2.2 icmp_seq=2 Frag needed and DF set (mtu = 1300)                    
>From 10.2.2.2 icmp_seq=3 Frag needed and DF set (mtu = 1300)                    
>From 10.2.2.2 icmp_seq=4 Frag needed and DF set (mtu = 1300)                    
                                                                                
--- 192.168.40.2 ping statistics ---                                            
4 packets transmitted, 0 received, +4 errors, 100% packet loss, time 3005ms     

We should learn the pmtu information with the first packet,
all further packets should get fragmented according to
the learned informations. Unfortunately we don't cache
these informations: 
                                                        
bash-3.00# ip r g 192.168.40.2                                                  
192.168.40.2 via 192.168.20.1 dev eth0  src 192.168.20.2                        
    cache

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11 11:16 ` Steffen Klassert
@ 2012-06-11 11:20   ` David Miller
  2012-06-11 11:28     ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-11 11:20 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 11 Jun 2012 13:16:59 +0200

> It seems that we don't cache the learned pmtu informations
> in some cases with ip_rt_frag_needed() removed. 

We need to find a way to implement this then, in such a way
that we have the route context used to send the ping packet
out.

Otherwise, it's impossible to record the information properly.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11 11:20   ` David Miller
@ 2012-06-11 11:28     ` David Miller
  2012-06-11 11:42       ` Steffen Klassert
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-11 11:28 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: David Miller <davem@davemloft.net>
Date: Mon, 11 Jun 2012 04:20:24 -0700 (PDT)

> We need to find a way to implement this then, in such a way
> that we have the route context used to send the ping packet
> out.

The problem is RAW sockets right?  If so, then this is where the
fix belongs.

There is nothing preventing the RAW socket code from remembering the
last route used, as well as the flow4 key used to look it up, and
processing the PMTU message appropriately in raw_err() using that
remembered information if the flow4 key matches.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11 11:28     ` David Miller
@ 2012-06-11 11:42       ` Steffen Klassert
  2012-06-11 23:02         ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-11 11:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, Jun 11, 2012 at 04:28:13AM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Mon, 11 Jun 2012 04:20:24 -0700 (PDT)
> 
> > We need to find a way to implement this then, in such a way
> > that we have the route context used to send the ping packet
> > out.
> 
> The problem is RAW sockets right?  If so, then this is where the
> fix belongs.
> 

Hm, I've just tried with tracepath (udp) and I also don't see the
pmtu informations cached.

I still had no time to look deeper into the new inetpeer code,
I've just gave it a quick try. I'll try to find out what's going on.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11 11:42       ` Steffen Klassert
@ 2012-06-11 23:02         ` David Miller
  2012-06-12 11:44           ` Steffen Klassert
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-11 23:02 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 11 Jun 2012 13:42:56 +0200

> On Mon, Jun 11, 2012 at 04:28:13AM -0700, David Miller wrote:
>> From: David Miller <davem@davemloft.net>
>> Date: Mon, 11 Jun 2012 04:20:24 -0700 (PDT)
>> 
>> > We need to find a way to implement this then, in such a way
>> > that we have the route context used to send the ping packet
>> > out.
>> 
>> The problem is RAW sockets right?  If so, then this is where the
>> fix belongs.
>> 
> 
> Hm, I've just tried with tracepath (udp) and I also don't see the
> pmtu informations cached.
> 
> I still had no time to look deeper into the new inetpeer code,
> I've just gave it a quick try. I'll try to find out what's going on.

Here below is the kind of patch I was suggesting we make.  I did a
simple test to make sure the update MTU code path is taken in
raw_err().

But I'm having second thoughts about whether any of this is a good
idea.

UDP works by notifying userspace of PMTU events.  And this is
mandatory, if we're setting DF we have to get the user to decrease the
size of it's datagram writes below the reported PMTU value.

As a consequence I believe RAW sockets should also work via
notifications.

And therefore it can be argued that in neither case should we update
the routing cache PMTU information.  This is in line with the fact
that TCP goes to great lengths to validate that the PMTU it's getting
is really legitimate and not forged, and UDP and RAW have no way of
making such strict checks.

And it also shows that those TCP strict checks were for nothing
beforehand.  That PMTU update guard done by TCP was (before my changes
this past weekend) pointless because we were doing the PMTU update
unconditionally earlier in the ICMP handling.

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b81..3e2507b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -242,6 +242,17 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 		err = icmp_err_convert[code].errno;
 		harderr = icmp_err_convert[code].fatal;
 		if (code == ICMP_FRAG_NEEDED) {
+			struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+			struct rtable *rt;
+
+			bh_lock_sock(sk);
+			rt = ip_route_output_flow(sock_net(sk), fl4, sk);
+			if (!IS_ERR(rt)) {
+				rt->dst.ops->update_pmtu(&rt->dst, info);
+				ip_rt_put(rt);
+			}
+			bh_unlock_sock(sk);
+
 			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
 			err = EMSGSIZE;
 		}
@@ -316,9 +327,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
-			   void *from, size_t length,
-			   struct rtable **rtp,
+static int raw_send_hdrinc(struct sock *sk, __be32 saddr, __be32 daddr,
+			   void *from, size_t length, struct rtable **rtp,
 			   unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -331,7 +341,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 	int hlen, tlen;
 
 	if (length > rt->dst.dev->mtu) {
-		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+		ip_local_error(sk, EMSGSIZE, daddr, inet->inet_dport,
 			       rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
@@ -378,7 +388,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	if (iphlen >= sizeof(*iph)) {
 		if (!iph->saddr)
-			iph->saddr = fl4->saddr;
+			iph->saddr = saddr;
 		iph->check   = 0;
 		iph->tot_len = htons(length);
 		if (!iph->id)
@@ -461,7 +471,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipcm_cookie ipc;
 	struct rtable *rt = NULL;
-	struct flowi4 fl4;
+	struct flowi4 *fl4;
 	int free = 0;
 	__be32 daddr;
 	__be32 saddr;
@@ -563,54 +573,66 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else if (!ipc.oif)
 		ipc.oif = inet->uc_index;
 
-	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+	lock_sock(sk);
+	fl4 = &inet->cork.fl.u.ip4;
+	flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
 			   RT_SCOPE_UNIVERSE,
 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 			   inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP,
 			   daddr, saddr, 0, 0);
 
 	if (!inet->hdrincl) {
-		err = raw_probe_proto_opt(&fl4, msg);
+		err = raw_probe_proto_opt(fl4, msg);
 		if (err)
-			goto done;
+			goto done_unlock;
 	}
 
-	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+	security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+	rt = ip_route_output_flow(sock_net(sk), fl4, sk);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		rt = NULL;
-		goto done;
+		goto done_unlock;
 	}
 
 	err = -EACCES;
 	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
-		goto done;
+		goto done_unlock;
+
+	if (msg->msg_flags & MSG_CONFIRM) {
+		dst_confirm(&rt->dst);
+		if ((msg->msg_flags & MSG_PROBE) && !len) {
+			err = 0;
+			goto done_unlock;
+		}
+	}
 
-	if (msg->msg_flags & MSG_CONFIRM)
-		goto do_confirm;
-back_from_confirm:
+	if (inet->hdrincl) {
+		__be32 saddr = fl4->saddr;
+		__be32 daddr = fl4->daddr;
 
-	if (inet->hdrincl)
-		err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+		release_sock(sk);
+		err = raw_send_hdrinc(sk, saddr, daddr,
+				      msg->msg_iov, len,
 				      &rt, msg->msg_flags);
 
-	 else {
+		goto done;
+	}  else {
 		if (!ipc.addr)
-			ipc.addr = fl4.daddr;
-		lock_sock(sk);
-		err = ip_append_data(sk, &fl4, ip_generic_getfrag,
+			ipc.addr = fl4->daddr;
+		err = ip_append_data(sk, fl4, ip_generic_getfrag,
 				     msg->msg_iov, len, 0,
 				     &ipc, &rt, msg->msg_flags);
 		if (err)
 			ip_flush_pending_frames(sk);
 		else if (!(msg->msg_flags & MSG_MORE)) {
-			err = ip_push_pending_frames(sk, &fl4);
+			err = ip_push_pending_frames(sk, fl4);
 			if (err == -ENOBUFS && !inet->recverr)
 				err = 0;
 		}
-		release_sock(sk);
 	}
+done_unlock:
+	release_sock(sk);
 done:
 	if (free)
 		kfree(ipc.opt);
@@ -620,13 +642,6 @@ out:
 	if (err < 0)
 		return err;
 	return len;
-
-do_confirm:
-	dst_confirm(&rt->dst);
-	if (!(msg->msg_flags & MSG_PROBE) || len)
-		goto back_from_confirm;
-	err = 0;
-	goto done;
 }
 
 static void raw_close(struct sock *sk, long timeout)

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-11 23:02         ` David Miller
@ 2012-06-12 11:44           ` Steffen Klassert
  2012-06-12 20:33             ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-12 11:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, Jun 11, 2012 at 04:02:58PM -0700, David Miller wrote:
> 
> Here below is the kind of patch I was suggesting we make.  I did a
> simple test to make sure the update MTU code path is taken in
> raw_err().

I can confirm that your patch restores the old behaviour of ping.

> 
> But I'm having second thoughts about whether any of this is a good
> idea.
> 
> UDP works by notifying userspace of PMTU events.  And this is
> mandatory, if we're setting DF we have to get the user to decrease the
> size of it's datagram writes below the reported PMTU value.
> 
> As a consequence I believe RAW sockets should also work via
> notifications.
> 
> And therefore it can be argued that in neither case should we update
> the routing cache PMTU information.  

Should be ok as long as all userspace applications that use UDP or
RAW sockets handle pmtu event notifications properly.

ping might be a special case, but now the behaviour of a big
sized ping (say 1400 byte on a network that has a router with
mtu 1300 along the path) with IP_PMTUDISC_WANT might depend on
whether the cached pmtu informations are updated by a recent
tcp connection.

If we had no tcp connection before, we see the behaviour that
I described in my first mail. All packets have the DF bit set.

If a tcp connection updated the cached pmtu informations recently,
the packets don't have the DF bit set. They are fragmented according
the cached pmtu informations instead.

Other applications that do not care for pmtu event notifications
might be in a similar situation. So perhaps we need the kind of
patch you are suggested.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-12 11:44           ` Steffen Klassert
@ 2012-06-12 20:33             ` David Miller
  2012-06-13  4:22               ` David Miller
  2012-06-13  8:01               ` Steffen Klassert
  0 siblings, 2 replies; 19+ messages in thread
From: David Miller @ 2012-06-12 20:33 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 12 Jun 2012 13:44:40 +0200

> On Mon, Jun 11, 2012 at 04:02:58PM -0700, David Miller wrote:
>> UDP works by notifying userspace of PMTU events.  And this is
>> mandatory, if we're setting DF we have to get the user to decrease the
>> size of it's datagram writes below the reported PMTU value.
>> 
>> As a consequence I believe RAW sockets should also work via
>> notifications.
>> 
>> And therefore it can be argued that in neither case should we update
>> the routing cache PMTU information.  
> 
> Should be ok as long as all userspace applications that use UDP or
> RAW sockets handle pmtu event notifications properly.

I am convinced that they absolutely must, if they use IP_PMTUDISC_DO.

Otherwise they will continue making larger-than-PMTU sendmsg()
requests.  As these are datagram sockets, we can't simply segment the
data.

> ping might be a special case, but now the behaviour of a big
> sized ping (say 1400 byte on a network that has a router with
> mtu 1300 along the path) with IP_PMTUDISC_WANT might depend on
> whether the cached pmtu informations are updated by a recent
> tcp connection.
> 
> If we had no tcp connection before, we see the behaviour that
> I described in my first mail. All packets have the DF bit set.
> 
> If a tcp connection updated the cached pmtu informations recently,
> the packets don't have the DF bit set. They are fragmented according
> the cached pmtu informations instead.
> 
> Other applications that do not care for pmtu event notifications
> might be in a similar situation. So perhaps we need the kind of
> patch you are suggested.

We can't do exactly as my patch did, because it allows remote entities
to easily poison PMTU information.  All they have to know is that
there is some UDP or RAW socket open with a certain ID and then send
forged ICMP to us.

What we possibly could do is adjust the socket's IP_PMTUDISC_* setting
from IP_PMTUDISC_WANT to IP_PMTUDISC_DONT in response to PMTU
messages.

This seems to solve all the problems.  Individual RAW and UDP sockets
get the behavior they did before, and route cache PMTU poisoning is
less of an issue.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-12 20:33             ` David Miller
@ 2012-06-13  4:22               ` David Miller
  2012-06-13  8:01               ` Steffen Klassert
  1 sibling, 0 replies; 19+ messages in thread
From: David Miller @ 2012-06-13  4:22 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: David Miller <davem@davemloft.net>
Date: Tue, 12 Jun 2012 13:33:33 -0700 (PDT)

> What we possibly could do is adjust the socket's IP_PMTUDISC_* setting
> from IP_PMTUDISC_WANT to IP_PMTUDISC_DONT in response to PMTU
> messages.
> 
> This seems to solve all the problems.  Individual RAW and UDP sockets
> get the behavior they did before, and route cache PMTU poisoning is
> less of an issue.

Here is an implementation of that idea:

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 8260ef7..61cb532 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -416,6 +416,16 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return inet_sk(__sk)->pinet6;
 }
 
+/* We don't want to update the routing tables because there is no way
+ * to validate the legitimacy of this PMTU event.  Instead, downgrade
+ * the PMTU setting of the socket.
+ */
+static inline void inet6_datagram_pmtu_event(struct ipv6_pinfo *np)
+{
+	if (np->pmtudisc == IP_PMTUDISC_WANT)
+		np->pmtudisc = IP_PMTUDISC_DONT;
+}
+
 static inline struct inet6_request_sock *
 			inet6_rsk(const struct request_sock *rsk)
 {
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ae17e13..2199afb 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -199,6 +199,16 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 }
 #endif
 
+/* We don't want to update the routing cache because there is no way
+ * to validate the legitimacy of this PMTU event.  Instead, downgrade
+ * the PMTU setting of the socket.
+ */
+static inline void inet_datagram_pmtu_event(struct inet_sock *inet)
+{
+	if (inet->pmtudisc == IP_PMTUDISC_WANT)
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+}
+
 extern int inet_sk_rebuild_header(struct sock *sk);
 
 extern u32 inet_ehash_secret;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2c00e8b..c6becc1 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -374,6 +374,7 @@ void ping_err(struct sk_buff *skb, u32 info)
 			if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
+				inet_datagram_pmtu_event(inet_sock);
 				break;
 			}
 			goto out;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b81..ed24b05 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -244,6 +244,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 		if (code == ICMP_FRAG_NEEDED) {
 			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
 			err = EMSGSIZE;
+			inet_datagram_pmtu_event(inet);
 		}
 	}
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eaca736..40cf013 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -618,6 +618,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
+				inet_datagram_pmtu_event(inet);
 				break;
 			}
 			goto out;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 93d6983..79e2f7a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -328,9 +328,10 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 		return;
 
 	harderr = icmpv6_err_convert(type, code, &err);
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (type == ICMPV6_PKT_TOOBIG) {
 		harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
-
+		inet6_datagram_pmtu_event(np);
+	}
 	if (np->recverr) {
 		u8 *payload = skb->data;
 		if (!inet->hdrincl)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f05099f..fb57815 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -481,6 +481,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	np = inet6_sk(sk);
 
+	if (type == ICMPV6_PKT_TOOBIG)
+		inet6_datagram_pmtu_event(np);
+
 	if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
 		goto out;
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-12 20:33             ` David Miller
  2012-06-13  4:22               ` David Miller
@ 2012-06-13  8:01               ` Steffen Klassert
  2012-06-13  9:42                 ` David Miller
  1 sibling, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-13  8:01 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, Jun 12, 2012 at 01:33:33PM -0700, David Miller wrote:
> 
> We can't do exactly as my patch did, because it allows remote entities
> to easily poison PMTU information.  All they have to know is that
> there is some UDP or RAW socket open with a certain ID and then send
> forged ICMP to us.

Yes, I know what you mean. But not updating the the cached pmtu
informations results in slow path fragmentation along the path.

Btw. what happens to ipv6 if we stop doing pmtu discovery?
Shouldn't we reduce the packet size to 1280 bytes then?

> 
> What we possibly could do is adjust the socket's IP_PMTUDISC_* setting
> from IP_PMTUDISC_WANT to IP_PMTUDISC_DONT in response to PMTU
> messages.
> 

I think an application that sets IP_PMTUDISC_WANT explicitly will
rely on the fact that the kernel does pmtu discovery. Changing
the socket setting to IP_PMTUDISC_DONT the first time we get into
trouble makes IP_PMTUDISC_WANT pointless for udp and raw sockets.

Another option would be to change the sockets default setting
from IP_PMTUDISC_WANT to IP_PMTUDISC_DONT (at least for udp and
raw) and do pmtu discovery if an application sets IP_PMTUDISC_WANT.

With this we don't have the pmtu cache poisoning issue as the default.
We would only have it if a sockets sets IP_PMTUDISC_WANT explicitly.

This is not perfect too, but I fear there is no perfect solution here.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-13  8:01               ` Steffen Klassert
@ 2012-06-13  9:42                 ` David Miller
  2012-06-13 10:07                   ` Steffen Klassert
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-13  9:42 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 13 Jun 2012 10:01:52 +0200

> I think an application that sets IP_PMTUDISC_WANT explicitly will
> rely on the fact that the kernel does pmtu discovery. Changing
> the socket setting to IP_PMTUDISC_DONT the first time we get into
> trouble makes IP_PMTUDISC_WANT pointless for udp and raw sockets.

How so?

We are mimicking exactly what would happen if we had just created
a new routing cache entry when the application openned the socket.

There is no behavioral difference whatsoever.

We absolutely do perform PMTU discovery, the first large packet
will trigger it.  And then, as if we had lowered the PMTU in
the routing cache entry, we will stop setting DF in the packets.

Because that is how the IP_PMTUDISC_* checks work in the IP output
path in the place that decides whether to set DF or not.

> Another option would be to change the sockets default setting
> from IP_PMTUDISC_WANT to IP_PMTUDISC_DONT (at least for udp and
> raw) and do pmtu discovery if an application sets IP_PMTUDISC_WANT.

Changing defaults doesn't make the problem go away, and is also
unexpected.

I did all of my testing using the "-M" option of ping.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-13  9:42                 ` David Miller
@ 2012-06-13 10:07                   ` Steffen Klassert
  2012-06-13 10:22                     ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-13 10:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Wed, Jun 13, 2012 at 02:42:25AM -0700, David Miller wrote:
> From: Steffen Klassert <steffen.klassert@secunet.com>
> Date: Wed, 13 Jun 2012 10:01:52 +0200
> 
> > I think an application that sets IP_PMTUDISC_WANT explicitly will
> > rely on the fact that the kernel does pmtu discovery. Changing
> > the socket setting to IP_PMTUDISC_DONT the first time we get into
> > trouble makes IP_PMTUDISC_WANT pointless for udp and raw sockets.
> 
> How so?
> 
> We are mimicking exactly what would happen if we had just created
> a new routing cache entry when the application openned the socket.
> 
> There is no behavioral difference whatsoever.
> 
> We absolutely do perform PMTU discovery, the first large packet
> will trigger it.  And then, as if we had lowered the PMTU in
> the routing cache entry, we will stop setting DF in the packets.

Maybe I missunderstood what you meant. I thought that you don't want
to update the pmtu cache informations at all on udp and raw.
If we update the pmtu cache informations with first large packet,
I agree absolutely.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-13 10:07                   ` Steffen Klassert
@ 2012-06-13 10:22                     ` David Miller
  2012-06-14  5:35                       ` Steffen Klassert
  0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-13 10:22 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 13 Jun 2012 12:07:09 +0200

> On Wed, Jun 13, 2012 at 02:42:25AM -0700, David Miller wrote:
>> From: Steffen Klassert <steffen.klassert@secunet.com>
>> Date: Wed, 13 Jun 2012 10:01:52 +0200
>> 
>> > I think an application that sets IP_PMTUDISC_WANT explicitly will
>> > rely on the fact that the kernel does pmtu discovery. Changing
>> > the socket setting to IP_PMTUDISC_DONT the first time we get into
>> > trouble makes IP_PMTUDISC_WANT pointless for udp and raw sockets.
>> 
>> How so?
>> 
>> We are mimicking exactly what would happen if we had just created
>> a new routing cache entry when the application openned the socket.
>> 
>> There is no behavioral difference whatsoever.
>> 
>> We absolutely do perform PMTU discovery, the first large packet
>> will trigger it.  And then, as if we had lowered the PMTU in
>> the routing cache entry, we will stop setting DF in the packets.
> 
> Maybe I missunderstood what you meant. I thought that you don't want
> to update the pmtu cache informations at all on udp and raw.
> If we update the pmtu cache informations with first large packet,
> I agree absolutely.

We don't update the PMTU.

But we behave as if we did.

The only effect the IP_PMTUDISC_* values have is in deciding whether
to set the DF flag in the outgoing packets.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-13 10:22                     ` David Miller
@ 2012-06-14  5:35                       ` Steffen Klassert
  2012-06-14  5:42                         ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-14  5:35 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Wed, Jun 13, 2012 at 03:22:28AM -0700, David Miller wrote:
> From: Steffen Klassert <steffen.klassert@secunet.com>
> Date: Wed, 13 Jun 2012 12:07:09 +0200
> 
> > On Wed, Jun 13, 2012 at 02:42:25AM -0700, David Miller wrote:
> >> From: Steffen Klassert <steffen.klassert@secunet.com>
> >> Date: Wed, 13 Jun 2012 10:01:52 +0200
> >> 
> >> > I think an application that sets IP_PMTUDISC_WANT explicitly will
> >> > rely on the fact that the kernel does pmtu discovery. Changing
> >> > the socket setting to IP_PMTUDISC_DONT the first time we get into
> >> > trouble makes IP_PMTUDISC_WANT pointless for udp and raw sockets.
> >> 
> >> How so?
> >> 
> >> We are mimicking exactly what would happen if we had just created
> >> a new routing cache entry when the application openned the socket.
> >> 
> >> There is no behavioral difference whatsoever.
> >> 
> >> We absolutely do perform PMTU discovery, the first large packet
> >> will trigger it.  And then, as if we had lowered the PMTU in
> >> the routing cache entry, we will stop setting DF in the packets.
> > 
> > Maybe I missunderstood what you meant. I thought that you don't want
> > to update the pmtu cache informations at all on udp and raw.
> > If we update the pmtu cache informations with first large packet,
> > I agree absolutely.
> 
> We don't update the PMTU.
> 
> But we behave as if we did.
> 
> The only effect the IP_PMTUDISC_* values have is in deciding whether
> to set the DF flag in the outgoing packets.

With your patch applied, we stop setting the DF bit after we received
a 'need to frag' ICMP message, but we don't fragment. We send the packets
out unfragmented. Before we removed ip_rt_frag_needed(), we did the
fragmentation according to the pmtu informations we got from the icmp
message. Now the router with the low mtu has to do the fragmentation.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-14  5:35                       ` Steffen Klassert
@ 2012-06-14  5:42                         ` David Miller
  2012-06-14  5:58                           ` Steffen Klassert
  2012-06-14  5:59                           ` David Miller
  0 siblings, 2 replies; 19+ messages in thread
From: David Miller @ 2012-06-14  5:42 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 14 Jun 2012 07:35:29 +0200

> With your patch applied, we stop setting the DF bit after we
> received a 'need to frag' ICMP message, but we don't fragment. We
> send the packets out unfragmented. Before we removed
> ip_rt_frag_needed(), we did the fragmentation according to the pmtu
> informations we got from the icmp message. Now the router with the
> low mtu has to do the fragmentation.

Ok, then if we want to do the fragmentation locally then we have to
consider my initial patch which updates the PMTU in raw_err().

Did you test that?  I mean specifically, this patch:

http://marc.info/?l=linux-netdev&m=133945597319917&w=2

If it works for you, I will try to extend it to the other datagram
cases.

Thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-14  5:42                         ` David Miller
@ 2012-06-14  5:58                           ` Steffen Klassert
  2012-06-14  5:59                           ` David Miller
  1 sibling, 0 replies; 19+ messages in thread
From: Steffen Klassert @ 2012-06-14  5:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Wed, Jun 13, 2012 at 10:42:03PM -0700, David Miller wrote:
> 
> Ok, then if we want to do the fragmentation locally then we have to
> consider my initial patch which updates the PMTU in raw_err().
> 
> Did you test that?  I mean specifically, this patch:
> 
> http://marc.info/?l=linux-netdev&m=133945597319917&w=2
> 
> If it works for you, I will try to extend it to the other datagram
> cases.
> 

Yes, I can confirm that this one works. It restores the old behaviour.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-14  5:42                         ` David Miller
  2012-06-14  5:58                           ` Steffen Klassert
@ 2012-06-14  5:59                           ` David Miller
  2012-06-14  6:36                             ` Steffen Klassert
  1 sibling, 1 reply; 19+ messages in thread
From: David Miller @ 2012-06-14  5:59 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: David Miller <davem@davemloft.net>
Date: Wed, 13 Jun 2012 22:42:03 -0700 (PDT)

> From: Steffen Klassert <steffen.klassert@secunet.com>
> Date: Thu, 14 Jun 2012 07:35:29 +0200
> 
>> With your patch applied, we stop setting the DF bit after we
>> received a 'need to frag' ICMP message, but we don't fragment. We
>> send the packets out unfragmented. Before we removed
>> ip_rt_frag_needed(), we did the fragmentation according to the pmtu
>> informations we got from the icmp message. Now the router with the
>> low mtu has to do the fragmentation.
> 
> Ok, then if we want to do the fragmentation locally then we have to
> consider my initial patch which updates the PMTU in raw_err().
> 
> Did you test that?  I mean specifically, this patch:
> 
> http://marc.info/?l=linux-netdev&m=133945597319917&w=2
> 
> If it works for you, I will try to extend it to the other datagram
> cases.

Actually, thinking some more, we could extend my inet->pmtudisc patch
to achieve a similar effect.

Essentially we'd have a socket local PMTU value for datagram sockets.

Would you be OK with that approach?

I like the inet->pmtudisc way of solving this problem, because it:

1) Requires no special code to "remember" the flow used for the last
   socket sendmsg() call.

2) In the events of a malicious attempt to poison the routing cache
   PMTU information, only one socket will be harmed, rather than
   the whole system.

I tried to look for inspiration in other systems, but all of them lack
source based routing and other things we support, so they just use
a purely destination address based cache for PMTU information.

Other systems also don't have to deal with SO_BINDTODEVICE which
influences the route.

So we absolutely have to make our PMTU operations with the full
context used to emit the packet.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-14  5:59                           ` David Miller
@ 2012-06-14  6:36                             ` Steffen Klassert
  2012-06-14  6:54                               ` David Miller
  0 siblings, 1 reply; 19+ messages in thread
From: Steffen Klassert @ 2012-06-14  6:36 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Wed, Jun 13, 2012 at 10:59:41PM -0700, David Miller wrote:
> 
> Actually, thinking some more, we could extend my inet->pmtudisc patch
> to achieve a similar effect.
> 
> Essentially we'd have a socket local PMTU value for datagram sockets.
> 
> Would you be OK with that approach?

This would require to maintain socket local pmtu expire times too.
Also, which of these pmtu values do we report if a user asks for
that? And how should we flush all these pmtu values?

It could have some side effects. But if we get it to work,
it would be an improvement. I'm fine with everything
that works in the end :-)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed().
  2012-06-14  6:36                             ` Steffen Klassert
@ 2012-06-14  6:54                               ` David Miller
  0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2012-06-14  6:54 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 14 Jun 2012 08:36:32 +0200

> On Wed, Jun 13, 2012 at 10:59:41PM -0700, David Miller wrote:
>> 
>> Actually, thinking some more, we could extend my inet->pmtudisc patch
>> to achieve a similar effect.
>> 
>> Essentially we'd have a socket local PMTU value for datagram sockets.
>> 
>> Would you be OK with that approach?
> 
> This would require to maintain socket local pmtu expire times too.
> Also, which of these pmtu values do we report if a user asks for
> that? And how should we flush all these pmtu values?
> 
> It could have some side effects. But if we get it to work,
> it would be an improvement. I'm fine with everything
> that works in the end :-)

Your right, maybe the route updating approach is therefore better.

I'll play around with my original patch.

Thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2012-06-14  6:54 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-11  9:29 [PATCH 2/5] ipv4: Kill ip_rt_frag_needed() David Miller
2012-06-11 11:16 ` Steffen Klassert
2012-06-11 11:20   ` David Miller
2012-06-11 11:28     ` David Miller
2012-06-11 11:42       ` Steffen Klassert
2012-06-11 23:02         ` David Miller
2012-06-12 11:44           ` Steffen Klassert
2012-06-12 20:33             ` David Miller
2012-06-13  4:22               ` David Miller
2012-06-13  8:01               ` Steffen Klassert
2012-06-13  9:42                 ` David Miller
2012-06-13 10:07                   ` Steffen Klassert
2012-06-13 10:22                     ` David Miller
2012-06-14  5:35                       ` Steffen Klassert
2012-06-14  5:42                         ` David Miller
2012-06-14  5:58                           ` Steffen Klassert
2012-06-14  5:59                           ` David Miller
2012-06-14  6:36                             ` Steffen Klassert
2012-06-14  6:54                               ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.