All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] [NET] Do pmtu check in transport layer
@ 2007-03-24  0:06 John Heffner
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: John Heffner @ 2007-03-24  0:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, John Heffner

Do the pmtu check at the transport layer (for UDP, ICMP and raw), and
send a local error if the socket is PMTUDISC_DO and the packet is too big.  This is
actually a pure bugfix for ipv6.  For ipv4, it allows us to do pmtu checks
in the same way as for ipv6.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv4/ip_output.c  |    4 +++-
 net/ipv4/raw.c        |    8 +++++---
 net/ipv6/ip6_output.c |   11 ++++++-----
 net/ipv6/raw.c        |    7 +++++--
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d096332..593acf7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -822,7 +822,9 @@ int ip_append_data(struct sock *sk,
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
-	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+	if (inet->cork.length + length > 0xFFFF - fragheaderlen ||
+	    (inet->pmtudisc >= IP_PMTUDISC_DO &&
+	     inet->cork.length + length > mtu)) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 		return -EMSGSIZE;
 	}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 87e9c16..f252f4e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -271,10 +271,12 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	struct iphdr *iph;
 	struct sk_buff *skb;
 	int err;
+	int mtu;
 
-	if (length > rt->u.dst.dev->mtu) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
-			       rt->u.dst.dev->mtu);
+	mtu = inet->pmtudisc == IP_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
+	                                         rt->u.dst.dev->mtu;
+	if (length > mtu) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3055169..711dfc3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1044,11 +1044,12 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 
-	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
-		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
-			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
-			return -EMSGSIZE;
-		}
+	if ((mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN &&
+	     inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) ||
+	    (np->pmtudisc >= IPV6_PMTUDISC_DO &&
+	     inet->cork.length + length > mtu)) {
+		ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
+		return -EMSGSIZE;
 	}
 
 	/*
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 306d5d8..75db277 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -556,9 +556,12 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	struct sk_buff *skb;
 	unsigned int hh_len;
 	int err;
+	int mtu;
 
-	if (length > rt->u.dst.dev->mtu) {
-		ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
+	mtu = np->pmtudisc == IPV6_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
+	                                         rt->u.dst.dev->mtu;
+	if (length > mtu) {
+		ipv6_local_error(sk, EMSGSIZE, fl, mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
-- 
1.5.0.2.gc260-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/3] [NET] Move DF check to ip_forward
  2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
@ 2007-03-24  0:06 ` John Heffner
  2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
                     ` (2 more replies)
  2007-03-25  4:14 ` [PATCH 1/3] [NET] Do pmtu check in transport layer David Miller
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 26+ messages in thread
From: John Heffner @ 2007-03-24  0:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, John Heffner

Do fragmentation check in ip_forward, similar to ipv6 forwarding.  Also add
a debug printk in the DF check in ip_fragment since we should now never
reach it.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv4/ip_forward.c |    8 ++++++++
 net/ipv4/ip_output.c  |    2 ++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 369e721..0efb1f5 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -85,6 +85,14 @@ int ip_forward(struct sk_buff *skb)
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto sr_failed;
 
+	if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&
+	             (skb->nh.iph->frag_off & htons(IP_DF))) && !skb->local_df) {
+		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(dst_mtu(&rt->u.dst)));
+		goto drop;
+	}
+
 	/* We are about to mangle packet. Copy it! */
 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
 		goto drop;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 593acf7..90bdd53 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -433,6 +433,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 	iph = skb->nh.iph;
 
 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "ip_fragment: requested fragment of packet with DF set\n");
 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(dst_mtu(&rt->u.dst)));
-- 
1.5.0.2.gc260-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
@ 2007-03-24  0:06   ` John Heffner
  2007-03-25  4:23     ` David Miller
  2007-03-27 14:18     ` Andi Kleen
  2007-03-25  4:17   ` [PATCH 2/3] [NET] Move DF check to ip_forward David Miller
  2007-03-25 13:37   ` [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward Thomas Graf
  2 siblings, 2 replies; 26+ messages in thread
From: John Heffner @ 2007-03-24  0:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, John Heffner

Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER.  This option forces
us not to fragment, but does not make use of the kernel path MTU discovery. 
That is, it allows for user-mode MTU probing (or, packetization-layer path
MTU discovery).  This is particularly useful for diagnostic utilities, like
traceroute/tracepath.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 include/linux/in.h       |    1 +
 include/linux/in6.h      |    1 +
 include/linux/skbuff.h   |    3 ++-
 include/net/ip.h         |    2 +-
 net/core/skbuff.c        |    2 ++
 net/ipv4/ip_output.c     |   14 ++++++++++----
 net/ipv4/ip_sockglue.c   |    2 +-
 net/ipv4/raw.c           |    3 +++
 net/ipv6/ip6_output.c    |   12 ++++++++----
 net/ipv6/ipv6_sockglue.c |    2 +-
 net/ipv6/raw.c           |    3 +++
 11 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/include/linux/in.h b/include/linux/in.h
index 1912e7c..2dc1f8a 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,6 +83,7 @@ struct in_addr {
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
+#define IP_PMTUDISC_PROBE		3	/* Ignore dst pmtu	*/
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index 4e8350a..d559fac 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,6 +179,7 @@ struct in6_flowlabel_req
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
 #define IPV6_PMTUDISC_DO		2
+#define IPV6_PMTUDISC_PROBE		3
 
 /* Flowlabel */
 #define IPV6_FLOWLABEL_MGR	32
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4ff3940..64038b4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -284,7 +284,8 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1;
+				ipvs_property:1,
+				ign_dst_mtu;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
diff --git a/include/net/ip.h b/include/net/ip.h
index e79c3e3..f5874a3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -201,7 +201,7 @@ int ip_decrease_ttl(struct iphdr *iph)
 static inline
 int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 {
-	return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
+	return (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_DO ||
 		(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
 		 !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 702fa8f..5c8515c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -474,6 +474,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
 #endif
+	C(ign_dst_mtu);
 	C(protocol);
 	n->destructor = NULL;
 	C(mark);
@@ -549,6 +550,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
+	new->ign_dst_mtu	= old->ign_dst_mtu;
 #ifdef CONFIG_BRIDGE_NETFILTER
 	new->nf_bridge	= old->nf_bridge;
 	nf_bridge_get(old->nf_bridge);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 90bdd53..a7e8944 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -201,7 +201,8 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
+	if (skb->len > dst_mtu(skb->dst) &&
+	    !skb->ign_dst_mtu && !skb_is_gso(skb))
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -801,7 +802,9 @@ int ip_append_data(struct sock *sk,
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+		                            rt->u.dst.dev->mtu :
+		                            dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
@@ -1220,13 +1223,16 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc != IP_PMTUDISC_DO)
+	if (inet->pmtudisc < IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
+	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
+		skb->ign_dst_mtu = 1;
+
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 23048d9..98fa088 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -536,7 +536,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			inet->hdrincl = val ? 1 : 0;
 			break;
 		case IP_MTU_DISCOVER:
-			if (val<0 || val>2)
+			if (val<0 || val>3)
 				goto e_inval;
 			inet->pmtudisc = val;
 			break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f252f4e..f562262 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -302,6 +302,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	if (err)
 		goto error_fault;
 
+	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
+		skb->ign_dst_mtu = 1;
+
 	/* We don't modify invalid header */
 	if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
 		if (!iph->saddr)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 711dfc3..8b8c04b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -139,8 +139,8 @@ static int ip6_output2(struct sk_buff *skb)
 
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
-				dst_allfrag(skb->dst))
+	if ((skb->len > dst_mtu(skb->dst) && !skb->ign_dst_mtu &&
+	     !skb_is_gso(skb)) || dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
 		return ip6_output2(skb);
@@ -574,7 +574,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = skb->ign_dst_mtu ? skb->len : dst_mtu(&rt->u.dst);
 	if (np && np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
@@ -1015,7 +1015,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = dst_mtu(rt->u.dst.path);
+		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
@@ -1303,6 +1304,9 @@ int ip6_push_pending_frames(struct sock *sk)
 		tmp_skb->sk = NULL;
 	}
 
+	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
+		skb->ign_dst_mtu = 1;
+
 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
 	__skb_pull(skb, skb->h.raw - skb->nh.raw);
 	if (opt && opt->opt_flen)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f5f9582..6e88597 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
 		retv = ip6_ra_control(sk, val, NULL);
 		break;
 	case IPV6_MTU_DISCOVER:
-		if (val<0 || val>2)
+		if (val<0 || val>3)
 			goto e_inval;
 		np->pmtudisc = val;
 		retv = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 75db277..9ef0946 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -587,6 +587,9 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	if (err)
 		goto error_fault;
 
+	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
+		skb->ign_dst_mtu = 1;
+
 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 		      dst_output);
-- 
1.5.0.2.gc260-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] [NET] Do pmtu check in transport layer
  2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
@ 2007-03-25  4:14 ` David Miller
  2007-04-09  8:40 ` Patrick McHardy
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-03-25  4:14 UTC (permalink / raw)
  To: jheffner; +Cc: netdev

From: John Heffner <jheffner@psc.edu>
Date: Fri, 23 Mar 2007 20:06:44 -0400

> Check the pmtu check at the transport layer (for UDP, ICMP and raw), and
> send a local error if socket is PMTUDISC_DO and packet is too big.  This is
> actually a pure bugfix for ipv6.  For ipv4, it allows us to do pmtu checks
> in the same way as for ipv6.
> 
> Signed-off-by: John Heffner <jheffner@psc.edu>

Applied, thanks John.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] [NET] Move DF check to ip_forward
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
  2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
@ 2007-03-25  4:17   ` David Miller
  2007-03-25 13:37   ` [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward Thomas Graf
  2 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-03-25  4:17 UTC (permalink / raw)
  To: jheffner; +Cc: netdev

From: John Heffner <jheffner@psc.edu>
Date: Fri, 23 Mar 2007 20:06:45 -0400

> Do fragmentation check in ip_forward, similar to ipv6 forwarding.  Also add
> a debug printk in the DF check in ip_fragment since we should now never
> reach it.
> 
> Signed-off-by: John Heffner <jheffner@psc.edu>

I don't think this debugging log message makes sense considering
netfilter can potentially do interesting things to the packet.

Therefore I applied your patch without the log message; we can
discuss its merits separately, as I like the rest of this patch.

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
@ 2007-03-25  4:23     ` David Miller
  2007-03-27 14:18     ` Andi Kleen
  1 sibling, 0 replies; 26+ messages in thread
From: David Miller @ 2007-03-25  4:23 UTC (permalink / raw)
  To: jheffner; +Cc: netdev

From: John Heffner <jheffner@psc.edu>
Date: Fri, 23 Mar 2007 20:06:46 -0400

> Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER.  This option forces
> us not to fragment, but does not make use of the kernel path MTU discovery. 
> That is, it allows for user-mode MTU probing (or, packetization-layer path
> MTU discovery).  This is particularly useful for diagnostic utilities, like
> traceroute/tracepath.
> 
> Signed-off-by: John Heffner <jheffner@psc.edu>

Also applied to net-2.6.22, thanks John.

I made a slight change:

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4ff3940..64038b4 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -284,7 +284,8 @@ struct sk_buff {
>  				nfctinfo:3;
>  	__u8			pkt_type:3,
>  				fclone:2,
> -				ipvs_property:1;
> +				ipvs_property:1,
> +				ign_dst_mtu;
>  	__be16			protocol;
>  
>  	void			(*destructor)(struct sk_buff *skb);

I marked "ign_dst_mtu" with a bit field size of one, since this
appears to be a boolean; I take it this is what you meant to do
here.  Otherwise it adds another __u8 to struct sk_buff. :-)


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
  2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
  2007-03-25  4:17   ` [PATCH 2/3] [NET] Move DF check to ip_forward David Miller
@ 2007-03-25 13:37   ` Thomas Graf
  2007-03-25 20:27     ` David Miller
  2 siblings, 1 reply; 26+ messages in thread
From: Thomas Graf @ 2007-03-25 13:37 UTC (permalink / raw)
  To: davem; +Cc: jheffner, netdev

The patch [NET] Move DF check to ip_forward broke net-2.6.22 as
skb->nh has been renamed to skb->network_header. Use ip_hdr().

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.22/net/ipv4/ip_forward.c
===================================================================
--- net-2.6.22.orig/net/ipv4/ip_forward.c	2007-03-25 15:31:32.000000000 +0200
+++ net-2.6.22/net/ipv4/ip_forward.c	2007-03-25 15:32:21.000000000 +0200
@@ -86,7 +86,7 @@ int ip_forward(struct sk_buff *skb)
 		goto sr_failed;
 
 	if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&
-	             (skb->nh.iph->frag_off & htons(IP_DF))) && !skb->local_df) {
+	             (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(dst_mtu(&rt->u.dst)));

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward
  2007-03-25 13:37   ` [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward Thomas Graf
@ 2007-03-25 20:27     ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-03-25 20:27 UTC (permalink / raw)
  To: tgraf; +Cc: jheffner, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Sun, 25 Mar 2007 15:37:31 +0200

> The patch [NET] Move DF check to ip_forward broke net-2.6.22 as
> skb->nh has been renamed to skb->network_header. Use ip_hdr().
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>

Thanks Thomas.

John, I'm extremely irritated.  You send me what is obviously a feature
patch, which I'm obviously NOT going to apply to the current upstream
for 2.6.21, which is only accepting bug fixes, yet you do not test
compile your work and build your patches against my net-2.6.22 tree
which is where such feature patches are going to get applied?

That's just irresponsible, please don't do that again or you'll
be on my shit list for a good long while.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
  2007-03-25  4:23     ` David Miller
@ 2007-03-27 14:18     ` Andi Kleen
       [not found]       ` <4609640D.7010709@psc.edu>
  1 sibling, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2007-03-27 14:18 UTC (permalink / raw)
  To: John Heffner; +Cc: David Miller, netdev, Michael Kerrisk

John Heffner <jheffner@psc.edu> writes:

> Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER.  This option forces
> us not to fragment, but does not make use of the kernel path MTU discovery. 
> That is, it allows for user-mode MTU probing (or, packetization-layer path
> MTU discovery).  This is particularly useful for diagnostic utilities, like
> traceroute/tracepath.

You should probably send a manpages update to the manpages maintainer too
(cc'ed with fullquote)

-Andi

> 
> Signed-off-by: John Heffner <jheffner@psc.edu>
> ---
>  include/linux/in.h       |    1 +
>  include/linux/in6.h      |    1 +
>  include/linux/skbuff.h   |    3 ++-
>  include/net/ip.h         |    2 +-
>  net/core/skbuff.c        |    2 ++
>  net/ipv4/ip_output.c     |   14 ++++++++++----
>  net/ipv4/ip_sockglue.c   |    2 +-
>  net/ipv4/raw.c           |    3 +++
>  net/ipv6/ip6_output.c    |   12 ++++++++----
>  net/ipv6/ipv6_sockglue.c |    2 +-
>  net/ipv6/raw.c           |    3 +++
>  11 files changed, 33 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/in.h b/include/linux/in.h
> index 1912e7c..2dc1f8a 100644
> --- a/include/linux/in.h
> +++ b/include/linux/in.h
> @@ -83,6 +83,7 @@ struct in_addr {
>  #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
>  #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
>  #define IP_PMTUDISC_DO			2	/* Always DF		*/
> +#define IP_PMTUDISC_PROBE		3	/* Ignore dst pmtu	*/
>  
>  #define IP_MULTICAST_IF			32
>  #define IP_MULTICAST_TTL 		33
> diff --git a/include/linux/in6.h b/include/linux/in6.h
> index 4e8350a..d559fac 100644
> --- a/include/linux/in6.h
> +++ b/include/linux/in6.h
> @@ -179,6 +179,7 @@ struct in6_flowlabel_req
>  #define IPV6_PMTUDISC_DONT		0
>  #define IPV6_PMTUDISC_WANT		1
>  #define IPV6_PMTUDISC_DO		2
> +#define IPV6_PMTUDISC_PROBE		3
>  
>  /* Flowlabel */
>  #define IPV6_FLOWLABEL_MGR	32
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4ff3940..64038b4 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -284,7 +284,8 @@ struct sk_buff {
>  				nfctinfo:3;
>  	__u8			pkt_type:3,
>  				fclone:2,
> -				ipvs_property:1;
> +				ipvs_property:1,
> +				ign_dst_mtu;
>  	__be16			protocol;
>  
>  	void			(*destructor)(struct sk_buff *skb);
> diff --git a/include/net/ip.h b/include/net/ip.h
> index e79c3e3..f5874a3 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -201,7 +201,7 @@ int ip_decrease_ttl(struct iphdr *iph)
>  static inline
>  int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
>  {
> -	return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
> +	return (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_DO ||
>  		(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
>  		 !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
>  }
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 702fa8f..5c8515c 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -474,6 +474,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
>  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
>  	C(ipvs_property);
>  #endif
> +	C(ign_dst_mtu);
>  	C(protocol);
>  	n->destructor = NULL;
>  	C(mark);
> @@ -549,6 +550,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
>  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
>  	new->ipvs_property = old->ipvs_property;
>  #endif
> +	new->ign_dst_mtu	= old->ign_dst_mtu;
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	new->nf_bridge	= old->nf_bridge;
>  	nf_bridge_get(old->nf_bridge);
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 90bdd53..a7e8944 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -201,7 +201,8 @@ static inline int ip_finish_output(struct sk_buff *skb)
>  		return dst_output(skb);
>  	}
>  #endif
> -	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
> +	if (skb->len > dst_mtu(skb->dst) &&
> +	    !skb->ign_dst_mtu && !skb_is_gso(skb))
>  		return ip_fragment(skb, ip_finish_output2);
>  	else
>  		return ip_finish_output2(skb);
> @@ -801,7 +802,9 @@ int ip_append_data(struct sock *sk,
>  			inet->cork.addr = ipc->addr;
>  		}
>  		dst_hold(&rt->u.dst);
> -		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
> +		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
> +		                            rt->u.dst.dev->mtu :
> +		                            dst_mtu(rt->u.dst.path);
>  		inet->cork.rt = rt;
>  		inet->cork.length = 0;
>  		sk->sk_sndmsg_page = NULL;
> @@ -1220,13 +1223,16 @@ int ip_push_pending_frames(struct sock *sk)
>  	 * to fragment the frame generated here. No matter, what transforms
>  	 * how transforms change size of the packet, it will come out.
>  	 */
> -	if (inet->pmtudisc != IP_PMTUDISC_DO)
> +	if (inet->pmtudisc < IP_PMTUDISC_DO)
>  		skb->local_df = 1;
>  
> +	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
> +		skb->ign_dst_mtu = 1;
> +
>  	/* DF bit is set when we want to see DF on outgoing frames.
>  	 * If local_df is set too, we still allow to fragment this frame
>  	 * locally. */
> -	if (inet->pmtudisc == IP_PMTUDISC_DO ||
> +	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
>  	    (skb->len <= dst_mtu(&rt->u.dst) &&
>  	     ip_dont_fragment(sk, &rt->u.dst)))
>  		df = htons(IP_DF);
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 23048d9..98fa088 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -536,7 +536,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
>  			inet->hdrincl = val ? 1 : 0;
>  			break;
>  		case IP_MTU_DISCOVER:
> -			if (val<0 || val>2)
> +			if (val<0 || val>3)
>  				goto e_inval;
>  			inet->pmtudisc = val;
>  			break;
> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
> index f252f4e..f562262 100644
> --- a/net/ipv4/raw.c
> +++ b/net/ipv4/raw.c
> @@ -302,6 +302,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
>  	if (err)
>  		goto error_fault;
>  
> +	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
> +		skb->ign_dst_mtu = 1;
> +
>  	/* We don't modify invalid header */
>  	if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
>  		if (!iph->saddr)
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 711dfc3..8b8c04b 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -139,8 +139,8 @@ static int ip6_output2(struct sk_buff *skb)
>  
>  int ip6_output(struct sk_buff *skb)
>  {
> -	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
> -				dst_allfrag(skb->dst))
> +	if ((skb->len > dst_mtu(skb->dst) && !skb->ign_dst_mtu &&
> +	     !skb_is_gso(skb)) || dst_allfrag(skb->dst))
>  		return ip6_fragment(skb, ip6_output2);
>  	else
>  		return ip6_output2(skb);
> @@ -574,7 +574,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
>  	hlen = ip6_find_1stfragopt(skb, &prevhdr);
>  	nexthdr = *prevhdr;
>  
> -	mtu = dst_mtu(&rt->u.dst);
> +	mtu = skb->ign_dst_mtu ? skb->len : dst_mtu(&rt->u.dst);
>  	if (np && np->frag_size < mtu) {
>  		if (np->frag_size)
>  			mtu = np->frag_size;
> @@ -1015,7 +1015,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
>  		inet->cork.fl = *fl;
>  		np->cork.hop_limit = hlimit;
>  		np->cork.tclass = tclass;
> -		mtu = dst_mtu(rt->u.dst.path);
> +		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
> +		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
>  		if (np->frag_size < mtu) {
>  			if (np->frag_size)
>  				mtu = np->frag_size;
> @@ -1303,6 +1304,9 @@ int ip6_push_pending_frames(struct sock *sk)
>  		tmp_skb->sk = NULL;
>  	}
>  
> +	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
> +		skb->ign_dst_mtu = 1;
> +
>  	ipv6_addr_copy(final_dst, &fl->fl6_dst);
>  	__skb_pull(skb, skb->h.raw - skb->nh.raw);
>  	if (opt && opt->opt_flen)
> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
> index f5f9582..6e88597 100644
> --- a/net/ipv6/ipv6_sockglue.c
> +++ b/net/ipv6/ipv6_sockglue.c
> @@ -694,7 +694,7 @@ done:
>  		retv = ip6_ra_control(sk, val, NULL);
>  		break;
>  	case IPV6_MTU_DISCOVER:
> -		if (val<0 || val>2)
> +		if (val<0 || val>3)
>  			goto e_inval;
>  		np->pmtudisc = val;
>  		retv = 0;
> diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
> index 75db277..9ef0946 100644
> --- a/net/ipv6/raw.c
> +++ b/net/ipv6/raw.c
> @@ -587,6 +587,9 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
>  	if (err)
>  		goto error_fault;
>  
> +	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
> +		skb->ign_dst_mtu = 1;
> +
>  	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
>  	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
>  		      dst_output);
> -- 
> 1.5.0.2.gc260-dirty
> 
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] ip(7) IP_PMTUDISC_PROBE
       [not found]         ` <20070327193115.GA28138@one.firstfloor.org>
@ 2007-03-27 19:52           ` John Heffner
  2007-04-08 18:08             ` Michael Kerrisk
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-03-27 19:52 UTC (permalink / raw)
  To: Michael Kerrisk; +Cc: Andi Kleen, netdev

[-- Attachment #1: Type: text/plain, Size: 100 bytes --]

Document the new IP_PMTUDISC_PROBE value for IP_MTU_DISCOVER.  (Going into 
2.6.22).

Thanks,
   -John

[-- Attachment #2: ip_mtu_probe.patch --]
[-- Type: text/plain, Size: 919 bytes --]

diff -rU3 man-pages-2.43-a/man7/ip.7 man-pages-2.43-b/man7/ip.7
--- man-pages-2.43-a/man7/ip.7	2006-09-26 09:54:29.000000000 -0400
+++ man-pages-2.43-b/man7/ip.7	2007-03-27 15:46:18.000000000 -0400
@@ -515,6 +515,7 @@
 IP_PMTUDISC_WANT:Use per-route settings.
 IP_PMTUDISC_DONT:Never do Path MTU Discovery.
 IP_PMTUDISC_DO:Always do Path MTU Discovery. 
+IP_PMTUDISC_PROBE:Set DF but ignore Path MTU.
 .TE   
 
 When PMTU discovery is enabled the kernel automatically keeps track of
@@ -550,6 +551,15 @@
 with the
 .B IP_MTU
 option.     
+
+It is possible to implement RFC 4821 MTU probing with
+.B SOCK_DGRAM
+or
+.B SOCK_RAW
+sockets by setting a value of IP_PMTUDISC_PROBE.  This is also particularly
+useful for diagnostic tools such as
+.BR tracepath (8)
+that wish to deliberately send probe packets larger than the observed Path MTU.
 .TP
 .B IP_MTU
 Retrieve the current known path MTU of the current socket. 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] ip(7) IP_PMTUDISC_PROBE
  2007-03-27 19:52           ` [PATCH] ip(7) IP_PMTUDISC_PROBE John Heffner
@ 2007-04-08 18:08             ` Michael Kerrisk
  0 siblings, 0 replies; 26+ messages in thread
From: Michael Kerrisk @ 2007-04-08 18:08 UTC (permalink / raw)
  To: John Heffner; +Cc: Andi Kleen, netdev

> > Document new IP_PMTUDISC_PROBE value for IP_MTU_DISCOVERY.  (Going into
> > 2.6.22).

Hi John,

Thanks -- accepted -- fix will appear in man-pages-2.47.

Andi: thanks for pointing John in the right direction.

Cheers,

Michael


> > ------------------------------------------------------------------------
> >
> > diff -rU3 man-pages-2.43-a/man7/ip.7 man-pages-2.43-b/man7/ip.7
> > --- man-pages-2.43-a/man7/ip.7	2006-09-26 09:54:29.000000000 -0400
> > +++ man-pages-2.43-b/man7/ip.7	2007-03-27 15:46:18.000000000 -0400

-- 
Michael Kerrisk
maintainer of Linux man pages Sections 2, 3, 4, 5, and 7
Want to help with man page maintenance?
Grab the latest tarball at http://www.kernel.org/pub/linux/docs/manpages/
read the HOWTOHELP file and grep the source files for 'FIXME'.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] [NET] Do pmtu check in transport layer
  2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
  2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
  2007-03-25  4:14 ` [PATCH 1/3] [NET] Do pmtu check in transport layer David Miller
@ 2007-04-09  8:40 ` Patrick McHardy
  2007-04-09 16:23   ` John Heffner
  2007-04-19  1:07   ` [PATCH 0/0] Re-try changes for PMTUDISC_PROBE John Heffner
  2007-04-19  1:07 ` [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
  2007-04-19  1:09 ` [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
  4 siblings, 2 replies; 26+ messages in thread
From: Patrick McHardy @ 2007-04-09  8:40 UTC (permalink / raw)
  To: John Heffner; +Cc: David Miller, netdev

John Heffner wrote:
> Check the pmtu check at the transport layer (for UDP, ICMP and raw), and
> send a local error if socket is PMTUDISC_DO and packet is too big.  This is
> actually a pure bugfix for ipv6.  For ipv4, it allows us to do pmtu checks
> in the same way as for ipv6.
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index d096332..593acf7 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -822,7 +822,9 @@ int ip_append_data(struct sock *sk,
>  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
>  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
>  
> -	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
> +	if (inet->cork.length + length > 0xFFFF - fragheaderlen ||
> +	    (inet->pmtudisc >= IP_PMTUDISC_DO &&
> +	     inet->cork.length + length > mtu)) {
>  		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
>  		return -EMSGSIZE;
>  	}


This makes ping report an incorrect MTU when IPsec is used since we're
only accounting for the additional header_len, not the trailer_len
(which is not easily changeable). Additionally it will report different
MTUs for the first and following fragments when the socket is corked
because only the first fragment includes the header_len. It also can't
deal with things like NAT and routing by fwmark that change the route.
The old behaviour was that we get an ICMP frag. required with the MTU
of the final route, while this will always report the MTU of the
initially chosen route.

For all these reasons I think it should be reverted to the old
behaviour.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] [NET] Do pmtu check in transport layer
  2007-04-09  8:40 ` Patrick McHardy
@ 2007-04-09 16:23   ` John Heffner
  2007-04-09 16:40     ` Patrick McHardy
  2007-04-19  1:07   ` [PATCH 0/0] Re-try changes for PMTUDISC_PROBE John Heffner
  1 sibling, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-09 16:23 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David Miller, netdev

Patrick McHardy wrote:
> John Heffner wrote:
>> Check the pmtu check at the transport layer (for UDP, ICMP and raw), and
>> send a local error if socket is PMTUDISC_DO and packet is too big.  This is
>> actually a pure bugfix for ipv6.  For ipv4, it allows us to do pmtu checks
>> in the same way as for ipv6.
>>
>> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
>> index d096332..593acf7 100644
>> --- a/net/ipv4/ip_output.c
>> +++ b/net/ipv4/ip_output.c
>> @@ -822,7 +822,9 @@ int ip_append_data(struct sock *sk,
>>  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
>>  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
>>  
>> -	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
>> +	if (inet->cork.length + length > 0xFFFF - fragheaderlen ||
>> +	    (inet->pmtudisc >= IP_PMTUDISC_DO &&
>> +	     inet->cork.length + length > mtu)) {
>>  		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
>>  		return -EMSGSIZE;
>>  	}
> 
> 
> This makes ping report an incorrect MTU when IPsec is used since we're
> only accounting for the additional header_len, not the trailer_len
> (which is not easily changeable). Additionally it will report different
> MTUs for the first and following fragments when the socket is corked
> because only the first fragment includes the header_len. It also can't
> deal with things like NAT and routing by fwmark that change the route.
> The old behaviour was that we get an ICMP frag. required with the MTU
> of the final route, while this will always report the MTU of the
> initially chosen route.
> 
> For all these reasons I think it should be reverted to the old
> behaviour.

You're right, this is no good.  I think the other problems are fixable, 
but NAT really screws this.

Unfortunately, there is still a real problem with ipv6, in that the 
output side does not generate a packet too big ICMP like ipv4.  Also, it 
feels kind of undesirable to rely on local ICMP instead of direct error 
message delivery.  I'll try to generate a new patch.

Thanks,
   -John

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/3] [NET] Do pmtu check in transport layer
  2007-04-09 16:23   ` John Heffner
@ 2007-04-09 16:40     ` Patrick McHardy
  0 siblings, 0 replies; 26+ messages in thread
From: Patrick McHardy @ 2007-04-09 16:40 UTC (permalink / raw)
  To: John Heffner; +Cc: David Miller, netdev

John Heffner wrote:
> Patrick McHardy wrote:
> 
>> This makes ping report an incorrect MTU when IPsec is used since we're
>> only accounting for the additional header_len, not the trailer_len
>> (which is not easily changeable). Additionally it will report different
>> MTUs for the first and following fragments when the socket is corked
>> because only the first fragment includes the header_len. It also can't
>> deal with things like NAT and routing by fwmark that change the route.
>> The old behaviour was that we get an ICMP frag. required with the MTU
>> of the final route, while this will always report the MTU of the
>> initially chosen route.
>>
>> For all these reasons I think it should be reverted to the old
>> behaviour.
> 
> 
> You're right, this is no good.  I think the other problems are fixable,
> but NAT really screws this.


Routing by fwmark is also unfixable and IPsec is quite hard.

> Unfortunately, there is still a real problem with ipv6, in that the
> output side does not generate a packet too big ICMP like ipv4.  Also, it
> feels kind of undesirable to rely on local ICMP instead of direct error
> message delivery.  I'll try to generate a new patch.


I think it's necessary since at the transport layer we simply don't
have all the information about what's going to happen to a packet.
IPv6 now also supports routing by fwmark, so it has the same problem
if it doesn't generate packet too big messages.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 0/0] Re-try changes for PMTUDISC_PROBE
  2007-04-09  8:40 ` Patrick McHardy
  2007-04-09 16:23   ` John Heffner
@ 2007-04-19  1:07   ` John Heffner
  2007-04-20 22:55     ` David Miller
  1 sibling, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:07 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev

This backs out the transport layer MTU checks that don't work.  As a 
consequence, I had to back out the PMTUDISC_PROBE patch as well.  These 
patches should fix the problem with ipv6 that the transport layer change 
tried to address, and re-implement PMTUDISC_PROBE.  I think this 
approach is nicer than the last one, since it doesn't require a bit in 
struct sk_buff.

Thanks,
   -John

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE"
  2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
                   ` (2 preceding siblings ...)
  2007-04-09  8:40 ` Patrick McHardy
@ 2007-04-19  1:07 ` John Heffner
  2007-04-19  1:07   ` [PATCH] Revert "[NET] Do pmtu check in transport layer" John Heffner
  2007-04-19  1:09 ` [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
  4 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:07 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

This reverts commit d21d2a90b879c0cf159df5944847e6d9833816eb.

Must be backed out because commit 87e927a0583bd4a8ba9e97cd75b58d8aa1c76e37
does not work.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 include/linux/in.h       |    1 -
 include/linux/in6.h      |    1 -
 include/linux/skbuff.h   |    3 +--
 include/net/ip.h         |    2 +-
 net/core/skbuff.c        |    2 --
 net/ipv4/ip_output.c     |   14 ++++----------
 net/ipv4/ip_sockglue.c   |    2 +-
 net/ipv4/raw.c           |    3 ---
 net/ipv6/ip6_output.c    |   12 ++++--------
 net/ipv6/ipv6_sockglue.c |    2 +-
 net/ipv6/raw.c           |    3 ---
 11 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/include/linux/in.h b/include/linux/in.h
index 2dc1f8a..1912e7c 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,7 +83,6 @@ struct in_addr {
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
-#define IP_PMTUDISC_PROBE		3	/* Ignore dst pmtu	*/
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index d559fac..4e8350a 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,7 +179,6 @@ struct in6_flowlabel_req
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
 #define IPV6_PMTUDISC_DO		2
-#define IPV6_PMTUDISC_PROBE		3
 
 /* Flowlabel */
 #define IPV6_FLOWLABEL_MGR	32
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8bf9b9f..7f17cfc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -277,8 +277,7 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1,
-				ign_dst_mtu:1;
+				ipvs_property:1;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
diff --git a/include/net/ip.h b/include/net/ip.h
index 6a08b65..75f226d 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -206,7 +206,7 @@ int ip_decrease_ttl(struct iphdr *iph)
 static inline
 int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 {
-	return (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_DO ||
+	return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
 		(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
 		 !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2391cdf..f0d986a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -479,7 +479,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
 #endif
-	C(ign_dst_mtu);
 	C(protocol);
 	n->destructor = NULL;
 	C(mark);
@@ -543,7 +542,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
-	new->ign_dst_mtu	= old->ign_dst_mtu;
 #ifdef CONFIG_NET_SCHED
 #ifdef CONFIG_NET_CLS_ACT
 	new->tc_verd = old->tc_verd;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 704bc44..79e71ee 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,8 +198,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) &&
-	    !skb->ign_dst_mtu && !skb_is_gso(skb))
+	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -788,9 +787,7 @@ int ip_append_data(struct sock *sk,
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-		                            rt->u.dst.dev->mtu :
-		                            dst_mtu(rt->u.dst.path);
+		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
@@ -1208,16 +1205,13 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
+	if (inet->pmtudisc != IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
-	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4d54457..c199d23 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -542,7 +542,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val<0 || val>3)
+		if (val<0 || val>2)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index addb786..c60aadf 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -304,9 +304,6 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	if (err)
 		goto error_fault;
 
-	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	/* We don't modify invalid header */
 	if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
 		if (!iph->saddr)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 78317a4..b8e307a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -139,8 +139,8 @@ static int ip6_output2(struct sk_buff *skb)
 
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb->ign_dst_mtu &&
-	     !skb_is_gso(skb)) || dst_allfrag(skb->dst))
+	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
+				dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
 		return ip6_output2(skb);
@@ -566,7 +566,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = skb->ign_dst_mtu ? skb->len : dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if (np && np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
@@ -1050,8 +1050,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
-		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
+		mtu = dst_mtu(rt->u.dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
@@ -1339,9 +1338,6 @@ int ip6_push_pending_frames(struct sock *sk)
 		tmp_skb->sk = NULL;
 	}
 
-	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
 	__skb_pull(skb, skb_network_header_len(skb));
 	if (opt && opt->opt_flen)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index aa3d07c..da930fa 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
 		retv = ip6_ra_control(sk, val, NULL);
 		break;
 	case IPV6_MTU_DISCOVER:
-		if (val<0 || val>3)
+		if (val<0 || val>2)
 			goto e_inval;
 		np->pmtudisc = val;
 		retv = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c9943..f4cd90b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -591,9 +591,6 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	if (err)
 		goto error_fault;
 
-	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 		      dst_output);
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] Revert "[NET] Do pmtu check in transport layer"
  2007-04-19  1:07 ` [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
@ 2007-04-19  1:07   ` John Heffner
  2007-04-19  1:07     ` [PATCH] [NET] MTU discovery check in ip6_fragment() John Heffner
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:07 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

This reverts commit 87e927a0583bd4a8ba9e97cd75b58d8aa1c76e37.

This idea does not work, as pointed out by Patrick McHardy.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv4/ip_output.c  |    4 +---
 net/ipv4/raw.c        |    8 +++-----
 net/ipv6/ip6_output.c |   11 +++++------
 net/ipv6/raw.c        |    7 ++-----
 4 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 79e71ee..34606ef 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -810,9 +810,7 @@ int ip_append_data(struct sock *sk,
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
-	if (inet->cork.length + length > 0xFFFF - fragheaderlen ||
-	    (inet->pmtudisc >= IP_PMTUDISC_DO &&
-	     inet->cork.length + length > mtu)) {
+	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 		return -EMSGSIZE;
 	}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c60aadf..24d7c9f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -271,12 +271,10 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	struct iphdr *iph;
 	struct sk_buff *skb;
 	int err;
-	int mtu;
 
-	mtu = inet->pmtudisc == IP_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
-	                                         rt->u.dst.dev->mtu;
-	if (length > mtu) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+	if (length > rt->u.dst.dev->mtu) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+			       rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b8e307a..4cfdad4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1079,12 +1079,11 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 
-	if ((mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN &&
-	     inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) ||
-	    (np->pmtudisc >= IPV6_PMTUDISC_DO &&
-	     inet->cork.length + length > mtu)) {
-		ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
-		return -EMSGSIZE;
+	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
+			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
+			return -EMSGSIZE;
+		}
 	}
 
 	/*
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f4cd90b..f65fcd7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -558,12 +558,9 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	struct sk_buff *skb;
 	unsigned int hh_len;
 	int err;
-	int mtu;
 
-	mtu = np->pmtudisc == IPV6_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
-	                                         rt->u.dst.dev->mtu;
-	if (length > mtu) {
-		ipv6_local_error(sk, EMSGSIZE, fl, mtu);
+	if (length > rt->u.dst.dev->mtu) {
+		ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] [NET] MTU discovery check in ip6_fragment()
  2007-04-19  1:07   ` [PATCH] Revert "[NET] Do pmtu check in transport layer" John Heffner
@ 2007-04-19  1:07     ` John Heffner
  2007-04-19  1:07       ` [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:07 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

Adds a check in ip6_fragment() mirroring ip_fragment() for packets
that we can't fragment, and sends an ICMP Packet Too Big message
in response.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv6/ip6_output.c |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4cfdad4..5a5b7d4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -567,6 +567,19 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	nexthdr = *prevhdr;
 
 	mtu = dst_mtu(&rt->u.dst);
+
+	/* We must not fragment if the socket is set to force MTU discovery
+	 * or if the skb it not generated by a local socket.  (This last
+	 * check should be redundant, but it's free.)
+	 */
+	if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
+		skb->dev = skb->dst->dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
 	if (np && np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-04-19  1:07     ` [PATCH] [NET] MTU discovery check in ip6_fragment() John Heffner
@ 2007-04-19  1:07       ` John Heffner
  2007-04-19  1:11         ` John Heffner
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:07 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER.  This option forces
us not to fragment, but does not make use of the kernel path MTU discovery.
That is, it allows for user-mode MTU probing (or, packetization-layer path
MTU discovery).  This is particularly useful for diagnostic utilities, like
traceroute/tracepath.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 include/linux/in.h       |    1 +
 include/linux/in6.h      |    1 +
 net/ipv4/ip_output.c     |   20 +++++++++++++++-----
 net/ipv4/ip_sockglue.c   |    2 +-
 net/ipv6/ip6_output.c    |   15 ++++++++++++---
 net/ipv6/ipv6_sockglue.c |    2 +-
 6 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/include/linux/in.h b/include/linux/in.h
index 1912e7c..3975cbf 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,6 +83,7 @@ struct in_addr {
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
+#define IP_PMTUDISC_PROBE		3       /* Ignore dst pmtu      */
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index 4e8350a..d559fac 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,6 +179,7 @@ struct in6_flowlabel_req
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
 #define IPV6_PMTUDISC_DO		2
+#define IPV6_PMTUDISC_PROBE		3
 
 /* Flowlabel */
 #define IPV6_FLOWLABEL_MGR	32
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 34606ef..66e2c3a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -189,6 +189,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	return -EINVAL;
 }
 
+static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+
+	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
+	       skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
 static inline int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -198,7 +206,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
+	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -422,7 +430,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(dst_mtu(&rt->u.dst)));
+			  htonl(ip_skb_dst_mtu(skb)));
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -787,7 +795,9 @@ int ip_append_data(struct sock *sk,
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+					    rt->u.dst.dev->mtu : 
+					    dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
@@ -1203,13 +1213,13 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc != IP_PMTUDISC_DO)
+	if (inet->pmtudisc < IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c199d23..4d54457 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -542,7 +542,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val<0 || val>2)
+		if (val<0 || val>3)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5a5b7d4..f508171 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -137,9 +137,17 @@ static int ip6_output2(struct sk_buff *skb)
 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
+static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+
+	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
+	       skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
+	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 				dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
@@ -566,7 +574,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = ip6_skb_dst_mtu(skb);
 
 	/* We must not fragment if the socket is set to force MTU discovery
 	 * or if the skb it not generated by a local socket.  (This last
@@ -1063,7 +1071,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = dst_mtu(rt->u.dst.path);
+		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index da930fa..aa3d07c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
 		retv = ip6_ra_control(sk, val, NULL);
 		break;
 	case IPV6_MTU_DISCOVER:
-		if (val<0 || val>2)
+		if (val<0 || val>3)
 			goto e_inval;
 		np->pmtudisc = val;
 		retv = 0;
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE"
  2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
                   ` (3 preceding siblings ...)
  2007-04-19  1:07 ` [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
@ 2007-04-19  1:09 ` John Heffner
  2007-04-19  1:09   ` [PATCH 2/4] Revert "[NET] Do pmtu check in transport layer" John Heffner
  4 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

This reverts commit d21d2a90b879c0cf159df5944847e6d9833816eb.

Must be backed out because commit 87e927a0583bd4a8ba9e97cd75b58d8aa1c76e37
does not work.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 include/linux/in.h       |    1 -
 include/linux/in6.h      |    1 -
 include/linux/skbuff.h   |    3 +--
 include/net/ip.h         |    2 +-
 net/core/skbuff.c        |    2 --
 net/ipv4/ip_output.c     |   14 ++++----------
 net/ipv4/ip_sockglue.c   |    2 +-
 net/ipv4/raw.c           |    3 ---
 net/ipv6/ip6_output.c    |   12 ++++--------
 net/ipv6/ipv6_sockglue.c |    2 +-
 net/ipv6/raw.c           |    3 ---
 11 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/include/linux/in.h b/include/linux/in.h
index 2dc1f8a..1912e7c 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,7 +83,6 @@ struct in_addr {
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
-#define IP_PMTUDISC_PROBE		3	/* Ignore dst pmtu	*/
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index d559fac..4e8350a 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,7 +179,6 @@ struct in6_flowlabel_req
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
 #define IPV6_PMTUDISC_DO		2
-#define IPV6_PMTUDISC_PROBE		3
 
 /* Flowlabel */
 #define IPV6_FLOWLABEL_MGR	32
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8bf9b9f..7f17cfc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -277,8 +277,7 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1,
-				ign_dst_mtu:1;
+				ipvs_property:1;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
diff --git a/include/net/ip.h b/include/net/ip.h
index 6a08b65..75f226d 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -206,7 +206,7 @@ int ip_decrease_ttl(struct iphdr *iph)
 static inline
 int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 {
-	return (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_DO ||
+	return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
 		(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
 		 !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2391cdf..f0d986a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -479,7 +479,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
 #endif
-	C(ign_dst_mtu);
 	C(protocol);
 	n->destructor = NULL;
 	C(mark);
@@ -543,7 +542,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
-	new->ign_dst_mtu	= old->ign_dst_mtu;
 #ifdef CONFIG_NET_SCHED
 #ifdef CONFIG_NET_CLS_ACT
 	new->tc_verd = old->tc_verd;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 704bc44..79e71ee 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,8 +198,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) &&
-	    !skb->ign_dst_mtu && !skb_is_gso(skb))
+	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -788,9 +787,7 @@ int ip_append_data(struct sock *sk,
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-		                            rt->u.dst.dev->mtu :
-		                            dst_mtu(rt->u.dst.path);
+		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
@@ -1208,16 +1205,13 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
+	if (inet->pmtudisc != IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
-	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4d54457..c199d23 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -542,7 +542,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val<0 || val>3)
+		if (val<0 || val>2)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index addb786..c60aadf 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -304,9 +304,6 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	if (err)
 		goto error_fault;
 
-	if (inet->pmtudisc == IP_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	/* We don't modify invalid header */
 	if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
 		if (!iph->saddr)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 78317a4..b8e307a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -139,8 +139,8 @@ static int ip6_output2(struct sk_buff *skb)
 
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb->ign_dst_mtu &&
-	     !skb_is_gso(skb)) || dst_allfrag(skb->dst))
+	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
+				dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
 		return ip6_output2(skb);
@@ -566,7 +566,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = skb->ign_dst_mtu ? skb->len : dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if (np && np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
@@ -1050,8 +1050,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
-		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
+		mtu = dst_mtu(rt->u.dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
@@ -1339,9 +1338,6 @@ int ip6_push_pending_frames(struct sock *sk)
 		tmp_skb->sk = NULL;
 	}
 
-	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
 	__skb_pull(skb, skb_network_header_len(skb));
 	if (opt && opt->opt_flen)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index aa3d07c..da930fa 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
 		retv = ip6_ra_control(sk, val, NULL);
 		break;
 	case IPV6_MTU_DISCOVER:
-		if (val<0 || val>3)
+		if (val<0 || val>2)
 			goto e_inval;
 		np->pmtudisc = val;
 		retv = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c9943..f4cd90b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -591,9 +591,6 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	if (err)
 		goto error_fault;
 
-	if (np->pmtudisc == IPV6_PMTUDISC_PROBE)
-		skb->ign_dst_mtu = 1;
-
 	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 		      dst_output);
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 2/4] Revert "[NET] Do pmtu check in transport layer"
  2007-04-19  1:09 ` [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
@ 2007-04-19  1:09   ` John Heffner
  2007-04-19  1:09     ` [PATCH 3/4] [NET] MTU discovery check in ip6_fragment() John Heffner
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

This reverts commit 87e927a0583bd4a8ba9e97cd75b58d8aa1c76e37.

This idea does not work, as pointed out by Patrick McHardy.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv4/ip_output.c  |    4 +---
 net/ipv4/raw.c        |    8 +++-----
 net/ipv6/ip6_output.c |   11 +++++------
 net/ipv6/raw.c        |    7 ++-----
 4 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 79e71ee..34606ef 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -810,9 +810,7 @@ int ip_append_data(struct sock *sk,
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
-	if (inet->cork.length + length > 0xFFFF - fragheaderlen ||
-	    (inet->pmtudisc >= IP_PMTUDISC_DO &&
-	     inet->cork.length + length > mtu)) {
+	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 		return -EMSGSIZE;
 	}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c60aadf..24d7c9f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -271,12 +271,10 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	struct iphdr *iph;
 	struct sk_buff *skb;
 	int err;
-	int mtu;
 
-	mtu = inet->pmtudisc == IP_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
-	                                         rt->u.dst.dev->mtu;
-	if (length > mtu) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+	if (length > rt->u.dst.dev->mtu) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+			       rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b8e307a..4cfdad4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1079,12 +1079,11 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 
-	if ((mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN &&
-	     inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) ||
-	    (np->pmtudisc >= IPV6_PMTUDISC_DO &&
-	     inet->cork.length + length > mtu)) {
-		ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
-		return -EMSGSIZE;
+	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
+			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
+			return -EMSGSIZE;
+		}
 	}
 
 	/*
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f4cd90b..f65fcd7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -558,12 +558,9 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 	struct sk_buff *skb;
 	unsigned int hh_len;
 	int err;
-	int mtu;
 
-	mtu = np->pmtudisc == IPV6_PMTUDISC_DO ? dst_mtu(&rt->u.dst) :
-	                                         rt->u.dst.dev->mtu;
-	if (length > mtu) {
-		ipv6_local_error(sk, EMSGSIZE, fl, mtu);
+	if (length > rt->u.dst.dev->mtu) {
+		ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 3/4] [NET] MTU discovery check in ip6_fragment()
  2007-04-19  1:09   ` [PATCH 2/4] Revert "[NET] Do pmtu check in transport layer" John Heffner
@ 2007-04-19  1:09     ` John Heffner
  2007-04-19  1:09       ` [PATCH 4/4] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

Adds a check in ip6_fragment() mirroring ip_fragment() for packets
that we can't fragment, and sends an ICMP Packet Too Big message
in response.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 net/ipv6/ip6_output.c |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4cfdad4..5a5b7d4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -567,6 +567,19 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	nexthdr = *prevhdr;
 
 	mtu = dst_mtu(&rt->u.dst);
+
+	/* We must not fragment if the socket is set to force MTU discovery
+	 * or if the skb it not generated by a local socket.  (This last
+	 * check should be redundant, but it's free.)
+	 */
+	if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
+		skb->dev = skb->dst->dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
 	if (np && np->frag_size < mtu) {
 		if (np->frag_size)
 			mtu = np->frag_size;
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 4/4] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-04-19  1:09     ` [PATCH 3/4] [NET] MTU discovery check in ip6_fragment() John Heffner
@ 2007-04-19  1:09       ` John Heffner
  0 siblings, 0 replies; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, John Heffner

Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER.  This option forces
us not to fragment, but does not make use of the kernel path MTU discovery.
That is, it allows for user-mode MTU probing (or, packetization-layer path
MTU discovery).  This is particularly useful for diagnostic utilities, like
traceroute/tracepath.

Signed-off-by: John Heffner <jheffner@psc.edu>
---
 include/linux/in.h       |    1 +
 include/linux/in6.h      |    1 +
 net/ipv4/ip_output.c     |   20 +++++++++++++++-----
 net/ipv4/ip_sockglue.c   |    2 +-
 net/ipv6/ip6_output.c    |   15 ++++++++++++---
 net/ipv6/ipv6_sockglue.c |    2 +-
 6 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/include/linux/in.h b/include/linux/in.h
index 1912e7c..3975cbf 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,6 +83,7 @@ struct in_addr {
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
 #define IP_PMTUDISC_DO			2	/* Always DF		*/
+#define IP_PMTUDISC_PROBE		3       /* Ignore dst pmtu      */
 
 #define IP_MULTICAST_IF			32
 #define IP_MULTICAST_TTL 		33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index 4e8350a..d559fac 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,6 +179,7 @@ struct in6_flowlabel_req
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
 #define IPV6_PMTUDISC_DO		2
+#define IPV6_PMTUDISC_PROBE		3
 
 /* Flowlabel */
 #define IPV6_FLOWLABEL_MGR	32
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 34606ef..66e2c3a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -189,6 +189,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	return -EINVAL;
 }
 
+static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+
+	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
+	       skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
 static inline int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -198,7 +206,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
+	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 		return ip_fragment(skb, ip_finish_output2);
 	else
 		return ip_finish_output2(skb);
@@ -422,7 +430,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(dst_mtu(&rt->u.dst)));
+			  htonl(ip_skb_dst_mtu(skb)));
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -787,7 +795,9 @@ int ip_append_data(struct sock *sk,
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+					    rt->u.dst.dev->mtu : 
+					    dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
@@ -1203,13 +1213,13 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc != IP_PMTUDISC_DO)
+	if (inet->pmtudisc < IP_PMTUDISC_DO)
 		skb->local_df = 1;
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
 	    (skb->len <= dst_mtu(&rt->u.dst) &&
 	     ip_dont_fragment(sk, &rt->u.dst)))
 		df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c199d23..4d54457 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -542,7 +542,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val<0 || val>2)
+		if (val<0 || val>3)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5a5b7d4..f508171 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -137,9 +137,17 @@ static int ip6_output2(struct sk_buff *skb)
 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
+static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+
+	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
+	       skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
 int ip6_output(struct sk_buff *skb)
 {
-	if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
+	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 				dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
@@ -566,7 +574,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = ip6_skb_dst_mtu(skb);
 
 	/* We must not fragment if the socket is set to force MTU discovery
 	 * or if the skb it not generated by a local socket.  (This last
@@ -1063,7 +1071,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = dst_mtu(rt->u.dst.path);
+		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index da930fa..aa3d07c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
 		retv = ip6_ra_control(sk, val, NULL);
 		break;
 	case IPV6_MTU_DISCOVER:
-		if (val<0 || val>2)
+		if (val<0 || val>3)
 			goto e_inval;
 		np->pmtudisc = val;
 		retv = 0;
-- 
1.5.1.rc3.30.ga8f4-dirty


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-04-19  1:07       ` [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
@ 2007-04-19  1:11         ` John Heffner
  2007-04-19  1:25           ` David Miller
  0 siblings, 1 reply; 26+ messages in thread
From: John Heffner @ 2007-04-19  1:11 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Sorry, forgot the -n flag on git-format-patch.  Patches resent with 
correct sequence numbers.

Thanks,
   -John

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE
  2007-04-19  1:11         ` John Heffner
@ 2007-04-19  1:25           ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-04-19  1:25 UTC (permalink / raw)
  To: jheffner; +Cc: netdev

From: John Heffner <jheffner@psc.edu>
Date: Wed, 18 Apr 2007 21:11:26 -0400

> Sorry, forgot the -n flag on git-format-patch.  Patches resent with 
> correct sequence numbers.

Thanks for fixing that.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/0] Re-try changes for PMTUDISC_PROBE
  2007-04-19  1:07   ` [PATCH 0/0] Re-try changes for PMTUDISC_PROBE John Heffner
@ 2007-04-20 22:55     ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-04-20 22:55 UTC (permalink / raw)
  To: jheffner; +Cc: kaber, netdev

From: John Heffner <jheffner@psc.edu>
Date: Wed, 18 Apr 2007 21:07:10 -0400

> This backs out the transport layer MTU checks that don't work.  As a 
> consequence, I had to back out the PMTUDISC_PROBE patch as well.  These 
> patches should fix the problem with ipv6 that the transport layer change 
> tried to address, and re-implement PMTUDISC_PROBE.  I think this 
> approach is nicer than the last one, since it doesn't require a bit in 
> struct sk_buff.

Since I was rebasing net-2.6.22, here is what I did.  I simply
elided the two patches you backed out, then I'll add patch
3 and 4, the new ones, on top.

This will show up when I publish the rebased net-2.6.22 tree
which I hope will happen later this evening.

Thanks!

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-04-20 22:55 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-24  0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
2007-03-24  0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
2007-03-24  0:06   ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
2007-03-25  4:23     ` David Miller
2007-03-27 14:18     ` Andi Kleen
     [not found]       ` <4609640D.7010709@psc.edu>
     [not found]         ` <20070327193115.GA28138@one.firstfloor.org>
2007-03-27 19:52           ` [PATCH] ip(7) IP_PMTUDISC_PROBE John Heffner
2007-04-08 18:08             ` Michael Kerrisk
2007-03-25  4:17   ` [PATCH 2/3] [NET] Move DF check to ip_forward David Miller
2007-03-25 13:37   ` [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward Thomas Graf
2007-03-25 20:27     ` David Miller
2007-03-25  4:14 ` [PATCH 1/3] [NET] Do pmtu check in transport layer David Miller
2007-04-09  8:40 ` Patrick McHardy
2007-04-09 16:23   ` John Heffner
2007-04-09 16:40     ` Patrick McHardy
2007-04-19  1:07   ` [PATCH 0/0] Re-try changes for PMTUDISC_PROBE John Heffner
2007-04-20 22:55     ` David Miller
2007-04-19  1:07 ` [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
2007-04-19  1:07   ` [PATCH] Revert "[NET] Do pmtu check in transport layer" John Heffner
2007-04-19  1:07     ` [PATCH] [NET] MTU discovery check in ip6_fragment() John Heffner
2007-04-19  1:07       ` [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
2007-04-19  1:11         ` John Heffner
2007-04-19  1:25           ` David Miller
2007-04-19  1:09 ` [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
2007-04-19  1:09   ` [PATCH 2/4] Revert "[NET] Do pmtu check in transport layer" John Heffner
2007-04-19  1:09     ` [PATCH 3/4] [NET] MTU discovery check in ip6_fragment() John Heffner
2007-04-19  1:09       ` [PATCH 4/4] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.