netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
@ 2012-06-28 10:59 David Miller
  2012-06-28 23:16 ` Julian Anastasov
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2012-06-28 10:59 UTC (permalink / raw)
  To: netdev


The specific destination is the host we direct unicast replies to.
Usually this is the original packet source address, but if we are
responding to a multicast or broadcast packet we have to use something
different.

Specifically we must use the source address we would use if we were to
send a packet to the unicast source of the original packet.

The routing cache precomputes this value, but we want to remove that
precomputation because it creates a hard dependency on the expensive
rpfilter source address validation which we'd like to make cheaper.

There are only three places where this matters:

1) ICMP replies.

2) pktinfo CMSG

3) IP options

Now there will be no real users of rt->rt_spec_dst and we can simply
remove it altogether.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |    1 +
 net/ipv4/fib_frontend.c |   29 +++++++++++++++++++++++++++++
 net/ipv4/icmp.c         |    6 ++++--
 net/ipv4/ip_options.c   |   22 +++++++++++-----------
 net/ipv4/ip_sockglue.c  |    7 ++++---
 5 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4b347c0..1687b3d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -230,6 +230,7 @@ extern struct fib_table *fib_get_table(struct net *net, u32 id);
 /* Exported by fib_frontend.c */
 extern const struct nla_policy rtm_ipv4_policy[];
 extern void		ip_fib_init(void);
+extern __be32 fib_compute_spec_dst(struct sk_buff *skb);
 extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       u8 tos, int oif, struct net_device *dev,
 			       __be32 *spec_dst, u32 *itag);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411..451939b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -180,6 +180,35 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct flowi4 fl4;
+	struct net *net;
+
+	if (skb->pkt_type != PACKET_BROADCAST &&
+	    skb->pkt_type != PACKET_MULTICAST)
+		return ip_hdr(skb)->daddr;
+
+	in_dev = __in_dev_get_rcu(dev);
+	BUG_ON(!in_dev);
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = 0;
+	fl4.daddr = ip_hdr(skb)->saddr;
+	fl4.saddr = ip_hdr(skb)->daddr;
+	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+
+	net = dev_net(dev);
+	if (!fib_lookup(net, &fl4, &res))
+		return FIB_RES_PREFSRC(net, res);
+	else
+		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 49a74cc..4bce5a2 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <net/ip_fib.h>
 
 /*
  *	Build xmit assembly blocks
@@ -333,7 +334,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct flowi4 fl4;
 	struct sock *sk;
 	struct inet_sock *inet;
-	__be32 daddr;
+	__be32 daddr, saddr;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -347,6 +348,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 
 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = ip_hdr(skb)->saddr;
+	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
 	if (icmp_param->replyopts.opt.opt.optlen) {
@@ -356,7 +358,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	}
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
-	fl4.saddr = rt->rt_spec_dst;
+	fl4.saddr = saddr;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 708b994..766dfe56 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
 #include <net/icmp.h>
 #include <net/route.h>
 #include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
 
 /*
  * Write options to IP header, record destination address to
@@ -104,7 +105,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 	sptr = skb_network_header(skb);
 	dptr = dopt->__data;
 
-	daddr = skb_rtable(skb)->rt_spec_dst;
+	daddr = fib_compute_spec_dst(skb);
 
 	if (sopt->rr) {
 		optlen  = sptr[sopt->rr+1];
@@ -250,15 +251,14 @@ void ip_options_fragment(struct sk_buff *skb)
 int ip_options_compile(struct net *net,
 		       struct ip_options *opt, struct sk_buff *skb)
 {
-	int l;
-	unsigned char *iph;
-	unsigned char *optptr;
-	int optlen;
+	__be32 spec_dst = (__force __be32) 0;
 	unsigned char *pp_ptr = NULL;
-	struct rtable *rt = NULL;
+	unsigned char *optptr;
+	unsigned char *iph;
+	int optlen, l;
 
 	if (skb != NULL) {
-		rt = skb_rtable(skb);
+		spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -330,8 +330,8 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 2;
 					goto error;
 				}
-				if (rt) {
-					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+				if (skb) {
+					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
 				optptr[2] += 4;
@@ -372,8 +372,8 @@ int ip_options_compile(struct net *net,
 						goto error;
 					}
 					opt->ts = optptr - iph;
-					if (rt)  {
-						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+					if (skb)  {
+						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}
 					opt->ts_needaddr = 1;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0d11f23..de29f46 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/transp_v6.h>
 #endif
+#include <net/ip_fib.h>
 
 #include <linux/errqueue.h>
 #include <asm/uaccess.h>
@@ -1019,8 +1020,8 @@ e_inval:
  * @sk: socket
  * @skb: buffer
  *
- * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst
- * in skb->cb[] before dst drop.
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
  * This way, receiver doesnt make cache line misses to read rtable.
  */
 void ipv4_pktinfo_prepare(struct sk_buff *skb)
@@ -1030,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb)
 
 	if (rt) {
 		pktinfo->ipi_ifindex = rt->rt_iif;
-		pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst;
+		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
 		pktinfo->ipi_ifindex = 0;
 		pktinfo->ipi_spec_dst.s_addr = 0;
-- 
1.7.10.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-06-28 10:59 [PATCH] ipv4: Create and use fib_compute_spec_dst() helper David Miller
@ 2012-06-28 23:16 ` Julian Anastasov
  2012-06-28 23:27   ` David Miller
  2012-06-29  1:45   ` David Miller
  0 siblings, 2 replies; 10+ messages in thread
From: Julian Anastasov @ 2012-06-28 23:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


	Hello,

On Thu, 28 Jun 2012, David Miller wrote:

> +__be32 fib_compute_spec_dst(struct sk_buff *skb)
> +{
> +	struct net_device *dev = skb->dev;
> +	struct in_device *in_dev;
> +	struct fib_result res;
> +	struct flowi4 fl4;
> +	struct net *net;
> +

	This is bad for looped m/b-cast: ip_mc_output calls
ip_dev_loopback_xmit where pkt_type is set to PACKET_LOOPBACK.
May be we have to check skb_dst as below.

	Also, for rt_spec_dst __mkroute_output used fl4->daddr for
looped unicast traffic and fl4->saddr for
RTCF_BROADCAST | RTCF_MULTICAST. Can we check rt_flags instead?
It will catch some pkt_type values such as PACKET_LOOPBACK.
For example, check similar to icmp_send():

	struct rtable *rt = skb_rtable(skb);

	/* input or output route destined to local stack?
	 * daddr can be mcast/bcast
	 */
	if (rt && rt->rt_flags & RTCF_LOCAL) {
		if (rt_is_input_route(rt)) {
			if (!(rt->rt_flags & (RTCF_BROADCAST |
					      RTCF_MULTICAST)))
				return ip_hdr(skb)->daddr;
			/* select saddr */
		} else
			return (rt->rt_flags & (RTCF_BROADCAST | 
						RTCF_MULTICAST)) ?
				ip_hdr(skb)->saddr : ip_hdr(skb)->daddr;
	}

	Isn't pkt_type just a hint from sender? We can not use
it for such places. What if hackers use unicast mac (PACKET_HOST)
combined with multicast daddr? I'm not sure that pkt_type
is a strong validator for layer 3 addresses.

> +	if (skb->pkt_type != PACKET_BROADCAST &&
> +	    skb->pkt_type != PACKET_MULTICAST)
> +		return ip_hdr(skb)->daddr;

	So, above check should be removed if we check rt_flags.

> +
> +	in_dev = __in_dev_get_rcu(dev);
> +	BUG_ON(!in_dev);
> +	fl4.flowi4_oif = 0;
> +	fl4.flowi4_iif = 0;

	It is not clear to me why ip_route_input_mc and
ip_route_input_slow (brd_input) call fib_validate_source()
with arg oif=0. How would fib_rule_match match flowi_iif
for such traffic then?

	May be iif should be always set just like in
ip_route_output_slow because now we do output lookup to
unicast target?:

	fl4.flowi4_iif = net->loopback_dev->ifindex;

> +	fl4.daddr = ip_hdr(skb)->saddr;
> +	fl4.saddr = ip_hdr(skb)->daddr;

	Here only 0 is allowed for m/b-cast daddr, we are not
going to use ip_hdr(skb)->daddr, so no need to provide it:

	fl4.saddr = 0;

	fib_validate_source was called with dst=0 for bcast/mcast.

> +	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
> +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
> +
> +	net = dev_net(dev);

	Now net will be needed above for iif.

> +	if (!fib_lookup(net, &fl4, &res))
> +		return FIB_RES_PREFSRC(net, res);
> +	else

	What about providing ip_hdr(skb)->saddr as
2nd argument here (it is validated by input routing
to be unicast or 0.0.0.0):

> +		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);

	By this way we will prefer local address from the
same subnet as iph->saddr. Also, we should not call
fib_lookup if ipv4_is_zeronet(ip_hdr(skb)->saddr), we
should use RT_SCOPE_LINK for inet_select_addr in this case.
By this way we will prefer addresses with scope link for
target 0.0.0.0. There is such logic that uses RT_SCOPE_LINK in
ip_route_input_mc() and ip_route_input_slow().

>  int ip_options_compile(struct net *net,
>  		       struct ip_options *opt, struct sk_buff *skb)
>  {
> -	int l;
> -	unsigned char *iph;
> -	unsigned char *optptr;
> -	int optlen;
> +	__be32 spec_dst = (__force __be32) 0;
>  	unsigned char *pp_ptr = NULL;
> -	struct rtable *rt = NULL;
> +	unsigned char *optptr;
> +	unsigned char *iph;
> +	int optlen, l;
>  
>  	if (skb != NULL) {
> -		rt = skb_rtable(skb);
> +		spec_dst = fib_compute_spec_dst(skb);

	If we use rt_flags above I'm not sure whether
ip_options_compile is called with valid rt_flags from the
bridging code.

Regards
--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-06-28 23:16 ` Julian Anastasov
@ 2012-06-28 23:27   ` David Miller
  2012-06-29  1:45   ` David Miller
  1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2012-06-28 23:27 UTC (permalink / raw)
  To: ja; +Cc: netdev

From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 29 Jun 2012 02:16:25 +0300 (EEST)

> 	Hello,

I really appreciate your detailed analysis, I'll look deeply into
all of your feedback.

Thanks!

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-06-28 23:16 ` Julian Anastasov
  2012-06-28 23:27   ` David Miller
@ 2012-06-29  1:45   ` David Miller
  2012-06-30  9:25     ` Julian Anastasov
  2012-07-04 15:21     ` Eric Dumazet
  1 sibling, 2 replies; 10+ messages in thread
From: David Miller @ 2012-06-29  1:45 UTC (permalink / raw)
  To: ja; +Cc: netdev

From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 29 Jun 2012 02:16:25 +0300 (EEST)

> 	This is bad for looped m/b-cast: ip_mc_output calls
> ip_dev_loopback_xmit where pkt_type is set to PACKET_LOOPBACK.
> May be we have to check skb_dst as below.

Agreed, for many reasons the pkt_type check is bogus.

> 	It is not clear to me why ip_route_input_mc and
> ip_route_input_slow (brd_input) call fib_validate_source()
> with arg oif=0. How would fib_rule_match match flowi_iif
> for such traffic then?
> 
> 	May be iif should be always set just like in
> ip_route_output_slow because now we do output lookup to
> unicast target?:
> 
> 	fl4.flowi4_iif = net->loopback_dev->ifindex;

Ok.  I am not brave enough to adjust what broadcast and
multicast code do in route.c :-)

> 	Here only 0 is allowed for m/b-cast daddr, we are not
> going to use ip_hdr(skb)->daddr, so no need to provide it:
> 
> 	fl4.saddr = 0;

Agreed.

> 	What about providing ip_hdr(skb)->saddr as
> 2nd argument here (it is validated by input routing
> to be unicast or 0.0.0.0):
 ...
> 	By this way we will prefer local address from the
> same subnet as iph->saddr. Also, we should not call
> fib_lookup if ipv4_is_zeronet(ip_hdr(skb)->saddr), we
> should use RT_SCOPE_LINK for inet_select_addr in this case.
> By this way we will prefer addresses with scope link for
> target 0.0.0.0. There is such logic that uses RT_SCOPE_LINK in
> ip_route_input_mc() and ip_route_input_slow().

Also agreed.

> 	If we use rt_flags above I'm not sure whether
> ip_options_compile is called with valid rt_flags from the
> bridging code.

It will not be the first time we need to fix bridging from
sending up garbage.

Here's what I committed based upon your feedback, thanks!

====================
ipv4: Fix bugs in fib_compute_spec_dst().

Based upon feedback from Julian Anastasov.

1) Use route flags to determine multicast/broadcast, not the
   packet flags.

2) Leave saddr unspecified in flow key.

3) Adjust how we invoke inet_select_addr().  Pass ip_hdr(skb)->saddr as
   second arg, and if it was zeronet use link scope.

4) Use loopback as input interface in flow key.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c |   34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 63b11ca..1d13217 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -185,28 +185,36 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev;
 	struct fib_result res;
+	struct rtable *rt;
 	struct flowi4 fl4;
 	struct net *net;
+	int scope;
 
-	if (skb->pkt_type != PACKET_BROADCAST &&
-	    skb->pkt_type != PACKET_MULTICAST)
+	rt = skb_rtable(skb);
+	if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
 		return ip_hdr(skb)->daddr;
 
 	in_dev = __in_dev_get_rcu(dev);
 	BUG_ON(!in_dev);
-	fl4.flowi4_oif = 0;
-	fl4.flowi4_iif = 0;
-	fl4.daddr = ip_hdr(skb)->saddr;
-	fl4.saddr = ip_hdr(skb)->daddr;
-	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
-	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
-	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
 
 	net = dev_net(dev);
-	if (!fib_lookup(net, &fl4, &res))
-		return FIB_RES_PREFSRC(net, res);
-	else
-		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+
+	scope = RT_SCOPE_UNIVERSE;
+	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+		fl4.flowi4_oif = 0;
+		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.daddr = ip_hdr(skb)->saddr;
+		fl4.saddr = 0;
+		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+		fl4.flowi4_scope = scope;
+		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+		if (!fib_lookup(net, &fl4, &res))
+			return FIB_RES_PREFSRC(net, res);
+	} else {
+		scope = RT_SCOPE_LINK;
+	}
+
+	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
-- 
1.7.10

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-06-29  1:45   ` David Miller
@ 2012-06-30  9:25     ` Julian Anastasov
  2012-07-04 15:21     ` Eric Dumazet
  1 sibling, 0 replies; 10+ messages in thread
From: Julian Anastasov @ 2012-06-30  9:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


	Hello,

On Thu, 28 Jun 2012, David Miller wrote:

> ipv4: Fix bugs in fib_compute_spec_dst().

	Some more thoughts on this topic...

	I'm wondering, may be ip_options_echo wants to put
local IP for srr. ip_options_echo is called by ip_send_unicast_reply.
ip_send_unicast_reply supports source address spoofing for
tproxy (arg.flags & IP_REPLY_ARG_NOSRCCHECK).

	May be the tproxy users add local routes to redirect
the traffic to local stack but daddr is preserved (non-local).
So, rt_flags will have RTCF_LOCAL but for srr purposes we
need local address, right?

	There can be optimization in ip_options_echo to
avoid fib_compute_spec_dst if daddr is not needed. It seems
it is needed only in the sopt->srr case.

	It seems ip_options_compile can be called by
ip_rcv_options (ip_rcv_finish) just after ip_route_input_noref
but before dst_input. It means, it can happen for forwarding,
not just for local delivery.

	To summarize, we can not rely on iph->daddr to be
local address if RTCF_LOCAL is set. There is always the risk to
work with redirected or forwarded traffic. Even for the PKTINFO
case we should make sure ipi_spec_dst is a local address (original
daddr goes to ipi_addr anyways), in case later ipi_spec_dst
is used again for sending in PKTINFO.

	For now, I see only one possible optimization.
When fib_lookup returns res.fi and res.type is RTN_LOCAL
we can check fib_protocol. If fib_protocol is not
RTPROT_KERNEL we will add RTCF_MAYBE_LOCAL (new flag) to rt_flags.
It will lead to slow lookups to validate the iph->daddr
if used later as source address, like in the spec_dst case.

	For the common case of local routes created by fib_magic()
we will use iph->daddr in fib_compute_spec_dst as follows:

	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
			    RTCF_LOCAL | RTCF_MAYBE_LOCAL) == RTCF_LOCAL))
		return ip_hdr(skb)->daddr;
	/* For mcast, forwarding and spoofing we take the slow path */

	If users add local RTPROT_KERNEL routes, later
the outgoing traffic will anyways fail in some output route lookup
because FLOWI_FLAG_ANYSRC is set in rare cases. But also
users can break srr in this way, so there is some risk.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-06-29  1:45   ` David Miller
  2012-06-30  9:25     ` Julian Anastasov
@ 2012-07-04 15:21     ` Eric Dumazet
  2012-07-04 23:13       ` David Miller
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2012-07-04 15:21 UTC (permalink / raw)
  To: David Miller; +Cc: ja, netdev

On Thu, 2012-06-28 at 18:45 -0700, David Miller wrote:
> From: Julian Anastasov <ja@ssi.bg>
> Date: Fri, 29 Jun 2012 02:16:25 +0300 (EEST)
> 
> > 	This is bad for looped m/b-cast: ip_mc_output calls
> > ip_dev_loopback_xmit where pkt_type is set to PACKET_LOOPBACK.
> > May be we have to check skb_dst as below.
> 
> Agreed, for many reasons the pkt_type check is bogus.
> 
> > 	It is not clear to me why ip_route_input_mc and
> > ip_route_input_slow (brd_input) call fib_validate_source()
> > with arg oif=0. How would fib_rule_match match flowi_iif
> > for such traffic then?
> > 
> > 	May be iif should be always set just like in
> > ip_route_output_slow because now we do output lookup to
> > unicast target?:
> > 
> > 	fl4.flowi4_iif = net->loopback_dev->ifindex;
> 
> Ok.  I am not brave enough to adjust what broadcast and
> multicast code do in route.c :-)
> 
> > 	Here only 0 is allowed for m/b-cast daddr, we are not
> > going to use ip_hdr(skb)->daddr, so no need to provide it:
> > 
> > 	fl4.saddr = 0;
> 
> Agreed.
> 
> > 	What about providing ip_hdr(skb)->saddr as
> > 2nd argument here (it is validated by input routing
> > to be unicast or 0.0.0.0):
>  ...
> > 	By this way we will prefer local address from the
> > same subnet as iph->saddr. Also, we should not call
> > fib_lookup if ipv4_is_zeronet(ip_hdr(skb)->saddr), we
> > should use RT_SCOPE_LINK for inet_select_addr in this case.
> > By this way we will prefer addresses with scope link for
> > target 0.0.0.0. There is such logic that uses RT_SCOPE_LINK in
> > ip_route_input_mc() and ip_route_input_slow().
> 
> Also agreed.
> 
> > 	If we use rt_flags above I'm not sure whether
> > ip_options_compile is called with valid rt_flags from the
> > bridging code.
> 
> It will not be the first time we need to fix bridging from
> sending up garbage.
> 
> Here's what I committed based upon your feedback, thanks!
> 
> ====================
> ipv4: Fix bugs in fib_compute_spec_dst().
> 
> Based upon feedback from Julian Anastasov.
> 
> 1) Use route flags to determine multicast/broadcast, not the
>    packet flags.
> 
> 2) Leave saddr unspecified in flow key.
> 
> 3) Adjust how we invoke inet_select_addr().  Pass ip_hdr(skb)->saddr as
>    second arg, and if it was zeronet use link scope.
> 
> 4) Use loopback as input interface in flow key.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  net/ipv4/fib_frontend.c |   34 +++++++++++++++++++++-------------
>  1 file changed, 21 insertions(+), 13 deletions(-)
> 
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 63b11ca..1d13217 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -185,28 +185,36 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
>  	struct net_device *dev = skb->dev;
>  	struct in_device *in_dev;
>  	struct fib_result res;
> +	struct rtable *rt;
>  	struct flowi4 fl4;
>  	struct net *net;
> +	int scope;
>  
> -	if (skb->pkt_type != PACKET_BROADCAST &&
> -	    skb->pkt_type != PACKET_MULTICAST)
> +	rt = skb_rtable(skb);
> +	if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
>  		return ip_hdr(skb)->daddr;
>  

David I tried to setup a bridge to investigate problems reported on
another thread and got immediate crash because of this patch.

Crash in fib_compute_spec_dst() if skb_rtable(skb) is NULL

brctl addbr br0
brctl addif br0 eth0
ifconfig br0 up
ip addr del 172.30.42.23/27 dev eth0
ip addr add 172.30.42.23/27 dev br0
...
<crash>

[  166.706951] Modules linked in: ip6table_filter ip6_tables ebtable_nat ebtables fuse ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT iptable_mangle iptable_filter bridge stp rt61pci crc_itu_t rt2x00pci rt2x00lib eeprom_93cx6 tg3 ixgbe mdio igb
[  166.706955] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc4+ #494
[  166.706956] Call Trace:
[  166.706965]  <IRQ>  [<ffffffff81039c5f>] warn_slowpath_common+0x7f/0xc0
[  166.706969]  [<ffffffff81039cba>] warn_slowpath_null+0x1a/0x20
[  166.706973]  [<ffffffff815baade>] fib_compute_spec_dst+0x12e/0x1c0
[  166.706979]  [<ffffffff81581d71>] ip_options_compile+0x31/0x5b0
[  166.706985]  [<ffffffff8125ddf6>] ? security_sock_rcv_skb+0x16/0x20
[  166.706991]  [<ffffffff816b8245>] ? _raw_read_unlock_bh+0x25/0x30
[  166.706996]  [<ffffffffa0119465>] ? ebt_do_table+0x6a5/0x770 [ebtables]
[  166.707004]  [<ffffffffa00b8ce5>] br_parse_ip_options+0x105/0x210 [bridge]
[  166.707010]  [<ffffffffa00b9ef8>] br_nf_pre_routing+0x398/0x6a0 [bridge]
[  166.707015]  [<ffffffff81573534>] nf_iterate+0x84/0xd0
[  166.707021]  [<ffffffffa00b3ba0>] ? br_handle_local_finish+0x50/0x50 [bridge]
[  166.707024]  [<ffffffff81573605>] nf_hook_slow+0x85/0x130
[  166.707029]  [<ffffffffa00b3ba0>] ? br_handle_local_finish+0x50/0x50 [bridge]
[  166.707035]  [<ffffffff816201c7>] ? packet_rcv_spkt+0x47/0x190
[  166.707041]  [<ffffffffa00b3f40>] br_handle_frame+0x1c0/0x260 [bridge]
[  166.707044]  [<ffffffff816201c7>] ? packet_rcv_spkt+0x47/0x190
[  166.707050]  [<ffffffffa00b3d80>] ? br_handle_frame_finish+0x1e0/0x1e0 [bridge]
[  166.707054]  [<ffffffff8154538b>] __netif_receive_skb+0x1bb/0x5f0
[  166.707058]  [<ffffffff81545955>] netif_receive_skb+0x25/0xc0
[  166.707061]  [<ffffffff81545ced>] ? dev_gro_receive+0x1dd/0x2f0
[  166.707065]  [<ffffffff81546450>] napi_skb_finish+0x70/0xa0
[  166.707068]  [<ffffffff815483e5>] napi_gro_receive+0xf5/0x140
[  166.707075]  [<ffffffffa00720f2>] tg3_poll_work+0xc42/0xde0 [tg3]
[  166.707080]  [<ffffffff816647c7>] ? ieee80211_rx+0x357/0x890
[  166.707086]  [<ffffffffa007976c>] tg3_poll+0x8c/0x3c0 [tg3]
[  166.707091]  [<ffffffffa009093f>] ? rt2x00lib_rxdone+0x28f/0x450 [rt2x00lib]
[  166.707095]  [<ffffffff81547a2b>] net_rx_action+0x12b/0x250

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-07-04 15:21     ` Eric Dumazet
@ 2012-07-04 23:13       ` David Miller
  2012-07-05  7:52         ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2012-07-04 23:13 UTC (permalink / raw)
  To: eric.dumazet; +Cc: ja, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 04 Jul 2012 17:21:07 +0200

> David I tried to setup a bridge to investigate problems reported on
> another thread and got immediate crash because of this patch.
> 
> Crash in fib_compute_spec_dst() if skb_rtable(skb) is NULL

I mixed up the tests in ip_options_compile(), I've pushed the following.

Thanks.

====================
ipv4: Fix crashes in ip_options_compile().

The spec_dst uses should be guarded by skb_rtable() being non-NULL
not just the SKB being non-null.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_options.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 766dfe56..1f02251 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -253,12 +253,15 @@ int ip_options_compile(struct net *net,
 {
 	__be32 spec_dst = (__force __be32) 0;
 	unsigned char *pp_ptr = NULL;
+	struct rtable *rt = NULL;
 	unsigned char *optptr;
 	unsigned char *iph;
 	int optlen, l;
 
 	if (skb != NULL) {
-		spec_dst = fib_compute_spec_dst(skb);
+		rt = skb_rtable(skb);
+		if (rt)
+			spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -330,7 +333,7 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 2;
 					goto error;
 				}
-				if (skb) {
+				if (rt) {
 					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
@@ -372,7 +375,7 @@ int ip_options_compile(struct net *net,
 						goto error;
 					}
 					opt->ts = optptr - iph;
-					if (skb)  {
+					if (rt)  {
 						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-07-04 23:13       ` David Miller
@ 2012-07-05  7:52         ` Eric Dumazet
  2012-07-05  7:59           ` David Miller
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2012-07-05  7:52 UTC (permalink / raw)
  To: David Miller; +Cc: ja, netdev

On Wed, 2012-07-04 at 16:13 -0700, David Miller wrote:

> ====================
> ipv4: Fix crashes in ip_options_compile().
> 
> The spec_dst uses should be guarded by skb_rtable() being non-NULL
> not just the SKB being non-null.
> 
> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---

Seems good to me thanks.

By the way, maybe we can defer fib_compute_spec_dst() call to the point
we really need it ?

[PATCH] ipv4: defer fib_compute_spec_dst() call

ip_options_compile() can avoid calling fib_compute_spec_dst()
by default, and perform the call if needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1f02251..54ab83f 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -260,8 +260,6 @@ int ip_options_compile(struct net *net,
 
 	if (skb != NULL) {
 		rt = skb_rtable(skb);
-		if (rt)
-			spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -334,6 +332,7 @@ int ip_options_compile(struct net *net,
 					goto error;
 				}
 				if (rt) {
+					spec_dst = fib_compute_spec_dst(skb);
 					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
@@ -376,6 +375,7 @@ int ip_options_compile(struct net *net,
 					}
 					opt->ts = optptr - iph;
 					if (rt)  {
+						spec_dst = fib_compute_spec_dst(skb);
 						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-07-05  7:52         ` Eric Dumazet
@ 2012-07-05  7:59           ` David Miller
  2012-07-05  8:10             ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2012-07-05  7:59 UTC (permalink / raw)
  To: eric.dumazet; +Cc: ja, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 05 Jul 2012 09:52:25 +0200

> [PATCH] ipv4: defer fib_compute_spec_dst() call
> 
> ip_options_compile() can avoid calling fib_compute_spec_dst()
> by default, and perform the call if needed.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Yes, this is a great idea.  Actually in some obscure cases your
change can cause us to compute it more than once I think.

I'd suggest we do something like create a helper function above this
code in ip_options.c that checks whether spec_dst is INADDR_ANY or
not, to guard computing it multiple times.

Could you put together a quick patch like that?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ipv4: Create and use fib_compute_spec_dst() helper.
  2012-07-05  7:59           ` David Miller
@ 2012-07-05  8:10             ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2012-07-05  8:10 UTC (permalink / raw)
  To: David Miller; +Cc: ja, netdev

On Thu, 2012-07-05 at 00:59 -0700, David Miller wrote:

> Yes, this is a great idea.  Actually in some obscure cases your
> change can cause us to compute it more than once I think.
> 
> I'd suggest we do something like create a helper function above this
> code in ip_options.c that checks whether spec_dst is INADDR_ANY or
> not, to guard computing it multiple times.
> 
> Could you put together a quick patch like that?

Sure I'll do that.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-07-05  8:10 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-28 10:59 [PATCH] ipv4: Create and use fib_compute_spec_dst() helper David Miller
2012-06-28 23:16 ` Julian Anastasov
2012-06-28 23:27   ` David Miller
2012-06-29  1:45   ` David Miller
2012-06-30  9:25     ` Julian Anastasov
2012-07-04 15:21     ` Eric Dumazet
2012-07-04 23:13       ` David Miller
2012-07-05  7:52         ` Eric Dumazet
2012-07-05  7:59           ` David Miller
2012-07-05  8:10             ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).