netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2 v3] ipv4: Cache dst in tunnels
@ 2014-01-02 19:48 Tom Herbert
  2014-04-24 12:16 ` [1/2,v3] " Timo Teras
  0 siblings, 1 reply; 8+ messages in thread
From: Tom Herbert @ 2014-01-02 19:48 UTC (permalink / raw)
  To: davem, netdev, eric.dumazet

Avoid doing a route lookup on every packet being tunneled.

In ip_tunnel.c cache the route returned from ip_route_output if
the tunnel is "connected" so that all the rouitng parameters are
taken from tunnel parms for a packet. Specifically, not NBMA tunnel
and tos is from tunnel parms (not inner packet).

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/ip_tunnels.h |   3 ++
 net/ipv4/ip_tunnel.c     | 113 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 92 insertions(+), 24 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 732f8c6..bde50fc 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -54,6 +54,9 @@ struct ip_tunnel {
 	int		hlen;		/* Precalculated header length */
 	int		mlink;
 
+	struct		dst_entry __rcu *dst_cache;
+	spinlock_t	dst_lock;
+
 	struct ip_tunnel_parm parms;
 
 	/* for SIT */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 90ff957..27d756f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -68,6 +68,54 @@ static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
 			 IP_TNL_HASH_BITS);
 }
 
+static inline void __tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+{
+	struct dst_entry *old_dst;
+
+	if (dst && (dst->flags & DST_NOCACHE))
+		dst = NULL;
+
+	spin_lock_bh(&t->dst_lock);
+	old_dst = rcu_dereference_raw(t->dst_cache);
+	rcu_assign_pointer(t->dst_cache, dst);
+	dst_release(old_dst);
+	spin_unlock_bh(&t->dst_lock);
+}
+
+static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+{
+	__tunnel_dst_set(t, dst);
+}
+
+static inline void tunnel_dst_reset(struct ip_tunnel *t)
+{
+	tunnel_dst_set(t, NULL);
+}
+
+static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t)
+{
+	struct dst_entry *dst;
+
+	rcu_read_lock();
+	dst = rcu_dereference(t->dst_cache);
+	if (dst)
+		dst_hold(dst);
+	rcu_read_unlock();
+	return dst;
+}
+
+struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
+{
+	struct dst_entry *dst = tunnel_dst_get(t);
+
+	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		tunnel_dst_reset(t);
+		return NULL;
+	}
+
+	return dst;
+}
+
 /* Often modified stats are per cpu, other are shared (netdev->stats) */
 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 						struct rtnl_link_stats64 *tot)
@@ -318,11 +366,10 @@ failed:
 	return ERR_PTR(err);
 }
 
-static inline struct rtable *ip_route_output_tunnel(struct net *net,
-						    struct flowi4 *fl4,
-						    int proto,
-						    __be32 daddr, __be32 saddr,
-						    __be32 key, __u8 tos, int oif)
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+				    int proto,
+				    __be32 daddr, __be32 saddr,
+				    __be32 key, __u8 tos, int oif)
 {
 	memset(fl4, 0, sizeof(*fl4));
 	fl4->flowi4_oif = oif;
@@ -331,7 +378,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net,
 	fl4->flowi4_tos = tos;
 	fl4->flowi4_proto = proto;
 	fl4->fl4_gre_key = key;
-	return ip_route_output_key(net, fl4);
 }
 
 static int ip_tunnel_bind_dev(struct net_device *dev)
@@ -350,14 +396,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		struct rtable *rt;
 
-		rt = ip_route_output_tunnel(tunnel->net, &fl4,
-					    tunnel->parms.iph.protocol,
-					    iph->daddr, iph->saddr,
-					    tunnel->parms.o_key,
-					    RT_TOS(iph->tos),
-					    tunnel->parms.link);
+		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+				 iph->saddr, tunnel->parms.o_key,
+				 RT_TOS(iph->tos), tunnel->parms.link);
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
+			tunnel_dst_set(tunnel, dst_clone(&rt->dst));
 			ip_rt_put(rt);
 		}
 		if (dev->type != ARPHRD_ETHER)
@@ -528,10 +574,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	struct flowi4 fl4;
 	u8     tos, ttl;
 	__be16 df;
-	struct rtable *rt;		/* Route to the other host */
+	struct rtable *rt = NULL;	/* Route to the other host */
 	unsigned int max_headroom;	/* The extra header space needed */
 	__be32 dst;
 	int err;
+	bool connected = true;
 
 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 
@@ -581,27 +628,39 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 #endif
 		else
 			goto tx_error;
+
+		connected = false;
 	}
 
 	tos = tnl_params->tos;
 	if (tos & 0x1) {
 		tos &= ~0x1;
-		if (skb->protocol == htons(ETH_P_IP))
+		if (skb->protocol == htons(ETH_P_IP)) {
 			tos = inner_iph->tos;
-		else if (skb->protocol == htons(ETH_P_IPV6))
+			connected = false;
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+			connected = false;
+		}
 	}
 
-	rt = ip_route_output_tunnel(tunnel->net, &fl4,
-				    protocol,
-				    dst, tnl_params->saddr,
-				    tunnel->parms.o_key,
-				    RT_TOS(tos),
-				    tunnel->parms.link);
-	if (IS_ERR(rt)) {
-		dev->stats.tx_carrier_errors++;
-		goto tx_error;
+	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+	if (connected)
+		rt = (struct rtable *)tunnel_dst_check(tunnel, 0);
+
+	if (!rt) {
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
+		if (IS_ERR(rt)) {
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
+		if (connected)
+			tunnel_dst_set(tunnel, dst_clone(&rt->dst));
 	}
+
 	if (rt->dst.dev == dev) {
 		ip_rt_put(rt);
 		dev->stats.collisions++;
@@ -696,6 +755,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 		if (set_mtu)
 			dev->mtu = mtu;
 	}
+	tunnel_dst_reset(t);
 	netdev_state_change(dev);
 }
 
@@ -1001,6 +1061,9 @@ int ip_tunnel_init(struct net_device *dev)
 	iph->version		= 4;
 	iph->ihl		= 5;
 
+	tunnel->dst_cache = NULL;
+	spin_lock_init(&tunnel->dst_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1015,6 +1078,8 @@ void ip_tunnel_uninit(struct net_device *dev)
 	/* fb_tunnel_dev will be unregisted in net-exit call. */
 	if (itn->fb_tunnel_dev != dev)
 		ip_tunnel_del(netdev_priv(dev));
+
+	tunnel_dst_reset(tunnel);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
-- 
1.8.5.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [1/2,v3] ipv4: Cache dst in tunnels
  2014-01-02 19:48 [PATCH 1/2 v3] ipv4: Cache dst in tunnels Tom Herbert
@ 2014-04-24 12:16 ` Timo Teras
  2014-05-15  8:35   ` [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels Timo Teräs
  0 siblings, 1 reply; 8+ messages in thread
From: Timo Teras @ 2014-04-24 12:16 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, eric.dumazet

On Thu, 2 Jan 2014 11:48:26 -0800 (PST)
Tom Herbert <therbert@google.com> wrote:

> Avoid doing a route lookup on every packet being tunneled.
> 
> In ip_tunnel.c cache the route returned from ip_route_output if
> the tunnel is "connected" so that all the rouitng parameters are
> taken from tunnel parms for a packet. Specifically, not NBMA tunnel
> and tos is from tunnel parms (not inner packet).

Sorry for late reply. The 'connected' check you do is not sufficient.
It breaks NBMA mode gre. See below.

> @@ -528,10 +574,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct
> net_device *dev, struct flowi4 fl4;
>  	u8     tos, ttl;
>  	__be16 df;
> -	struct rtable *rt;		/* Route to the other host
> */
> +	struct rtable *rt = NULL;	/* Route to the other host
> */ unsigned int max_headroom;	/* The extra header space needed
> */ __be32 dst;
>  	int err;
> +	bool connected = true;
>  
>  	inner_iph = (const struct iphdr
> *)skb_inner_network_header(skb); 
> @@ -581,27 +628,39 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct
> net_device *dev, #endif
>  		else
>  			goto tx_error;
> +
> +		connected = false;
>  	}

The assumption in these two hunks is wrong. Even if tnl_params->daddr is
set, it might be an NBMA mode tunnel.

This happens (at least) when ipgre_header() is used and it pushes the
modified iphdr to skb. In that case ipgre_xmit just pulls it, and gives
that as the tnl_params argument.

I don't remember fully if tnl_params->daddr can be zero in
ip_tunnel_xmit. It *might* be a leftover dead-code path.

Any ideas how to fix this in a proper manner?

Thanks. Timo.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels
  2014-04-24 12:16 ` [1/2,v3] " Timo Teras
@ 2014-05-15  8:35   ` Timo Teräs
  2014-05-15 17:35     ` Timo Teras
  2014-05-16  3:28     ` [PATCH] ipv4: ip_tunnels: disable dst " David Miller
  0 siblings, 2 replies; 8+ messages in thread
From: Timo Teräs @ 2014-05-15  8:35 UTC (permalink / raw)
  To: netdev; +Cc: Timo Teräs, Tom Herbert, Eric Dumazet

The connected check fails to check for ip_gre nbma mode tunnels
properly. ip_gre creates temporary tnl_params with daddr specified
to pass-in the actual target on per-packet basis from neighbor
layer. Detect these tunnels by inspecting the actual tunnel
configuration instead.

Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels")
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Cc: Tom Herbert <therbert@google.com>
Cc: Eric Dumazet <edumazet@google.com>
---
Should go to 3.14-stable too.

(This does not fix all NBMA GRE regressions in 3.14.
 I'm also getting weird oopses from GRO; will post soon more info.)

 net/ipv4/ip_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e77381d..100caf5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -538,9 +538,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	unsigned int max_headroom;	/* The extra header space needed */
 	__be32 dst;
 	int err;
-	bool connected = true;
+	bool connected;
 
 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	connected = (tunnel->daddr != 0);
 
 	dst = tnl_params->daddr;
 	if (dst == 0) {
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels
  2014-05-15  8:35   ` [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels Timo Teräs
@ 2014-05-15 17:35     ` Timo Teras
  2014-05-16  3:29       ` David Miller
  2014-05-16  3:28     ` [PATCH] ipv4: ip_tunnels: disable dst " David Miller
  1 sibling, 1 reply; 8+ messages in thread
From: Timo Teras @ 2014-05-15 17:35 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, Tom Herbert, Eric Dumazet

On Thu, 15 May 2014 11:35:17 +0300
Timo Teräs <timo.teras@iki.fi> wrote:

> The connected check fails to check for ip_gre nbma mode tunnels
> properly. ip_gre creates temporary tnl_params with daddr specified
> to pass-in the actual target on per-packet basis from neighbor
> layer. Detect these tunnels by inspecting the actual tunnel
> configuration instead.
> 
> Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels")
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Eric Dumazet <edumazet@google.com>
> ---
> Should go to 3.14-stable too.
> 
> (This does not fix all NBMA GRE regressions in 3.14.
>  I'm also getting weird oopses from GRO; will post soon more info.)
> 
>  net/ipv4/ip_tunnel.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
> index e77381d..100caf5 100644
> --- a/net/ipv4/ip_tunnel.c
> +++ b/net/ipv4/ip_tunnel.c
> @@ -538,9 +538,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct
> net_device *dev, unsigned int max_headroom;	/* The extra
> header space needed */ __be32 dst;
>  	int err;
> -	bool connected = true;
> +	bool connected;
>  
>  	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
> +	connected = (tunnel->daddr != 0);
>  
>  	dst = tnl_params->daddr;
>  	if (dst == 0) {

I meant tunnel->parms.iph.daddr here.  tunnel->daddr does not even
exist. I wonder which tree I was testing. I'll retry tests and send new
patch.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels
  2014-05-15  8:35   ` [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels Timo Teräs
  2014-05-15 17:35     ` Timo Teras
@ 2014-05-16  3:28     ` David Miller
  1 sibling, 0 replies; 8+ messages in thread
From: David Miller @ 2014-05-16  3:28 UTC (permalink / raw)
  To: timo.teras; +Cc: netdev, therbert, edumazet

From: Timo Teräs <timo.teras@iki.fi>
Date: Thu, 15 May 2014 11:35:17 +0300

> The connected check fails to check for ip_gre nbma mode tunnels
> properly. ip_gre creates temporary tnl_params with daddr specified
> to pass-in the actual target on per-packet basis from neighbor
> layer. Detect these tunnels by inspecting the actual tunnel
> configuration instead.
> 
> Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels")
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Eric Dumazet <edumazet@google.com>
> ---
> Should go to 3.14-stable too.

Good catch.

Applied and queued up for -stable, thank you.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels
  2014-05-15 17:35     ` Timo Teras
@ 2014-05-16  3:29       ` David Miller
  2014-05-16  5:34         ` [PATCH net,v2] ipv4: ip_tunnels: disable " Timo Teräs
  0 siblings, 1 reply; 8+ messages in thread
From: David Miller @ 2014-05-16  3:29 UTC (permalink / raw)
  To: timo.teras; +Cc: netdev, therbert, edumazet

From: Timo Teras <timo.teras@iki.fi>
Date: Thu, 15 May 2014 20:35:35 +0300

> I meant tunnel->parms.iph.daddr here.  tunnel->daddr does not even
> exist. I wonder which tree I was testing. I'll retry tests and send new
> patch.

Oops, let me know when you have this ready :-/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH net,v2] ipv4: ip_tunnels: disable cache for nbma gre tunnels
  2014-05-16  3:29       ` David Miller
@ 2014-05-16  5:34         ` Timo Teräs
  2014-05-16 20:59           ` David Miller
  0 siblings, 1 reply; 8+ messages in thread
From: Timo Teräs @ 2014-05-16  5:34 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: Timo Teräs, Tom Herbert, Eric Dumazet

The connected check fails to check for ip_gre nbma mode tunnels
properly. ip_gre creates temporary tnl_params with daddr specified
to pass-in the actual target on per-packet basis from neighbor
layer. Detect these tunnels by inspecting the actual tunnel
configuration.

Minimal test case:
 ip route add 192.168.1.1/32 via 10.0.0.1
 ip route add 192.168.1.2/32 via 10.0.0.2
 ip tunnel add nbma0 mode gre key 1 tos c0
 ip addr add 172.17.0.0/16 dev nbma0
 ip link set nbma0 up
 ip neigh add 172.17.0.1 lladdr 192.168.1.1 dev nbma0
 ip neigh add 172.17.0.2 lladdr 192.168.1.2 dev nbma0
 ping 172.17.0.1
 ping 172.17.0.2

The second ping should be going to 192.168.1.2 and head 10.0.0.2;
but cached gre tunnel level route is used and it's actually going
to 192.168.1.1 via 10.0.0.1.

The lladdr's need to go to separate dst for the bug to trigger.
Test case uses separate route entries, but this can also happen
when the route entry is same: if there is a nexthop exception or
the GRE tunnel is IPsec'ed in which case the dst points to xfrm
bundle unique to the gre lladdr.

Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels")
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Cc: Tom Herbert <therbert@google.com>
Cc: Eric Dumazet <edumazet@google.com>
---
Should go to 3.14-stable too.

 net/ipv4/ip_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e77381d..4eb9196 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -538,9 +538,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	unsigned int max_headroom;	/* The extra header space needed */
 	__be32 dst;
 	int err;
-	bool connected = true;
+	bool connected;
 
 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	connected = (tunnel->parms.iph.daddr != 0);
 
 	dst = tnl_params->daddr;
 	if (dst == 0) {
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH net,v2] ipv4: ip_tunnels: disable cache for nbma gre tunnels
  2014-05-16  5:34         ` [PATCH net,v2] ipv4: ip_tunnels: disable " Timo Teräs
@ 2014-05-16 20:59           ` David Miller
  0 siblings, 0 replies; 8+ messages in thread
From: David Miller @ 2014-05-16 20:59 UTC (permalink / raw)
  To: timo.teras; +Cc: netdev, therbert, edumazet

From: Timo Teräs <timo.teras@iki.fi>
Date: Fri, 16 May 2014 08:34:39 +0300

> The connected check fails to check for ip_gre nbma mode tunnels
> properly. ip_gre creates temporary tnl_params with daddr specified
> to pass-in the actual target on per-packet basis from neighbor
> layer. Detect these tunnels by inspecting the actual tunnel
> configuration.
> 
> Minimal test case:
>  ip route add 192.168.1.1/32 via 10.0.0.1
>  ip route add 192.168.1.2/32 via 10.0.0.2
>  ip tunnel add nbma0 mode gre key 1 tos c0
>  ip addr add 172.17.0.0/16 dev nbma0
>  ip link set nbma0 up
>  ip neigh add 172.17.0.1 lladdr 192.168.1.1 dev nbma0
>  ip neigh add 172.17.0.2 lladdr 192.168.1.2 dev nbma0
>  ping 172.17.0.1
>  ping 172.17.0.2
> 
> The second ping should be going to 192.168.1.2 and head 10.0.0.2;
> but cached gre tunnel level route is used and it's actually going
> to 192.168.1.1 via 10.0.0.1.
> 
> The lladdr's need to go to separate dst for the bug to trigger.
> Test case uses separate route entries, but this can also happen
> when the route entry is same: if there is a nexthop exception or
> the GRE tunnel is IPsec'ed in which case the dst points to xfrm
> bundle unique to the gre lladdr.
> 
> Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels")
> Signed-off-by: Timo Teräs <timo.teras@iki.fi>

Yeah, this one compiles :-)

Applied and queued up for -stable, thanks!

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-05-16 20:59 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-02 19:48 [PATCH 1/2 v3] ipv4: Cache dst in tunnels Tom Herbert
2014-04-24 12:16 ` [1/2,v3] " Timo Teras
2014-05-15  8:35   ` [PATCH] ipv4: ip_tunnels: disable dst cache for nbma gre tunnels Timo Teräs
2014-05-15 17:35     ` Timo Teras
2014-05-16  3:29       ` David Miller
2014-05-16  5:34         ` [PATCH net,v2] ipv4: ip_tunnels: disable " Timo Teräs
2014-05-16 20:59           ` David Miller
2014-05-16  3:28     ` [PATCH] ipv4: ip_tunnels: disable dst " David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).