netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] net: Cache route in IP tunnels
@ 2013-12-12  6:17 Tom Herbert
  2013-12-12  7:37 ` Eric Dumazet
  2013-12-13  7:53 ` Julian Anastasov
  0 siblings, 2 replies; 10+ messages in thread
From: Tom Herbert @ 2013-12-12  6:17 UTC (permalink / raw)
  To: davem, netdev

Avoid doing a route lookup on every packet being tunneled.

In ip_tunnel.c cache the route returned from ip_route_output if
the tunnel is "connected", that is all the routing parameters are
taken from tunnel parms for a packet. Specifically, not NBMA tunnel
and tos is from tunnel parms (not inner packet).

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/ip_tunnels.h |   3 ++
 net/ipv4/ip_tunnel.c     | 106 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 732f8c6..bde50fc 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -54,6 +54,9 @@ struct ip_tunnel {
 	int		hlen;		/* Precalculated header length */
 	int		mlink;
 
+	struct		dst_entry __rcu *dst_cache;
+	spinlock_t	dst_lock;
+
 	struct ip_tunnel_parm parms;
 
 	/* for SIT */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 90ff957..c4db0cc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -68,6 +68,50 @@ static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
 			 IP_TNL_HASH_BITS);
 }
 
+static inline void
+tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+{
+	struct dst_entry *old_dst;
+
+	spin_lock(&t->dst_lock);
+	old_dst = rcu_dereference_raw(t->dst_cache);
+	rcu_assign_pointer(t->dst_cache, dst);
+	dst_release(old_dst);
+	spin_unlock(&t->dst_lock);
+}
+
+static inline void
+tunnel_dst_reset(struct ip_tunnel *t)
+{
+	tunnel_dst_set(t, NULL);
+}
+
+static inline struct dst_entry *
+tunnel_dst_get(struct ip_tunnel *t)
+{
+	struct dst_entry *dst;
+
+	rcu_read_lock();
+	dst = rcu_dereference(t->dst_cache);
+	if (dst)
+		dst_hold(dst);
+	rcu_read_unlock();
+	return dst;
+}
+
+struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
+{
+	struct dst_entry *dst = tunnel_dst_get(t);
+
+	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		tunnel_dst_reset(t);
+		dst_release(dst);
+		return NULL;
+	}
+
+	return dst;
+}
+
 /* Often modified stats are per cpu, other are shared (netdev->stats) */
 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 						struct rtnl_link_stats64 *tot)
@@ -318,11 +362,9 @@ failed:
 	return ERR_PTR(err);
 }
 
-static inline struct rtable *ip_route_output_tunnel(struct net *net,
-						    struct flowi4 *fl4,
-						    int proto,
-						    __be32 daddr, __be32 saddr,
-						    __be32 key, __u8 tos, int oif)
+static inline void init_tunnel_flow(struct flowi4 *fl4, int proto,
+				    __be32 daddr, __be32 saddr,
+				    __be32 key, __u8 tos, int oif)
 {
 	memset(fl4, 0, sizeof(*fl4));
 	fl4->flowi4_oif = oif;
@@ -331,7 +373,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net,
 	fl4->flowi4_tos = tos;
 	fl4->flowi4_proto = proto;
 	fl4->fl4_gre_key = key;
-	return ip_route_output_key(net, fl4);
 }
 
 static int ip_tunnel_bind_dev(struct net_device *dev)
@@ -350,12 +391,11 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		struct rtable *rt;
 
-		rt = ip_route_output_tunnel(tunnel->net, &fl4,
-					    tunnel->parms.iph.protocol,
-					    iph->daddr, iph->saddr,
-					    tunnel->parms.o_key,
-					    RT_TOS(iph->tos),
-					    tunnel->parms.link);
+		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+				 iph->saddr, tunnel->parms.o_key,
+				 RT_TOS(iph->tos), tunnel->parms.link);
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
 			ip_rt_put(rt);
@@ -532,6 +572,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	unsigned int max_headroom;	/* The extra header space needed */
 	__be32 dst;
 	int err;
+	bool connected = true;
 
 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 
@@ -581,26 +622,39 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 #endif
 		else
 			goto tx_error;
+
+		connected = false;
 	}
 
 	tos = tnl_params->tos;
 	if (tos & 0x1) {
 		tos &= ~0x1;
-		if (skb->protocol == htons(ETH_P_IP))
+		if (skb->protocol == htons(ETH_P_IP)) {
 			tos = inner_iph->tos;
-		else if (skb->protocol == htons(ETH_P_IPV6))
+			connected = false;
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+			connected = false;
+		}
 	}
 
-	rt = ip_route_output_tunnel(tunnel->net, &fl4,
-				    protocol,
-				    dst, tnl_params->saddr,
-				    tunnel->parms.o_key,
-				    RT_TOS(tos),
-				    tunnel->parms.link);
-	if (IS_ERR(rt)) {
-		dev->stats.tx_carrier_errors++;
-		goto tx_error;
+	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+	if (connected)
+		rt = (struct rtable *)tunnel_dst_check(tunnel, 0);
+	else
+		rt = NULL;
+
+	if (!rt) {
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
+		if (IS_ERR(rt)) {
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
+		if (connected)
+			tunnel_dst_set(tunnel, dst_clone(&rt->dst));
 	}
 	if (rt->dst.dev == dev) {
 		ip_rt_put(rt);
@@ -696,6 +750,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 		if (set_mtu)
 			dev->mtu = mtu;
 	}
+	tunnel_dst_reset(t);
 	netdev_state_change(dev);
 }
 
@@ -1001,6 +1056,9 @@ int ip_tunnel_init(struct net_device *dev)
 	iph->version		= 4;
 	iph->ihl		= 5;
 
+	tunnel->dst_cache = NULL;
+	spin_lock_init(&tunnel->dst_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1015,6 +1073,8 @@ void ip_tunnel_uninit(struct net_device *dev)
 	/* fb_tunnel_dev will be unregisted in net-exit call. */
 	if (itn->fb_tunnel_dev != dev)
 		ip_tunnel_del(netdev_priv(dev));
+
+	tunnel_dst_reset(tunnel);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
-- 
1.8.5.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12  6:17 [PATCH] net: Cache route in IP tunnels Tom Herbert
@ 2013-12-12  7:37 ` Eric Dumazet
  2013-12-12 18:54   ` David Miller
  2013-12-13  7:53 ` Julian Anastasov
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2013-12-12  7:37 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev

On Wed, 2013-12-11 at 22:17 -0800, Tom Herbert wrote:
> Avoid doing a route lookup on every packet being tunneled.
> 
> In ip_tunnel.c cache the route returned from ip_route_output if
> the tunnel is "connected", that is all the routing parameters are
> taken from tunnel parms for a packet. Specifically, not NBMA tunnel
> and tos is from tunnel parms (not inner packet).
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---

Replacing per cpu dst (nh_pcpu_rth_output) with a single shared dst is
not going to be better on SMP.

Do you have any performance data using multiqueue NIC, with say 32 cpus
and 32 TX queues ?

I suspect a high false sharing on dst refcount.

Thanks

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12  7:37 ` Eric Dumazet
@ 2013-12-12 18:54   ` David Miller
  2013-12-12 20:19     ` Tom Herbert
  0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2013-12-12 18:54 UTC (permalink / raw)
  To: eric.dumazet; +Cc: therbert, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Dec 2013 23:37:18 -0800

> On Wed, 2013-12-11 at 22:17 -0800, Tom Herbert wrote:
>> Avoid doing a route lookup on every packet being tunneled.
>> 
>> In ip_tunnel.c cache the route returned from ip_route_output if
>> the tunnel is "connected", that is all the routing parameters are
>> taken from tunnel parms for a packet. Specifically, not NBMA tunnel
>> and tos is from tunnel parms (not inner packet).
>> 
>> Signed-off-by: Tom Herbert <therbert@google.com>
>> ---
> 
> Replacing per cpu dst (nh_pcpu_rth_output) with a single shared dst is
> not going to be better on SMP.
> 
> Do you have any performance data using multiqueue NIC, with say 32 cpus
> and 32 TX queues ?
> 
> I suspect a high false sharing on dst refcount.

First off I'm glad someone looked into this, I've been meaning to play
with this for a while.

Secondly, Eric's concern is valid but we should keep in mind that IPV6
tunnels do this kind of caching already so if it's undesirable we
should perhaps undo the caching there too.

But honestly I suspect that doing a lookup every packet is more
expensive than false sharing on a cache entry, but I'm happy to be
proven wrong :)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 18:54   ` David Miller
@ 2013-12-12 20:19     ` Tom Herbert
  2013-12-12 20:29       ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Tom Herbert @ 2013-12-12 20:19 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, Linux Netdev List

On Thu, Dec 12, 2013 at 10:54 AM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 11 Dec 2013 23:37:18 -0800
>
>> On Wed, 2013-12-11 at 22:17 -0800, Tom Herbert wrote:
>>> Avoid doing a route lookup on every packet being tunneled.
>>>
>>> In ip_tunnel.c cache the route returned from ip_route_output if
>>> the tunnel is "connected", that is all the routing parameters are
>>> taken from tunnel parms for a packet. Specifically, not NBMA tunnel
>>> and tos is from tunnel parms (not inner packet).
>>>
>>> Signed-off-by: Tom Herbert <therbert@google.com>
>>> ---
>>
>> Replacing per cpu dst (nh_pcpu_rth_output) with a single shared dst is
>> not going to be better on SMP.
>>
>> Do you have any performance data using multiqueue NIC, with say 32 cpus
>> and 32 TX queues ?
>>
>> I suspect a high false sharing on dst refcount.
>
> First off I'm glad someone looked into this, I've been meaning to play
> with this for a while.
>
> Secondly, Eric's concern is valid but we should keep in mind that IPV6
> tunnels do this kind of caching already so if it's undesirable we
> should perhaps undo the caching there too.
>
We could also use a per CPU dst cache in the tunnel. Might be
reasonable if people aren't configuring thousands of tunnels.

> But honestly I suspect that doing a lookup every packet is more
> expensive than false sharing on a cache entry, but I'm happy to be
> proven wrong :)

My performance runs were inconclusive, but I assumed that caching in
the tunnel would scale better with larger numbers of routes.

btw, I needed to disable gro on the tun interface. With it enabled,
performance in TCP_RR is abysmal. Looking into that.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 20:19     ` Tom Herbert
@ 2013-12-12 20:29       ` Eric Dumazet
  2013-12-12 20:56         ` Tom Herbert
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2013-12-12 20:29 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, Linux Netdev List

On Thu, 2013-12-12 at 12:19 -0800, Tom Herbert wrote:

> btw, I needed to disable gro on the tun interface. With it enabled,
> performance in TCP_RR is abysmal. Looking into that.

This is a known problem.

Given that Jerry is working on GRO I was not really wanting to fix
this...

That's basically a revert of
( http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=693019e90ca45d881109d32c0c6d29adf03f6447 ) 

diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 42ffbc8d65c6..dc7de784e934 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -111,7 +111,6 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 		skb->rxhash = 0;
 	skb_dst_drop(skb);
 	skb->vlan_tci = 0;
-	skb_set_queue_mapping(skb, 0);
 	skb->pkt_type = PACKET_HOST;
 	return 0;
 }

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 20:29       ` Eric Dumazet
@ 2013-12-12 20:56         ` Tom Herbert
  2013-12-12 21:11           ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Tom Herbert @ 2013-12-12 20:56 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Linux Netdev List

On Thu, Dec 12, 2013 at 12:29 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2013-12-12 at 12:19 -0800, Tom Herbert wrote:
>
>> btw, I needed to disable gro on the tun interface. With it enabled,
>> performance in TCP_RR is abysmal. Looking into that.
>
> This is a known problem.
>
> Given that Jerry is working on GRO I was not really wanting to fix
> this...
>
> Thats a basically revert of
> ( http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=693019e90ca45d881109d32c0c6d29adf03f6447 )

Reverting that patch doesn't fix the problem. We are hitting the
spinlock in gro_cells_receive a lot though.


> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index 42ffbc8d65c6..dc7de784e934 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -111,7 +111,6 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
>                 skb->rxhash = 0;
>         skb_dst_drop(skb);
>         skb->vlan_tci = 0;
> -       skb_set_queue_mapping(skb, 0);
>         skb->pkt_type = PACKET_HOST;
>         return 0;
>  }
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 20:56         ` Tom Herbert
@ 2013-12-12 21:11           ` Eric Dumazet
  2013-12-12 22:54             ` Tom Herbert
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2013-12-12 21:11 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, Linux Netdev List

On Thu, 2013-12-12 at 12:56 -0800, Tom Herbert wrote:

> Reverting that patch doesn't fix the problem. We are hitting the
> spinlock in gro_cells_receive a lot though.

Well, something broke then.

I was able to reach line rate at the time I did this work.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 21:11           ` Eric Dumazet
@ 2013-12-12 22:54             ` Tom Herbert
  2013-12-12 23:24               ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Tom Herbert @ 2013-12-12 22:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Linux Netdev List

On Thu, Dec 12, 2013 at 1:11 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2013-12-12 at 12:56 -0800, Tom Herbert wrote:
>
>> Reverting that patch doesn't fix the problem. We are hitting the
>> spinlock in gro_cells_receive a lot though.
>
> Well, something broke then.
>

The test I'm running is over a single tunnel between two hosts, so all
packets will be hitting the same RX queue since we only steer by
3-tuple hash. Clearing or not clearing hash will have no effect.

Nevertheless, I don't see how gro_cells_receive (called via
ip_tunnel_rcv) could ever have worked right for tunnels. By the
time this function is called, we've probably already spread the
packets from RX queue across multiple CPUs via RPS/RFS. So putting
them back on a queue based on RX queue would be a source of
contention.  It seems like we should be using another per CPU backlog.

> I was able to reach line rate at the time I did this work.
>
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12 22:54             ` Tom Herbert
@ 2013-12-12 23:24               ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2013-12-12 23:24 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, Linux Netdev List

On Thu, 2013-12-12 at 14:54 -0800, Tom Herbert wrote:

> 
> The test I'm running is over a single tunnel between two hosts, so all
> packets will be hitting the same RX queue since we only steer by
> 3-tuple hash. Clearing or not clearing hash will have no effect.

If a single RX queue is used, how can you have contention on a
spinlock ? A single CPU should be used.


> 
> Nevertheless, I don't see how gro_cells_receive (called via
> ip_tunnel_rcv) could ever have worked right for tunnels. By the
> time this function is called, we've probably already spread the
> packets from RX queue across multiple CPUs via RPS/RFS. So putting
> them back on a queue based on RX queue would be a source of
> contention.  It seems like we should be using another per CPU backlog.

This is explained in c9e6bc644e557338221e75c242ab12c275a67d1b changelog

I explicitly explained all this.

The NAPI _is_ the backlog. But you need to avoid crossing cpus,
obviously.

If you spread the load on multiple cpus, you need to instruct the
gro_cell layer to use a proper key to spread the load on different NAPI
cells.

At the time I did this work, I stated the best performance would be
reached without playing with RPS and these awful IPI. Scheduler guys
have hard time with RPS/RFS.

This is all moot, as Jerry is working to do the GRO without having this
extra layer.

And RPS being done _after_ GRO, if you receive traffic on a single RX
queue, you are stuck. Its actually easy to play on outer IP headers
to get proper hash dispersion.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] net: Cache route in IP tunnels
  2013-12-12  6:17 [PATCH] net: Cache route in IP tunnels Tom Herbert
  2013-12-12  7:37 ` Eric Dumazet
@ 2013-12-13  7:53 ` Julian Anastasov
  1 sibling, 0 replies; 10+ messages in thread
From: Julian Anastasov @ 2013-12-13  7:53 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev


	Hello,

On Wed, 11 Dec 2013, Tom Herbert wrote:

> +static inline void
> +tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
> +{
> +	struct dst_entry *old_dst;
> +
> +	spin_lock(&t->dst_lock);
> +	old_dst = rcu_dereference_raw(t->dst_cache);
> +	rcu_assign_pointer(t->dst_cache, dst);
> +	dst_release(old_dst);
> +	spin_unlock(&t->dst_lock);

	It seems the func is also called by ip_tunnel_update(),
do we need _bh locks in such case?

> +}
> +
> +static inline void
> +tunnel_dst_reset(struct ip_tunnel *t)
> +{
> +	tunnel_dst_set(t, NULL);
> +}
> +

...

> @@ -696,6 +750,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
>  		if (set_mtu)
>  			dev->mtu = mtu;
>  	}
> +	tunnel_dst_reset(t);
>  	netdev_state_change(dev);
>  }

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2013-12-13  7:54 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-12-12  6:17 [PATCH] net: Cache route in IP tunnels Tom Herbert
2013-12-12  7:37 ` Eric Dumazet
2013-12-12 18:54   ` David Miller
2013-12-12 20:19     ` Tom Herbert
2013-12-12 20:29       ` Eric Dumazet
2013-12-12 20:56         ` Tom Herbert
2013-12-12 21:11           ` Eric Dumazet
2013-12-12 22:54             ` Tom Herbert
2013-12-12 23:24               ` Eric Dumazet
2013-12-13  7:53 ` Julian Anastasov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).