netdev.vger.kernel.org archive mirror
* [PATCH v2] ipv4: percpu nh_rth_output cache
@ 2012-07-31 15:45 Eric Dumazet
  2012-07-31 20:10 ` Alexander Duyck
  2012-07-31 21:43 ` David Miller
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Dumazet @ 2012-07-31 15:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Alexander Duyck

From: Eric Dumazet <edumazet@google.com>

The input path mostly runs under RCU and does not touch the dst refcount.

But the output path, on forwarding or UDP workloads, hits the dst
refcount hard, and we get a lot of false sharing, for example in
ipv4_mtu() when reading rt->rt_pmtu.

Using a percpu cache for nh_rth_output gives a nice performance
increase at a small cost.
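
In condensed form, the cached output lookup added by this patch looks
like this (the helper name is invented for illustration only; the real
code is the __mkroute_output() hunk below). Each cpu gets its own
cached rtable, so the refcount and rt_pmtu accesses stay on cpu-local
cache lines:

/* Per-cpu cached output route lookup, condensed from the patch. */
static struct rtable *nh_output_cache_lookup(struct fib_nh *nh)
{
	struct rtable __rcu **prth;
	struct rtable *rth;

	if (!nh->nh_pcpu_rth_output)	/* alloc_percpu() may have failed */
		return NULL;

	prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
	rth = rcu_dereference(*prth);
	if (rt_cache_valid(rth)) {
		dst_hold(&rth->dst);	/* refcount taken on a cpu-local rtable:
					 * no cross-cpu cache line bouncing */
		return rth;
	}
	return NULL;			/* miss: caller builds and caches a new rtable */
}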

udpflood test on my 24-cpu machine (dummy0 output device);
24 processes are started and each sends 1,000,000 UDP frames:

before : 5.24 s
after : 2.06 s
For reference, time on linux-3.5 : 6.60 s
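
(The udpflood source is not part of this mail; a minimal sender along
these lines is a hypothetical stand-in, with a made-up destination
address, that exercises the same output path from every cpu:)

/* Hypothetical stand-in for one udpflood process: blast UDP frames at
 * an address assumed to be routed through dummy0.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	char payload[64] = { 0 };		/* frame size is arbitrary here */
	struct sockaddr_in dst;
	int i, fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);		/* discard port */
	inet_pton(AF_INET, "10.2.2.2", &dst.sin_addr);

	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
	for (i = 0; i < 1000000; i++)		/* 1,000,000 frames per process */
		send(fd, payload, sizeof(payload), 0);
	close(fd);
	return 0;
}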

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
v2: use __this_cpu_ptr() and slightly better annotations to avoid
ugly casts

On top of the previous "ipv4: Restore old dst_free() behavior" patch

We can probably remove all the padding in struct dst_entry

 include/net/ip_fib.h     |    3 ++-
 net/ipv4/fib_semantics.c |   20 +++++++++++++++++++-
 net/ipv4/route.c         |   18 +++++++++++++-----
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e521a03..e331746 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -21,6 +21,7 @@
 #include <linux/rcupdate.h>
 #include <net/fib_rules.h>
 #include <net/inetpeer.h>
+#include <linux/percpu.h>
 
 struct fib_config {
 	u8			fc_dst_len;
@@ -81,7 +82,7 @@ struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
-	struct rtable __rcu	*nh_rth_output;
+	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 625cf18..fe2ca02 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -176,6 +176,23 @@ static void rt_nexthop_free(struct rtable __rcu **rtp)
 	dst_free(&rt->dst);
 }
 
+static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+	int cpu;
+
+	if (!rtp)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rtable *rt;
+
+		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+		if (rt)
+			dst_free(&rt->dst);
+	}
+	free_percpu(rtp);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -186,7 +203,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
-		rt_nexthop_free(&nexthop_nh->nh_rth_output);
+		rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_nexthop_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
@@ -817,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
+		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
 	} endfor_nexthops(fi)
 
 	if (cfg->fc_mx) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2bd1074..4f6276c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1206,11 +1206,15 @@ static inline void rt_free(struct rtable *rt)
 
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
-	struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output;
+	struct rtable *orig, *prev, **p;
 
-	if (rt_is_input_route(rt))
+	if (rt_is_input_route(rt)) {
 		p = (struct rtable **)&nh->nh_rth_input;
-
+	} else {
+		if (!nh->nh_pcpu_rth_output)
+			goto nocache;
+		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
+	}
 	orig = *p;
 
 	prev = cmpxchg(p, orig, rt);
@@ -1223,6 +1227,7 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 		 * unsuccessful at storing this route into the cache
 		 * we really need to set it.
 		 */
+nocache:
 		rt->dst.flags |= DST_NOCACHE;
 	}
 }
@@ -1749,8 +1754,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	fnhe = NULL;
 	if (fi) {
 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
-		if (!fnhe) {
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output);
+		if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) {
+			struct rtable __rcu **prth;
+
+			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
+			rth = rcu_dereference(*prth);
 			if (rt_cache_valid(rth)) {
 				dst_hold(&rth->dst);
 				return rth;
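
The rt_cache_route() hunk keeps the lockless single-slot scheme: the
new route is installed with one cmpxchg(), and a lost race is not
retried, the route is simply marked DST_NOCACHE. The same idea in
standalone form (userspace C11 sketch, not kernel code):

/* Single-slot, lockless cache: install only if the slot is unchanged
 * since we read it; on failure the caller treats the entry as uncached.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct route { int id; };			/* stand-in for struct rtable */

static _Atomic(struct route *) slot;		/* the patch has one such slot per cpu */

static bool cache_route(struct route *rt)
{
	struct route *expected = atomic_load(&slot);

	/* Succeeds only if no other writer changed the slot in between. */
	return atomic_compare_exchange_strong(&slot, &expected, rt);
}

int main(void)
{
	static struct route r1 = { 1 }, r2 = { 2 };

	printf("%d\n", cache_route(&r1));	/* 1: installed into the empty slot */
	printf("%d\n", cache_route(&r2));	/* 1: slot unchanged since our read */
	return 0;
}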


* Re: [PATCH v2] ipv4: percpu nh_rth_output cache
  2012-07-31 15:45 [PATCH v2] ipv4: percpu nh_rth_output cache Eric Dumazet
@ 2012-07-31 20:10 ` Alexander Duyck
  2012-07-31 21:43 ` David Miller
  1 sibling, 0 replies; 3+ messages in thread
From: Alexander Duyck @ 2012-07-31 20:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On 07/31/2012 08:45 AM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> The input path mostly runs under RCU and does not touch the dst refcount.
>
> But the output path, on forwarding or UDP workloads, hits the dst
> refcount hard, and we get a lot of false sharing, for example in
> ipv4_mtu() when reading rt->rt_pmtu.
>
> Using a percpu cache for nh_rth_output gives a nice performance
> increase at a small cost.
>
> udpflood test on my 24-cpu machine (dummy0 output device);
> 24 processes are started and each sends 1,000,000 UDP frames:
>
> before : 5.24 s
> after : 2.06 s
> For reference, time on linux-3.5 : 6.60 s
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> v2: use __this_cpu_ptr() and slightly better annotations to avoid
> ugly casts
>
> On top of the previous "ipv4: Restore old dst_free() behavior" patch
>
> We can probably remove all the padding in struct dst_entry
>
I've done some quick testing, and it looks like it has little to no
effect on routing performance on my system, but for UDP workloads it is
making a huge difference.  I just ran a simple test with 16 sessions of
netperf, all sending small UDP packets.  Without your patch it runs at
just over 2.7 Mpps; with your patch it runs at over 10.5 Mpps.

Tested-by: Alexander Duyck <alexander.h.duyck@intel.com>


* Re: [PATCH v2] ipv4: percpu nh_rth_output cache
  2012-07-31 15:45 [PATCH v2] ipv4: percpu nh_rth_output cache Eric Dumazet
  2012-07-31 20:10 ` Alexander Duyck
@ 2012-07-31 21:43 ` David Miller
  1 sibling, 0 replies; 3+ messages in thread
From: David Miller @ 2012-07-31 21:43 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, alexander.h.duyck

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 31 Jul 2012 17:45:30 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> The input path mostly runs under RCU and does not touch the dst refcount.
> 
> But the output path, on forwarding or UDP workloads, hits the dst
> refcount hard, and we get a lot of false sharing, for example in
> ipv4_mtu() when reading rt->rt_pmtu.
> 
> Using a percpu cache for nh_rth_output gives a nice performance
> increase at a small cost.
> 
> udpflood test on my 24-cpu machine (dummy0 output device);
> 24 processes are started and each sends 1,000,000 UDP frames:
> 
> before : 5.24 s
> after : 2.06 s
> For reference, time on linux-3.5 : 6.60 s
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied.

However, I think fib_create_info() should fail if its alloc_percpu()
call fails, instead of having funny logic in rt_cache_route().
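
Roughly this, as an untested sketch, assuming fib_create_info()'s
existing err/failure handling:

	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
		if (!nexthop_nh->nh_pcpu_rth_output) {
			err = -ENOMEM;
			goto failure;	/* free_fib_info() copes with the
					 * percpu areas allocated so far */
		}
	} endfor_nexthops(fi)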
