All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ipv4: move rt_genid to different cache line
@ 2014-01-02 20:00 Tom Herbert
  2014-01-03  0:38 ` Eric Dumazet
  2014-01-04  0:48 ` David Miller
  0 siblings, 2 replies; 5+ messages in thread
From: Tom Herbert @ 2014-01-02 20:00 UTC (permalink / raw)
  To: davem, netdev, eric.dumazet

Running a simple netperf TCP_RR test with 200 clients shows that
ipv4_dst_check is high on the list of functions in 'perf top'. The
pertinent action in this function is in the call to rt_is_expired
which checks the route genid (rt->rt_gentid) against the global value.
rt_genid is in the same cacheline as dst->__refcnt which is causing
false sharing.

This fix moves rt_genid into the first cacheline of the dst structure.
The dst structue is explicitly packed for cacheline optimization, so to
make room for the genid, I moved xfrm to cacheline with __refcnt (under
the assumption it is less likely to be in the critical path).

I don't believe there is an issue in ip6_dst_check since rt6i_genid
and the dst are in separate cachelines already (struct rt6_info).

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/net/dst.h   | 10 ++++++----
 include/net/route.h |  1 -
 net/ipv4/route.c    | 13 +++++++------
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 77eb53f..91777b2 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -39,10 +39,9 @@ struct dst_entry {
 	unsigned long           expires;
 	struct dst_entry	*path;
 	struct dst_entry	*from;
-#ifdef CONFIG_XFRM
-	struct xfrm_state	*xfrm;
-#else
-	void			*__pad1;
+	int			genid;
+#ifdef CONFIG_64BIT
+	int			__pad0;
 #endif
 	int			(*input)(struct sk_buff *);
 	int			(*output)(struct sk_buff *);
@@ -104,6 +103,9 @@ struct dst_entry {
 		struct rt6_info		*rt6_next;
 		struct dn_route __rcu	*dn_next;
 	};
+#ifdef CONFIG_XFRM
+	struct xfrm_state	*xfrm;
+#endif
 };
 
 u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
diff --git a/include/net/route.h b/include/net/route.h
index 638e3eb..72b4446 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -46,7 +46,6 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
 	__u8			rt_is_input;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f8da282..8936da8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -436,7 +436,7 @@ static inline int ip_rt_proc_init(void)
 
 static inline bool rt_is_expired(const struct rtable *rth)
 {
-	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+	return rth->dst.genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 }
 
 void rt_cache_flush(struct net *net)
@@ -1459,8 +1459,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->dst.tclassid = itag;
 #endif
 	rth->dst.output = ip_rt_bug;
+	rth->dst.genid = rt_genid_ipv4(dev_net(dev));
 
-	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_is_input= 1;
@@ -1591,7 +1591,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
+
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_is_input = 1;
@@ -1603,6 +1603,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
+	rth->dst.genid = rt_genid_ipv4(dev_net(rth->dst.dev));
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
 	skb_dst_set(skb, &rth->dst);
@@ -1761,8 +1762,8 @@ local_input:
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	rth->dst.tclassid = itag;
 #endif
+	rth->dst.genid = rt_genid_ipv4(net);
 
-	rth->rt_genid = rt_genid_ipv4(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_is_input = 1;
@@ -1950,8 +1951,8 @@ add:
 		return ERR_PTR(-ENOBUFS);
 
 	rth->dst.output = ip_output;
+	rth->dst.genid = rt_genid_ipv4(dev_net(dev_out));
 
-	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_is_input = 0;
@@ -2228,12 +2229,12 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		new->dev = ort->dst.dev;
 		if (new->dev)
 			dev_hold(new->dev);
+		new->genid = rt_genid_ipv4(net);
 
 		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
-		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_gateway = ort->rt_gateway;
-- 
1.8.5.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] ipv4: move rt_genid to different cache line
  2014-01-02 20:00 [PATCH] ipv4: move rt_genid to different cache line Tom Herbert
@ 2014-01-03  0:38 ` Eric Dumazet
  2014-01-03 18:01   ` Tom Herbert
  2014-01-04  0:48 ` David Miller
  1 sibling, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2014-01-03  0:38 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev

On Thu, 2014-01-02 at 12:00 -0800, Tom Herbert wrote:
> Running a simple netperf TCP_RR test with 200 clients shows that
> ipv4_dst_check is high on the list of functions in 'perf top'. The
> pertinent action in this function is in the call to rt_is_expired
> which checks the route genid (rt->rt_gentid) against the global value.
> rt_genid is in the same cacheline as dst->__refcnt which is causing
> false sharing.
> 
> This fix moves rt_genid into the first cacheline of the dst structure.
> The dst structue is explicitly packed for cacheline optimization, so to
> make room for the genid, I moved xfrm to cacheline with __refcnt (under
> the assumption it is less likely to be in the critical path).

This looks like a difficult to check assumption to me ....

rcu_head surely could be moved without performance impact.

Again, optimizing network stack for 200 TCP_RR special case is very
questionable. The dst must be refcounted because of special prequeue
mode, where a thread is blocked in a recvmsg() call.

Any modern application handling lot of sockets uses poll()/epoll()
anyway and prequeue is disabled : No refcount is taken, as the dst is
released (no refcount touch because of RCU) before the skb is queued
into socket receive queue.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ipv4: move rt_genid to different cache line
  2014-01-03  0:38 ` Eric Dumazet
@ 2014-01-03 18:01   ` Tom Herbert
  2014-01-03 20:04     ` Eric Dumazet
  0 siblings, 1 reply; 5+ messages in thread
From: Tom Herbert @ 2014-01-03 18:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Linux Netdev List

On Thu, Jan 2, 2014 at 4:38 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2014-01-02 at 12:00 -0800, Tom Herbert wrote:
>> Running a simple netperf TCP_RR test with 200 clients shows that
>> ipv4_dst_check is high on the list of functions in 'perf top'. The
>> pertinent action in this function is in the call to rt_is_expired
>> which checks the route genid (rt->rt_gentid) against the global value.
>> rt_genid is in the same cacheline as dst->__refcnt which is causing
>> false sharing.
>>
>> This fix moves rt_genid into the first cacheline of the dst structure.
>> The dst structue is explicitly packed for cacheline optimization, so to
>> make room for the genid, I moved xfrm to cacheline with __refcnt (under
>> the assumption it is less likely to be in the critical path).
>
> This looks like a difficult to check assumption to me ....
>
> rcu_head surely could be moved without performance impact.
>
Yes, I had thought about that. It's two pointers so this will generate
a extra hole, probably not worth moving it for 64 bit pointer case I
suppose. I can try a patch for that.

> Again, optimizing network stack for 200 TCP_RR special case is very
> questionable. The dst must be refcounted because of special prequeue
> mode, where a thread is blocked in a recvmsg() call.
>
If you can suggest an alternate test we should be running that
reflects the "real world", I can run that.

> Any modern application handling lot of sockets uses poll()/epoll()
> anyway and prequeue is disabled : No refcount is taken, as the dst is
> released (no refcount touch because of RCU) before the skb is queued
> into socket receive queue.
>
Notwithstanding that we can't feasibly audit a million applications to
verify that they follow the "correct" design, it is still common to
dispatch individual sockets to worker threads which will do blocking
reads for long lived requests.

It seems to me you have more of an issue with prequeue itself. Maybe
it is worth considering removing it to eliminate the disparate
behavior between read and poll+read, but then removing a long lived
optimization has is own pitfalls.

>
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ipv4: move rt_genid to different cache line
  2014-01-03 18:01   ` Tom Herbert
@ 2014-01-03 20:04     ` Eric Dumazet
  0 siblings, 0 replies; 5+ messages in thread
From: Eric Dumazet @ 2014-01-03 20:04 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, Linux Netdev List

On Fri, 2014-01-03 at 10:01 -0800, Tom Herbert wrote:

> It seems to me you have more of an issue with prequeue itself. Maybe
> it is worth considering removing it to eliminate the disparate
> behavior between read and poll+read, but then removing a long lived
> optimization has is own pitfalls.

The prequeue stuff made sense in the old model of a few concurrent flows
and one thread per flow, where the gain was worth it. It is still
interesting if you have a single flow trying to fully utilize the
bandwidth of a 20Gbp or 40Gbp link (no dst false sharing in this case)

Nowadays, the hit ratio of prequeue mode is almost 0 on servers where
network performance of concurrent flows is an issue.

https://code.google.com/p/sockperf/ is a good step using realistic
benchmark, trying to mimic the behavior of current high performance
programs.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ipv4: move rt_genid to different cache line
  2014-01-02 20:00 [PATCH] ipv4: move rt_genid to different cache line Tom Herbert
  2014-01-03  0:38 ` Eric Dumazet
@ 2014-01-04  0:48 ` David Miller
  1 sibling, 0 replies; 5+ messages in thread
From: David Miller @ 2014-01-04  0:48 UTC (permalink / raw)
  To: therbert; +Cc: netdev, eric.dumazet

From: Tom Herbert <therbert@google.com>
Date: Thu, 2 Jan 2014 12:00:23 -0800 (PST)

> Running a simple netperf TCP_RR test with 200 clients shows that
> ipv4_dst_check is high on the list of functions in 'perf top'. The
> pertinent action in this function is in the call to rt_is_expired
> which checks the route genid (rt->rt_gentid) against the global value.
> rt_genid is in the same cacheline as dst->__refcnt which is causing
> false sharing.
> 
> This fix moves rt_genid into the first cacheline of the dst structure.
> The dst structue is explicitly packed for cacheline optimization, so to
> make room for the genid, I moved xfrm to cacheline with __refcnt (under
> the assumption it is less likely to be in the critical path).
> 
> I don't believe there is an issue in ip6_dst_check since rt6i_genid
> and the dst are in separate cachelines already (struct rt6_info).
> 
> Signed-off-by: Tom Herbert <therbert@google.com>

Can you just put the genid deeper into "struct rtable" to solve this?

It might work well to put it next to the uncached list.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-01-04  0:48 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-02 20:00 [PATCH] ipv4: move rt_genid to different cache line Tom Herbert
2014-01-03  0:38 ` Eric Dumazet
2014-01-03 18:01   ` Tom Herbert
2014-01-03 20:04     ` Eric Dumazet
2014-01-04  0:48 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.