All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] udp4: Don't take socket reference in receive path
@ 2014-02-06 19:58 Tom Herbert
  2014-02-06 20:58 ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Tom Herbert @ 2014-02-06 19:58 UTC (permalink / raw)
  To: davem, netdev, edumazet

The reference counting in the UDP receive path is quite expensive for
a socket that is shared amongst CPUs. This is probably true for normal
sockets, but really is painful when just using the socket for
receive encapsulation.

udp4_lib_lookup always takes a socket reference, and we also put back
the reference after calling udp_queue_rcv_skb in the normal receive
path, so the need for taking the reference seems to be to hold the
socket after doing rcu_read_unlock. This patch modifies udp_lib_lookup
to optionally take a reference and is always called with rcu_read_lock.
In udp4_lib_rcv we call lib_lookup and udp_queue_rcv under the
rcu_read_lock but without having taken the reference.

Requesting comments because I suspect there are nuances to this!

Signed-off-by: Tom Herbert <therbert@google.com>
---
 net/ipv4/udp.c | 90 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 48d8cb2..6043a2f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -424,7 +424,8 @@ static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
 static struct sock *udp4_lib_lookup2(struct net *net,
 		__be32 saddr, __be16 sport,
 		__be32 daddr, unsigned int hnum, int dif,
-		struct udp_hslot *hslot2, unsigned int slot2)
+		struct udp_hslot *hslot2, unsigned int slot2,
+		bool get_ref)
 {
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
@@ -461,12 +462,20 @@ begin:
 	if (get_nulls_value(node) != slot2)
 		goto begin;
 	if (result) {
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score2(result, net, saddr, sport,
-				  daddr, hnum, dif) < badness)) {
-			sock_put(result);
-			goto begin;
+		if (get_ref) {
+			if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+				result = NULL;
+			else if (unlikely(compute_score2(result, net, saddr, sport,
+					  daddr, hnum, dif) < badness)) {
+				sock_put(result);
+				goto begin;
+			}
+		} else {
+			if (unlikely(atomic_read(&result->sk_refcnt) == 0))
+				result = NULL;
+			else if (unlikely(compute_score2(result, net, saddr, sport,
+					  daddr, hnum, dif) < badness))
+				goto begin;
 		}
 	}
 	return result;
@@ -475,9 +484,10 @@ begin:
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
-struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+/* called with rcu_read_lock() */
+struct sock *___udp4_lib_lookup(struct net *net, __be32 saddr,
 		__be16 sport, __be32 daddr, __be16 dport,
-		int dif, struct udp_table *udptable)
+		int dif, struct udp_table *udptable, bool get_ref)
 {
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
@@ -487,7 +497,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
-	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
@@ -497,7 +506,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 		result = udp4_lib_lookup2(net, saddr, sport,
 					  daddr, hnum, dif,
-					  hslot2, slot2);
+					  hslot2, slot2, get_ref);
 		if (!result) {
 			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 			slot2 = hash2 & udptable->mask;
@@ -507,9 +516,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 
 			result = udp4_lib_lookup2(net, saddr, sport,
 						  htonl(INADDR_ANY), hnum, dif,
-						  hslot2, slot2);
+						  hslot2, slot2, get_ref);
 		}
-		rcu_read_unlock();
 		return result;
 	}
 begin:
@@ -543,28 +551,50 @@ begin:
 		goto begin;
 
 	if (result) {
-		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
-			result = NULL;
-		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
-				  daddr, dport, dif) < badness)) {
-			sock_put(result);
-			goto begin;
+		if (get_ref) {
+			if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+				result = NULL;
+			else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+					  daddr, dport, dif) < badness)) {
+				sock_put(result);
+				goto begin;
+			}
+		} else {
+			if (unlikely(atomic_read(&result->sk_refcnt) == 0))
+				result = NULL;
+			else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+					  daddr, dport, dif) < badness))
+				goto begin;
 		}
 	}
+	return result;
+}
+
+struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+		__be16 sport, __be32 daddr, __be16 dport,
+		int dif, struct udp_table *udptable)
+{
+	struct sock *result;
+
+	rcu_read_lock();
+	result = ___udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udptable, true);
 	rcu_read_unlock();
+
 	return result;
 }
+
 EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
 
+/* called with rcu_read_lock() */
 static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 						 __be16 sport, __be16 dport,
 						 struct udp_table *udptable)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 
-	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+	return ___udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
 				 iph->daddr, dport, inet_iif(skb),
-				 udptable);
+				 udptable, false);
 }
 
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
@@ -1755,19 +1785,21 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		if (ret > 0)
 			return -ret;
 		return 0;
-	} else {
-		if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
-			return __udp4_lib_mcast_deliver(net, skb, uh,
-					saddr, daddr, udptable);
-
-		sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 	}
 
+
+	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+		return __udp4_lib_mcast_deliver(net, skb, uh,
+				saddr, daddr, udptable);
+
+	rcu_read_lock();
+	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+
 	if (sk != NULL) {
 		int ret;
 
 		ret = udp_queue_rcv_skb(sk, skb);
-		sock_put(sk);
+		rcu_read_unlock();
 
 		/* a return value > 0 means to resubmit the input, but
 		 * it wants the return to be -protocol, or 0
@@ -1777,6 +1809,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		return 0;
 	}
 
+	rcu_read_unlock();
+
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 		goto drop;
 	nf_reset(skb);
-- 
1.9.0.rc1.175.g0b1dcb5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH] udp4: Don't take socket reference in receive path
  2014-02-06 19:58 [RFC PATCH] udp4: Don't take socket reference in receive path Tom Herbert
@ 2014-02-06 20:58 ` Eric Dumazet
  2014-02-06 22:40   ` Tom Herbert
  0 siblings, 1 reply; 4+ messages in thread
From: Eric Dumazet @ 2014-02-06 20:58 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, edumazet

On Thu, 2014-02-06 at 11:58 -0800, Tom Herbert wrote:
> The reference counting in the UDP receive path is quite expensive for
> a socket that is shared amongst CPUs. This is probably true for normal
> sockets, but really is painful when just using the socket for
> receive encapsulation.
> 
> udp4_lib_lookup always takes a socket reference, and we also put back
> the reference after calling udp_queue_rcv_skb in the normal receive
> path, so the need for taking the reference seems to be to hold the
> socket after doing rcu_read_unlock. This patch modifies udp_lib_lookup
> to optionally take a reference and is always called with rcu_read_lock.
> In udp4_lib_rcv we call lib_lookup and udp_queue_rcv under the
> rcu_read_lock but without having taken the reference.
> 
> Requesting comments because I suspect there are nuances to this!
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---

Unfortunately this can't work.

When I did the RCU implementation for TCP/UDP, we chose to use
SLAB_DESTROY_BY_RCU.

This meant we have to take a reference, then check again the keys for
the lookup.

If we remove SLAB_DESTROY_BY_RCU, we kill performance for short lived
sessions, because of call_rcu() added latencies.

(One UDP socket is about 1024 bytes in memory, call_rcu() grace period
is throwing away 1024 bytes from cpu caches)

Sure, in your case you know your udp sessions are not short lived,
but many applications used UDP for DNS lookups, using few packets per
socket.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH] udp4: Don't take socket reference in receive path
  2014-02-06 20:58 ` Eric Dumazet
@ 2014-02-06 22:40   ` Tom Herbert
  2014-02-06 23:08     ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Tom Herbert @ 2014-02-06 22:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Linux Netdev List, Eric Dumazet

On Thu, Feb 6, 2014 at 12:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2014-02-06 at 11:58 -0800, Tom Herbert wrote:
>> The reference counting in the UDP receive path is quite expensive for
>> a socket that is shared amongst CPUs. This is probably true for normal
>> sockets, but really is painful when just using the socket for
>> receive encapsulation.
>>
>> udp4_lib_lookup always takes a socket reference, and we also put back
>> the reference after calling udp_queue_rcv_skb in the normal receive
>> path, so the need for taking the reference seems to be to hold the
>> socket after doing rcu_read_unlock. This patch modifies udp_lib_lookup
>> to optionally take a reference and is always called with rcu_read_lock.
>> In udp4_lib_rcv we call lib_lookup and udp_queue_rcv under the
>> rcu_read_lock but without having taken the reference.
>>
>> Requesting comments because I suspect there are nuances to this!
>>
>> Signed-off-by: Tom Herbert <therbert@google.com>
>> ---
>
> Unfortunately this can't work.
>
> When I did the RCU implementation for TCP/UDP, we chose to use
> SLAB_DESTROY_BY_RCU.
>
> This meant we have to take a reference, then check again the keys for
> the lookup.
>
> If we remove SLAB_DESTROY_BY_RCU, we kill performance for short lived
> sessions, because of call_rcu() added latencies.
>
Thanks for the explanation.

> (One UDP socket is about 1024 bytes in memory, call_rcu() grace period
> is throwing away 1024 bytes from cpu caches)
>
> Sure, in your case you know your udp sessions are not short lived,
but many applications use UDP for DNS lookups, using few packets per
> socket.
>
The rationale for SLAB_DESTROY_BY_RCU might be different for UDP than
TCP. For instance, in the DNS example small connected UDP flows are
more an issue on the client, the server (which is likely to have much
greater load) should be using unconnected sockets.

In any case, I am still looking for a way to address this. Like I said
in the commit log, this per packet cost for UDP processing is far too
high at least in encapsulation path. I thought about extending
SO_REUSEPORT to provide CPU affinity but that seems like overkill
with its own performance implications. Alternatively, we could have
fast path for the encapsulation using UDP offload model which bypass
sockets completely which seems unpleasant.

>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH] udp4: Don't take socket reference in receive path
  2014-02-06 22:40   ` Tom Herbert
@ 2014-02-06 23:08     ` Eric Dumazet
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2014-02-06 23:08 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David Miller, Linux Netdev List, Eric Dumazet

On Thu, 2014-02-06 at 14:40 -0800, Tom Herbert wrote:

> >
> The rationale for SLAB_DESTROY_BY_RCU might be different for UDP than
> TCP. For instance, in the DNS example small connected UDP flows are
> more an issue on the client, the server (which is likely to have much
> greater load) should be using unconnected sockets.
> 

I know some servers have a non connected UDP socket acting as a
'listener' and instantiating a new connected socket for every incoming flow.

You cannot really know what model is used.

> In any case, I am still looking for a way to address this. Like I said
> in the commit log, this per packet cost for UDP processing is far too
> high at least in encapsulation path. I thought about extending
> SO__REUSEPORT to provide CPU affinity but that seems like overkill
> with its own performance implications. Alternatively, we could have
> fast path for the encapsulation using UDP offload model which bypass
> sockets completely which seems unpleasant.

Note that you can solve this before UDP layer, in GRO for example.

If layers before UDP already provide skb->sk (early demux), this could
be a socket that is plainly using call_rcu() for its destruction, and
you do not need to touch socket refcount.

Alternative would be to use a percpu refcnt for these special 'sockets'
that potentially are receiving XX millions frames per second using 48
cpus...

Kent Overstreet designed Percpu refcounts, maybe you should take a look
at this.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-02-06 23:09 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-02-06 19:58 [RFC PATCH] udp4: Don't take socket reference in receive path Tom Herbert
2014-02-06 20:58 ` Eric Dumazet
2014-02-06 22:40   ` Tom Herbert
2014-02-06 23:08     ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.