* [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver.
@ 2014-07-14 22:30 David Held
  2014-07-14 22:30 ` [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
  2014-07-16  0:14 ` [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver David Miller
  0 siblings, 2 replies; 8+ messages in thread
From: David Held @ 2014-07-14 22:30 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Switch to using sk_nulls_for_each, which shortens the code and makes it
easier to update.

Signed-off-by: David Held <drheld@google.com>
---
 net/ipv4/udp.c | 48 ++++++++++----------------------
 net/ipv6/udp.c | 88 +++++++++++++++++++++++-----------------------------------
 2 files changed, 49 insertions(+), 87 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ac30e10..8089ba2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -1664,23 +1644,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			stack[count++] = sk;
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 		}
 	}
 	/*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c2bd28f..cade19b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -698,43 +698,26 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
+	struct inet_sock *inet = inet_sk(sk);
 
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -783,28 +766,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
 			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 		}
 	}
 	/*
-- 
2.0.0.526.g5318336


* [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
  2014-07-14 22:30 [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
@ 2014-07-14 22:30 ` David Held
  2014-07-15  8:33   ` Eric Dumazet
  2014-07-16  0:14 ` [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver David Miller
  1 sibling, 1 reply; 8+ messages in thread
From: David Held @ 2014-07-14 22:30 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Many multicast sources can have the same port which can result in a very
large list when hashing by port only. Hash by address and port instead
if this is the case. This makes multicast more similar to unicast.

On a 24-core machine receiving from 500 multicast sockets on the same
port, before this patch 80% of system CPU was used up by spin locking
and only ~25% of packets were successfully delivered.

With this patch, all packets are delivered and kernel overhead is ~8%
system CPU on spinlocks.

Signed-off-by: David Held <drheld@google.com>
---
 include/net/sock.h | 14 ++++++++++++++
 net/ipv4/udp.c     | 30 ++++++++++++++++++++----------
 net/ipv6/udp.c     | 29 +++++++++++++++++++----------
 3 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index cb84b2f..6734cab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8089ba2..b023a36 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1616,6 +1616,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1648,37 +1650,45 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = skb->dev->ifindex;
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+start_lookup:
+		hslot = &udp_table.hash2[hash2 & udp_table.mask];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_is_mcast_sock(net, sk,
 					uh->dest, daddr,
 					uh->source, saddr,
 					dif, hnum)) {
 			stack[count++] = sk;
+			sock_hold(sk);
 			if (unlikely(count == ARRAY_SIZE(stack))) {
 				flush_stack(stack, count, skb, ~0);
 				count = 0;
 			}
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index cade19b..d1c00c5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -741,6 +741,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -770,10 +771,19 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = inet6_iif(skb);
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum);
+		hash2 = udp6_portaddr_hash(net, daddr, hnum);
+start_lookup:
+		hslot = &udp_table.hash2[hash2 & udp_table.mask];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_v6_is_mcast_sock(net, sk,
 					   uh->dest, daddr,
 					   uh->source, saddr,
@@ -783,25 +793,24 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		     */
 		    (uh->check || udp_sk(sk)->no_check6_rx)) {
 			stack[count++] = sk;
+			sock_hold(sk);
 			if (unlikely(count == ARRAY_SIZE(stack))) {
 				flush_stack(stack, count, skb, ~0);
 				count = 0;
 			}
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
-- 
2.0.0.526.g5318336


* Re: [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
  2014-07-14 22:30 ` [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
@ 2014-07-15  8:33   ` Eric Dumazet
  2014-07-15 13:48     ` David Held
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Dumazet @ 2014-07-15  8:33 UTC (permalink / raw)
  To: David Held; +Cc: netdev, davem, willemb

On Mon, 2014-07-14 at 18:30 -0400, David Held wrote:
> Many multicast sources can have the same port which can result in a very
> large list when hashing by port only. Hash by address and port instead
> if this is the case. This makes multicast more similar to unicast.
> 
> On a 24-core machine receiving from 500 multicast sockets on the same
> port, before this patch 80% of system CPU was used up by spin locking
> and only ~25% of packets were successfully delivered.
> 
> With this patch, all packets are delivered and kernel overhead is ~8%
> system CPU on spinlocks.
> 
> Signed-off-by: David Held <drheld@google.com>
> ---
>  include/net/sock.h | 14 ++++++++++++++
>  net/ipv4/udp.c     | 30 ++++++++++++++++++++----------
>  net/ipv6/udp.c     | 29 +++++++++++++++++++----------
>  3 files changed, 53 insertions(+), 20 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index cb84b2f..6734cab 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
>  #define sk_for_each_bound(__sk, list) \
>  	hlist_for_each_entry(__sk, list, sk_bind_node)
>  
> +/**
> + * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @head:	the head for your list.
> + * @offset:	offset of hlist_node within the struct.
> + *
> + */
> +#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
> +	for (pos = (head)->first;					       \
> +	     (!is_a_nulls(pos)) &&					       \
> +		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
> +	     pos = pos->next)
> +
>  static inline struct user_namespace *sk_user_ns(struct sock *sk)
>  {
>  	/* Careful only use this in a context where these parameters
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 8089ba2..b023a36 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1616,6 +1616,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
>  
>  		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
>  			skb1 = NULL;
> +
> +		sock_put(sk);
>  	}
>  	if (unlikely(skb1))
>  		kfree_skb(skb1);
> @@ -1648,37 +1650,45 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  	unsigned short hnum = ntohs(uh->dest);
>  	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
>  	int dif = skb->dev->ifindex;
> -	unsigned int i, count = 0;
> +	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
> +	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
> +
Not sure why hash2 and hash2_any are set to 0 here?


> +	if (use_hash2) {
> +		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
> +		hash2 = udp4_portaddr_hash(net, daddr, hnum);

you probably could AND the mask here
	hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udp_table.mask;
	hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;

> +start_lookup:
> +		hslot = &udp_table.hash2[hash2 & udp_table.mask];
> +		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
> +	}
>  
>  	spin_lock(&hslot->lock);
> -	sk_nulls_for_each(sk, node, &hslot->head) {
> +	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
>  		if (__udp_is_mcast_sock(net, sk,
>  					uh->dest, daddr,
>  					uh->source, saddr,
>  					dif, hnum)) {
>  			stack[count++] = sk;
> +			sock_hold(sk);
>  			if (unlikely(count == ARRAY_SIZE(stack))) {
>  				flush_stack(stack, count, skb, ~0);
>  				count = 0;
>  			}
>  		}
>  	}
> -	/*
> -	 * before releasing chain lock, we must take a reference on sockets
> -	 */
> -	for (i = 0; i < count; i++)
> -		sock_hold(stack[i]);
>  
>  	spin_unlock(&hslot->lock);
>  
> +	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
> +	if (use_hash2 && hash2 != hash2_any) {

The thing that really matters here is not hash2 != hash2_any, but
whether the hash bucket (after masking hash2 with udp_table.mask) is
the same or not.

If the test is not properly done here, we are going to deliver two
copies of each packet to each socket.
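
Something like the following (untested sketch) would compare the actual
buckets; alternatively, masking at hash computation time as suggested
above makes the plain inequality safe:

	/* Restart the lookup only when the wildcard hash selects a
	 * different bucket; two distinct hash values can still
	 * collide after masking.
	 */
	if (use_hash2 &&
	    (hash2 & udp_table.mask) != (hash2_any & udp_table.mask)) {
		hash2 = hash2_any;
		goto start_lookup;
	}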

> +		hash2 = hash2_any;
> +		goto start_lookup;
> +	}
> +


* Re: [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
  2014-07-15  8:33   ` Eric Dumazet
@ 2014-07-15 13:48     ` David Held
  0 siblings, 0 replies; 8+ messages in thread
From: David Held @ 2014-07-15 13:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, Willem de Bruijn

On Tue, Jul 15, 2014 at 4:33 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2014-07-14 at 18:30 -0400, David Held wrote:
>> Many multicast sources can have the same port which can result in a very
>> large list when hashing by port only. Hash by address and port instead
>> if this is the case. This makes multicast more similar to unicast.
>>
>> On a 24-core machine receiving from 500 multicast sockets on the same
>> port, before this patch 80% of system CPU was used up by spin locking
>> and only ~25% of packets were successfully delivered.
>>
>> With this patch, all packets are delivered and kernel overhead is ~8%
>> system CPU on spinlocks.
>>
>> Signed-off-by: David Held <drheld@google.com>
>> ---
>>  include/net/sock.h | 14 ++++++++++++++
>>  net/ipv4/udp.c     | 30 ++++++++++++++++++++----------
>>  net/ipv6/udp.c     | 29 +++++++++++++++++++----------
>>  3 files changed, 53 insertions(+), 20 deletions(-)
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index cb84b2f..6734cab 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
>>  #define sk_for_each_bound(__sk, list) \
>>       hlist_for_each_entry(__sk, list, sk_bind_node)
>>
>> +/**
>> + * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
>> + * @tpos:    the type * to use as a loop cursor.
>> + * @pos:     the &struct hlist_node to use as a loop cursor.
>> + * @head:    the head for your list.
>> + * @offset:  offset of hlist_node within the struct.
>> + *
>> + */
>> +#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)                     \
>> +     for (pos = (head)->first;                                              \
>> +          (!is_a_nulls(pos)) &&                                             \
>> +             ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
>> +          pos = pos->next)
>> +
>>  static inline struct user_namespace *sk_user_ns(struct sock *sk)
>>  {
>>       /* Careful only use this in a context where these parameters
>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>> index 8089ba2..b023a36 100644
>> --- a/net/ipv4/udp.c
>> +++ b/net/ipv4/udp.c
>> @@ -1616,6 +1616,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
>>
>>               if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
>>                       skb1 = NULL;
>> +
>> +             sock_put(sk);
>>       }
>>       if (unlikely(skb1))
>>               kfree_skb(skb1);
>> @@ -1648,37 +1650,45 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>>       unsigned short hnum = ntohs(uh->dest);
>>       struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
>>       int dif = skb->dev->ifindex;
>> -     unsigned int i, count = 0;
>> +     unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
>> +     unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
>> +
> Not sure why hash2 and hash2_any are set to 0 here?

It shouldn't be needed, but there's a spurious compiler warning
otherwise, as the compiler isn't smart enough to know that the second
use_hash2 check implies that they will have been set.
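
The pattern is roughly this (illustrative names only):

	static unsigned int demo(int use_hash2)
	{
		unsigned int hash2;		/* no initializer */

		if (use_hash2)
			hash2 = 42;
		/* ... unrelated work ... */
		if (use_hash2)			/* same condition again */
			return hash2;		/* may trigger -Wmaybe-uninitialized */
		return 0;
	}

Initializing to 0 is the cheap way to keep gcc quiet.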

>
>> +     if (use_hash2) {
>> +             hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
>> +             hash2 = udp4_portaddr_hash(net, daddr, hnum);
>
> you probably could AND the mask here
>         hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udp_table.mask;
>         hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;

Sounds good. That solves the issue you identified below.

>> +start_lookup:
>> +             hslot = &udp_table.hash2[hash2 & udp_table.mask];
>> +             offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
>> +     }
>>
>>       spin_lock(&hslot->lock);
>> -     sk_nulls_for_each(sk, node, &hslot->head) {
>> +     sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
>>               if (__udp_is_mcast_sock(net, sk,
>>                                       uh->dest, daddr,
>>                                       uh->source, saddr,
>>                                       dif, hnum)) {
>>                       stack[count++] = sk;
>> +                     sock_hold(sk);
>>                       if (unlikely(count == ARRAY_SIZE(stack))) {
>>                               flush_stack(stack, count, skb, ~0);
>>                               count = 0;
>>                       }
>>               }
>>       }
>> -     /*
>> -      * before releasing chain lock, we must take a reference on sockets
>> -      */
>> -     for (i = 0; i < count; i++)
>> -             sock_hold(stack[i]);
>>
>>       spin_unlock(&hslot->lock);
>>
>> +     /* Also lookup *:port if we are using hash2 and haven't done so yet. */
>> +     if (use_hash2 && hash2 != hash2_any) {
>
> The thing that really matters here is not hash2 != hash2_any, but
> whether the hash bucket (after masking hash2 with udp_table.mask) is
> the same or not.
>
> If the test is not properly done here, we are going to deliver two
> copies of each packet to each socket.

You're right. Thanks for spotting that! Will update.

>> +             hash2 = hash2_any;
>> +             goto start_lookup;
>> +     }
>> +
>
>
>


* Re: [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver.
  2014-07-14 22:30 [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
  2014-07-14 22:30 ` [PATCH net-next 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
@ 2014-07-16  0:14 ` David Miller
  2014-07-16  1:53   ` David Held
  1 sibling, 1 reply; 8+ messages in thread
From: David Miller @ 2014-07-16  0:14 UTC (permalink / raw)
  To: drheld; +Cc: netdev, eric.dumazet, willemb

From: David Held <drheld@google.com>
Date: Mon, 14 Jul 2014 18:30:38 -0400

>  	spin_lock(&hslot->lock);
> -	sk = sk_nulls_head(&hslot->head);
> -	dif = skb->dev->ifindex;
> -	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
> -	while (sk) {
> -		stack[count++] = sk;
> -		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
> -				       daddr, uh->source, saddr, dif);
> -		if (unlikely(count == ARRAY_SIZE(stack))) {
> -			if (!sk)
> -				break;
> -			flush_stack(stack, count, skb, ~0);
> -			count = 0;
> +	sk_nulls_for_each(sk, node, &hslot->head) {
> +		if (__udp_is_mcast_sock(net, sk,
> +					uh->dest, daddr,
> +					uh->source, saddr,
> +					dif, hnum)) {
> +			stack[count++] = sk;
> +			if (unlikely(count == ARRAY_SIZE(stack))) {
> +				flush_stack(stack, count, skb, ~0);
> +				count = 0;
> +			}
>  		}
>  	}

I think you are changing the logic of the loop in the edge case.

If we have exactly ARRAY_SIZE(stack) sockets to process, the old code
performs the flush_stack() outside of the hslot->lock, but with your
change we'll do it inside the lock.

The tradeoff here is reducing hslot->lock hold times vs. avoiding
taking a hold on all the sockets in the stack[] array.

I just wanted to point this out and make sure you are aware of and
are ok with this.


* Re: [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver.
  2014-07-16  0:14 ` [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver David Miller
@ 2014-07-16  1:53   ` David Held
  2014-07-16  3:23     ` David Held
  2014-07-16  6:58     ` Eric Dumazet
  0 siblings, 2 replies; 8+ messages in thread
From: David Held @ 2014-07-16  1:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Eric Dumazet, Willem de Bruijn

On Tue, Jul 15, 2014 at 8:14 PM, David Miller <davem@davemloft.net> wrote:
> From: David Held <drheld@google.com>
> Date: Mon, 14 Jul 2014 18:30:38 -0400
>
>>       spin_lock(&hslot->lock);
>> -     sk = sk_nulls_head(&hslot->head);
>> -     dif = skb->dev->ifindex;
>> -     sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
>> -     while (sk) {
>> -             stack[count++] = sk;
>> -             sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
>> -                                    daddr, uh->source, saddr, dif);
>> -             if (unlikely(count == ARRAY_SIZE(stack))) {
>> -                     if (!sk)
>> -                             break;
>> -                     flush_stack(stack, count, skb, ~0);
>> -                     count = 0;
>> +     sk_nulls_for_each(sk, node, &hslot->head) {
>> +             if (__udp_is_mcast_sock(net, sk,
>> +                                     uh->dest, daddr,
>> +                                     uh->source, saddr,
>> +                                     dif, hnum)) {
>> +                     stack[count++] = sk;
>> +                     if (unlikely(count == ARRAY_SIZE(stack))) {
>> +                             flush_stack(stack, count, skb, ~0);
>> +                             count = 0;
>> +                     }
>>               }
>>       }
>
> I think you are changing the logic of the loop in the edge case.
>
> If we have exactly ARRAY_SIZE(stack) sockets to process, the old code
> performs the flush_stack() outside of the hslot->lock, but with your
> change we'll do it inside the lock.
>
> The tradeoff here is reducing hslot->lock hold times vs. avoiding
> taking a hold on all the sockets in the stack[] array.
>
> I just wanted to point this out and make sure you are aware of and
> are ok with this.

The follow-up patch makes it so we always take a hold on the sockets
anyway (it makes things simpler and only changes things for the too
large array case), so we might as well reduce the lock time for the
exactly ARRAY_SIZE case.

Should just be a matter of moving the stack addition below the ARRAY_SIZE check:
+                       if (unlikely(count == ARRAY_SIZE(stack))) {
+                               flush_stack(stack, count, skb, ~0);
+                               count = 0;
+                       }
+                       stack[count++] = sk;


* Re: [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver.
  2014-07-16  1:53   ` David Held
@ 2014-07-16  3:23     ` David Held
  2014-07-16  6:58     ` Eric Dumazet
  1 sibling, 0 replies; 8+ messages in thread
From: David Held @ 2014-07-16  3:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Eric Dumazet, Willem de Bruijn

Will resubmit the patchset with that update.

On Tue, Jul 15, 2014 at 9:53 PM, David Held <drheld@google.com> wrote:
> On Tue, Jul 15, 2014 at 8:14 PM, David Miller <davem@davemloft.net> wrote:
>> From: David Held <drheld@google.com>
>> Date: Mon, 14 Jul 2014 18:30:38 -0400
>>
>>>       spin_lock(&hslot->lock);
>>> -     sk = sk_nulls_head(&hslot->head);
>>> -     dif = skb->dev->ifindex;
>>> -     sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
>>> -     while (sk) {
>>> -             stack[count++] = sk;
>>> -             sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
>>> -                                    daddr, uh->source, saddr, dif);
>>> -             if (unlikely(count == ARRAY_SIZE(stack))) {
>>> -                     if (!sk)
>>> -                             break;
>>> -                     flush_stack(stack, count, skb, ~0);
>>> -                     count = 0;
>>> +     sk_nulls_for_each(sk, node, &hslot->head) {
>>> +             if (__udp_is_mcast_sock(net, sk,
>>> +                                     uh->dest, daddr,
>>> +                                     uh->source, saddr,
>>> +                                     dif, hnum)) {
>>> +                     stack[count++] = sk;
>>> +                     if (unlikely(count == ARRAY_SIZE(stack))) {
>>> +                             flush_stack(stack, count, skb, ~0);
>>> +                             count = 0;
>>> +                     }
>>>               }
>>>       }
>>
>> I think you are changing the logic of the loop in the edge case.
>>
>> If we have exactly ARRAY_SIZE(stack) sockets to process, the old code
>> performs the flush_stack() outside of the hslot->lock, but with your
>> change we'll do it inside the lock.
>>
>> The tradeoff here is reducing hslot->lock hold times vs. avoiding
>> taking a hold on all the sockets in the stack[] array.
>>
>> I just wanted to point this out and make sure you are aware of and
>> are ok with this.
>
> The followup patch makes it so we always take a hold on the sockets
> anyway (it makes things simpler and only changes things for the too
> large array case), so might as well reduce the lock time for the
> exactly ARRAY_SIZE case.
>
> Should just be a matter of moving the stack addition below the ARRAY_SIZE check:
> +                       if (unlikely(count == ARRAY_SIZE(stack))) {
> +                               flush_stack(stack, count, skb, ~0);
> +                               count = 0;
> +                       }
> +                       stack[count++] = sk;


* Re: [PATCH net-next 1/2] udp: Simplify __udp*_lib_mcast_deliver.
  2014-07-16  1:53   ` David Held
  2014-07-16  3:23     ` David Held
@ 2014-07-16  6:58     ` Eric Dumazet
  1 sibling, 0 replies; 8+ messages in thread
From: Eric Dumazet @ 2014-07-16  6:58 UTC (permalink / raw)
  To: David Held; +Cc: David Miller, netdev, Willem de Bruijn

On Tue, 2014-07-15 at 21:53 -0400, David Held wrote:
> On Tue, Jul 15, 2014 at 8:14 PM, David Miller <davem@davemloft.net> wrote:
> >
> >
> > I think you are changing the logic of the loop in the edge case.
> >
> > If we have exactly ARRAY_SIZE(stack) sockets to process, the old code
> > performs the flush_stack() outside of the hslot->lock, but with your
> > change we'll do it inside the lock.
> >
> > The tradeoff here is reducing hslot->lock hold times vs. avoiding
> > taking a hold on all the sockets in the stack[] array.
> >
> > I just wanted to point this out and make sure you are aware of and
> > are ok with this.
> 
> The follow-up patch makes it so we always take a hold on the sockets
> anyway (it makes things simpler and only changes things for the too
> large array case), so we might as well reduce the lock time for the
> exactly ARRAY_SIZE case.

Note that my initial motivation for this array was to convert the
multicast lookup into an RCU one, but I never finished this.

If we hold a spinlock, there is no need for this array in the first
place.

If we don't hold a spinlock, we are forced to temporarily store the
sockets into the array, just in case we restart the lookup because we
are directed into another chain.
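
A lockless version would look roughly like this (a sketch only, never
tested; refcounting and stack[] overflow handling omitted, with slot
being the starting bucket index as in __udp4_lib_lookup()):

	rcu_read_lock();
begin:
	count = 0;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		if (__udp_is_mcast_sock(net, sk, uh->dest, daddr,
					uh->source, saddr, dif, hnum))
			stack[count++] = sk;
	}
	/* If the terminating nulls value is not the slot we started
	 * from, the walk moved onto another chain: discard and restart.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
	rcu_read_unlock();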

