netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v2 0/2] udp: Fix multicast performance issues.
@ 2014-07-15 21:22 David Held
  2014-07-15 21:22 ` [PATCH net-next v2 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: David Held @ 2014-07-15 21:22 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb

Fix performance issues with listening to many different multicast
sockets on different addresses with the same port. Instead of always
using hash1, fall back to a hash2 lookup when the hash1 chain is long.
Patch 1 is a general cleanup and simplification which also makes the
main implementation in Patch 2 simpler.

Eric's recent change 63c6f81cdde5 avoided this being an issue in early
demux. This makes it work for regular delivery as well.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH net-next v2 1/2] udp: Simplify __udp*_lib_mcast_deliver.
  2014-07-15 21:22 [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
@ 2014-07-15 21:22 ` David Held
  2014-07-15 21:22 ` [PATCH net-next v2 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
  2014-07-15 21:23 ` [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
  2 siblings, 0 replies; 4+ messages in thread
From: David Held @ 2014-07-15 21:22 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Switch to using sk_nulls_for_each which shortens the code and makes it
easier to update.

Signed-off-by: David Held <drheld@google.com>
---
 net/ipv4/udp.c | 48 ++++++++++----------------------
 net/ipv6/udp.c | 88 +++++++++++++++++++++++-----------------------------------
 2 files changed, 49 insertions(+), 87 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f6dfe52..ef531bd 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -1664,23 +1644,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			stack[count++] = sk;
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 		}
 	}
 	/*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c2bd28f..cade19b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -698,43 +698,26 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
+	struct inet_sock *inet = inet_sk(sk);
 
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -783,28 +766,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
 			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 		}
 	}
 	/*
-- 
2.0.0.526.g5318336

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH net-next v2 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.
  2014-07-15 21:22 [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
  2014-07-15 21:22 ` [PATCH net-next v2 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
@ 2014-07-15 21:22 ` David Held
  2014-07-15 21:23 ` [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
  2 siblings, 0 replies; 4+ messages in thread
From: David Held @ 2014-07-15 21:22 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, willemb, David Held

Many multicast sources can have the same port, which can result in a very
long hash chain when hashing by port only. In that case, hash by address
and port instead. This makes multicast handling more similar to unicast.

On a 24-core machine receiving from 500 multicast sockets on the same
port, before this patch 80% of system CPU was used up by spin locking
and only ~25% of packets were successfully delivered.

With this patch, all packets are delivered and kernel overhead is ~8%
system CPU on spinlocks.

Signed-off-by: David Held <drheld@google.com>
---
 include/net/sock.h | 14 ++++++++++++++
 net/ipv4/udp.c     | 31 +++++++++++++++++++++----------
 net/ipv6/udp.c     | 30 ++++++++++++++++++++----------
 3 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index cb84b2f..6734cab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ef531bd..06cc9c9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1616,6 +1616,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1648,37 +1650,46 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = skb->dev->ifindex;
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_is_mcast_sock(net, sk,
 					uh->dest, daddr,
 					uh->source, saddr,
 					dif, hnum)) {
 			stack[count++] = sk;
+			sock_hold(sk);
 			if (unlikely(count == ARRAY_SIZE(stack))) {
 				flush_stack(stack, count, skb, ~0);
 				count = 0;
 			}
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index cade19b..93d7fdb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -741,6 +741,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -770,10 +771,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = inet6_iif(skb);
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_v6_is_mcast_sock(net, sk,
 					   uh->dest, daddr,
 					   uh->source, saddr,
@@ -783,25 +794,24 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		     */
 		    (uh->check || udp_sk(sk)->no_check6_rx)) {
 			stack[count++] = sk;
+			sock_hold(sk);
 			if (unlikely(count == ARRAY_SIZE(stack))) {
 				flush_stack(stack, count, skb, ~0);
 				count = 0;
 			}
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
-- 
2.0.0.526.g5318336

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH net-next v2 0/2] udp: Fix multicast performance issues.
  2014-07-15 21:22 [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
  2014-07-15 21:22 ` [PATCH net-next v2 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
  2014-07-15 21:22 ` [PATCH net-next v2 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
@ 2014-07-15 21:23 ` David Held
  2 siblings, 0 replies; 4+ messages in thread
From: David Held @ 2014-07-15 21:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, Eric Dumazet, Willem de Bruijn

v1->v2
 - updated hash collision detection

On Tue, Jul 15, 2014 at 5:22 PM, David Held <drheld@google.com> wrote:
> Fix performance issues with listening to many different multicast
> sockets on different addresses with the same port. Instead of always
> using hash1, fall back to hash2 lookup when hash1 lookup is long.
> Patch 1 is a general cleanup and simplification which also makes the
> main implementation in Patch 2 simpler.
>
> Eric's recent change 63c6f81cdde5 avoided this being an issue in early
> demux. This makes it work for regular delivery as well.
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2014-07-15 21:23 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-15 21:22 [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held
2014-07-15 21:22 ` [PATCH net-next v2 1/2] udp: Simplify __udp*_lib_mcast_deliver David Held
2014-07-15 21:22 ` [PATCH net-next v2 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver David Held
2014-07-15 21:23 ` [PATCH net-next v2 0/2] udp: Fix multicast performance issues David Held

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).