netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 0/4] inet: add second hash table
@ 2012-05-30  7:36 Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket Alexandru Copot
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Alexandru Copot, Daniel Baluta, Lucian Grijincu

This patchset implements all the operations needed to use a second
(port,address) bind hash table for inet. It uses a similar approach
as the UDP implementation.

The performance improvements for port allocation are very good and
detailed in the last message.

This is based on a series of patches written by Lucian Grijincu at Ixia.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
Alexandru Copot (4):
      inet: add counter to inet_bind_hashbucket
      inet: add a second bind hash
      inet: add/remove inet buckets in the second bind hash
      inet: use second hash in inet_csk_get_port

 include/net/inet_hashtables.h    |  140 +++++++++++++++++++++++++++++++--
 include/net/inet_timewait_sock.h |    5 +-
 net/dccp/proto.c                 |   37 ++++++++-
 net/ipv4/inet_connection_sock.c  |   66 ++++++++--------
 net/ipv4/inet_hashtables.c       |  158 ++++++++++++++++++++++++++++++++++++--
 net/ipv4/inet_timewait_sock.c    |   16 ++--
 net/ipv4/tcp.c                   |   17 ++++
 net/ipv6/inet6_hashtables.c      |   95 +++++++++++++++++++++++
 8 files changed, 477 insertions(+), 57 deletions(-)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket
  2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
@ 2012-05-30  7:36 ` Alexandru Copot
  2012-05-30  8:00   ` Eric Dumazet
  2012-05-30  7:36 ` [RFC PATCH 2/4] inet: add a second bind hash Alexandru Copot
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Alexandru Copot, Daniel Baluta, Lucian Grijincu

The counter will be used by the upcoming INET lookup algorithm to
choose the shortest chain after secondary hash is added.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h    |    4 +++-
 include/net/inet_timewait_sock.h |    4 +++-
 net/dccp/proto.c                 |    1 +
 net/ipv4/inet_hashtables.c       |    9 ++++++---
 net/ipv4/inet_timewait_sock.c    |    7 ++++---
 net/ipv4/tcp.c                   |    1 +
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f..8c6addc 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -98,6 +98,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
 	struct hlist_head	chain;
+	unsigned int		count;
 };
 
 /*
@@ -222,7 +223,8 @@ extern struct inet_bind_bucket *
 					    struct inet_bind_hashbucket *head,
 					    const unsigned short snum);
 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
-				     struct inet_bind_bucket *tb);
+				     struct inet_bind_bucket *tb,
+				     struct inet_bind_hashbucket *head);
 
 static inline int inet_bhashfn(struct net *net,
 		const __u16 lport, const int bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index ba52c83..725e903 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -30,6 +30,7 @@
 #include <linux/atomic.h>
 
 struct inet_hashinfo;
+struct inet_bind_hashbucket;
 
 #define INET_TWDR_RECYCLE_SLOTS_LOG	5
 #define INET_TWDR_RECYCLE_SLOTS		(1 << INET_TWDR_RECYCLE_SLOTS_LOG)
@@ -197,7 +198,8 @@ extern void inet_twsk_put(struct inet_timewait_sock *tw);
 extern int inet_twsk_unhash(struct inet_timewait_sock *tw);
 
 extern int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-				 struct inet_hashinfo *hashinfo);
+				 struct inet_hashinfo *hashinfo,
+				 struct inet_bind_hashbucket *head);
 
 extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 						  const int state);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6c7c78b..e777beb 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1184,6 +1184,7 @@ static int __init dccp_init(void)
 	}
 
 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+		dccp_hashinfo.bhash[i].count = 0;
 		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
 	}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7880af9..c1f6f28 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -42,6 +42,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->num_owners = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
+		head->count++;
 	}
 	return tb;
 }
@@ -49,9 +50,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb,
+			      struct inet_bind_hashbucket *head)
 {
 	if (hlist_empty(&tb->owners)) {
+		head->count--;
 		__hlist_del(&tb->node);
 		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
@@ -90,7 +93,7 @@ static void __inet_put_port(struct sock *sk)
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
 	spin_unlock(&head->lock);
 }
 
@@ -527,7 +530,7 @@ ok:
 			twrefcnt += hash(sk, tw);
 		}
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo, head);
 		spin_unlock(&head->lock);
 
 		if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3..5b7bcd0 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -49,7 +49,8 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
  *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-			  struct inet_hashinfo *hashinfo)
+			  struct inet_hashinfo *hashinfo,
+			  struct inet_bind_hashbucket *head)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -58,7 +59,7 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
 	/*
 	 * We cannot call inet_twsk_put() ourself under lock,
 	 * caller must call it for us.
@@ -84,7 +85,7 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			hashinfo->bhash_size)];
 
 	spin_lock(&bhead->lock);
-	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead);
 	spin_unlock(&bhead->lock);
 
 #ifdef SOCK_REFCNT_DEBUG
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bb485fc..52cdf67 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3533,6 +3533,7 @@ void __init tcp_init(void)
 					64 * 1024);
 	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		tcp_hashinfo.bhash[i].count = 0;
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
-- 
1.7.10.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 2/4] inet: add a second bind hash
  2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket Alexandru Copot
@ 2012-05-30  7:36 ` Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 3/4] inet: add/remove inet buckets in the " Alexandru Copot
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Alexandru Copot, Daniel Baluta, Lucian Grijincu

Add a second bind hash table which hashes by bound port and address.

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h |   13 ++++++++++---
 net/dccp/proto.c              |   36 ++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp.c                |   16 ++++++++++++++++
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 8c6addc..a6d0db2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -84,6 +84,7 @@ struct inet_bind_bucket {
 	signed short		fastreuse;
 	int			num_owners;
 	struct hlist_node	node;
+	struct hlist_node	portaddr_node;
 	struct hlist_head	owners;
 };
 
@@ -94,6 +95,8 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 
 #define inet_bind_bucket_for_each(tb, pos, head) \
 	hlist_for_each_entry(tb, pos, head, node)
+#define inet_portaddr_bind_bucket_for_each(tb, pos, head) \
+	hlist_for_each_entry(tb, pos, head, portaddr_node)
 
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
@@ -129,13 +132,17 @@ struct inet_hashinfo {
 	unsigned int			ehash_mask;
 	unsigned int			ehash_locks_mask;
 
-	/* Ok, let's try this, I give up, we do need a local binding
-	 * TCP hash as well as the others for fast bind/connect.
+	/*
+	 * bhash:		hashes the buckets by port.
+	 * portaddr_bhash:	hashes bind buckets by bound port and address.
+	 *			When bhash gets too large, we try to lookup on
+	 *			portaddr_bhash.
 	 */
 	struct inet_bind_hashbucket	*bhash;
+	struct inet_bind_hashbucket	*portaddr_bhash;
 
 	unsigned int			bhash_size;
-	/* 4 bytes hole on 64 bit */
+	unsigned int			portaddr_bhash_size;
 
 	struct kmem_cache		*bind_bucket_cachep;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e777beb..298f5c1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1109,7 +1109,7 @@ EXPORT_SYMBOL_GPL(dccp_debug);
 static int __init dccp_init(void)
 {
 	unsigned long goal;
-	int ehash_order, bhash_order, i;
+	int ehash_order, bhash_order, portaddr_bhash_order, i;
 	int rc;
 
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
@@ -1189,9 +1189,34 @@ static int __init dccp_init(void)
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
 	}
 
+	portaddr_bhash_order = bhash_order;
+
+	do {
+		dccp_hashinfo.portaddr_bhash_size =
+			(1UL << portaddr_bhash_order) *
+			PAGE_SIZE / sizeof(struct inet_bind_hashbucket);
+		if ((dccp_hashinfo.portaddr_bhash_size > (64 * 1024)) &&
+				portaddr_bhash_order > 0)
+			continue;
+		dccp_hashinfo.portaddr_bhash = (struct inet_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN,
+					 portaddr_bhash_order);
+	} while (!dccp_hashinfo.portaddr_bhash && --portaddr_bhash_order >= 0);
+
+	if (!dccp_hashinfo.portaddr_bhash) {
+		DCCP_CRIT("Failed to allocate DCCP portaddr bind hash table");
+		goto out_free_dccp_bhash;
+	}
+
+	for (i = 0; i < dccp_hashinfo.portaddr_bhash_size; i++) {
+		dccp_hashinfo.portaddr_bhash[i].count = 0;
+		spin_lock_init(&dccp_hashinfo.portaddr_bhash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.portaddr_bhash[i].chain);
+	}
+
 	rc = dccp_mib_init();
 	if (rc)
-		goto out_free_dccp_bhash;
+		goto out_free_dccp_portaddr_bhash;
 
 	rc = dccp_ackvec_init();
 	if (rc)
@@ -1215,6 +1240,10 @@ out_ackvec_exit:
 	dccp_ackvec_exit();
 out_free_dccp_mib:
 	dccp_mib_exit();
+out_free_dccp_portaddr_bhash:
+	free_pages((unsigned long)dccp_hashinfo.portaddr_bhash,
+		   portaddr_bhash_order);
+	dccp_hashinfo.portaddr_bhash = NULL;
 out_free_dccp_bhash:
 	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
 out_free_dccp_locks:
@@ -1239,6 +1268,9 @@ static void __exit dccp_fini(void)
 	free_pages((unsigned long)dccp_hashinfo.bhash,
 		   get_order(dccp_hashinfo.bhash_size *
 			     sizeof(struct inet_bind_hashbucket)));
+	free_pages((unsigned long)dccp_hashinfo.portaddr_bhash,
+		   get_order(dccp_hashinfo.portaddr_bhash_size *
+			     sizeof(struct inet_bind_hashbucket)));
 	free_pages((unsigned long)dccp_hashinfo.ehash,
 		   get_order((dccp_hashinfo.ehash_mask + 1) *
 			     sizeof(struct inet_ehash_bucket)));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 52cdf67..7dd3e19 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3538,6 +3538,22 @@ void __init tcp_init(void)
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
+	tcp_hashinfo.portaddr_bhash =
+		alloc_large_system_hash("TCP portaddr_bind",
+					sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.bhash_size,
+					(totalram_pages >= 128 * 1024) ?
+					13 : 15,
+					0,
+					&tcp_hashinfo.portaddr_bhash_size,
+					NULL,
+					64 * 1024);
+	tcp_hashinfo.portaddr_bhash_size = 1U << tcp_hashinfo.portaddr_bhash_size;
+	for (i = 0; i < tcp_hashinfo.portaddr_bhash_size; i++) {
+		tcp_hashinfo.portaddr_bhash[i].count = 0;
+		spin_lock_init(&tcp_hashinfo.portaddr_bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.portaddr_bhash[i].chain);
+	}
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 
-- 
1.7.10.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 3/4] inet: add/remove inet buckets in the second bind hash
  2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 2/4] inet: add a second bind hash Alexandru Copot
@ 2012-05-30  7:36 ` Alexandru Copot
  2012-05-30  7:36 ` [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port Alexandru Copot
  2012-05-30  7:57 ` [RFC PATCH 0/4] inet: add second hash table Eric Dumazet
  4 siblings, 0 replies; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Alexandru Copot, Daniel Baluta, Lucian Grijincu

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h    |   77 +++++++++++++++++++++++++++++++++++---
 include/net/inet_timewait_sock.h |    3 +-
 net/ipv4/inet_connection_sock.c  |   13 +++++--
 net/ipv4/inet_hashtables.c       |   34 ++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |   15 +++++---
 5 files changed, 122 insertions(+), 20 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index a6d0db2..bc06168 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -225,13 +225,15 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 }
 
 extern struct inet_bind_bucket *
-		    inet_bind_bucket_create(struct kmem_cache *cachep,
-					    struct net *net,
-					    struct inet_bind_hashbucket *head,
-					    const unsigned short snum);
+	    inet_bind_bucket_create(struct kmem_cache *cachep,
+				    struct net *net,
+				    struct inet_bind_hashbucket *head,
+				    struct inet_bind_hashbucket *portaddr_head,
+				    const unsigned short snum);
 extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
 				     struct inet_bind_bucket *tb,
-				     struct inet_bind_hashbucket *head);
+				     struct inet_bind_hashbucket *head,
+				     struct inet_bind_hashbucket *portaddr_head);
 
 static inline int inet_bhashfn(struct net *net,
 		const __u16 lport, const int bhash_size)
@@ -239,6 +241,71 @@ static inline int inet_bhashfn(struct net *net,
 	return (lport + net_hash_mix(net)) & (bhash_size - 1);
 }
 
+static inline unsigned int inet4_portaddr_bhashfn(struct net *net, __be32 saddr,
+						  unsigned int port)
+{
+	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
+}
+
+static inline struct inet_bind_hashbucket *
+		inet4_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					  struct net *net,
+					  __be32 saddr,
+					  unsigned int port)
+{
+	unsigned int h = inet4_portaddr_bhashfn(net, saddr, port);
+	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline unsigned int inet6_portaddr_bhashfn(struct net *net,
+						  const struct in6_addr *addr6,
+						  unsigned int port)
+{
+	unsigned int hash, mix = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		hash = jhash_1word(0, mix);
+	else if (ipv6_addr_v4mapped(addr6))
+		hash = jhash_1word(addr6->s6_addr32[3], mix);
+	else
+		hash = jhash2(addr6->s6_addr32, 4, mix);
+
+	return hash ^ port;
+}
+
+static inline struct inet_bind_hashbucket *
+		inet6_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					  struct net *net,
+					  const struct in6_addr *addr6,
+					  unsigned int port)
+{
+	unsigned int h = inet6_portaddr_bhashfn(net, addr6, port);
+	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
+}
+#endif
+
+
+static inline struct inet_bind_hashbucket *
+		inet_portaddr_hashbucket(struct inet_hashinfo *hinfo,
+					 struct sock  *sk,
+					 unsigned int port)
+{
+	struct net *net = sock_net(sk);
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet4_portaddr_hashbucket(hinfo, net,
+				inet_sk(sk)->inet_rcv_saddr, port);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return inet6_portaddr_hashbucket(hinfo, net,
+				&inet6_sk(sk)->rcv_saddr, port);
+#endif
+	}
+	WARN(1, "unrecognised sk->sk_family in inet_portaddr_hashbucket");
+	return inet4_portaddr_hashbucket(hinfo, net, INADDR_ANY, port);
+}
+
 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 			   const unsigned short snum);
 
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 725e903..d60d8a9 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -199,7 +199,8 @@ extern int inet_twsk_unhash(struct inet_timewait_sock *tw);
 
 extern int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 				 struct inet_hashinfo *hashinfo,
-				 struct inet_bind_hashbucket *head);
+				 struct inet_bind_hashbucket *head,
+				 struct inet_bind_hashbucket *portaddr_head);
 
 extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 						  const int state);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 95e61596..336531a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -204,9 +204,16 @@ tb_found:
 	}
 tb_not_found:
 	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
+	if (!tb) {
+		struct inet_bind_hashbucket *portaddr_head;
+		portaddr_head = inet_portaddr_hashbucket(hashinfo, sk, snum);
+		spin_lock(&portaddr_head->lock);
+		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+				net, head, portaddr_head, snum);
+		spin_unlock(&portaddr_head->lock);
+		if (!tb)
+			goto fail_unlock;
+	}
 	if (hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 			tb->fastreuse = 1;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c1f6f28..edb2a4e 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -31,6 +31,7 @@
 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 						 struct net *net,
 						 struct inet_bind_hashbucket *head,
+						 struct inet_bind_hashbucket *portaddr_head,
 						 const unsigned short snum)
 {
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
@@ -43,6 +44,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
 		head->count++;
+		hlist_add_head(&tb->portaddr_node, &portaddr_head->chain);
+		portaddr_head->count++;
 	}
 	return tb;
 }
@@ -51,11 +54,14 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
 void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb,
-			      struct inet_bind_hashbucket *head)
+			      struct inet_bind_hashbucket *head,
+			      struct inet_bind_hashbucket *portaddr_head)
 {
 	if (hlist_empty(&tb->owners)) {
 		head->count--;
 		__hlist_del(&tb->node);
+		portaddr_head->count--;
+		__hlist_del(&tb->portaddr_node);
 		release_net(ib_net(tb));
 		kmem_cache_free(cachep, tb);
 	}
@@ -83,17 +89,22 @@ static void __inet_put_port(struct sock *sk)
 	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
 			hashinfo->bhash_size);
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+	struct inet_bind_hashbucket *portaddr_head =
+		inet_portaddr_hashbucket(hashinfo, sk, inet_sk(sk)->inet_num);
 	struct inet_bind_bucket *tb;
 
 	atomic_dec(&hashinfo->bsockets);
 
 	spin_lock(&head->lock);
+	spin_lock(&portaddr_head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb,
+				 head, portaddr_head);
+	spin_unlock(&portaddr_head->lock);
 	spin_unlock(&head->lock);
 }
 
@@ -112,6 +123,8 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 	const int bhash = inet_bhashfn(sock_net(sk), port,
 			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
+	struct inet_bind_hashbucket *portaddr_head =
+		inet_portaddr_hashbucket(table, sk, port);
 	struct inet_bind_bucket *tb;
 
 	spin_lock(&head->lock);
@@ -130,7 +143,8 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 		}
 		if (!node) {
 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-						     sock_net(sk), head, port);
+						     sock_net(sk), head,
+						     portaddr_head, port);
 			if (!tb) {
 				spin_unlock(&head->lock);
 				return -ENOMEM;
@@ -462,7 +476,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->inet_num;
-	struct inet_bind_hashbucket *head;
+	struct inet_bind_hashbucket *head, *portaddr_head;
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
@@ -504,8 +518,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 				}
 			}
 
+			portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
+			spin_lock(&portaddr_head->lock);
 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, port);
+					net, head, portaddr_head, port);
+			spin_unlock(&portaddr_head->lock);
+
 			if (!tb) {
 				spin_unlock(&head->lock);
 				break;
@@ -529,8 +547,12 @@ ok:
 			inet_sk(sk)->inet_sport = htons(port);
 			twrefcnt += hash(sk, tw);
 		}
+		portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
+		spin_lock(&portaddr_head->lock);
 		if (tw)
-			twrefcnt += inet_twsk_bind_unhash(tw, hinfo, head);
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo,
+							  head, portaddr_head);
+		spin_unlock(&portaddr_head->lock);
 		spin_unlock(&head->lock);
 
 		if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b7bcd0..29f8061 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -50,7 +50,8 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
  */
 int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 			  struct inet_hashinfo *hashinfo,
-			  struct inet_bind_hashbucket *head)
+			  struct inet_bind_hashbucket *head,
+			  struct inet_bind_hashbucket *portaddr_head)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -59,7 +60,8 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb, head);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb,
+				 head, portaddr_head);
 	/*
 	 * We cannot call inet_twsk_put() ourself under lock,
 	 * caller must call it for us.
@@ -71,7 +73,7 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			     struct inet_hashinfo *hashinfo)
 {
-	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead;
 	int refcnt;
 	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
@@ -83,9 +85,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 			hashinfo->bhash_size)];
-
+	portaddr_bhead = inet_portaddr_hashbucket(hashinfo, (struct sock *)tw,
+						  tw->tw_num);
 	spin_lock(&bhead->lock);
-	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead);
+	spin_lock(&portaddr_bhead->lock);
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo, bhead, portaddr_bhead);
+	spin_unlock(&portaddr_bhead->lock);
 	spin_unlock(&bhead->lock);
 
 #ifdef SOCK_REFCNT_DEBUG
-- 
1.7.10.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
  2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
                   ` (2 preceding siblings ...)
  2012-05-30  7:36 ` [RFC PATCH 3/4] inet: add/remove inet buckets in the " Alexandru Copot
@ 2012-05-30  7:36 ` Alexandru Copot
  2012-05-30 16:42   ` Eric Dumazet
  2012-05-30 17:20   ` Eric Dumazet
  2012-05-30  7:57 ` [RFC PATCH 0/4] inet: add second hash table Eric Dumazet
  4 siblings, 2 replies; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30  7:36 UTC (permalink / raw)
  To: davem
  Cc: gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Alexandru Copot, Daniel Baluta, Lucian Grijincu

This results in a massive improvement when there are many sockets
bound to the same port, but different addresses for both bind() and
listen() system calls (both call inet_csk_get_port).

Tests were run with 16000 subinterfaces each with a distinct
IPv4 address. The sockets are first bound to the same port and
then put on listen().

* Without patch and without SO_REUSEADDR:
    * bind:   1.543 s
    * listen: 3.050 s

* Without patch and with SO_REUSEADDR set:
    * bind:   0.066 s
    * listen: 3.050 s

* With patch and SO_REUSEADDR set / without SO_REUSEADDR:
    * bind:   0.066 s
    * listen: 0.095 s

Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/inet_hashtables.h   |   48 +++++++++++++++
 net/ipv4/inet_connection_sock.c |   63 ++++++++------------
 net/ipv4/inet_hashtables.c      |  125 ++++++++++++++++++++++++++++++++++++++-
 net/ipv6/inet6_hashtables.c     |   95 +++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+), 39 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index bc06168..2f589bb 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,6 +81,15 @@ struct inet_bind_bucket {
 	struct net		*ib_net;
 #endif
 	unsigned short		port;
+	union {
+		struct in6_addr ib_addr_ipv6;
+		struct {
+			__be32	_1;
+			__be32	_2;
+			__be32	_3;
+			__be32	ib_addr_ipv4;
+		};
+	};
 	signed short		fastreuse;
 	int			num_owners;
 	struct hlist_node	node;
@@ -226,6 +235,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 
 extern struct inet_bind_bucket *
 	    inet_bind_bucket_create(struct kmem_cache *cachep,
+				    struct sock *sk,
 				    struct net *net,
 				    struct inet_bind_hashbucket *head,
 				    struct inet_bind_hashbucket *portaddr_head,
@@ -257,6 +267,14 @@ static inline struct inet_bind_hashbucket *
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
 
+
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead);
+
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline unsigned int inet6_portaddr_bhashfn(struct net *net,
 						  const struct in6_addr *addr6,
@@ -283,6 +301,14 @@ static inline struct inet_bind_hashbucket *
 	unsigned int h = inet6_portaddr_bhashfn(net, addr6, port);
 	return &hinfo->portaddr_bhash[h & (hinfo->portaddr_bhash_size - 1)];
 }
+
+
+struct inet_bind_bucket *
+	inet6_find_bind_buckets(struct sock *sk,
+				unsigned short port,
+				struct inet_bind_hashbucket **p_bhead,
+				struct inet_bind_hashbucket **p_portaddr_bhead);
+
 #endif
 
 
@@ -306,6 +332,28 @@ static inline struct inet_bind_hashbucket *
 	return inet4_portaddr_hashbucket(hinfo, net, INADDR_ANY, port);
 }
 
+
+static inline struct inet_bind_bucket *
+	inet_find_bind_buckets(struct sock *sk,
+			       unsigned short port,
+			       struct inet_bind_hashbucket **p_bhead,
+			       struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return inet4_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		return inet6_find_bind_buckets(sk, port, p_bhead,
+				p_portaddr_bhead);
+#endif
+	}
+	WARN(1, "unrecognised sk->sk_family in inet_find_bind_buckets");
+	return NULL;
+}
+
+
 extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 			   const unsigned short snum);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 336531a..bd92466 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -100,8 +100,7 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct inet_bind_hashbucket *head;
-	struct hlist_node *node;
+	struct inet_bind_hashbucket *head, *portaddr_bhead;
 	struct inet_bind_bucket *tb;
 	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
@@ -120,31 +119,26 @@ again:
 		do {
 			if (inet_is_reserved_local_port(rover))
 				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (tb->fastreuse > 0 &&
-					    sk->sk_reuse &&
-					    sk->sk_state != TCP_LISTEN &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
-						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-							snum = smallest_rover;
-							goto tb_found;
-						}
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
+
+			tb = inet_find_bind_buckets(sk, rover, &head, &portaddr_bhead);
+			if (!tb)
+				break;
+			if (tb->fastreuse > 0 && sk->sk_reuse &&
+			    sk->sk_state != TCP_LISTEN &&
+			    (tb->num_owners < smallest_size || smallest_size == -1)) {
+				smallest_size = tb->num_owners;
+				smallest_rover = rover;
+				if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+				    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+					snum = smallest_rover;
+					goto tb_found;
 				}
-			break;
-		next:
+			}
+			if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+				snum = rover;
+				goto tb_found;
+			}
+			spin_unlock(&portaddr_bhead->lock);
 			spin_unlock(&head->lock);
 		next_nolock:
 			if (++rover > high)
@@ -171,12 +165,9 @@ again:
 		snum = rover;
 	} else {
 have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
-				goto tb_found;
+		tb = inet_find_bind_buckets(sk, snum, &head, &portaddr_bhead);
+		if (tb)
+			goto tb_found;
 	}
 	tb = NULL;
 	goto tb_not_found;
@@ -194,6 +185,7 @@ tb_found:
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
 				    smallest_size != -1 && --attempts >= 0) {
+					spin_unlock(&portaddr_bhead->lock);
 					spin_unlock(&head->lock);
 					goto again;
 				}
@@ -205,12 +197,8 @@ tb_found:
 tb_not_found:
 	ret = 1;
 	if (!tb) {
-		struct inet_bind_hashbucket *portaddr_head;
-		portaddr_head = inet_portaddr_hashbucket(hashinfo, sk, snum);
-		spin_lock(&portaddr_head->lock);
 		tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-				net, head, portaddr_head, snum);
-		spin_unlock(&portaddr_head->lock);
+				sk, net, head, portaddr_bhead, snum);
 		if (!tb)
 			goto fail_unlock;
 	}
@@ -229,6 +217,7 @@ success:
 	ret = 0;
 
 fail_unlock:
+	spin_unlock(&portaddr_bhead->lock);
 	spin_unlock(&head->lock);
 fail:
 	local_bh_enable();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index edb2a4e..26c7f9d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -29,6 +29,7 @@
  * The bindhash mutex for snum's hash chain must be held here.
  */
 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+						 struct sock *sk,
 						 struct net *net,
 						 struct inet_bind_hashbucket *head,
 						 struct inet_bind_hashbucket *portaddr_head,
@@ -37,6 +38,32 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
 
 	if (tb != NULL) {
+		switch (sk->sk_family) {
+		case AF_INET:
+			/* ::ffff:x.y.z.t is the IPv4-mapped IPv6 address for
+			 * IPv4 address x.y.z.t, but only if it's not the any addr */
+			if (INADDR_ANY == sk_rcv_saddr(sk))
+				memset(&tb->ib_addr_ipv6, 0, sizeof(struct in6_addr));
+			else
+				ipv6_addr_set(&tb->ib_addr_ipv6, 0, 0,
+					      htonl(0x0000FFFF),
+					      sk_rcv_saddr(sk));
+
+			/* if no alignment problems appear, the IPv4 address
+			 * should be written to ib_addr_ipv6. If this gets
+			 * triggered check the inet_bind_bucket structure. */
+			WARN_ON(tb->ib_addr_ipv4 != sk_rcv_saddr(sk));
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			memcpy(&tb->ib_addr_ipv6, &inet6_sk(sk)->rcv_saddr,
+					sizeof(struct in6_addr));
+			break;
+#endif
+		default:
+			WARN(1, "unrecognised sk_family in inet_bind_bucket_create");
+		}
+
 		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port      = snum;
 		tb->fastreuse = 0;
@@ -142,8 +169,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 				break;
 		}
 		if (!node) {
+			portaddr_head = inet_portaddr_hashbucket(table, sk, tb->port);
+
 			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
-						     sock_net(sk), head,
+						     sk, sock_net(sk), head,
 						     portaddr_head, port);
 			if (!tb) {
 				spin_unlock(&head->lock);
@@ -521,7 +550,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			portaddr_head = inet_portaddr_hashbucket(hinfo, sk, port);
 			spin_lock(&portaddr_head->lock);
 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, portaddr_head, port);
+					sk, net, head, portaddr_head, port);
 			spin_unlock(&portaddr_head->lock);
 
 			if (!tb) {
@@ -584,6 +613,98 @@ out:
 	}
 }
 
+struct inet_bind_bucket *
+inet4_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet4_portaddr_hashbucket(hinfo, net,
+				sk_rcv_saddr(sk), port);
+	portaddrany_bhead = inet4_portaddr_hashbucket(hinfo, net,
+						INADDR_ANY, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent dead locks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (sk_rcv_saddr(sk) != INADDR_ANY) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    (tb->ib_addr_ipv4 == INADDR_ANY))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 /*
  * Bind a port for a connect operation and hash it.
  */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 73f1a00..62f1eff 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -294,6 +294,101 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
+
+struct inet_bind_bucket *
+inet6_find_bind_buckets(struct sock *sk,
+			unsigned short port,
+			struct inet_bind_hashbucket **p_bhead,
+			struct inet_bind_hashbucket **p_portaddr_bhead)
+{
+	struct net *net = sock_net(sk);
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_bucket *tb = NULL;
+	struct hlist_node *node;
+
+	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
+	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
+	portaddr_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				inet6_rcv_saddr(sk), port);
+	portaddrany_bhead = inet6_portaddr_hashbucket(hinfo, net,
+				&in6addr_any, port);
+
+	*p_portaddr_bhead = portaddr_bhead;
+	*p_bhead = bhead;
+
+	/*
+	 * prevent dead locks by always taking locks in a fixed order:
+	 * - always take the port-only lock first. This is done because in some
+	 *   other places this is the lock taken, being followed in only some
+	 *   cases by the portaddr lock.
+	 * - between portaddr and portaddrany always choose the one with the
+	 *   lower address. Unlock ordering is not important, as long as the
+	 *   locking order is consistent.
+	 * - make sure to not take the same lock twice
+	 */
+	spin_lock(&bhead->lock);
+	if (portaddr_bhead > portaddrany_bhead) {
+		spin_lock(&portaddrany_bhead->lock);
+		spin_lock(&portaddr_bhead->lock);
+	} else if (portaddr_bhead < portaddrany_bhead) {
+		spin_lock(&portaddr_bhead->lock);
+		spin_lock(&portaddrany_bhead->lock);
+	} else {
+		spin_lock(&portaddr_bhead->lock);
+	}
+
+	if (ipv6_addr_any(inet6_rcv_saddr(sk))) {
+		struct inet_bind_hashbucket *_head;
+
+		_head = portaddr_bhead;
+		if (bhead->count < portaddr_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((net_eq(ib_net(tb), net)) &&
+				    (tb->port == port) &&
+				    ipv6_addr_equal(&tb->ib_addr_ipv6,
+						    inet6_rcv_saddr(sk)))
+					goto found;
+		}
+		_head = portaddrany_bhead;
+		if (bhead->count < portaddrany_bhead->count) {
+			_head = bhead;
+			inet_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		} else {
+			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
+				if ((ib_net(tb) == net) &&
+				    (tb->port == port) &&
+				    ipv6_addr_any(&tb->ib_addr_ipv6))
+					goto found;
+		}
+	} else {
+		inet_bind_bucket_for_each(tb, node, &bhead->chain)
+			if ((ib_net(tb) == net) && (tb->port == port))
+				goto found;
+	}
+
+	tb = NULL;
+found:
+	if (portaddr_bhead != portaddrany_bhead)
+		spin_unlock(&portaddrany_bhead->lock);
+
+	/* the other locks remain taken, as the caller
+	 * may want to change the hash tables */
+	return tb;
+}
+
+
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
 		       struct sock *sk)
 {
-- 
1.7.10.2

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 0/4] inet: add second hash table
  2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
                   ` (3 preceding siblings ...)
  2012-05-30  7:36 ` [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port Alexandru Copot
@ 2012-05-30  7:57 ` Eric Dumazet
  2012-05-30 12:32   ` Daniel Baluta
  4 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2012-05-30  7:57 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
> This patchset implements all the operations needed to use a second
> (port,address) bind hash table for inet. It uses a similar approach
> as the UDP implementation.
> 
> The performance improvements for port allocation are very good and
> detailed in the last message.
> 
> This is based on a series of patches written by Lucian Grijincu at Ixia.
> 
> Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
> Cc: Daniel Baluta <dbaluta@ixiacom.com>
> Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
> ---
> Alexandru Copot (4):
>       inet: add counter to inet_bind_hashbucket
>       inet: add a second bind hash
>       inet: add/remove inet buckets in the second bind hash
>       inet: use second hash in inet_csk_get_port
> 
>  include/net/inet_hashtables.h    |  140 +++++++++++++++++++++++++++++++--
>  include/net/inet_timewait_sock.h |    5 +-
>  net/dccp/proto.c                 |   37 ++++++++-
>  net/ipv4/inet_connection_sock.c  |   66 ++++++++--------
>  net/ipv4/inet_hashtables.c       |  158 ++++++++++++++++++++++++++++++++++++--
>  net/ipv4/inet_timewait_sock.c    |   16 ++--
>  net/ipv4/tcp.c                   |   17 ++++
>  net/ipv6/inet6_hashtables.c      |   95 +++++++++++++++++++++++
>  8 files changed, 477 insertions(+), 57 deletions(-)


Its a huge change (with many details to look at), for a yet to be
understood need.

What sensible workload needs this at all ?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket
  2012-05-30  7:36 ` [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket Alexandru Copot
@ 2012-05-30  8:00   ` Eric Dumazet
  0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2012-05-30  8:00 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
> The counter will be used by the upcoming INET lookup algorithm to
> choose the shortest chain after secondary hash is added.
> 
> Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
> Cc: Daniel Baluta <dbaluta@ixiacom.com>
> Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
> ---
>  include/net/inet_hashtables.h    |    4 +++-
>  include/net/inet_timewait_sock.h |    4 +++-
>  net/dccp/proto.c                 |    1 +
>  net/ipv4/inet_hashtables.c       |    9 ++++++---
>  net/ipv4/inet_timewait_sock.c    |    7 ++++---
>  net/ipv4/tcp.c                   |    1 +
>  6 files changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 808fc5f..8c6addc 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -98,6 +98,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
>  struct inet_bind_hashbucket {
>  	spinlock_t		lock;
>  	struct hlist_head	chain;
> +	unsigned int		count;
>  };
>  

Are you still using 32bit kernel ?

better use :

struct inet_bind_hashbucket {
	spinlock_t		lock;
	unsigned int		count;
 	struct hlist_head	chain;
};

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 0/4] inet: add second hash table
  2012-05-30  7:57 ` [RFC PATCH 0/4] inet: add second hash table Eric Dumazet
@ 2012-05-30 12:32   ` Daniel Baluta
  2012-05-30 12:41     ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Daniel Baluta @ 2012-05-30 12:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexandru Copot, davem, gerrit, kuznet, jmorris, yoshfuji, kaber,
	netdev, Lucian Grijincu

On Wed, May 30, 2012 at 10:57 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
>> This patchset implements all the operations needed to use a second
>> (port,address) bind hash table for inet. It uses a similar approach
>> as the UDP implementation.
>>
>> The performance improvements for port allocation are very good and
>> detailed in the last message.
>>
>> This is based on a series of patches written by Lucian Grijincu at Ixia.
>>
>> Signed-off-by: Alexandru Copot <alex.mihai.c@gmail.com>
>> Cc: Daniel Baluta <dbaluta@ixiacom.com>
>> Cc: Lucian Grijincu <lucian.grijincu@gmail.com>
>> ---
>> Alexandru Copot (4):
>>       inet: add counter to inet_bind_hashbucket
>>       inet: add a second bind hash
>>       inet: add/remove inet buckets in the second bind hash
>>       inet: use second hash in inet_csk_get_port
>>
>>  include/net/inet_hashtables.h    |  140 +++++++++++++++++++++++++++++++--
>>  include/net/inet_timewait_sock.h |    5 +-
>>  net/dccp/proto.c                 |   37 ++++++++-
>>  net/ipv4/inet_connection_sock.c  |   66 ++++++++--------
>>  net/ipv4/inet_hashtables.c       |  158 ++++++++++++++++++++++++++++++++++++--
>>  net/ipv4/inet_timewait_sock.c    |   16 ++--
>>  net/ipv4/tcp.c                   |   17 ++++
>>  net/ipv6/inet6_hashtables.c      |   95 +++++++++++++++++++++++
>>  8 files changed, 477 insertions(+), 57 deletions(-)
>
>
> Its a huge change (with many details to look at), for a yet to be
> understood need.
>
> What sensible workload needs this at all ?

Hi Eric,

Usually our tests use a huge number of virtual interfaces.
Using this patch we get a massive improvement when there are many sockets
bound to the same port, but different addresses for both bind() and
listen() system calls (both call inet_csk_get_port).
	
We provided some data points in the fourth patch:

For 16.000 interfaces each with a distinct IPv4 address, doing bind
and then listen we get:

* Without patch and without SO_REUSEADDR:
   * bind:   1.543 s
   * listen: 3.050 s

* Without patch and with SO_REUSEADDR set:
   * bind:   0.066 s
   * listen: 3.050 s

* With patch and SO_REUSEADDR set / without SO_REUSEADDR:
   * bind:   0.066 s
   * listen: 0.095 s

The source code for tests can be found here [1].
Just run:
* ./prepare_test2.sh
* ./avg_tcp.sh

If I understood it correctly, a similar patch was introduced
for UDP some time ago. [2]

thanks,
Daniel.

[1] http://ixlabs.cs.pub.ro/gitweb/?p=port-allocation.git;a=tree;f=testbind;h=687e4452101e13cb5995b43c1351d76786d98fdd;hb=HEAD
[2] http://www.spinics.net/lists/netdev/msg112056.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 0/4] inet: add second hash table
  2012-05-30 12:32   ` Daniel Baluta
@ 2012-05-30 12:41     ` Eric Dumazet
  2012-05-30 16:27       ` Ben Greear
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2012-05-30 12:41 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: Alexandru Copot, davem, gerrit, kuznet, jmorris, yoshfuji, kaber,
	netdev, Lucian Grijincu

On Wed, 2012-05-30 at 15:32 +0300, Daniel Baluta wrote:


> Hi Eric,
> 
> Usually our tests use a huge number of virtual interfaces.
> Using this patch we get a massive improvement when there are many sockets
> bound to the same port, but different addresses for both bind() and
> listen() system calls (both call inet_csk_get_port).
> 	
> We provided some data points in the fourth patch:
> 
> For 16.000 interfaces each with a distinct IPv4 address, doing bind
> and then listen we get:
> 

> 
> If I understood it correctly, a similar patch was introduced
> for UDP some time ago. [2]
> 
> thanks,
> Daniel.
> 
> [1] http://ixlabs.cs.pub.ro/gitweb/?p=port-allocation.git;a=tree;f=testbind;h=687e4452101e13cb5995b43c1351d76786d98fdd;hb=HEAD
> [2] http://www.spinics.net/lists/netdev/msg112056.html


UDP case was a bit different, since production machine could really have
thousand of UDP flows for tunnel terminations.

But for TCP, unless your very specific needs I don't see the real need
to review 400 lines of patches ?

Nobody but you ever complained of listen() being performance critical
with 16.000 IP on a machime...

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 0/4] inet: add second hash table
  2012-05-30 12:41     ` Eric Dumazet
@ 2012-05-30 16:27       ` Ben Greear
  0 siblings, 0 replies; 13+ messages in thread
From: Ben Greear @ 2012-05-30 16:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Daniel Baluta, Alexandru Copot, davem, gerrit, kuznet, jmorris,
	yoshfuji, kaber, netdev, Lucian Grijincu

On 05/30/2012 05:41 AM, Eric Dumazet wrote:
> On Wed, 2012-05-30 at 15:32 +0300, Daniel Baluta wrote:

> UDP case was a bit different, since production machine could really have
> thousand of UDP flows for tunnel terminations.
>
> But for TCP, unless your very specific needs I don't see the real need
> to review 400 lines of patches ?
>
> Nobody but you ever complained of listen() being performance critical
> with 16.000 IP on a machime...

Well, we do similar things and would probably benefit from this change...

If it would help, I'll add these to our kernels and run them through
some of our test cases..but will probably be a week or two at soonest...

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
  2012-05-30  7:36 ` [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port Alexandru Copot
@ 2012-05-30 16:42   ` Eric Dumazet
  2012-05-30 17:20   ` Eric Dumazet
  1 sibling, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2012-05-30 16:42 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:

> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index bc06168..2f589bb 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -81,6 +81,15 @@ struct inet_bind_bucket {
>  	struct net		*ib_net;
>  #endif
>  	unsigned short		port;
> +	union {
> +		struct in6_addr ib_addr_ipv6;
> +		struct {
> +			__be32	_1;
> +			__be32	_2;
> +			__be32	_3;
> +			__be32	ib_addr_ipv4;
> +		};
> +	};
>  	signed short		fastreuse;
>  	int			num_owners;
>  	struct hlist_node	node;

Yet another poor choice, adding two holes in this structure.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
  2012-05-30  7:36 ` [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port Alexandru Copot
  2012-05-30 16:42   ` Eric Dumazet
@ 2012-05-30 17:20   ` Eric Dumazet
  2012-05-30 19:11     ` Alexandru Copot
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2012-05-30 17:20 UTC (permalink / raw)
  To: Alexandru Copot
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu

On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:

> +struct inet_bind_bucket *
> +inet4_find_bind_buckets(struct sock *sk,
> +			unsigned short port,
> +			struct inet_bind_hashbucket **p_bhead,
> +			struct inet_bind_hashbucket **p_portaddr_bhead)
> +{
> +	struct net *net = sock_net(sk);
> +	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
> +	struct inet_bind_bucket *tb = NULL;
> +	struct hlist_node *node;
> +
> +	struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
> +	bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
> +	portaddr_bhead = inet4_portaddr_hashbucket(hinfo, net,
> +				sk_rcv_saddr(sk), port);
> +	portaddrany_bhead = inet4_portaddr_hashbucket(hinfo, net,
> +						INADDR_ANY, port);
> +
> +	*p_portaddr_bhead = portaddr_bhead;
> +	*p_bhead = bhead;
> +
> +	/*
> +	 * prevent dead locks by always taking locks in a fixed order:
> +	 * - always take the port-only lock first. This is done because in some
> +	 *   other places this is the lock taken, being folllowed in only some
> +	 *   cases by the portaddr lock.
> +	 * - between portaddr and portaddrany always choose the one with the
> +	 *   lower address. Unlock ordering is not important, as long as the
> +	 *   locking order is consistent.
> +	 * - make sure to not take the same lock twice
> +	 */
> +	spin_lock(&bhead->lock);
> +	if (portaddr_bhead > portaddrany_bhead) {
> +		spin_lock(&portaddrany_bhead->lock);
> +		spin_lock(&portaddr_bhead->lock);
> +	} else if (portaddr_bhead < portaddrany_bhead) {
> +		spin_lock(&portaddr_bhead->lock);
> +		spin_lock(&portaddrany_bhead->lock);
> +	} else {
> +		spin_lock(&portaddr_bhead->lock);
> +	}
> +
> +	if (sk_rcv_saddr(sk) != INADDR_ANY) {
> +		struct inet_bind_hashbucket *_head;
> +
> +		_head = portaddr_bhead;
> +		if (bhead->count < portaddr_bhead->count) {
> +			_head = bhead;
> +			inet_bind_bucket_for_each(tb, node, &_head->chain)
> +				if ((net_eq(ib_net(tb), net)) &&
> +				    (tb->port == port) &&
> +				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
> +					goto found;
> +		} else {
> +			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
> +				if ((net_eq(ib_net(tb), net)) &&
> +				    (tb->port == port) &&
> +				    (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
> +					goto found;
> +		}
> +		_head = portaddrany_bhead;
> +		if (bhead->count < portaddrany_bhead->count) {
> +			_head = bhead;
> +			inet_bind_bucket_for_each(tb, node, &_head->chain)
> +				if ((ib_net(tb) == net) &&
> +				    (tb->port == port) &&
> +				    (tb->ib_addr_ipv4 == INADDR_ANY))
> +					goto found;
> +		} else {
> +			inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
> +				if ((ib_net(tb) == net) &&
> +				    (tb->port == port) &&
> +				    (tb->ib_addr_ipv4 == INADDR_ANY))
> +					goto found;
> +		}
> +	} else {
> +		inet_bind_bucket_for_each(tb, node, &bhead->chain)
> +			if ((ib_net(tb) == net) && (tb->port == port))
> +				goto found;
> +	}
> +
> +	tb = NULL;
> +found:
> +	if (portaddr_bhead != portaddrany_bhead)
> +		spin_unlock(&portaddrany_bhead->lock);
> +
> +	/* the other locks remain taken, as the caller
> +	 * may want to change the hash tabels */
> +	return tb;
> +}
> +
> +

How this is going to work with IPv6 sockets in the middle of the
chains ?

Also, comments are not properly formatted, they should all look like :

	/* the other locks remain taken, as the caller
	 * may want to change the hash tables
	 */

And finally, make sure LOCKDEP is happy with your locking code.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port
  2012-05-30 17:20   ` Eric Dumazet
@ 2012-05-30 19:11     ` Alexandru Copot
  0 siblings, 0 replies; 13+ messages in thread
From: Alexandru Copot @ 2012-05-30 19:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, gerrit, kuznet, jmorris, yoshfuji, kaber, netdev,
	Daniel Baluta, Lucian Grijincu

On Wed, May 30, 2012 at 8:20 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2012-05-30 at 10:36 +0300, Alexandru Copot wrote:
>
>> +struct inet_bind_bucket *
>> +inet4_find_bind_buckets(struct sock *sk,
>> +                     unsigned short port,
>> +                     struct inet_bind_hashbucket **p_bhead,
>> +                     struct inet_bind_hashbucket **p_portaddr_bhead)
>> +{
>> +     struct net *net = sock_net(sk);
>> +     struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
>> +     struct inet_bind_bucket *tb = NULL;
>> +     struct hlist_node *node;
>> +
>> +     struct inet_bind_hashbucket *bhead, *portaddr_bhead, *portaddrany_bhead;
>> +     bhead = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
>> +     portaddr_bhead = inet4_portaddr_hashbucket(hinfo, net,
>> +                             sk_rcv_saddr(sk), port);
>> +     portaddrany_bhead = inet4_portaddr_hashbucket(hinfo, net,
>> +                                             INADDR_ANY, port);
>> +
>> +     *p_portaddr_bhead = portaddr_bhead;
>> +     *p_bhead = bhead;
>> +
>> +     /*
>> +      * prevent dead locks by always taking locks in a fixed order:
>> +      * - always take the port-only lock first. This is done because in some
>> +      *   other places this is the lock taken, being folllowed in only some
>> +      *   cases by the portaddr lock.
>> +      * - between portaddr and portaddrany always choose the one with the
>> +      *   lower address. Unlock ordering is not important, as long as the
>> +      *   locking order is consistent.
>> +      * - make sure to not take the same lock twice
>> +      */
>> +     spin_lock(&bhead->lock);
>> +     if (portaddr_bhead > portaddrany_bhead) {
>> +             spin_lock(&portaddrany_bhead->lock);
>> +             spin_lock(&portaddr_bhead->lock);
>> +     } else if (portaddr_bhead < portaddrany_bhead) {
>> +             spin_lock(&portaddr_bhead->lock);
>> +             spin_lock(&portaddrany_bhead->lock);
>> +     } else {
>> +             spin_lock(&portaddr_bhead->lock);
>> +     }
>> +
>> +     if (sk_rcv_saddr(sk) != INADDR_ANY) {
>> +             struct inet_bind_hashbucket *_head;
>> +
>> +             _head = portaddr_bhead;
>> +             if (bhead->count < portaddr_bhead->count) {
>> +                     _head = bhead;
>> +                     inet_bind_bucket_for_each(tb, node, &_head->chain)
>> +                             if ((net_eq(ib_net(tb), net)) &&
>> +                                 (tb->port == port) &&
>> +                                 (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
>> +                                     goto found;
>> +             } else {
>> +                     inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
>> +                             if ((net_eq(ib_net(tb), net)) &&
>> +                                 (tb->port == port) &&
>> +                                 (tb->ib_addr_ipv4 == sk_rcv_saddr(sk)))
>> +                                     goto found;
>> +             }
>> +             _head = portaddrany_bhead;
>> +             if (bhead->count < portaddrany_bhead->count) {
>> +                     _head = bhead;
>> +                     inet_bind_bucket_for_each(tb, node, &_head->chain)
>> +                             if ((ib_net(tb) == net) &&
>> +                                 (tb->port == port) &&
>> +                                 (tb->ib_addr_ipv4 == INADDR_ANY))
>> +                                     goto found;
>> +             } else {
>> +                     inet_portaddr_bind_bucket_for_each(tb, node, &_head->chain)
>> +                             if ((ib_net(tb) == net) &&
>> +                                 (tb->port == port) &&
>> +                                 (tb->ib_addr_ipv4 == INADDR_ANY))
>> +                                     goto found;
>> +             }
>> +     } else {
>> +             inet_bind_bucket_for_each(tb, node, &bhead->chain)
>> +                     if ((ib_net(tb) == net) && (tb->port == port))
>> +                             goto found;
>> +     }
>> +
>> +     tb = NULL;
>> +found:
>> +     if (portaddr_bhead != portaddrany_bhead)
>> +             spin_unlock(&portaddrany_bhead->lock);
>> +
>> +     /* the other locks remain taken, as the caller
>> +      * may want to change the hash tabels */
>> +     return tb;
>> +}
>> +
>> +
>
> How this is going to work with IPv6 sockets in the middle of the
> chains ?

Now I see it might not work that well. I think I should just skip them
here and only check the IPv4 sockets.

> Also, comments are not properly formatted, they should all look like :
>
>        /* the other locks remain taken, as the caller
>         * may want to change the hash tables
>         */
>
> And finally, make sure LOCKDEP is happy with your locking code.
>
I will check that too.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2012-05-30 19:11 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-30  7:36 [RFC PATCH 0/4] inet: add second hash table Alexandru Copot
2012-05-30  7:36 ` [RFC PATCH 1/4] inet: add counter to inet_bind_hashbucket Alexandru Copot
2012-05-30  8:00   ` Eric Dumazet
2012-05-30  7:36 ` [RFC PATCH 2/4] inet: add a second bind hash Alexandru Copot
2012-05-30  7:36 ` [RFC PATCH 3/4] inet: add/remove inet buckets in the " Alexandru Copot
2012-05-30  7:36 ` [RFC PATCH 4/4] inet: use second hash in inet_csk_get_port Alexandru Copot
2012-05-30 16:42   ` Eric Dumazet
2012-05-30 17:20   ` Eric Dumazet
2012-05-30 19:11     ` Alexandru Copot
2012-05-30  7:57 ` [RFC PATCH 0/4] inet: add second hash table Eric Dumazet
2012-05-30 12:32   ` Daniel Baluta
2012-05-30 12:41     ` Eric Dumazet
2012-05-30 16:27       ` Ben Greear

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).