All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr)
@ 2017-11-30 23:23 Martin KaFai Lau
  2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

This patch set adds a 2nd listener hashtable.  It is to resolve
the performance issue when a process is listening at many IP
addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443)

Martin KaFai Lau (4):
  inet: Add a count to struct inet_listen_hashbucket
  udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
  inet: Add a 2nd listener hashtable (port+addr)
  tcp: Enable 2nd listener hashtable in TCP

 include/net/inet_connection_sock.h |   2 +
 include/net/inet_hashtables.h      |  16 ++++
 include/net/ip.h                   |   9 ++
 include/net/ipv6.h                 |  17 ++++
 net/ipv4/inet_hashtables.c         | 173 +++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp.c                     |   3 +
 net/ipv4/udp.c                     |  22 ++---
 net/ipv6/inet6_hashtables.c        |  66 ++++++++++++++
 net/ipv6/udp.c                     |  32 ++-----
 9 files changed, 294 insertions(+), 46 deletions(-)

-- 
2.9.5

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket
  2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
  2017-12-01 17:19   ` Eric Dumazet
  2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

This patch adds a count to the 'struct inet_listen_hashbucket'.
It counts how many sk is hashed to a bucket.  It will be
used to decide if the (to-be-added) portaddr listener's hashtable
should be used during inet[6]_lookup_listener().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/net/inet_hashtables.h |  1 +
 net/ipv4/inet_hashtables.c    | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 2dbbbff5e1e3..4cce516c41ac 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -111,6 +111,7 @@ struct inet_bind_hashbucket {
  */
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
+	unsigned int		count;
 	struct hlist_head	head;
 };
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 427b705d7c64..80cfd3fa21ca 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -476,6 +476,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
 	else
 		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+	ilb->count++;
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
@@ -502,6 +503,7 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *ilb;
 	spinlock_t *lock;
 	bool listener = false;
 	int done;
@@ -510,7 +512,8 @@ void inet_unhash(struct sock *sk)
 		return;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &ilb->lock;
 		listener = true;
 	} else {
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -522,8 +525,11 @@ void inet_unhash(struct sock *sk)
 		done = __sk_del_node_init(sk);
 	else
 		done = __sk_nulls_del_node_init_rcu(sk);
-	if (done)
+	if (done) {
+		if (listener)
+			ilb->count--;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
 	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -658,6 +664,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
 		INIT_HLIST_HEAD(&h->listening_hash[i].head);
+		h->listening_hash[i].count = 0;
 	}
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
  2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
  2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
  2017-12-01 17:22   ` Eric Dumazet
  2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
  2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
  3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

This patch moves the udp[46]_portaddr_hash()
to net/ip[v6].h.  The function name is renamed to
ipv[46]_portaddr_hash().

It will be used by a later patch which adds a second listener
hashtable hashed by the address and port.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/net/ip.h   |  9 +++++++++
 include/net/ipv6.h | 17 +++++++++++++++++
 net/ipv4/udp.c     | 22 ++++++++--------------
 net/ipv6/udp.c     | 32 ++++++++------------------------
 4 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 9896f46cbbf1..fc9bf1b1fe2c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -26,12 +26,14 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 #include <linux/skbuff.h>
+#include <linux/jhash.h>
 
 #include <net/inet_sock.h>
 #include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
 #include <net/flow_dissector.h>
+#include <net/netns/hash.h>
 
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 
@@ -521,6 +523,13 @@ static inline unsigned int ipv4_addr_hash(__be32 ip)
 	return (__force unsigned int) ip;
 }
 
+static inline u32 ipv4_portaddr_hash(const struct net *net,
+				     __be32 saddr,
+				     unsigned int port)
+{
+	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
+}
+
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f73797e2fa60..25be4715578c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -22,6 +22,7 @@
 #include <net/flow.h>
 #include <net/flow_dissector.h>
 #include <net/snmp.h>
+#include <net/netns/hash.h>
 
 #define SIN6_LEN_RFC2133	24
 
@@ -673,6 +674,22 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
 					cpu_to_be32(0x0000ffff))) == 0UL;
 }
 
+static inline u32 ipv6_portaddr_hash(const struct net *net,
+				     const struct in6_addr *addr6,
+				     unsigned int port)
+{
+	unsigned int hash, mix = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		hash = jhash_1word(0, mix);
+	else if (ipv6_addr_v4mapped(addr6))
+		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
+	else
+		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
+
+	return hash ^ port;
+}
+
 /*
  * Check for a RFC 4843 ORCHID address
  * (Overlay Routable Cryptographic Hash Identifiers)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 36f857c87fe2..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
-			      unsigned int port)
-{
-	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+		ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
 	unsigned int hash2_partial =
-		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+		ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -485,7 +479,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -496,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 					  exact_dif, hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+			hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -1761,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
 
 static void udp_v4_rehash(struct sock *sk)
 {
-	u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
 					  inet_sk(sk)->inet_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 	udp_lib_rehash(sk, new_hash);
@@ -1952,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
 			    udptable->mask;
-		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2186,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 					    int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c9f91c28b81d..eecf9f0faf29 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net,
 			       udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-static u32 udp6_portaddr_hash(const struct net *net,
-			      const struct in6_addr *addr6,
-			      unsigned int port)
-{
-	unsigned int hash, mix = net_hash_mix(net);
-
-	if (ipv6_addr_any(addr6))
-		hash = jhash_1word(0, mix);
-	else if (ipv6_addr_v4mapped(addr6))
-		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
-	else
-		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
-
-	return hash ^ port;
-}
-
 int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+		ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
 	unsigned int hash2_partial =
-		udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+		ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
 
 	/* precompute partial secondary hash */
 	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 
 static void udp_v6_rehash(struct sock *sk)
 {
-	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+	u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
 					  &sk->sk_v6_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 
@@ -225,7 +209,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 	u32 hash = 0;
 
 	if (hslot->count > 10) {
-		hash2 = udp6_portaddr_hash(net, daddr, hnum);
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum);
 		slot2 = hash2 & udptable->mask;
 		hslot2 = &udptable->hash2[slot2];
 		if (hslot->count < hslot2->count)
@@ -236,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
-			hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+			hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
 			slot2 = hash2 & udptable->mask;
 			/* avoid searching the same slot again. */
 			if (unlikely(slot2 == old_slot2))
@@ -705,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 
 	if (use_hash2) {
-		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+		hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
 			    udptable->mask;
-		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+		hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
 		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -895,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
 			int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
-	unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
+	unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
 	unsigned int slot2 = hash2 & udp_table.mask;
 	struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
 	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
  2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
  2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
  2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
  2017-12-01 17:26   ` Eric Dumazet
  2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
  3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

The current listener hashtable is hashed by port only.
When a process is listening at many IP addresses with the same port (e.g.
[IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
performance is degraded to a link list.  It is prone to syn attack.

UDP had a similar issue and a second hashtable was added to resolve it.

This patch adds a second hashtable for the listener's sockets.
The second hashtable is hashed by port and address.

It cannot reuse the existing skc_portaddr_node which is shared
with skc_bind_node.  TCP listener needs to use skc_bind_node.
Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to
the inet_connection_sock which the listener (like TCP) also belongs to.

The new portaddr hashtable may need two lookup (First by IP:PORT.
Second by INADDR_ANY:PORT if the IP:PORT is a not found).   Hence,
it implements a similar cut off as UDP such that it will only consult the
new portaddr hashtable if the current port-only hashtable has >10
sk in the link-list.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/net/inet_connection_sock.h |   2 +
 include/net/inet_hashtables.h      |  15 ++++
 net/ipv4/inet_hashtables.c         | 168 +++++++++++++++++++++++++++++++++++--
 net/ipv6/inet6_hashtables.c        |  66 +++++++++++++++
 4 files changed, 242 insertions(+), 9 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 0358745ea059..8e1bf9ae4a5e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
+ * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -101,6 +102,7 @@ struct inet_connection_sock {
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void			  *icsk_ulp_data;
+	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:6,
 				  icsk_ca_setsockopt:1,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4cce516c41ac..ebce55d694e7 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -152,8 +152,19 @@ struct inet_hashinfo {
 	 */
 	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
 					____cacheline_aligned_in_smp;
+	struct inet_listen_hashbucket	*lhash2;
+	unsigned int			lhash2_mask;
 };
 
+#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
+	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
+
+static inline struct inet_listen_hashbucket *
+inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
+{
+	return &h->lhash2[hash & h->lhash2_mask];
+}
+
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
 	struct inet_hashinfo *hashinfo,
 	unsigned int hash)
@@ -209,6 +220,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
 void inet_put_port(struct sock *sk);
 
 void inet_hashinfo_init(struct inet_hashinfo *h);
+void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+			 unsigned long numentries, int scale,
+			 unsigned long low_limit,
+			 unsigned long high_limit);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 80cfd3fa21ca..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/bootmem.h>
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+	u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		hash = ipv6_portaddr_hash(sock_net(sk),
+					  &sk->sk_v6_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	else
+#endif
+		hash = ipv4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2)
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+		hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	else
+		hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+				   &ilb2->head);
+	ilb2->count++;
+	spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2 ||
+	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+		return;
+
+	ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+	ilb2->count--;
+	spin_unlock(&ilb2->lock);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum, const __be32 daddr,
 				const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
  */
 
 /* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+				struct inet_listen_hashbucket *ilb2,
+				struct sk_buff *skb, int doff,
+				const __be32 saddr, __be16 sport,
+				const __be32 daddr, const unsigned short hnum,
+				const int dif, const int sdif)
+{
+	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr,
+				      dif, sdif, exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet_ehashfn(net, daddr, hnum,
+						     saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    struct sk_buff *skb, int doff,
@@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	bool exact_dif = inet_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
 	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, daddr, hnum,
+				    dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with INADDR_ANY */
+
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet_lhash2_lookup(net, ilb2, skb, doff,
+				  saddr, sport, daddr, hnum,
+				  dif, sdif);
+
+port_lookup:
 	sk_for_each_rcu(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr,
 				      dif, sdif, exact_dif);
@@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
 		hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
 	else
 		hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+	inet_hash2(hashinfo, sk);
 	ilb->count++;
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
 	struct inet_listen_hashbucket *ilb;
 	spinlock_t *lock;
 	bool listener = false;
-	int done;
 
 	if (sk_unhashed(sk))
 		return;
@@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	}
 	spin_lock_bh(lock);
+	if (sk_unhashed(sk))
+		goto unlock;
+
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
 		reuseport_detach_sock(sk);
-	if (listener)
-		done = __sk_del_node_init(sk);
-	else
-		done = __sk_nulls_del_node_init_rcu(sk);
-	if (done) {
-		if (listener)
-			ilb->count--;
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	if (listener) {
+		inet_unhash2(hashinfo, sk);
+		 __sk_del_node_init(sk);
+		 ilb->count--;
+	} else {
+		__sk_nulls_del_node_init_rcu(sk);
 	}
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
 	spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 		INIT_HLIST_HEAD(&h->listening_hash[i].head);
 		h->listening_hash[i].count = 0;
 	}
+
+	h->lhash2 = NULL;
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
 
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+				unsigned long numentries, int scale,
+				unsigned long low_limit,
+				unsigned long high_limit)
+{
+	unsigned int i;
+
+	h->lhash2 = alloc_large_system_hash(name,
+					    sizeof(*h->lhash2),
+					    numentries,
+					    scale,
+					    0,
+					    NULL,
+					    &h->lhash2_mask,
+					    low_limit,
+					    high_limit);
+
+	for (i = 0; i <= h->lhash2_mask; i++) {
+		spin_lock_init(&h->lhash2[i].lock);
+		INIT_HLIST_HEAD(&h->lhash2[i].head);
+		h->lhash2[i].count = 0;
+	}
+}
+
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
 {
 	unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 0d1451381f5c..2febe26de6a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 /* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+		struct inet_listen_hashbucket *ilb2,
+		struct sk_buff *skb, int doff,
+		const struct in6_addr *saddr,
+		const __be16 sport, const struct in6_addr *daddr,
+		const unsigned short hnum, const int dif, const int sdif)
+{
+	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore = 0;
+	u32 phash = 0;
+
+	inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+		sk = (struct sock *)icsk;
+		score = compute_score(sk, net, hnum, daddr, dif, sdif,
+				      exact_dif);
+		if (score > hiscore) {
+			if (sk->sk_reuseport) {
+				phash = inet6_ehashfn(net, daddr, hnum,
+						      saddr, sport);
+				result = reuseport_select_sock(sk, phash,
+							       skb, doff);
+				if (result)
+					return result;
+			}
+			result = sk;
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo,
 		struct sk_buff *skb, int doff,
@@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	bool exact_dif = inet6_exact_dif_match(net, skb);
+	struct inet_listen_hashbucket *ilb2;
 	struct sock *sk, *result = NULL;
 	int score, hiscore = 0;
+	unsigned int hash2;
 	u32 phash = 0;
 
+	if (ilb->count <= 10 || !hashinfo->lhash2)
+		goto port_lookup;
+
+	/* Too many sk in the ilb bucket (which is hashed by port alone).
+	 * Try lhash2 (which is hashed by port and addr) instead.
+	 */
+
+	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+				     saddr, sport, daddr, hnum,
+				     dif, sdif);
+	if (result)
+		return result;
+
+	/* Lookup lhash2 with in6addr_any */
+
+	hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+	if (ilb2->count > ilb->count)
+		goto port_lookup;
+
+	return inet6_lhash2_lookup(net, ilb2, skb, doff,
+				   saddr, sport, daddr, hnum,
+				   dif, sdif);
+
+port_lookup:
 	sk_for_each(sk, &ilb->head) {
 		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
 		if (score > hiscore) {
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP
  2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
                   ` (2 preceding siblings ...)
  2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
  2017-12-01 17:29   ` Eric Dumazet
  3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
  To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

Enable the second listener hashtable in TCP.
The scale is the same as UDP which is one slot per 2MB.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..180311636023 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3577,6 +3577,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket
  2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
@ 2017-12-01 17:19   ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:19 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> This patch adds a count to the 'struct inet_listen_hashbucket'.
> It counts how many sk is hashed to a bucket.  It will be
> used to decide if the (to-be-added) portaddr listener's hashtable
> should be used during inet[6]_lookup_listener().
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
> 

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
  2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
@ 2017-12-01 17:22   ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:22 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> This patch moves the udp[46]_portaddr_hash()
> to net/ip[v6].h.  The function name is renamed to
> ipv[46]_portaddr_hash().
> 
> It will be used by a later patch which adds a second listener
> hashtable hashed by the address and port.
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
  2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-12-01 17:26   ` Eric Dumazet
  2017-12-01 17:47     ` Martin KaFai Lau
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:26 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> The current listener hashtable is hashed by port only.
> When a process is listening at many IP addresses with the same port
> (e.g.
> [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
> performance is degraded to a link list.  It is prone to syn attack.
> 
> 

...

> diff --git a/include/net/inet_hashtables.h
> b/include/net/inet_hashtables.h
> index 4cce516c41ac..ebce55d694e7 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -152,8 +152,19 @@ struct inet_hashinfo {
>  	 */
>  	struct inet_listen_hashbucket	listening_hash[INET_LHT
> ABLE_SIZE]
>  					____cacheline_aligned_in_smp
> ;
> +	struct inet_listen_hashbucket	*lhash2;
> +	unsigned int			lhash2_mask;
>  };
> 

Please move these two fields before listening_hash[], to avoid adding
another cache line to "struct inet_hashinfo"

( We have a 16-byte hole there, so lets use it ;) )

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP
  2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
@ 2017-12-01 17:29   ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:29 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team

On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> Enable the second listener hashtable in TCP.
> The scale is the same as UDP which is one slot per 2MB.
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  net/ipv4/tcp.c | 3 +++
>  1 file changed, 3 insertions(+)
> 

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
  2017-12-01 17:26   ` Eric Dumazet
@ 2017-12-01 17:47     ` Martin KaFai Lau
  0 siblings, 0 replies; 10+ messages in thread
From: Martin KaFai Lau @ 2017-12-01 17:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David S . Miller, Eric Dumazet, Kernel Team

On Fri, Dec 01, 2017 at 09:26:46AM -0800, Eric Dumazet wrote:
> On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> > The current listener hashtable is hashed by port only.
> > When a process is listening at many IP addresses with the same port
> > (e.g.
> > [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
> > performance is degraded to a link list.  It is prone to syn attack.
> > 
> > 
> 
> ...
> 
> > diff --git a/include/net/inet_hashtables.h
> > b/include/net/inet_hashtables.h
> > index 4cce516c41ac..ebce55d694e7 100644
> > --- a/include/net/inet_hashtables.h
> > +++ b/include/net/inet_hashtables.h
> > @@ -152,8 +152,19 @@ struct inet_hashinfo {
> >  	 */
> >  	struct inet_listen_hashbucket	listening_hash[INET_LHT
> > ABLE_SIZE]
> >  					____cacheline_aligned_in_smp
> > ;
> > +	struct inet_listen_hashbucket	*lhash2;
> > +	unsigned int			lhash2_mask;
> >  };
> > 
> 
> Please move these two fields before listening_hash[], to avoid adding
> another cache line to "struct inet_hashinfo"
> 
> ( We have a 16-byte hole there, so lets use it ;) )
Oops. Indeed. Thanks for pointing it out.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-12-01 17:50 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
2017-12-01 17:19   ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
2017-12-01 17:22   ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-12-01 17:26   ` Eric Dumazet
2017-12-01 17:47     ` Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
2017-12-01 17:29   ` Eric Dumazet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.