* [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr)
@ 2017-11-30 23:23 Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
` (3 more replies)
0 siblings, 4 replies; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
This patch set adds a 2nd listener hashtable. It is to resolve
the performance issue when a process is listening at many IP
addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443)
Martin KaFai Lau (4):
inet: Add a count to struct inet_listen_hashbucket
udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
inet: Add a 2nd listener hashtable (port+addr)
tcp: Enable 2nd listener hashtable in TCP
include/net/inet_connection_sock.h | 2 +
include/net/inet_hashtables.h | 16 ++++
include/net/ip.h | 9 ++
include/net/ipv6.h | 17 ++++
net/ipv4/inet_hashtables.c | 173 +++++++++++++++++++++++++++++++++++--
net/ipv4/tcp.c | 3 +
net/ipv4/udp.c | 22 ++---
net/ipv6/inet6_hashtables.c | 66 ++++++++++++++
net/ipv6/udp.c | 32 ++-----
9 files changed, 294 insertions(+), 46 deletions(-)
--
2.9.5
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
2017-12-01 17:19 ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
This patch adds a count to the 'struct inet_listen_hashbucket'.
It counts how many sk is hashed to a bucket. It will be
used to decide if the (to-be-added) portaddr listener's hashtable
should be used during inet[6]_lookup_listener().
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
include/net/inet_hashtables.h | 1 +
net/ipv4/inet_hashtables.c | 11 +++++++++--
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 2dbbbff5e1e3..4cce516c41ac 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -111,6 +111,7 @@ struct inet_bind_hashbucket {
*/
struct inet_listen_hashbucket {
spinlock_t lock;
+ unsigned int count;
struct hlist_head head;
};
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 427b705d7c64..80cfd3fa21ca 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -476,6 +476,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
else
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
@@ -502,6 +503,7 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
spinlock_t *lock;
bool listener = false;
int done;
@@ -510,7 +512,8 @@ void inet_unhash(struct sock *sk)
return;
if (sk->sk_state == TCP_LISTEN) {
- lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &ilb->lock;
listener = true;
} else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -522,8 +525,11 @@ void inet_unhash(struct sock *sk)
done = __sk_del_node_init(sk);
else
done = __sk_nulls_del_node_init_rcu(sk);
- if (done)
+ if (done) {
+ if (listener)
+ ilb->count--;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ }
spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -658,6 +664,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ h->listening_hash[i].count = 0;
}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
2017-12-01 17:22 ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
This patch moves the udp[46]_portaddr_hash()
to net/ip[v6].h. The function name is renamed to
ipv[46]_portaddr_hash().
It will be used by a later patch which adds a second listener
hashtable hashed by the address and port.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
include/net/ip.h | 9 +++++++++
include/net/ipv6.h | 17 +++++++++++++++++
net/ipv4/udp.c | 22 ++++++++--------------
net/ipv6/udp.c | 32 ++++++++------------------------
4 files changed, 42 insertions(+), 38 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 9896f46cbbf1..fc9bf1b1fe2c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -26,12 +26,14 @@
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/skbuff.h>
+#include <linux/jhash.h>
#include <net/inet_sock.h>
#include <net/route.h>
#include <net/snmp.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
+#include <net/netns/hash.h>
#define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */
@@ -521,6 +523,13 @@ static inline unsigned int ipv4_addr_hash(__be32 ip)
return (__force unsigned int) ip;
}
+static inline u32 ipv4_portaddr_hash(const struct net *net,
+ __be32 saddr,
+ unsigned int port)
+{
+ return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
+}
+
bool ip_call_ra_chain(struct sk_buff *skb);
/*
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f73797e2fa60..25be4715578c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -22,6 +22,7 @@
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/snmp.h>
+#include <net/netns/hash.h>
#define SIN6_LEN_RFC2133 24
@@ -673,6 +674,22 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
cpu_to_be32(0x0000ffff))) == 0UL;
}
+static inline u32 ipv6_portaddr_hash(const struct net *net,
+ const struct in6_addr *addr6,
+ unsigned int port)
+{
+ unsigned int hash, mix = net_hash_mix(net);
+
+ if (ipv6_addr_any(addr6))
+ hash = jhash_1word(0, mix);
+ else if (ipv6_addr_v4mapped(addr6))
+ hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
+ else
+ hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
+
+ return hash ^ port;
+}
+
/*
* Check for a RFC 4843 ORCHID address
* (Overlay Routable Cryptographic Hash Identifiers)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 36f857c87fe2..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
}
EXPORT_SYMBOL(udp_lib_get_port);
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
- unsigned int port)
-{
- return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
unsigned int hash2_partial =
- udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -485,7 +479,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
u32 hash = 0;
if (hslot->count > 10) {
- hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
@@ -496,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
- hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
@@ -1761,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
static void udp_v4_rehash(struct sock *sk)
{
- u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
udp_lib_rehash(sk, new_hash);
@@ -1952,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sk_buff *nskb;
if (use_hash2) {
- hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
udptable->mask;
- hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2186,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c9f91c28b81d..eecf9f0faf29 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -89,28 +89,12 @@ static u32 udp6_ehashfn(const struct net *net,
udp_ipv6_hash_secret + net_hash_mix(net));
}
-static u32 udp6_portaddr_hash(const struct net *net,
- const struct in6_addr *addr6,
- unsigned int port)
-{
- unsigned int hash, mix = net_hash_mix(net);
-
- if (ipv6_addr_any(addr6))
- hash = jhash_1word(0, mix);
- else if (ipv6_addr_v4mapped(addr6))
- hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
- else
- hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
-
- return hash ^ port;
-}
-
int udp_v6_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+ ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
unsigned int hash2_partial =
- udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+ ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -119,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
static void udp_v6_rehash(struct sock *sk)
{
- u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+ u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
&sk->sk_v6_rcv_saddr,
inet_sk(sk)->inet_num);
@@ -225,7 +209,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
u32 hash = 0;
if (hslot->count > 10) {
- hash2 = udp6_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
@@ -236,7 +220,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
- hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
@@ -705,9 +689,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sk_buff *nskb;
if (use_hash2) {
- hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+ hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) &
udptable->mask;
- hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -895,7 +879,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
+ unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
2017-12-01 17:26 ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
The current listener hashtable is hashed by port only.
When a process is listening at many IP addresses with the same port (e.g.
[IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
performance is degraded to a link list. It is prone to syn attack.
UDP had a similar issue and a second hashtable was added to resolve it.
This patch adds a second hashtable for the listener's sockets.
The second hashtable is hashed by port and address.
It cannot reuse the existing skc_portaddr_node which is shared
with skc_bind_node. TCP listener needs to use skc_bind_node.
Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to
the inet_connection_sock which the listener (like TCP) also belongs to.
The new portaddr hashtable may need two lookup (First by IP:PORT.
Second by INADDR_ANY:PORT if the IP:PORT is a not found). Hence,
it implements a similar cut off as UDP such that it will only consult the
new portaddr hashtable if the current port-only hashtable has >10
sk in the link-list.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
include/net/inet_connection_sock.h | 2 +
include/net/inet_hashtables.h | 15 ++++
net/ipv4/inet_hashtables.c | 168 +++++++++++++++++++++++++++++++++++--
net/ipv6/inet6_hashtables.c | 66 +++++++++++++++
4 files changed, 242 insertions(+), 9 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 0358745ea059..8e1bf9ae4a5e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
* @icsk_af_ops Operations which are AF_INET{4,6} specific
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
+ * @icsk_listen_portaddr_node hash to the portaddr listener hashtable
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
@@ -101,6 +102,7 @@ struct inet_connection_sock {
const struct inet_connection_sock_af_ops *icsk_af_ops;
const struct tcp_ulp_ops *icsk_ulp_ops;
void *icsk_ulp_data;
+ struct hlist_node icsk_listen_portaddr_node;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:6,
icsk_ca_setsockopt:1,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4cce516c41ac..ebce55d694e7 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -152,8 +152,19 @@ struct inet_hashinfo {
*/
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
____cacheline_aligned_in_smp;
+ struct inet_listen_hashbucket *lhash2;
+ unsigned int lhash2_mask;
};
+#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
+ hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
+
+static inline struct inet_listen_hashbucket *
+inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
+{
+ return &h->lhash2[hash & h->lhash2_mask];
+}
+
static inline struct inet_ehash_bucket *inet_ehash_bucket(
struct inet_hashinfo *hashinfo,
unsigned int hash)
@@ -209,6 +220,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
void inet_put_port(struct sock *sk);
void inet_hashinfo_init(struct inet_hashinfo *h);
+void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long numentries, int scale,
+ unsigned long low_limit,
+ unsigned long high_limit);
bool inet_ehash_insert(struct sock *sk, struct sock *osk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 80cfd3fa21ca..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <linux/bootmem.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ else
+#endif
+ hash = ipv4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2)
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ else
+ hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ ilb2->count++;
+ spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2 ||
+ WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+ ilb2->count--;
+ spin_unlock(&ilb2->lock);
+}
+
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
*/
/* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_connection_sock *icsk;
+ struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ u32 phash = 0;
+
+ inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ score = compute_score(sk, net, hnum, daddr,
+ dif, sdif, exact_dif);
+ if (score > hiscore) {
+ if (sk->sk_reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
+ }
+ result = sk;
+ hiscore = score;
+ }
+ }
+
+ return result;
+}
+
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_listen_hashbucket *ilb2;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
+ unsigned int hash2;
u32 phash = 0;
+ if (ilb->count <= 10 || !hashinfo->lhash2)
+ goto port_lookup;
+
+ /* Too many sk in the ilb bucket (which is hashed by port alone).
+ * Try lhash2 (which is hashed by port and addr) instead.
+ */
+
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ return result;
+
+ /* Lookup lhash2 with INADDR_ANY */
+
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ return inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+
+port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr,
dif, sdif, exact_dif);
@@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
else
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ inet_hash2(hashinfo, sk);
ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
struct inet_listen_hashbucket *ilb;
spinlock_t *lock;
bool listener = false;
- int done;
if (sk_unhashed(sk))
return;
@@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
}
spin_lock_bh(lock);
+ if (sk_unhashed(sk))
+ goto unlock;
+
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (listener)
- done = __sk_del_node_init(sk);
- else
- done = __sk_nulls_del_node_init_rcu(sk);
- if (done) {
- if (listener)
- ilb->count--;
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ if (listener) {
+ inet_unhash2(hashinfo, sk);
+ __sk_del_node_init(sk);
+ ilb->count--;
+ } else {
+ __sk_nulls_del_node_init_rcu(sk);
}
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
INIT_HLIST_HEAD(&h->listening_hash[i].head);
h->listening_hash[i].count = 0;
}
+
+ h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long numentries, int scale,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned int i;
+
+ h->lhash2 = alloc_large_system_hash(name,
+ sizeof(*h->lhash2),
+ numentries,
+ scale,
+ 0,
+ NULL,
+ &h->lhash2_mask,
+ low_limit,
+ high_limit);
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_HEAD(&h->lhash2[i].head);
+ h->lhash2[i].count = 0;
+ }
+}
+
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 0d1451381f5c..2febe26de6a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
/* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport, const struct in6_addr *daddr,
+ const unsigned short hnum, const int dif, const int sdif)
+{
+ bool exact_dif = inet6_exact_dif_match(net, skb);
+ struct inet_connection_sock *icsk;
+ struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ u32 phash = 0;
+
+ inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ score = compute_score(sk, net, hnum, daddr, dif, sdif,
+ exact_dif);
+ if (score > hiscore) {
+ if (sk->sk_reuseport) {
+ phash = inet6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
+ }
+ result = sk;
+ hiscore = score;
+ }
+ }
+
+ return result;
+}
+
struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
bool exact_dif = inet6_exact_dif_match(net, skb);
+ struct inet_listen_hashbucket *ilb2;
struct sock *sk, *result = NULL;
int score, hiscore = 0;
+ unsigned int hash2;
u32 phash = 0;
+ if (ilb->count <= 10 || !hashinfo->lhash2)
+ goto port_lookup;
+
+ /* Too many sk in the ilb bucket (which is hashed by port alone).
+ * Try lhash2 (which is hashed by port and addr) instead.
+ */
+
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ return result;
+
+ /* Lookup lhash2 with in6addr_any */
+
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ return inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+
+port_lookup:
sk_for_each(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
if (score > hiscore) {
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
` (2 preceding siblings ...)
2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-11-30 23:23 ` Martin KaFai Lau
2017-12-01 17:29 ` Eric Dumazet
3 siblings, 1 reply; 10+ messages in thread
From: Martin KaFai Lau @ 2017-11-30 23:23 UTC (permalink / raw)
To: netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
Enable the second listener hashtable in TCP.
The scale is the same as UDP which is one slot per 2MB.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
net/ipv4/tcp.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bf97317e6c97..180311636023 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3577,6 +3577,9 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
inet_hashinfo_init(&tcp_hashinfo);
+ inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+ thash_entries, 21, /* one slot per 2 MB*/
+ 0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
--
2.9.5
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
@ 2017-12-01 17:19 ` Eric Dumazet
0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:19 UTC (permalink / raw)
To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> This patch adds a count to the 'struct inet_listen_hashbucket'.
> It counts how many sk is hashed to a bucket. It will be
> used to decide if the (to-be-added) portaddr listener's hashtable
> should be used during inet[6]_lookup_listener().
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h
2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
@ 2017-12-01 17:22 ` Eric Dumazet
0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:22 UTC (permalink / raw)
To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> This patch moves the udp[46]_portaddr_hash()
> to net/ip[v6].h. The function name is renamed to
> ipv[46]_portaddr_hash().
>
> It will be used by a later patch which adds a second listener
> hashtable hashed by the address and port.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
@ 2017-12-01 17:26 ` Eric Dumazet
2017-12-01 17:47 ` Martin KaFai Lau
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:26 UTC (permalink / raw)
To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> The current listener hashtable is hashed by port only.
> When a process is listening at many IP addresses with the same port
> (e.g.
> [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
> performance is degraded to a link list. It is prone to syn attack.
>
>
...
> diff --git a/include/net/inet_hashtables.h
> b/include/net/inet_hashtables.h
> index 4cce516c41ac..ebce55d694e7 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -152,8 +152,19 @@ struct inet_hashinfo {
> */
> struct inet_listen_hashbucket listening_hash[INET_LHT
> ABLE_SIZE]
> ____cacheline_aligned_in_smp
> ;
> + struct inet_listen_hashbucket *lhash2;
> + unsigned int lhash2_mask;
> };
>
Please move these two fields before listening_hash[], to avoid adding
another cache line to "struct inet_hashinfo"
( We have a 16-byte hole there, so lets use it ;) )
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP
2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
@ 2017-12-01 17:29 ` Eric Dumazet
0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2017-12-01 17:29 UTC (permalink / raw)
To: Martin KaFai Lau, netdev; +Cc: David S . Miller, Eric Dumazet, Kernel Team
On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> Enable the second listener hashtable in TCP.
> The scale is the same as UDP which is one slot per 2MB.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
> net/ipv4/tcp.c | 3 +++
> 1 file changed, 3 insertions(+)
>
Reviewed-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr)
2017-12-01 17:26 ` Eric Dumazet
@ 2017-12-01 17:47 ` Martin KaFai Lau
0 siblings, 0 replies; 10+ messages in thread
From: Martin KaFai Lau @ 2017-12-01 17:47 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, David S . Miller, Eric Dumazet, Kernel Team
On Fri, Dec 01, 2017 at 09:26:46AM -0800, Eric Dumazet wrote:
> On Thu, 2017-11-30 at 15:23 -0800, Martin KaFai Lau wrote:
> > The current listener hashtable is hashed by port only.
> > When a process is listening at many IP addresses with the same port
> > (e.g.
> > [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
> > performance is degraded to a link list. It is prone to syn attack.
> >
> >
>
> ...
>
> > diff --git a/include/net/inet_hashtables.h
> > b/include/net/inet_hashtables.h
> > index 4cce516c41ac..ebce55d694e7 100644
> > --- a/include/net/inet_hashtables.h
> > +++ b/include/net/inet_hashtables.h
> > @@ -152,8 +152,19 @@ struct inet_hashinfo {
> > */
> > struct inet_listen_hashbucket listening_hash[INET_LHT
> > ABLE_SIZE]
> > ____cacheline_aligned_in_smp
> > ;
> > + struct inet_listen_hashbucket *lhash2;
> > + unsigned int lhash2_mask;
> > };
> >
>
> Please move these two fields before listening_hash[], to avoid adding
> another cache line to "struct inet_hashinfo"
>
> ( We have a 16-byte hole there, so lets use it ;) )
Oops. Indeed. Thanks for pointing it out.
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2017-12-01 17:50 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-30 23:23 [PATCH net-next 0/4] tcp: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 1/4] inet: Add a count to struct inet_listen_hashbucket Martin KaFai Lau
2017-12-01 17:19 ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 2/4] udp: Move udp[46]_portaddr_hash() to net/ip[v6].h Martin KaFai Lau
2017-12-01 17:22 ` Eric Dumazet
2017-11-30 23:23 ` [PATCH net-next 3/4] inet: Add a 2nd listener hashtable (port+addr) Martin KaFai Lau
2017-12-01 17:26 ` Eric Dumazet
2017-12-01 17:47 ` Martin KaFai Lau
2017-11-30 23:23 ` [PATCH net-next 4/4] tcp: Enable 2nd listener hashtable in TCP Martin KaFai Lau
2017-12-01 17:29 ` Eric Dumazet
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.