netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC net 0/2] tcp: Fix bhash2 and TIME_WAIT regression.
@ 2022-12-21 15:12 Kuniyuki Iwashima
  2022-12-21 15:12 ` [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2 Kuniyuki Iwashima
  2022-12-21 15:12 ` [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT Kuniyuki Iwashima
  0 siblings, 2 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-21 15:12 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Jiri Slaby, Joanne Koong, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

We forgot to add TIME_WAIT sockets to bhash2.  Therefore twsk cannot
prevent bind() to the same local address and port.

The first patch fixes the issue, but the layout change in struct
sock could have a negative impact.  So, this series is RFC.


Kuniyuki Iwashima (2):
  tcp: Add TIME_WAIT sockets in bhash2.
  tcp: Add selftest for bind() and TIME_WAIT.

 include/net/inet_timewait_sock.h            |  2 +
 include/net/sock.h                          |  5 +-
 net/ipv4/inet_hashtables.c                  |  5 +-
 net/ipv4/inet_timewait_sock.c               | 31 ++++++-
 tools/testing/selftests/net/.gitignore      |  1 +
 tools/testing/selftests/net/bind_timewait.c | 93 +++++++++++++++++++++
 6 files changed, 131 insertions(+), 6 deletions(-)
 create mode 100644 tools/testing/selftests/net/bind_timewait.c

-- 
2.30.2


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-21 15:12 [PATCH RFC net 0/2] tcp: Fix bhash2 and TIME_WAIT regression Kuniyuki Iwashima
@ 2022-12-21 15:12 ` Kuniyuki Iwashima
  2022-12-21 16:37   ` Jiri Slaby
  2022-12-22 15:05   ` Paolo Abeni
  2022-12-21 15:12 ` [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT Kuniyuki Iwashima
  1 sibling, 2 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-21 15:12 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Jiri Slaby, Joanne Koong, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

Jiri Slaby reported regression of bind() with a simple repro. [0]

The repro creates a TIME_WAIT socket and tries to bind() a new socket
with the same local address and port.  Before commit 28044fc1d495 ("net:
Add a bhash2 table hashed by port and address"), the bind() failed with
-EADDRINUSE, but now it succeeds.

The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
requests if the address is not a wildcard one.

[0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/

Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
Reported-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 include/net/inet_timewait_sock.h |  2 ++
 include/net/sock.h               |  5 +++--
 net/ipv4/inet_hashtables.c       |  5 +++--
 net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 5b47545f22d3..c46ed239ad9a 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -44,6 +44,7 @@ struct inet_timewait_sock {
 #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
 #define tw_node			__tw_common.skc_nulls_node
 #define tw_bind_node		__tw_common.skc_bind_node
+#define tw_bind2_node		__tw_common.skc_bind2_node
 #define tw_refcnt		__tw_common.skc_refcnt
 #define tw_hash			__tw_common.skc_hash
 #define tw_prot			__tw_common.skc_prot
@@ -73,6 +74,7 @@ struct inet_timewait_sock {
 	u32			tw_priority;
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
+	struct inet_bind2_bucket	*tw_tb2;
 };
 #define tw_tclass tw_tos
 
diff --git a/include/net/sock.h b/include/net/sock.h
index dcd72e6285b2..aaec985c1b5b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
  *		[union with @skc_incoming_cpu]
  *	@skc_refcnt: reference count
+ *	@skc_bind2_node: bind node in the bhash2 table
  *
  *	This is the minimal network layer representation of sockets, the header
  *	for struct sock and struct inet_timewait_sock.
@@ -241,6 +242,7 @@ struct sock_common {
 		u32		skc_window_clamp;
 		u32		skc_tw_snd_nxt; /* struct tcp_timewait_sock */
 	};
+	struct hlist_node	skc_bind2_node;
 	/* public: */
 };
 
@@ -351,7 +353,6 @@ struct sk_filter;
   *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
-  *	@sk_bind2_node: bind node in the bhash2 table
   */
 struct sock {
 	/*
@@ -384,6 +385,7 @@ struct sock {
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
+#define sk_bind2_node		__sk_common.skc_bind2_node
 #define sk_prot			__sk_common.skc_prot
 #define sk_net			__sk_common.skc_net
 #define sk_v6_daddr		__sk_common.skc_v6_daddr
@@ -542,7 +544,6 @@ struct sock {
 #endif
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
-	struct hlist_node	sk_bind2_node;
 };
 
 enum sk_pacing {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d039b4e732a3..1e81dc7c6de4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1103,15 +1103,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	/* Head lock still held and bh's disabled */
 	inet_bind_hash(sk, tb, tb2, port);
 
-	spin_unlock(&head2->lock);
-
 	if (sk_unhashed(sk)) {
 		inet_sk(sk)->inet_sport = htons(port);
 		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
 	}
 	if (tw)
 		inet_twsk_bind_unhash(tw, hinfo);
+
+	spin_unlock(&head2->lock);
 	spin_unlock(&head->lock);
+
 	if (tw)
 		inet_twsk_deschedule_put(tw);
 	local_bh_enable();
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 66fc940f9521..bec037d9ab8e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -29,6 +29,7 @@
 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 			  struct inet_hashinfo *hashinfo)
 {
+	struct inet_bind2_bucket *tb2 = tw->tw_tb2;
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
 	if (!tb)
@@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+	__hlist_del(&tw->tw_bind2_node);
+	tw->tw_tb2 = NULL;
+	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+
 	__sock_put((struct sock *)tw);
 }
 
@@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
 {
 	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
-	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_hashbucket *bhead, *bhead2;
 
 	spin_lock(lock);
 	sk_nulls_del_node_init_rcu((struct sock *)tw);
@@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 			hashinfo->bhash_size)];
+	bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+				       twsk_net(tw), tw->tw_num);
 
 	spin_lock(&bhead->lock);
+	spin_lock(&bhead2->lock);
 	inet_twsk_bind_unhash(tw, hashinfo);
+	spin_unlock(&bhead2->lock);
 	spin_unlock(&bhead->lock);
 
 	refcount_dec(&tw->tw_dr->tw_refcount);
@@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
 	hlist_add_head(&tw->tw_bind_node, list);
 }
 
+static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
+				     struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_bind2_node, list);
+}
+
 /*
  * Enter the time wait state. This is called with locally disabled BH.
  * Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_hashbucket *bhead, *bhead2;
+
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
 	   binding cache, even if it is closed.
 	 */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
 			hashinfo->bhash_size)];
+	bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
 	spin_lock(&bhead->lock);
+	spin_lock(&bhead2->lock);
+
 	tw->tw_tb = icsk->icsk_bind_hash;
 	WARN_ON(!icsk->icsk_bind_hash);
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+
+	tw->tw_tb2 = icsk->icsk_bind2_hash;
+	WARN_ON(!icsk->icsk_bind2_hash);
+	inet_twsk_add_bind2_node(tw, &tw->tw_tb2->owners);
+
+	spin_unlock(&bhead2->lock);
 	spin_unlock(&bhead->lock);
 
 	spin_lock(lock);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT.
  2022-12-21 15:12 [PATCH RFC net 0/2] tcp: Fix bhash2 and TIME_WAIT regression Kuniyuki Iwashima
  2022-12-21 15:12 ` [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2 Kuniyuki Iwashima
@ 2022-12-21 15:12 ` Kuniyuki Iwashima
  2022-12-22 21:41   ` Joanne Koong
  1 sibling, 1 reply; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-21 15:12 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Jiri Slaby, Joanne Koong, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

bhash2 split the bind() validation logic into wildcard and non-wildcard
cases.  Let's add a test to catch the same regression.

Before the previous patch:

  # ./bind_timewait
  TAP version 13
  1..2
  # Starting 2 tests from 3 test cases.
  #  RUN           bind_timewait.localhost.1 ...
  # bind_timewait.c:87:1:Expected ret (0) == -1 (-1)
  # 1: Test terminated by assertion
  #          FAIL  bind_timewait.localhost.1
  not ok 1 bind_timewait.localhost.1
  #  RUN           bind_timewait.addrany.1 ...
  #            OK  bind_timewait.addrany.1
  ok 2 bind_timewait.addrany.1
  # FAILED: 1 / 2 tests passed.
  # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0

After:

  # ./bind_timewait
  TAP version 13
  1..2
  # Starting 2 tests from 3 test cases.
  #  RUN           bind_timewait.localhost.1 ...
  #            OK  bind_timewait.localhost.1
  ok 1 bind_timewait.localhost.1
  #  RUN           bind_timewait.addrany.1 ...
  #            OK  bind_timewait.addrany.1
  ok 2 bind_timewait.addrany.1
  # PASSED: 2 / 2 tests passed.
  # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 tools/testing/selftests/net/.gitignore      |  1 +
 tools/testing/selftests/net/bind_timewait.c | 93 +++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 tools/testing/selftests/net/bind_timewait.c

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 9cc84114741d..a6911cae368c 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 bind_bhash
+bind_timewait
 csum
 cmsg_sender
 diag_uid
diff --git a/tools/testing/selftests/net/bind_timewait.c b/tools/testing/selftests/net/bind_timewait.c
new file mode 100644
index 000000000000..2d40403128ff
--- /dev/null
+++ b/tools/testing/selftests/net/bind_timewait.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include "../kselftest_harness.h"
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+FIXTURE(bind_timewait)
+{
+	struct sockaddr_in addr;
+	socklen_t addrlen;
+};
+
+FIXTURE_VARIANT(bind_timewait)
+{
+	__u32 addr_const;
+};
+
+FIXTURE_VARIANT_ADD(bind_timewait, localhost)
+{
+	.addr_const = INADDR_LOOPBACK
+};
+
+FIXTURE_VARIANT_ADD(bind_timewait, addrany)
+{
+	.addr_const = INADDR_ANY
+};
+
+FIXTURE_SETUP(bind_timewait)
+{
+	self->addr.sin_family = AF_INET;
+	self->addr.sin_port = 0;
+	self->addr.sin_addr.s_addr = htonl(variant->addr_const);
+	self->addrlen = sizeof(self->addr);
+}
+
+FIXTURE_TEARDOWN(bind_timewait)
+{
+}
+
+void create_timewait_socket(struct __test_metadata *_metadata,
+			    FIXTURE_DATA(bind_timewait) *self)
+{
+	int server_fd, client_fd, child_fd, ret;
+	struct sockaddr_in addr;
+	socklen_t addrlen;
+
+	server_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GT(server_fd, 0);
+
+	ret = bind(server_fd, (struct sockaddr *)&self->addr, self->addrlen);
+	ASSERT_EQ(ret, 0);
+
+	ret = listen(server_fd, 1);
+	ASSERT_EQ(ret, 0);
+
+	ret = getsockname(server_fd, (struct sockaddr *)&self->addr, &self->addrlen);
+	ASSERT_EQ(ret, 0);
+
+	client_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GT(client_fd, 0);
+
+	ret = connect(client_fd, (struct sockaddr *)&self->addr, self->addrlen);
+	ASSERT_EQ(ret, 0);
+
+	addrlen = sizeof(addr);
+	child_fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen);
+	ASSERT_GT(child_fd, 0);
+
+	close(child_fd);
+	close(client_fd);
+	close(server_fd);
+}
+
+TEST_F(bind_timewait, 1)
+{
+	int fd, ret;
+
+	create_timewait_socket(_metadata, self);
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GT(fd, 0);
+
+	ret = bind(fd, (struct sockaddr *)&self->addr, self->addrlen);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EADDRINUSE);
+
+	close(fd);
+}
+
+TEST_HARNESS_MAIN
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-21 15:12 ` [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2 Kuniyuki Iwashima
@ 2022-12-21 16:37   ` Jiri Slaby
  2022-12-22 15:05   ` Paolo Abeni
  1 sibling, 0 replies; 15+ messages in thread
From: Jiri Slaby @ 2022-12-21 16:37 UTC (permalink / raw)
  To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Joanne Koong, Kuniyuki Iwashima, netdev

On 21. 12. 22, 16:12, Kuniyuki Iwashima wrote:
> Jiri Slaby reported regression of bind() with a simple repro. [0]
> 
> The repro creates a TIME_WAIT socket and tries to bind() a new socket
> with the same local address and port.  Before commit 28044fc1d495 ("net:
> Add a bhash2 table hashed by port and address"), the bind() failed with
> -EADDRINUSE, but now it succeeds.
> 
> The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> requests if the address is not a wildcard one.
> 
> [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> 
> Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> Reported-by: Jiri Slaby <jirislaby@kernel.org>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Tested-by: Jiri Slaby <jirislaby@kernel.org>


> ---
>   include/net/inet_timewait_sock.h |  2 ++
>   include/net/sock.h               |  5 +++--
>   net/ipv4/inet_hashtables.c       |  5 +++--
>   net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
>   4 files changed, 37 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index 5b47545f22d3..c46ed239ad9a 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -44,6 +44,7 @@ struct inet_timewait_sock {
>   #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
>   #define tw_node			__tw_common.skc_nulls_node
>   #define tw_bind_node		__tw_common.skc_bind_node
> +#define tw_bind2_node		__tw_common.skc_bind2_node
>   #define tw_refcnt		__tw_common.skc_refcnt
>   #define tw_hash			__tw_common.skc_hash
>   #define tw_prot			__tw_common.skc_prot
> @@ -73,6 +74,7 @@ struct inet_timewait_sock {
>   	u32			tw_priority;
>   	struct timer_list	tw_timer;
>   	struct inet_bind_bucket	*tw_tb;
> +	struct inet_bind2_bucket	*tw_tb2;
>   };
>   #define tw_tclass tw_tos
>   
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dcd72e6285b2..aaec985c1b5b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
>    *	@skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
>    *		[union with @skc_incoming_cpu]
>    *	@skc_refcnt: reference count
> + *	@skc_bind2_node: bind node in the bhash2 table
>    *
>    *	This is the minimal network layer representation of sockets, the header
>    *	for struct sock and struct inet_timewait_sock.
> @@ -241,6 +242,7 @@ struct sock_common {
>   		u32		skc_window_clamp;
>   		u32		skc_tw_snd_nxt; /* struct tcp_timewait_sock */
>   	};
> +	struct hlist_node	skc_bind2_node;
>   	/* public: */
>   };
>   
> @@ -351,7 +353,6 @@ struct sk_filter;
>     *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
>     *	@sk_txtime_unused: unused txtime flags
>     *	@ns_tracker: tracker for netns reference
> -  *	@sk_bind2_node: bind node in the bhash2 table
>     */
>   struct sock {
>   	/*
> @@ -384,6 +385,7 @@ struct sock {
>   #define sk_net_refcnt		__sk_common.skc_net_refcnt
>   #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
>   #define sk_bind_node		__sk_common.skc_bind_node
> +#define sk_bind2_node		__sk_common.skc_bind2_node
>   #define sk_prot			__sk_common.skc_prot
>   #define sk_net			__sk_common.skc_net
>   #define sk_v6_daddr		__sk_common.skc_v6_daddr
> @@ -542,7 +544,6 @@ struct sock {
>   #endif
>   	struct rcu_head		sk_rcu;
>   	netns_tracker		ns_tracker;
> -	struct hlist_node	sk_bind2_node;
>   };
>   
>   enum sk_pacing {
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index d039b4e732a3..1e81dc7c6de4 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -1103,15 +1103,16 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>   	/* Head lock still held and bh's disabled */
>   	inet_bind_hash(sk, tb, tb2, port);
>   
> -	spin_unlock(&head2->lock);
> -
>   	if (sk_unhashed(sk)) {
>   		inet_sk(sk)->inet_sport = htons(port);
>   		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
>   	}
>   	if (tw)
>   		inet_twsk_bind_unhash(tw, hinfo);
> +
> +	spin_unlock(&head2->lock);
>   	spin_unlock(&head->lock);
> +
>   	if (tw)
>   		inet_twsk_deschedule_put(tw);
>   	local_bh_enable();
> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
> index 66fc940f9521..bec037d9ab8e 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -29,6 +29,7 @@
>   void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
>   			  struct inet_hashinfo *hashinfo)
>   {
> +	struct inet_bind2_bucket *tb2 = tw->tw_tb2;
>   	struct inet_bind_bucket *tb = tw->tw_tb;
>   
>   	if (!tb)
> @@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
>   	__hlist_del(&tw->tw_bind_node);
>   	tw->tw_tb = NULL;
>   	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
> +
> +	__hlist_del(&tw->tw_bind2_node);
> +	tw->tw_tb2 = NULL;
> +	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
> +
>   	__sock_put((struct sock *)tw);
>   }
>   
> @@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
>   {
>   	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
>   	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
> -	struct inet_bind_hashbucket *bhead;
> +	struct inet_bind_hashbucket *bhead, *bhead2;
>   
>   	spin_lock(lock);
>   	sk_nulls_del_node_init_rcu((struct sock *)tw);
> @@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
>   	/* Disassociate with bind bucket. */
>   	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
>   			hashinfo->bhash_size)];
> +	bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
> +				       twsk_net(tw), tw->tw_num);
>   
>   	spin_lock(&bhead->lock);
> +	spin_lock(&bhead2->lock);
>   	inet_twsk_bind_unhash(tw, hashinfo);
> +	spin_unlock(&bhead2->lock);
>   	spin_unlock(&bhead->lock);
>   
>   	refcount_dec(&tw->tw_dr->tw_refcount);
> @@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
>   	hlist_add_head(&tw->tw_bind_node, list);
>   }
>   
> +static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
> +				     struct hlist_head *list)
> +{
> +	hlist_add_head(&tw->tw_bind2_node, list);
> +}
> +
>   /*
>    * Enter the time wait state. This is called with locally disabled BH.
>    * Essentially we whip up a timewait bucket, copy the relevant info into it
> @@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
>   	const struct inet_connection_sock *icsk = inet_csk(sk);
>   	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
>   	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
> -	struct inet_bind_hashbucket *bhead;
> +	struct inet_bind_hashbucket *bhead, *bhead2;
> +
>   	/* Step 1: Put TW into bind hash. Original socket stays there too.
>   	   Note, that any socket with inet->num != 0 MUST be bound in
>   	   binding cache, even if it is closed.
>   	 */
>   	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
>   			hashinfo->bhash_size)];
> +	bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
> +
>   	spin_lock(&bhead->lock);
> +	spin_lock(&bhead2->lock);
> +
>   	tw->tw_tb = icsk->icsk_bind_hash;
>   	WARN_ON(!icsk->icsk_bind_hash);
>   	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
> +
> +	tw->tw_tb2 = icsk->icsk_bind2_hash;
> +	WARN_ON(!icsk->icsk_bind2_hash);
> +	inet_twsk_add_bind2_node(tw, &tw->tw_tb2->owners);
> +
> +	spin_unlock(&bhead2->lock);
>   	spin_unlock(&bhead->lock);
>   
>   	spin_lock(lock);

-- 
js
suse labs


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-21 15:12 ` [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2 Kuniyuki Iwashima
  2022-12-21 16:37   ` Jiri Slaby
@ 2022-12-22 15:05   ` Paolo Abeni
  2022-12-22 21:46     ` Joanne Koong
  1 sibling, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2022-12-22 15:05 UTC (permalink / raw)
  To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski
  Cc: Jiri Slaby, Joanne Koong, Kuniyuki Iwashima, netdev

On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> Jiri Slaby reported regression of bind() with a simple repro. [0]
> 
> The repro creates a TIME_WAIT socket and tries to bind() a new socket
> with the same local address and port.  Before commit 28044fc1d495 ("net:
> Add a bhash2 table hashed by port and address"), the bind() failed with
> -EADDRINUSE, but now it succeeds.
> 
> The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> requests if the address is not a wildcard one.

How does keeping the timewait sockets inside bhash2 affect the bind
loopup performance? I fear that could defeat completely the goal of
28044fc1d495, on quite busy server we could have quite a bit of tw with
the same address/port. If so, we could even consider reverting
28044fc1d495.

> [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> 
> Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> Reported-by: Jiri Slaby <jirislaby@kernel.org>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
>  include/net/inet_timewait_sock.h |  2 ++
>  include/net/sock.h               |  5 +++--
>  net/ipv4/inet_hashtables.c       |  5 +++--
>  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
>  4 files changed, 37 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index 5b47545f22d3..c46ed239ad9a 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -44,6 +44,7 @@ struct inet_timewait_sock {
>  #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
>  #define tw_node			__tw_common.skc_nulls_node
>  #define tw_bind_node		__tw_common.skc_bind_node
> +#define tw_bind2_node		__tw_common.skc_bind2_node
>  #define tw_refcnt		__tw_common.skc_refcnt
>  #define tw_hash			__tw_common.skc_hash
>  #define tw_prot			__tw_common.skc_prot
> @@ -73,6 +74,7 @@ struct inet_timewait_sock {
>  	u32			tw_priority;
>  	struct timer_list	tw_timer;
>  	struct inet_bind_bucket	*tw_tb;
> +	struct inet_bind2_bucket	*tw_tb2;
>  };
>  #define tw_tclass tw_tos
>  
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dcd72e6285b2..aaec985c1b5b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
>   *	@skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
>   *		[union with @skc_incoming_cpu]
>   *	@skc_refcnt: reference count
> + *	@skc_bind2_node: bind node in the bhash2 table
>   *
>   *	This is the minimal network layer representation of sockets, the header
>   *	for struct sock and struct inet_timewait_sock.
> @@ -241,6 +242,7 @@ struct sock_common {
>  		u32		skc_window_clamp;
>  		u32		skc_tw_snd_nxt; /* struct tcp_timewait_sock */
>  	};
> +	struct hlist_node	skc_bind2_node;

I *think* it would be better adding a tw_bind2_node field to the
inet_timewait_sock struct, so that we leave unmodified the request
socket and we don't change the struct sock binary layout. That could
affect performances moving hot fields on different cachelines.


Thanks,

Paolo


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT.
  2022-12-21 15:12 ` [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT Kuniyuki Iwashima
@ 2022-12-22 21:41   ` Joanne Koong
  2022-12-22 23:54     ` Kuniyuki Iwashima
  0 siblings, 1 reply; 15+ messages in thread
From: Joanne Koong @ 2022-12-22 21:41 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jiri Slaby, Kuniyuki Iwashima, netdev

On Wed, Dec 21, 2022 at 7:14 AM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> bhash2 split the bind() validation logic into wildcard and non-wildcard
> cases.  Let's add a test to catch the same regression.
>
> Before the previous patch:
>
>   # ./bind_timewait
>   TAP version 13
>   1..2
>   # Starting 2 tests from 3 test cases.
>   #  RUN           bind_timewait.localhost.1 ...
>   # bind_timewait.c:87:1:Expected ret (0) == -1 (-1)
>   # 1: Test terminated by assertion
>   #          FAIL  bind_timewait.localhost.1
>   not ok 1 bind_timewait.localhost.1
>   #  RUN           bind_timewait.addrany.1 ...
>   #            OK  bind_timewait.addrany.1
>   ok 2 bind_timewait.addrany.1
>   # FAILED: 1 / 2 tests passed.
>   # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0
>
> After:
>
>   # ./bind_timewait
>   TAP version 13
>   1..2
>   # Starting 2 tests from 3 test cases.
>   #  RUN           bind_timewait.localhost.1 ...
>   #            OK  bind_timewait.localhost.1
>   ok 1 bind_timewait.localhost.1
>   #  RUN           bind_timewait.addrany.1 ...
>   #            OK  bind_timewait.addrany.1
>   ok 2 bind_timewait.addrany.1
>   # PASSED: 2 / 2 tests passed.
>   # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
>  tools/testing/selftests/net/.gitignore      |  1 +
>  tools/testing/selftests/net/bind_timewait.c | 93 +++++++++++++++++++++
>  2 files changed, 94 insertions(+)
>  create mode 100644 tools/testing/selftests/net/bind_timewait.c
>
> diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
> index 9cc84114741d..a6911cae368c 100644
> --- a/tools/testing/selftests/net/.gitignore
> +++ b/tools/testing/selftests/net/.gitignore
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  bind_bhash
> +bind_timewait
>  csum
>  cmsg_sender
>  diag_uid
> diff --git a/tools/testing/selftests/net/bind_timewait.c b/tools/testing/selftests/net/bind_timewait.c
> new file mode 100644
> index 000000000000..2d40403128ff
> --- /dev/null
> +++ b/tools/testing/selftests/net/bind_timewait.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright Amazon.com Inc. or its affiliates. */
> +
> +#include "../kselftest_harness.h"

nit: Not sure if this matters or not, but from looking at the other
selftests/net it seems like the convention is to have relative path
#include defined below absolute path #includes.

> +
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <netinet/tcp.h>

nit: i don't think we need this netinet/tcp.h include

> +
> +FIXTURE(bind_timewait)
> +{
> +       struct sockaddr_in addr;
> +       socklen_t addrlen;
> +};
> +
> +FIXTURE_VARIANT(bind_timewait)
> +{
> +       __u32 addr_const;
> +};
> +
> +FIXTURE_VARIANT_ADD(bind_timewait, localhost)
> +{
> +       .addr_const = INADDR_LOOPBACK
> +};
> +
> +FIXTURE_VARIANT_ADD(bind_timewait, addrany)
> +{
> +       .addr_const = INADDR_ANY
> +};
> +
> +FIXTURE_SETUP(bind_timewait)
> +{
> +       self->addr.sin_family = AF_INET;
> +       self->addr.sin_port = 0;
> +       self->addr.sin_addr.s_addr = htonl(variant->addr_const);
> +       self->addrlen = sizeof(self->addr);
> +}
> +
> +FIXTURE_TEARDOWN(bind_timewait)
> +{
> +}
> +
> +void create_timewait_socket(struct __test_metadata *_metadata,
> +                           FIXTURE_DATA(bind_timewait) *self)
> +{
> +       int server_fd, client_fd, child_fd, ret;
> +       struct sockaddr_in addr;
> +       socklen_t addrlen;
> +
> +       server_fd = socket(AF_INET, SOCK_STREAM, 0);
> +       ASSERT_GT(server_fd, 0);

If any of these assertions fail, do we leak fds because we don't get
to calling the close()s at the end of this function? Do we need to
have the fds cleaned up in the teardown fixture function?

> +
> +       ret = bind(server_fd, (struct sockaddr *)&self->addr, self->addrlen);
> +       ASSERT_EQ(ret, 0);
> +
> +       ret = listen(server_fd, 1);
> +       ASSERT_EQ(ret, 0);
> +
> +       ret = getsockname(server_fd, (struct sockaddr *)&self->addr, &self->addrlen);
> +       ASSERT_EQ(ret, 0);
> +
> +       client_fd = socket(AF_INET, SOCK_STREAM, 0);
> +       ASSERT_GT(client_fd, 0);
> +
> +       ret = connect(client_fd, (struct sockaddr *)&self->addr, self->addrlen);
> +       ASSERT_EQ(ret, 0);
> +
> +       addrlen = sizeof(addr);
> +       child_fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen);
> +       ASSERT_GT(child_fd, 0);
> +
> +       close(child_fd);
> +       close(client_fd);
> +       close(server_fd);
> +}
> +
> +TEST_F(bind_timewait, 1)
> +{
> +       int fd, ret;
> +
> +       create_timewait_socket(_metadata, self);
> +
> +       fd = socket(AF_INET, SOCK_STREAM, 0);
> +       ASSERT_GT(fd, 0);
> +
> +       ret = bind(fd, (struct sockaddr *)&self->addr, self->addrlen);
> +       ASSERT_EQ(ret, -1);
> +       ASSERT_EQ(errno, EADDRINUSE);
> +
> +       close(fd);
> +}
> +
> +TEST_HARNESS_MAIN
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-22 15:05   ` Paolo Abeni
@ 2022-12-22 21:46     ` Joanne Koong
  2022-12-22 23:26       ` Kuniyuki Iwashima
  0 siblings, 1 reply; 15+ messages in thread
From: Joanne Koong @ 2022-12-22 21:46 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Jiri Slaby, Kuniyuki Iwashima, netdev

On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > Jiri Slaby reported regression of bind() with a simple repro. [0]
> >
> > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > Add a bhash2 table hashed by port and address"), the bind() failed with
> > -EADDRINUSE, but now it succeeds.
> >
> > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > requests if the address is not a wildcard one.

(resending my reply because it wasn't in plaintext mode)

Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
considered when checking against inet bind conflicts.

>
> How does keeping the timewait sockets inside bhash2 affect the bind
> loopup performance? I fear that could defeat completely the goal of
> 28044fc1d495, on quite busy server we could have quite a bit of tw with
> the same address/port. If so, we could even consider reverting
> 28044fc1d495.
>

Can you clarify what you mean by bind loopup?

> > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> >
> > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > ---
> >  include/net/inet_timewait_sock.h |  2 ++
> >  include/net/sock.h               |  5 +++--
> >  net/ipv4/inet_hashtables.c       |  5 +++--
> >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> >  4 files changed, 37 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > index 5b47545f22d3..c46ed239ad9a 100644
> > --- a/include/net/inet_timewait_sock.h
> > +++ b/include/net/inet_timewait_sock.h
> > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> >  #define tw_node                      __tw_common.skc_nulls_node
> >  #define tw_bind_node         __tw_common.skc_bind_node
> > +#define tw_bind2_node                __tw_common.skc_bind2_node
> >  #define tw_refcnt            __tw_common.skc_refcnt
> >  #define tw_hash                      __tw_common.skc_hash
> >  #define tw_prot                      __tw_common.skc_prot
> > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> >       u32                     tw_priority;
> >       struct timer_list       tw_timer;
> >       struct inet_bind_bucket *tw_tb;
> > +     struct inet_bind2_bucket        *tw_tb2;
> >  };
> >  #define tw_tclass tw_tos
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index dcd72e6285b2..aaec985c1b5b 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> >   *           [union with @skc_incoming_cpu]
> >   *   @skc_refcnt: reference count
> > + *   @skc_bind2_node: bind node in the bhash2 table
> >   *
> >   *   This is the minimal network layer representation of sockets, the header
> >   *   for struct sock and struct inet_timewait_sock.
> > @@ -241,6 +242,7 @@ struct sock_common {
> >               u32             skc_window_clamp;
> >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> >       };
> > +     struct hlist_node       skc_bind2_node;
>
> I *think* it would be better adding a tw_bind2_node field to the
> inet_timewait_sock struct, so that we leave unmodified the request
> socket and we don't change the struct sock binary layout. That could
> affect performances moving hot fields on different cachelines.
>
+1. The rest of this patch LGTM.

>
> Thanks,
>
> Paolo
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-22 21:46     ` Joanne Koong
@ 2022-12-22 23:26       ` Kuniyuki Iwashima
  2022-12-23  0:25         ` Joanne Koong
  2022-12-23 10:15         ` Paolo Abeni
  0 siblings, 2 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-22 23:26 UTC (permalink / raw)
  To: joannelkoong
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, kuniyu, netdev, pabeni

From:   Joanne Koong <joannelkoong@gmail.com>
Date:   Thu, 22 Dec 2022 13:46:57 -0800
> On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> >
> > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > >
> > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > -EADDRINUSE, but now it succeeds.
> > >
> > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > requests if the address is not a wildcard one.
> 
> (resending my reply because it wasn't in plaintext mode)
> 
> Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> considered when checking against inet bind conflicts.
> 
> >
> > How does keeping the timewait sockets inside bhash2 affect the bind
> > loopup performance? I fear that could defeat completely the goal of
> > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > the same address/port. If so, we could even consider reverting
> > 28044fc1d495.

It will slow down along the number of twsk, but I think it's still faster
than bhash if we listen() on multiple IP.  If we don't, bhash is always
faster because of bhash2's additional locking.  However, this is the
nature of bhash2 from the beginning.


> >
> 
> Can you clarify what you mean by bind loopup?

I think it means just bhash2 traversal.  (s/loopup/lookup/)

> 
> > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > >
> > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > ---
> > >  include/net/inet_timewait_sock.h |  2 ++
> > >  include/net/sock.h               |  5 +++--
> > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > index 5b47545f22d3..c46ed239ad9a 100644
> > > --- a/include/net/inet_timewait_sock.h
> > > +++ b/include/net/inet_timewait_sock.h
> > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > >  #define tw_node                      __tw_common.skc_nulls_node
> > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > >  #define tw_refcnt            __tw_common.skc_refcnt
> > >  #define tw_hash                      __tw_common.skc_hash
> > >  #define tw_prot                      __tw_common.skc_prot
> > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > >       u32                     tw_priority;
> > >       struct timer_list       tw_timer;
> > >       struct inet_bind_bucket *tw_tb;
> > > +     struct inet_bind2_bucket        *tw_tb2;
> > >  };
> > >  #define tw_tclass tw_tos
> > >
> > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > index dcd72e6285b2..aaec985c1b5b 100644
> > > --- a/include/net/sock.h
> > > +++ b/include/net/sock.h
> > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > >   *           [union with @skc_incoming_cpu]
> > >   *   @skc_refcnt: reference count
> > > + *   @skc_bind2_node: bind node in the bhash2 table
> > >   *
> > >   *   This is the minimal network layer representation of sockets, the header
> > >   *   for struct sock and struct inet_timewait_sock.
> > > @@ -241,6 +242,7 @@ struct sock_common {
> > >               u32             skc_window_clamp;
> > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > >       };
> > > +     struct hlist_node       skc_bind2_node;
> >
> > I *think* it would be better adding a tw_bind2_node field to the
> > inet_timewait_sock struct, so that we leave unmodified the request
> > socket and we don't change the struct sock binary layout. That could
> > affect performances moving hot fields on different cachelines.
> >
> +1. The rest of this patch LGTM.

Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.

  BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
               offsetof(struct inet_timewait_sock, tw_bind2_node))

Considering the number of members in struct sock, at least we have
to move sk_bind2_node forward.

Another option is to have another TIME_WAIT list in inet_bind2_bucket like
tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
inet_bhash2_conflict(), so I think this is feasible.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT.
  2022-12-22 21:41   ` Joanne Koong
@ 2022-12-22 23:54     ` Kuniyuki Iwashima
  0 siblings, 0 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-22 23:54 UTC (permalink / raw)
  To: joannelkoong
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, kuniyu, netdev, pabeni

From:   Joanne Koong <joannelkoong@gmail.com>
Date:   Thu, 22 Dec 2022 13:41:11 -0800
> On Wed, Dec 21, 2022 at 7:14 AM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> > bhash2 split the bind() validation logic into wildcard and non-wildcard
> > cases.  Let's add a test to catch the same regression.
> >
> > Before the previous patch:
> >
> >   # ./bind_timewait
> >   TAP version 13
> >   1..2
> >   # Starting 2 tests from 3 test cases.
> >   #  RUN           bind_timewait.localhost.1 ...
> >   # bind_timewait.c:87:1:Expected ret (0) == -1 (-1)
> >   # 1: Test terminated by assertion
> >   #          FAIL  bind_timewait.localhost.1
> >   not ok 1 bind_timewait.localhost.1
> >   #  RUN           bind_timewait.addrany.1 ...
> >   #            OK  bind_timewait.addrany.1
> >   ok 2 bind_timewait.addrany.1
> >   # FAILED: 1 / 2 tests passed.
> >   # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0
> >
> > After:
> >
> >   # ./bind_timewait
> >   TAP version 13
> >   1..2
> >   # Starting 2 tests from 3 test cases.
> >   #  RUN           bind_timewait.localhost.1 ...
> >   #            OK  bind_timewait.localhost.1
> >   ok 1 bind_timewait.localhost.1
> >   #  RUN           bind_timewait.addrany.1 ...
> >   #            OK  bind_timewait.addrany.1
> >   ok 2 bind_timewait.addrany.1
> >   # PASSED: 2 / 2 tests passed.
> >   # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
> >
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > ---
> >  tools/testing/selftests/net/.gitignore      |  1 +
> >  tools/testing/selftests/net/bind_timewait.c | 93 +++++++++++++++++++++
> >  2 files changed, 94 insertions(+)
> >  create mode 100644 tools/testing/selftests/net/bind_timewait.c
> >
> > diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
> > index 9cc84114741d..a6911cae368c 100644
> > --- a/tools/testing/selftests/net/.gitignore
> > +++ b/tools/testing/selftests/net/.gitignore
> > @@ -1,5 +1,6 @@
> >  # SPDX-License-Identifier: GPL-2.0-only
> >  bind_bhash
> > +bind_timewait
> >  csum
> >  cmsg_sender
> >  diag_uid
> > diff --git a/tools/testing/selftests/net/bind_timewait.c b/tools/testing/selftests/net/bind_timewait.c
> > new file mode 100644
> > index 000000000000..2d40403128ff
> > --- /dev/null
> > +++ b/tools/testing/selftests/net/bind_timewait.c
> > @@ -0,0 +1,93 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright Amazon.com Inc. or its affiliates. */
> > +
> > +#include "../kselftest_harness.h"
> 
> nit: Not sure if this matters or not, but from looking at the other
> selftests/net it seems like the convention is to have relative path
> #include defined below absolute path #includes.

Will fix.


> 
> > +
> > +#include <sys/socket.h>
> > +#include <netinet/in.h>
> > +#include <netinet/tcp.h>
> 
> nit: i don't think we need this netinet/tcp.h include

Good catch, I was seeing an old man page.


> 
> > +
> > +FIXTURE(bind_timewait)
> > +{
> > +       struct sockaddr_in addr;
> > +       socklen_t addrlen;
> > +};
> > +
> > +FIXTURE_VARIANT(bind_timewait)
> > +{
> > +       __u32 addr_const;
> > +};
> > +
> > +FIXTURE_VARIANT_ADD(bind_timewait, localhost)
> > +{
> > +       .addr_const = INADDR_LOOPBACK
> > +};
> > +
> > +FIXTURE_VARIANT_ADD(bind_timewait, addrany)
> > +{
> > +       .addr_const = INADDR_ANY
> > +};
> > +
> > +FIXTURE_SETUP(bind_timewait)
> > +{
> > +       self->addr.sin_family = AF_INET;
> > +       self->addr.sin_port = 0;
> > +       self->addr.sin_addr.s_addr = htonl(variant->addr_const);
> > +       self->addrlen = sizeof(self->addr);
> > +}
> > +
> > +FIXTURE_TEARDOWN(bind_timewait)
> > +{
> > +}
> > +
> > +void create_timewait_socket(struct __test_metadata *_metadata,
> > +                           FIXTURE_DATA(bind_timewait) *self)
> > +{
> > +       int server_fd, client_fd, child_fd, ret;
> > +       struct sockaddr_in addr;
> > +       socklen_t addrlen;
> > +
> > +       server_fd = socket(AF_INET, SOCK_STREAM, 0);
> > +       ASSERT_GT(server_fd, 0);
> 
> If any of these assertions fail, do we leak fds because we don't get
> to calling the close()s at the end of this function? Do we need to
> have the fds cleaned up in the teardown fixture function?

I think exit() cleans up fds in the case.  IIUC, the parent process
catches SIGABRT in __wait_for_test(), but the child does not call
FIXTURE_TEARDOWN().

Thank you!


> 
> > +
> > +       ret = bind(server_fd, (struct sockaddr *)&self->addr, self->addrlen);
> > +       ASSERT_EQ(ret, 0);
> > +
> > +       ret = listen(server_fd, 1);
> > +       ASSERT_EQ(ret, 0);
> > +
> > +       ret = getsockname(server_fd, (struct sockaddr *)&self->addr, &self->addrlen);
> > +       ASSERT_EQ(ret, 0);
> > +
> > +       client_fd = socket(AF_INET, SOCK_STREAM, 0);
> > +       ASSERT_GT(client_fd, 0);
> > +
> > +       ret = connect(client_fd, (struct sockaddr *)&self->addr, self->addrlen);
> > +       ASSERT_EQ(ret, 0);
> > +
> > +       addrlen = sizeof(addr);
> > +       child_fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen);
> > +       ASSERT_GT(child_fd, 0);
> > +
> > +       close(child_fd);
> > +       close(client_fd);
> > +       close(server_fd);
> > +}
> > +
> > +TEST_F(bind_timewait, 1)
> > +{
> > +       int fd, ret;
> > +
> > +       create_timewait_socket(_metadata, self);
> > +
> > +       fd = socket(AF_INET, SOCK_STREAM, 0);
> > +       ASSERT_GT(fd, 0);
> > +
> > +       ret = bind(fd, (struct sockaddr *)&self->addr, self->addrlen);
> > +       ASSERT_EQ(ret, -1);
> > +       ASSERT_EQ(errno, EADDRINUSE);
> > +
> > +       close(fd);
> > +}
> > +
> > +TEST_HARNESS_MAIN
> > --
> > 2.30.2

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-22 23:26       ` Kuniyuki Iwashima
@ 2022-12-23  0:25         ` Joanne Koong
  2022-12-23  1:55           ` Kuniyuki Iwashima
  2022-12-23 10:15         ` Paolo Abeni
  1 sibling, 1 reply; 15+ messages in thread
From: Joanne Koong @ 2022-12-23  0:25 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, netdev, pabeni

On Thu, Dec 22, 2022 at 3:27 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> From:   Joanne Koong <joannelkoong@gmail.com>
> Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > >
> > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > -EADDRINUSE, but now it succeeds.
> > > >
> > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > requests if the address is not a wildcard one.
> >
> > (resending my reply because it wasn't in plaintext mode)
> >
> > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > considered when checking against inet bind conflicts.
> >
> > >
> > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > loopup performance? I fear that could defeat completely the goal of
> > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > the same address/port. If so, we could even consider reverting
> > > 28044fc1d495.
>
> It will slow down along the number of twsk, but I think it's still faster
> than bhash if we listen() on multiple IP.  If we don't, bhash is always
> faster because of bhash2's additional locking.  However, this is the
> nature of bhash2 from the beginning.
>
>
> > >
> >
> > Can you clarify what you mean by bind loopup?
>
> I think it means just bhash2 traversal.  (s/loopup/lookup/)
>
> >
> > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > >
> > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > ---
> > > >  include/net/inet_timewait_sock.h |  2 ++
> > > >  include/net/sock.h               |  5 +++--
> > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > --- a/include/net/inet_timewait_sock.h
> > > > +++ b/include/net/inet_timewait_sock.h
> > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > >  #define tw_hash                      __tw_common.skc_hash
> > > >  #define tw_prot                      __tw_common.skc_prot
> > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > >       u32                     tw_priority;
> > > >       struct timer_list       tw_timer;
> > > >       struct inet_bind_bucket *tw_tb;
> > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > >  };
> > > >  #define tw_tclass tw_tos
> > > >
> > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > --- a/include/net/sock.h
> > > > +++ b/include/net/sock.h
> > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > >   *           [union with @skc_incoming_cpu]
> > > >   *   @skc_refcnt: reference count
> > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > >   *
> > > >   *   This is the minimal network layer representation of sockets, the header
> > > >   *   for struct sock and struct inet_timewait_sock.
> > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > >               u32             skc_window_clamp;
> > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > >       };
> > > > +     struct hlist_node       skc_bind2_node;
> > >
> > > I *think* it would be better adding a tw_bind2_node field to the
> > > inet_timewait_sock struct, so that we leave unmodified the request
> > > socket and we don't change the struct sock binary layout. That could
> > > affect performances moving hot fields on different cachelines.
> > >
> > +1. The rest of this patch LGTM.
>
> Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
>
>   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
>                offsetof(struct inet_timewait_sock, tw_bind2_node))
>
> Considering the number of members in struct sock, at least we have
> to move sk_bind2_node forward.
>
> Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> inet_bhash2_conflict(), so I think this is feasible.

Oh I see, thanks for clarifying!

I think we could also check sk_state (which is in __sk_common already)
and if it's TCP_TIME_WAIT, then we know sk is at offsetof(struct
inet_timewait_sock, tw_bind2_node), whereas otherwise it's at
offsetof(struct sock, sk_bind2_node). This seems simpler/cleaner to me
than the other approaches. What are your thoughts?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-23  0:25         ` Joanne Koong
@ 2022-12-23  1:55           ` Kuniyuki Iwashima
  2022-12-23 19:34             ` Joanne Koong
  0 siblings, 1 reply; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-23  1:55 UTC (permalink / raw)
  To: joannelkoong
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, kuniyu, netdev, pabeni

From:   Joanne Koong <joannelkoong@gmail.com>
Date:   Thu, 22 Dec 2022 16:25:10 -0800
> On Thu, Dec 22, 2022 at 3:27 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> > From:   Joanne Koong <joannelkoong@gmail.com>
> > Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > >
> > > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > > >
> > > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > > -EADDRINUSE, but now it succeeds.
> > > > >
> > > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > > requests if the address is not a wildcard one.
> > >
> > > (resending my reply because it wasn't in plaintext mode)
> > >
> > > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > > considered when checking against inet bind conflicts.
> > >
> > > >
> > > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > > loopup performance? I fear that could defeat completely the goal of
> > > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > > the same address/port. If so, we could even consider reverting
> > > > 28044fc1d495.
> >
> > It will slow down along the number of twsk, but I think it's still faster
> > than bhash if we listen() on multiple IP.  If we don't, bhash is always
> > faster because of bhash2's additional locking.  However, this is the
> > nature of bhash2 from the beginning.
> >
> >
> > > >
> > >
> > > Can you clarify what you mean by bind loopup?
> >
> > I think it means just bhash2 traversal.  (s/loopup/lookup/)
> >
> > >
> > > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > > >
> > > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > ---
> > > > >  include/net/inet_timewait_sock.h |  2 ++
> > > > >  include/net/sock.h               |  5 +++--
> > > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > > --- a/include/net/inet_timewait_sock.h
> > > > > +++ b/include/net/inet_timewait_sock.h
> > > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > > >  #define tw_hash                      __tw_common.skc_hash
> > > > >  #define tw_prot                      __tw_common.skc_prot
> > > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > > >       u32                     tw_priority;
> > > > >       struct timer_list       tw_timer;
> > > > >       struct inet_bind_bucket *tw_tb;
> > > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > > >  };
> > > > >  #define tw_tclass tw_tos
> > > > >
> > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > > --- a/include/net/sock.h
> > > > > +++ b/include/net/sock.h
> > > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > > >   *           [union with @skc_incoming_cpu]
> > > > >   *   @skc_refcnt: reference count
> > > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > > >   *
> > > > >   *   This is the minimal network layer representation of sockets, the header
> > > > >   *   for struct sock and struct inet_timewait_sock.
> > > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > > >               u32             skc_window_clamp;
> > > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > > >       };
> > > > > +     struct hlist_node       skc_bind2_node;
> > > >
> > > > I *think* it would be better adding a tw_bind2_node field to the
> > > > inet_timewait_sock struct, so that we leave unmodified the request
> > > > socket and we don't change the struct sock binary layout. That could
> > > > affect performances moving hot fields on different cachelines.
> > > >
> > > +1. The rest of this patch LGTM.
> >
> > Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
> >
> >   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
> >                offsetof(struct inet_timewait_sock, tw_bind2_node))
> >
> > Considering the number of members in struct sock, at least we have
> > to move sk_bind2_node forward.
> >
> > Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> > tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> > inet_bhash2_conflict(), so I think this is feasible.
> 
> Oh I see, thanks for clarifying!
> 
> I think we could also check sk_state (which is in __sk_common already)
> and if it's TCP_TIME_WAIT, then we know sk is at offsetof(struct
> inet_timewait_sock, tw_bind2_node), whereas otherwise it's at
> offsetof(struct sock, sk_bind2_node). This seems simpler/cleaner to me
> than the other approaches. What are your thoughts?

Sorry, I don't get it.  You mean we can check sk_state first and change
how we traverse ?  But then we cannot know the offset of sk_state if we
don't know if the socket is TIME_WAIT ... ?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-22 23:26       ` Kuniyuki Iwashima
  2022-12-23  0:25         ` Joanne Koong
@ 2022-12-23 10:15         ` Paolo Abeni
  2022-12-23 17:08           ` Kuniyuki Iwashima
  1 sibling, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2022-12-23 10:15 UTC (permalink / raw)
  To: Kuniyuki Iwashima, joannelkoong
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, netdev

On Fri, 2022-12-23 at 08:26 +0900, Kuniyuki Iwashima wrote:
> From:   Joanne Koong <joannelkoong@gmail.com>
> Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > 
> > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > > 
> > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > -EADDRINUSE, but now it succeeds.
> > > > 
> > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > requests if the address is not a wildcard one.
> > 
> > (resending my reply because it wasn't in plaintext mode)
> > 
> > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > considered when checking against inet bind conflicts.
> > 
> > > 
> > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > loopup performance? I fear that could defeat completely the goal of
> > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > the same address/port. If so, we could even consider reverting
> > > 28044fc1d495.
> 
> It will slow down along the number of twsk, but I think it's still faster
> than bhash if we listen() on multiple IP.  If we don't, bhash is always
> faster because of bhash2's additional locking.  However, this is the
> nature of bhash2 from the beginning.

I see. Before 28044fc1d495, todo a bind lookup, we had to traverse all
the tw in bhash. After 28044fc1d495, tw were ignored (in some cases).
My point is: if the number of tw sockets is >> the number of listeners
on multiple IPs, the bhash2 traversal time after this patch will be
comparable to the bhash traversal time before 28044fc1d495, with the
added cost of the double spin lock.

> > 
> > Can you clarify what you mean by bind loopup?
> 
> I think it means just bhash2 traversal.  (s/loopup/lookup/)

Indeed, sorry for the typo!
> 
> > 
> > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > > 
> > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > ---
> > > >  include/net/inet_timewait_sock.h |  2 ++
> > > >  include/net/sock.h               |  5 +++--
> > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > --- a/include/net/inet_timewait_sock.h
> > > > +++ b/include/net/inet_timewait_sock.h
> > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > >  #define tw_hash                      __tw_common.skc_hash
> > > >  #define tw_prot                      __tw_common.skc_prot
> > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > >       u32                     tw_priority;
> > > >       struct timer_list       tw_timer;
> > > >       struct inet_bind_bucket *tw_tb;
> > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > >  };
> > > >  #define tw_tclass tw_tos
> > > > 
> > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > --- a/include/net/sock.h
> > > > +++ b/include/net/sock.h
> > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > >   *           [union with @skc_incoming_cpu]
> > > >   *   @skc_refcnt: reference count
> > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > >   *
> > > >   *   This is the minimal network layer representation of sockets, the header
> > > >   *   for struct sock and struct inet_timewait_sock.
> > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > >               u32             skc_window_clamp;
> > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > >       };
> > > > +     struct hlist_node       skc_bind2_node;
> > > 
> > > I *think* it would be better adding a tw_bind2_node field to the
> > > inet_timewait_sock struct, so that we leave unmodified the request
> > > socket and we don't change the struct sock binary layout. That could
> > > affect performances moving hot fields on different cachelines.
> > > 
> > +1. The rest of this patch LGTM.
> 
> Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
> 
>   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
>                offsetof(struct inet_timewait_sock, tw_bind2_node))
> 
> Considering the number of members in struct sock, at least we have
> to move sk_bind2_node forward.

You are right, I missed that point.


> Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> inet_bhash2_conflict(), so I think this is feasible.

This will add some more memory cost. I hope it's not a big deal, and
looks like the better option IMHO.

Cheers,

Paolo


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-23 10:15         ` Paolo Abeni
@ 2022-12-23 17:08           ` Kuniyuki Iwashima
  0 siblings, 0 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-23 17:08 UTC (permalink / raw)
  To: pabeni
  Cc: davem, edumazet, jirislaby, joannelkoong, kuba, kuni1840, kuniyu, netdev

From:   Paolo Abeni <pabeni@redhat.com>
Date:   Fri, 23 Dec 2022 11:15:24 +0100
> On Fri, 2022-12-23 at 08:26 +0900, Kuniyuki Iwashima wrote:
> > From:   Joanne Koong <joannelkoong@gmail.com>
> > Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > 
> > > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > > > 
> > > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > > -EADDRINUSE, but now it succeeds.
> > > > > 
> > > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > > requests if the address is not a wildcard one.
> > > 
> > > (resending my reply because it wasn't in plaintext mode)
> > > 
> > > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > > considered when checking against inet bind conflicts.
> > > 
> > > > 
> > > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > > loopup performance? I fear that could defeat completely the goal of
> > > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > > the same address/port. If so, we could even consider reverting
> > > > 28044fc1d495.
> > 
> > It will slow down along the number of twsk, but I think it's still faster
> > than bhash if we listen() on multiple IP.  If we don't, bhash is always
> > faster because of bhash2's additional locking.  However, this is the
> > nature of bhash2 from the beginning.
> 
> I see. Before 28044fc1d495, todo a bind lookup, we had to traverse all
> the tw in bhash. After 28044fc1d495, tw were ignored (in some cases).
> My point is: if the number of tw sockets is >> the number of listeners
> on multiple IPs, the bhash2 traversal time after this patch will be
> comparable to the bhash traversal time before 28044fc1d495, with the
> added cost of the double spin lock.

Ah, I get your point.  I think it's possible.  At least bhash2
has still merit when adding new IP.  Let's say we have X.X.X.X:443
listeners and adding Y.Y.Y.Y:443 does not cause CPU soft lockup if
we have bhash2.  But yes, more tests would be helpful.


> 
> > > 
> > > Can you clarify what you mean by bind loopup?
> > 
> > I think it means just bhash2 traversal.  (s/loopup/lookup/)
> 
> Indeed, sorry for the typo!
> > 
> > > 
> > > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > > > 
> > > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > ---
> > > > >  include/net/inet_timewait_sock.h |  2 ++
> > > > >  include/net/sock.h               |  5 +++--
> > > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > > --- a/include/net/inet_timewait_sock.h
> > > > > +++ b/include/net/inet_timewait_sock.h
> > > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > > >  #define tw_hash                      __tw_common.skc_hash
> > > > >  #define tw_prot                      __tw_common.skc_prot
> > > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > > >       u32                     tw_priority;
> > > > >       struct timer_list       tw_timer;
> > > > >       struct inet_bind_bucket *tw_tb;
> > > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > > >  };
> > > > >  #define tw_tclass tw_tos
> > > > > 
> > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > > --- a/include/net/sock.h
> > > > > +++ b/include/net/sock.h
> > > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > > >   *           [union with @skc_incoming_cpu]
> > > > >   *   @skc_refcnt: reference count
> > > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > > >   *
> > > > >   *   This is the minimal network layer representation of sockets, the header
> > > > >   *   for struct sock and struct inet_timewait_sock.
> > > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > > >               u32             skc_window_clamp;
> > > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > > >       };
> > > > > +     struct hlist_node       skc_bind2_node;
> > > > 
> > > > I *think* it would be better adding a tw_bind2_node field to the
> > > > inet_timewait_sock struct, so that we leave unmodified the request
> > > > socket and we don't change the struct sock binary layout. That could
> > > > affect performances moving hot fields on different cachelines.
> > > > 
> > > +1. The rest of this patch LGTM.
> > 
> > Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
> > 
> >   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
> >                offsetof(struct inet_timewait_sock, tw_bind2_node))
> > 
> > Considering the number of members in struct sock, at least we have
> > to move sk_bind2_node forward.
> 
> You are right, I missed that point.
> 
> 
> > Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> > tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> > inet_bhash2_conflict(), so I think this is feasible.
> 
> This will add some more memory cost. I hope it's not a big deal, and
> looks like the better option IMHO.

Thank you, I'll post a new version on Monday.

Marry Chirstmas!

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-23  1:55           ` Kuniyuki Iwashima
@ 2022-12-23 19:34             ` Joanne Koong
  2022-12-26 13:21               ` Kuniyuki Iwashima
  0 siblings, 1 reply; 15+ messages in thread
From: Joanne Koong @ 2022-12-23 19:34 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, netdev, pabeni

On Thu, Dec 22, 2022 at 5:55 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> From:   Joanne Koong <joannelkoong@gmail.com>
> Date:   Thu, 22 Dec 2022 16:25:10 -0800
> > On Thu, Dec 22, 2022 at 3:27 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > >
> > > From:   Joanne Koong <joannelkoong@gmail.com>
> > > Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > > > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > >
> > > > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > > > >
> > > > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > > > -EADDRINUSE, but now it succeeds.
> > > > > >
> > > > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > > > requests if the address is not a wildcard one.
> > > >
> > > > (resending my reply because it wasn't in plaintext mode)
> > > >
> > > > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > > > considered when checking against inet bind conflicts.
> > > >
> > > > >
> > > > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > > > loopup performance? I fear that could defeat completely the goal of
> > > > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > > > the same address/port. If so, we could even consider reverting
> > > > > 28044fc1d495.
> > >
> > > It will slow down along the number of twsk, but I think it's still faster
> > > than bhash if we listen() on multiple IP.  If we don't, bhash is always
> > > faster because of bhash2's additional locking.  However, this is the
> > > nature of bhash2 from the beginning.
> > >
> > >
> > > > >
> > > >
> > > > Can you clarify what you mean by bind loopup?
> > >
> > > I think it means just bhash2 traversal.  (s/loopup/lookup/)
> > >
> > > >
> > > > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > > > >
> > > > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > > ---
> > > > > >  include/net/inet_timewait_sock.h |  2 ++
> > > > > >  include/net/sock.h               |  5 +++--
> > > > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > > > >
> > > > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > > > --- a/include/net/inet_timewait_sock.h
> > > > > > +++ b/include/net/inet_timewait_sock.h
> > > > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > > > >  #define tw_hash                      __tw_common.skc_hash
> > > > > >  #define tw_prot                      __tw_common.skc_prot
> > > > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > > > >       u32                     tw_priority;
> > > > > >       struct timer_list       tw_timer;
> > > > > >       struct inet_bind_bucket *tw_tb;
> > > > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > > > >  };
> > > > > >  #define tw_tclass tw_tos
> > > > > >
> > > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > > > --- a/include/net/sock.h
> > > > > > +++ b/include/net/sock.h
> > > > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > > > >   *           [union with @skc_incoming_cpu]
> > > > > >   *   @skc_refcnt: reference count
> > > > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > > > >   *
> > > > > >   *   This is the minimal network layer representation of sockets, the header
> > > > > >   *   for struct sock and struct inet_timewait_sock.
> > > > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > > > >               u32             skc_window_clamp;
> > > > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > > > >       };
> > > > > > +     struct hlist_node       skc_bind2_node;
> > > > >
> > > > > I *think* it would be better adding a tw_bind2_node field to the
> > > > > inet_timewait_sock struct, so that we leave unmodified the request
> > > > > socket and we don't change the struct sock binary layout. That could
> > > > > affect performances moving hot fields on different cachelines.
> > > > >
> > > > +1. The rest of this patch LGTM.
> > >
> > > Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
> > >
> > >   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
> > >                offsetof(struct inet_timewait_sock, tw_bind2_node))
> > >
> > > Considering the number of members in struct sock, at least we have
> > > to move sk_bind2_node forward.
> > >
> > > Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> > > tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> > > inet_bhash2_conflict(), so I think this is feasible.
> >
> > Oh I see, thanks for clarifying!
> >
> > I think we could also check sk_state (which is in __sk_common already)
> > and if it's TCP_TIME_WAIT, then we know sk is at offsetof(struct
> > inet_timewait_sock, tw_bind2_node), whereas otherwise it's at
> > offsetof(struct sock, sk_bind2_node). This seems simpler/cleaner to me
> > than the other approaches. What are your thoughts?
>
> Sorry, I don't get it.  You mean we can check sk_state first and change
> how we traverse ?  But then we cannot know the offset of sk_state if we
> don't know if the socket is TIME_WAIT ... ?

I think the offset of sk_state is the same for both sockets because
sk_state is in "struct sock_common" (__sk_common.skc_state) that both
share.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2.
  2022-12-23 19:34             ` Joanne Koong
@ 2022-12-26 13:21               ` Kuniyuki Iwashima
  0 siblings, 0 replies; 15+ messages in thread
From: Kuniyuki Iwashima @ 2022-12-26 13:21 UTC (permalink / raw)
  To: joannelkoong
  Cc: davem, edumazet, jirislaby, kuba, kuni1840, kuniyu, netdev, pabeni

From:   Joanne Koong <joannelkoong@gmail.com>
Date:   Fri, 23 Dec 2022 11:34:40 -0800
> On Thu, Dec 22, 2022 at 5:55 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> >
> > From:   Joanne Koong <joannelkoong@gmail.com>
> > Date:   Thu, 22 Dec 2022 16:25:10 -0800
> > > On Thu, Dec 22, 2022 at 3:27 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > > >
> > > > From:   Joanne Koong <joannelkoong@gmail.com>
> > > > Date:   Thu, 22 Dec 2022 13:46:57 -0800
> > > > > On Thu, Dec 22, 2022 at 7:06 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > > >
> > > > > > On Thu, 2022-12-22 at 00:12 +0900, Kuniyuki Iwashima wrote:
> > > > > > > Jiri Slaby reported regression of bind() with a simple repro. [0]
> > > > > > >
> > > > > > > The repro creates a TIME_WAIT socket and tries to bind() a new socket
> > > > > > > with the same local address and port.  Before commit 28044fc1d495 ("net:
> > > > > > > Add a bhash2 table hashed by port and address"), the bind() failed with
> > > > > > > -EADDRINUSE, but now it succeeds.
> > > > > > >
> > > > > > > The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
> > > > > > > inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
> > > > > > > requests if the address is not a wildcard one.
> > > > >
> > > > > (resending my reply because it wasn't in plaintext mode)
> > > > >
> > > > > Thanks for adding this! I hadn't realized TIME_WAIT sockets also are
> > > > > considered when checking against inet bind conflicts.
> > > > >
> > > > > >
> > > > > > How does keeping the timewait sockets inside bhash2 affect the bind
> > > > > > loopup performance? I fear that could defeat completely the goal of
> > > > > > 28044fc1d495, on quite busy server we could have quite a bit of tw with
> > > > > > the same address/port. If so, we could even consider reverting
> > > > > > 28044fc1d495.
> > > >
> > > > It will slow down along the number of twsk, but I think it's still faster
> > > > than bhash if we listen() on multiple IP.  If we don't, bhash is always
> > > > faster because of bhash2's additional locking.  However, this is the
> > > > nature of bhash2 from the beginning.
> > > >
> > > >
> > > > > >
> > > > >
> > > > > Can you clarify what you mean by bind loopup?
> > > >
> > > > I think it means just bhash2 traversal.  (s/loopup/lookup/)
> > > >
> > > > >
> > > > > > > [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/
> > > > > > >
> > > > > > > Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
> > > > > > > Reported-by: Jiri Slaby <jirislaby@kernel.org>
> > > > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > > > ---
> > > > > > >  include/net/inet_timewait_sock.h |  2 ++
> > > > > > >  include/net/sock.h               |  5 +++--
> > > > > > >  net/ipv4/inet_hashtables.c       |  5 +++--
> > > > > > >  net/ipv4/inet_timewait_sock.c    | 31 +++++++++++++++++++++++++++++--
> > > > > > >  4 files changed, 37 insertions(+), 6 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> > > > > > > index 5b47545f22d3..c46ed239ad9a 100644
> > > > > > > --- a/include/net/inet_timewait_sock.h
> > > > > > > +++ b/include/net/inet_timewait_sock.h
> > > > > > > @@ -44,6 +44,7 @@ struct inet_timewait_sock {
> > > > > > >  #define tw_bound_dev_if              __tw_common.skc_bound_dev_if
> > > > > > >  #define tw_node                      __tw_common.skc_nulls_node
> > > > > > >  #define tw_bind_node         __tw_common.skc_bind_node
> > > > > > > +#define tw_bind2_node                __tw_common.skc_bind2_node
> > > > > > >  #define tw_refcnt            __tw_common.skc_refcnt
> > > > > > >  #define tw_hash                      __tw_common.skc_hash
> > > > > > >  #define tw_prot                      __tw_common.skc_prot
> > > > > > > @@ -73,6 +74,7 @@ struct inet_timewait_sock {
> > > > > > >       u32                     tw_priority;
> > > > > > >       struct timer_list       tw_timer;
> > > > > > >       struct inet_bind_bucket *tw_tb;
> > > > > > > +     struct inet_bind2_bucket        *tw_tb2;
> > > > > > >  };
> > > > > > >  #define tw_tclass tw_tos
> > > > > > >
> > > > > > > diff --git a/include/net/sock.h b/include/net/sock.h
> > > > > > > index dcd72e6285b2..aaec985c1b5b 100644
> > > > > > > --- a/include/net/sock.h
> > > > > > > +++ b/include/net/sock.h
> > > > > > > @@ -156,6 +156,7 @@ typedef __u64 __bitwise __addrpair;
> > > > > > >   *   @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
> > > > > > >   *           [union with @skc_incoming_cpu]
> > > > > > >   *   @skc_refcnt: reference count
> > > > > > > + *   @skc_bind2_node: bind node in the bhash2 table
> > > > > > >   *
> > > > > > >   *   This is the minimal network layer representation of sockets, the header
> > > > > > >   *   for struct sock and struct inet_timewait_sock.
> > > > > > > @@ -241,6 +242,7 @@ struct sock_common {
> > > > > > >               u32             skc_window_clamp;
> > > > > > >               u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
> > > > > > >       };
> > > > > > > +     struct hlist_node       skc_bind2_node;
> > > > > >
> > > > > > I *think* it would be better adding a tw_bind2_node field to the
> > > > > > inet_timewait_sock struct, so that we leave unmodified the request
> > > > > > socket and we don't change the struct sock binary layout. That could
> > > > > > affect performances moving hot fields on different cachelines.
> > > > > >
> > > > > +1. The rest of this patch LGTM.
> > > >
> > > > Then we can't use sk_for_each_bound_bhash2(), or we have to guarantee this.
> > > >
> > > >   BUILD_BUG_ON(offsetof(struct sock, sk_bind2_node),
> > > >                offsetof(struct inet_timewait_sock, tw_bind2_node))
> > > >
> > > > Considering the number of members in struct sock, at least we have
> > > > to move sk_bind2_node forward.
> > > >
> > > > Another option is to have another TIME_WAIT list in inet_bind2_bucket like
> > > > tb2->deathrow or something.  sk_for_each_bound_bhash2() is used only in
> > > > inet_bhash2_conflict(), so I think this is feasible.
> > >
> > > Oh I see, thanks for clarifying!
> > >
> > > I think we could also check sk_state (which is in __sk_common already)
> > > and if it's TCP_TIME_WAIT, then we know sk is at offsetof(struct
> > > inet_timewait_sock, tw_bind2_node), whereas otherwise it's at
> > > offsetof(struct sock, sk_bind2_node). This seems simpler/cleaner to me
> > > than the other approaches. What are your thoughts?
> >
> > Sorry, I don't get it.  You mean we can check sk_state first and change
> > how we traverse ?  But then we cannot know the offset of sk_state if we
> > don't know if the socket is TIME_WAIT ... ?
> 
> I think the offset of sk_state is the same for both sockets because
> sk_state is in "struct sock_common" (__sk_common.skc_state) that both
> share.

Yes, but the problem is we don't know the address of the first member
of struct sock_common.  The hlist_head/hlist_node point to the next
hlist_node, not the head of sock_common.  In the loop, we calculate
a pointer address like this.

  ptr = (struct sock_common *)(list_node - offsetof(struct X, sk_bind2_node))

We have to know which struct the next node is, otherwise, the ptr will
points to an invalid address.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-12-26 13:21 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-21 15:12 [PATCH RFC net 0/2] tcp: Fix bhash2 and TIME_WAIT regression Kuniyuki Iwashima
2022-12-21 15:12 ` [PATCH RFC net 1/2] tcp: Add TIME_WAIT sockets in bhash2 Kuniyuki Iwashima
2022-12-21 16:37   ` Jiri Slaby
2022-12-22 15:05   ` Paolo Abeni
2022-12-22 21:46     ` Joanne Koong
2022-12-22 23:26       ` Kuniyuki Iwashima
2022-12-23  0:25         ` Joanne Koong
2022-12-23  1:55           ` Kuniyuki Iwashima
2022-12-23 19:34             ` Joanne Koong
2022-12-26 13:21               ` Kuniyuki Iwashima
2022-12-23 10:15         ` Paolo Abeni
2022-12-23 17:08           ` Kuniyuki Iwashima
2022-12-21 15:12 ` [PATCH RFC net 2/2] tcp: Add selftest for bind() and TIME_WAIT Kuniyuki Iwashima
2022-12-22 21:41   ` Joanne Koong
2022-12-22 23:54     ` Kuniyuki Iwashima

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).