All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET 0/4] Add support for no-lock sockets
@ 2022-04-12 20:26 Jens Axboe
  2022-04-12 20:26 ` [PATCH 1/4] net: add sock 'sk_no_lock' member Jens Axboe
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-12 20:26 UTC (permalink / raw)
  To: io-uring, netdev

Hi,

If we accept a connection directly, eg without installing a file
descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
we have a socket for recv/send that we can fully serialize access to.

With that in mind, we can feasibly skip locking on the socket for TCP
in that case. Some of the testing I've done has shown as much as 15%
of overhead in the lock_sock/release_sock part, with this change then
we see none.

Comments welcome!

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/4] net: add sock 'sk_no_lock' member
  2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
@ 2022-04-12 20:26 ` Jens Axboe
  2022-04-12 20:26 ` [PATCH 2/4] net: allow sk_prot->release_cb() without sock lock held Jens Axboe
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-12 20:26 UTC (permalink / raw)
  To: io-uring, netdev; +Cc: Jens Axboe

In preparation for allowing lockless access to the socket for specialized
use cases, add a member denoting that the socket supports this.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/sock.h | 3 +++
 net/core/sock.c    | 1 +
 2 files changed, 4 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c4b91fc19b9c..e8283a65b757 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -131,6 +131,7 @@ typedef __u64 __bitwise __addrpair;
  *	@skc_reuseport: %SO_REUSEPORT setting
  *	@skc_ipv6only: socket is IPV6 only
  *	@skc_net_refcnt: socket is using net ref counting
+ *	@skc_no_lock: socket is private, no locking needed
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -190,6 +191,7 @@ struct sock_common {
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_no_lock:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -382,6 +384,7 @@ struct sock {
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_no_lock		__sk_common.skc_no_lock
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
diff --git a/net/core/sock.c b/net/core/sock.c
index 1180a0cb0110..fec892b384a4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2101,6 +2101,7 @@ EXPORT_SYMBOL(sk_free);
 
 static void sk_init_common(struct sock *sk)
 {
+	sk->sk_no_lock = false;
 	skb_queue_head_init(&sk->sk_receive_queue);
 	skb_queue_head_init(&sk->sk_write_queue);
 	skb_queue_head_init(&sk->sk_error_queue);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/4] net: allow sk_prot->release_cb() without sock lock held
  2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
  2022-04-12 20:26 ` [PATCH 1/4] net: add sock 'sk_no_lock' member Jens Axboe
@ 2022-04-12 20:26 ` Jens Axboe
  2022-04-12 20:26 ` [PATCH 3/4] net: add support for socket no-lock Jens Axboe
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-12 20:26 UTC (permalink / raw)
  To: io-uring, netdev; +Cc: Jens Axboe

Add helpers that allow ->release_cb() to acquire the socket bh lock
when needed. For normal sockets, ->release_cb() is always invoked with
that lock held. For nolock sockets it will not be held, so provide an
easy way to acquire it when necessary.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/sock.h    | 10 ++++++++++
 net/atm/common.c      |  5 ++++-
 net/ipv4/tcp_output.c |  2 ++
 net/mptcp/protocol.c  |  3 +++
 net/smc/af_smc.c      |  2 ++
 5 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index e8283a65b757..99fcc4d7eed9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1696,6 +1696,16 @@ void release_sock(struct sock *sk);
 				SINGLE_DEPTH_NESTING)
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
+/* nolock helpers */
+#define bh_lock_sock_on_nolock(__sk)	do {			\
+	if ((__sk)->sk_no_lock)					\
+		spin_lock_bh(&(__sk)->sk_lock.slock);		\
+} while (0)
+#define bh_unlock_sock_on_nolock(__sk)	do {			\
+	if ((__sk)->sk_no_lock)					\
+		spin_unlock_bh(&(__sk)->sk_lock.slock);		\
+} while (0)
+
 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);
 
 /**
diff --git a/net/atm/common.c b/net/atm/common.c
index 1cfa9bf1d187..471363e929f6 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -126,8 +126,11 @@ static void vcc_release_cb(struct sock *sk)
 {
 	struct atm_vcc *vcc = atm_sk(sk);
 
-	if (vcc->release_cb)
+	if (vcc->release_cb) {
+		bh_lock_sock_on_nolock(sk);
 		vcc->release_cb(vcc);
+		bh_lock_sock_on_nolock(sk);
+	}
 }
 
 static struct proto vcc_proto = {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9ede847f4199..9f86ea63cbac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1100,6 +1100,7 @@ void tcp_release_cb(struct sock *sk)
 	 * But following code is meant to be called from BH handlers,
 	 * so we should keep BH disabled, but early release socket ownership
 	 */
+	bh_lock_sock_on_nolock(sk);
 	sock_release_ownership(sk);
 
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
@@ -1114,6 +1115,7 @@ void tcp_release_cb(struct sock *sk)
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	bh_unlock_sock_on_nolock(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0cbea3b6d0a4..ae9078e8e137 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3065,6 +3065,8 @@ static void mptcp_release_cb(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	bh_lock_sock_on_nolock(sk);
+
 	for (;;) {
 		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
 				      msk->push_pending;
@@ -3103,6 +3105,7 @@ static void mptcp_release_cb(struct sock *sk)
 		__mptcp_error_report(sk);
 
 	__mptcp_update_rmem(sk);
+	bh_unlock_sock_on_nolock(sk);
 }
 
 /* MP_JOIN client subflow must wait for 4th ack before sending any data:
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f0d118e9f155..3456dc6cd38b 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -201,10 +201,12 @@ static void smc_release_cb(struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
+	bh_lock_sock_on_nolock(sk);
 	if (smc->conn.tx_in_release_sock) {
 		smc_tx_pending(&smc->conn);
 		smc->conn.tx_in_release_sock = false;
 	}
+	bh_unlock_sock_on_nolock(sk);
 }
 
 struct proto smc_proto = {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/4] net: add support for socket no-lock
  2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
  2022-04-12 20:26 ` [PATCH 1/4] net: add sock 'sk_no_lock' member Jens Axboe
  2022-04-12 20:26 ` [PATCH 2/4] net: allow sk_prot->release_cb() without sock lock held Jens Axboe
@ 2022-04-12 20:26 ` Jens Axboe
  2022-04-12 20:26 ` [PATCH 4/4] io_uring: mark accept direct socket as no-lock Jens Axboe
  2022-04-13  0:40 ` [PATCHSET 0/4] Add support for no-lock sockets Eric Dumazet
  4 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-12 20:26 UTC (permalink / raw)
  To: io-uring, netdev; +Cc: Jens Axboe

If we have a guaranteed single user of a socket, then we can optimize
the lock/release of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/sock.h | 10 ++++++++--
 net/core/sock.c    | 31 +++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 99fcc4d7eed9..aefc94677c94 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1674,7 +1674,7 @@ do {									\
 
 static inline bool lockdep_sock_is_held(const struct sock *sk)
 {
-	return lockdep_is_held(&sk->sk_lock) ||
+	return sk->sk_no_lock || lockdep_is_held(&sk->sk_lock) ||
 	       lockdep_is_held(&sk->sk_lock.slock);
 }
 
@@ -1774,18 +1774,20 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 static inline void sock_owned_by_me(const struct sock *sk)
 {
 #ifdef CONFIG_LOCKDEP
-	WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
+	WARN_ON_ONCE(!sk->sk_no_lock && !lockdep_sock_is_held(sk) && debug_locks);
 #endif
 }
 
 static inline bool sock_owned_by_user(const struct sock *sk)
 {
 	sock_owned_by_me(sk);
+	smp_rmb();
 	return sk->sk_lock.owned;
 }
 
 static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
 {
+	smp_rmb();
 	return sk->sk_lock.owned;
 }
 
@@ -1794,6 +1796,10 @@ static inline void sock_release_ownership(struct sock *sk)
 	if (sock_owned_by_user_nocheck(sk)) {
 		sk->sk_lock.owned = 0;
 
+		if (sk->sk_no_lock) {
+			smp_wmb();
+			return;
+		}
 		/* The sk_lock has mutex_unlock() semantics: */
 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index fec892b384a4..d7eea29c5699 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2764,6 +2764,9 @@ void __lock_sock(struct sock *sk)
 {
 	DEFINE_WAIT(wait);
 
+	if (WARN_ON_ONCE(sk->sk_no_lock))
+		return;
+
 	for (;;) {
 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
 					TASK_UNINTERRUPTIBLE);
@@ -3307,8 +3310,21 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 }
 EXPORT_SYMBOL(sock_init_data);
 
+static inline bool lock_sock_nolock(struct sock *sk)
+{
+	if (sk->sk_no_lock) {
+		sk->sk_lock.owned = 1;
+		smp_wmb();
+		return true;
+	}
+	return false;
+}
+
 void lock_sock_nested(struct sock *sk, int subclass)
 {
+	if (lock_sock_nolock(sk))
+		return;
+
 	/* The sk_lock has mutex_lock() semantics here. */
 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
 
@@ -3321,8 +3337,23 @@ void lock_sock_nested(struct sock *sk, int subclass)
 }
 EXPORT_SYMBOL(lock_sock_nested);
 
+static inline bool release_sock_nolock(struct sock *sk)
+{
+	if (!sk->sk_no_lock)
+		return false;
+	if (READ_ONCE(sk->sk_backlog.tail))
+		return false;
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
+	sock_release_ownership(sk);
+	return true;
+}
+
 void release_sock(struct sock *sk)
 {
+	if (release_sock_nolock(sk))
+		return;
+
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/4] io_uring: mark accept direct socket as no-lock
  2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
                   ` (2 preceding siblings ...)
  2022-04-12 20:26 ` [PATCH 3/4] net: add support for socket no-lock Jens Axboe
@ 2022-04-12 20:26 ` Jens Axboe
  2022-04-13  0:40 ` [PATCHSET 0/4] Add support for no-lock sockets Eric Dumazet
  4 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-12 20:26 UTC (permalink / raw)
  To: io-uring, netdev; +Cc: Jens Axboe

Mark a socket as nolock if we're accepting it directly, eg without
installing it into the process file table.

For direct issue or task_work issue, we already grab the uring_lock
for those, and hence they are serializing access to the socket for
send/recv already. The only case where we don't always grab the lock
is for async issue. Add a helper to ensure that it gets done if this
is a nolock socket.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0a6bcc077637..17b4dc9f130f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5918,6 +5918,19 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+/*
+ * Mark the socket as not needing locking, io_uring will serialize access
+ * to it. Note there's no matching clear of this condition, as this is only
+ * applicable for a fixed/registerd file, and those go away when we unregister
+ * anyway.
+ */
+static void io_sock_nolock_set(struct file *file)
+{
+	struct sock *sk = sock_from_file(file)->sk;
+
+	sk->sk_no_lock = true;
+}
+
 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_accept *accept = &req->accept;
@@ -5947,6 +5960,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 		fd_install(fd, file);
 		ret = fd;
 	} else {
+		io_sock_nolock_set(file);
 		ret = io_install_fixed_file(req, file, issue_flags,
 					    accept->file_slot - 1);
 	}
@@ -7604,11 +7618,31 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 	return req ? &req->work : NULL;
 }
 
+/*
+ * This could be improved with an FFS flag, but since it's only done for
+ * the slower path of io-wq offload, no point in optimizing it further.
+ */
+static bool io_req_needs_lock(struct io_kiocb *req)
+{
+#if defined(CONFIG_NET)
+	struct socket *sock;
+
+	if (!req->file)
+		return false;
+
+	sock = sock_from_file(req->file);
+	if (sock && sock->sk->sk_no_lock)
+		return true;
+#endif
+	return false;
+}
+
 static void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 	unsigned int issue_flags = IO_URING_F_UNLOCKED;
+	struct io_ring_ctx *ctx = req->ctx;
 	bool needs_poll = false;
 	struct io_kiocb *timeout;
 	int ret = 0, err = -ECANCELED;
@@ -7645,6 +7679,11 @@ static void io_wq_submit_work(struct io_wq_work *work)
 		}
 	}
 
+	if (io_req_needs_lock(req)) {
+		mutex_lock(&ctx->uring_lock);
+		issue_flags &= ~IO_URING_F_UNLOCKED;
+	}
+
 	do {
 		ret = io_issue_sqe(req, issue_flags);
 		if (ret != -EAGAIN)
@@ -7659,8 +7698,10 @@ static void io_wq_submit_work(struct io_wq_work *work)
 			continue;
 		}
 
-		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
-			return;
+		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) {
+			ret = 0;
+			break;
+		}
 		/* aborted or ready, in either case retry blocking */
 		needs_poll = false;
 		issue_flags &= ~IO_URING_F_NONBLOCK;
@@ -7669,6 +7710,9 @@ static void io_wq_submit_work(struct io_wq_work *work)
 	/* avoid locking problems by failing it from a clean context */
 	if (ret)
 		io_req_task_queue_fail(req, ret);
+
+	if (!(issue_flags & IO_URING_F_UNLOCKED))
+		mutex_unlock(&ctx->uring_lock);
 }
 
 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
                   ` (3 preceding siblings ...)
  2022-04-12 20:26 ` [PATCH 4/4] io_uring: mark accept direct socket as no-lock Jens Axboe
@ 2022-04-13  0:40 ` Eric Dumazet
  2022-04-13  1:26   ` Jens Axboe
  4 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  0:40 UTC (permalink / raw)
  To: Jens Axboe, io-uring, netdev, edumazet


On 4/12/22 13:26, Jens Axboe wrote:
> Hi,
>
> If we accept a connection directly, eg without installing a file
> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> we have a socket for recv/send that we can fully serialize access to.
>
> With that in mind, we can feasibly skip locking on the socket for TCP
> in that case. Some of the testing I've done has shown as much as 15%
> of overhead in the lock_sock/release_sock part, with this change then
> we see none.
>
> Comments welcome!
>
How BH handlers (including TCP timers) and io_uring are going to run 
safely ?

Even if a tcp socket had one user, (private fd opened by a non 
multi-threaded

program), we would still to use the spinlock.


Maybe I am missing something, but so far your patches make no sense to me.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  0:40 ` [PATCHSET 0/4] Add support for no-lock sockets Eric Dumazet
@ 2022-04-13  1:26   ` Jens Axboe
  2022-04-13  1:54     ` Eric Dumazet
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2022-04-13  1:26 UTC (permalink / raw)
  To: Eric Dumazet, io-uring, netdev, edumazet

On 4/12/22 6:40 PM, Eric Dumazet wrote:
> 
> On 4/12/22 13:26, Jens Axboe wrote:
>> Hi,
>>
>> If we accept a connection directly, eg without installing a file
>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>> we have a socket for recv/send that we can fully serialize access to.
>>
>> With that in mind, we can feasibly skip locking on the socket for TCP
>> in that case. Some of the testing I've done has shown as much as 15%
>> of overhead in the lock_sock/release_sock part, with this change then
>> we see none.
>>
>> Comments welcome!
>>
> How BH handlers (including TCP timers) and io_uring are going to run
> safely ? Even if a tcp socket had one user, (private fd opened by a
> non multi-threaded program), we would still to use the spinlock.

But we don't even hold the spinlock over lock_sock() and release_sock(),
just the mutex. And we do check for running eg the backlog on release,
which I believe is done safely and similarly in other places too.

> Maybe I am missing something, but so far your patches make no sense to
> me.

It's probably more likely I'm missing something, since I don't know this
area nearly as well as you. But it'd be great if you could be specific.


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  1:26   ` Jens Axboe
@ 2022-04-13  1:54     ` Eric Dumazet
  2022-04-13  2:01       ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  1:54 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/12/22 6:40 PM, Eric Dumazet wrote:
> >
> > On 4/12/22 13:26, Jens Axboe wrote:
> >> Hi,
> >>
> >> If we accept a connection directly, eg without installing a file
> >> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> >> we have a socket for recv/send that we can fully serialize access to.
> >>
> >> With that in mind, we can feasibly skip locking on the socket for TCP
> >> in that case. Some of the testing I've done has shown as much as 15%
> >> of overhead in the lock_sock/release_sock part, with this change then
> >> we see none.
> >>
> >> Comments welcome!
> >>
> > How BH handlers (including TCP timers) and io_uring are going to run
> > safely ? Even if a tcp socket had one user, (private fd opened by a
> > non multi-threaded program), we would still to use the spinlock.
>
> But we don't even hold the spinlock over lock_sock() and release_sock(),
> just the mutex. And we do check for running eg the backlog on release,
> which I believe is done safely and similarly in other places too.

So lets say TCP stack receives a packet in BH handler... it proceeds
using many tcp sock fields.

Then io_uring wants to read/write stuff from another cpu, while BH
handler(s) is(are) not done yet,
and will happily read/change many of the same fields

Writing a 1 and a 0 in a bit field to ensure mutual exclusion is not
going to work,
even with the smp_rmb() and smp_wmb() you added (adding more costs for
non io_uring users
which already pay a high lock tax)

If we want to optimize the lock_sock()/release_sock() for common cases
(a single user thread per TCP socket),
then maybe we can play games with some kind of cmpxchg() games, but
that would be a generic change.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  1:54     ` Eric Dumazet
@ 2022-04-13  2:01       ` Jens Axboe
  2022-04-13  2:05         ` Eric Dumazet
  2022-04-13  5:23         ` dust.li
  0 siblings, 2 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-13  2:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On 4/12/22 7:54 PM, Eric Dumazet wrote:
> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
>>>
>>> On 4/12/22 13:26, Jens Axboe wrote:
>>>> Hi,
>>>>
>>>> If we accept a connection directly, eg without installing a file
>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>>>> we have a socket for recv/send that we can fully serialize access to.
>>>>
>>>> With that in mind, we can feasibly skip locking on the socket for TCP
>>>> in that case. Some of the testing I've done has shown as much as 15%
>>>> of overhead in the lock_sock/release_sock part, with this change then
>>>> we see none.
>>>>
>>>> Comments welcome!
>>>>
>>> How BH handlers (including TCP timers) and io_uring are going to run
>>> safely ? Even if a tcp socket had one user, (private fd opened by a
>>> non multi-threaded program), we would still to use the spinlock.
>>
>> But we don't even hold the spinlock over lock_sock() and release_sock(),
>> just the mutex. And we do check for running eg the backlog on release,
>> which I believe is done safely and similarly in other places too.
> 
> So lets say TCP stack receives a packet in BH handler... it proceeds
> using many tcp sock fields.
> 
> Then io_uring wants to read/write stuff from another cpu, while BH
> handler(s) is(are) not done yet,
> and will happily read/change many of the same fields

But how is that currently protected? The bh spinlock is only held
briefly while locking the socket, and ditto on the relase. Outside of
that, the owner field is used. At least as far as I can tell. I'm
assuming the mutex exists solely to serialize acess to eg send/recv on
the system call side.

Hence if we can just make the owner check/set sane, then it would seem
to be that it'd work just fine. Unless I'm still missing something here.

> Writing a 1 and a 0 in a bit field to ensure mutual exclusion is not
> going to work,
> even with the smp_rmb() and smp_wmb() you added (adding more costs for
> non io_uring users
> which already pay a high lock tax)

Right, that's what the set was supposed to improve :-)

In all fairness, the rmb/wmb doesn't even measure compared to the
current socket locking, so I highly doubt that any high frequency TCP
would notice _any_ difference there. It's dwarfed by fiddling the mutex
and spinlock already.

But I agree, it may not be 100% bullet proof. May need actual bitops to
be totally safe. Outside of that, I'm still failing to see what kind of
mutual exclusion exists between BH handlers and a system call doing a
send or receive on the socket.

> If we want to optimize the lock_sock()/release_sock() for common cases
> (a single user thread per TCP socket),
> then maybe we can play games with some kind of cmpxchg() games, but
> that would be a generic change.

Sure, not disagreeing on that, but you'd supposedly still need the mutex
to serialize send or receives on the socket for those cases.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:01       ` Jens Axboe
@ 2022-04-13  2:05         ` Eric Dumazet
  2022-04-13  2:12           ` Jens Axboe
  2022-04-13  5:23         ` dust.li
  1 sibling, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  2:05 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/12/22 7:54 PM, Eric Dumazet wrote:
> > On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 4/12/22 6:40 PM, Eric Dumazet wrote:
> >>>
> >>> On 4/12/22 13:26, Jens Axboe wrote:
> >>>> Hi,
> >>>>
> >>>> If we accept a connection directly, eg without installing a file
> >>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> >>>> we have a socket for recv/send that we can fully serialize access to.
> >>>>
> >>>> With that in mind, we can feasibly skip locking on the socket for TCP
> >>>> in that case. Some of the testing I've done has shown as much as 15%
> >>>> of overhead in the lock_sock/release_sock part, with this change then
> >>>> we see none.
> >>>>
> >>>> Comments welcome!
> >>>>
> >>> How BH handlers (including TCP timers) and io_uring are going to run
> >>> safely ? Even if a tcp socket had one user, (private fd opened by a
> >>> non multi-threaded program), we would still to use the spinlock.
> >>
> >> But we don't even hold the spinlock over lock_sock() and release_sock(),
> >> just the mutex. And we do check for running eg the backlog on release,
> >> which I believe is done safely and similarly in other places too.
> >
> > So lets say TCP stack receives a packet in BH handler... it proceeds
> > using many tcp sock fields.
> >
> > Then io_uring wants to read/write stuff from another cpu, while BH
> > handler(s) is(are) not done yet,
> > and will happily read/change many of the same fields
>
> But how is that currently protected?

It is protected by current code.

What you wrote would break TCP stack quite badly.

I suggest you setup/run a syzbot server/farm, then you will have a
hundred reports quite easily.


 The bh spinlock is only held
> briefly while locking the socket, and ditto on the relase.

The 'briefly' is exactly what is needed to ensure exclusion between BH
and the user thread.

Yes, this is unfortunate but this is what it is.

 Outside of
> that, the owner field is used. At least as far as I can tell. I'm
> assuming the mutex exists solely to serialize acess to eg send/recv on
> the system call side.
>
> Hence if we can just make the owner check/set sane, then it would seem
> to be that it'd work just fine. Unless I'm still missing something here.
>
> > Writing a 1 and a 0 in a bit field to ensure mutual exclusion is not
> > going to work,
> > even with the smp_rmb() and smp_wmb() you added (adding more costs for
> > non io_uring users
> > which already pay a high lock tax)
>
> Right, that's what the set was supposed to improve :-)
>
> In all fairness, the rmb/wmb doesn't even measure compared to the
> current socket locking, so I highly doubt that any high frequency TCP
> would notice _any_ difference there. It's dwarfed by fiddling the mutex
> and spinlock already.
>
> But I agree, it may not be 100% bullet proof. May need actual bitops to
> be totally safe. Outside of that, I'm still failing to see what kind of
> mutual exclusion exists between BH handlers and a system call doing a
> send or receive on the socket.
>
> > If we want to optimize the lock_sock()/release_sock() for common cases
> > (a single user thread per TCP socket),
> > then maybe we can play games with some kind of cmpxchg() games, but
> > that would be a generic change.
>
> Sure, not disagreeing on that, but you'd supposedly still need the mutex
> to serialize send or receives on the socket for those cases.
>
> --
> Jens Axboe
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:05         ` Eric Dumazet
@ 2022-04-13  2:12           ` Jens Axboe
  2022-04-13  2:19             ` Eric Dumazet
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2022-04-13  2:12 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On 4/12/22 8:05 PM, Eric Dumazet wrote:
> On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 4/12/22 7:54 PM, Eric Dumazet wrote:
>>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
>>>>>
>>>>> On 4/12/22 13:26, Jens Axboe wrote:
>>>>>> Hi,
>>>>>>
>>>>>> If we accept a connection directly, eg without installing a file
>>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>>>>>> we have a socket for recv/send that we can fully serialize access to.
>>>>>>
>>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
>>>>>> in that case. Some of the testing I've done has shown as much as 15%
>>>>>> of overhead in the lock_sock/release_sock part, with this change then
>>>>>> we see none.
>>>>>>
>>>>>> Comments welcome!
>>>>>>
>>>>> How BH handlers (including TCP timers) and io_uring are going to run
>>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
>>>>> non multi-threaded program), we would still to use the spinlock.
>>>>
>>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
>>>> just the mutex. And we do check for running eg the backlog on release,
>>>> which I believe is done safely and similarly in other places too.
>>>
>>> So lets say TCP stack receives a packet in BH handler... it proceeds
>>> using many tcp sock fields.
>>>
>>> Then io_uring wants to read/write stuff from another cpu, while BH
>>> handler(s) is(are) not done yet,
>>> and will happily read/change many of the same fields
>>
>> But how is that currently protected?
> 
> It is protected by current code.
> 
> What you wrote would break TCP stack quite badly.

No offense, but your explanations are severely lacking. By "current
code"? So what you're saying is that it's protected by how the code
currently works? From how that it currently is? Yeah, that surely
explains it.

> I suggest you setup/run a syzbot server/farm, then you will have a
> hundred reports quite easily.

Nowhere am I claiming this is currently perfect, and it should have had
an RFC on it. Was hoping for some constructive criticism on how to move
this forward, as high frequency TCP currently _sucks_ in the stack.
Instead I get useless replies, not very encouraging.

I've run this quite extensively on just basic send/receive over sockets,
so it's not like it hasn't been run at all. And it's been fine so far,
no ill effects observed. If we need to tighten down the locking, perhaps
a valid use would be to simply skip the mutex and retain the bh lock for
setting owner. As far as I can tell, should still be safe to skip on
release, except if we need to process the backlog. And it'd serialize
the owner setting with the BH, which seems to be your main objection in.
Mostly guessing here, based on the in-depth replies.

But it'd be nice if we could have a more constructive dialogue about
this, rather than the weird dismisiveness.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:12           ` Jens Axboe
@ 2022-04-13  2:19             ` Eric Dumazet
  2022-04-13  2:26               ` Eric Dumazet
  2022-04-13  2:27               ` Jens Axboe
  0 siblings, 2 replies; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  2:19 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 7:12 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/12/22 8:05 PM, Eric Dumazet wrote:
> > On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 4/12/22 7:54 PM, Eric Dumazet wrote:
> >>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>
> >>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
> >>>>>
> >>>>> On 4/12/22 13:26, Jens Axboe wrote:
> >>>>>> Hi,
> >>>>>>
> >>>>>> If we accept a connection directly, eg without installing a file
> >>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> >>>>>> we have a socket for recv/send that we can fully serialize access to.
> >>>>>>
> >>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
> >>>>>> in that case. Some of the testing I've done has shown as much as 15%
> >>>>>> of overhead in the lock_sock/release_sock part, with this change then
> >>>>>> we see none.
> >>>>>>
> >>>>>> Comments welcome!
> >>>>>>
> >>>>> How BH handlers (including TCP timers) and io_uring are going to run
> >>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
> >>>>> non multi-threaded program), we would still to use the spinlock.
> >>>>
> >>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
> >>>> just the mutex. And we do check for running eg the backlog on release,
> >>>> which I believe is done safely and similarly in other places too.
> >>>
> >>> So lets say TCP stack receives a packet in BH handler... it proceeds
> >>> using many tcp sock fields.
> >>>
> >>> Then io_uring wants to read/write stuff from another cpu, while BH
> >>> handler(s) is(are) not done yet,
> >>> and will happily read/change many of the same fields
> >>
> >> But how is that currently protected?
> >
> > It is protected by current code.
> >
> > What you wrote would break TCP stack quite badly.
>
> No offense, but your explanations are severely lacking. By "current
> code"? So what you're saying is that it's protected by how the code
> currently works? From how that it currently is? Yeah, that surely
> explains it.
>
> > I suggest you setup/run a syzbot server/farm, then you will have a
> > hundred reports quite easily.
>
> Nowhere am I claiming this is currently perfect, and it should have had
> an RFC on it. Was hoping for some constructive criticism on how to move
> this forward, as high frequency TCP currently _sucks_ in the stack.
> Instead I get useless replies, not very encouraging.
>
> I've run this quite extensively on just basic send/receive over sockets,
> so it's not like it hasn't been run at all. And it's been fine so far,
> no ill effects observed. If we need to tighten down the locking, perhaps
> a valid use would be to simply skip the mutex and retain the bh lock for
> setting owner. As far as I can tell, should still be safe to skip on
> release, except if we need to process the backlog. And it'd serialize
> the owner setting with the BH, which seems to be your main objection in.
> Mostly guessing here, based on the in-depth replies.
>
> But it'd be nice if we could have a more constructive dialogue about
> this, rather than the weird dismisiveness.
>
>

Sure. It would be nice that I have not received such a patch series
the day I am sick.

Jakub, David, Paolo, please provide details to Jens, thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:19             ` Eric Dumazet
@ 2022-04-13  2:26               ` Eric Dumazet
  2022-04-13  2:27               ` Jens Axboe
  1 sibling, 0 replies; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  2:26 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 7:19 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 12, 2022 at 7:12 PM Jens Axboe <axboe@kernel.dk> wrote:
> >
> > On 4/12/22 8:05 PM, Eric Dumazet wrote:
> > > On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
> > >>
> > >> On 4/12/22 7:54 PM, Eric Dumazet wrote:
> > >>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
> > >>>>
> > >>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
> > >>>>>
> > >>>>> On 4/12/22 13:26, Jens Axboe wrote:
> > >>>>>> Hi,
> > >>>>>>
> > >>>>>> If we accept a connection directly, eg without installing a file
> > >>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> > >>>>>> we have a socket for recv/send that we can fully serialize access to.
> > >>>>>>
> > >>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
> > >>>>>> in that case. Some of the testing I've done has shown as much as 15%
> > >>>>>> of overhead in the lock_sock/release_sock part, with this change then
> > >>>>>> we see none.
> > >>>>>>
> > >>>>>> Comments welcome!
> > >>>>>>
> > >>>>> How BH handlers (including TCP timers) and io_uring are going to run
> > >>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
> > >>>>> non multi-threaded program), we would still to use the spinlock.
> > >>>>
> > >>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
> > >>>> just the mutex. And we do check for running eg the backlog on release,
> > >>>> which I believe is done safely and similarly in other places too.
> > >>>
> > >>> So lets say TCP stack receives a packet in BH handler... it proceeds
> > >>> using many tcp sock fields.
> > >>>
> > >>> Then io_uring wants to read/write stuff from another cpu, while BH
> > >>> handler(s) is(are) not done yet,
> > >>> and will happily read/change many of the same fields
> > >>
> > >> But how is that currently protected?
> > >
> > > It is protected by current code.
> > >
> > > What you wrote would break TCP stack quite badly.
> >
> > No offense, but your explanations are severely lacking. By "current
> > code"? So what you're saying is that it's protected by how the code
> > currently works? From how that it currently is? Yeah, that surely
> > explains it.
> >
> > > I suggest you setup/run a syzbot server/farm, then you will have a
> > > hundred reports quite easily.
> >
> > Nowhere am I claiming this is currently perfect, and it should have had
> > an RFC on it. Was hoping for some constructive criticism on how to move
> > this forward, as high frequency TCP currently _sucks_ in the stack.
> > Instead I get useless replies, not very encouraging.
> >
> > I've run this quite extensively on just basic send/receive over sockets,
> > so it's not like it hasn't been run at all. And it's been fine so far,
> > no ill effects observed. If we need to tighten down the locking, perhaps
> > a valid use would be to simply skip the mutex and retain the bh lock for
> > setting owner. As far as I can tell, should still be safe to skip on
> > release, except if we need to process the backlog. And it'd serialize
> > the owner setting with the BH, which seems to be your main objection in.
> > Mostly guessing here, based on the in-depth replies.
> >
> > But it'd be nice if we could have a more constructive dialogue about
> > this, rather than the weird dismisiveness.
> >
> >
>
> Sure. It would be nice that I have not received such a patch series
> the day I am sick.
>
> Jakub, David, Paolo, please provide details to Jens, thanks.

FYI, include/net/sock.h has this comment, which has been served for
20+ years just fine.

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also exclusive sleep lock serializing
 * accesses from user process context.
 */

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:19             ` Eric Dumazet
  2022-04-13  2:26               ` Eric Dumazet
@ 2022-04-13  2:27               ` Jens Axboe
  2022-04-13  2:32                 ` Eric Dumazet
  1 sibling, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2022-04-13  2:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On 4/12/22 8:19 PM, Eric Dumazet wrote:
> On Tue, Apr 12, 2022 at 7:12 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 4/12/22 8:05 PM, Eric Dumazet wrote:
>>> On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 4/12/22 7:54 PM, Eric Dumazet wrote:
>>>>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
>>>>>>>
>>>>>>> On 4/12/22 13:26, Jens Axboe wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> If we accept a connection directly, eg without installing a file
>>>>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>>>>>>>> we have a socket for recv/send that we can fully serialize access to.
>>>>>>>>
>>>>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
>>>>>>>> in that case. Some of the testing I've done has shown as much as 15%
>>>>>>>> of overhead in the lock_sock/release_sock part, with this change then
>>>>>>>> we see none.
>>>>>>>>
>>>>>>>> Comments welcome!
>>>>>>>>
>>>>>>> How BH handlers (including TCP timers) and io_uring are going to run
>>>>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
>>>>>>> non multi-threaded program), we would still to use the spinlock.
>>>>>>
>>>>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
>>>>>> just the mutex. And we do check for running eg the backlog on release,
>>>>>> which I believe is done safely and similarly in other places too.
>>>>>
>>>>> So lets say TCP stack receives a packet in BH handler... it proceeds
>>>>> using many tcp sock fields.
>>>>>
>>>>> Then io_uring wants to read/write stuff from another cpu, while BH
>>>>> handler(s) is(are) not done yet,
>>>>> and will happily read/change many of the same fields
>>>>
>>>> But how is that currently protected?
>>>
>>> It is protected by current code.
>>>
>>> What you wrote would break TCP stack quite badly.
>>
>> No offense, but your explanations are severely lacking. By "current
>> code"? So what you're saying is that it's protected by how the code
>> currently works? From how that it currently is? Yeah, that surely
>> explains it.
>>
>>> I suggest you setup/run a syzbot server/farm, then you will have a
>>> hundred reports quite easily.
>>
>> Nowhere am I claiming this is currently perfect, and it should have had
>> an RFC on it. Was hoping for some constructive criticism on how to move
>> this forward, as high frequency TCP currently _sucks_ in the stack.
>> Instead I get useless replies, not very encouraging.
>>
>> I've run this quite extensively on just basic send/receive over sockets,
>> so it's not like it hasn't been run at all. And it's been fine so far,
>> no ill effects observed. If we need to tighten down the locking, perhaps
>> a valid use would be to simply skip the mutex and retain the bh lock for
>> setting owner. As far as I can tell, should still be safe to skip on
>> release, except if we need to process the backlog. And it'd serialize
>> the owner setting with the BH, which seems to be your main objection in.
>> Mostly guessing here, based on the in-depth replies.
>>
>> But it'd be nice if we could have a more constructive dialogue about
>> this, rather than the weird dismisiveness.
>>
>>
> 
> Sure. It would be nice that I have not received such a patch series
> the day I am sick.

I'm sorry that you are sick - but if you are not in a state to reply,
then please just don't. It sets a bad example. It was sent to the list,
not to you personally.

Don't check email then, putting the blame on ME for posting a patchset
while you are sick is uncalled for and rude. If I had a crystal ball, I
would not be spending my time working on the kernel. You know what
would've been a better idea? Replying that you are sick and that you are
sorry for being an ass on the mailing list.

> Jakub, David, Paolo, please provide details to Jens, thanks.

There's no rush here fwiw - I'm heading out on PTO rest of the week,
so we can pick this back up when I get back. I'll check in on emails,
but activity will be sparse.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:27               ` Jens Axboe
@ 2022-04-13  2:32                 ` Eric Dumazet
  2022-04-13  2:38                   ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2022-04-13  2:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 7:27 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/12/22 8:19 PM, Eric Dumazet wrote:
> > On Tue, Apr 12, 2022 at 7:12 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 4/12/22 8:05 PM, Eric Dumazet wrote:
> >>> On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>
> >>>> On 4/12/22 7:54 PM, Eric Dumazet wrote:
> >>>>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>>
> >>>>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
> >>>>>>>
> >>>>>>> On 4/12/22 13:26, Jens Axboe wrote:
> >>>>>>>> Hi,
> >>>>>>>>
> >>>>>>>> If we accept a connection directly, eg without installing a file
> >>>>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> >>>>>>>> we have a socket for recv/send that we can fully serialize access to.
> >>>>>>>>
> >>>>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
> >>>>>>>> in that case. Some of the testing I've done has shown as much as 15%
> >>>>>>>> of overhead in the lock_sock/release_sock part, with this change then
> >>>>>>>> we see none.
> >>>>>>>>
> >>>>>>>> Comments welcome!
> >>>>>>>>
> >>>>>>> How BH handlers (including TCP timers) and io_uring are going to run
> >>>>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
> >>>>>>> non multi-threaded program), we would still to use the spinlock.
> >>>>>>
> >>>>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
> >>>>>> just the mutex. And we do check for running eg the backlog on release,
> >>>>>> which I believe is done safely and similarly in other places too.
> >>>>>
> >>>>> So lets say TCP stack receives a packet in BH handler... it proceeds
> >>>>> using many tcp sock fields.
> >>>>>
> >>>>> Then io_uring wants to read/write stuff from another cpu, while BH
> >>>>> handler(s) is(are) not done yet,
> >>>>> and will happily read/change many of the same fields
> >>>>
> >>>> But how is that currently protected?
> >>>
> >>> It is protected by current code.
> >>>
> >>> What you wrote would break TCP stack quite badly.
> >>
> >> No offense, but your explanations are severely lacking. By "current
> >> code"? So what you're saying is that it's protected by how the code
> >> currently works? From how that it currently is? Yeah, that surely
> >> explains it.
> >>
> >>> I suggest you setup/run a syzbot server/farm, then you will have a
> >>> hundred reports quite easily.
> >>
> >> Nowhere am I claiming this is currently perfect, and it should have had
> >> an RFC on it. Was hoping for some constructive criticism on how to move
> >> this forward, as high frequency TCP currently _sucks_ in the stack.
> >> Instead I get useless replies, not very encouraging.
> >>
> >> I've run this quite extensively on just basic send/receive over sockets,
> >> so it's not like it hasn't been run at all. And it's been fine so far,
> >> no ill effects observed. If we need to tighten down the locking, perhaps
> >> a valid use would be to simply skip the mutex and retain the bh lock for
> >> setting owner. As far as I can tell, should still be safe to skip on
> >> release, except if we need to process the backlog. And it'd serialize
> >> the owner setting with the BH, which seems to be your main objection in.
> >> Mostly guessing here, based on the in-depth replies.
> >>
> >> But it'd be nice if we could have a more constructive dialogue about
> >> this, rather than the weird dismisiveness.
> >>
> >>
> >
> > Sure. It would be nice that I have not received such a patch series
> > the day I am sick.
>
> I'm sorry that you are sick - but if you are not in a state to reply,
> then please just don't. It sets a bad example. It was sent to the list,
> not to you personally.

I tried to be as constructive as possible, and Jakub pinged me about
this series,
so I really thought Jakub was okay with it.

So I am a bit concerned.

>
> Don't check email then, putting the blame on ME for posting a patchset
> while you are sick is uncalled for and rude. If I had a crystal ball, I
> would not be spending my time working on the kernel. You know what
> would've been a better idea? Replying that you are sick and that you are
> sorry for being an ass on the mailing list.

Wow.


>
> > Jakub, David, Paolo, please provide details to Jens, thanks.
>
> There's no rush here fwiw - I'm heading out on PTO rest of the week,
> so we can pick this back up when I get back. I'll check in on emails,
> but activity will be sparse.
>
> --
> Jens Axboe
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:32                 ` Eric Dumazet
@ 2022-04-13  2:38                   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2022-04-13  2:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On 4/12/22 8:32 PM, Eric Dumazet wrote:
> On Tue, Apr 12, 2022 at 7:27 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 4/12/22 8:19 PM, Eric Dumazet wrote:
>>> On Tue, Apr 12, 2022 at 7:12 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> On 4/12/22 8:05 PM, Eric Dumazet wrote:
>>>>> On Tue, Apr 12, 2022 at 7:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>> On 4/12/22 7:54 PM, Eric Dumazet wrote:
>>>>>>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>
>>>>>>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
>>>>>>>>>
>>>>>>>>> On 4/12/22 13:26, Jens Axboe wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> If we accept a connection directly, eg without installing a file
>>>>>>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>>>>>>>>>> we have a socket for recv/send that we can fully serialize access to.
>>>>>>>>>>
>>>>>>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
>>>>>>>>>> in that case. Some of the testing I've done has shown as much as 15%
>>>>>>>>>> of overhead in the lock_sock/release_sock part, with this change then
>>>>>>>>>> we see none.
>>>>>>>>>>
>>>>>>>>>> Comments welcome!
>>>>>>>>>>
>>>>>>>>> How BH handlers (including TCP timers) and io_uring are going to run
>>>>>>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
>>>>>>>>> non multi-threaded program), we would still to use the spinlock.
>>>>>>>>
>>>>>>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
>>>>>>>> just the mutex. And we do check for running eg the backlog on release,
>>>>>>>> which I believe is done safely and similarly in other places too.
>>>>>>>
>>>>>>> So lets say TCP stack receives a packet in BH handler... it proceeds
>>>>>>> using many tcp sock fields.
>>>>>>>
>>>>>>> Then io_uring wants to read/write stuff from another cpu, while BH
>>>>>>> handler(s) is(are) not done yet,
>>>>>>> and will happily read/change many of the same fields
>>>>>>
>>>>>> But how is that currently protected?
>>>>>
>>>>> It is protected by current code.
>>>>>
>>>>> What you wrote would break TCP stack quite badly.
>>>>
>>>> No offense, but your explanations are severely lacking. By "current
>>>> code"? So what you're saying is that it's protected by how the code
>>>> currently works? From how that it currently is? Yeah, that surely
>>>> explains it.
>>>>
>>>>> I suggest you setup/run a syzbot server/farm, then you will have a
>>>>> hundred reports quite easily.
>>>>
>>>> Nowhere am I claiming this is currently perfect, and it should have had
>>>> an RFC on it. Was hoping for some constructive criticism on how to move
>>>> this forward, as high frequency TCP currently _sucks_ in the stack.
>>>> Instead I get useless replies, not very encouraging.
>>>>
>>>> I've run this quite extensively on just basic send/receive over sockets,
>>>> so it's not like it hasn't been run at all. And it's been fine so far,
>>>> no ill effects observed. If we need to tighten down the locking, perhaps
>>>> a valid use would be to simply skip the mutex and retain the bh lock for
>>>> setting owner. As far as I can tell, should still be safe to skip on
>>>> release, except if we need to process the backlog. And it'd serialize
>>>> the owner setting with the BH, which seems to be your main objection in.
>>>> Mostly guessing here, based on the in-depth replies.
>>>>
>>>> But it'd be nice if we could have a more constructive dialogue about
>>>> this, rather than the weird dismisiveness.
>>>>
>>>>
>>>
>>> Sure. It would be nice that I have not received such a patch series
>>> the day I am sick.
>>
>> I'm sorry that you are sick - but if you are not in a state to reply,
>> then please just don't. It sets a bad example. It was sent to the list,
>> not to you personally.
> 
> I tried to be as constructive as possible, and Jakub pinged me about

Are you serious?! I don't think I've ever received less constructive
feedback in 20+ years of working on the kernel.

> this series,
> so I really thought Jakub was okay with it.
> 
> So I am a bit concerned.

I did show it to Jakub a week or so ago, probably that was why. But why
the concern?! It's just a patchseries proposed for discussion. Something
that happens every day.

>> Don't check email then, putting the blame on ME for posting a patchset
>> while you are sick is uncalled for and rude. If I had a crystal ball, I
>> would not be spending my time working on the kernel. You know what
>> would've been a better idea? Replying that you are sick and that you are
>> sorry for being an ass on the mailing list.
> 
> Wow.

Putting the blame on me for your emails, since I posted a patchset while
you're sick, is just rude.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  2:01       ` Jens Axboe
  2022-04-13  2:05         ` Eric Dumazet
@ 2022-04-13  5:23         ` dust.li
  2022-04-13  7:53           ` Paolo Abeni
  1 sibling, 1 reply; 18+ messages in thread
From: dust.li @ 2022-04-13  5:23 UTC (permalink / raw)
  To: Jens Axboe, Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On Tue, Apr 12, 2022 at 08:01:10PM -0600, Jens Axboe wrote:
>On 4/12/22 7:54 PM, Eric Dumazet wrote:
>> On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 4/12/22 6:40 PM, Eric Dumazet wrote:
>>>>
>>>> On 4/12/22 13:26, Jens Axboe wrote:
>>>>> Hi,
>>>>>
>>>>> If we accept a connection directly, eg without installing a file
>>>>> descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
>>>>> we have a socket for recv/send that we can fully serialize access to.
>>>>>
>>>>> With that in mind, we can feasibly skip locking on the socket for TCP
>>>>> in that case. Some of the testing I've done has shown as much as 15%
>>>>> of overhead in the lock_sock/release_sock part, with this change then
>>>>> we see none.
>>>>>
>>>>> Comments welcome!
>>>>>
>>>> How BH handlers (including TCP timers) and io_uring are going to run
>>>> safely ? Even if a tcp socket had one user, (private fd opened by a
>>>> non multi-threaded program), we would still to use the spinlock.
>>>
>>> But we don't even hold the spinlock over lock_sock() and release_sock(),
>>> just the mutex. And we do check for running eg the backlog on release,
>>> which I believe is done safely and similarly in other places too.
>> 
>> So lets say TCP stack receives a packet in BH handler... it proceeds
>> using many tcp sock fields.
>> 
>> Then io_uring wants to read/write stuff from another cpu, while BH
>> handler(s) is(are) not done yet,
>> and will happily read/change many of the same fields
>
>But how is that currently protected? The bh spinlock is only held
>briefly while locking the socket, and ditto on the relase. Outside of
>that, the owner field is used. At least as far as I can tell. I'm
>assuming the mutex exists solely to serialize acess to eg send/recv on
>the system call side.

Hi jens,

I personally like the idea of using iouring to improve the performance
of the socket API.

AFAIU, the bh spinlock will be held by the BH when trying to make
changes to those protected fields on the socket, and the userspace
will try to hold that spinlock before it can change the sock lock
owner field.

For example:
in tcp_v4_rcv() we have

        bh_lock_sock_nested(sk);
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                ret = tcp_v4_do_rcv(sk, skb);
        } else {
                if (tcp_add_backlog(sk, skb, &drop_reason))
                        goto discard_and_relse;
        }
        bh_unlock_sock(sk);

When this is called in the BH, it will first hold the bh spinlock
and then check the owner field, tcp_v4_do_rcv() will always been
protected by the bh spinlock.

If the user thread tries to make changes to the socket, it first
call lock_sock() which will also try to hold the bh spinlock, I
think that prevent the race.

  void lock_sock_nested(struct sock *sk, int subclass)
  {
          /* The sk_lock has mutex_lock() semantics here. */
          mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

          might_sleep();
          spin_lock_bh(&sk->sk_lock.slock);
          if (sock_owned_by_user_nocheck(sk))
                  __lock_sock(sk);
          sk->sk_lock.owned = 1;
          spin_unlock_bh(&sk->sk_lock.slock);
  }

But if we remove the spinlock in the lock_sock() when sk_no_lock
is set to true. When the the bh spinlock is already held by the BH,
it seems the userspace won't respect that anymore ?

Maybe I missed something too...

>
>Hence if we can just make the owner check/set sane, then it would seem
>to be that it'd work just fine. Unless I'm still missing something here.
>
>> Writing a 1 and a 0 in a bit field to ensure mutual exclusion is not
>> going to work,
>> even with the smp_rmb() and smp_wmb() you added (adding more costs for
>> non io_uring users
>> which already pay a high lock tax)
>
>Right, that's what the set was supposed to improve :-)
>
>In all fairness, the rmb/wmb doesn't even measure compared to the
>current socket locking, so I highly doubt that any high frequency TCP
>would notice _any_ difference there. It's dwarfed by fiddling the mutex
>and spinlock already.
>
>But I agree, it may not be 100% bullet proof. May need actual bitops to
>be totally safe. Outside of that, I'm still failing to see what kind of
>mutual exclusion exists between BH handlers and a system call doing a
>send or receive on the socket.
>
>> If we want to optimize the lock_sock()/release_sock() for common cases
>> (a single user thread per TCP socket),
>> then maybe we can play games with some kind of cmpxchg() games, but
>> that would be a generic change.
>
>Sure, not disagreeing on that, but you'd supposedly still need the mutex
>to serialize send or receives on the socket for those cases.
>
>-- 
>Jens Axboe

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET 0/4] Add support for no-lock sockets
  2022-04-13  5:23         ` dust.li
@ 2022-04-13  7:53           ` Paolo Abeni
  0 siblings, 0 replies; 18+ messages in thread
From: Paolo Abeni @ 2022-04-13  7:53 UTC (permalink / raw)
  To: dust.li, Jens Axboe, Eric Dumazet; +Cc: Eric Dumazet, io-uring, netdev

On Wed, 2022-04-13 at 13:23 +0800, dust.li wrote:
> On Tue, Apr 12, 2022 at 08:01:10PM -0600, Jens Axboe wrote:
> > On 4/12/22 7:54 PM, Eric Dumazet wrote:
> > > On Tue, Apr 12, 2022 at 6:26 PM Jens Axboe <axboe@kernel.dk> wrote:
> > > > 
> > > > On 4/12/22 6:40 PM, Eric Dumazet wrote:
> > > > > 
> > > > > On 4/12/22 13:26, Jens Axboe wrote:
> > > > > > Hi,
> > > > > > 
> > > > > > If we accept a connection directly, eg without installing a file
> > > > > > descriptor for it, or if we use IORING_OP_SOCKET in direct mode, then
> > > > > > we have a socket for recv/send that we can fully serialize access to.
> > > > > > 
> > > > > > With that in mind, we can feasibly skip locking on the socket for TCP
> > > > > > in that case. Some of the testing I've done has shown as much as 15%
> > > > > > of overhead in the lock_sock/release_sock part, with this change then
> > > > > > we see none.
> > > > > > 
> > > > > > Comments welcome!
> > > > > > 
> > > > > How BH handlers (including TCP timers) and io_uring are going to run
> > > > > safely ? Even if a tcp socket had one user, (private fd opened by a
> > > > > non multi-threaded program), we would still to use the spinlock.
> > > > 
> > > > But we don't even hold the spinlock over lock_sock() and release_sock(),
> > > > just the mutex. And we do check for running eg the backlog on release,
> > > > which I believe is done safely and similarly in other places too.
> > > 
> > > So lets say TCP stack receives a packet in BH handler... it proceeds
> > > using many tcp sock fields.
> > > 
> > > Then io_uring wants to read/write stuff from another cpu, while BH
> > > handler(s) is(are) not done yet,
> > > and will happily read/change many of the same fields
> > 
> > But how is that currently protected? The bh spinlock is only held
> > briefly while locking the socket, and ditto on the relase. Outside of
> > that, the owner field is used. At least as far as I can tell. I'm
> > assuming the mutex exists solely to serialize acess to eg send/recv on
> > the system call side.
> 
> Hi jens,
> 
> I personally like the idea of using iouring to improve the performance
> of the socket API.
> 
> AFAIU, the bh spinlock will be held by the BH when trying to make
> changes to those protected fields on the socket, and the userspace
> will try to hold that spinlock before it can change the sock lock
> owner field.
> 
> For example:
> in tcp_v4_rcv() we have
> 
>         bh_lock_sock_nested(sk);
>         tcp_segs_in(tcp_sk(sk), skb);
>         ret = 0;
>         if (!sock_owned_by_user(sk)) {
>                 ret = tcp_v4_do_rcv(sk, skb);
>         } else {
>                 if (tcp_add_backlog(sk, skb, &drop_reason))
>                         goto discard_and_relse;
>         }
>         bh_unlock_sock(sk);
> 
> When this is called in the BH, it will first hold the bh spinlock
> and then check the owner field, tcp_v4_do_rcv() will always been
> protected by the bh spinlock.
> 
> If the user thread tries to make changes to the socket, it first
> call lock_sock() which will also try to hold the bh spinlock, I
> think that prevent the race.
> 
>   void lock_sock_nested(struct sock *sk, int subclass)
>   {
>           /* The sk_lock has mutex_lock() semantics here. */
>           mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
> 
>           might_sleep();
>           spin_lock_bh(&sk->sk_lock.slock);
>           if (sock_owned_by_user_nocheck(sk))
>                   __lock_sock(sk);
>           sk->sk_lock.owned = 1;
>           spin_unlock_bh(&sk->sk_lock.slock);
>   }
> 
> But if we remove the spinlock in the lock_sock() when sk_no_lock
> is set to true. When the the bh spinlock is already held by the BH,
> it seems the userspace won't respect that anymore ?

Exactly, with sk_no_lock we will have the following race:

[BH/timer on CPU 0]			[ reader/writer on CPU 1]

bh_lock_sock_nested(sk);
// owned is currently 0
if (!sock_owned_by_user(sk)) {
    // modify sk state

					if (sk->sk_no_lock) {
						sk->sk_lock.owned = 1;
						smp_wmb();
   // still touching sk state
					// cuncurrently modify sk
state
					// sk is corrupted

We need both the sk spinlock and the 'owned' bit to ensure mutually
exclusive access WRT soft interrupts. 

I personally don't see any way to fix the above without the sk spinlock
- or an equivalent contended atomic operation.

Additionally these changes add relevant overhead for the !sk_no_lock
case - the additional memory barriers and conditionals - which will
impact most/all existing users.

Touching a very fundamental and internal piece of the core networking,
corrently extremelly stable, similar changes will require a very
extensive testing, comprising benchmarking for the current/!sk_no_lock
code paths with different workloads and additional self-tests.

Thanks.

Paolo


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2022-04-13  7:54 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-12 20:26 [PATCHSET 0/4] Add support for no-lock sockets Jens Axboe
2022-04-12 20:26 ` [PATCH 1/4] net: add sock 'sk_no_lock' member Jens Axboe
2022-04-12 20:26 ` [PATCH 2/4] net: allow sk_prot->release_cb() without sock lock held Jens Axboe
2022-04-12 20:26 ` [PATCH 3/4] net: add support for socket no-lock Jens Axboe
2022-04-12 20:26 ` [PATCH 4/4] io_uring: mark accept direct socket as no-lock Jens Axboe
2022-04-13  0:40 ` [PATCHSET 0/4] Add support for no-lock sockets Eric Dumazet
2022-04-13  1:26   ` Jens Axboe
2022-04-13  1:54     ` Eric Dumazet
2022-04-13  2:01       ` Jens Axboe
2022-04-13  2:05         ` Eric Dumazet
2022-04-13  2:12           ` Jens Axboe
2022-04-13  2:19             ` Eric Dumazet
2022-04-13  2:26               ` Eric Dumazet
2022-04-13  2:27               ` Jens Axboe
2022-04-13  2:32                 ` Eric Dumazet
2022-04-13  2:38                   ` Jens Axboe
2022-04-13  5:23         ` dust.li
2022-04-13  7:53           ` Paolo Abeni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.