netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 net-next 0/4] af_unix: Random improvements for GC.
@ 2023-11-23  1:47 Kuniyuki Iwashima
  2023-11-23  1:47 ` [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight Kuniyuki Iwashima
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-23  1:47 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Ivan Babrou, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

If more than 16000 inflight AF_UNIX sockets exist on a host, each
sendmsg() will be forced to wait for unix_gc() even if a process
is not sending any FD.

This series tries not to impose such a penalty on sane users.


Changes:
  v2:
    Patch 4: Fix build error when CONFIG_UNIX=n (kernel test robot)

  v1: https://lore.kernel.org/netdev/20231122013629.28554-1-kuniyu@amazon.com/


Kuniyuki Iwashima (4):
  af_unix: Do not use atomic ops for unix_sk(sk)->inflight.
  af_unix: Return struct unix_sock from unix_get_socket().
  af_unix: Run GC on only one CPU.
  af_unix: Try to run GC async.

 include/linux/io_uring.h |  4 +-
 include/net/af_unix.h    |  6 +--
 include/net/scm.h        |  1 +
 io_uring/io_uring.c      |  5 ++-
 net/core/scm.c           |  5 +++
 net/unix/af_unix.c       | 10 +++--
 net/unix/garbage.c       | 84 ++++++++++++++++++----------------------
 net/unix/scm.c           | 34 ++++++++--------
 8 files changed, 74 insertions(+), 75 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight.
  2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
@ 2023-11-23  1:47 ` Kuniyuki Iwashima
  2023-12-01  9:34   ` Simon Horman
  2023-11-23  1:47 ` [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket() Kuniyuki Iwashima
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-23  1:47 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Ivan Babrou, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

When touching unix_sk(sk)->inflight, we are always under
spin_lock(&unix_gc_lock).

Let's convert unix_sk(sk)->inflight to the normal unsigned long.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    |  4 ++--
 net/unix/garbage.c    | 17 ++++++++---------
 net/unix/scm.c        |  8 +++++---
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 824c258143a3..5a8a670b1920 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -61,7 +61,7 @@ struct unix_sock {
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
 	struct list_head	link;
-	atomic_long_t		inflight;
+	unsigned long		inflight;
 	spinlock_t		lock;
 	unsigned long		gc_flags;
 #define UNIX_GC_CANDIDATE	0
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a357dc5f2404..1e6f5aaf1cc9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -995,11 +995,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
-	u	  = unix_sk(sk);
+	u = unix_sk(sk);
+	u->inflight = 0;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
-	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->iolock); /* single task reading lock */
 	mutex_init(&u->bindlock); /* single task binding lock */
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 2405f0f9af31..db1bb99bb793 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -166,17 +166,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
 
 static void dec_inflight(struct unix_sock *usk)
 {
-	atomic_long_dec(&usk->inflight);
+	usk->inflight--;
 }
 
 static void inc_inflight(struct unix_sock *usk)
 {
-	atomic_long_inc(&usk->inflight);
+	usk->inflight++;
 }
 
 static void inc_inflight_move_tail(struct unix_sock *u)
 {
-	atomic_long_inc(&u->inflight);
+	u->inflight++;
+
 	/* If this still might be part of a cycle, move it to the end
 	 * of the list, so that it's checked even if it was already
 	 * passed over
@@ -237,14 +238,12 @@ void unix_gc(void)
 	 */
 	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
 		long total_refs;
-		long inflight_refs;
 
 		total_refs = file_count(u->sk.sk_socket->file);
-		inflight_refs = atomic_long_read(&u->inflight);
 
-		BUG_ON(inflight_refs < 1);
-		BUG_ON(total_refs < inflight_refs);
-		if (total_refs == inflight_refs) {
+		BUG_ON(!u->inflight);
+		BUG_ON(total_refs < u->inflight);
+		if (total_refs == u->inflight) {
 			list_move_tail(&u->link, &gc_candidates);
 			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
 			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
@@ -271,7 +270,7 @@ void unix_gc(void)
 		/* Move cursor to after the current position. */
 		list_move(&cursor, &u->link);
 
-		if (atomic_long_read(&u->inflight) > 0) {
+		if (u->inflight) {
 			list_move_tail(&u->link, &not_cycle_list);
 			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
 			scan_children(&u->sk, inc_inflight_move_tail, NULL);
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 6ff628f2349f..4b3979272a81 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -55,12 +55,13 @@ void unix_inflight(struct user_struct *user, struct file *fp)
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		if (atomic_long_inc_return(&u->inflight) == 1) {
+		if (!u->inflight) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
 		} else {
 			BUG_ON(list_empty(&u->link));
 		}
+		u->inflight++;
 		/* Paired with READ_ONCE() in wait_for_unix_gc() */
 		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
 	}
@@ -77,10 +78,11 @@ void unix_notinflight(struct user_struct *user, struct file *fp)
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		BUG_ON(!atomic_long_read(&u->inflight));
+		BUG_ON(!u->inflight);
 		BUG_ON(list_empty(&u->link));
 
-		if (atomic_long_dec_and_test(&u->inflight))
+		u->inflight--;
+		if (!u->inflight)
 			list_del_init(&u->link);
 		/* Paired with READ_ONCE() in wait_for_unix_gc() */
 		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket().
  2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
  2023-11-23  1:47 ` [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight Kuniyuki Iwashima
@ 2023-11-23  1:47 ` Kuniyuki Iwashima
  2023-11-27  9:08   ` Paolo Abeni
  2023-12-01  9:35   ` Simon Horman
  2023-11-23  1:47 ` [PATCH v2 net-next 3/4] af_unix: Run GC on only one CPU Kuniyuki Iwashima
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-23  1:47 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Ivan Babrou, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

Currently, unix_get_socket() returns struct sock, but after calling
it, we always cast it to unix_sk().

Let's return struct unix_sock from unix_get_socket().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 include/linux/io_uring.h |  4 ++--
 include/net/af_unix.h    |  2 +-
 io_uring/io_uring.c      |  5 +++--
 net/unix/garbage.c       | 19 +++++++------------
 net/unix/scm.c           | 26 +++++++++++---------------
 5 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index aefb73eeeebf..be16677f0e4c 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -54,7 +54,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd);
 void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
 			unsigned issue_flags);
-struct sock *io_uring_get_socket(struct file *file);
+struct unix_sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
@@ -111,7 +111,7 @@ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
 			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
 {
 }
-static inline struct sock *io_uring_get_socket(struct file *file)
+static inline struct unix_sock *io_uring_get_socket(struct file *file)
 {
 	return NULL;
 }
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 5a8a670b1920..c628d30ceb19 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *skb);
 void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
-struct sock *unix_get_socket(struct file *filp);
+struct unix_sock *unix_get_socket(struct file *filp);
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ed254076c723..daed897f5975 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -177,13 +177,14 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
 };
 #endif
 
-struct sock *io_uring_get_socket(struct file *file)
+struct unix_sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
 	if (io_is_uring_fops(file)) {
 		struct io_ring_ctx *ctx = file->private_data;
 
-		return ctx->ring_sock->sk;
+		if (ctx->ring_sock->sk)
+			return unix_sk(ctx->ring_sock->sk);
 	}
 #endif
 	return NULL;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index db1bb99bb793..4d634f5f6a55 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 
 			while (nfd--) {
 				/* Get the socket the fd matches if it indeed does so */
-				struct sock *sk = unix_get_socket(*fp++);
+				struct unix_sock *u = unix_get_socket(*fp++);
 
-				if (sk) {
-					struct unix_sock *u = unix_sk(sk);
+				/* Ignore non-candidates, they could have been added
+				 * to the queues after starting the garbage collection
+				 */
+				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+					hit = true;
 
-					/* Ignore non-candidates, they could
-					 * have been added to the queues after
-					 * starting the garbage collection
-					 */
-					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
-						hit = true;
-
-						func(u);
-					}
+					func(u);
 				}
 			}
 			if (hit && hitlist != NULL) {
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 4b3979272a81..36ce8fed9acc 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
 DEFINE_SPINLOCK(unix_gc_lock);
 EXPORT_SYMBOL(unix_gc_lock);
 
-struct sock *unix_get_socket(struct file *filp)
+struct unix_sock *unix_get_socket(struct file *filp)
 {
-	struct sock *u_sock = NULL;
 	struct inode *inode = file_inode(filp);
 
 	/* Socket ? */
@@ -34,12 +33,13 @@ struct sock *unix_get_socket(struct file *filp)
 
 		/* PF_UNIX ? */
 		if (s && ops && ops->family == PF_UNIX)
-			u_sock = s;
-	} else {
-		/* Could be an io_uring instance */
-		u_sock = io_uring_get_socket(filp);
+			return unix_sk(s);
+
+		return NULL;
 	}
-	return u_sock;
+
+	/* Could be an io_uring instance */
+	return io_uring_get_socket(filp);
 }
 EXPORT_SYMBOL(unix_get_socket);
 
@@ -48,13 +48,11 @@ EXPORT_SYMBOL(unix_get_socket);
  */
 void unix_inflight(struct user_struct *user, struct file *fp)
 {
-	struct sock *s = unix_get_socket(fp);
+	struct unix_sock *u = unix_get_socket(fp);
 
 	spin_lock(&unix_gc_lock);
 
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
+	if (u) {
 		if (!u->inflight) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -71,13 +69,11 @@ void unix_inflight(struct user_struct *user, struct file *fp)
 
 void unix_notinflight(struct user_struct *user, struct file *fp)
 {
-	struct sock *s = unix_get_socket(fp);
+	struct unix_sock *u = unix_get_socket(fp);
 
 	spin_lock(&unix_gc_lock);
 
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
+	if (u) {
 		BUG_ON(!u->inflight);
 		BUG_ON(list_empty(&u->link));
 
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 net-next 3/4] af_unix: Run GC on only one CPU.
  2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
  2023-11-23  1:47 ` [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight Kuniyuki Iwashima
  2023-11-23  1:47 ` [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket() Kuniyuki Iwashima
@ 2023-11-23  1:47 ` Kuniyuki Iwashima
  2023-11-23  1:47 ` [PATCH v2 net-next 4/4] af_unix: Try to run GC async Kuniyuki Iwashima
  2023-11-29  0:47 ` [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Ivan Babrou
  4 siblings, 0 replies; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-23  1:47 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Ivan Babrou, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

If more than 16000 inflight AF_UNIX sockets exist and the garbage
collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
Also, they wait for unix_gc() to complete.

In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
and more if they are the GC candidate.  Thus, sendmsg() significantly
slows down with too many inflight AF_UNIX sockets.

There is a small window to invoke multiple unix_gc() instances, which
will then be blocked by the same spinlock except for one.

Let's convert unix_gc() to use struct work so that it will not consume
CPUs unnecessarily.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 net/unix/garbage.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 4d634f5f6a55..8bc93a7e745f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -86,7 +86,9 @@
 /* Internal data structures and random procedures: */
 
 static LIST_HEAD(gc_candidates);
-static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+
+static void __unix_gc(struct work_struct *work);
+static DECLARE_WORK(unix_gc_work, __unix_gc);
 
 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 			  struct sk_buff_head *hitlist)
@@ -181,24 +183,22 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 		list_move_tail(&u->link, &gc_candidates);
 }
 
-static bool gc_in_progress;
 #define UNIX_INFLIGHT_TRIGGER_GC 16000
 
 void wait_for_unix_gc(void)
 {
-	/* If number of inflight sockets is insane,
-	 * force a garbage collect right now.
+	/* If number of inflight sockets is insane, kick a
+	 * garbage collect right now.
 	 * Paired with the WRITE_ONCE() in unix_inflight(),
-	 * unix_notinflight() and gc_in_progress().
+	 * unix_notinflight().
 	 */
-	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
-	    !READ_ONCE(gc_in_progress))
-		unix_gc();
-	wait_event(unix_gc_wait, gc_in_progress == false);
+	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC)
+		queue_work(system_unbound_wq, &unix_gc_work);
+
+	flush_work(&unix_gc_work);
 }
 
-/* The external entry point: unix_gc() */
-void unix_gc(void)
+static void __unix_gc(struct work_struct *work)
 {
 	struct sk_buff *next_skb, *skb;
 	struct unix_sock *u;
@@ -209,13 +209,6 @@ void unix_gc(void)
 
 	spin_lock(&unix_gc_lock);
 
-	/* Avoid a recursive GC. */
-	if (gc_in_progress)
-		goto out;
-
-	/* Paired with READ_ONCE() in wait_for_unix_gc(). */
-	WRITE_ONCE(gc_in_progress, true);
-
 	/* First, select candidates for garbage collection.  Only
 	 * in-flight sockets are considered, and from those only ones
 	 * which don't have any external reference.
@@ -319,11 +312,10 @@ void unix_gc(void)
 	/* All candidates should have been detached by now. */
 	BUG_ON(!list_empty(&gc_candidates));
 
-	/* Paired with READ_ONCE() in wait_for_unix_gc(). */
-	WRITE_ONCE(gc_in_progress, false);
-
-	wake_up(&unix_gc_wait);
-
- out:
 	spin_unlock(&unix_gc_lock);
 }
+
+void unix_gc(void)
+{
+	queue_work(system_unbound_wq, &unix_gc_work);
+}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 net-next 4/4] af_unix: Try to run GC async.
  2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
                   ` (2 preceding siblings ...)
  2023-11-23  1:47 ` [PATCH v2 net-next 3/4] af_unix: Run GC on only one CPU Kuniyuki Iwashima
@ 2023-11-23  1:47 ` Kuniyuki Iwashima
  2023-11-27  9:59   ` Paolo Abeni
  2023-11-29  0:47 ` [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Ivan Babrou
  4 siblings, 1 reply; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-23  1:47 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Ivan Babrou, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

If more than 16000 inflight AF_UNIX sockets exist and the garbage
collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
Also, they wait for unix_gc() to complete.

In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
and more if they are the GC candidate.  Thus, sendmsg() significantly
slows down with too many inflight AF_UNIX sockets.

However, if a process sends data with no AF_UNIX FD, the sendmsg() call
does not need to wait for GC.  After this change, only the process that
meets the condition below will be blocked under such a situation.

  1) cmsg contains AF_UNIX socket
  2) more than 32 AF_UNIX sent by the same user are still inflight

Note that even a sendmsg() call that does not meet the condition but has
AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
but we allow that as a bonus for sane users.

The results below are the time spent in unix_dgram_sendmsg() sending 1
byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
sockets exist.

Without series: the sane sendmsg() needs to wait gc unreasonably.

  $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
  ^C
       nsecs               : count     distribution
  [...]
      524288 -> 1048575    : 0        |                                        |
     1048576 -> 2097151    : 3881     |****************************************|
     2097152 -> 4194303    : 214      |**                                      |
     4194304 -> 8388607    : 1        |                                        |

  avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096

With series: the sane sendmsg() can finish much faster.

  $ sudo /usr/share/bcc/tools/funclatency -p 8702  unix_dgram_sendmsg
  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
  ^C
       nsecs               : count     distribution
  [...]
         128 -> 255        : 0        |                                        |
         256 -> 511        : 4092     |****************************************|
         512 -> 1023       : 2        |                                        |
        1024 -> 2047       : 0        |                                        |
        2048 -> 4095       : 0        |                                        |
        4096 -> 8191       : 1        |                                        |
        8192 -> 16383      : 1        |                                        |

  avg = 410 nsecs, total: 1680510 nsecs, count: 4096

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 include/net/af_unix.h |  2 +-
 include/net/scm.h     |  1 +
 net/core/scm.c        |  5 +++++
 net/unix/af_unix.c    |  6 ++++--
 net/unix/garbage.c    | 10 ++++++++--
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index c628d30ceb19..f8e654d418e6 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -13,7 +13,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_destruct_scm(struct sk_buff *skb);
 void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
-void wait_for_unix_gc(void);
+void wait_for_unix_gc(struct scm_fp_list *fpl);
 struct unix_sock *unix_get_socket(struct file *filp);
 struct sock *unix_peer_get(struct sock *sk);
 
diff --git a/include/net/scm.h b/include/net/scm.h
index e8c76b4be2fe..1ff6a2855064 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -24,6 +24,7 @@ struct scm_creds {
 
 struct scm_fp_list {
 	short			count;
+	short			count_unix;
 	short			max;
 	struct user_struct	*user;
 	struct file		*fp[SCM_MAX_FD];
diff --git a/net/core/scm.c b/net/core/scm.c
index 880027ecf516..c1aae77d120b 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -35,6 +35,7 @@
 #include <net/compat.h>
 #include <net/scm.h>
 #include <net/cls_cgroup.h>
+#include <net/af_unix.h>
 
 
 /*
@@ -105,6 +106,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 			return -EBADF;
 		*fpp++ = file;
 		fpl->count++;
+#if IS_ENABLED(CONFIG_UNIX)
+		if (unix_get_socket(file))
+			fpl->count_unix++;
+#endif
 	}
 
 	if (!fpl->user)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1e6f5aaf1cc9..bbad3959751d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1925,11 +1925,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	long timeo;
 	int err;
 
-	wait_for_unix_gc();
 	err = scm_send(sock, msg, &scm, false);
 	if (err < 0)
 		return err;
 
+	wait_for_unix_gc(scm.fp);
+
 	err = -EOPNOTSUPP;
 	if (msg->msg_flags&MSG_OOB)
 		goto out;
@@ -2201,11 +2202,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	bool fds_sent = false;
 	int data_len;
 
-	wait_for_unix_gc();
 	err = scm_send(sock, msg, &scm, false);
 	if (err < 0)
 		return err;
 
+	wait_for_unix_gc(scm.fp);
+
 	err = -EOPNOTSUPP;
 	if (msg->msg_flags & MSG_OOB) {
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8bc93a7e745f..73091d6b7fc4 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -184,8 +184,9 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 }
 
 #define UNIX_INFLIGHT_TRIGGER_GC 16000
+#define UNIX_INFLIGHT_SANE_USER 32
 
-void wait_for_unix_gc(void)
+void wait_for_unix_gc(struct scm_fp_list *fpl)
 {
 	/* If number of inflight sockets is insane, kick a
 	 * garbage collect right now.
@@ -195,7 +196,12 @@ void wait_for_unix_gc(void)
 	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC)
 		queue_work(system_unbound_wq, &unix_gc_work);
 
-	flush_work(&unix_gc_work);
+	/* Penalise users who want to send AF_UNIX sockets
+	 * but whose sockets have not been received yet.
+	 */
+	if (fpl && fpl->count_unix &&
+	    READ_ONCE(fpl->user->unix_inflight) > UNIX_INFLIGHT_SANE_USER)
+		flush_work(&unix_gc_work);
 }
 
 static void __unix_gc(struct work_struct *work)
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket().
  2023-11-23  1:47 ` [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket() Kuniyuki Iwashima
@ 2023-11-27  9:08   ` Paolo Abeni
  2023-11-27 12:33     ` Pavel Begunkov
  2023-12-01  9:35   ` Simon Horman
  1 sibling, 1 reply; 12+ messages in thread
From: Paolo Abeni @ 2023-11-27  9:08 UTC (permalink / raw)
  To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski
  Cc: Ivan Babrou, Kuniyuki Iwashima, netdev, Jens Axboe,
	Pavel Begunkov, io-uring

On Wed, 2023-11-22 at 17:47 -0800, Kuniyuki Iwashima wrote:
> Currently, unix_get_socket() returns struct sock, but after calling
> it, we always cast it to unix_sk().
> 
> Let's return struct unix_sock from unix_get_socket().
> 
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
>  include/linux/io_uring.h |  4 ++--
>  include/net/af_unix.h    |  2 +-
>  io_uring/io_uring.c      |  5 +++--
>  net/unix/garbage.c       | 19 +++++++------------
>  net/unix/scm.c           | 26 +++++++++++---------------
>  5 files changed, 24 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index aefb73eeeebf..be16677f0e4c 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -54,7 +54,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
>  			      struct iov_iter *iter, void *ioucmd);
>  void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
>  			unsigned issue_flags);
> -struct sock *io_uring_get_socket(struct file *file);
> +struct unix_sock *io_uring_get_socket(struct file *file);
>  void __io_uring_cancel(bool cancel_all);
>  void __io_uring_free(struct task_struct *tsk);
>  void io_uring_unreg_ringfd(void);
> @@ -111,7 +111,7 @@ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
>  			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
>  {
>  }
> -static inline struct sock *io_uring_get_socket(struct file *file)
> +static inline struct unix_sock *io_uring_get_socket(struct file *file)
>  {
>  	return NULL;
>  }
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index 5a8a670b1920..c628d30ceb19 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *skb);
>  void io_uring_destruct_scm(struct sk_buff *skb);
>  void unix_gc(void);
>  void wait_for_unix_gc(void);
> -struct sock *unix_get_socket(struct file *filp);
> +struct unix_sock *unix_get_socket(struct file *filp);
>  struct sock *unix_peer_get(struct sock *sk);
>  
>  #define UNIX_HASH_MOD	(256 - 1)
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ed254076c723..daed897f5975 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -177,13 +177,14 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
>  };
>  #endif
>  
> -struct sock *io_uring_get_socket(struct file *file)
> +struct unix_sock *io_uring_get_socket(struct file *file)
>  {
>  #if defined(CONFIG_UNIX)
>  	if (io_is_uring_fops(file)) {
>  		struct io_ring_ctx *ctx = file->private_data;
>  
> -		return ctx->ring_sock->sk;
> +		if (ctx->ring_sock->sk)
> +			return unix_sk(ctx->ring_sock->sk);
>  	}
>  #endif
>  	return NULL;
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index db1bb99bb793..4d634f5f6a55 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
>  
>  			while (nfd--) {
>  				/* Get the socket the fd matches if it indeed does so */
> -				struct sock *sk = unix_get_socket(*fp++);
> +				struct unix_sock *u = unix_get_socket(*fp++);
>  
> -				if (sk) {
> -					struct unix_sock *u = unix_sk(sk);
> +				/* Ignore non-candidates, they could have been added
> +				 * to the queues after starting the garbage collection
> +				 */
> +				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
> +					hit = true;
>  
> -					/* Ignore non-candidates, they could
> -					 * have been added to the queues after
> -					 * starting the garbage collection
> -					 */
> -					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
> -						hit = true;
> -
> -						func(u);
> -					}
> +					func(u);
>  				}
>  			}
>  			if (hit && hitlist != NULL) {
> diff --git a/net/unix/scm.c b/net/unix/scm.c
> index 4b3979272a81..36ce8fed9acc 100644
> --- a/net/unix/scm.c
> +++ b/net/unix/scm.c
> @@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
>  DEFINE_SPINLOCK(unix_gc_lock);
>  EXPORT_SYMBOL(unix_gc_lock);
>  
> -struct sock *unix_get_socket(struct file *filp)
> +struct unix_sock *unix_get_socket(struct file *filp)
>  {
> -	struct sock *u_sock = NULL;
>  	struct inode *inode = file_inode(filp);
>  
>  	/* Socket ? */
> @@ -34,12 +33,13 @@ struct sock *unix_get_socket(struct file *filp)
>  
>  		/* PF_UNIX ? */
>  		if (s && ops && ops->family == PF_UNIX)
> -			u_sock = s;
> -	} else {
> -		/* Could be an io_uring instance */
> -		u_sock = io_uring_get_socket(filp);
> +			return unix_sk(s);
> +
> +		return NULL;
>  	}
> -	return u_sock;
> +
> +	/* Could be an io_uring instance */
> +	return io_uring_get_socket(filp);
>  }
>  EXPORT_SYMBOL(unix_get_socket);
>  
> @@ -48,13 +48,11 @@ EXPORT_SYMBOL(unix_get_socket);
>   */
>  void unix_inflight(struct user_struct *user, struct file *fp)
>  {
> -	struct sock *s = unix_get_socket(fp);
> +	struct unix_sock *u = unix_get_socket(fp);
>  
>  	spin_lock(&unix_gc_lock);
>  
> -	if (s) {
> -		struct unix_sock *u = unix_sk(s);
> -
> +	if (u) {
>  		if (!u->inflight) {
>  			BUG_ON(!list_empty(&u->link));
>  			list_add_tail(&u->link, &gc_inflight_list);
> @@ -71,13 +69,11 @@ void unix_inflight(struct user_struct *user, struct file *fp)
>  
>  void unix_notinflight(struct user_struct *user, struct file *fp)
>  {
> -	struct sock *s = unix_get_socket(fp);
> +	struct unix_sock *u = unix_get_socket(fp);
>  
>  	spin_lock(&unix_gc_lock);
>  
> -	if (s) {
> -		struct unix_sock *u = unix_sk(s);
> -
> +	if (u) {
>  		BUG_ON(!u->inflight);
>  		BUG_ON(list_empty(&u->link));
>  

Adding the io_uring peoples to the recipient list for awareness. I
guess this deserves an explicit ack from them.

Cheers,

Paolo



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 4/4] af_unix: Try to run GC async.
  2023-11-23  1:47 ` [PATCH v2 net-next 4/4] af_unix: Try to run GC async Kuniyuki Iwashima
@ 2023-11-27  9:59   ` Paolo Abeni
  2023-11-27 23:00     ` Kuniyuki Iwashima
  0 siblings, 1 reply; 12+ messages in thread
From: Paolo Abeni @ 2023-11-27  9:59 UTC (permalink / raw)
  To: Kuniyuki Iwashima, David S. Miller, Eric Dumazet, Jakub Kicinski
  Cc: Ivan Babrou, Kuniyuki Iwashima, netdev

On Wed, 2023-11-22 at 17:47 -0800, Kuniyuki Iwashima wrote:
> If more than 16000 inflight AF_UNIX sockets exist and the garbage
> collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
> Also, they wait for unix_gc() to complete.
> 
> In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
> and more if they are the GC candidate.  Thus, sendmsg() significantly
> slows down with too many inflight AF_UNIX sockets.
> 
> However, if a process sends data with no AF_UNIX FD, the sendmsg() call
> does not need to wait for GC.  After this change, only the process that
> meets the condition below will be blocked under such a situation.
> 
>   1) cmsg contains AF_UNIX socket
>   2) more than 32 AF_UNIX sent by the same user are still inflight
> 
> Note that even a sendmsg() call that does not meet the condition but has
> AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
> but we allow that as a bonus for sane users.
> 
> The results below are the time spent in unix_dgram_sendmsg() sending 1
> byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
> sockets exist.
> 
> Without series: the sane sendmsg() needs to wait gc unreasonably.
> 
>   $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
>   Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
>   ^C
>        nsecs               : count     distribution
>   [...]
>       524288 -> 1048575    : 0        |                                        |
>      1048576 -> 2097151    : 3881     |****************************************|
>      2097152 -> 4194303    : 214      |**                                      |
>      4194304 -> 8388607    : 1        |                                        |
> 
>   avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096
> 
> With series: the sane sendmsg() can finish much faster.
> 
>   $ sudo /usr/share/bcc/tools/funclatency -p 8702  unix_dgram_sendmsg
>   Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
>   ^C
>        nsecs               : count     distribution
>   [...]
>          128 -> 255        : 0        |                                        |
>          256 -> 511        : 4092     |****************************************|
>          512 -> 1023       : 2        |                                        |
>         1024 -> 2047       : 0        |                                        |
>         2048 -> 4095       : 0        |                                        |
>         4096 -> 8191       : 1        |                                        |
>         8192 -> 16383      : 1        |                                        |
> 
>   avg = 410 nsecs, total: 1680510 nsecs, count: 4096
> 
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
>  include/net/af_unix.h |  2 +-
>  include/net/scm.h     |  1 +
>  net/core/scm.c        |  5 +++++
>  net/unix/af_unix.c    |  6 ++++--
>  net/unix/garbage.c    | 10 ++++++++--
>  5 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index c628d30ceb19..f8e654d418e6 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -13,7 +13,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp);
>  void unix_destruct_scm(struct sk_buff *skb);
>  void io_uring_destruct_scm(struct sk_buff *skb);
>  void unix_gc(void);
> -void wait_for_unix_gc(void);
> +void wait_for_unix_gc(struct scm_fp_list *fpl);
>  struct unix_sock *unix_get_socket(struct file *filp);
>  struct sock *unix_peer_get(struct sock *sk);
>  
> diff --git a/include/net/scm.h b/include/net/scm.h
> index e8c76b4be2fe..1ff6a2855064 100644
> --- a/include/net/scm.h
> +++ b/include/net/scm.h
> @@ -24,6 +24,7 @@ struct scm_creds {
>  
>  struct scm_fp_list {
>  	short			count;
> +	short			count_unix;
>  	short			max;
>  	struct user_struct	*user;
>  	struct file		*fp[SCM_MAX_FD];
> diff --git a/net/core/scm.c b/net/core/scm.c
> index 880027ecf516..c1aae77d120b 100644
> --- a/net/core/scm.c
> +++ b/net/core/scm.c
> @@ -35,6 +35,7 @@
>  #include <net/compat.h>
>  #include <net/scm.h>
>  #include <net/cls_cgroup.h>
> +#include <net/af_unix.h>
>  
>  
>  /*
> @@ -105,6 +106,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
>  			return -EBADF;
>  		*fpp++ = file;
>  		fpl->count++;
> +#if IS_ENABLED(CONFIG_UNIX)
> +		if (unix_get_socket(file))
> +			fpl->count_unix++;
> +#endif
>  	}
>  
>  	if (!fpl->user)
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 1e6f5aaf1cc9..bbad3959751d 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1925,11 +1925,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
>  	long timeo;
>  	int err;
>  
> -	wait_for_unix_gc();
>  	err = scm_send(sock, msg, &scm, false);
>  	if (err < 0)
>  		return err;
>  
> +	wait_for_unix_gc(scm.fp);
> +
>  	err = -EOPNOTSUPP;
>  	if (msg->msg_flags&MSG_OOB)
>  		goto out;
> @@ -2201,11 +2202,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>  	bool fds_sent = false;
>  	int data_len;
>  
> -	wait_for_unix_gc();
>  	err = scm_send(sock, msg, &scm, false);
>  	if (err < 0)
>  		return err;
>  
> +	wait_for_unix_gc(scm.fp);
> +
>  	err = -EOPNOTSUPP;
>  	if (msg->msg_flags & MSG_OOB) {
>  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index 8bc93a7e745f..73091d6b7fc4 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -184,8 +184,9 @@ static void inc_inflight_move_tail(struct unix_sock *u)
>  }
>  
>  #define UNIX_INFLIGHT_TRIGGER_GC 16000
> +#define UNIX_INFLIGHT_SANE_USER 32

I don't have any relevant usage stats for unix sockets, but out of
sheer ignorance on my side '32' looks a bit low. Why/how did you pick
such value?

> -void wait_for_unix_gc(void)
> +void wait_for_unix_gc(struct scm_fp_list *fpl)
>  {
>  	/* If number of inflight sockets is insane, kick a
>  	 * garbage collect right now.
> @@ -195,7 +196,12 @@ void wait_for_unix_gc(void)
>  	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC)
>  		queue_work(system_unbound_wq, &unix_gc_work);
>  
> -	flush_work(&unix_gc_work);
> +	/* Penalise users who want to send AF_UNIX sockets
> +	 * but whose sockets have not been received yet.
> +	 */
> +	if (fpl && fpl->count_unix &&
> +	    READ_ONCE(fpl->user->unix_inflight) > UNIX_INFLIGHT_SANE_USER)
> +		flush_work(&unix_gc_work);

flush_work() will be called even when 'unix_tot_inflight' is (much)
less then 'UNIX_INFLIGHT_TRIGGER_GC'. Could that cause some regressions
for workload with moderated numbers of fd in flights, where the GC was
never triggered before this series?

Thanks!

Paolo


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket().
  2023-11-27  9:08   ` Paolo Abeni
@ 2023-11-27 12:33     ` Pavel Begunkov
  0 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2023-11-27 12:33 UTC (permalink / raw)
  To: Paolo Abeni, Kuniyuki Iwashima, David S. Miller, Eric Dumazet,
	Jakub Kicinski
  Cc: Ivan Babrou, Kuniyuki Iwashima, netdev, Jens Axboe, io-uring

On 11/27/23 09:08, Paolo Abeni wrote:
> On Wed, 2023-11-22 at 17:47 -0800, Kuniyuki Iwashima wrote:
>> Currently, unix_get_socket() returns struct sock, but after calling
>> it, we always cast it to unix_sk().
>>
>> Let's return struct unix_sock from unix_get_socket().
>>
>> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
>> ---
>>   include/linux/io_uring.h |  4 ++--
>>   include/net/af_unix.h    |  2 +-
>>   io_uring/io_uring.c      |  5 +++--
>>   net/unix/garbage.c       | 19 +++++++------------
>>   net/unix/scm.c           | 26 +++++++++++---------------
>>   5 files changed, 24 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
>> index aefb73eeeebf..be16677f0e4c 100644
>> --- a/include/linux/io_uring.h
>> +++ b/include/linux/io_uring.h
>> @@ -54,7 +54,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
>>   			      struct iov_iter *iter, void *ioucmd);
>>   void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
>>   			unsigned issue_flags);
>> -struct sock *io_uring_get_socket(struct file *file);
>> +struct unix_sock *io_uring_get_socket(struct file *file);
>>   void __io_uring_cancel(bool cancel_all);
>>   void __io_uring_free(struct task_struct *tsk);
>>   void io_uring_unreg_ringfd(void);
>> @@ -111,7 +111,7 @@ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
>>   			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
>>   {
>>   }
>> -static inline struct sock *io_uring_get_socket(struct file *file)
>> +static inline struct unix_sock *io_uring_get_socket(struct file *file)
>>   {
>>   	return NULL;
>>   }
>> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
>> index 5a8a670b1920..c628d30ceb19 100644
>> --- a/include/net/af_unix.h
>> +++ b/include/net/af_unix.h
>> @@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *skb);
>>   void io_uring_destruct_scm(struct sk_buff *skb);
>>   void unix_gc(void);
>>   void wait_for_unix_gc(void);
>> -struct sock *unix_get_socket(struct file *filp);
>> +struct unix_sock *unix_get_socket(struct file *filp);
>>   struct sock *unix_peer_get(struct sock *sk);
>>   
>>   #define UNIX_HASH_MOD	(256 - 1)
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index ed254076c723..daed897f5975 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -177,13 +177,14 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
>>   };
>>   #endif
>>   
>> -struct sock *io_uring_get_socket(struct file *file)
>> +struct unix_sock *io_uring_get_socket(struct file *file)
>>   {
>>   #if defined(CONFIG_UNIX)
>>   	if (io_is_uring_fops(file)) {
>>   		struct io_ring_ctx *ctx = file->private_data;
>>   
>> -		return ctx->ring_sock->sk;
>> +		if (ctx->ring_sock->sk)
>> +			return unix_sk(ctx->ring_sock->sk);
>>   	}
>>   #endif
>>   	return NULL;
>> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
>> index db1bb99bb793..4d634f5f6a55 100644
>> --- a/net/unix/garbage.c
>> +++ b/net/unix/garbage.c
>> @@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
>>   
>>   			while (nfd--) {
>>   				/* Get the socket the fd matches if it indeed does so */
>> -				struct sock *sk = unix_get_socket(*fp++);
>> +				struct unix_sock *u = unix_get_socket(*fp++);
>>   
>> -				if (sk) {
>> -					struct unix_sock *u = unix_sk(sk);
>> +				/* Ignore non-candidates, they could have been added
>> +				 * to the queues after starting the garbage collection
>> +				 */
>> +				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
>> +					hit = true;
>>   
>> -					/* Ignore non-candidates, they could
>> -					 * have been added to the queues after
>> -					 * starting the garbage collection
>> -					 */
>> -					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
>> -						hit = true;
>> -
>> -						func(u);
>> -					}
>> +					func(u);
>>   				}
>>   			}
>>   			if (hit && hitlist != NULL) {
>> diff --git a/net/unix/scm.c b/net/unix/scm.c
>> index 4b3979272a81..36ce8fed9acc 100644
>> --- a/net/unix/scm.c
>> +++ b/net/unix/scm.c
>> @@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
>>   DEFINE_SPINLOCK(unix_gc_lock);
>>   EXPORT_SYMBOL(unix_gc_lock);
>>   
>> -struct sock *unix_get_socket(struct file *filp)
>> +struct unix_sock *unix_get_socket(struct file *filp)
>>   {
>> -	struct sock *u_sock = NULL;
>>   	struct inode *inode = file_inode(filp);
>>   
>>   	/* Socket ? */
>> @@ -34,12 +33,13 @@ struct sock *unix_get_socket(struct file *filp)
>>   
>>   		/* PF_UNIX ? */
>>   		if (s && ops && ops->family == PF_UNIX)
>> -			u_sock = s;
>> -	} else {
>> -		/* Could be an io_uring instance */
>> -		u_sock = io_uring_get_socket(filp);
>> +			return unix_sk(s);
>> +
>> +		return NULL;
>>   	}
>> -	return u_sock;
>> +
>> +	/* Could be an io_uring instance */
>> +	return io_uring_get_socket(filp);
>>   }
>>   EXPORT_SYMBOL(unix_get_socket);
>>   
>> @@ -48,13 +48,11 @@ EXPORT_SYMBOL(unix_get_socket);
>>    */
>>   void unix_inflight(struct user_struct *user, struct file *fp)
>>   {
>> -	struct sock *s = unix_get_socket(fp);
>> +	struct unix_sock *u = unix_get_socket(fp);
>>   
>>   	spin_lock(&unix_gc_lock);
>>   
>> -	if (s) {
>> -		struct unix_sock *u = unix_sk(s);
>> -
>> +	if (u) {
>>   		if (!u->inflight) {
>>   			BUG_ON(!list_empty(&u->link));
>>   			list_add_tail(&u->link, &gc_inflight_list);
>> @@ -71,13 +69,11 @@ void unix_inflight(struct user_struct *user, struct file *fp)
>>   
>>   void unix_notinflight(struct user_struct *user, struct file *fp)
>>   {
>> -	struct sock *s = unix_get_socket(fp);
>> +	struct unix_sock *u = unix_get_socket(fp);
>>   
>>   	spin_lock(&unix_gc_lock);
>>   
>> -	if (s) {
>> -		struct unix_sock *u = unix_sk(s);
>> -
>> +	if (u) {
>>   		BUG_ON(!u->inflight);
>>   		BUG_ON(list_empty(&u->link));
>>   
> 
> Adding the io_uring peoples to the recipient list for awareness. I
> guess this deserves an explicit ack from them.

Thanks Paolo, lgtm

Acked-by: Pavel Begunkov <asml.silence@gmail.com>

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 4/4] af_unix: Try to run GC async.
  2023-11-27  9:59   ` Paolo Abeni
@ 2023-11-27 23:00     ` Kuniyuki Iwashima
  0 siblings, 0 replies; 12+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-27 23:00 UTC (permalink / raw)
  To: pabeni; +Cc: davem, edumazet, ivan, kuba, kuni1840, kuniyu, netdev

From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 27 Nov 2023 10:59:03 +0100
> On Wed, 2023-11-22 at 17:47 -0800, Kuniyuki Iwashima wrote:
> > If more than 16000 inflight AF_UNIX sockets exist and the garbage
> > collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
> > Also, they wait for unix_gc() to complete.
> > 
> > In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
> > and more if they are the GC candidate.  Thus, sendmsg() significantly
> > slows down with too many inflight AF_UNIX sockets.
> > 
> > However, if a process sends data with no AF_UNIX FD, the sendmsg() call
> > does not need to wait for GC.  After this change, only the process that
> > meets the condition below will be blocked under such a situation.
> > 
> >   1) cmsg contains AF_UNIX socket
> >   2) more than 32 AF_UNIX sent by the same user are still inflight
> > 
> > Note that even a sendmsg() call that does not meet the condition but has
> > AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
> > but we allow that as a bonus for sane users.
> > 
> > The results below are the time spent in unix_dgram_sendmsg() sending 1
> > byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
> > sockets exist.
> > 
> > Without series: the sane sendmsg() needs to wait gc unreasonably.
> > 
> >   $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
> >   Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
> >   ^C
> >        nsecs               : count     distribution
> >   [...]
> >       524288 -> 1048575    : 0        |                                        |
> >      1048576 -> 2097151    : 3881     |****************************************|
> >      2097152 -> 4194303    : 214      |**                                      |
> >      4194304 -> 8388607    : 1        |                                        |
> > 
> >   avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096
> > 
> > With series: the sane sendmsg() can finish much faster.
> > 
> >   $ sudo /usr/share/bcc/tools/funclatency -p 8702  unix_dgram_sendmsg
> >   Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
> >   ^C
> >        nsecs               : count     distribution
> >   [...]
> >          128 -> 255        : 0        |                                        |
> >          256 -> 511        : 4092     |****************************************|
> >          512 -> 1023       : 2        |                                        |
> >         1024 -> 2047       : 0        |                                        |
> >         2048 -> 4095       : 0        |                                        |
> >         4096 -> 8191       : 1        |                                        |
> >         8192 -> 16383      : 1        |                                        |
> > 
> >   avg = 410 nsecs, total: 1680510 nsecs, count: 4096
> > 
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > ---
> >  include/net/af_unix.h |  2 +-
> >  include/net/scm.h     |  1 +
> >  net/core/scm.c        |  5 +++++
> >  net/unix/af_unix.c    |  6 ++++--
> >  net/unix/garbage.c    | 10 ++++++++--
> >  5 files changed, 19 insertions(+), 5 deletions(-)
> > 
> > diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> > index c628d30ceb19..f8e654d418e6 100644
> > --- a/include/net/af_unix.h
> > +++ b/include/net/af_unix.h
> > @@ -13,7 +13,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp);
> >  void unix_destruct_scm(struct sk_buff *skb);
> >  void io_uring_destruct_scm(struct sk_buff *skb);
> >  void unix_gc(void);
> > -void wait_for_unix_gc(void);
> > +void wait_for_unix_gc(struct scm_fp_list *fpl);
> >  struct unix_sock *unix_get_socket(struct file *filp);
> >  struct sock *unix_peer_get(struct sock *sk);
> >  
> > diff --git a/include/net/scm.h b/include/net/scm.h
> > index e8c76b4be2fe..1ff6a2855064 100644
> > --- a/include/net/scm.h
> > +++ b/include/net/scm.h
> > @@ -24,6 +24,7 @@ struct scm_creds {
> >  
> >  struct scm_fp_list {
> >  	short			count;
> > +	short			count_unix;
> >  	short			max;
> >  	struct user_struct	*user;
> >  	struct file		*fp[SCM_MAX_FD];
> > diff --git a/net/core/scm.c b/net/core/scm.c
> > index 880027ecf516..c1aae77d120b 100644
> > --- a/net/core/scm.c
> > +++ b/net/core/scm.c
> > @@ -35,6 +35,7 @@
> >  #include <net/compat.h>
> >  #include <net/scm.h>
> >  #include <net/cls_cgroup.h>
> > +#include <net/af_unix.h>
> >  
> >  
> >  /*
> > @@ -105,6 +106,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
> >  			return -EBADF;
> >  		*fpp++ = file;
> >  		fpl->count++;
> > +#if IS_ENABLED(CONFIG_UNIX)
> > +		if (unix_get_socket(file))
> > +			fpl->count_unix++;
> > +#endif
> >  	}
> >  
> >  	if (!fpl->user)
> > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > index 1e6f5aaf1cc9..bbad3959751d 100644
> > --- a/net/unix/af_unix.c
> > +++ b/net/unix/af_unix.c
> > @@ -1925,11 +1925,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
> >  	long timeo;
> >  	int err;
> >  
> > -	wait_for_unix_gc();
> >  	err = scm_send(sock, msg, &scm, false);
> >  	if (err < 0)
> >  		return err;
> >  
> > +	wait_for_unix_gc(scm.fp);
> > +
> >  	err = -EOPNOTSUPP;
> >  	if (msg->msg_flags&MSG_OOB)
> >  		goto out;
> > @@ -2201,11 +2202,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
> >  	bool fds_sent = false;
> >  	int data_len;
> >  
> > -	wait_for_unix_gc();
> >  	err = scm_send(sock, msg, &scm, false);
> >  	if (err < 0)
> >  		return err;
> >  
> > +	wait_for_unix_gc(scm.fp);
> > +
> >  	err = -EOPNOTSUPP;
> >  	if (msg->msg_flags & MSG_OOB) {
> >  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
> > diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> > index 8bc93a7e745f..73091d6b7fc4 100644
> > --- a/net/unix/garbage.c
> > +++ b/net/unix/garbage.c
> > @@ -184,8 +184,9 @@ static void inc_inflight_move_tail(struct unix_sock *u)
> >  }
> >  
> >  #define UNIX_INFLIGHT_TRIGGER_GC 16000
> > +#define UNIX_INFLIGHT_SANE_USER 32
> 
> I don't have any relevant usage stats for unix sockets, but out of
> sheer ignorance on my side '32' looks a bit low. Why/how did you pick
> such value?

My take was that the peer should receive the fds in timely manner so that
no one will be punished, but I admit 32 is small enough, which can be
reached by a single SCM_RIGHTS (SCM_MAX_FD == 253) cmsg.  So, probably we
can bump it up to 1024 or 2048 (> (4 or 8) * SCM_MAX_FD).


> > -void wait_for_unix_gc(void)
> > +void wait_for_unix_gc(struct scm_fp_list *fpl)
> >  {
> >  	/* If number of inflight sockets is insane, kick a
> >  	 * garbage collect right now.
> > @@ -195,7 +196,12 @@ void wait_for_unix_gc(void)
> >  	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC)
> >  		queue_work(system_unbound_wq, &unix_gc_work);
> >  
> > -	flush_work(&unix_gc_work);
> > +	/* Penalise users who want to send AF_UNIX sockets
> > +	 * but whose sockets have not been received yet.
> > +	 */
> > +	if (fpl && fpl->count_unix &&
> > +	    READ_ONCE(fpl->user->unix_inflight) > UNIX_INFLIGHT_SANE_USER)
> > +		flush_work(&unix_gc_work);
> 
> flush_work() will be called even when 'unix_tot_inflight' is (much)
> less then 'UNIX_INFLIGHT_TRIGGER_GC'. Could that cause some regressions
> for workload with moderated numbers of fd in flights, where the GC was
> never triggered before this series?

Ah exactly, I'll add work_pending() in v3.

	if (!fpl || !fpl->count_unix)
		return

	if (work_pending(&unix_gc_work) &&
	    READ_ONCE(fpl->user->unix_inflight) > UNIX_INFLIGHT_SANE_USER)
		flush_work(&unix_gc_work)

Thanks!

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 0/4] af_unix: Random improvements for GC.
  2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
                   ` (3 preceding siblings ...)
  2023-11-23  1:47 ` [PATCH v2 net-next 4/4] af_unix: Try to run GC async Kuniyuki Iwashima
@ 2023-11-29  0:47 ` Ivan Babrou
  4 siblings, 0 replies; 12+ messages in thread
From: Ivan Babrou @ 2023-11-29  0:47 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Kuniyuki Iwashima, netdev

On Wed, Nov 22, 2023 at 5:48 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> If more than 16000 inflight AF_UNIX sockets exist on a host, each
> sendmsg() will be forced to wait for unix_gc() even if a process
> is not sending any FD.
>
> This series tries not to impose such a penalty on sane users.
>
>
> Changes:
>   v2:
>     Patch 4: Fix build error when CONFIG_UNIX=n (kernel test robot)
>
>   v1: https://lore.kernel.org/netdev/20231122013629.28554-1-kuniyu@amazon.com/
>
>
> Kuniyuki Iwashima (4):
>   af_unix: Do not use atomic ops for unix_sk(sk)->inflight.
>   af_unix: Return struct unix_sock from unix_get_socket().
>   af_unix: Run GC on only one CPU.
>   af_unix: Try to run GC async.
>
>  include/linux/io_uring.h |  4 +-
>  include/net/af_unix.h    |  6 +--
>  include/net/scm.h        |  1 +
>  io_uring/io_uring.c      |  5 ++-
>  net/core/scm.c           |  5 +++
>  net/unix/af_unix.c       | 10 +++--
>  net/unix/garbage.c       | 84 ++++++++++++++++++----------------------
>  net/unix/scm.c           | 34 ++++++++--------
>  8 files changed, 74 insertions(+), 75 deletions(-)
>
> --
> 2.30.2
>

LGTM. Not sure if reviewed-by is warranted from me, but you can at
least take tested-by.

Tested-by: Ivan Babrou <ivan@cloudflare.com>

Thank you for working on it.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight.
  2023-11-23  1:47 ` [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight Kuniyuki Iwashima
@ 2023-12-01  9:34   ` Simon Horman
  0 siblings, 0 replies; 12+ messages in thread
From: Simon Horman @ 2023-12-01  9:34 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Ivan Babrou, Kuniyuki Iwashima, netdev

On Wed, Nov 22, 2023 at 05:47:44PM -0800, Kuniyuki Iwashima wrote:
> When touching unix_sk(sk)->inflight, we are always under
> spin_lock(&unix_gc_lock).
> 
> Let's convert unix_sk(sk)->inflight to the normal unsigned long.
> 
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Thanks Iwashima-san,

I agree with your analysis. This looks good to me.

Reviewed-by: Simon Horman <horms@kernel.org>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket().
  2023-11-23  1:47 ` [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket() Kuniyuki Iwashima
  2023-11-27  9:08   ` Paolo Abeni
@ 2023-12-01  9:35   ` Simon Horman
  1 sibling, 0 replies; 12+ messages in thread
From: Simon Horman @ 2023-12-01  9:35 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Ivan Babrou, Kuniyuki Iwashima, netdev

On Wed, Nov 22, 2023 at 05:47:45PM -0800, Kuniyuki Iwashima wrote:
> Currently, unix_get_socket() returns struct sock, but after calling
> it, we always cast it to unix_sk().
> 
> Let's return struct unix_sock from unix_get_socket().
> 
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Thanks Iwashima-san,

this looks like a nice clean-up to me.

Reviewed-by: Simon Horman <horms@kernel.org>


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-12-01  9:36 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-23  1:47 [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Kuniyuki Iwashima
2023-11-23  1:47 ` [PATCH v2 net-next 1/4] af_unix: Do not use atomic ops for unix_sk(sk)->inflight Kuniyuki Iwashima
2023-12-01  9:34   ` Simon Horman
2023-11-23  1:47 ` [PATCH v2 net-next 2/4] af_unix: Return struct unix_sock from unix_get_socket() Kuniyuki Iwashima
2023-11-27  9:08   ` Paolo Abeni
2023-11-27 12:33     ` Pavel Begunkov
2023-12-01  9:35   ` Simon Horman
2023-11-23  1:47 ` [PATCH v2 net-next 3/4] af_unix: Run GC on only one CPU Kuniyuki Iwashima
2023-11-23  1:47 ` [PATCH v2 net-next 4/4] af_unix: Try to run GC async Kuniyuki Iwashima
2023-11-27  9:59   ` Paolo Abeni
2023-11-27 23:00     ` Kuniyuki Iwashima
2023-11-29  0:47 ` [PATCH v2 net-next 0/4] af_unix: Random improvements for GC Ivan Babrou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).