All of lore.kernel.org
 help / color / mirror / Atom feed
From: "NeilBrown" <neilb@suse.de>
To: Al Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Jens Axboe <axboe@kernel.dk>, Oleg Nesterov <oleg@redhat.com>,
	Chuck Lever <chuck.lever@oracle.com>,
	Jeff Layton <jlayton@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>
Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nfs@vger.kernel.org
Subject: [PATCH/RFC] core/nfsd: allow kernel threads to use task_work.
Date: Tue, 28 Nov 2023 09:05:21 +1100	[thread overview]
Message-ID: <170112272125.7109.6245462722883333440@noble.neil.brown.name> (raw)


I have evidence from a customer site of 256 nfsd threads adding files to
delayed_fput_lists nearly twice as fast they are retired by a single
work-queue thread running delayed_fput().  As you might imagine this
does not end well (20 million files in the queue at the time a snapshot
was taken for analysis).

While this might point to a problem with the filesystem not handling the
final close efficiently, such problems should only hurt throughput, not
lead to memory exhaustion.

For normal threads, the thread that closes the file also calls the
final fput so there is natural rate limiting preventing excessive growth
in the list of delayed fputs.  For kernel threads, and particularly for
nfsd, delayed in the final fput do not impose any throttling to prevent
the thread from closing more files.

A simple way to fix this is to treat nfsd threads like normal processes
for task_work.  Thus the pending files are queued for the thread, and
the same thread finishes the work.

Currently KTHREADs are assumed never to call task_work_run().  With this
patch that it still the default but it is implemented by storing the
magic value TASK_WORKS_DISABLED in ->task_works.  If a kthread, such as
nfsd, will call task_work_run() periodically, it sets ->task_works
to NULL to indicate this.

Signed-off-by: NeilBrown <neilb@suse.de>
---

I wonder which tree this should go through assuming everyone likes it.
VFS maybe??

Thanks.

 fs/file_table.c           | 2 +-
 fs/nfsd/nfssvc.c          | 4 ++++
 include/linux/sched.h     | 1 +
 include/linux/task_work.h | 4 +++-
 kernel/fork.c             | 2 +-
 kernel/task_work.c        | 7 ++++---
 6 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index de4a2915bfd4..e79351df22be 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -445,7 +445,7 @@ void fput(struct file *file)
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		struct task_struct *task = current;
 
-		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+		if (likely(!in_interrupt())) {
 			init_task_work(&file->f_rcuhead, ____fput);
 			if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
 				return;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 66ca50b38b27..c047961262ca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -13,6 +13,7 @@
 #include <linux/fs_struct.h>
 #include <linux/swap.h>
 #include <linux/siphash.h>
+#include <linux/task_work.h>
 
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svcsock.h>
@@ -941,6 +942,7 @@ nfsd(void *vrqstp)
 	}
 
 	current->fs->umask = 0;
+	current->task_works = NULL; /* Declare that I will call task_work_run() */
 
 	atomic_inc(&nfsdstats.th_cnt);
 
@@ -955,6 +957,8 @@ nfsd(void *vrqstp)
 
 		svc_recv(rqstp);
 		validate_process_creds();
+		if (task_work_pending(current))
+			task_work_run();
 	}
 
 	atomic_dec(&nfsdstats.th_cnt);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..c63c2bedbf71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,6 +1117,7 @@ struct task_struct {
 	unsigned int			sas_ss_flags;
 
 	struct callback_head		*task_works;
+#define	TASK_WORKS_DISABLED	((void*)1)
 
 #ifdef CONFIG_AUDIT
 #ifdef CONFIG_AUDITSYSCALL
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 795ef5a68429..3c74e3de81ed 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -22,7 +22,9 @@ enum task_work_notify_mode {
 
 static inline bool task_work_pending(struct task_struct *task)
 {
-	return READ_ONCE(task->task_works);
+	struct callback_head *works = READ_ONCE(task->task_works);
+
+	return works && works != TASK_WORKS_DISABLED;
 }
 
 int task_work_add(struct task_struct *task, struct callback_head *twork,
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..903b29804fe1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2577,7 +2577,7 @@ __latent_entropy struct task_struct *copy_process(
 	p->dirty_paused_when = 0;
 
 	p->pdeath_signal = 0;
-	p->task_works = NULL;
+	p->task_works = args->kthread ? TASK_WORKS_DISABLED : NULL;
 	clear_posix_cputimers_work(p);
 
 #ifdef CONFIG_KRETPROBES
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95a7e1b7f1da..ffdf4b0d7a0e 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -49,7 +49,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 
 	head = READ_ONCE(task->task_works);
 	do {
-		if (unlikely(head == &work_exited))
+		if (unlikely(head == &work_exited ||
+			     head == TASK_WORKS_DISABLED))
 			return -ESRCH;
 		work->next = head;
 	} while (!try_cmpxchg(&task->task_works, &head, work));
@@ -157,7 +158,7 @@ void task_work_run(void)
 		work = READ_ONCE(task->task_works);
 		do {
 			head = NULL;
-			if (!work) {
+			if (!work || work == TASK_WORKS_DISABLED) {
 				if (task->flags & PF_EXITING)
 					head = &work_exited;
 				else
@@ -165,7 +166,7 @@ void task_work_run(void)
 			}
 		} while (!try_cmpxchg(&task->task_works, &work, head));
 
-		if (!work)
+		if (!work || work == TASK_WORKS_DISABLED)
 			break;
 		/*
 		 * Synchronize with task_work_cancel(). It can not remove
-- 
2.42.1


             reply	other threads:[~2023-11-27 22:05 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-27 22:05 NeilBrown [this message]
2023-11-27 22:30 ` [PATCH/RFC] core/nfsd: allow kernel threads to use task_work Al Viro
2023-11-27 22:43   ` NeilBrown
2023-11-27 22:59 ` Chuck Lever
2023-11-28  0:16   ` NeilBrown
2023-11-28  1:37     ` Chuck Lever
2023-11-28  2:57       ` NeilBrown
2023-11-28 15:34         ` Chuck Lever
2023-11-30 17:50           ` Jeff Layton
2023-11-28 13:51     ` Christian Brauner
2023-11-28 14:15       ` Jeff Layton
2023-11-28 15:22         ` Chuck Lever
2023-11-28 23:31         ` NeilBrown
2023-11-28 23:20       ` NeilBrown
2023-11-29 11:43         ` Christian Brauner
2023-12-04  1:30           ` NeilBrown
2023-11-29 14:04         ` Chuck Lever
2023-11-30 17:47           ` Jeff Layton
2023-11-30 18:07             ` Chuck Lever
2023-11-30 18:33               ` Jeff Layton
2023-11-28 11:24 ` Christian Brauner
2023-11-28 13:52   ` Oleg Nesterov
2023-11-28 15:33     ` Christian Brauner
2023-11-28 16:59       ` Oleg Nesterov
2023-11-28 17:29         ` Oleg Nesterov
2023-11-28 23:40           ` NeilBrown
2023-11-29 11:38           ` Christian Brauner
2023-11-28 14:01 ` Oleg Nesterov
2023-11-28 14:20   ` Oleg Nesterov
2023-11-29  0:14   ` NeilBrown
2023-11-29  7:55     ` Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=170112272125.7109.6245462722883333440@noble.neil.brown.name \
    --to=neilb@suse.de \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=jlayton@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=oleg@redhat.com \
    --cc=peterz@infradead.org \
    --cc=vincent.guittot@linaro.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.