linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Suren Baghdasaryan <surenb@google.com>
To: surenb@google.com
Cc: akpm@linux-foundation.org, mhocko@kernel.org,
	rientjes@google.com, willy@infradead.org, hannes@cmpxchg.org,
	guro@fb.com, riel@surriel.com, minchan@kernel.org,
	christian@brauner.io, oleg@redhat.com, timmurray@google.com,
	linux-api@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com
Subject: [PATCH 1/1] RFC: add pidfd_send_signal flag to reclaim mm while killing a process
Date: Fri, 13 Nov 2020 09:34:48 -0800	[thread overview]
Message-ID: <20201113173448.1863419-1-surenb@google.com> (raw)

When a process is being killed, it might be in an uninterruptible sleep,
which leads to an unpredictable delay in its memory reclaim. In low-memory
situations, when it's important to free up memory quickly, such a delay is
problematic. The kernel solves this problem with the oom-reaper thread,
which performs memory reclaim even when the victim process is not runnable.
Userspace currently lacks such a mechanism; the need for one and potential
solutions were discussed before (see links below).
This patch provides a mechanism to perform memory reclaim in the context
of the process that sends the SIGKILL signal. The new SYNC_REAP_MM flag for
the pidfd_send_signal syscall can be used only when sending SIGKILL; it
makes the caller synchronously reclaim the memory that belongs to the
victim and can be easily reclaimed.

1. https://patchwork.kernel.org/cover/10894999
2. https://lwn.net/Articles/787217
3. https://lore.kernel.org/linux-api/CAJuCfpGz1kPM3G1gZH+09Z7aoWKg05QSAMMisJ7H5MdmRrRhNQ@mail.gmail.com

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/oom.h    |  2 ++
 include/linux/signal.h |  7 ++++
 kernel/signal.c        | 73 ++++++++++++++++++++++++++++++++++++++++--
 mm/oom_kill.c          |  2 +-
 4 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..9a8dcabdfdf1 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -111,6 +111,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm);
 long oom_badness(struct task_struct *p,
 		unsigned long totalpages);
 
+extern bool task_will_free_mem(struct task_struct *task);
+
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b256f9c65661..5deafc99035d 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -449,6 +449,13 @@ extern bool unhandled_signal(struct task_struct *tsk, int sig);
 	(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
 	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
 
+/*
+ * Flag values accepted by the pidfd_send_signal syscall:
+ *
+ * SYNC_REAP_MM requests reclaim of the target's mm after sending SIGKILL.
+ */
+#define SYNC_REAP_MM	0x1
+
 void signals_init(void);
 
 int restore_altstack(const stack_t __user *);
diff --git a/kernel/signal.c b/kernel/signal.c
index ef8f2a28d37c..15d4be5600a3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -46,6 +46,7 @@
 #include <linux/livepatch.h>
 #include <linux/cgroup.h>
 #include <linux/audit.h>
+#include <linux/oom.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -3711,6 +3712,63 @@ static struct pid *pidfd_to_pid(const struct file *file)
 	return tgid_pidfd_to_pid(file);
 }
 
+static int reap_mm(struct pid *pid)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	/* Look up the task for pid; reference is dropped at release_task */
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+
+	/* Fail with -EBUSY unless task_will_free_mem() accepts this task */
+	if (!task_will_free_mem(task)) {
+		task_unlock(task);
+		ret = -EBUSY;
+		goto release_task;
+	}
+
+	/* Pin the mm (released by mmput() at drop_mm) to prevent exit_mmap */
+	mm = task->mm;
+	mmget(mm);
+
+	/* Avoid contention with the OOM-killer or a concurrent reaper */
+	if (unlikely(mm_is_oom_victim(mm)) ||
+	    unlikely(test_bit(MMF_OOM_SKIP, &mm->flags))) {
+		/* Already being reclaimed; return 0 without reaping here */
+		task_unlock(task);
+		goto drop_mm;
+	}
+	/*
+	 * Set MMF_OOM_SKIP before dropping task_lock to prevent the OOM-killer
+	 * or another pidfd_send_signal caller from considering this task
+	 */
+	set_bit(MMF_OOM_SKIP, &mm->flags);
+
+	task_unlock(task);
+
+	mmap_read_lock(mm);
+	if (!__oom_reap_task_mm(mm)) {
+		/* Partial reap: clear the skip bit so the user can retry */
+		ret = -EAGAIN;
+		clear_bit(MMF_OOM_SKIP, &mm->flags);
+	}
+	mmap_read_unlock(mm);
+
+drop_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
 /**
  * sys_pidfd_send_signal - Signal a process through a pidfd
  * @pidfd:  file descriptor of the process
@@ -3737,10 +3795,16 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	struct pid *pid;
 	kernel_siginfo_t kinfo;
 
-	/* Enforce flags be set to 0 until we add an extension. */
-	if (flags)
+	/* Enforce only valid flags. */
+	if (flags) {
+		/* Allow SYNC_REAP_MM only with SIGKILL. */
+		if (flags == SYNC_REAP_MM && sig == SIGKILL)
+			goto valid;
+
 		return -EINVAL;
+	}
 
+valid:
 	f = fdget(pidfd);
 	if (!f.file)
 		return -EBADF;
@@ -3775,6 +3839,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	}
 
 	ret = kill_pid_info(sig, &kinfo, pid);
+	if (unlikely(ret))
+		goto err;
+
+	if (flags & SYNC_REAP_MM)
+		ret = reap_mm(pid);
 
 err:
 	fdput(f);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8b84661a6410..66c90bca25bc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -808,7 +808,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
  * Caller has to make sure that task->mm is stable (hold task_lock or
  * it operates on the current).
  */
-static bool task_will_free_mem(struct task_struct *task)
+bool task_will_free_mem(struct task_struct *task)
 {
 	struct mm_struct *mm = task->mm;
 	struct task_struct *p;
-- 
2.29.2.299.gdc1121823c-goog


             reply	other threads:[~2020-11-13 17:34 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-13 17:34 Suren Baghdasaryan [this message]
2020-11-13 23:55 ` Andrew Morton
2020-11-14  0:06   ` Suren Baghdasaryan
2020-11-14  1:00     ` Andrew Morton
2020-11-14  1:09       ` Suren Baghdasaryan
2020-11-14  1:18         ` Andrew Morton
2020-11-14  1:57           ` Suren Baghdasaryan
2020-11-14  2:16             ` Andrew Morton
2020-11-14  2:51               ` Suren Baghdasaryan
2020-11-16 23:24               ` Minchan Kim
2020-11-18 19:10               ` Michal Hocko
2020-11-18 19:22                 ` Suren Baghdasaryan
2020-11-18 19:32                   ` Michal Hocko
2020-11-18 19:51                     ` Suren Baghdasaryan
2020-11-18 19:55                       ` Suren Baghdasaryan
2020-11-19  0:13                         ` Suren Baghdasaryan
2020-11-24  5:45                           ` Suren Baghdasaryan
2020-11-18 10:32   ` Christian Brauner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201113173448.1863419-1-surenb@google.com \
    --to=surenb@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=christian@brauner.io \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@android.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=minchan@kernel.org \
    --cc=oleg@redhat.com \
    --cc=riel@surriel.com \
    --cc=rientjes@google.com \
    --cc=timmurray@google.com \
    --cc=willy@infradead.org \
    --subject='Re: [PATCH 1/1] RFC: add pidfd_send_signal flag to reclaim mm while killing a process' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).