linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] selftests: vm: add process_mrelease tests
@ 2022-05-10  3:00 Suren Baghdasaryan
  2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10  3:00 UTC (permalink / raw)
  To: akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	surenb

Introduce process_mrelease syscall sanity tests. They include tests of
invalid pidfd and flags inputs, attempting to call process_mrelease
with a live process and a valid usage of process_mrelease. Because
process_mrelease has to be used against a process with a pending SIGKILL,
it's possible that the process exits before process_mrelease gets called.
In such cases we retry the test with a victim that allocates twice more
memory up to 1GB. This would require the victim process to spend more
time during exit and process_mrelease has a better chance of catching
the process before it exits.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 tools/testing/selftests/vm/Makefile        |   1 +
 tools/testing/selftests/vm/mrelease_test.c | 176 +++++++++++++++++++++
 tools/testing/selftests/vm/run_vmtests.sh  |  16 ++
 3 files changed, 193 insertions(+)
 create mode 100644 tools/testing/selftests/vm/mrelease_test.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 04a49e876a46..733fccbff0ef 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -43,6 +43,7 @@ TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += memfd_secret
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mrelease_test
 TEST_GEN_FILES += mremap_dontunmap
 TEST_GEN_FILES += mremap_test
 TEST_GEN_FILES += on-fault-limit
diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
new file mode 100644
index 000000000000..a61061bf8433
--- /dev/null
+++ b/tools/testing/selftests/vm/mrelease_test.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2022 Google LLC
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "util.h"
+
+static inline int pidfd_open(pid_t pid, unsigned int flags)
+{
+#ifdef __NR_pidfd_open
+	return syscall(__NR_pidfd_open, pid, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static inline int process_mrelease(int pidfd, unsigned int flags)
+{
+#ifdef __NR_process_mrelease
+	return syscall(__NR_process_mrelease, pidfd, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static void write_fault_pages(char *addr, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		*((unsigned long *)(addr + (i * PAGE_SIZE))) = i;
+}
+
+static int alloc_noexit(unsigned long nr_pages, int pipefd)
+{
+	int ppid = getppid();
+	void *buf;
+
+	buf = mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANON, 0, 0);
+	if (buf == MAP_FAILED) {
+		perror("mmap");
+		return 1;
+	}
+
+	write_fault_pages((char *)buf, nr_pages);
+
+	/* Signal the parent that the child is ready */
+	if (write(pipefd, "", 1) < 0) {
+		perror("write");
+		return 1;
+	}
+
+	/* Wait to be killed (when reparenting happens) */
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, nr_pages * PAGE_SIZE);
+
+	return 0;
+}
+
+
+#define MB(x) (x << 20)
+#define MAX_SIZE_MB 1024
+
+int main(void)
+{
+	int res;
+	int pipefd[2], pidfd;
+	pid_t pid;
+	char byte;
+	size_t size;
+	int negative_tests_done = 0;
+
+	/* Test a wrong pidfd */
+	if (!process_mrelease(-1, 0) || errno != EBADF) {
+		perror("process_mrelease with wring pidfd");
+		exit(1);
+	}
+
+	/*
+	 * Start the test with 1MB allocation and double every time
+	 * process_mrelease fails
+	 */
+	for (size = 1; size <= MAX_SIZE_MB; size *= 2) {
+		/*
+		 * Pipe for the child to signal when it's done allocating
+		 * memory
+		 */
+		if (pipe(pipefd)) {
+			perror("pipe");
+			exit(1);
+		}
+		pid = fork();
+		if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+
+		if (pid == 0) {
+			close(pipefd[0]);
+			res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
+			close(pipefd[1]);
+			exit(res);
+		}
+
+		close(pipefd[1]);
+		/* Block until the child is ready */
+		res = read(pipefd[0], &byte, 1);
+		close(pipefd[0]);
+		if (res < 0) {
+			perror("read");
+			exit(1);
+		}
+
+		pidfd = pidfd_open(pid, 0);
+		if (pidfd < 0) {
+			perror("pidfd_open");
+			exit(1);
+		}
+
+		/* Run negative tests which require a valid child only once */
+		if (!negative_tests_done) {
+			/* Test invalid flags */
+			if (!process_mrelease(pidfd, (unsigned int)-1) ||
+			    errno != EINVAL) {
+				perror("process_mrelease with wrong flags");
+				exit(1);
+			}
+			/* Test reapling while process is still alive */
+			if (!process_mrelease(pidfd, 0) ||
+			    errno != EINVAL) {
+				perror("process_mrelease on a live process");
+				exit(1);
+			}
+			negative_tests_done = 1;
+		}
+
+		if (kill(pid, SIGKILL)) {
+			perror("kill");
+			exit(1);
+		}
+
+		if (!process_mrelease(pidfd, 0)) {
+			/* Terminate the test once process_mrelease succeeds */
+			return 0;
+		}
+
+		/*
+		 * Ignore the failure if the child exited before mrelease got
+		 * called, increase allocation size and retry the test
+		 */
+		if (errno != ESRCH) {
+			perror("process_mrelease");
+			exit(1);
+		}
+
+		if (waitpid(pid, NULL, 0) < 0) {
+			perror("waitpid");
+			exit(1);
+		}
+		close(pidfd);
+	}
+
+	printf("All process_mrelease attempts failed!\n");
+	exit(1);
+}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index 352ba00cf26b..1986162fea39 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -287,6 +287,22 @@ else
 	echo "[PASS]"
 fi
 
+echo "---------------------"
+echo "running mrelease_test"
+echo "---------------------"
+./mrelease_test
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+	echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+	 echo "[SKIP]"
+	 exitcode=$ksft_skip
+else
+	echo "[FAIL]"
+	exitcode=1
+fi
+
 echo "-------------------"
 echo "running mremap_test"
 echo "-------------------"
-- 
2.36.0.512.ge40c2bad7a-goog


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10  3:00 [PATCH 1/3] selftests: vm: add process_mrelease tests Suren Baghdasaryan
@ 2022-05-10  3:00 ` Suren Baghdasaryan
  2022-05-10 13:05   ` Michal Hocko
  2022-05-10 15:46   ` Shuah Khan
  2022-05-10  3:00 ` [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag Suren Baghdasaryan
  2022-05-10 15:43 ` [PATCH 1/3] selftests: vm: add process_mrelease tests Shuah Khan
  2 siblings, 2 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10  3:00 UTC (permalink / raw)
  To: akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	surenb

With the oom-killer being able to operate on locked pages, exit_mmap
does not need to ensure that oom_reap_task_mm is done before it can
proceed. Instead it can rely on mmap_lock write lock to prevent
oom-killer from operating on the vma tree while it's freeing page
tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
and then take mmap_lock write lock before freeing page tables.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/oom.h |  2 --
 mm/mmap.c           | 25 ++++++-------------------
 mm/oom_kill.c       |  2 +-
 3 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..6cdf0772dbae 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
-bool __oom_reap_task_mm(struct mm_struct *mm);
-
 long oom_badness(struct task_struct *p,
 		unsigned long totalpages);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 313b57d55a63..feaa840fb95d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3105,30 +3105,13 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
-	if (unlikely(mm_is_oom_victim(mm))) {
-		/*
-		 * Manually reap the mm to free as much memory as possible.
-		 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
-		 * this mm from further consideration.  Taking mm->mmap_lock for
-		 * write after setting MMF_OOM_SKIP will guarantee that the oom
-		 * reaper will not run on this mm again after mmap_lock is
-		 * dropped.
-		 *
-		 * Nothing can be holding mm->mmap_lock here and the above call
-		 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
-		 * __oom_reap_task_mm() will not block.
-		 */
-		(void)__oom_reap_task_mm(mm);
-		set_bit(MMF_OOM_SKIP, &mm->flags);
-	}
-
-	mmap_write_lock(mm);
+	mmap_read_lock(mm);
 	arch_exit_mmap(mm);
 
 	vma = mm->mmap;
 	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
-		mmap_write_unlock(mm);
+		mmap_read_unlock(mm);
 		return;
 	}
 
@@ -3138,6 +3121,10 @@ void exit_mmap(struct mm_struct *mm)
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
+	mmap_read_unlock(mm);
+	/* Set MMF_OOM_SKIP to disregard this mm from further consideration.*/
+	set_bit(MMF_OOM_SKIP, &mm->flags);
+	mmap_write_lock(mm);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 49d7df39b02d..36355b162727 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -509,7 +509,7 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 	bool ret = true;
-- 
2.36.0.512.ge40c2bad7a-goog


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag
  2022-05-10  3:00 [PATCH 1/3] selftests: vm: add process_mrelease tests Suren Baghdasaryan
  2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
@ 2022-05-10  3:00 ` Suren Baghdasaryan
  2022-05-10 13:08   ` Michal Hocko
  2022-05-10 15:51   ` Shuah Khan
  2022-05-10 15:43 ` [PATCH 1/3] selftests: vm: add process_mrelease tests Shuah Khan
  2 siblings, 2 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10  3:00 UTC (permalink / raw)
  To: akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	surenb

With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
now unused and can be removed.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/oom.h            | 9 ---------
 include/linux/sched/coredump.h | 1 -
 mm/oom_kill.c                  | 4 +---
 3 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6cdf0772dbae..25990e9d9e15 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
-/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
-	return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d9e3a656875..746f6cb07a20 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -70,7 +70,6 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
-#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
 #define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
 #define MMF_MULTIPROCESS	27	/* mm is shared between processes */
 /*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 36355b162727..11291b99599f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -732,10 +732,8 @@ static void mark_oom_victim(struct task_struct *tsk)
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
 		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
-- 
2.36.0.512.ge40c2bad7a-goog


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
@ 2022-05-10 13:05   ` Michal Hocko
  2022-05-10 16:31     ` Suren Baghdasaryan
  2022-05-10 15:46   ` Shuah Khan
  1 sibling, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2022-05-10 13:05 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team

On Mon 09-05-22 20:00:13, Suren Baghdasaryan wrote:
> With the oom-killer being able to operate on locked pages, exit_mmap
> does not need to ensure that oom_reap_task_mm is done before it can
> proceed. Instead it can rely on mmap_lock write lock to prevent
> oom-killer from operating on the vma tree while it's freeing page
> tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> and then take mmap_lock write lock before freeing page tables.

The changelog is rather light on nasty details which might be good but
for the sake of our future us let's be more verbose so that we do not
have to reinvent the prior history each time we are looking into this
code. I would go with something like this instead:
"
The primary reason to invoke the oom reaper from the exit_mmap path used
to be a prevention of an excessive oom killing if the oom victim exit
races with the oom reaper (see 212925802454 ("mm: oom: let oom_reap_task
and exit_mmap run concurrently") for more details. The invocation has
moved around since then because of the interaction with the munlock
logic but the underlying reason has remained the same (see 27ae357fa82b
("mm, oom: fix concurrent munlock and oom reaper unmap, v3").

Munlock code is no longer a problem since a213e5cf71cb ("mm/munlock:
delete munlock_vma_pages_all(), allow oomreap") and there shouldn't be
any blocking operation before the memory is unmapped by exit_mmap so
the oom reaper invocation can be dropped. The unmapping part can be done
with the non-exclusive mmap_sem and the exclusive one is only required
when page tables are freed.

Remove the oom_reaper from exit_mmap which will make the code easier to
read. This is really unlikely to make any observable difference although
some microbenchmarks could benefit from one less branch that needs to be
evaluated even though it almost never is true.
"

One minor comment below. Other than that \o/ this is finally going away.
I strongly suspect that the history of this code is a nice example about how
over optimizing code can cause more harm than good.

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/oom.h |  2 --
>  mm/mmap.c           | 25 ++++++-------------------
>  mm/oom_kill.c       |  2 +-
>  3 files changed, 7 insertions(+), 22 deletions(-)
> 
[...]
> @@ -3138,6 +3121,10 @@ void exit_mmap(struct mm_struct *mm)
>  	/* update_hiwater_rss(mm) here? but nobody should be looking */
>  	/* Use -1 here to ensure all VMAs in the mm are unmapped */
>  	unmap_vmas(&tlb, vma, 0, -1);
> +	mmap_read_unlock(mm);
> +	/* Set MMF_OOM_SKIP to disregard this mm from further consideration.*/
> +	set_bit(MMF_OOM_SKIP, &mm->flags);

I think that it would be slightly more readable to add an empty line
above and below of this. Also the comment would be more helpful if it
explaind what the further consideration actually means. I would go with

	/*
	 * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
	 * because the memory has been already freed. Do not bother
	 * checking mm_is_oom_victim because setting a bit
	 * unconditionally is just cheaper.
	 */

> +	mmap_write_lock(mm);
>  	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
>  	tlb_finish_mmu(&tlb);

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag
  2022-05-10  3:00 ` [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag Suren Baghdasaryan
@ 2022-05-10 13:08   ` Michal Hocko
  2022-05-16  2:46     ` Suren Baghdasaryan
  2022-05-10 15:51   ` Shuah Khan
  1 sibling, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2022-05-10 13:08 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team

On Mon 09-05-22 20:00:14, Suren Baghdasaryan wrote:
> With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> now unused and can be removed.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

LGTM
Acked-by: Michal Hocko <mhocko@suse.com>

One question below
[...]
> diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> index 4d9e3a656875..746f6cb07a20 100644
> --- a/include/linux/sched/coredump.h
> +++ b/include/linux/sched/coredump.h
> @@ -70,7 +70,6 @@ static inline int get_dumpable(struct mm_struct *mm)
>  #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
>  #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
>  #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
> -#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
>  #define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
>  #define MMF_MULTIPROCESS	27	/* mm is shared between processes */

Have you consider renumbering the follow up flags so that we do not have
holes in there. Nothing really important but it can confuse somebody in
the future.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] selftests: vm: add process_mrelease tests
  2022-05-10  3:00 [PATCH 1/3] selftests: vm: add process_mrelease tests Suren Baghdasaryan
  2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
  2022-05-10  3:00 ` [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag Suren Baghdasaryan
@ 2022-05-10 15:43 ` Shuah Khan
  2022-05-10 16:29   ` Suren Baghdasaryan
  2 siblings, 1 reply; 17+ messages in thread
From: Shuah Khan @ 2022-05-10 15:43 UTC (permalink / raw)
  To: Suren Baghdasaryan, akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	Shuah Khan

On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> Introduce process_mrelease syscall sanity tests. They include tests of
> invalid pidfd and flags inputs, attempting to call process_mrelease
> with a live process and a valid usage of process_mrelease. Because
> process_mrelease has to be used against a process with a pending SIGKILL,
> it's possible that the process exits before process_mrelease gets called.
> In such cases we retry the test with a victim that allocates twice more
> memory up to 1GB. This would require the victim process to spend more
> time during exit and process_mrelease has a better chance of catching
> the process before it exits.
> 

+1 on Mike's comments on improving the change log. List what is getting
tested as opposed to describing the test code.

> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>   tools/testing/selftests/vm/Makefile        |   1 +
>   tools/testing/selftests/vm/mrelease_test.c | 176 +++++++++++++++++++++
>   tools/testing/selftests/vm/run_vmtests.sh  |  16 ++
>   3 files changed, 193 insertions(+)
>   create mode 100644 tools/testing/selftests/vm/mrelease_test.c

Please update .gitignore with the new executable.

> 
> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> index 04a49e876a46..733fccbff0ef 100644
> --- a/tools/testing/selftests/vm/Makefile
> +++ b/tools/testing/selftests/vm/Makefile
> @@ -43,6 +43,7 @@ TEST_GEN_FILES += map_populate
>   TEST_GEN_FILES += memfd_secret
>   TEST_GEN_FILES += mlock-random-test
>   TEST_GEN_FILES += mlock2-tests
> +TEST_GEN_FILES += mrelease_test
>   TEST_GEN_FILES += mremap_dontunmap
>   TEST_GEN_FILES += mremap_test
>   TEST_GEN_FILES += on-fault-limit
> diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
> new file mode 100644
> index 000000000000..a61061bf8433
> --- /dev/null
> +++ b/tools/testing/selftests/vm/mrelease_test.c
> @@ -0,0 +1,176 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2022 Google LLC
> + */
> +#define _GNU_SOURCE
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +
> +#include "util.h"
> +
> +static inline int pidfd_open(pid_t pid, unsigned int flags)
> +{
> +#ifdef __NR_pidfd_open
> +	return syscall(__NR_pidfd_open, pid, flags);
> +#else
> +	errno = ENOSYS;

This isn't an error - this would be skip because this syscall
isn't supported.

> +	return -1;
> +#endif

Key off of syscall return instead of these ifdefs - same comment
on all of the ifdefs

> +}
> +

I am not seeing any reason for breaking this code up have a separate
routine for pidfd_open().

> +static inline int process_mrelease(int pidfd, unsigned int flags)
> +{
> +#ifdef __NR_process_mrelease
> +	return syscall(__NR_process_mrelease, pidfd, flags);
> +#else
> +	errno = ENOSYS;
> +	return -1;
> +#endif> +}
> +

Same comments on ifdefs and skips here as well.

> +static void write_fault_pages(char *addr, unsigned long nr_pages)
> +{
> +	unsigned long i;
> +
> +	for (i = 0; i < nr_pages; i++)
> +		*((unsigned long *)(addr + (i * PAGE_SIZE))) = i;
> +}
> +
> +static int alloc_noexit(unsigned long nr_pages, int pipefd)
> +{
> +	int ppid = getppid();
> +	void *buf;
> +
> +	buf = mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
> +		   MAP_PRIVATE | MAP_ANON, 0, 0);
> +	if (buf == MAP_FAILED) {
> +		perror("mmap");

A bit more descriptive message what the test would do will be helpful.
Also consider if this should be a skip or fail for the test.

> +		return 1;
> +	}
> +
> +	write_fault_pages((char *)buf, nr_pages);
> +
> +	/* Signal the parent that the child is ready */
> +	if (write(pipefd, "", 1) < 0) {
> +		perror("write");
> +		return 1;
> +	}
> +
> +	/* Wait to be killed (when reparenting happens) */
> +	while (getppid() == ppid)
> +		sleep(1);
> +

What happens if reparenting doesn't happen? Will this loop for ever?
This test could hang?

> +	munmap(buf, nr_pages * PAGE_SIZE);
> +
> +	return 0;
> +}
> +
> +
> +#define MB(x) (x << 20)
> +#define MAX_SIZE_MB 1024
> +
> +int main(void)
> +{
> +	int res;
> +	int pipefd[2], pidfd;
> +	pid_t pid;
> +	char byte;
> +	size_t size;
> +	int negative_tests_done = 0;
> +
> +	/* Test a wrong pidfd */
> +	if (!process_mrelease(-1, 0) || errno != EBADF) {
> +		perror("process_mrelease with wring pidfd");

Incorrect spelling "wring/wrong"

> +		exit(1);
> +	}
> +
> +	/*
> +	 * Start the test with 1MB allocation and double every time
> +	 * process_mrelease fails
> +	 */
> +	for (size = 1; size <= MAX_SIZE_MB; size *= 2) {
> +		/*
> +		 * Pipe for the child to signal when it's done allocating
> +		 * memory
> +		 */
> +		if (pipe(pipefd)) {
> +			perror("pipe");
> +			exit(1);
> +		}
> +		pid = fork();
> +		if (pid < 0) {
> +			perror("fork");

Close the pipe?

> +			exit(1);
> +		}
> +
> +		if (pid == 0) {
> +			close(pipefd[0]);
> +			res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
> +			close(pipefd[1]);
> +			exit(res);
> +		}
> +
> +		close(pipefd[1]);
> +		/* Block until the child is ready */
> +		res = read(pipefd[0], &byte, 1);
> +		close(pipefd[0]);
> +		if (res < 0) {
> +			perror("read");
> +			exit(1);
> +		}
> +
> +		pidfd = pidfd_open(pid, 0);
> +		if (pidfd < 0) {
> +			perror("pidfd_open");
> +			exit(1);
> +		}
> +

The code is very hard to read. Add comments to indicate parent and child
paths clearly so reviewers can follow the logic and be able to do effective
review.

> +		/* Run negative tests which require a valid child only once */
> +		if (!negative_tests_done) {
> +			/* Test invalid flags */
> +			if (!process_mrelease(pidfd, (unsigned int)-1) ||
> +			    errno != EINVAL) {
> +				perror("process_mrelease with wrong flags");
> +				exit(1);

So is this an expected fail or a test fail?

> +			}
> +			/* Test reapling while process is still alive */
> +			if (!process_mrelease(pidfd, 0) ||
> +			    errno != EINVAL) {
> +				perror("process_mrelease on a live process");

So is this an expected fail or a test fail?

> +				exit(1);
> +			}
> +			negative_tests_done = 1;
> +		}

Now the above negative_tests_done block could be in a separate function ---
All the others aren't really needed. It will be good for abstraction and
readability.

> +
> +		if (kill(pid, SIGKILL)) {
> +			perror("kill");

Include test results in the change log - so we can see the test report.

> +			exit(1);
> +		}
> +
> +		if (!process_mrelease(pidfd, 0)) {
> +			/* Terminate the test once process_mrelease succeeds */
> +			return 0;
> +		}
> +
> +		/*
> +		 * Ignore the failure if the child exited before mrelease got
> +		 * called, increase allocation size and retry the test
> +		 */

Add more info. on why allocating more memory helps.

> +		if (errno != ESRCH) {
> +			perror("process_mrelease");
> +			exit(1);
> +		}
> +
> +		if (waitpid(pid, NULL, 0) < 0) {
> +			perror("waitpid");
> +			exit(1);
> +		}
> +		close(pidfd);
> +	}
> +
> +	printf("All process_mrelease attempts failed!\n");
> +	exit(1);
> +}
> diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
> index 352ba00cf26b..1986162fea39 100755
> --- a/tools/testing/selftests/vm/run_vmtests.sh
> +++ b/tools/testing/selftests/vm/run_vmtests.sh
> @@ -287,6 +287,22 @@ else
>   	echo "[PASS]"
>   fi
>   
> +echo "---------------------"
> +echo "running mrelease_test"
> +echo "---------------------"
> +./mrelease_test
> +ret_val=$?
> +
> +if [ $ret_val -eq 0 ]; then
> +	echo "[PASS]"
> +elif [ $ret_val -eq $ksft_skip ]; then
> +	 echo "[SKIP]"
> +	 exitcode=$ksft_skip
> +else
> +	echo "[FAIL]"
> +	exitcode=1
> +fi
> +
>   echo "-------------------"
>   echo "running mremap_test"
>   echo "-------------------"
> 

In general, the code flow is hard to read to make sure resources
are released e.g: pipefd in all the error paths. The code is broken
up into smaller chunks where it isn't needed in some cases and left
as a large block when it could benefit from abstraction e.g: negative
test block.

Please make changes and send v2.

thanks,
-- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
  2022-05-10 13:05   ` Michal Hocko
@ 2022-05-10 15:46   ` Shuah Khan
  2022-05-10 16:35     ` Suren Baghdasaryan
  1 sibling, 1 reply; 17+ messages in thread
From: Shuah Khan @ 2022-05-10 15:46 UTC (permalink / raw)
  To: Suren Baghdasaryan, akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	Shuah Khan

On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> With the oom-killer being able to operate on locked pages, exit_mmap
> does not need to ensure that oom_reap_task_mm is done before it can
> proceed. Instead it can rely on mmap_lock write lock to prevent
> oom-killer from operating on the vma tree while it's freeing page
> tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> and then take mmap_lock write lock before freeing page tables.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>   include/linux/oom.h |  2 --
>   mm/mmap.c           | 25 ++++++-------------------
>   mm/oom_kill.c       |  2 +-
>   3 files changed, 7 insertions(+), 22 deletions(-)
> 

How does this improve the test? Include the information on why this
change is needed as opposed describing what this does?

thanks,
-- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag
  2022-05-10  3:00 ` [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag Suren Baghdasaryan
  2022-05-10 13:08   ` Michal Hocko
@ 2022-05-10 15:51   ` Shuah Khan
  2022-05-10 16:10     ` Suren Baghdasaryan
  1 sibling, 1 reply; 17+ messages in thread
From: Shuah Khan @ 2022-05-10 15:51 UTC (permalink / raw)
  To: Suren Baghdasaryan, akpm
  Cc: mhocko, rientjes, willy, hannes, guro, minchan, kirill, aarcange,
	brauner, hch, oleg, david, jannh, shakeelb, peterx, jhubbard,
	shuah, linux-kernel, linux-mm, linux-kselftest, kernel-team,
	Shuah Khan

On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> now unused and can be removed.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>   include/linux/oom.h            | 9 ---------
>   include/linux/sched/coredump.h | 1 -
>   mm/oom_kill.c                  | 4 +---
>   3 files changed, 1 insertion(+), 13 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 6cdf0772dbae..25990e9d9e15 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
>   	return tsk->signal->oom_mm;
>   }
>   
> -/*
> - * Use this helper if tsk->mm != mm and the victim mm needs a special
> - * handling. This is guaranteed to stay true after once set.
> - */
> -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> -{
> -	return test_bit(MMF_OOM_VICTIM, &mm->flags);
> -}
> -
>   /*
>    * Checks whether a page fault on the given mm is still reliable.
>    * This is no longer true if the oom reaper started to reap the
> diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> index 4d9e3a656875..746f6cb07a20 100644
> --- a/include/linux/sched/coredump.h
> +++ b/include/linux/sched/coredump.h
> @@ -70,7 +70,6 @@ static inline int get_dumpable(struct mm_struct *mm)
>   #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
>   #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
>   #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
> -#define MMF_OOM_VICTIM		25	/* mm is the oom victim */
>   #define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
>   #define MMF_MULTIPROCESS	27	/* mm is shared between processes */
>   /*
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 36355b162727..11291b99599f 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -732,10 +732,8 @@ static void mark_oom_victim(struct task_struct *tsk)
>   		return;
>   
>   	/* oom_mm is bound to the signal struct life time. */
> -	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
> +	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
>   		mmgrab(tsk->signal->oom_mm);
> -		set_bit(MMF_OOM_VICTIM, &mm->flags);
> -	}
>   
>   	/*
>   	 * Make sure that the task is woken up from uninterruptible sleep
> 

Thank you for working on the new tests and cleanups.

This series needs a cover-letter that explains why this series is needed
that includes the information from this last patch.

Please send v2 with a proper cover letter starting with why this series
is necessary. If you did that, it would have reviewers job is lot easier.

Also it appears you are combining new tests with cleanup patches. I think
patches 2/3 and 3/3 can be a separate series and the new test can be a
separate patch.

thanks,
-- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag
  2022-05-10 15:51   ` Shuah Khan
@ 2022-05-10 16:10     ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 16:10 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Andrew Morton, Michal Hocko, David Rientjes, Matthew Wilcox,
	Johannes Weiner, Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 8:51 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
>
> On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> > With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> > now unused and can be removed.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >   include/linux/oom.h            | 9 ---------
> >   include/linux/sched/coredump.h | 1 -
> >   mm/oom_kill.c                  | 4 +---
> >   3 files changed, 1 insertion(+), 13 deletions(-)
> >
> > diff --git a/include/linux/oom.h b/include/linux/oom.h
> > index 6cdf0772dbae..25990e9d9e15 100644
> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
> >       return tsk->signal->oom_mm;
> >   }
> >
> > -/*
> > - * Use this helper if tsk->mm != mm and the victim mm needs a special
> > - * handling. This is guaranteed to stay true after once set.
> > - */
> > -static inline bool mm_is_oom_victim(struct mm_struct *mm)
> > -{
> > -     return test_bit(MMF_OOM_VICTIM, &mm->flags);
> > -}
> > -
> >   /*
> >    * Checks whether a page fault on the given mm is still reliable.
> >    * This is no longer true if the oom reaper started to reap the
> > diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> > index 4d9e3a656875..746f6cb07a20 100644
> > --- a/include/linux/sched/coredump.h
> > +++ b/include/linux/sched/coredump.h
> > @@ -70,7 +70,6 @@ static inline int get_dumpable(struct mm_struct *mm)
> >   #define MMF_UNSTABLE                22      /* mm is unstable for copy_from_user */
> >   #define MMF_HUGE_ZERO_PAGE  23      /* mm has ever used the global huge zero page */
> >   #define MMF_DISABLE_THP             24      /* disable THP for all VMAs */
> > -#define MMF_OOM_VICTIM               25      /* mm is the oom victim */
> >   #define MMF_OOM_REAP_QUEUED 26      /* mm was queued for oom_reaper */
> >   #define MMF_MULTIPROCESS    27      /* mm is shared between processes */
> >   /*
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > index 36355b162727..11291b99599f 100644
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -732,10 +732,8 @@ static void mark_oom_victim(struct task_struct *tsk)
> >               return;
> >
> >       /* oom_mm is bound to the signal struct life time. */
> > -     if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
> > +     if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
> >               mmgrab(tsk->signal->oom_mm);
> > -             set_bit(MMF_OOM_VICTIM, &mm->flags);
> > -     }
> >
> >       /*
> >        * Make sure that the task is woken up from uninterruptible sleep
> >
>
> Thank you for working on the new tests and cleanups.

Thanks for the review!

>
> This series needs a cover-letter that explains why this series is needed
> that includes the information from this last patch.
>
> Please send v2 with a proper cover letter starting with why this series
> is necessary. If you did that, it would have reviewers job is lot easier.
>
> Also it appears you are combining new tests with cleanup patches. I think
> patches 2/3 and 3/3 can be a separate series and the new test can be a
> separate patch.

I used the new selftest to test the patches but otherwise it's true,
they are unrelated. I was debating whether to send them separately and
with your blessing I'll split them up.
Thanks,
Suren.

>
> thanks,
> -- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] selftests: vm: add process_mrelease tests
  2022-05-10 15:43 ` [PATCH 1/3] selftests: vm: add process_mrelease tests Shuah Khan
@ 2022-05-10 16:29   ` Suren Baghdasaryan
  2022-05-10 16:35     ` Shuah Khan
  0 siblings, 1 reply; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 16:29 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Andrew Morton, Michal Hocko, David Rientjes, Matthew Wilcox,
	Johannes Weiner, Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 8:43 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
>
> On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> > Introduce process_mrelease syscall sanity tests. They include tests of
> > invalid pidfd and flags inputs, attempting to call process_mrelease
> > with a live process and a valid usage of process_mrelease. Because
> > process_mrelease has to be used against a process with a pending SIGKILL,
> > it's possible that the process exits before process_mrelease gets called.
> > In such cases we retry the test with a victim that allocates twice more
> > memory up to 1GB. This would require the victim process to spend more
> > time during exit and process_mrelease has a better chance of catching
> > the process before it exits.
> >
>
> +1 on Mike's comments on improving the change log. List what is getting
> tested as opposed to describing the test code.

I'll try to improve the description but IMHO it does describe what
it's testing - the process_mrelease syscall with valid and invalid
inputs. I could omit the implementation details if that helps.

>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >   tools/testing/selftests/vm/Makefile        |   1 +
> >   tools/testing/selftests/vm/mrelease_test.c | 176 +++++++++++++++++++++
> >   tools/testing/selftests/vm/run_vmtests.sh  |  16 ++
> >   3 files changed, 193 insertions(+)
> >   create mode 100644 tools/testing/selftests/vm/mrelease_test.c
>
> Please update .gitignore with the new executable.

Ack.

>
> >
> > diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> > index 04a49e876a46..733fccbff0ef 100644
> > --- a/tools/testing/selftests/vm/Makefile
> > +++ b/tools/testing/selftests/vm/Makefile
> > @@ -43,6 +43,7 @@ TEST_GEN_FILES += map_populate
> >   TEST_GEN_FILES += memfd_secret
> >   TEST_GEN_FILES += mlock-random-test
> >   TEST_GEN_FILES += mlock2-tests
> > +TEST_GEN_FILES += mrelease_test
> >   TEST_GEN_FILES += mremap_dontunmap
> >   TEST_GEN_FILES += mremap_test
> >   TEST_GEN_FILES += on-fault-limit
> > diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
> > new file mode 100644
> > index 000000000000..a61061bf8433
> > --- /dev/null
> > +++ b/tools/testing/selftests/vm/mrelease_test.c
> > @@ -0,0 +1,176 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright 2022 Google LLC
> > + */
> > +#define _GNU_SOURCE
> > +#include <errno.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <sys/wait.h>
> > +#include <unistd.h>
> > +
> > +#include "util.h"
> > +
> > +static inline int pidfd_open(pid_t pid, unsigned int flags)
> > +{
> > +#ifdef __NR_pidfd_open
> > +     return syscall(__NR_pidfd_open, pid, flags);
> > +#else
> > +     errno = ENOSYS;
>
> This isn't an error - this would be skip because this syscall
> isn't supported.

Ack.

>
> > +     return -1;
> > +#endif
>
> Key off of syscall return instead of these ifdefs - same comment
> on all of the ifdefs

Ack. I was using some other test as an example but I guess that was
not a good model.

>
> > +}
> > +
>
> I am not seeing any reason for breaking this code up have a separate
> routine for pidfd_open().

I'm a bit unclear what you mean. Do you mean that userspace headers
should already define pidfd_open() and I don't need to define it?

>
> > +static inline int process_mrelease(int pidfd, unsigned int flags)
> > +{
> > +#ifdef __NR_process_mrelease
> > +     return syscall(__NR_process_mrelease, pidfd, flags);
> > +#else
> > +     errno = ENOSYS;
> > +     return -1;
> > +#endif> +}
> > +
>
> Same comments on ifdefs and skips here as well.

Ack.

>
> > +static void write_fault_pages(char *addr, unsigned long nr_pages)
> > +{
> > +     unsigned long i;
> > +
> > +     for (i = 0; i < nr_pages; i++)
> > +             *((unsigned long *)(addr + (i * PAGE_SIZE))) = i;
> > +}
> > +
> > +static int alloc_noexit(unsigned long nr_pages, int pipefd)
> > +{
> > +     int ppid = getppid();
> > +     void *buf;
> > +
> > +     buf = mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
> > +                MAP_PRIVATE | MAP_ANON, 0, 0);
> > +     if (buf == MAP_FAILED) {
> > +             perror("mmap");
>
> A bit more descriptive message what the test would do will be helpful.
> Also consider if this should be a skip or fail for the test.

I guess the question is whether an OOM condition should be considered
a test failure? I guess we could skip since the test is specifically
for process_mrelease.

>
> > +             return 1;
> > +     }
> > +
> > +     write_fault_pages((char *)buf, nr_pages);
> > +
> > +     /* Signal the parent that the child is ready */
> > +     if (write(pipefd, "", 1) < 0) {
> > +             perror("write");
> > +             return 1;
> > +     }
> > +
> > +     /* Wait to be killed (when reparenting happens) */
> > +     while (getppid() == ppid)
> > +             sleep(1);
> > +
>
> What happens if reparenting doesn't happen? Will this loop for ever?
> This test could hang?

Technically that should not happen because we kill the child and after
that it has to reparent but to be safe I'll add a timeout to prevent
an infinite loop.

>
> > +     munmap(buf, nr_pages * PAGE_SIZE);
> > +
> > +     return 0;
> > +}
> > +
> > +
> > +#define MB(x) (x << 20)
> > +#define MAX_SIZE_MB 1024
> > +
> > +int main(void)
> > +{
> > +     int res;
> > +     int pipefd[2], pidfd;
> > +     pid_t pid;
> > +     char byte;
> > +     size_t size;
> > +     int negative_tests_done = 0;
> > +
> > +     /* Test a wrong pidfd */
> > +     if (!process_mrelease(-1, 0) || errno != EBADF) {
> > +             perror("process_mrelease with wring pidfd");
>
> Incorrect spelling "wring/wrong"

Ack.

>
> > +             exit(1);
> > +     }
> > +
> > +     /*
> > +      * Start the test with 1MB allocation and double every time
> > +      * process_mrelease fails
> > +      */
> > +     for (size = 1; size <= MAX_SIZE_MB; size *= 2) {
> > +             /*
> > +              * Pipe for the child to signal when it's done allocating
> > +              * memory
> > +              */
> > +             if (pipe(pipefd)) {
> > +                     perror("pipe");
> > +                     exit(1);
> > +             }
> > +             pid = fork();
> > +             if (pid < 0) {
> > +                     perror("fork");
>
> Close the pipe?

Ack.
Although I though all FDs are closed implicitly when the process exits
(which is the case in here).

>
> > +                     exit(1);
> > +             }
> > +
> > +             if (pid == 0) {
> > +                     close(pipefd[0]);
> > +                     res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
> > +                     close(pipefd[1]);
> > +                     exit(res);
> > +             }
> > +
> > +             close(pipefd[1]);
> > +             /* Block until the child is ready */
> > +             res = read(pipefd[0], &byte, 1);
> > +             close(pipefd[0]);
> > +             if (res < 0) {
> > +                     perror("read");
> > +                     exit(1);
> > +             }
> > +
> > +             pidfd = pidfd_open(pid, 0);
> > +             if (pidfd < 0) {
> > +                     perror("pidfd_open");
> > +                     exit(1);
> > +             }
> > +
>
> The code is very hard to read. Add comments to indicate parent and child
> paths clearly so reviewers can follow the logic and be able to do effective
> review.

Ack.

>
> > +             /* Run negative tests which require a valid child only once */
> > +             if (!negative_tests_done) {
> > +                     /* Test invalid flags */
> > +                     if (!process_mrelease(pidfd, (unsigned int)-1) ||
> > +                         errno != EINVAL) {
> > +                             perror("process_mrelease with wrong flags");
> > +                             exit(1);
>
> So is this an expected fail or a test fail?

We expect it to fail with EINVAL. We fail the test if the call
succeeds or fails for any other reason.

>
> > +                     }
> > +                     /* Test reapling while process is still alive */
> > +                     if (!process_mrelease(pidfd, 0) ||
> > +                         errno != EINVAL) {
> > +                             perror("process_mrelease on a live process");
>
> So is this an expected fail or a test fail?

Expected to fail with EINVAL.

>
> > +                             exit(1);
> > +                     }
> > +                     negative_tests_done = 1;
> > +             }
>
> Now the above negative_tests_done block could be in a separate function ---
> All the others aren't really needed. It will be good for abstraction and
> readability.

I'll separate them into a stand-alone function.

>
> > +
> > +             if (kill(pid, SIGKILL)) {
> > +                     perror("kill");
>
> Include test results in the change log - so we can see the test report.

Ack. Although the test does not print any results on success, it just
succeeds or fails with an error description. I'll add outputs for both
cases.

>
> > +                     exit(1);
> > +             }
> > +
> > +             if (!process_mrelease(pidfd, 0)) {
> > +                     /* Terminate the test once process_mrelease succeeds */
> > +                     return 0;
> > +             }
> > +
> > +             /*
> > +              * Ignore the failure if the child exited before mrelease got
> > +              * called, increase allocation size and retry the test
> > +              */
>
> Add more info. on why allocating more memory helps.

Ack.

>
> > +             if (errno != ESRCH) {
> > +                     perror("process_mrelease");
> > +                     exit(1);
> > +             }
> > +
> > +             if (waitpid(pid, NULL, 0) < 0) {
> > +                     perror("waitpid");
> > +                     exit(1);
> > +             }
> > +             close(pidfd);
> > +     }
> > +
> > +     printf("All process_mrelease attempts failed!\n");
> > +     exit(1);
> > +}
> > diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
> > index 352ba00cf26b..1986162fea39 100755
> > --- a/tools/testing/selftests/vm/run_vmtests.sh
> > +++ b/tools/testing/selftests/vm/run_vmtests.sh
> > @@ -287,6 +287,22 @@ else
> >       echo "[PASS]"
> >   fi
> >
> > +echo "---------------------"
> > +echo "running mrelease_test"
> > +echo "---------------------"
> > +./mrelease_test
> > +ret_val=$?
> > +
> > +if [ $ret_val -eq 0 ]; then
> > +     echo "[PASS]"
> > +elif [ $ret_val -eq $ksft_skip ]; then
> > +      echo "[SKIP]"
> > +      exitcode=$ksft_skip
> > +else
> > +     echo "[FAIL]"
> > +     exitcode=1
> > +fi
> > +
> >   echo "-------------------"
> >   echo "running mremap_test"
> >   echo "-------------------"
> >
>
> In general, the code flow is hard to read to make sure resources
> are released e.g: pipefd in all the error paths. The code is broken
> up into smaller chunks where it isn't needed in some cases and left
> as a large block when it could benefit from abstraction e.g: negative
> test block.
>
> Please make changes and send v2.

Will try to refactor it for a better readability. Thanks for the review!

>
> thanks,
> -- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10 13:05   ` Michal Hocko
@ 2022-05-10 16:31     ` Suren Baghdasaryan
  2022-05-10 20:53       ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 16:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, David Rientjes, Matthew Wilcox, Johannes Weiner,
	Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 6:06 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 09-05-22 20:00:13, Suren Baghdasaryan wrote:
> > With the oom-killer being able to operate on locked pages, exit_mmap
> > does not need to ensure that oom_reap_task_mm is done before it can
> > proceed. Instead it can rely on mmap_lock write lock to prevent
> > oom-killer from operating on the vma tree while it's freeing page
> > tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> > and then take mmap_lock write lock before freeing page tables.
>
> The changelog is rather light on nasty details which might be good but
> for the sake of our future us let's be more verbose so that we do not
> have to reinvent the prior history each time we are looking into this
> code. I would go with something like this instead:
> "
> The primary reason to invoke the oom reaper from the exit_mmap path used
> to be a prevention of an excessive oom killing if the oom victim exit
> races with the oom reaper (see 212925802454 ("mm: oom: let oom_reap_task
> and exit_mmap run concurrently") for more details. The invocation has
> moved around since then because of the interaction with the munlock
> logic but the underlying reason has remained the same (see 27ae357fa82b
> ("mm, oom: fix concurrent munlock and oom reaper unmap, v3").
>
> Munlock code is no longer a problem since a213e5cf71cb ("mm/munlock:
> delete munlock_vma_pages_all(), allow oomreap") and there shouldn't be
> any blocking operation before the memory is unmapped by exit_mmap so
> the oom reaper invocation can be dropped. The unmapping part can be done
> with the non-exclusive mmap_sem and the exclusive one is only required
> when page tables are freed.
>
> Remove the oom_reaper from exit_mmap which will make the code easier to
> read. This is really unlikely to make any observable difference although
> some microbenchmarks could benefit from one less branch that needs to be
> evaluated even though it almost never is true.
> "

Looks great! Thanks for collecting all the history. Will update the description.

>
> One minor comment below. Other than that \o/ this is finally going away.
> I strongly suspect that the history of this code is a nice example about how
> over optimizing code can cause more harm than good.
>
> Acked-by: Michal Hocko <mhocko@suse.com>

Thanks.

>
> Thanks!
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  include/linux/oom.h |  2 --
> >  mm/mmap.c           | 25 ++++++-------------------
> >  mm/oom_kill.c       |  2 +-
> >  3 files changed, 7 insertions(+), 22 deletions(-)
> >
> [...]
> > @@ -3138,6 +3121,10 @@ void exit_mmap(struct mm_struct *mm)
> >       /* update_hiwater_rss(mm) here? but nobody should be looking */
> >       /* Use -1 here to ensure all VMAs in the mm are unmapped */
> >       unmap_vmas(&tlb, vma, 0, -1);
> > +     mmap_read_unlock(mm);
> > +     /* Set MMF_OOM_SKIP to disregard this mm from further consideration.*/
> > +     set_bit(MMF_OOM_SKIP, &mm->flags);
>
> I think that it would be slightly more readable to add an empty line
> above and below of this. Also the comment would be more helpful if it
> explaind what the further consideration actually means. I would go with
>
>         /*
>          * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
>          * because the memory has been already freed. Do not bother
>          * checking mm_is_oom_victim because setting a bit
>          * unconditionally is just cheaper.
>          */
>

Ack.

> > +     mmap_write_lock(mm);
> >       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
> >       tlb_finish_mmu(&tlb);
>
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10 15:46   ` Shuah Khan
@ 2022-05-10 16:35     ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 16:35 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Andrew Morton, Michal Hocko, David Rientjes, Matthew Wilcox,
	Johannes Weiner, Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 8:46 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
>
> On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> > With the oom-killer being able to operate on locked pages, exit_mmap
> > does not need to ensure that oom_reap_task_mm is done before it can
> > proceed. Instead it can rely on mmap_lock write lock to prevent
> > oom-killer from operating on the vma tree while it's freeing page
> > tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> > and then take mmap_lock write lock before freeing page tables.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >   include/linux/oom.h |  2 --
> >   mm/mmap.c           | 25 ++++++-------------------
> >   mm/oom_kill.c       |  2 +-
> >   3 files changed, 7 insertions(+), 22 deletions(-)
> >
>
> How does this improve the test? Include the information on why this
> change is needed as opposed describing what this does?

It doesn't improve the test. I used the test to verify this change and
wanted to keep them together so that others have an easy way to
exercise the same code path. That's the only relation between the test
and this cleanup. I'll split them into separate patchsets to avoid
further confusion.

>
> thanks,
> -- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] selftests: vm: add process_mrelease tests
  2022-05-10 16:29   ` Suren Baghdasaryan
@ 2022-05-10 16:35     ` Shuah Khan
  2022-05-10 16:42       ` Suren Baghdasaryan
  0 siblings, 1 reply; 17+ messages in thread
From: Shuah Khan @ 2022-05-10 16:35 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, Michal Hocko, David Rientjes, Matthew Wilcox,
	Johannes Weiner, Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team, Shuah Khan

On 5/10/22 10:29 AM, Suren Baghdasaryan wrote:
> On Tue, May 10, 2022 at 8:43 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
>>
>> On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
>>> Introduce process_mrelease syscall sanity tests. They include tests of
>>> invalid pidfd and flags inputs, attempting to call process_mrelease
>>> with a live process and a valid usage of process_mrelease. Because
>>> process_mrelease has to be used against a process with a pending SIGKILL,
>>> it's possible that the process exits before process_mrelease gets called.
>>> In such cases we retry the test with a victim that allocates twice more
>>> memory up to 1GB. This would require the victim process to spend more
>>> time during exit and process_mrelease has a better chance of catching
>>> the process before it exits.
>>>
>>
>> +1 on Mike's comments on improving the change log. List what is getting
>> tested as opposed to describing the test code.
> 
> I'll try to improve the description but IMHO it does describe what
> it's testing - the process_mrelease syscall with valid and invalid
> inputs. I could omit the implementation details if that helps.
> 
>>
>>> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>>> ---
>>>    tools/testing/selftests/vm/Makefile        |   1 +
>>>    tools/testing/selftests/vm/mrelease_test.c | 176 +++++++++++++++++++++
>>>    tools/testing/selftests/vm/run_vmtests.sh  |  16 ++
>>>    3 files changed, 193 insertions(+)
>>>    create mode 100644 tools/testing/selftests/vm/mrelease_test.c
>>
>> Please update .gitignore with the new executable.
> 
> Ack.
> 
>>
>>>
>>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
>>> index 04a49e876a46..733fccbff0ef 100644
>>> --- a/tools/testing/selftests/vm/Makefile
>>> +++ b/tools/testing/selftests/vm/Makefile
>>> @@ -43,6 +43,7 @@ TEST_GEN_FILES += map_populate
>>>    TEST_GEN_FILES += memfd_secret
>>>    TEST_GEN_FILES += mlock-random-test
>>>    TEST_GEN_FILES += mlock2-tests
>>> +TEST_GEN_FILES += mrelease_test
>>>    TEST_GEN_FILES += mremap_dontunmap
>>>    TEST_GEN_FILES += mremap_test
>>>    TEST_GEN_FILES += on-fault-limit
>>> diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
>>> new file mode 100644
>>> index 000000000000..a61061bf8433
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/vm/mrelease_test.c
>>> @@ -0,0 +1,176 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Copyright 2022 Google LLC
>>> + */
>>> +#define _GNU_SOURCE
>>> +#include <errno.h>
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +#include <sys/wait.h>
>>> +#include <unistd.h>
>>> +
>>> +#include "util.h"
>>> +
>>> +static inline int pidfd_open(pid_t pid, unsigned int flags)
>>> +{
>>> +#ifdef __NR_pidfd_open
>>> +     return syscall(__NR_pidfd_open, pid, flags);
>>> +#else
>>> +     errno = ENOSYS;
>>
>> This isn't an error - this would be skip because this syscall
>> isn't supported.
> 
> Ack.
> 
>>
>>> +     return -1;
>>> +#endif
>>
>> Key off of syscall return instead of these ifdefs - same comment
>> on all of the ifdefs
> 
> Ack. I was using some other test as an example but I guess that was
> not a good model.
> 
>>
>>> +}
>>> +
>>
>> I am not seeing any reason for breaking this code up have a separate
>> routine for pidfd_open().
> 
> I'm a bit unclear what you mean. Do you mean that userspace headers
> should already define pidfd_open() and I don't need to define it?
> 

Do you need pidfd_open() or can this be part of main? Without the ifdefs,
it is really a one line code.

thanks,
-- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] selftests: vm: add process_mrelease tests
  2022-05-10 16:35     ` Shuah Khan
@ 2022-05-10 16:42       ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 16:42 UTC (permalink / raw)
  To: Shuah Khan
  Cc: Andrew Morton, Michal Hocko, David Rientjes, Matthew Wilcox,
	Johannes Weiner, Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 9:36 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
>
> On 5/10/22 10:29 AM, Suren Baghdasaryan wrote:
> > On Tue, May 10, 2022 at 8:43 AM Shuah Khan <skhan@linuxfoundation.org> wrote:
> >>
> >> On 5/9/22 9:00 PM, Suren Baghdasaryan wrote:
> >>> Introduce process_mrelease syscall sanity tests. They include tests of
> >>> invalid pidfd and flags inputs, attempting to call process_mrelease
> >>> with a live process and a valid usage of process_mrelease. Because
> >>> process_mrelease has to be used against a process with a pending SIGKILL,
> >>> it's possible that the process exits before process_mrelease gets called.
> >>> In such cases we retry the test with a victim that allocates twice more
> >>> memory up to 1GB. This would require the victim process to spend more
> >>> time during exit and process_mrelease has a better chance of catching
> >>> the process before it exits.
> >>>
> >>
> >> +1 on Mike's comments on improving the change log. List what is getting
> >> tested as opposed to describing the test code.
> >
> > I'll try to improve the description but IMHO it does describe what
> > it's testing - the process_mrelease syscall with valid and invalid
> > inputs. I could omit the implementation details if that helps.
> >
> >>
> >>> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> >>> ---
> >>>    tools/testing/selftests/vm/Makefile        |   1 +
> >>>    tools/testing/selftests/vm/mrelease_test.c | 176 +++++++++++++++++++++
> >>>    tools/testing/selftests/vm/run_vmtests.sh  |  16 ++
> >>>    3 files changed, 193 insertions(+)
> >>>    create mode 100644 tools/testing/selftests/vm/mrelease_test.c
> >>
> >> Please update .gitignore with the new executable.
> >
> > Ack.
> >
> >>
> >>>
> >>> diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
> >>> index 04a49e876a46..733fccbff0ef 100644
> >>> --- a/tools/testing/selftests/vm/Makefile
> >>> +++ b/tools/testing/selftests/vm/Makefile
> >>> @@ -43,6 +43,7 @@ TEST_GEN_FILES += map_populate
> >>>    TEST_GEN_FILES += memfd_secret
> >>>    TEST_GEN_FILES += mlock-random-test
> >>>    TEST_GEN_FILES += mlock2-tests
> >>> +TEST_GEN_FILES += mrelease_test
> >>>    TEST_GEN_FILES += mremap_dontunmap
> >>>    TEST_GEN_FILES += mremap_test
> >>>    TEST_GEN_FILES += on-fault-limit
> >>> diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
> >>> new file mode 100644
> >>> index 000000000000..a61061bf8433
> >>> --- /dev/null
> >>> +++ b/tools/testing/selftests/vm/mrelease_test.c
> >>> @@ -0,0 +1,176 @@
> >>> +// SPDX-License-Identifier: GPL-2.0
> >>> +/*
> >>> + * Copyright 2022 Google LLC
> >>> + */
> >>> +#define _GNU_SOURCE
> >>> +#include <errno.h>
> >>> +#include <stdio.h>
> >>> +#include <stdlib.h>
> >>> +#include <sys/wait.h>
> >>> +#include <unistd.h>
> >>> +
> >>> +#include "util.h"
> >>> +
> >>> +static inline int pidfd_open(pid_t pid, unsigned int flags)
> >>> +{
> >>> +#ifdef __NR_pidfd_open
> >>> +     return syscall(__NR_pidfd_open, pid, flags);
> >>> +#else
> >>> +     errno = ENOSYS;
> >>
> >> This isn't an error - this would be skip because this syscall
> >> isn't supported.
> >
> > Ack.
> >
> >>
> >>> +     return -1;
> >>> +#endif
> >>
> >> Key off of syscall return instead of these ifdefs - same comment
> >> on all of the ifdefs
> >
> > Ack. I was using some other test as an example but I guess that was
> > not a good model.
> >
> >>
> >>> +}
> >>> +
> >>
> >> I am not seeing any reason for breaking this code up have a separate
> >> routine for pidfd_open().
> >
> > I'm a bit unclear what you mean. Do you mean that userspace headers
> > should already define pidfd_open() and I don't need to define it?
> >
>
> Do you need pidfd_open() or can this be part of main? Without the ifdefs,
> it is really a one line code.

Ah, I see. I think it's cleaner that way but I'll make them one-line
inline functions.

>
> thanks,
> -- Shuah

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10 16:31     ` Suren Baghdasaryan
@ 2022-05-10 20:53       ` Michal Hocko
  2022-05-10 20:59         ` Suren Baghdasaryan
  0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2022-05-10 20:53 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, David Rientjes, Matthew Wilcox, Johannes Weiner,
	Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue 10-05-22 09:31:50, Suren Baghdasaryan wrote:
> On Tue, May 10, 2022 at 6:06 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 09-05-22 20:00:13, Suren Baghdasaryan wrote:
> > > With the oom-killer being able to operate on locked pages, exit_mmap
> > > does not need to ensure that oom_reap_task_mm is done before it can
> > > proceed. Instead it can rely on mmap_lock write lock to prevent
> > > oom-killer from operating on the vma tree while it's freeing page
> > > tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> > > and then take mmap_lock write lock before freeing page tables.
> >
> > The changelog is rather light on nasty details which might be good but
> > for the sake of our future us let's be more verbose so that we do not
> > have to reinvent the prior history each time we are looking into this
> > code. I would go with something like this instead:
> > "
> > The primary reason to invoke the oom reaper from the exit_mmap path used
> > to be a prevention of an excessive oom killing if the oom victim exit
> > races with the oom reaper (see 212925802454 ("mm: oom: let oom_reap_task
> > and exit_mmap run concurrently") for more details. The invocation has
> > moved around since then because of the interaction with the munlock
> > logic but the underlying reason has remained the same (see 27ae357fa82b
> > ("mm, oom: fix concurrent munlock and oom reaper unmap, v3").
> >
> > Munlock code is no longer a problem since a213e5cf71cb ("mm/munlock:
> > delete munlock_vma_pages_all(), allow oomreap") and there shouldn't be
> > any blocking operation before the memory is unmapped by exit_mmap so
> > the oom reaper invocation can be dropped. The unmapping part can be done
> > with the non-exclusive mmap_sem and the exclusive one is only required
> > when page tables are freed.
> >
> > Remove the oom_reaper from exit_mmap which will make the code easier to
> > read. This is really unlikely to make any observable difference although
> > some microbenchmarks could benefit from one less branch that needs to be
> > evaluated even though it almost never is true.
> > "
> 
> Looks great! Thanks for collecting all the history. Will update the description.

Please make sure you double check the story. This is mostly my
recollection and brief reading through the said commits. I might
misremember here and there.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] mm: drop oom code from exit_mmap
  2022-05-10 20:53       ` Michal Hocko
@ 2022-05-10 20:59         ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-10 20:59 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, David Rientjes, Matthew Wilcox, Johannes Weiner,
	Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 1:53 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 10-05-22 09:31:50, Suren Baghdasaryan wrote:
> > On Tue, May 10, 2022 at 6:06 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Mon 09-05-22 20:00:13, Suren Baghdasaryan wrote:
> > > > With the oom-killer being able to operate on locked pages, exit_mmap
> > > > does not need to ensure that oom_reap_task_mm is done before it can
> > > > proceed. Instead it can rely on mmap_lock write lock to prevent
> > > > oom-killer from operating on the vma tree while it's freeing page
> > > > tables. exit_mmap can hold mmap_lock read lock when unmapping vmas
> > > > and then take mmap_lock write lock before freeing page tables.
> > >
> > > The changelog is rather light on nasty details which might be good but
> > > for the sake of our future us let's be more verbose so that we do not
> > > have to reinvent the prior history each time we are looking into this
> > > code. I would go with something like this instead:
> > > "
> > > The primary reason to invoke the oom reaper from the exit_mmap path used
> > > to be a prevention of an excessive oom killing if the oom victim exit
> > > races with the oom reaper (see 212925802454 ("mm: oom: let oom_reap_task
> > > and exit_mmap run concurrently") for more details. The invocation has
> > > moved around since then because of the interaction with the munlock
> > > logic but the underlying reason has remained the same (see 27ae357fa82b
> > > ("mm, oom: fix concurrent munlock and oom reaper unmap, v3").
> > >
> > > Munlock code is no longer a problem since a213e5cf71cb ("mm/munlock:
> > > delete munlock_vma_pages_all(), allow oomreap") and there shouldn't be
> > > any blocking operation before the memory is unmapped by exit_mmap so
> > > the oom reaper invocation can be dropped. The unmapping part can be done
> > > with the non-exclusive mmap_sem and the exclusive one is only required
> > > when page tables are freed.
> > >
> > > Remove the oom_reaper from exit_mmap which will make the code easier to
> > > read. This is really unlikely to make any observable difference although
> > > some microbenchmarks could benefit from one less branch that needs to be
> > > evaluated even though it almost never is true.
> > > "
> >
> > Looks great! Thanks for collecting all the history. Will update the description.
>
> Please make sure you double check the story. This is mostly my
> recollection and brief reading through the said commits. I might
> misremember here and there.

Will do. Thanks!

> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag
  2022-05-10 13:08   ` Michal Hocko
@ 2022-05-16  2:46     ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2022-05-16  2:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, David Rientjes, Matthew Wilcox, Johannes Weiner,
	Roman Gushchin, Minchan Kim, Kirill A. Shutemov,
	Andrea Arcangeli, Christian Brauner, Christoph Hellwig,
	Oleg Nesterov, David Hildenbrand, Jann Horn, Shakeel Butt,
	Peter Xu, John Hubbard, shuah, LKML, linux-mm, linux-kselftest,
	kernel-team

On Tue, May 10, 2022 at 6:08 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 09-05-22 20:00:14, Suren Baghdasaryan wrote:
> > With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is
> > now unused and can be removed.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> LGTM
> Acked-by: Michal Hocko <mhocko@suse.com>
>
> One question below
> [...]
> > diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> > index 4d9e3a656875..746f6cb07a20 100644
> > --- a/include/linux/sched/coredump.h
> > +++ b/include/linux/sched/coredump.h
> > @@ -70,7 +70,6 @@ static inline int get_dumpable(struct mm_struct *mm)
> >  #define MMF_UNSTABLE         22      /* mm is unstable for copy_from_user */
> >  #define MMF_HUGE_ZERO_PAGE   23      /* mm has ever used the global huge zero page */
> >  #define MMF_DISABLE_THP              24      /* disable THP for all VMAs */
> > -#define MMF_OOM_VICTIM               25      /* mm is the oom victim */
> >  #define MMF_OOM_REAP_QUEUED  26      /* mm was queued for oom_reaper */
> >  #define MMF_MULTIPROCESS     27      /* mm is shared between processes */
>
> Have you consider renumbering the follow up flags so that we do not have
> holes in there. Nothing really important but it can confuse somebody in
> the future.

Missed this note until now. I will renumber the constants to avoid confusion.
Thanks,
Suren.

>
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2022-05-16  2:46 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-10  3:00 [PATCH 1/3] selftests: vm: add process_mrelease tests Suren Baghdasaryan
2022-05-10  3:00 ` [PATCH 2/3] mm: drop oom code from exit_mmap Suren Baghdasaryan
2022-05-10 13:05   ` Michal Hocko
2022-05-10 16:31     ` Suren Baghdasaryan
2022-05-10 20:53       ` Michal Hocko
2022-05-10 20:59         ` Suren Baghdasaryan
2022-05-10 15:46   ` Shuah Khan
2022-05-10 16:35     ` Suren Baghdasaryan
2022-05-10  3:00 ` [PATCH 3/3] mm: delete unused MMF_OOM_VICTIM flag Suren Baghdasaryan
2022-05-10 13:08   ` Michal Hocko
2022-05-16  2:46     ` Suren Baghdasaryan
2022-05-10 15:51   ` Shuah Khan
2022-05-10 16:10     ` Suren Baghdasaryan
2022-05-10 15:43 ` [PATCH 1/3] selftests: vm: add process_mrelease tests Shuah Khan
2022-05-10 16:29   ` Suren Baghdasaryan
2022-05-10 16:35     ` Shuah Khan
2022-05-10 16:42       ` Suren Baghdasaryan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).