[to-be-updated] mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap.patch removed from -mm tree

* [to-be-updated] mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap.patch removed from -mm tree
@ 2018-05-01 23:49 akpm
  0 siblings, 0 replies; only message in thread
From: akpm @ 2018-05-01 23:49 UTC (permalink / raw)
  To: mm-commits, stable, penguin-kernel, mhocko, guro, aarcange, rientjes


The patch titled
     Subject: mm, oom: fix concurrent munlock and oom reaper unmap
has been removed from the -mm tree.  Its filename was
     mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap.patch

This patch was dropped because an updated version will be merged

------------------------------------------------------
From: David Rientjes <rientjes@google.com>
Subject: mm, oom: fix concurrent munlock and oom reaper unmap

Since exit_mmap() is done without the protection of mm->mmap_sem, it is
possible for the oom reaper to concurrently operate on an mm until
MMF_OOM_SKIP is set.

This allows munlock_vma_pages_all() to concurrently run while the oom
reaper is operating on a vma.  Since munlock_vma_pages_range() depends on
clearing VM_LOCKED from vm_flags before actually doing the munlock to
determine if any other vmas are locking the same memory, the check for
VM_LOCKED in the oom reaper is racy.

This is especially noticeable on architectures such as powerpc where
clearing a huge pmd requires serialize_against_pte_lookup().  If the pmd
is zapped by the oom reaper during follow_page_mask() after the check for
pmd_none() is bypassed, this ends up deferencing a NULL ptl.

Fix this by reusing MMF_UNSTABLE to specify that an mm should not be
reaped.  This prevents the concurrent munlock_vma_pages_range() and
unmap_page_range().  The oom reaper will simply not operate on an mm that
has the bit set and leave the unmapping to exit_mmap().

[rientjes@google.com: v2]
  Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1804171951440.105401@chino.kir.corp.google.com
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1804171545460.53786@chino.kir.corp.google.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: <stable@vger.kernel.org>	[4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/mmap.c     |   38 ++++++++++++++++++++------------------
 mm/oom_kill.c |   28 +++++++++++++---------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff -puN mm/mmap.c~mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap mm/mmap.c

--- a/mm/mmap.c~mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap
+++ a/mm/mmap.c
@@ -3024,6 +3024,25 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	if (unlikely(mm_is_oom_victim(mm))) {
+		/*
+		 * Wait for oom_reap_task() to stop working on this mm.  Because
+		 * MMF_UNSTABLE is already set before calling down_read(),
+		 * oom_reap_task() will not run on this mm after up_write().
+		 * oom_reap_task() also depends on a stable VM_LOCKED flag to
+		 * indicate it should not unmap during munlock_vma_pages_all().
+		 *
+		 * mm_is_oom_victim() cannot be set from under us because
+		 * victim->mm is already set to NULL under task_lock before
+		 * calling mmput() and victim->signal->oom_mm is set by the oom
+		 * killer only if victim->mm is non-NULL while holding
+		 * task_lock().
+		 */
+		set_bit(MMF_UNSTABLE, &mm->flags);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	}
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -3045,26 +3064,9 @@ void exit_mmap(struct mm_struct *mm)
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
-
-	if (unlikely(mm_is_oom_victim(mm))) {
-		/*
-		 * Wait for oom_reap_task() to stop working on this
-		 * mm. Because MMF_OOM_SKIP is already set before
-		 * calling down_read(), oom_reap_task() will not run
-		 * on this "mm" post up_write().
-		 *
-		 * mm_is_oom_victim() cannot be set from under us
-		 * either because victim->mm is already set to NULL
-		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if victim->mm
-		 * is found not NULL while holding the task_lock.
-		 */
-		set_bit(MMF_OOM_SKIP, &mm->flags);
-		down_write(&mm->mmap_sem);
-		up_write(&mm->mmap_sem);
-	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
+	set_bit(MMF_OOM_SKIP, &mm->flags);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff -puN mm/oom_kill.c~mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap mm/oom_kill.c
--- a/mm/oom_kill.c~mm-oom-fix-concurrent-munlock-and-oom-reaper-unmap
+++ a/mm/oom_kill.c
@@ -521,12 +521,17 @@ static bool __oom_reap_task_mm(struct ta
 	}
 
 	/*
-	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
+	 * Tell all users of get_user/copy_from_user etc... that the content
+	 * is no longer stable. No barriers really needed because unmapping
+	 * should imply barriers already and the reader would hit a page fault
+	 * if it stumbled over reaped memory.
+	 *
+	 * MMF_UNSTABLE is also set by exit_mmap when the OOM reaper shouldn't
+	 * work on the mm anymore. The check for MMF_OOM_UNSTABLE must run
 	 * under mmap_sem for reading because it serializes against the
 	 * down_write();up_write() cycle in exit_mmap().
 	 */
-	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+	if (test_and_set_bit(MMF_UNSTABLE, &mm->flags)) {
 		up_read(&mm->mmap_sem);
 		trace_skip_task_reaping(tsk->pid);
 		goto unlock_oom;
@@ -534,14 +539,6 @@ static bool __oom_reap_task_mm(struct ta
 
 	trace_start_task_reaping(tsk->pid);
 
-	/*
-	 * Tell all users of get_user/copy_from_user etc... that the content
-	 * is no longer stable. No barriers really needed because unmapping
-	 * should imply barriers already and the reader would hit a page fault
-	 * if it stumbled over a reaped memory.
-	 */
-	set_bit(MMF_UNSTABLE, &mm->flags);
-
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
 		if (!can_madv_dontneed_vma(vma))
 			continue;
@@ -567,6 +564,7 @@ static bool __oom_reap_task_mm(struct ta
 			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
+	set_bit(MMF_OOM_SKIP, &mm->flags);
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 			task_pid_nr(tsk), tsk->comm,
 			K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -594,7 +592,6 @@ static void oom_reap_task(struct task_st
 	    test_bit(MMF_OOM_SKIP, &mm->flags))
 		goto done;
 
-
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
 	debug_show_all_locks();
@@ -603,10 +600,11 @@ done:
 	tsk->oom_reaper_list = NULL;
 
 	/*
-	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
+	 * If the oom reaper could not get started on this mm and it has not yet
+	 * reached exit_mmap(), set MMF_OOM_SKIP to disregard.
 	 */
-	set_bit(MMF_OOM_SKIP, &mm->flags);
+	if (!test_bit(MMF_UNSTABLE, &mm->flags))
+		set_bit(MMF_OOM_SKIP, &mm->flags);
 
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
_

Patches currently in -mm which might be from rientjes@google.com are

^ permalink raw reply	[flat|nested] only message in thread