[PATCH v5] mm/oom_kill.c: futex: Close a race between do_exit and the oom_reaper

* [PATCH v5] mm/oom_kill.c: futex: Close a race between do_exit and the oom_reaper
@ 2022-03-18  3:36 Nico Pache
  2022-03-21  8:55 ` Michal Hocko
  0 siblings, 1 reply; 17+ messages in thread
From: Nico Pache @ 2022-03-18  3:36 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Rafael Aquini, Waiman Long, Baoquan He,
	Christoph von Recklinghausen, Don Dutile, Herton R . Krzesinski,
	Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
	Davidlohr Bueso, Andre Almeida, David Rientjes, Michal Hocko,
	Andrea Arcangeli, Andrew Morton, Joel Savitz

The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can
be targeted by the oom reaper. This mapping is used to store the futex
robust list; the kernel does not keep a copy of the robust list and instead
references a userspace address to maintain the robustness during a process
death. A race can occur between exit_mm and the oom reaper that allows
the oom reaper to free the memory of the futex robust list before the
exit path has handled the futex death:

    CPU1                               CPU2
------------------------------------------------------------------------
    page_fault
    out_of_memory
    do_exit "signal"
    wake_oom_reaper
					oom_reaper
                        		oom_reap_task_mm (invalidates mm)
    exit_mm
    exit_mm_release
    futex_exit_release
    futex_cleanup
    exit_robust_list
    get_user (EFAULT- can't access memory)

While in the oom reaper thread, we must handle the futex cleanup without
sleeping. To achieve this, add the boolean `try` to futex_exit_begin().
This will control weather or not we use a trylock. Also introduce
try_futex_exit_release() which will utilize the trylock version of the
futex_cleanup_begin(). Also call kthread_use_mm in this context to assure
the get_user call in futex_cleanup() does not return with EFAULT.

If the robust list exists, and the mutex_trylock fails, we prevent the OOM
reaper from concurrently reaping the mappings. If the dying task_struct
does not contain a pointer in tsk->robust_list, we can assume there was
either never one setup for this task struct, or the exit path's call to
futex_cleanup() has properly handled the futex death, and we can safely
reap this memory.

Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer

[1] https://elixir.bootlin.com/glibc/latest/source/nptl/allocatestack.c#L370

Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph von Recklinghausen <crecklin@redhat.com>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Herton R. Krzesinski <herton@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andre Almeida <andrealmeid@collabora.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Co-developed-by: Joel Savitz <jsavitz@redhat.com>
Signed-off-by: Joel Savitz <jsavitz@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/futex.h | 14 ++++++++++++++
 kernel/futex/core.c   | 35 +++++++++++++++++++++++++++++++----
 mm/oom_kill.c         | 13 +++++++++++++
 3 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index b70df27d7e85..64d6e89294ac 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -71,9 +71,21 @@ static inline void futex_init_task(struct task_struct *tsk)
 	mutex_init(&tsk->futex_exit_mutex);
 }
 
+static inline bool task_has_robust_list(struct task_struct *tsk)
+{
+	bool robust = false;
+
+	robust = tsk->robust_list != NULL;
+#ifdef CONFIG_COMPAT
+	robust |= tsk->compat_robust_list != NULL;
+#endif
+	return robust;
+}
+
 void futex_exit_recursive(struct task_struct *tsk);
 void futex_exit_release(struct task_struct *tsk);
 void futex_exec_release(struct task_struct *tsk);
+bool try_futex_exit_release(struct task_struct *tsk);
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
@@ -82,6 +94,8 @@ static inline void futex_init_task(struct task_struct *tsk) { }
 static inline void futex_exit_recursive(struct task_struct *tsk) { }
 static inline void futex_exit_release(struct task_struct *tsk) { }
 static inline void futex_exec_release(struct task_struct *tsk) { }
+static inline bool task_has_robust_list(struct task_struct *tsk) { return false; }
+static inline bool try_futex_exit_release(struct task_struct *tsk) { return true; }
 static inline long do_futex(u32 __user *uaddr, int op, u32 val,
 			    ktime_t *timeout, u32 __user *uaddr2,
 			    u32 val2, u32 val3)
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 51dd822a8060..81aa60ce1ed6 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -37,6 +37,7 @@
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
 
 #include "futex.h"
 #include "../locking/rtmutex_common.h"
@@ -1047,7 +1048,7 @@ void futex_exit_recursive(struct task_struct *tsk)
 	tsk->futex_state = FUTEX_STATE_DEAD;
 }
 
-static void futex_cleanup_begin(struct task_struct *tsk)
+static bool futex_cleanup_begin(struct task_struct *tsk, bool try)
 {
 	/*
 	 * Prevent various race issues against a concurrent incoming waiter
@@ -1055,7 +1056,12 @@ static void futex_cleanup_begin(struct task_struct *tsk)
 	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
 	 * attach_to_pi_owner().
 	 */
-	mutex_lock(&tsk->futex_exit_mutex);
+	if (try) {
+		if (!mutex_trylock(&tsk->futex_exit_mutex))
+			return false;
+	} else {
+		mutex_lock(&tsk->futex_exit_mutex);
+	}
 
 	/*
 	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
@@ -1071,6 +1077,7 @@ static void futex_cleanup_begin(struct task_struct *tsk)
 	raw_spin_lock_irq(&tsk->pi_lock);
 	tsk->futex_state = FUTEX_STATE_EXITING;
 	raw_spin_unlock_irq(&tsk->pi_lock);
+	return true;
 }
 
 static void futex_cleanup_end(struct task_struct *tsk, int state)
@@ -1096,7 +1103,7 @@ void futex_exec_release(struct task_struct *tsk)
 	 * futex is held on exec(), this provides at least as much state
 	 * consistency protection which is possible.
 	 */
-	futex_cleanup_begin(tsk);
+	futex_cleanup_begin(tsk, false);
 	futex_cleanup(tsk);
 	/*
 	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
@@ -1107,9 +1114,29 @@ void futex_exec_release(struct task_struct *tsk)
 
 void futex_exit_release(struct task_struct *tsk)
 {
-	futex_cleanup_begin(tsk);
+	futex_cleanup_begin(tsk, false);
+	futex_cleanup(tsk);
+	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
+/* Try to perform the futex_cleanup and return true if successful.
+ * Designed to be called from the context of the OOM Reaper.
+ */
+bool try_futex_exit_release(struct task_struct *tsk)
+{
+	if (!futex_cleanup_begin(tsk, true))
+		return false;
+
+	/* We are calling this from the context of a kthread. We need to
+	 * instruct the kthread to use the address space of the given mm
+	 * so the get_user won't return -EFAULT.
+	 */
+	kthread_use_mm(tsk->mm);
 	futex_cleanup(tsk);
+	kthread_unuse_mm(tsk->mm);
+
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+	return true;
 }
 
 static int __init futex_init(void)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 832fb330376e..f7834c53d874 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/futex.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -587,6 +588,18 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto out_unlock;
 	}
 
+	/* We can't reap a process holding a robust_list; the pthread
+	 * struct is allocated in userspace using PRIVATE | ANONYMOUS
+	 * memory which when reaped before futex_cleanup() can leave
+	 * the waiting process stuck. Try to perform the futex_cleanup,
+	 * and if unsuccessful, skip the OOM reaping.
+	 */
+	if (task_has_robust_list(tsk) && !try_futex_exit_release(tsk)) {
+		trace_skip_task_reaping(tsk->pid);
+		pr_info("oom_reaper: skipping task as it contains a robust list");
+		goto out_finish;
+	}
+
 	trace_start_task_reaping(tsk->pid);
 
 	/* failed to reap part of the address space. Try again later */
-- 
2.35.1



^ permalink raw reply related	[flat|nested] 17+ messages in thread