linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
@ 2019-07-27 17:10 Waiman Long
  2019-07-29  8:18 ` Qais Yousef
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Waiman Long @ 2019-07-27 17:10 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: linux-kernel, linux-mm, Andrew Morton, Phil Auld, Waiman Long

It was found that a dying mm_struct where the owning task has exited
can stay on as active_mm of kernel threads as long as no other user
tasks run on those CPUs that use it as active_mm. This prolongs the
lifetime of the dying mm, holding up memory and other resources like swap
space that cannot be freed.

Fix that by forcing the kernel threads to use init_mm as the active_mm
if the previous active_mm is dying.

The determination of a dying mm is based on the absence of an owning
task. The selection of the owning task only happens with the CONFIG_MEMCG
option. Without that, there is no simple way to determine the life span
of a given mm. So it falls back to the old behavior.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 include/linux/mm_types.h | 15 +++++++++++++++
 kernel/sched/core.c      | 13 +++++++++++--
 mm/init-mm.c             |  4 ++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3a37a89eb7a7..32712e78763c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
 	return atomic_read(&mm->tlb_flush_pending) > 1;
 }
 
+#ifdef CONFIG_MEMCG
+/*
+ * A mm is considered dying if there is no owning task.
+ */
+static inline bool mm_dying(struct mm_struct *mm)
+{
+	return !mm->owner;
+}
+#else
+static inline bool mm_dying(struct mm_struct *mm)
+{
+	return false;
+}
+#endif
+
 struct vm_fault;
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..923a63262dfd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * Both of these contain the full memory barrier required by
 	 * membarrier after storing to rq->curr, before returning to
 	 * user-space.
+	 *
+	 * If mm is NULL and oldmm is dying (!owner), we switch to
+	 * init_mm instead to make sure that oldmm can be freed ASAP.
 	 */
-	if (!mm) {
+	if (!mm && !mm_dying(oldmm)) {
 		next->active_mm = oldmm;
 		mmgrab(oldmm);
 		enter_lazy_tlb(oldmm, next);
-	} else
+	} else {
+		if (!mm) {
+			mm = &init_mm;
+			next->active_mm = mm;
+			mmgrab(mm);
+		}
 		switch_mm_irqs_off(oldmm, mm, next);
+	}
 
 	if (!prev->mm) {
 		prev->active_mm = NULL;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..69090a11249c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
+#include <linux/sched/task.h>
 
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
@@ -36,5 +37,8 @@ struct mm_struct init_mm = {
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
+#ifdef CONFIG_MEMCG
+	.owner		= &init_task,
+#endif
 	INIT_MM_CONTEXT(init_mm)
 };
-- 
2.18.1



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-27 17:10 [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Waiman Long
@ 2019-07-29  8:18 ` Qais Yousef
  2019-07-29 21:06   ` Waiman Long
  2019-07-29  8:52 ` Peter Zijlstra
  2019-07-29  9:12 ` Michal Hocko
  2 siblings, 1 reply; 26+ messages in thread
From: Qais Yousef @ 2019-07-29  8:18 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On 07/27/19 13:10, Waiman Long wrote:
> It was found that a dying mm_struct where the owning task has exited
> can stay on as active_mm of kernel threads as long as no other user
> tasks run on those CPUs that use it as active_mm. This prolongs the
> lifetime of the dying mm, holding up memory and other resources like swap
> space that cannot be freed.
> 
> Fix that by forcing the kernel threads to use init_mm as the active_mm
> if the previous active_mm is dying.
> 
> The determination of a dying mm is based on the absence of an owning
> task. The selection of the owning task only happens with the CONFIG_MEMCG
> option. Without that, there is no simple way to determine the life span
> of a given mm. So it falls back to the old behavior.

I don't really know a lot about this code, but does the owner field have
to depend on CONFIG_MEMCG? I.e., can't the owner always be set?

Cheers

--
Qais Yousef


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-27 17:10 [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Waiman Long
  2019-07-29  8:18 ` Qais Yousef
@ 2019-07-29  8:52 ` Peter Zijlstra
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
                     ` (2 more replies)
  2019-07-29  9:12 ` Michal Hocko
  2 siblings, 3 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29  8:52 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld

On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
> It was found that a dying mm_struct where the owning task has exited
> can stay on as active_mm of kernel threads as long as no other user
> tasks run on those CPUs that use it as active_mm. This prolongs the
> lifetime of the dying mm, holding up memory and other resources like swap
> space that cannot be freed.

Sure, but this has been so 'forever', why is it a problem now?

> Fix that by forcing the kernel threads to use init_mm as the active_mm
> if the previous active_mm is dying.
> 
> The determination of a dying mm is based on the absence of an owning
> task. The selection of the owning task only happens with the CONFIG_MEMCG
> option. Without that, there is no simple way to determine the life span
> of a given mm. So it falls back to the old behavior.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  include/linux/mm_types.h | 15 +++++++++++++++
>  kernel/sched/core.c      | 13 +++++++++++--
>  mm/init-mm.c             |  4 ++++
>  3 files changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 3a37a89eb7a7..32712e78763c 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
>  	return atomic_read(&mm->tlb_flush_pending) > 1;
>  }
>  
> +#ifdef CONFIG_MEMCG
> +/*
> + * A mm is considered dying if there is no owning task.
> + */
> +static inline bool mm_dying(struct mm_struct *mm)
> +{
> +	return !mm->owner;
> +}
> +#else
> +static inline bool mm_dying(struct mm_struct *mm)
> +{
> +	return false;
> +}
> +#endif
> +
>  struct vm_fault;

Yuck. So people without memcg will still suffer the terrible 'whatever
it is this patch fixes'.

>  /**
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2b037f195473..923a63262dfd 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
>  	 * Both of these contain the full memory barrier required by
>  	 * membarrier after storing to rq->curr, before returning to
>  	 * user-space.
> +	 *
> +	 * If mm is NULL and oldmm is dying (!owner), we switch to
> +	 * init_mm instead to make sure that oldmm can be freed ASAP.
>  	 */
> -	if (!mm) {
> +	if (!mm && !mm_dying(oldmm)) {
>  		next->active_mm = oldmm;
>  		mmgrab(oldmm);
>  		enter_lazy_tlb(oldmm, next);
> -	} else
> +	} else {
> +		if (!mm) {
> +			mm = &init_mm;
> +			next->active_mm = mm;
> +			mmgrab(mm);
> +		}
>  		switch_mm_irqs_off(oldmm, mm, next);
> +	}
>  
>  	if (!prev->mm) {
>  		prev->active_mm = NULL;

Bah, I see we _still_ haven't 'fixed' that code. And you're making an
even bigger mess of it.

Let me go find where that cleanup went.


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-27 17:10 [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Waiman Long
  2019-07-29  8:18 ` Qais Yousef
  2019-07-29  8:52 ` Peter Zijlstra
@ 2019-07-29  9:12 ` Michal Hocko
  2019-07-29 15:27   ` Waiman Long
  2 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2019-07-29  9:12 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On Sat 27-07-19 13:10:47, Waiman Long wrote:
> It was found that a dying mm_struct where the owning task has exited
> can stay on as active_mm of kernel threads as long as no other user
> tasks run on those CPUs that use it as active_mm. This prolongs the
> lifetime of the dying mm, holding up memory and other resources like swap
> space that cannot be freed.

IIRC use_mm doesn't pin the address space. It only pins the mm_struct
itself. So what exactly is the problem here?
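
For reference, a simplified sketch of what use_mm() did at the time
(condensed from mm/mmu_context.c): it only takes an mm_count reference
via mmgrab(), never an mm_users one, which is why it pins the mm_struct
but not the address space:

void use_mm(struct mm_struct *mm)
{
	struct mm_struct *active_mm;
	struct task_struct *tsk = current;

	task_lock(tsk);
	active_mm = tsk->active_mm;
	if (active_mm != mm) {
		mmgrab(mm);		/* mm_count only; no mmget() */
		tsk->active_mm = mm;
	}
	tsk->mm = mm;
	switch_mm(active_mm, mm, tsk);
	task_unlock(tsk);

	if (active_mm != mm)
		mmdrop(active_mm);
}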

> 
> Fix that by forcing the kernel threads to use init_mm as the active_mm
> if the previous active_mm is dying.
> 
> The determination of a dying mm is based on the absence of an owning
> task. The selection of the owning task only happens with the CONFIG_MEMCG
> option. Without that, there is no simple way to determine the life span
> of a given mm. So it falls back to the old behavior.

Please don't. We really want to remove mm->owner long term.
-- 
Michal Hocko
SUSE Labs


* [PATCH] sched: Clean up active_mm reference counting
  2019-07-29  8:52 ` Peter Zijlstra
@ 2019-07-29 14:24   ` Peter Zijlstra
  2019-07-29 15:01     ` Mathieu Desnoyers
                       ` (2 more replies)
  2019-07-29 14:27   ` [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Peter Zijlstra
  2019-07-29 14:51   ` Waiman Long
  2 siblings, 3 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 14:24 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	riel, luto, mathieu.desnoyers

On Mon, Jul 29, 2019 at 10:52:35AM +0200, Peter Zijlstra wrote:
> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:

> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 2b037f195473..923a63262dfd 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
> >  	 * Both of these contain the full memory barrier required by
> >  	 * membarrier after storing to rq->curr, before returning to
> >  	 * user-space.
> > +	 *
> > +	 * If mm is NULL and oldmm is dying (!owner), we switch to
> > +	 * init_mm instead to make sure that oldmm can be freed ASAP.
> >  	 */
> > -	if (!mm) {
> > +	if (!mm && !mm_dying(oldmm)) {
> >  		next->active_mm = oldmm;
> >  		mmgrab(oldmm);
> >  		enter_lazy_tlb(oldmm, next);
> > -	} else
> > +	} else {
> > +		if (!mm) {
> > +			mm = &init_mm;
> > +			next->active_mm = mm;
> > +			mmgrab(mm);
> > +		}
> >  		switch_mm_irqs_off(oldmm, mm, next);
> > +	}
> >  
> >  	if (!prev->mm) {
> >  		prev->active_mm = NULL;
> 
> Bah, I see we _still_ haven't 'fixed' that code. And you're making an
> even bigger mess of it.
> 
> Let me go find where that cleanup went.

---
Subject: sched: Clean up active_mm reference counting
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon Jul 29 16:05:15 CEST 2019

The current active_mm reference counting is confusing and sub-optimal.

Rewrite the code to explicitly consider the 4 separate cases:

    user -> user

	When switching between two user tasks, all we need to consider
	is switch_mm().

    user -> kernel

	When switching from a user task to a kernel task (which
	doesn't have an associated mm) we retain the last mm in our
	active_mm. Increment a reference count on active_mm.

  kernel -> kernel

	When switching between kernel threads, all we need to do is
	pass along the active_mm reference.

  kernel -> user

	When switching between a kernel and user task, we must switch
	from the last active_mm to the next mm, hoping of course that
	these are the same. Decrement a reference on the active_mm.

The code keeps a different order, because as you'll note, both 'to
user' cases require switch_mm().

And where the old code would increment/decrement for the 'kernel ->
kernel' case, the new code observes this is a neutral operation and
avoids touching the reference count.

Cc: riel@surriel.com
Cc: luto@kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   49 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3214,12 +3214,8 @@ static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
-	struct mm_struct *mm, *oldmm;
-
 	prepare_task_switch(rq, prev, next);
 
-	mm = next->mm;
-	oldmm = prev->active_mm;
 	/*
 	 * For paravirt, this is coupled with an exit in switch_to to
 	 * combine the page table reload and the switch backend into
@@ -3228,22 +3224,37 @@ context_switch(struct rq *rq, struct tas
 	arch_start_context_switch(prev);
 
 	/*
-	 * If mm is non-NULL, we pass through switch_mm(). If mm is
-	 * NULL, we will pass through mmdrop() in finish_task_switch().
-	 * Both of these contain the full memory barrier required by
-	 * membarrier after storing to rq->curr, before returning to
-	 * user-space.
+	 * kernel -> kernel   lazy + transfer active
+	 *   user -> kernel   lazy + mmgrab() active
+	 *
+	 * kernel ->   user   switch + mmdrop() active
+	 *   user ->   user   switch
 	 */
-	if (!mm) {
-		next->active_mm = oldmm;
-		mmgrab(oldmm);
-		enter_lazy_tlb(oldmm, next);
-	} else
-		switch_mm_irqs_off(oldmm, mm, next);
-
-	if (!prev->mm) {
-		prev->active_mm = NULL;
-		rq->prev_mm = oldmm;
+	if (!next->mm) {                                // to kernel
+		enter_lazy_tlb(prev->active_mm, next);
+
+		next->active_mm = prev->active_mm;
+		if (prev->mm)                           // from user
+			mmgrab(prev->active_mm);
+		else
+			prev->active_mm = NULL;
+	} else {                                        // to user
+		/*
+		 * sys_membarrier() requires an smp_mb() between setting
+		 * rq->curr and returning to userspace.
+		 *
+		 * The below provides this either through switch_mm(), or in
+		 * case 'prev->active_mm == next->mm' through
+		 * finish_task_switch()'s mmdrop().
+		 */
+
+		switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+		if (!prev->mm) {                        // from kernel
+			/* will mmdrop() in finish_task_switch(). */
+			rq->prev_mm = prev->active_mm;
+			prev->active_mm = NULL;
+		}
 	}
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29  8:52 ` Peter Zijlstra
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
@ 2019-07-29 14:27   ` Peter Zijlstra
  2019-07-29 15:22     ` Waiman Long
  2019-07-29 14:51   ` Waiman Long
  2 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 14:27 UTC (permalink / raw)
  To: Waiman Long; +Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld

On Mon, Jul 29, 2019 at 10:52:35AM +0200, Peter Zijlstra wrote:
> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
> > It was found that a dying mm_struct where the owning task has exited
> > can stay on as active_mm of kernel threads as long as no other user
> > tasks run on those CPUs that use it as active_mm. This prolongs the
> > lifetime of the dying mm, holding up memory and other resources like swap
> > space that cannot be freed.
> 
> Sure, but this has been so 'forever', why is it a problem now?
> 
> > Fix that by forcing the kernel threads to use init_mm as the active_mm
> > if the previous active_mm is dying.
> > 
> > The determination of a dying mm is based on the absence of an owning
> > task. The selection of the owning task only happens with the CONFIG_MEMCG
> > option. Without that, there is no simple way to determine the life span
> > of a given mm. So it falls back to the old behavior.
> > 
> > Signed-off-by: Waiman Long <longman@redhat.com>
> > ---
> >  include/linux/mm_types.h | 15 +++++++++++++++
> >  kernel/sched/core.c      | 13 +++++++++++--
> >  mm/init-mm.c             |  4 ++++
> >  3 files changed, 30 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 3a37a89eb7a7..32712e78763c 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
> >  	return atomic_read(&mm->tlb_flush_pending) > 1;
> >  }
> >  
> > +#ifdef CONFIG_MEMCG
> > +/*
> > + * A mm is considered dying if there is no owning task.
> > + */
> > +static inline bool mm_dying(struct mm_struct *mm)
> > +{
> > +	return !mm->owner;
> > +}
> > +#else
> > +static inline bool mm_dying(struct mm_struct *mm)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +
> >  struct vm_fault;
> 
> Yuck. So people without memcg will still suffer the terrible 'whatever
> it is this patch fixes'.

Also; why then not key off that owner tracking to free the resources
(and leave the struct mm around) and avoid touching this scheduling
hot-path ?


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29  8:52 ` Peter Zijlstra
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
  2019-07-29 14:27   ` [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Peter Zijlstra
@ 2019-07-29 14:51   ` Waiman Long
  2019-07-29 15:03     ` Peter Zijlstra
  2 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 14:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld

On 7/29/19 4:52 AM, Peter Zijlstra wrote:
> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
>> It was found that a dying mm_struct where the owning task has exited
>> can stay on as active_mm of kernel threads as long as no other user
>> tasks run on those CPUs that use it as active_mm. This prolongs the
>> lifetime of the dying mm, holding up memory and other resources like swap
>> space that cannot be freed.
> Sure, but this has been so 'forever', why is it a problem now?

I ran into this problem when running a test program that keeps on
allocating and touching memory, and it eventually fails as the swap
space is full. After the failure, I could not rerun the test program
again because the swap space remained full. I finally tracked it down
to the fact that the mm stayed on as the active_mm of kernel threads. I
had to make sure that all the idle CPUs got a user task to run to bump
the dying mm off the active_mm of those CPUs, but this is just a
workaround, not a solution to this problem.

>
>> Fix that by forcing the kernel threads to use init_mm as the active_mm
>> if the previous active_mm is dying.
>>
>> The determination of a dying mm is based on the absence of an owning
>> task. The selection of the owning task only happens with the CONFIG_MEMCG
>> option. Without that, there is no simple way to determine the life span
>> of a given mm. So it falls back to the old behavior.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>  include/linux/mm_types.h | 15 +++++++++++++++
>>  kernel/sched/core.c      | 13 +++++++++++--
>>  mm/init-mm.c             |  4 ++++
>>  3 files changed, 30 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index 3a37a89eb7a7..32712e78763c 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
>>  	return atomic_read(&mm->tlb_flush_pending) > 1;
>>  }
>>  
>> +#ifdef CONFIG_MEMCG
>> +/*
>> + * A mm is considered dying if there is no owning task.
>> + */
>> +static inline bool mm_dying(struct mm_struct *mm)
>> +{
>> +	return !mm->owner;
>> +}
>> +#else
>> +static inline bool mm_dying(struct mm_struct *mm)
>> +{
>> +	return false;
>> +}
>> +#endif
>> +
>>  struct vm_fault;
> Yuck. So people without memcg will still suffer the terrible 'whatever
> it is this patch fixes'.
>
That is true.
>>  /**
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 2b037f195473..923a63262dfd 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
>>  	 * Both of these contain the full memory barrier required by
>>  	 * membarrier after storing to rq->curr, before returning to
>>  	 * user-space.
>> +	 *
>> +	 * If mm is NULL and oldmm is dying (!owner), we switch to
>> +	 * init_mm instead to make sure that oldmm can be freed ASAP.
>>  	 */
>> -	if (!mm) {
>> +	if (!mm && !mm_dying(oldmm)) {
>>  		next->active_mm = oldmm;
>>  		mmgrab(oldmm);
>>  		enter_lazy_tlb(oldmm, next);
>> -	} else
>> +	} else {
>> +		if (!mm) {
>> +			mm = &init_mm;
>> +			next->active_mm = mm;
>> +			mmgrab(mm);
>> +		}
>>  		switch_mm_irqs_off(oldmm, mm, next);
>> +	}
>>  
>>  	if (!prev->mm) {
>>  		prev->active_mm = NULL;
> Bah, I see we _still_ haven't 'fixed' that code. And you're making an
> even bigger mess of it.
>
> Let me go find where that cleanup went.

It would be nice if there were a better solution.

Cheers,
Longman



* Re: [PATCH] sched: Clean up active_mm reference counting
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
@ 2019-07-29 15:01     ` Mathieu Desnoyers
  2019-07-29 15:16     ` Waiman Long
  2019-07-29 15:29     ` Rik van Riel
  2 siblings, 0 replies; 26+ messages in thread
From: Mathieu Desnoyers @ 2019-07-29 15:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Waiman Long, Ingo Molnar, linux-kernel, linux-mm, Andrew Morton,
	Phil Auld, riel, Andy Lutomirski

----- On Jul 29, 2019, at 10:24 AM, Peter Zijlstra peterz@infradead.org wrote:
[...]
> ---
> Subject: sched: Clean up active_mm reference counting
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Mon Jul 29 16:05:15 CEST 2019
> 
> The current active_mm reference counting is confusing and sub-optimal.
> 
> Rewrite the code to explicitly consider the 4 separate cases:
> 
>    user -> user
> 
>	When switching between two user tasks, all we need to consider
>	is switch_mm().
> 
>    user -> kernel
> 
>	When switching from a user task to a kernel task (which
>	doesn't have an associated mm) we retain the last mm in our
>	active_mm. Increment a reference count on active_mm.
> 
>  kernel -> kernel
> 
>	When switching between kernel threads, all we need to do is
>	pass along the active_mm reference.
> 
>  kernel -> user
> 
>	When switching between a kernel and user task, we must switch
>	from the last active_mm to the next mm, hoping of course that
>	these are the same. Decrement a reference on the active_mm.
> 
> The code keeps a different order, because as you'll note, both 'to
> user' cases require switch_mm().
> 
> And where the old code would increment/decrement for the 'kernel ->
> kernel' case, the new code observes this is a neutral operation and
> avoids touching the reference count.

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

> 
> Cc: riel@surriel.com
> Cc: luto@kernel.org
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> kernel/sched/core.c |   49 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 30 insertions(+), 19 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3214,12 +3214,8 @@ static __always_inline struct rq *
> context_switch(struct rq *rq, struct task_struct *prev,
> 	       struct task_struct *next, struct rq_flags *rf)
> {
> -	struct mm_struct *mm, *oldmm;
> -
> 	prepare_task_switch(rq, prev, next);
> 
> -	mm = next->mm;
> -	oldmm = prev->active_mm;
> 	/*
> 	 * For paravirt, this is coupled with an exit in switch_to to
> 	 * combine the page table reload and the switch backend into
> @@ -3228,22 +3224,37 @@ context_switch(struct rq *rq, struct tas
> 	arch_start_context_switch(prev);
> 
> 	/*
> -	 * If mm is non-NULL, we pass through switch_mm(). If mm is
> -	 * NULL, we will pass through mmdrop() in finish_task_switch().
> -	 * Both of these contain the full memory barrier required by
> -	 * membarrier after storing to rq->curr, before returning to
> -	 * user-space.
> +	 * kernel -> kernel   lazy + transfer active
> +	 *   user -> kernel   lazy + mmgrab() active
> +	 *
> +	 * kernel ->   user   switch + mmdrop() active
> +	 *   user ->   user   switch
> 	 */
> -	if (!mm) {
> -		next->active_mm = oldmm;
> -		mmgrab(oldmm);
> -		enter_lazy_tlb(oldmm, next);
> -	} else
> -		switch_mm_irqs_off(oldmm, mm, next);
> -
> -	if (!prev->mm) {
> -		prev->active_mm = NULL;
> -		rq->prev_mm = oldmm;
> +	if (!next->mm) {                                // to kernel
> +		enter_lazy_tlb(prev->active_mm, next);
> +
> +		next->active_mm = prev->active_mm;
> +		if (prev->mm)                           // from user
> +			mmgrab(prev->active_mm);
> +		else
> +			prev->active_mm = NULL;
> +	} else {                                        // to user
> +		/*
> +		 * sys_membarrier() requires an smp_mb() between setting
> +		 * rq->curr and returning to userspace.
> +		 *
> +		 * The below provides this either through switch_mm(), or in
> +		 * case 'prev->active_mm == next->mm' through
> +		 * finish_task_switch()'s mmdrop().
> +		 */
> +
> +		switch_mm_irqs_off(prev->active_mm, next->mm, next);
> +
> +		if (!prev->mm) {                        // from kernel
> +			/* will mmdrop() in finish_task_switch(). */
> +			rq->prev_mm = prev->active_mm;
> +			prev->active_mm = NULL;
> +		}
> 	}
> 
>  	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 14:51   ` Waiman Long
@ 2019-07-29 15:03     ` Peter Zijlstra
  2019-07-29 15:28       ` Rik van Riel
                         ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 15:03 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	Rik van Riel, Andy Lutomirski

On Mon, Jul 29, 2019 at 10:51:51AM -0400, Waiman Long wrote:
> On 7/29/19 4:52 AM, Peter Zijlstra wrote:
> > On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
> >> It was found that a dying mm_struct where the owning task has exited
> >> can stay on as active_mm of kernel threads as long as no other user
> >> tasks run on those CPUs that use it as active_mm. This prolongs the
> >> lifetime of the dying mm, holding up memory and other resources like swap
> >> space that cannot be freed.
> > Sure, but this has been so 'forever', why is it a problem now?
> 
> I ran into this problem when running a test program that keeps on
> allocating and touching memory, and it eventually fails as the swap
> space is full. After the failure, I could not rerun the test program
> again because the swap space remained full. I finally tracked it down
> to the fact that the mm stayed on as the active_mm of kernel threads. I
> had to make sure that all the idle CPUs got a user task to run to bump
> the dying mm off the active_mm of those CPUs, but this is just a
> workaround, not a solution to this problem.

The 'sad' part is that x86 already switches to init_mm on idle and we
only keep the active_mm around for 'stupid'.

Rik and Andy were working on getting that 'fixed' a while ago, not sure
where that went.


* Re: [PATCH] sched: Clean up active_mm reference counting
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
  2019-07-29 15:01     ` Mathieu Desnoyers
@ 2019-07-29 15:16     ` Waiman Long
  2019-07-29 15:22       ` Peter Zijlstra
  2019-07-29 15:29     ` Rik van Riel
  2 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 15:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	riel, luto, mathieu.desnoyers

On 7/29/19 10:24 AM, Peter Zijlstra wrote:
> On Mon, Jul 29, 2019 at 10:52:35AM +0200, Peter Zijlstra wrote:
>
> ---
> Subject: sched: Clean up active_mm reference counting
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Mon Jul 29 16:05:15 CEST 2019
>
> The current active_mm reference counting is confusing and sub-optimal.
>
> Rewrite the code to explicitly consider the 4 separate cases:
>
>     user -> user
>
> 	When switching between two user tasks, all we need to consider
> 	is switch_mm().
>
>     user -> kernel
>
> 	When switching from a user task to a kernel task (which
> 	doesn't have an associated mm) we retain the last mm in our
> 	active_mm. Increment a reference count on active_mm.
>
>   kernel -> kernel
>
> 	When switching between kernel threads, all we need to do is
> 	pass along the active_mm reference.
>
>   kernel -> user
>
> 	When switching between a kernel and user task, we must switch
> 	from the last active_mm to the next mm, hoping of course that
> 	these are the same. Decrement a reference on the active_mm.
>
> The code keeps a different order, because as you'll note, both 'to
> user' cases require switch_mm().
>
> And where the old code would increment/decrement for the 'kernel ->
> kernel' case, the new code observes this is a neutral operation and
> avoids touching the reference count.

I am aware of that behavior, which is indeed redundant, but it is not
what I am trying to fix, so I left it alone in my patch.


>
> Cc: riel@surriel.com
> Cc: luto@kernel.org
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c |   49 ++++++++++++++++++++++++++++++-------------------
>  1 file changed, 30 insertions(+), 19 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3214,12 +3214,8 @@ static __always_inline struct rq *
>  context_switch(struct rq *rq, struct task_struct *prev,
>  	       struct task_struct *next, struct rq_flags *rf)
>  {
> -	struct mm_struct *mm, *oldmm;
> -
>  	prepare_task_switch(rq, prev, next);
>  
> -	mm = next->mm;
> -	oldmm = prev->active_mm;
>  	/*
>  	 * For paravirt, this is coupled with an exit in switch_to to
>  	 * combine the page table reload and the switch backend into
> @@ -3228,22 +3224,37 @@ context_switch(struct rq *rq, struct tas
>  	arch_start_context_switch(prev);
>  
>  	/*
> -	 * If mm is non-NULL, we pass through switch_mm(). If mm is
> -	 * NULL, we will pass through mmdrop() in finish_task_switch().
> -	 * Both of these contain the full memory barrier required by
> -	 * membarrier after storing to rq->curr, before returning to
> -	 * user-space.
> +	 * kernel -> kernel   lazy + transfer active
> +	 *   user -> kernel   lazy + mmgrab() active
> +	 *
> +	 * kernel ->   user   switch + mmdrop() active
> +	 *   user ->   user   switch
>  	 */
> -	if (!mm) {
> -		next->active_mm = oldmm;
> -		mmgrab(oldmm);
> -		enter_lazy_tlb(oldmm, next);
> -	} else
> -		switch_mm_irqs_off(oldmm, mm, next);
> -
> -	if (!prev->mm) {
> -		prev->active_mm = NULL;
> -		rq->prev_mm = oldmm;
> +	if (!next->mm) {                                // to kernel
> +		enter_lazy_tlb(prev->active_mm, next);
> +
> +		next->active_mm = prev->active_mm;
> +		if (prev->mm)                           // from user
> +			mmgrab(prev->active_mm);
> +		else
> +			prev->active_mm = NULL;
> +	} else {                                        // to user
> +		/*
> +		 * sys_membarrier() requires an smp_mb() between setting
> +		 * rq->curr and returning to userspace.
> +		 *
> +		 * The below provides this either through switch_mm(), or in
> +		 * case 'prev->active_mm == next->mm' through
> +		 * finish_task_switch()'s mmdrop().
> +		 */
> +
> +		switch_mm_irqs_off(prev->active_mm, next->mm, next);
> +
> +		if (!prev->mm) {                        // from kernel
> +			/* will mmdrop() in finish_task_switch(). */
> +			rq->prev_mm = prev->active_mm;
> +			prev->active_mm = NULL;
> +		}
>  	}
>  
>  	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

This patch looks fine to me; I don't see any problem in its logic.

Acked-by: Waiman Long <longman@redhat.com>

The problem that I am trying to fix is in the kernel->kernel case where
the active_mm just gets passed along. I would like to just bump the
active_mm off if it is dying. I will see what I can do to make it work
even with !CONFIG_MEMCG.

Cheers,
Longman



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 14:27   ` [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Peter Zijlstra
@ 2019-07-29 15:22     ` Waiman Long
  2019-07-29 15:38       ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 15:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld

On 7/29/19 10:27 AM, Peter Zijlstra wrote:
> On Mon, Jul 29, 2019 at 10:52:35AM +0200, Peter Zijlstra wrote:
>> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
>>> It was found that a dying mm_struct where the owning task has exited
>>> can stay on as active_mm of kernel threads as long as no other user
>>> tasks run on those CPUs that use it as active_mm. This prolongs the
>>> lifetime of the dying mm, holding up memory and other resources like swap
>>> space that cannot be freed.
>> Sure, but this has been so 'forever', why is it a problem now?
>>
>>> Fix that by forcing the kernel threads to use init_mm as the active_mm
>>> if the previous active_mm is dying.
>>>
>>> The determination of a dying mm is based on the absence of an owning
>>> task. The selection of the owning task only happens with the CONFIG_MEMCG
>>> option. Without that, there is no simple way to determine the life span
>>> of a given mm. So it falls back to the old behavior.
>>>
>>> Signed-off-by: Waiman Long <longman@redhat.com>
>>> ---
>>>  include/linux/mm_types.h | 15 +++++++++++++++
>>>  kernel/sched/core.c      | 13 +++++++++++--
>>>  mm/init-mm.c             |  4 ++++
>>>  3 files changed, 30 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>>> index 3a37a89eb7a7..32712e78763c 100644
>>> --- a/include/linux/mm_types.h
>>> +++ b/include/linux/mm_types.h
>>> @@ -623,6 +623,21 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
>>>  	return atomic_read(&mm->tlb_flush_pending) > 1;
>>>  }
>>>  
>>> +#ifdef CONFIG_MEMCG
>>> +/*
>>> + * A mm is considered dying if there is no owning task.
>>> + */
>>> +static inline bool mm_dying(struct mm_struct *mm)
>>> +{
>>> +	return !mm->owner;
>>> +}
>>> +#else
>>> +static inline bool mm_dying(struct mm_struct *mm)
>>> +{
>>> +	return false;
>>> +}
>>> +#endif
>>> +
>>>  struct vm_fault;
>> Yuck. So people without memcg will still suffer the terrible 'whatever
>> it is this patch fixes'.
> Also; why then not key off that owner tracking to free the resources
> (and leave the struct mm around) and avoid touching this scheduling
> hot-path ?

The resources are pinned by the reference count. Making a special case
will certainly mess up the existing code.

It is actually a problem for systems that are mostly idle. Only the
kernel->kernel case needs to be updated. If the CPUs aren't busy running
user tasks, a little bit more overhead shouldn't really hurt IMHO.

Cheers,
Longman



* Re: [PATCH] sched: Clean up active_mm reference counting
  2019-07-29 15:16     ` Waiman Long
@ 2019-07-29 15:22       ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 15:22 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	riel, luto, mathieu.desnoyers

On Mon, Jul 29, 2019 at 11:16:55AM -0400, Waiman Long wrote:
> On 7/29/19 10:24 AM, Peter Zijlstra wrote:
> > On Mon, Jul 29, 2019 at 10:52:35AM +0200, Peter Zijlstra wrote:
> >
> > ---
> > Subject: sched: Clean up active_mm reference counting
> > From: Peter Zijlstra <peterz@infradead.org>
> > Date: Mon Jul 29 16:05:15 CEST 2019
> >
> > The current active_mm reference counting is confusing and sub-optimal.
> >
> > Rewrite the code to explicitly consider the 4 separate cases:
> >
> >     user -> user
> >
> > 	When switching between two user tasks, all we need to consider
> > 	is switch_mm().
> >
> >     user -> kernel
> >
> > 	When switching from a user task to a kernel task (which
> > 	doesn't have an associated mm) we retain the last mm in our
> > 	active_mm. Increment a reference count on active_mm.
> >
> >   kernel -> kernel
> >
> > 	When switching between kernel threads, all we need to do is
> > 	pass along the active_mm reference.
> >
> >   kernel -> user
> >
> > 	When switching between a kernel and user task, we must switch
> > 	from the last active_mm to the next mm, hoping of course that
> > 	these are the same. Decrement a reference on the active_mm.
> >
> > The code keeps a different order, because as you'll note, both 'to
> > user' cases require switch_mm().
> >
> > And where the old code would increment/decrement for the 'kernel ->
> > kernel' case, the new code observes this is a neutral operation and
> > avoids touching the reference count.
> 
> I am aware of that behavior which is indeed redundant, but it is not
> what I am trying to fix and so I kind of leave it alone in my patch.

Oh sure; and it's not all that important either. It is just that every
time I look at that code I get confused.

On top of that, the new code is easier to rip the active_mm stuff out of,
which is where it came from.




* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29  9:12 ` Michal Hocko
@ 2019-07-29 15:27   ` Waiman Long
  2019-07-29 18:58     ` Michal Hocko
  0 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 15:27 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On 7/29/19 5:12 AM, Michal Hocko wrote:
> On Sat 27-07-19 13:10:47, Waiman Long wrote:
>> It was found that a dying mm_struct where the owning task has exited
>> can stay on as active_mm of kernel threads as long as no other user
>> tasks run on those CPUs that use it as active_mm. This prolongs the
>> lifetime of the dying mm, holding up memory and other resources like swap
>> space that cannot be freed.
> IIRC use_mm doesn't pin the address space. It only pins the mm_struct
> itself. So what exactly is the problem here?

As explained in my response to Peter, I found that resources like swap
space were depleted even after the exit of the offending program in a
mostly idle system. This patch is to make sure that those resources get
freed after program exit ASAP.

>> Fix that by forcing the kernel threads to use init_mm as the active_mm
>> if the previous active_mm is dying.
>>
>> The determination of a dying mm is based on the absence of an owning
>> task. The selection of the owning task only happens with the CONFIG_MEMCG
>> option. Without that, there is no simple way to determine the life span
>> of a given mm. So it falls back to the old behavior.
> Please don't. We really want to remove mm->owner long term.

OK, if that is the case, I will need to find an alternative way to
determine if an mm is to be freed soon and perhaps set a flag to
indicate that.
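
For instance, a sketch of one such alternative (mm_dying_alt() is a
hypothetical helper, not an existing kernel API): key off mm_users
instead of mm->owner, since the address space is torn down once the
last user reference is dropped, independent of CONFIG_MEMCG:

static inline bool mm_dying_alt(struct mm_struct *mm)
{
	/* No user references left: the address space is being (or has
	 * already been) torn down. */
	return !atomic_read(&mm->mm_users);
}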

Thanks,
Longman



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:03     ` Peter Zijlstra
@ 2019-07-29 15:28       ` Rik van Riel
  2019-07-29 15:44         ` Peter Zijlstra
  2019-07-29 15:37       ` Waiman Long
  2019-07-29 16:22       ` Andy Lutomirski
  2 siblings, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2019-07-29 15:28 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	Andy Lutomirski


On Mon, 2019-07-29 at 17:03 +0200, Peter Zijlstra wrote:

> The 'sad' part is that x86 already switches to init_mm on idle and we
> only keep the active_mm around for 'stupid'.

Wait, where do we do that?

> Rik and Andy were working on getting that 'fixed' a while ago, not
> sure
> where that went.

My lazy TLB stuff got merged last year. 

Did we miss a spot somewhere, where the code still
quietly switches to init_mm for no good reason?

-- 
All Rights Reversed.



* Re: [PATCH] sched: Clean up active_mm reference counting
  2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
  2019-07-29 15:01     ` Mathieu Desnoyers
  2019-07-29 15:16     ` Waiman Long
@ 2019-07-29 15:29     ` Rik van Riel
  2 siblings, 0 replies; 26+ messages in thread
From: Rik van Riel @ 2019-07-29 15:29 UTC (permalink / raw)
  To: Peter Zijlstra, Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	luto, mathieu.desnoyers


On Mon, 2019-07-29 at 16:24 +0200, Peter Zijlstra wrote:

> Subject: sched: Clean up active_mm reference counting
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Mon Jul 29 16:05:15 CEST 2019
> 
> The current active_mm reference counting is confusing and sub-
> optimal.
> 
> Rewrite the code to explicitly consider the 4 separate cases:
> 

Reviewed-by: Rik van Riel <riel@surriel.com>

-- 
All Rights Reversed.



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:03     ` Peter Zijlstra
  2019-07-29 15:28       ` Rik van Riel
@ 2019-07-29 15:37       ` Waiman Long
  2019-07-29 16:12         ` Rik van Riel
  2019-07-29 16:22       ` Andy Lutomirski
  2 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 15:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	Rik van Riel, Andy Lutomirski

On 7/29/19 11:03 AM, Peter Zijlstra wrote:
> On Mon, Jul 29, 2019 at 10:51:51AM -0400, Waiman Long wrote:
>> On 7/29/19 4:52 AM, Peter Zijlstra wrote:
>>> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
>>>> It was found that a dying mm_struct where the owning task has exited
>>>> can stay on as active_mm of kernel threads as long as no other user
>>>> tasks run on those CPUs that use it as active_mm. This prolongs the
>>>> lifetime of the dying mm, holding up memory and other resources like swap
>>>> space that cannot be freed.
>>> Sure, but this has been so 'forever', why is it a problem now?
>> I ran into this problem when running a test program that keeps on
>> allocating and touching memory, and it eventually fails as the swap
>> space is full. After the failure, I could not rerun the test program
>> again because the swap space remained full. I finally tracked it down
>> to the fact that the mm stayed on as the active_mm of kernel threads. I
>> had to make sure that all the idle CPUs got a user task to run to bump
>> the dying mm off the active_mm of those CPUs, but this is just a
>> workaround, not a solution to this problem.
> The 'sad' part is that x86 already switches to init_mm on idle and we
> only keep the active_mm around for 'stupid'.
>
> Rik and Andy were working on getting that 'fixed' a while ago, not sure
> where that went.

Good, perhaps the right thing to do is for the idle->kernel case to keep
init_mm as the active_mm instead of reusing whatever was left behind the
last time around.

Cheers,
Longman



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:22     ` Waiman Long
@ 2019-07-29 15:38       ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 15:38 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	Will Deacon, Rik van Riel, Andy Lutomirski

On Mon, Jul 29, 2019 at 11:22:16AM -0400, Waiman Long wrote:
> On 7/29/19 10:27 AM, Peter Zijlstra wrote:

> > Also; why then not key off that owner tracking to free the resources
> > (and leave the struct mm around) and avoid touching this scheduling
> > hot-path ?
> 
> The resources are pinned by the reference count. Making a special case
> will certainly mess up the existing code.
> 
> It is actually a problem for systems that are mostly idle. Only the
> kernel->kernel case needs to be updated. If the CPUs isn't busy running
> user tasks, a little bit more overhead shouldn't really hurt IMHO.

But when you cannot find a new owner, you can start to strip the
mm_struct. That is, what's stopping you from freeing swap reservations
when that happens?

That is, I think the moment mm_users drops to 0, you can destroy the
actual address space. But you have to keep the mm_struct around until
mm_count goes to 0.

This is going by the comments on mmget() and mmgrab(); they forever
confuse me.
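
A minimal sketch of the convention those comments describe (an
illustration only; the real helpers live in include/linux/sched/mm.h
and kernel/fork.c):

static void mm_refcount_sketch(struct mm_struct *mm)
{
	/*
	 * mm_users pins the address space itself: VMAs, page tables,
	 * swap reservations. The last mmput() tears all of that down.
	 */
	if (mmget_not_zero(mm)) {
		/* safe to touch the address space here */
		mmput(mm);
	}

	/*
	 * mm_count pins only struct mm_struct. Lazy active_mm
	 * references are of this kind: they keep the struct, but not
	 * the address space, alive.
	 */
	mmgrab(mm);
	/* the address space may already be gone here (mm_users == 0) */
	mmdrop(mm);
}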


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:28       ` Rik van Riel
@ 2019-07-29 15:44         ` Peter Zijlstra
  2019-07-29 16:10           ` Rik van Riel
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 15:44 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Waiman Long, Ingo Molnar, linux-kernel, linux-mm, Andrew Morton,
	Phil Auld, Andy Lutomirski

On Mon, Jul 29, 2019 at 11:28:04AM -0400, Rik van Riel wrote:
> On Mon, 2019-07-29 at 17:03 +0200, Peter Zijlstra wrote:
> 
> > The 'sad' part is that x86 already switches to init_mm on idle and we
> > only keep the active_mm around for 'stupid'.
> 
> Wait, where do we do that?

drivers/idle/intel_idle.c:              leave_mm(cpu);
drivers/acpi/processor_idle.c:  acpi_unlazy_tlb(smp_processor_id());
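
Roughly, that path looks like this (a simplified sketch of leave_mm()
from the arch/x86/mm/tlb.c of this era):

void leave_mm(int cpu)
{
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

	/* Nothing to do if this CPU is already on init_mm. */
	if (loaded_mm == &init_mm)
		return;

	/* Only lazy-TLB CPUs are expected here. */
	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));

	switch_mm(NULL, &init_mm, NULL);
}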

> > Rik and Andy were working on getting that 'fixed' a while ago, not
> > sure
> > where that went.
> 
> My lazy TLB stuff got merged last year. 

Yes, but we never got around to getting rid of active_mm for x86, right?


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:44         ` Peter Zijlstra
@ 2019-07-29 16:10           ` Rik van Riel
  2019-07-29 16:26             ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2019-07-29 16:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Waiman Long, Ingo Molnar, linux-kernel, linux-mm, Andrew Morton,
	Phil Auld, Andy Lutomirski


On Mon, 2019-07-29 at 17:44 +0200, Peter Zijlstra wrote:
> On Mon, Jul 29, 2019 at 11:28:04AM -0400, Rik van Riel wrote:
> > On Mon, 2019-07-29 at 17:03 +0200, Peter Zijlstra wrote:
> > 
> > > The 'sad' part is that x86 already switches to init_mm on idle
> > > and we
> > > only keep the active_mm around for 'stupid'.
> > 
> > Wait, where do we do that?
> 
> drivers/idle/intel_idle.c:              leave_mm(cpu);
> drivers/acpi/processor_idle.c:  acpi_unlazy_tlb(smp_processor_id());

This is only done for deeper c-states, isn't it?

> > > Rik and Andy were working on getting that 'fixed' a while ago,
> > > not
> > > sure
> > > where that went.
> > 
> > My lazy TLB stuff got merged last year. 
> 
> Yes, but we never got around to getting rid of active_mm for x86,
> right?

True, we still use active_mm. Getting rid of the
active_mm refcounting altogether did not look
entirely worth the hassle.

-- 
All Rights Reversed.



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:37       ` Waiman Long
@ 2019-07-29 16:12         ` Rik van Riel
  0 siblings, 0 replies; 26+ messages in thread
From: Rik van Riel @ 2019-07-29 16:12 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, linux-mm, Andrew Morton, Phil Auld,
	Andy Lutomirski


On Mon, 2019-07-29 at 11:37 -0400, Waiman Long wrote:
> On 7/29/19 11:03 AM, Peter Zijlstra wrote:
> > On Mon, Jul 29, 2019 at 10:51:51AM -0400, Waiman Long wrote:
> > > On 7/29/19 4:52 AM, Peter Zijlstra wrote:
> > > > On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
> > > > > It was found that a dying mm_struct where the owning task has
> > > > > exited
> > > > > can stay on as active_mm of kernel threads as long as no
> > > > > other user
> > > > > tasks run on those CPUs that use it as active_mm. This
> > > > > prolongs the
> > > > > lifetime of the dying mm, holding up memory and other resources
> > > > > like swap
> > > > > space that cannot be freed.
> > > > Sure, but this has been so 'forever', why is it a problem now?
> > > I ran into this problem when running a test program that keeps on
> > > allocating and touching memory, and it eventually fails as the
> > > swap space is full. After the failure, I could not rerun the test
> > > program again because the swap space remained full. I finally
> > > tracked it down to the fact that the mm stayed on as the active_mm
> > > of kernel threads. I had to make sure that all the idle CPUs got a
> > > user task to run to bump the dying mm off the active_mm of those
> > > CPUs, but this is just a workaround, not a solution to this
> > > problem.
> > The 'sad' part is that x86 already switches to init_mm on idle and
> > we
> > only keep the active_mm around for 'stupid'.
> > 
> > Rik and Andy were working on getting that 'fixed' a while ago, not
> > sure
> > where that went.
> 
> Good, perhaps the right thing to do is for the idle->kernel case to
> keep init_mm as the active_mm instead of reusing whatever was left
> behind the last time around.

Absolutely not. That creates heavy cache line
contention on the mm_cpumask as we switch the
mm out and back in after an idle period.

The cache line contention on the mm_cpumask
alone can take up as much as a percent of
CPU time on a very busy system with a large
multi-threaded application, multiple sockets,
and lots of context switches.

-- 
All Rights Reversed.



* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:03     ` Peter Zijlstra
  2019-07-29 15:28       ` Rik van Riel
  2019-07-29 15:37       ` Waiman Long
@ 2019-07-29 16:22       ` Andy Lutomirski
  2 siblings, 0 replies; 26+ messages in thread
From: Andy Lutomirski @ 2019-07-29 16:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Waiman Long, Ingo Molnar, linux-kernel, linux-mm, Andrew Morton,
	Phil Auld, Rik van Riel, Andy Lutomirski



> On Jul 29, 2019, at 8:03 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Mon, Jul 29, 2019 at 10:51:51AM -0400, Waiman Long wrote:
>>> On 7/29/19 4:52 AM, Peter Zijlstra wrote:
>>>> On Sat, Jul 27, 2019 at 01:10:47PM -0400, Waiman Long wrote:
>>>> It was found that a dying mm_struct where the owning task has exited
>>>> can stay on as active_mm of kernel threads as long as no other user
>>>> tasks run on those CPUs that use it as active_mm. This prolongs the
>>>> lifetime of the dying mm, holding up memory and other resources like swap
>>>> space that cannot be freed.
>>> Sure, but this has been so 'forever', why is it a problem now?
>> 
>> I ran into this problem when running a test program that keeps on
>> allocating and touching memory, and it eventually fails as the swap
>> space is full. After the failure, I could not rerun the test program
>> again because the swap space remained full. I finally tracked it down
>> to the fact that the mm stayed on as the active_mm of kernel threads. I
>> had to make sure that all the idle CPUs got a user task to run to bump
>> the dying mm off the active_mm of those CPUs, but this is just a
>> workaround, not a solution to this problem.
> 
> The 'sad' part is that x86 already switches to init_mm on idle and we
> only keep the active_mm around for 'stupid'.
> 
> Rik and Andy were working on getting that 'fixed' a while ago, not sure
> where that went.

I thought the current status was that we don't always switch to init_mm
on idle and instead use a fancier and actually correct flushing routine
that only flushes idle CPUs when page tables are freed. I still think we
should be able to kill active_mm in favor of explicit refcounting in the
arch code.


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 16:10           ` Rik van Riel
@ 2019-07-29 16:26             ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2019-07-29 16:26 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Waiman Long, Ingo Molnar, linux-kernel, linux-mm, Andrew Morton,
	Phil Auld, Andy Lutomirski

On Mon, Jul 29, 2019 at 12:10:12PM -0400, Rik van Riel wrote:
> On Mon, 2019-07-29 at 17:44 +0200, Peter Zijlstra wrote:
> > On Mon, Jul 29, 2019 at 11:28:04AM -0400, Rik van Riel wrote:
> > > On Mon, 2019-07-29 at 17:03 +0200, Peter Zijlstra wrote:
> > > 
> > > > The 'sad' part is that x86 already switches to init_mm on idle
> > > > and we
> > > > only keep the active_mm around for 'stupid'.
> > > 
> > > Wait, where do we do that?
> > 
> > drivers/idle/intel_idle.c:              leave_mm(cpu);
> > drivers/acpi/processor_idle.c:  acpi_unlazy_tlb(smp_processor_id());
> 
> This is only done for deeper c-states, isn't it?

Not C1, but I forever forget where it starts doing that. IIRC it isn't
too hard to hit it often, and I'm fairly sure we always do it when we
hit NOHZ.

> > > > Rik and Andy were working on getting that 'fixed' a while ago,
> > > > not
> > > > sure
> > > > where that went.
> > > 
> > > My lazy TLB stuff got merged last year. 
> > 
> > Yes, but we never got around to getting rid of active_mm for x86,
> > right?
> 
> True, we still use active_mm. Getting rid of the
> active_mm refcounting altogether did not look
> entirely worth the hassle.

OK, clearly I forgot some of the details ;-)


* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 15:27   ` Waiman Long
@ 2019-07-29 18:58     ` Michal Hocko
  2019-07-29 20:41       ` Waiman Long
  0 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2019-07-29 18:58 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On Mon 29-07-19 11:27:35, Waiman Long wrote:
> On 7/29/19 5:12 AM, Michal Hocko wrote:
> > On Sat 27-07-19 13:10:47, Waiman Long wrote:
> >> It was found that a dying mm_struct where the owning task has exited
> >> can stay on as active_mm of kernel threads as long as no other user
> >> tasks run on those CPUs that use it as active_mm. This prolongs the
> >> lifetime of the dying mm, holding up memory and other resources like swap
> >> space that cannot be freed.
> > IIRC use_mm doesn't pin the address space. It only pins the mm_struct
> > itself. So what exactly is the problem here?
> 
> As explained in my response to Peter, I found that resources like swap
> space were depleted even after the exit of the offending program in a
> mostly idle system. This patch makes sure that those resources get
> freed after the program exits ASAP.

Could you elaborate more? How can the mm counter (not to be confused with
mm_users) prevent the address space from being torn down on exit?


-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 18:58     ` Michal Hocko
@ 2019-07-29 20:41       ` Waiman Long
  0 siblings, 0 replies; 26+ messages in thread
From: Waiman Long @ 2019-07-29 20:41 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On 7/29/19 2:58 PM, Michal Hocko wrote:
> On Mon 29-07-19 11:27:35, Waiman Long wrote:
>> On 7/29/19 5:12 AM, Michal Hocko wrote:
>>> On Sat 27-07-19 13:10:47, Waiman Long wrote:
>>>> It was found that a dying mm_struct where the owning task has exited
>>>> can stay on as active_mm of kernel threads as long as no other user
>>>> tasks run on those CPUs that use it as active_mm. This prolongs the
>>>> life time of dying mm holding up memory and other resources like swap
>>>> space that cannot be freed.
>>> IIRC use_mm doesn't pin the address space. It only pins the mm_struct
>>> itself. So what exactly is the problem here?
>> As explained in my response to Peter, I found that resource like swap
>> space were depleted even after the exit of the offending program in a
>> mostly idle system. This patch is to make sure that those resources get
>> freed after program exit ASAP.
> Could you elaborate more? How can the mm counter (not to be confused with
> mm_users) prevent the address space from being torn down on exit?

Many of the resources tied to the mm_struct are indeed freed when
mm_users becomes 0, including swap space reservations, I think. I was
testing a mm patch that had a missing-mmput bug which caused mm_users
not to go to 0. I fixed the bug, and with the sched patch to speed up
the release of the mm_struct, everything was fine. I didn't realize that
fixing the mm bug alone was enough to free the swap space.

Still, there are some resources that are not freed while mm_count is
non-zero. It is certainly less serious than what I had thought. Sorry
for the confusion.
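
To spell out the split between the two counters, the lifetime rules
look roughly like this (a simplified sketch of mmput()/mmdrop() from
kernel/fork.c and include/linux/sched/mm.h, with most of the actual
teardown elided):

/* mm_users: pins the address space (mappings, swap reservations, ...). */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		exit_mmap(mm);	/* unmaps everything, releases swap space */
		/* aio, ksm, khugepaged teardown elided */
		mmdrop(mm);	/* drop the mm_count reference */
	}
}

/* mm_count: pins only the mm_struct itself (and e.g. the pgd). */
static inline void mmdrop(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
		__mmdrop(mm);	/* frees the mm_struct */
}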

Cheers,
Longman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29  8:18 ` Qais Yousef
@ 2019-07-29 21:06   ` Waiman Long
  2019-07-29 21:33     ` Qais Yousef
  0 siblings, 1 reply; 26+ messages in thread
From: Waiman Long @ 2019-07-29 21:06 UTC (permalink / raw)
  To: Qais Yousef
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On 7/29/19 4:18 AM, Qais Yousef wrote:
> On 07/27/19 13:10, Waiman Long wrote:
>> It was found that a dying mm_struct where the owning task has exited
>> can stay on as active_mm of kernel threads as long as no other user
>> tasks run on those CPUs that use it as active_mm. This prolongs the
>> life time of dying mm holding up memory and other resources like swap
>> space that cannot be freed.
>>
>> Fix that by forcing the kernel threads to use init_mm as the active_mm
>> if the previous active_mm is dying.
>>
>> The determination of a dying mm is based on the absence of an owning
>> task. The selection of the owning task only happens with the CONFIG_MEMCG
>> option. Without that, there is no simple way to determine the life span
>> of a given mm. So it falls back to the old behavior.
> I don't really know a lot about this code, but does the owner field have to
> depend on CONFIG_MEMCG? I.e., can't the owner always be set?
>
Yes, the owner field is only used and defined when CONFIG_MEMCG is on.
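
For reference, the declaration in include/linux/mm_types.h looks
roughly like this (the long explanatory comment is paraphrased):

#ifdef CONFIG_MEMCG
	/*
	 * "owner" points to the task regarded as the canonical
	 * user/owner of this mm; memcg uses it to map an mm back
	 * to its memory cgroup.
	 */
	struct task_struct __rcu *owner;
#endif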

Cheers,
Longman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads
  2019-07-29 21:06   ` Waiman Long
@ 2019-07-29 21:33     ` Qais Yousef
  0 siblings, 0 replies; 26+ messages in thread
From: Qais Yousef @ 2019-07-29 21:33 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, linux-mm,
	Andrew Morton, Phil Auld

On 07/29/19 17:06, Waiman Long wrote:
> On 7/29/19 4:18 AM, Qais Yousef wrote:
> > On 07/27/19 13:10, Waiman Long wrote:
> >> It was found that a dying mm_struct where the owning task has exited
> >> can stay on as active_mm of kernel threads as long as no other user
> >> tasks run on those CPUs that use it as active_mm. This prolongs the
> >> life time of dying mm holding up memory and other resources like swap
> >> space that cannot be freed.
> >>
> >> Fix that by forcing the kernel threads to use init_mm as the active_mm
> >> if the previous active_mm is dying.
> >>
> >> The determination of a dying mm is based on the absence of an owning
> >> task. The selection of the owning task only happens with the CONFIG_MEMCG
> >> option. Without that, there is no simple way to determine the life span
> >> of a given mm. So it falls back to the old behavior.
> > I don't really know a lot about this code, but does the owner field have to
> > depend on CONFIG_MEMCG? I.e., can't the owner always be set?
> >
> Yes, the owner field is only used and defined when CONFIG_MEMCG is on.

I guess the simpler answer is that it's too much work to take it out of
CONFIG_MEMCG.

Anyway, given the direction of the thread this is moot now :-)

Thanks!

--
Qais Yousef

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2019-07-29 21:33 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-27 17:10 [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Waiman Long
2019-07-29  8:18 ` Qais Yousef
2019-07-29 21:06   ` Waiman Long
2019-07-29 21:33     ` Qais Yousef
2019-07-29  8:52 ` Peter Zijlstra
2019-07-29 14:24   ` [PATCH] sched: Clean up active_mm reference counting Peter Zijlstra
2019-07-29 15:01     ` Mathieu Desnoyers
2019-07-29 15:16     ` Waiman Long
2019-07-29 15:22       ` Peter Zijlstra
2019-07-29 15:29     ` Rik van Riel
2019-07-29 14:27   ` [PATCH v2] sched/core: Don't use dying mm as active_mm of kthreads Peter Zijlstra
2019-07-29 15:22     ` Waiman Long
2019-07-29 15:38       ` Peter Zijlstra
2019-07-29 14:51   ` Waiman Long
2019-07-29 15:03     ` Peter Zijlstra
2019-07-29 15:28       ` Rik van Riel
2019-07-29 15:44         ` Peter Zijlstra
2019-07-29 16:10           ` Rik van Riel
2019-07-29 16:26             ` Peter Zijlstra
2019-07-29 15:37       ` Waiman Long
2019-07-29 16:12         ` Rik van Riel
2019-07-29 16:22       ` Andy Lutomirski
2019-07-29  9:12 ` Michal Hocko
2019-07-29 15:27   ` Waiman Long
2019-07-29 18:58     ` Michal Hocko
2019-07-29 20:41       ` Waiman Long
