From: Andy Lutomirski <luto@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>, Linux-MM <linux-mm@kvack.org>
Cc: Nicholas Piggin <npiggin@gmail.com>,
	Anton Blanchard <anton@ozlabs.org>,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	Paul Mackerras <paulus@ozlabs.org>,
	Randy Dunlap <rdunlap@infradead.org>,
	linux-arch <linux-arch@vger.kernel.org>,
	x86@kernel.org, Rik van Riel <riel@surriel.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Nadav Amit <nadav.amit@gmail.com>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Andy Lutomirski <luto@kernel.org>
Subject: [PATCH 14/23] sched, exec: Factor current mm changes out from exec
Date: Sat,  8 Jan 2022 08:43:59 -0800
Message-ID: <60eb8a98061100f95e53e7868841fbb9a68237c8.1641659630.git.luto@kernel.org>
In-Reply-To: <cover.1641659630.git.luto@kernel.org>

Currently, exec_mmap() open-codes an mm change.  Create new core
__change_current_mm() and __change_current_mm_to_kernel() helpers
and use the former from exec_mmap().  This moves the nasty scheduler
details out of exec.c and prepares for reusing this code elsewhere.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
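Note (not part of the patch): a rough usage sketch for review context.
Patch 15 in this series switches kthread_use_mm()/kthread_unuse_mm()
over to these helpers; the bodies below illustrate that shape based
only on the comments added to sched/mm.h, not on the actual patch 15.
The WARN_ON_ONCE() checks and the exact refcounting placement are
assumptions:

	void kthread_use_mm(struct mm_struct *mm)
	{
		WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
		WARN_ON_ONCE(current->mm);

		/* The helper takes no reference of its own; the caller must. */
		mmgrab(mm);

		/* mm is not brand new: it may already have run elsewhere. */
		__change_current_mm(mm, false);
	}

	void kthread_unuse_mm(struct mm_struct *mm)
	{
		WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
		WARN_ON_ONCE(current->mm != mm);

		__change_current_mm_to_kernel();

		/* The helper does not mmdrop(); drop our reference here. */
		mmdrop(mm);
	}

exec_mmap() below is the mm_is_brand_new == true case: the new mm has
never run anywhere, so __change_current_mm() takes the activate_mm()
path rather than switch_mm_irqs_off().
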
 fs/exec.c                |  32 +----------
 include/linux/sched/mm.h |  20 +++++++
 kernel/sched/core.c      | 119 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+), 30 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 2afa7b0c75f2..9e1c2ee7c986 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -971,15 +971,13 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct *old_mm, *active_mm;
+	struct mm_struct *old_mm;
 	int ret;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
 	exec_mm_release(tsk, old_mm);
-	if (old_mm)
-		sync_mm_rss(old_mm);
 
 	ret = down_write_killable(&tsk->signal->exec_update_lock);
 	if (ret)
@@ -1000,41 +998,15 @@ static int exec_mmap(struct mm_struct *mm)
 		}
 	}
 
-	task_lock(tsk);
-	/*
-	 * membarrier() requires a full barrier before switching mm.
-	 */
-	smp_mb__after_spinlock();
+	__change_current_mm(mm, true);
 
-	local_irq_disable();
-	active_mm = tsk->active_mm;
-	tsk->active_mm = mm;
-	WRITE_ONCE(tsk->mm, mm);  /* membarrier reads this without locks */
-	membarrier_update_current_mm(mm);
-	/*
-	 * This prevents preemption while active_mm is being loaded and
-	 * it and mm are being updated, which could cause problems for
-	 * lazy tlb mm refcounting when these are updated by context
-	 * switches. Not all architectures can handle irqs off over
-	 * activate_mm yet.
-	 */
-	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
-		local_irq_enable();
-	activate_mm(active_mm, mm);
-	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
-		local_irq_enable();
-	membarrier_finish_switch_mm(mm);
-	vmacache_flush(tsk);
-	task_unlock(tsk);
 	if (old_mm) {
 		mmap_read_unlock(old_mm);
-		BUG_ON(active_mm != old_mm);
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 		return 0;
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f1d2beac464c..7509b2b2e99d 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -83,6 +83,26 @@ extern void mmput(struct mm_struct *);
 void mmput_async(struct mm_struct *);
 #endif
 
+/*
+ * Switch the mm for current.  This does not mmget() mm, nor does it mmput()
+ * the previous mm, if any.  The caller is responsible for reference counting,
+ * although __change_current_mm() handles all details related to lazy mm
+ * refcounting.
+ *
+ * If the caller is a user task, the caller must call mm_update_next_owner().
+ */
+void __change_current_mm(struct mm_struct *mm, bool mm_is_brand_new);
+
+/*
+ * Switch the mm for current to the kernel mm.  This does not mmdrop()
+ * -- the caller is responsible for reference counting, although
+ * __change_current_mm_to_kernel() handles all details related to lazy
+ * mm refcounting.
+ *
+ * If the caller is a user task, the caller must call mm_update_next_owner().
+ */
+void __change_current_mm_to_kernel(void);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32275b4ff141..95eb0e78f74c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -14,6 +14,7 @@
 
 #include <linux/nospec.h>
 
+#include <linux/vmacache.h>
 #include <linux/kcov.h>
 #include <linux/scs.h>
 
@@ -4934,6 +4935,124 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	return finish_task_switch(prev);
 }
 
+void __change_current_mm(struct mm_struct *mm, bool mm_is_brand_new)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *old_active_mm, *mm_to_drop = NULL;
+
+	BUG_ON(!mm);	/* likely to cause corruption if we continue */
+
+	/*
+	 * We do not want to schedule, nor should procfs peek at current->mm
+	 * while we're modifying it.  task_lock() disables preemption and
+	 * locks against procfs.
+	 */
+	task_lock(tsk);
+	/*
+	 * membarrier() requires a full barrier before switching mm.
+	 */
+	smp_mb__after_spinlock();
+
+	local_irq_disable();
+
+	if (tsk->mm) {
+		/* We're detaching from an old mm.  Sync stats. */
+		sync_mm_rss(tsk->mm);
+	} else {
+		/*
+		 * Switching from kernel mm to user.  Drop the old lazy
+		 * mm reference.
+		 */
+		mm_to_drop = tsk->active_mm;
+	}
+
+	old_active_mm = tsk->active_mm;
+	tsk->active_mm = mm;
+	WRITE_ONCE(tsk->mm, mm);  /* membarrier reads this without locks */
+	membarrier_update_current_mm(mm);
+
+	if (mm_is_brand_new) {
+		/*
+		 * For historical reasons, some architectures want IRQs on
+		 * when activate_mm() is called.  If we're going to call
+		 * activate_mm(), turn on IRQs but leave preemption
+		 * disabled.
+		 */
+		if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+			local_irq_enable();
+		activate_mm(old_active_mm, mm);
+		if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+			local_irq_enable();
+	} else {
+		switch_mm_irqs_off(old_active_mm, mm, tsk);
+		local_irq_enable();
+	}
+
+	/* IRQs are on now.  Preemption is still disabled by task_lock(). */
+
+	membarrier_finish_switch_mm(mm);
+	vmacache_flush(tsk);
+	task_unlock(tsk);
+
+#ifdef finish_arch_post_lock_switch
+	if (!mm_is_brand_new) {
+		/*
+		 * Some architectures want a callback after
+		 * switch_mm_irqs_off() once locks are dropped.  Callers of
+		 * activate_mm() historically did not do this, so skip it if
+		 * we did activate_mm().  On arm, this is because
+		 * activate_mm() switches mm with IRQs on, which uses a
+		 * different code path.
+		 *
+	 * Yes, this is extremely fragile and should be cleaned up.
+		 */
+		finish_arch_post_lock_switch();
+	}
+#endif
+
+	if (mm_to_drop)
+		mmdrop(mm_to_drop);
+}
+
+void __change_current_mm_to_kernel(void)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *old_mm = tsk->mm;
+
+	if (!old_mm)
+		return;	/* nothing to do */
+
+	/*
+	 * We do not want to schedule, nor should procfs peek at current->mm
+	 * while we're modifying it.  task_lock() disables preemption and
+	 * locks against procfs.
+	 */
+	task_lock(tsk);
+	/*
+	 * membarrier() requires a full barrier before switching mm.
+	 */
+	smp_mb__after_spinlock();
+
+	/* current has a real mm, so it must be active */
+	WARN_ON_ONCE(tsk->active_mm != tsk->mm);
+
+	local_irq_disable();
+
+	sync_mm_rss(old_mm);
+
+	WRITE_ONCE(tsk->mm, NULL);  /* membarrier reads this without locks */
+	membarrier_update_current_mm(NULL);
+	vmacache_flush(tsk);
+
+	/* active_mm is still 'old_mm' */
+	mmgrab(old_mm);
+	enter_lazy_tlb(old_mm, tsk);
+
+	local_irq_enable();
+
+	task_unlock(tsk);
+}
+
 /*
  * nr_running and nr_context_switches:
  *
-- 
2.33.1


Thread overview: 79+ messages
2022-01-08 16:43 [PATCH 00/23] mm, sched: Rework lazy mm handling Andy Lutomirski
2022-01-08 16:43 ` [PATCH 01/23] membarrier: Document why membarrier() works Andy Lutomirski
2022-01-12 15:30   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 02/23] x86/mm: Handle unlazying membarrier core sync in the arch code Andy Lutomirski
2022-01-12 15:40   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 03/23] membarrier: Remove membarrier_arch_switch_mm() prototype in core code Andy Lutomirski
2022-01-08 16:43 ` [PATCH 04/23] membarrier: Make the post-switch-mm barrier explicit Andy Lutomirski
2022-01-12 15:52   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 05/23] membarrier, kthread: Use _ONCE accessors for task->mm Andy Lutomirski
2022-01-12 15:55   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 06/23] powerpc/membarrier: Remove special barrier on mm switch Andy Lutomirski
2022-01-10  8:42   ` Christophe Leroy
2022-01-12 15:57   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 07/23] membarrier: Rewrite sync_core_before_usermode() and improve documentation Andy Lutomirski
2022-01-12 16:11   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 08/23] membarrier: Remove redundant clear of mm->membarrier_state in exec_mmap() Andy Lutomirski
2022-01-12 16:13   ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 09/23] membarrier: Fix incorrect barrier positions during exec and kthread_use_mm() Andy Lutomirski
2022-01-12 16:30   ` Mathieu Desnoyers
2022-01-12 17:08     ` Mathieu Desnoyers
2022-01-08 16:43 ` [PATCH 10/23] x86/events, x86/insn-eval: Remove incorrect active_mm references Andy Lutomirski
2022-01-08 16:43 ` [PATCH 11/23] sched/scs: Initialize shadow stack on idle thread bringup, not shutdown Andy Lutomirski
2022-01-10 22:06   ` Sami Tolvanen
2022-01-08 16:43 ` [PATCH 12/23] Rework "sched/core: Fix illegal RCU from offline CPUs" Andy Lutomirski
2022-01-08 16:43 ` [PATCH 13/23] exec: Remove unnecessary vmacache_seqnum clear in exec_mmap() Andy Lutomirski
2022-01-08 16:43 ` Andy Lutomirski [this message]
2022-01-08 16:44 ` [PATCH 15/23] kthread: Switch to __change_current_mm() Andy Lutomirski
2022-01-08 16:44 ` [PATCH 16/23] sched: Use lightweight hazard pointers to grab lazy mms Andy Lutomirski
2022-01-08 19:22   ` Linus Torvalds
2022-01-08 22:04     ` Andy Lutomirski
2022-01-09  0:27       ` Linus Torvalds
2022-01-09  0:53       ` Linus Torvalds
2022-01-09  3:58         ` Andy Lutomirski
2022-01-09  4:38           ` Linus Torvalds
2022-01-09 20:19             ` Andy Lutomirski
2022-01-09 20:48               ` Linus Torvalds
2022-01-09 21:51                 ` Linus Torvalds
2022-01-10  0:52                   ` Andy Lutomirski
2022-01-10  2:36                     ` Rik van Riel
2022-01-10  3:51                       ` Linus Torvalds
2022-01-10  4:56                   ` Nicholas Piggin
2022-01-10  5:17                     ` Nicholas Piggin
2022-01-10 17:19                       ` Linus Torvalds
2022-01-11  2:24                         ` Nicholas Piggin
2022-01-10 20:52                     ` Andy Lutomirski
2022-01-11  3:10                       ` Nicholas Piggin
2022-01-11 15:39                         ` Andy Lutomirski
2022-01-11 22:48                           ` Nicholas Piggin
2022-01-12  0:42                             ` Nicholas Piggin
2022-01-11 10:39                 ` Will Deacon
2022-01-11 15:22                   ` Andy Lutomirski
2022-01-09  5:56   ` Nadav Amit
2022-01-09  6:48     ` Linus Torvalds
2022-01-09  8:49       ` Nadav Amit
2022-01-09 19:10         ` Linus Torvalds
2022-01-09 19:52           ` Andy Lutomirski
2022-01-09 20:00             ` Linus Torvalds
2022-01-09 20:34             ` Nadav Amit
2022-01-09 20:48               ` Andy Lutomirski
2022-01-09 19:22         ` Rik van Riel
2022-01-09 19:34           ` Nadav Amit
2022-01-09 19:37             ` Rik van Riel
2022-01-09 19:51               ` Nadav Amit
2022-01-09 19:54                 ` Linus Torvalds
2022-01-08 16:44 ` [PATCH 17/23] x86/mm: Make use/unuse_temporary_mm() non-static Andy Lutomirski
2022-01-08 16:44 ` [PATCH 18/23] x86/mm: Allow temporary mms when IRQs are on Andy Lutomirski
2022-01-08 16:44 ` [PATCH 19/23] x86/efi: Make efi_enter/leave_mm use the temporary_mm machinery Andy Lutomirski
2022-01-10 13:13   ` Ard Biesheuvel
2022-01-08 16:44 ` [PATCH 20/23] x86/mm: Remove leave_mm() in favor of unlazy_mm_irqs_off() Andy Lutomirski
2022-01-08 16:44 ` [PATCH 21/23] x86/mm: Use unlazy_mm_irqs_off() in TLB flush IPIs Andy Lutomirski
2022-01-08 16:44 ` [PATCH 22/23] x86/mm: Optimize for_each_possible_lazymm_cpu() Andy Lutomirski
2022-01-08 16:44 ` [PATCH 23/23] x86/mm: Opt in to IRQs-off activate_mm() Andy Lutomirski
