From: Andy Lutomirski <luto@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>, Linux-MM <linux-mm@kvack.org>
Cc: Nicholas Piggin <npiggin@gmail.com>,
	Anton Blanchard <anton@ozlabs.org>,
	Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	Paul Mackerras <paulus@ozlabs.org>,
	Randy Dunlap <rdunlap@infradead.org>,
	linux-arch <linux-arch@vger.kernel.org>,
	x86@kernel.org, Rik van Riel <riel@surriel.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Nadav Amit <nadav.amit@gmail.com>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Andy Lutomirski <luto@kernel.org>
Subject: [PATCH 20/23] x86/mm: Remove leave_mm() in favor of unlazy_mm_irqs_off()
Date: Sat,  8 Jan 2022 08:44:05 -0800	[thread overview]
Message-ID: <5e80aa6deb3f0a7bdcba5e9f20c48df50b752fd3.1641659630.git.luto@kernel.org> (raw)
In-Reply-To: <cover.1641659630.git.luto@kernel.org>

x86's mm_cpumask() precisely tracks every CPU using an mm, with one
major caveat: x86 internally switches back to init_mm more
aggressively than the core code.  This means that it's possible for
x86 to point CR3 to init_mm and drop current->active_mm from
mm_cpumask().  The core scheduler doesn't know when this happens,
which is currently fine.
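
For illustration only (not part of this patch), the divergent state
looks roughly like this when a kernel thread runs lazily on a CPU
whose TLB code has already switched away from mm:

  	/*
  	 * Core scheduler's view:
  	 *   current->mm        == NULL    (kernel thread, lazy)
  	 *   current->active_mm == mm
  	 *
  	 * x86's view after its aggressive switch to init_mm:
  	 *   CR3 / cpu_tlbstate.loaded_mm == &init_mm
  	 *   cpumask_test_cpu(cpu, mm_cpumask(mm)) == false
  	 */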

But if we want to use mm_cpumask() to optimize
for_each_possible_lazymm_cpu(), we need to keep mm_cpumask() in
sync with the core scheduler.
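
A minimal sketch of the optimization this enables (the real
definition lands in a later patch in this series; the exact form
below is an assumption, not the final code):

  	/*
  	 * Hypothetical sketch: only visit CPUs that mm_cpumask()
  	 * says may still reference @mm lazily, instead of every
  	 * possible CPU.
  	 */
  	#define for_each_possible_lazymm_cpu(cpu, mm) \
  		for_each_cpu((cpu), mm_cpumask(mm))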

Remove x86's bespoke leave_mm() and use the core scheduler's
unlazy_mm_irqs_off() instead, so that a lazy mm is dropped and
->active_mm is cleaned up together.  This allows
for_each_possible_lazymm_cpu() to be wired up on x86.
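
For reference, a rough sketch of what unlazy_mm_irqs_off() (added
earlier in this series) is expected to do, reconstructed from its
call sites in this patch rather than copied from the actual
implementation:

  	/* Sketch only; see the core scheduler patch for the real code. */
  	void unlazy_mm_irqs_off(void)
  	{
  		struct mm_struct *mm = current->active_mm;

  		lockdep_assert_irqs_disabled();

  		/* Not in lazy TLB mode: nothing to do. */
  		if (current->mm)
  			return;

  		/* Already running on init_mm: nothing to drop. */
  		if (mm == &init_mm)
  			return;

  		/* Update the core scheduler's view... */
  		current->active_mm = &init_mm;

  		/* ...and the architecture's (CR3, mm_cpumask, etc.). */
  		switch_mm_irqs_off(mm, &init_mm, current);

  		/*
  		 * The real helper also releases this CPU's lazy
  		 * reference to mm (refcount or hazard pointer,
  		 * depending on the earlier patches in this series).
  		 */
  	}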

As a side effect, non-x86 architectures that use ACPI C3 will now leave
lazy mm mode before entering C3.  In practice this can only affect
ia64: x86 and ia64 are the only architectures that enable
CONFIG_ACPI_PROCESSOR_CSTATE.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/include/asm/mmu.h  |  2 --
 arch/x86/mm/tlb.c           | 29 +++--------------------------
 arch/x86/xen/mmu_pv.c       |  2 +-
 drivers/cpuidle/cpuidle.c   |  2 +-
 drivers/idle/intel_idle.c   |  4 ++--
 include/linux/mmu_context.h |  4 +---
 kernel/sched/sched.h        |  2 --
 7 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..03ba71420ff3 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -63,7 +63,5 @@ typedef struct {
 		.lock = __MUTEX_INITIALIZER(mm.context.lock),		\
 	}
 
-void leave_mm(int cpu);
-#define leave_mm leave_mm
 
 #endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36ce9dffb963..e502565176b9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/mmu_context.h>
 #include <linux/sched/smt.h>
 #include <linux/sched/mm.h>
 
@@ -294,28 +295,6 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 	write_cr3(new_mm_cr3);
 }
 
-void leave_mm(int cpu)
-{
-	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-
-	/*
-	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
-	 * If so, our callers still expect us to flush the TLB, but there
-	 * aren't any user TLB entries in init_mm to worry about.
-	 *
-	 * This needs to happen before any other sanity checks due to
-	 * intel_idle's shenanigans.
-	 */
-	if (loaded_mm == &init_mm)
-		return;
-
-	/* Warn if we're not lazy. */
-	WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
-
-	switch_mm(NULL, &init_mm, NULL);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
 {
@@ -512,8 +491,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * from lazy TLB mode to normal mode if active_mm isn't changing.
 	 * When this happens, we don't assume that CR3 (and hence
 	 * cpu_tlbstate.loaded_mm) matches next.
-	 *
-	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
 	 */
 
 	/* We don't want flush_tlb_func() to run concurrently with us. */
@@ -523,7 +500,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * Verify that CR3 is what we think it is.  This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
-	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * without going through switch_mm_irqs_off() or that
 	 * does something like write_cr3(read_cr3_pa()).
 	 *
 	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
@@ -732,7 +709,7 @@ temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	 * restoring the previous mm.
 	 */
 	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
-		leave_mm(smp_processor_id());
+		unlazy_mm_irqs_off();
 
 	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 3359c23573c5..ba849185810a 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -898,7 +898,7 @@ static void drop_mm_ref_this_cpu(void *info)
 	struct mm_struct *mm = info;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
-		leave_mm(smp_processor_id());
+		unlazy_mm_irqs_off();
 
 	/*
 	 * If this cpu still has a stale cr3 reference, then make sure
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ef2ea1b12cd8..b865822a6278 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -223,7 +223,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	}
 
 	if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
-		leave_mm(dev->cpu);
+		unlazy_mm_irqs_off();
 
 	/* Take note of the planned idle state. */
 	sched_idle_set_state(target_state);
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index e6c543b5ee1d..bb5d3b3e28df 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -115,8 +115,8 @@ static unsigned int mwait_substates __initdata;
  * If the local APIC timer is not known to be reliable in the target idle state,
  * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
  *
- * Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
- * flushing user TLBs.
+ * Optionally call unlazy_mm_irqs_off() for the target CPU upfront to avoid
+ * wakeups due to flushing user TLBs.
  *
  * Must be called under local_irq_disable().
  */
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index b9b970f7ab45..035e8e42eb78 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -10,9 +10,7 @@
 # define switch_mm_irqs_off switch_mm
 #endif
 
-#ifndef leave_mm
-static inline void leave_mm(int cpu) { }
-#endif
+extern void unlazy_mm_irqs_off(void);
 
 /*
  * CPUs that are capable of running user task @p. Must contain at least one
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1010e63962d9..e57121bc84d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3071,5 +3071,3 @@ extern int preempt_dynamic_mode;
 extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
-
-extern void unlazy_mm_irqs_off(void);
-- 
2.33.1

