All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86/nmi: Fix some races in NMI uaccess
@ 2018-08-29 15:47 Andy Lutomirski
  2018-08-29 16:06 ` Rik van Riel
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Andy Lutomirski @ 2018-08-29 15:47 UTC (permalink / raw)
  To: x86, Nadav Amit
  Cc: Borislav Petkov, Rik van Riel, Jann Horn, LKML, Andy Lutomirski,
	stable, Peter Zijlstra

In NMI context, we might be in the middle of context switching or in
the middle of switch_mm_irqs_off().  In either case, CR3 might not
match current->mm, which could cause copy_from_user_nmi() and
friends to read the wrong memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Cc: stable@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

Nadav, this is intended for your series.  Want to add it right
before the use_temporary_mm() stuff?

Changes from v1:
 - Improved comments
 - Add debugging
 - Fix bugs

 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 44 +++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 ++++
 arch/x86/mm/tlb.c               |  7 ++++++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..183d36a98b04 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on.  This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent.  This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,42 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON_ONCE(!loaded_mm);
+#endif
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+#endif
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e721cf2d4290..707d2b2a333f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -304,6 +304,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -334,6 +338,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (next != &init_mm)
 			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+		/* Make sure we write CR3 before loaded_mm. */
+		barrier();
+
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
-- 
2.17.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 15:47 [PATCH v2] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
@ 2018-08-29 16:06 ` Rik van Riel
  2018-08-29 18:56 ` Nadav Amit
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Rik van Riel @ 2018-08-29 16:06 UTC (permalink / raw)
  To: Andy Lutomirski, x86, Nadav Amit
  Cc: Borislav Petkov, Jann Horn, LKML, stable, Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 692 bytes --]

On Wed, 2018-08-29 at 08:47 -0700, Andy Lutomirski wrote:
> In NMI context, we might be in the middle of context switching or in
> the middle of switch_mm_irqs_off().  In either case, CR3 might not
> match current->mm, which could cause copy_from_user_nmi() and
> friends to read the wrong memory.
> 
> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> 
> Cc: stable@vger.kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Nadav Amit <nadav.amit@gmail.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>

Reviewed-by: Rik van Riel <riel@surriel.com>

-- 
All Rights Reversed.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 15:47 [PATCH v2] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
  2018-08-29 16:06 ` Rik van Riel
@ 2018-08-29 18:56 ` Nadav Amit
  2018-08-30 13:36   ` Thomas Gleixner
  2018-08-30 14:39 ` [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching tip-bot for Andy Lutomirski
  2018-08-31 15:13 ` tip-bot for Andy Lutomirski
  3 siblings, 1 reply; 14+ messages in thread
From: Nadav Amit @ 2018-08-29 18:56 UTC (permalink / raw)
  To: Andy Lutomirski, x86
  Cc: Borislav Petkov, Rik van Riel, Jann Horn, LKML, stable, Peter Zijlstra

at 8:47 AM, Andy Lutomirski <luto@kernel.org> wrote:

> In NMI context, we might be in the middle of context switching or in
> the middle of switch_mm_irqs_off().  In either case, CR3 might not
> match current->mm, which could cause copy_from_user_nmi() and
> friends to read the wrong memory.
> 
> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> 
> Cc: stable@vger.kernel.org
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Nadav Amit <nadav.amit@gmail.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
> 
> Nadav, this is intended for your series.  Want to add it right
> before the use_temporary_mm() stuff?

Sure. Thanks! I will apply the following small fix:

> +
> +#ifdef CONFIG_DEBUG_VM
> +	WARN_ON_ONCE(!loaded_mm);
> +#endif

Will be changed to VM_WARN_ON_ONCE() in the two instances.

Regards,
Nadav

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 18:56 ` Nadav Amit
@ 2018-08-30 13:36   ` Thomas Gleixner
  2018-08-30 14:21     ` Andy Lutomirski
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Gleixner @ 2018-08-30 13:36 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andy Lutomirski, x86, Borislav Petkov, Rik van Riel, Jann Horn,
	LKML, stable, Peter Zijlstra

On Wed, 29 Aug 2018, Nadav Amit wrote:
> at 8:47 AM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> > In NMI context, we might be in the middle of context switching or in
> > the middle of switch_mm_irqs_off().  In either case, CR3 might not
> > match current->mm, which could cause copy_from_user_nmi() and
> > friends to read the wrong memory.
> > 
> > Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> > copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> > 
> > Cc: stable@vger.kernel.org
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Nadav Amit <nadav.amit@gmail.com>
> > Signed-off-by: Andy Lutomirski <luto@kernel.org>
> > ---
> > 
> > Nadav, this is intended for your series.  Want to add it right
> > before the use_temporary_mm() stuff?
> 
> Sure. Thanks! I will apply the following small fix:
> 
> > +
> > +#ifdef CONFIG_DEBUG_VM
> > +	WARN_ON_ONCE(!loaded_mm);
> > +#endif
> 
> Will be changed to VM_WARN_ON_ONCE() in the two instances.

Unless I'm completely lost, this can just be applied to tip right
away. It's not depending on anything else.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-30 13:36   ` Thomas Gleixner
@ 2018-08-30 14:21     ` Andy Lutomirski
  2018-08-30 14:27       ` Thomas Gleixner
  0 siblings, 1 reply; 14+ messages in thread
From: Andy Lutomirski @ 2018-08-30 14:21 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Nadav Amit, Andy Lutomirski, x86, Borislav Petkov, Rik van Riel,
	Jann Horn, LKML, stable, Peter Zijlstra



> On Aug 30, 2018, at 6:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
>> On Wed, 29 Aug 2018, Nadav Amit wrote:
>> at 8:47 AM, Andy Lutomirski <luto@kernel.org> wrote:
>> 
>>> In NMI context, we might be in the middle of context switching or in
>>> the middle of switch_mm_irqs_off().  In either case, CR3 might not
>>> match current->mm, which could cause copy_from_user_nmi() and
>>> friends to read the wrong memory.
>>> 
>>> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
>>> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
>>> 
>>> Cc: stable@vger.kernel.org
>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>> Cc: Nadav Amit <nadav.amit@gmail.com>
>>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>>> ---
>>> 
>>> Nadav, this is intended for your series.  Want to add it right
>>> before the use_temporary_mm() stuff?
>> 
>> Sure. Thanks! I will apply the following small fix:
>> 
>>> +
>>> +#ifdef CONFIG_DEBUG_VM
>>> +    WARN_ON_ONCE(!loaded_mm);
>>> +#endif
>> 
>> Will be changed to VM_WARN_ON_ONCE() in the two instances.
> 
> Unless I'm completely lost, this can just be applied to tip right
> away. It's not depending on anything else.
> 

Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should I send a v3?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-30 14:21     ` Andy Lutomirski
@ 2018-08-30 14:27       ` Thomas Gleixner
  0 siblings, 0 replies; 14+ messages in thread
From: Thomas Gleixner @ 2018-08-30 14:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nadav Amit, Andy Lutomirski, x86, Borislav Petkov, Rik van Riel,
	Jann Horn, LKML, stable, Peter Zijlstra

On Thu, 30 Aug 2018, Andy Lutomirski wrote:
> > On Aug 30, 2018, at 6:36 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> >> On Wed, 29 Aug 2018, Nadav Amit wrote:
> >> at 8:47 AM, Andy Lutomirski <luto@kernel.org> wrote:
> >> 
> >>> In NMI context, we might be in the middle of context switching or in
> >>> the middle of switch_mm_irqs_off().  In either case, CR3 might not
> >>> match current->mm, which could cause copy_from_user_nmi() and
> >>> friends to read the wrong memory.
> >>> 
> >>> Fix it by adding a new nmi_uaccess_okay() helper and checking it in
> >>> copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> >>> 
> >>> Cc: stable@vger.kernel.org
> >>> Cc: Peter Zijlstra <peterz@infradead.org>
> >>> Cc: Nadav Amit <nadav.amit@gmail.com>
> >>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> >>> ---
> >>> 
> >>> Nadav, this is intended for your series.  Want to add it right
> >>> before the use_temporary_mm() stuff?
> >> 
> >> Sure. Thanks! I will apply the following small fix:
> >> 
> >>> +
> >>> +#ifdef CONFIG_DEBUG_VM
> >>> +    WARN_ON_ONCE(!loaded_mm);
> >>> +#endif
> >> 
> >> Will be changed to VM_WARN_ON_ONCE() in the two instances.
> > 
> > Unless I'm completely lost, this can just be applied to tip right
> > away. It's not depending on anything else.
> > 
> 
> Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should
> I send a v3?

I think I'll manage.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching
  2018-08-29 15:47 [PATCH v2] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
  2018-08-29 16:06 ` Rik van Riel
  2018-08-29 18:56 ` Nadav Amit
@ 2018-08-30 14:39 ` tip-bot for Andy Lutomirski
  2018-08-31 15:13 ` tip-bot for Andy Lutomirski
  3 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Andy Lutomirski @ 2018-08-30 14:39 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: bp, linux-kernel, luto, mingo, hpa, nadav.amit, riel, tglx,
	jannh, peterz

Commit-ID:  16f54a362e4083218ac8d67a4879532c6eef2d98
Gitweb:     https://git.kernel.org/tip/16f54a362e4083218ac8d67a4879532c6eef2d98
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Thu, 30 Aug 2018 16:31:19 +0200

x86/nmi: Fix NMI uaccess race against CR3 switching

An NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off().  In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jann Horn <jannh@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.luto@kernel.org

---
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  7 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on.  This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent.  This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (next != &init_mm)
 			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+		/* Make sure we write CR3 before loaded_mm. */
+		barrier();
+
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching
  2018-08-29 15:47 [PATCH v2] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
                   ` (2 preceding siblings ...)
  2018-08-30 14:39 ` [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching tip-bot for Andy Lutomirski
@ 2018-08-31 15:13 ` tip-bot for Andy Lutomirski
  3 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Andy Lutomirski @ 2018-08-31 15:13 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: peterz, hpa, mingo, jannh, tglx, nadav.amit, bp, luto, riel,
	linux-kernel

Commit-ID:  4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Gitweb:     https://git.kernel.org/tip/4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 31 Aug 2018 17:08:22 +0200

x86/nmi: Fix NMI uaccess race against CR3 switching

An NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off().  In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jann Horn <jannh@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.luto@kernel.org


---
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  7 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
 	 * are on.  This means that it may not match current->active_mm,
 	 * which will contain the previous user mm when we're in lazy TLB
 	 * mode even if we've already switched back to swapper_pg_dir.
+	 *
+	 * During switch_mm_irqs_off(), loaded_mm will be set to
+	 * LOADED_MM_SWITCHING during the brief interrupts-off window
+	 * when CR3 and loaded_mm would otherwise be inconsistent.  This
+	 * is for nmi_uaccess_okay()'s benefit.
 	 */
 	struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
 	/* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (next != &init_mm)
 			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
+		/* Make sure we write CR3 before loaded_mm. */
+		barrier();
+
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 15:49         ` Rik van Riel
@ 2018-08-29 16:14           ` Andy Lutomirski
  0 siblings, 0 replies; 14+ messages in thread
From: Andy Lutomirski @ 2018-08-29 16:14 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andy Lutomirski, X86 ML, Borislav Petkov, Jann Horn, LKML,
	stable, Peter Zijlstra, Nadav Amit

On Wed, Aug 29, 2018 at 8:49 AM, Rik van Riel <riel@surriel.com> wrote:
> On Wed, 2018-08-29 at 08:36 -0700, Andy Lutomirski wrote:
>> On Wed, Aug 29, 2018 at 8:17 AM, Rik van Riel <riel@surriel.com>
>> wrote:
>> > On Tue, 2018-08-28 at 20:46 -0700, Andy Lutomirski wrote:
>> > > On Tue, Aug 28, 2018 at 10:56 AM, Rik van Riel <riel@surriel.com>
>> > > wrote:
>> > > > On Mon, 27 Aug 2018 16:04:16 -0700
>> > > > Andy Lutomirski <luto@kernel.org> wrote:
>> > > >
>> > > > > The 0day bot is still chewing on this, but I've tested it a
>> > > > > bit
>> > > > > locally
>> > > > > and it seems to do the right thing.
>> > > >
>> > > > Hi Andy,
>> > > >
>> > > > the version of the patch below should fix the bug we talked
>> > > > about
>> > > > in email yesterday.  It should automatically cover kernel
>> > > > threads
>> > > > in lazy TLB mode, because current->mm will be NULL, while the
>> > > > cpu_tlbstate.loaded_mm should never be NULL.
>> > > >
>> > >
>> > > That's better than mine.  I tweaked it a bit and added some
>> > > debugging,
>> > > and I got this:
>> > >
>> > >
>> >
>> >
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=dd956eba16646fd0b15c3c0741269dfd84452dac
>> > >
>> > > I made the loaded_mm handling a little more conservative to make
>> > > it
>> > > more obvious that switch_mm_irqs_off() is safe regardless of
>> > > exactly
>> > > when it gets called relative to switching current.
>> >
>> > I am not convinced that the dance of writing
>> > cpu_tlbstate.loaded_mm twice, with a barrier on
>> > each end, is useful or necessary.
>> >
>> > At the time switch_mm_irqs_off() returns, nmi_uaccess_okay()
>> > will still return false, because we have not switched
>> > "current" to the task that owns the next mm_struct yet.
>> >
>> > We just have to make sure to:
>> > 1) Change cpu_tlbstate.loaded_mm before we manipulate
>> >    CR3, and
>> > 2) Change "current" only once enough of the mm stuff has
>> >    been switched, __switch_to seems to get that right.
>> >
>> > Between the time switch_mm_irqs_off() sets
>> > cpu_tlbstate.loaded_mm to the next mm, and __switch_to()
>> > moves over current, nmi_uaccess_okay() will return false.
>>
>> All true, but I think it stops working as soon as someone starts
>> calling switch_mm_irqs_off() for some other reason, such as during
>> text_poke().  And that was the original motivation for this patch.
>
> How does calling switch_mm_irqs_off() for text_poke()
> change around current->mm and cpu_tlbstate.loaded_mm?
>
> Does current->mm stay the same throughout the entire
> text_poke() chain, while cpu_tlbstate.loaded_mm is the
> only thing that is changed out?

This is exactly what happens.  It seemed considerably more complicated
and error-prone to fiddle with current->mm.  Instead the idea was to
turn off IRQs, get NMI to stay out of the way, and put everything back
the way it was before the scheduler, any syscalls, etc could notice
that we were messing around.

--Andy

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 15:36       ` Andy Lutomirski
@ 2018-08-29 15:49         ` Rik van Riel
  2018-08-29 16:14           ` Andy Lutomirski
  0 siblings, 1 reply; 14+ messages in thread
From: Rik van Riel @ 2018-08-29 15:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Borislav Petkov, Jann Horn, LKML, stable, Peter Zijlstra,
	Nadav Amit

[-- Attachment #1: Type: text/plain, Size: 2698 bytes --]

On Wed, 2018-08-29 at 08:36 -0700, Andy Lutomirski wrote:
> On Wed, Aug 29, 2018 at 8:17 AM, Rik van Riel <riel@surriel.com>
> wrote:
> > On Tue, 2018-08-28 at 20:46 -0700, Andy Lutomirski wrote:
> > > On Tue, Aug 28, 2018 at 10:56 AM, Rik van Riel <riel@surriel.com>
> > > wrote:
> > > > On Mon, 27 Aug 2018 16:04:16 -0700
> > > > Andy Lutomirski <luto@kernel.org> wrote:
> > > > 
> > > > > The 0day bot is still chewing on this, but I've tested it a
> > > > > bit
> > > > > locally
> > > > > and it seems to do the right thing.
> > > > 
> > > > Hi Andy,
> > > > 
> > > > the version of the patch below should fix the bug we talked
> > > > about
> > > > in email yesterday.  It should automatically cover kernel
> > > > threads
> > > > in lazy TLB mode, because current->mm will be NULL, while the
> > > > cpu_tlbstate.loaded_mm should never be NULL.
> > > > 
> > > 
> > > That's better than mine.  I tweaked it a bit and added some
> > > debugging,
> > > and I got this:
> > > 
> > > 
> > 
> > 
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=dd956eba16646fd0b15c3c0741269dfd84452dac
> > > 
> > > I made the loaded_mm handling a little more conservative to make
> > > it
> > > more obvious that switch_mm_irqs_off() is safe regardless of
> > > exactly
> > > when it gets called relative to switching current.
> > 
> > I am not convinced that the dance of writing
> > cpu_tlbstate.loaded_mm twice, with a barrier on
> > each end, is useful or necessary.
> > 
> > At the time switch_mm_irqs_off() returns, nmi_uaccess_okay()
> > will still return false, because we have not switched
> > "current" to the task that owns the next mm_struct yet.
> > 
> > We just have to make sure to:
> > 1) Change cpu_tlbstate.loaded_mm before we manipulate
> >    CR3, and
> > 2) Change "current" only once enough of the mm stuff has
> >    been switched, __switch_to seems to get that right.
> > 
> > Between the time switch_mm_irqs_off() sets
> > cpu_tlbstate.loaded_mm to the next mm, and __switch_to()
> > moves over current, nmi_uaccess_okay() will return false.
> 
> All true, but I think it stops working as soon as someone starts
> calling switch_mm_irqs_off() for some other reason, such as during
> text_poke().  And that was the original motivation for this patch.

How does calling switch_mm_irqs_off() for text_poke()
change around current->mm and cpu_tlbstate.loaded_mm?

Does current->mm stay the same throughout the entire
text_poke() chain, while cpu_tlbstate.loaded_mm is the
only thing that is changed out?

If so, then yes the double assignment is indeed
necessary. Good point.

-- 
All Rights Reversed.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29 15:17     ` Rik van Riel
@ 2018-08-29 15:36       ` Andy Lutomirski
  2018-08-29 15:49         ` Rik van Riel
  0 siblings, 1 reply; 14+ messages in thread
From: Andy Lutomirski @ 2018-08-29 15:36 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andy Lutomirski, X86 ML, Borislav Petkov, Jann Horn, LKML,
	stable, Peter Zijlstra, Nadav Amit

On Wed, Aug 29, 2018 at 8:17 AM, Rik van Riel <riel@surriel.com> wrote:
> On Tue, 2018-08-28 at 20:46 -0700, Andy Lutomirski wrote:
>> On Tue, Aug 28, 2018 at 10:56 AM, Rik van Riel <riel@surriel.com>
>> wrote:
>> > On Mon, 27 Aug 2018 16:04:16 -0700
>> > Andy Lutomirski <luto@kernel.org> wrote:
>> >
>> > > The 0day bot is still chewing on this, but I've tested it a bit
>> > > locally
>> > > and it seems to do the right thing.
>> >
>> > Hi Andy,
>> >
>> > the version of the patch below should fix the bug we talked about
>> > in email yesterday.  It should automatically cover kernel threads
>> > in lazy TLB mode, because current->mm will be NULL, while the
>> > cpu_tlbstate.loaded_mm should never be NULL.
>> >
>>
>> That's better than mine.  I tweaked it a bit and added some
>> debugging,
>> and I got this:
>>
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=dd956eba16646fd0b15c3c0741269dfd84452dac
>>
>> I made the loaded_mm handling a little more conservative to make it
>> more obvious that switch_mm_irqs_off() is safe regardless of exactly
>> when it gets called relative to switching current.
>
> I am not convinced that the dance of writing
> cpu_tlbstate.loaded_mm twice, with a barrier on
> each end, is useful or necessary.
>
> At the time switch_mm_irqs_off() returns, nmi_uaccess_okay()
> will still return false, because we have not switched
> "current" to the task that owns the next mm_struct yet.
>
> We just have to make sure to:
> 1) Change cpu_tlbstate.loaded_mm before we manipulate
>    CR3, and
> 2) Change "current" only once enough of the mm stuff has
>    been switched, __switch_to seems to get that right.
>
> Between the time switch_mm_irqs_off() sets
> cpu_tlbstate.loaded_mm to the next mm, and __switch_to()
> moves current over, nmi_uaccess_okay() will return false.

All true, but I think it stops working as soon as someone starts
calling switch_mm_irqs_off() for some other reason, such as during
text_poke().  And that was the original motivation for this patch.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-29  3:46   ` Andy Lutomirski
@ 2018-08-29 15:17     ` Rik van Riel
  2018-08-29 15:36       ` Andy Lutomirski
  0 siblings, 1 reply; 14+ messages in thread
From: Rik van Riel @ 2018-08-29 15:17 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, Borislav Petkov, Jann Horn, LKML, stable, Peter Zijlstra,
	Nadav Amit

[-- Attachment #1: Type: text/plain, Size: 1786 bytes --]

On Tue, 2018-08-28 at 20:46 -0700, Andy Lutomirski wrote:
> On Tue, Aug 28, 2018 at 10:56 AM, Rik van Riel <riel@surriel.com>
> wrote:
> > On Mon, 27 Aug 2018 16:04:16 -0700
> > Andy Lutomirski <luto@kernel.org> wrote:
> > 
> > > The 0day bot is still chewing on this, but I've tested it a bit
> > > locally
> > > and it seems to do the right thing.
> > 
> > Hi Andy,
> > 
> > the version of the patch below should fix the bug we talked about
> > in email yesterday.  It should automatically cover kernel threads
> > in lazy TLB mode, because current->mm will be NULL, while the
> > cpu_tlbstate.loaded_mm should never be NULL.
> > 
> 
> That's better than mine.  I tweaked it a bit and added some
> debugging,
> and I got this:
> 
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=dd956eba16646fd0b15c3c0741269dfd84452dac
> 
> I made the loaded_mm handling a little more conservative to make it
> more obvious that switch_mm_irqs_off() is safe regardless of exactly
> when it gets called relative to switching current.

I am not convinced that the dance of writing
cpu_tlbstate.loaded_mm twice, with a barrier on
each end, is useful or necessary.

At the time switch_mm_irqs_off() returns, nmi_uaccess_okay()
will still return false, because we have not switched
"current" to the task that owns the next mm_struct yet.

We just have to make sure to:
1) Change cpu_tlbstate.loaded_mm before we manipulate
   CR3, and
2) Change "current" only once enough of the mm stuff has
   been switched, __switch_to seems to get that right.

Between the time switch_mm_irqs_off() sets
cpu_tlbstate.loaded_mm to the next mm, and __switch_to()
moves current over, nmi_uaccess_okay() will return false.

-- 
All Rights Reversed.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-28 17:56 ` [PATCH v2] " Rik van Riel
@ 2018-08-29  3:46   ` Andy Lutomirski
  2018-08-29 15:17     ` Rik van Riel
  0 siblings, 1 reply; 14+ messages in thread
From: Andy Lutomirski @ 2018-08-29  3:46 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andy Lutomirski, X86 ML, Borislav Petkov, Jann Horn, LKML,
	stable, Peter Zijlstra, Nadav Amit

On Tue, Aug 28, 2018 at 10:56 AM, Rik van Riel <riel@surriel.com> wrote:
> On Mon, 27 Aug 2018 16:04:16 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
>
>> The 0day bot is still chewing on this, but I've tested it a bit locally
>> and it seems to do the right thing.
>
> Hi Andy,
>
> the version of the patch below should fix the bug we talked about
> in email yesterday.  It should automatically cover kernel threads
> in lazy TLB mode, because current->mm will be NULL, while the
> cpu_tlbstate.loaded_mm should never be NULL.
>

That's better than mine.  I tweaked it a bit and added some debugging,
and I got this:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/fixes&id=dd956eba16646fd0b15c3c0741269dfd84452dac

I made the loaded_mm handling a little more conservative to make it
more obvious that switch_mm_irqs_off() is safe regardless of exactly
when it gets called relative to switching current.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] x86/nmi: Fix some races in NMI uaccess
  2018-08-27 23:04 [PATCH] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
@ 2018-08-28 17:56 ` Rik van Riel
  2018-08-29  3:46   ` Andy Lutomirski
  0 siblings, 1 reply; 14+ messages in thread
From: Rik van Riel @ 2018-08-28 17:56 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: x86, Borislav Petkov, Jann Horn, LKML, stable, Peter Zijlstra,
	Nadav Amit

On Mon, 27 Aug 2018 16:04:16 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> The 0day bot is still chewing on this, but I've tested it a bit locally
> and it seems to do the right thing.

Hi Andy,

the version of the patch below should fix the bug we talked about
in email yesterday.  It should automatically cover kernel threads
in lazy TLB mode, because current->mm will be NULL, while the
cpu_tlbstate.loaded_mm should never be NULL.

---8<---
From: Andy Lutomirski <luto@kernel.org>
Subject: x86/nmi: Fix some races in NMI uaccess

In NMI context, we might be in the middle of context switching or in
the middle of switch_mm_irqs_off().  In either case, CR3 might not
match current->mm, which could cause copy_from_user_nmi() and
friends to read the wrong memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Cc: stable@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 15 +++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  9 ++++++++-
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	perf_callchain_store(entry, regs->ip);
 
-	if (!current->mm)
+	if (!nmi_uaccess_okay())
 		return;
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..dafe649b18e1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -246,6 +246,21 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	return (loaded_mm == current_mm);
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	if (__range_not_ok(from, n, TASK_SIZE))
 		return n;
 
+	if (!nmi_uaccess_okay())
+		return n;
+
 	/*
 	 * Even though this function is typically called from NMI/IRQ context
 	 * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..5b75e2fed2b6 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+		/*
+		 * Ensure cpu_tlbstate.loaded_mm differs from current->mm
+		 * until the context switch is complete, so NMI handlers
+		 * do not try to access userspace. See nmi_uaccess_okay().
+		 */
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		smp_wmb();
+
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -335,7 +343,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (next != &init_mm)
 			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-08-31 15:15 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-29 15:47 [PATCH v2] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
2018-08-29 16:06 ` Rik van Riel
2018-08-29 18:56 ` Nadav Amit
2018-08-30 13:36   ` Thomas Gleixner
2018-08-30 14:21     ` Andy Lutomirski
2018-08-30 14:27       ` Thomas Gleixner
2018-08-30 14:39 ` [tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching tip-bot for Andy Lutomirski
2018-08-31 15:13 ` tip-bot for Andy Lutomirski
  -- strict thread matches above, loose matches on Subject: below --
2018-08-27 23:04 [PATCH] x86/nmi: Fix some races in NMI uaccess Andy Lutomirski
2018-08-28 17:56 ` [PATCH v2] " Rik van Riel
2018-08-29  3:46   ` Andy Lutomirski
2018-08-29 15:17     ` Rik van Riel
2018-08-29 15:36       ` Andy Lutomirski
2018-08-29 15:49         ` Rik van Riel
2018-08-29 16:14           ` Andy Lutomirski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.