From: Alexandre Chartre <alexandre.chartre@oracle.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	hpa@zytor.com, x86@kernel.org, dave.hansen@linux.intel.com,
	luto@kernel.org, peterz@infradead.org,
	linux-kernel@vger.kernel.org, thomas.lendacky@amd.com,
	jroedel@suse.de
Cc: konrad.wilk@oracle.com, jan.setjeeilers@oracle.com,
	junaids@google.com, oweisse@google.com, rppt@linux.vnet.ibm.com,
	graf@amazon.de, mgross@linux.intel.com, kuzuno@gmail.com,
	alexandre.chartre@oracle.com
Subject: [RFC][PATCH v2 19/21] x86/pti: Defer CR3 switch to C code for IST entries
Date: Mon, 16 Nov 2020 15:47:55 +0100
Message-ID: <20201116144757.1920077-20-alexandre.chartre@oracle.com>
In-Reply-To: <20201116144757.1920077-1-alexandre.chartre@oracle.com>

IST entries from the kernel use the paranoid entry and exit
assembly functions to ensure that the CR3 and GS registers are
updated with values that are correct for the kernel. Move the
CR3 update out of the assembly entry code and into the C code
of the IST handlers.
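
For illustration, the C helpers used by the handlers below could
look roughly like the minimal sketch that follows. This is only a
sketch: the actual save_and_switch_to_kernel_cr3() and restore_cr3()
definitions are introduced by an earlier patch in this series, and
they also deal with PCID and the NOFLUSH bit, which are omitted
here.

	/*
	 * Sketch only: bit PAGE_SHIFT of CR3 selects the user half of
	 * the PTI page tables; the real mask comes from calling.h.
	 */
	#define PTI_USER_PGTABLE_MASK	(1UL << PAGE_SHIFT)

	static __always_inline unsigned long save_and_switch_to_kernel_cr3(void)
	{
		unsigned long cr3;

		if (!boot_cpu_has(X86_FEATURE_PTI))
			return 0;

		/*
		 * Read the current CR3 and, if the interrupted context
		 * was running on the user page table, switch to the
		 * kernel one by clearing the user bit.
		 */
		cr3 = __read_cr3();
		if (cr3 & PTI_USER_PGTABLE_MASK)
			native_write_cr3(cr3 & ~PTI_USER_PGTABLE_MASK);

		return cr3;
	}

	static __always_inline void restore_cr3(unsigned long cr3)
	{
		/*
		 * Only write CR3 back if the saved value was a user
		 * CR3; otherwise the kernel CR3 is already loaded.
		 */
		if (cr3 & PTI_USER_PGTABLE_MASK)
			native_write_cr3(cr3);
	}

With such helpers, each IST handler brackets its body with
save_and_switch_to_kernel_cr3()/restore_cr3(), as done below for
#MC, NMI, #VC, #DF and #DB.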

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
---
 arch/x86/entry/entry_64.S      | 34 ++--------------------------------
 arch/x86/kernel/cpu/mce/core.c |  3 +++
 arch/x86/kernel/nmi.c          | 18 +++++++++++++++---
 arch/x86/kernel/sev-es.c       | 13 ++++++++++++-
 arch/x86/kernel/traps.c        | 30 ++++++++++++++++++++++++++----
 5 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6b88a0eb8975..1715bc0cefff 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -900,23 +900,6 @@ SYM_CODE_START_LOCAL(paranoid_entry)
 	PUSH_AND_CLEAR_REGS save_ret=1
 	ENCODE_FRAME_POINTER 8
 
-	/*
-	 * Always stash CR3 in %r14.  This value will be restored,
-	 * verbatim, at exit.  Needed if paranoid_entry interrupted
-	 * another entry that already switched to the user CR3 value
-	 * but has not yet returned to userspace.
-	 *
-	 * This is also why CS (stashed in the "iret frame" by the
-	 * hardware at entry) can not be used: this may be a return
-	 * to kernel code, but with a user CR3 value.
-	 *
-	 * Switching CR3 does not depend on kernel GSBASE so it can
-	 * be done before switching to the kernel GSBASE. This is
-	 * required for FSGSBASE because the kernel GSBASE has to
-	 * be retrieved from a kernel internal table.
-	 */
-	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
-
 	/*
 	 * Handling GSBASE depends on the availability of FSGSBASE.
 	 *
@@ -956,9 +939,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
 	SWAPGS
 
 	/*
-	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
-	 * unconditional CR3 write, even in the PTI case.  So do an lfence
-	 * to prevent GS speculation, regardless of whether PTI is enabled.
+	 * Do an lfence to prevent GS speculation.
 	 */
 	FENCE_SWAPGS_KERNEL_ENTRY
 
@@ -989,14 +970,10 @@ SYM_CODE_END(paranoid_entry)
 SYM_CODE_START_LOCAL(paranoid_exit)
 	UNWIND_HINT_REGS
 	/*
-	 * The order of operations is important. RESTORE_CR3 requires
-	 * kernel GSBASE.
-	 *
 	 * NB to anyone to try to optimize this code: this code does
 	 * not execute at all for exceptions from user mode. Those
 	 * exceptions go through error_exit instead.
 	 */
-	RESTORE_CR3	scratch_reg=%rax save_reg=%r14
 
 	/* Handle the three GSBASE cases */
 	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
@@ -1119,10 +1096,6 @@ SYM_CODE_END(error_return)
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
- *
- * Registers:
- *	%r14: Used to save/restore the CR3 of the interrupted context
- *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
 SYM_CODE_START(asm_exc_nmi)
 	/*
@@ -1173,7 +1146,7 @@ SYM_CODE_START(asm_exc_nmi)
 	 * We also must not push anything to the stack before switching
 	 * stacks lest we corrupt the "NMI executing" variable.
 	 */
-	ist_entry_user exc_nmi
+	ist_entry_user exc_nmi_user
 
 	/* NMI from kernel */
 
@@ -1385,9 +1358,6 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	call	exc_nmi
 
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
-
 	/*
 	 * The above invocation of paranoid_entry stored the GSBASE
 	 * related information in R/EBX depending on the availability
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 9407c3cd9355..31ac01c1155d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2022,11 +2022,14 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs)
 /* MCE hit kernel mode */
 DEFINE_IDTENTRY_MCE(exc_machine_check)
 {
+	unsigned long saved_cr3;
 	unsigned long dr7;
 
+	saved_cr3 = save_and_switch_to_kernel_cr3();
 	dr7 = local_db_save();
 	exc_machine_check_kernel(regs);
 	local_db_restore(dr7);
+	restore_cr3(saved_cr3);
 }
 
 /* The user mode variant. */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be0f654c3095..523d88c3fea1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -473,7 +473,7 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 static DEFINE_PER_CPU(unsigned long, nmi_cr2);
 static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 
-DEFINE_IDTENTRY_RAW(exc_nmi)
+static noinstr void handle_nmi(struct pt_regs *regs)
 {
 	bool irq_state;
 
@@ -529,9 +529,21 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 		write_cr2(this_cpu_read(nmi_cr2));
 	if (this_cpu_dec_return(nmi_state))
 		goto nmi_restart;
+}
+
+DEFINE_IDTENTRY_NMI(exc_nmi)
+{
+	unsigned long saved_cr3;
 
-	if (user_mode(regs))
-		mds_user_clear_cpu_buffers();
+	saved_cr3 = save_and_switch_to_kernel_cr3();
+	handle_nmi(regs);
+	restore_cr3(saved_cr3);
+}
+
+__visible noinstr void exc_nmi_user(struct pt_regs *regs)
+{
+	handle_nmi(regs);
+	mds_user_clear_cpu_buffers();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index bd977c917cd6..59fc7c472525 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -1352,13 +1352,18 @@ DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
 struct exc_vc_frame {
 	/* pt_regs should be first */
 	struct pt_regs regs;
+	/* extra parameters for the handler */
+	unsigned long saved_cr3;
 };
 
 DEFINE_IDTENTRY_VC_SETUP_STACK(exc_vmm_communication)
 {
 	struct exc_vc_frame *frame;
+	unsigned long saved_cr3;
 	unsigned long sp;
 
+	saved_cr3 = save_and_switch_to_kernel_cr3();
+
 	/*
 	 * Switch off the IST stack to make it free for nested exceptions.
 	 * The vc_switch_off_ist() function will switch back to the
@@ -1370,7 +1375,8 @@ DEFINE_IDTENTRY_VC_SETUP_STACK(exc_vmm_communication)
 	/*
 	 * Found a safe stack. Set it up as if the entry has happened on
 	 * that stack. This means that we need to have pt_regs at the top
-	 * of the stack.
+	 * of the stack, and we can use the bottom of the stack to pass
+	 * extra parameters (like the saved cr3 value) to the handler.
 	 *
 	 * The effective stack switch happens in assembly code before
 	 * the #VC handler is called.
@@ -1379,16 +1385,21 @@ DEFINE_IDTENTRY_VC_SETUP_STACK(exc_vmm_communication)
 
 	frame = (struct exc_vc_frame *)sp;
 	frame->regs = *regs;
+	frame->saved_cr3 = saved_cr3;
 
 	return sp;
 }
 
 DEFINE_IDTENTRY_VC(exc_vmm_communication)
 {
+	struct exc_vc_frame *frame = (struct exc_vc_frame *)regs;
+
 	if (likely(!on_vc_fallback_stack(regs)))
 		safe_stack_exc_vmm_communication(regs, error_code);
 	else
 		ist_exc_vmm_communication(regs, error_code);
+
+	restore_cr3(frame->saved_cr3);
 }
 
 bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9a51aa016fb3..14d2d6f15184 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -344,10 +344,10 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 DEFINE_IDTENTRY_DF(exc_double_fault)
 {
 	static const char str[] = "double fault";
-	struct task_struct *tsk = current;
-
+	struct task_struct *tsk;
+	unsigned long saved_cr3;
 #ifdef CONFIG_VMAP_STACK
-	unsigned long address = read_cr2();
+	unsigned long address;
 #endif
 
 #ifdef CONFIG_X86_ESPFIX64
@@ -371,8 +371,12 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 		regs->cs == __KERNEL_CS &&
 		regs->ip == (unsigned long)native_irq_return_iret)
 	{
-		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 		unsigned long *p = (unsigned long *)regs->sp;
+		struct pt_regs *gpregs;
+
+		saved_cr3 = save_and_switch_to_kernel_cr3();
+
+		gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
 		/*
 		 * regs->sp points to the failing IRET frame on the
@@ -401,14 +405,28 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 		regs->ip = (unsigned long)asm_exc_general_protection;
 		regs->sp = (unsigned long)&gpregs->orig_ax;
 
+		restore_cr3(saved_cr3);
+
 		return;
 	}
 #endif
 
+	/*
+	 * Switch to the kernel page-table. We are on an IST stack, and
+	 * we are going to die so there is no need to switch to the kernel
+	 * stack even if we are coming from userspace.
+	 */
+	saved_cr3 = save_and_switch_to_kernel_cr3();
+
+#ifdef CONFIG_VMAP_STACK
+	address = read_cr2();
+#endif
+
 	idtentry_enter_nmi(regs);
 	instrumentation_begin();
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
+	tsk = current;
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_DF;
 
@@ -973,7 +991,11 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 /* IST stack entry */
 DEFINE_IDTENTRY_DEBUG(exc_debug)
 {
+	unsigned long saved_cr3;
+
+	saved_cr3 = save_and_switch_to_kernel_cr3();
 	exc_debug_kernel(regs, debug_read_clear_dr6());
+	restore_cr3(saved_cr3);
 }
 
 /* User entry, runs on regular task stack */
-- 
2.18.4

