linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 0/3] x86: Clean up fast syscall return validation
@ 2023-10-11 22:43 Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Brian Gerst @ 2023-10-11 22:43 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, H . Peter Anvin,
	Andy Lutomirski, Brian Gerst

This patch set cleans up the tests done to determine if a fast syscall
return instruction can be used to return to userspace.  It converts the
code to C, and refactors existing code to be more readable.

v3:
 - Remove patches already applied to -tip.
 - Keep Xen alternatives on the asm side to skip testing the return
   value.
 - Add patch to simplify canonical-RIP test.

v2:
 - Fix shift value for canonical RIP test and use
   __is_canonical_address()

Brian Gerst (3):
  x86/entry/64: Convert SYSRET validation tests to C
  x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
  x86/entry/32: Clean up syscall fast exit tests

 arch/x86/entry/common.c        | 91 ++++++++++++++++++++++++----------
 arch/x86/entry/entry_64.S      | 53 +-------------------
 arch/x86/include/asm/syscall.h |  2 +-
 3 files changed, 67 insertions(+), 79 deletions(-)

-- 
2.41.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C
  2023-10-11 22:43 [PATCH v3 0/3] x86: Clean up fast syscall return validation Brian Gerst
@ 2023-10-11 22:43 ` Brian Gerst
  2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
  2023-10-13 12:39   ` [PATCH v3 1/3] " Nikolay Borisov
  2023-10-11 22:43 ` [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests Brian Gerst
  2 siblings, 2 replies; 9+ messages in thread
From: Brian Gerst @ 2023-10-11 22:43 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, H . Peter Anvin,
	Andy Lutomirski, Brian Gerst

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/entry/common.c        | 43 ++++++++++++++++++++++++++-
 arch/x86/entry/entry_64.S      | 53 ++--------------------------------
 arch/x86/include/asm/syscall.h |  2 +-
 3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0551bcb197fb..207149a0a9b3 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 	return false;
 }
 
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 {
 	add_random_kstack_offset();
 	nr = syscall_enter_from_user_mode(regs, nr);
@@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
+
+	/*
+	 * Check that the register state is valid for using SYSRET to exit
+	 * to userspace.  Otherwise use the slower but fully capable IRET
+	 * exit path.
+	 */
+
+	/* XEN PV guests always use IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
+	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+		return false;
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.
+	 *
+	 * Change top bits to match most significant bit (47th or 56th bit
+	 * depending on paging mode) in the address.
+	 */
+	if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+		return false;
+
+	/*
+	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.
+	 */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+		return false;
+
+	/* Use SYSRET to exit to userspace */
+	return true;
 }
 #endif
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bdc22d7e78f..de6469dffe3a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	 * In the Xen PV case we must use iret anyway.
 	 */
 
-	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
-		X86_FEATURE_XENPV
-
-	movq	RCX(%rsp), %rcx
-	movq	RIP(%rsp), %r11
-
-	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
-	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.
-	 *
-	 * If width of "canonical tail" ever becomes variable, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 *
-	 * Change top bits to match most significant bit (47th or 56th bit
-	 * depending on paging mode) in the address.
-	 */
-#ifdef CONFIG_X86_5LEVEL
-	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
-		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
-	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
-	/* If this changed %rcx, it was not canonical */
-	cmpq	%rcx, %r11
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	movq	R11(%rsp), %r11
-	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.
-	 */
-	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	swapgs_restore_regs_and_return_to_usermode
-
-	/* nothing to check for RSP */
-
-	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 
 	/*
 	 * We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c7e25c940f1a..f44e2f9ab65d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
 		? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 }
 
-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);
 
 #endif	/* CONFIG_X86_32 */
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
  2023-10-11 22:43 [PATCH v3 0/3] x86: Clean up fast syscall return validation Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
@ 2023-10-11 22:43 ` Brian Gerst
  2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests Brian Gerst
  2 siblings, 1 reply; 9+ messages in thread
From: Brian Gerst @ 2023-10-11 22:43 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, H . Peter Anvin,
	Andy Lutomirski, Brian Gerst

Using shifts to determine if an address is canonical is difficult for
the compiler to optimize when the virtual address width is variable
(LA57 feature) without using inline assembly.  Instead, compare RIP
against TASK_SIZE_MAX.  The only user executable address outside of that
range is the deprecated vsyscall page, which can fall back to using IRET.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/entry/common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 207149a0a9b3..e3d6f255379f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -110,10 +110,10 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 	 * in kernel space.  This essentially lets the user take over
 	 * the kernel, since userspace controls RSP.
 	 *
-	 * Change top bits to match most significant bit (47th or 56th bit
-	 * depending on paging mode) in the address.
+	 * TASK_SIZE_MAX covers all user-accessible addresses other than
+	 * the deprecated vsyscall page.
 	 */
-	if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+	if (unlikely(regs->ip >= TASK_SIZE_MAX))
 		return false;
 
 	/*
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests
  2023-10-11 22:43 [PATCH v3 0/3] x86: Clean up fast syscall return validation Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
  2023-10-11 22:43 ` [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test Brian Gerst
@ 2023-10-11 22:43 ` Brian Gerst
  2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
  2 siblings, 1 reply; 9+ messages in thread
From: Brian Gerst @ 2023-10-11 22:43 UTC (permalink / raw)
  To: linux-kernel, x86
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, H . Peter Anvin,
	Andy Lutomirski, Brian Gerst

Merge compat and native code and clarify comments.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/entry/common.c | 48 +++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index e3d6f255379f..0acf35d7fe55 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -255,34 +255,30 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
 	if (!__do_fast_syscall_32(regs))
 		return false;
 
-#ifdef CONFIG_X86_64
 	/*
-	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
-	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
-	 * bother with SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
+	 * Check that the register state is valid for using SYSRETL/SYSEXIT
+	 * to exit to userspace.  Otherwise use the slower but fully capable
+	 * IRET exit path.
 	 */
-	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
-	/*
-	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
-	 *
-	 * We don't allow syscalls at all from VM86 mode, but we still
-	 * need to check VM, because we might be returning from sys_vm86.
-	 */
-	return regs->cs == __USER_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+	/* XEN PV guests always use IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* EIP must point to the VDSO landing pad */
+	if (unlikely(regs->ip != landing_pad))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+		return false;
+
+	/* If the TF, RF, or VM flags are set, use IRET */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+		return false;
+
+	/* Use SYSRETL/SYSEXIT to exit to userspace */
+	return true;
 }
 
 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/entry] x86/entry/32: Clean up syscall fast exit tests
  2023-10-11 22:43 ` [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests Brian Gerst
@ 2023-10-13 11:18   ` tip-bot2 for Brian Gerst
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Brian Gerst @ 2023-10-13 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Brian Gerst, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Peter Zijlstra,
	Thomas Gleixner, Josh Poimboeuf, Uros Bizjak, x86, linux-kernel

The following commit has been merged into the x86/entry branch of tip:

Commit-ID:     1a09a27153f91cd7676b2d4ca574577572a8c999
Gitweb:        https://git.kernel.org/tip/1a09a27153f91cd7676b2d4ca574577572a8c999
Author:        Brian Gerst <brgerst@gmail.com>
AuthorDate:    Wed, 11 Oct 2023 18:43:51 -04:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/32: Clean up syscall fast exit tests

Merge compat and native code and clarify comments.

No change in functionality expected.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20231011224351.130935-4-brgerst@gmail.com
---
 arch/x86/entry/common.c | 48 ++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4c7154d..d813160 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -255,34 +255,30 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
 	if (!__do_fast_syscall_32(regs))
 		return false;
 
-#ifdef CONFIG_X86_64
 	/*
-	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
-	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
-	 * bother with SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
+	 * Check that the register state is valid for using SYSRETL/SYSEXIT
+	 * to exit to userspace.  Otherwise use the slower but fully capable
+	 * IRET exit path.
 	 */
-	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
-	/*
-	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
-	 *
-	 * We don't allow syscalls at all from VM86 mode, but we still
-	 * need to check VM, because we might be returning from sys_vm86.
-	 */
-	return regs->cs == __USER_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+	/* XEN PV guests always use the IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* EIP must point to the VDSO landing pad */
+	if (unlikely(regs->ip != landing_pad))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+		return false;
+
+	/* If the TF, RF, or VM flags are set, use IRET */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+		return false;
+
+	/* Use SYSRETL/SYSEXIT to exit to userspace */
+	return true;
 }
 
 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/entry] x86/entry/64: Convert SYSRET validation tests to C
  2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
@ 2023-10-13 11:18   ` tip-bot2 for Brian Gerst
  2023-10-13 12:39   ` [PATCH v3 1/3] " Nikolay Borisov
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot2 for Brian Gerst @ 2023-10-13 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Brian Gerst, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Peter Zijlstra,
	Thomas Gleixner, Josh Poimboeuf, Uros Bizjak, x86, linux-kernel

The following commit has been merged into the x86/entry branch of tip:

Commit-ID:     ca282b486a570a0bfda5c1a4595ace7fa14243bf
Gitweb:        https://git.kernel.org/tip/ca282b486a570a0bfda5c1a4595ace7fa14243bf
Author:        Brian Gerst <brgerst@gmail.com>
AuthorDate:    Wed, 11 Oct 2023 18:43:49 -04:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/64: Convert SYSRET validation tests to C

No change in functionality expected.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20231011224351.130935-2-brgerst@gmail.com
---
 arch/x86/entry/common.c        | 43 ++++++++++++++++++++++++++-
 arch/x86/entry/entry_64.S      | 53 +--------------------------------
 arch/x86/include/asm/syscall.h |  2 +-
 3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0551bcb..9021465 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 	return false;
 }
 
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 {
 	add_random_kstack_offset();
 	nr = syscall_enter_from_user_mode(regs, nr);
@@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
+
+	/*
+	 * Check that the register state is valid for using SYSRET to exit
+	 * to userspace.  Otherwise use the slower but fully capable IRET
+	 * exit path.
+	 */
+
+	/* XEN PV guests always use the IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
+	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+		return false;
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.
+	 *
+	 * Change top bits to match the most significant bit (47th or 56th bit
+	 * depending on paging mode) in the address.
+	 */
+	if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+		return false;
+
+	/*
+	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.
+	 */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+		return false;
+
+	/* Use SYSRET to exit to userspace */
+	return true;
 }
 #endif
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7574639..1730640 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	 * In the Xen PV case we must use iret anyway.
 	 */
 
-	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
-		X86_FEATURE_XENPV
-
-	movq	RCX(%rsp), %rcx
-	movq	RIP(%rsp), %r11
-
-	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
-	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.
-	 *
-	 * If width of "canonical tail" ever becomes variable, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 *
-	 * Change top bits to match most significant bit (47th or 56th bit
-	 * depending on paging mode) in the address.
-	 */
-#ifdef CONFIG_X86_5LEVEL
-	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
-		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
-	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
-	/* If this changed %rcx, it was not canonical */
-	cmpq	%rcx, %r11
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	movq	R11(%rsp), %r11
-	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.
-	 */
-	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	swapgs_restore_regs_and_return_to_usermode
-
-	/* nothing to check for RSP */
-
-	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 
 	/*
 	 * We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c7e25c9..f44e2f9 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
 		? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 }
 
-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);
 
 #endif	/* CONFIG_X86_32 */
 

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/entry] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
  2023-10-11 22:43 ` [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test Brian Gerst
@ 2023-10-13 11:18   ` tip-bot2 for Brian Gerst
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Brian Gerst @ 2023-10-13 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Brian Gerst, Ingo Molnar, Andy Lutomirski, Borislav Petkov,
	Denys Vlasenko, H. Peter Anvin, Linus Torvalds, Peter Zijlstra,
	Thomas Gleixner, Josh Poimboeuf, Uros Bizjak, x86, linux-kernel

The following commit has been merged into the x86/entry branch of tip:

Commit-ID:     58978b44df7276f7c75a2c6aad6c201421cd4daa
Gitweb:        https://git.kernel.org/tip/58978b44df7276f7c75a2c6aad6c201421cd4daa
Author:        Brian Gerst <brgerst@gmail.com>
AuthorDate:    Wed, 11 Oct 2023 18:43:50 -04:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test

Using shifts to determine if an address is canonical is difficult for
the compiler to optimize when the virtual address width is variable
(LA57 feature) without using inline assembly.  Instead, compare RIP
against TASK_SIZE_MAX.  The only user executable address outside of that
range is the deprecated vsyscall page, which can fall back to using IRET.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20231011224351.130935-3-brgerst@gmail.com
---
 arch/x86/entry/common.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9021465..4c7154d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -110,10 +110,10 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 	 * in kernel space.  This essentially lets the user take over
 	 * the kernel, since userspace controls RSP.
 	 *
-	 * Change top bits to match the most significant bit (47th or 56th bit
-	 * depending on paging mode) in the address.
+	 * TASK_SIZE_MAX covers all user-accessible addresses other than
+	 * the deprecated vsyscall page.
 	 */
-	if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+	if (unlikely(regs->ip >= TASK_SIZE_MAX))
 		return false;
 
 	/*

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C
  2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
  2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
@ 2023-10-13 12:39   ` Nikolay Borisov
  2023-10-13 13:05     ` Brian Gerst
  1 sibling, 1 reply; 9+ messages in thread
From: Nikolay Borisov @ 2023-10-13 12:39 UTC (permalink / raw)
  To: Brian Gerst, linux-kernel, x86, Andy Lutomirski
  Cc: Ingo Molnar, Thomas Gleixner, Borislav Petkov, H . Peter Anvin



On 12.10.23 г. 1:43 ч., Brian Gerst wrote:
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> ---
>   arch/x86/entry/common.c        | 43 ++++++++++++++++++++++++++-
>   arch/x86/entry/entry_64.S      | 53 ++--------------------------------
>   arch/x86/include/asm/syscall.h |  2 +-
>   3 files changed, 45 insertions(+), 53 deletions(-)
> 
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 0551bcb197fb..207149a0a9b3 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
>   	return false;
>   }
>   
> -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> +/* Returns true to return using SYSRET, or false to use IRET */
> +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
>   {
>   	add_random_kstack_offset();
>   	nr = syscall_enter_from_user_mode(regs, nr);
> @@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
>   
>   	instrumentation_end();
>   	syscall_exit_to_user_mode(regs);
> +
> +	/*
> +	 * Check that the register state is valid for using SYSRET to exit
> +	 * to userspace.  Otherwise use the slower but fully capable IRET
> +	 * exit path.
> +	 */
> +
> +	/* XEN PV guests always use IRET path */
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return false;
> +
> +	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
> +	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
> +		return false;


Under what conditions do we expect this to not be true since we've come 
via the syscall which adheres to this layout? IOW isn't this always 
going to be true and we can simply eliminate it?

> +
> +	/* CS and SS must match the values set in MSR_STAR */
> +	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
> +		return false;
> +
> +	/*
> +	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> +	 * in kernel space.  This essentially lets the user take over
> +	 * the kernel, since userspace controls RSP.
> +	 *
> +	 * Change top bits to match most significant bit (47th or 56th bit
> +	 * depending on paging mode) in the address.
> +	 */
> +	if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
> +		return false;
> +
> +	/*
> +	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
> +	 * restoring TF results in a trap from userspace immediately after
> +	 * SYSRET.
> +	 */
> +	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
> +		return false;
> +
> +	/* Use SYSRET to exit to userspace */
> +	return true;
>   }
>   #endif
>   
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 3bdc22d7e78f..de6469dffe3a 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
>   	 * In the Xen PV case we must use iret anyway.
>   	 */
>   
> -	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
> -		X86_FEATURE_XENPV
> -
> -	movq	RCX(%rsp), %rcx
> -	movq	RIP(%rsp), %r11
> -
> -	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
> -	jne	swapgs_restore_regs_and_return_to_usermode
> -
> -	/*
> -	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> -	 * in kernel space.  This essentially lets the user take over
> -	 * the kernel, since userspace controls RSP.
> -	 *
> -	 * If width of "canonical tail" ever becomes variable, this will need
> -	 * to be updated to remain correct on both old and new CPUs.
> -	 *
> -	 * Change top bits to match most significant bit (47th or 56th bit
> -	 * depending on paging mode) in the address.
> -	 */
> -#ifdef CONFIG_X86_5LEVEL
> -	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
> -		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
> -#else
> -	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> -	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> -#endif
> -
> -	/* If this changed %rcx, it was not canonical */
> -	cmpq	%rcx, %r11
> -	jne	swapgs_restore_regs_and_return_to_usermode
> -
> -	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
> -	jne	swapgs_restore_regs_and_return_to_usermode
> -
> -	movq	R11(%rsp), %r11
> -	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
> -	jne	swapgs_restore_regs_and_return_to_usermode
> -
> -	/*
> -	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
> -	 * restoring TF results in a trap from userspace immediately after
> -	 * SYSRET.
> -	 */
> -	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
> -	jnz	swapgs_restore_regs_and_return_to_usermode
> -
> -	/* nothing to check for RSP */
> -
> -	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
> -	jne	swapgs_restore_regs_and_return_to_usermode
> +	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
> +		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
>   
>   	/*
>   	 * We win! This label is here just for ease of understanding
> diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
> index c7e25c940f1a..f44e2f9ab65d 100644
> --- a/arch/x86/include/asm/syscall.h
> +++ b/arch/x86/include/asm/syscall.h
> @@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
>   		? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
>   }
>   
> -void do_syscall_64(struct pt_regs *regs, int nr);
> +bool do_syscall_64(struct pt_regs *regs, int nr);
>   
>   #endif	/* CONFIG_X86_32 */
>   

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C
  2023-10-13 12:39   ` [PATCH v3 1/3] " Nikolay Borisov
@ 2023-10-13 13:05     ` Brian Gerst
  0 siblings, 0 replies; 9+ messages in thread
From: Brian Gerst @ 2023-10-13 13:05 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: linux-kernel, x86, Andy Lutomirski, Ingo Molnar, Thomas Gleixner,
	Borislav Petkov, H . Peter Anvin

On Fri, Oct 13, 2023 at 8:39 AM Nikolay Borisov <nik.borisov@suse.com> wrote:
>
>
>
> On 12.10.23 г. 1:43 ч., Brian Gerst wrote:
> > Signed-off-by: Brian Gerst <brgerst@gmail.com>
> > ---
> >   arch/x86/entry/common.c        | 43 ++++++++++++++++++++++++++-
> >   arch/x86/entry/entry_64.S      | 53 ++--------------------------------
> >   arch/x86/include/asm/syscall.h |  2 +-
> >   3 files changed, 45 insertions(+), 53 deletions(-)
> >
> > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> > index 0551bcb197fb..207149a0a9b3 100644
> > --- a/arch/x86/entry/common.c
> > +++ b/arch/x86/entry/common.c
> > @@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
> >       return false;
> >   }
> >
> > -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> > +/* Returns true to return using SYSRET, or false to use IRET */
> > +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
> >   {
> >       add_random_kstack_offset();
> >       nr = syscall_enter_from_user_mode(regs, nr);
> > @@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> >
> >       instrumentation_end();
> >       syscall_exit_to_user_mode(regs);
> > +
> > +     /*
> > +      * Check that the register state is valid for using SYSRET to exit
> > +      * to userspace.  Otherwise use the slower but fully capable IRET
> > +      * exit path.
> > +      */
> > +
> > +     /* XEN PV guests always use IRET path */
> > +     if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > +             return false;
> > +
> > +     /* SYSRET requires RCX == RIP and R11 == EFLAGS */
> > +     if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
> > +             return false;
>
>
> Under what conditions do we expect this to not be true since we've come
> via the syscall which adheres to this layout? IOW isn't this always
> going to be true and we can simply eliminate it?

Any syscall that can modify pt_regs, like sigreturn(), exec(), etc.
Also ptrace can change registers.

Brian Gerst

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-10-13 13:06 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-11 22:43 [PATCH v3 0/3] x86: Clean up fast syscall return validation Brian Gerst
2023-10-11 22:43 ` [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C Brian Gerst
2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
2023-10-13 12:39   ` [PATCH v3 1/3] " Nikolay Borisov
2023-10-13 13:05     ` Brian Gerst
2023-10-11 22:43 ` [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test Brian Gerst
2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst
2023-10-11 22:43 ` [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests Brian Gerst
2023-10-13 11:18   ` [tip: x86/entry] " tip-bot2 for Brian Gerst

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).