linux-kernel.vger.kernel.org archive mirror
* [PATCH] x86_64: move cpu_current_top_of_stack out of TSS
@ 2021-01-23  8:48 Lai Jiangshan
  2021-01-25  2:24 ` Andy Lutomirski
  0 siblings, 1 reply; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-23  8:48 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	x86, H. Peter Anvin, Dave Hansen, Andy Lutomirski,
	Peter Zijlstra, Al Viro, Sasha Levin, Vincenzo Frascino,
	Joerg Roedel, Ricardo Neri, Reinette Chatre, Chang S. Bae,
	Andrew Morton, Gabriel Krisman Bertazi, Kees Cook,
	Frederic Weisbecker, Jens Axboe, Andi Kleen, Mike Rapoport,
	Mike Hommey, Mark Gross, Fenghua Yu, Tony Luck,
	Anthony Steinhauser, Jay Lang

From: Lai Jiangshan <laijs@linux.alibaba.com>

When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, and so it becomes a coveted fruit.  An attacker can fetch the
kernel stack top from it and base the next steps of the attack on that
knowledge of the kernel stack.

The address might not be very useful to an attacker, but it does not need
to be in the TSS either.  It is only accessed after CR3 is the kernel CR3
and gs_base is the kernel gs_base, which means it can live in any percpu
variable.

The major reason it is in the TSS is likely performance: the TSS is hot
in cache and TLB, since sp2 is already accessed as scratch space on
syscall entry.
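
For reference, the 5.11-era SYSCALL64 entry (abridged from
arch/x86/entry/entry_64.S) touches both slots:

	swapgs
	/* tss.sp2 is scratch space, written while user CR3 is still live */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

The top-of-stack load happens only after SWITCH_TO_KERNEL_CR3, and it
references cpu_current_top_of_stack by name: once the assembly-time
#define to (cpu_tss_rw + TSS_sp1) is removed, the same instruction
resolves against the new percpu symbol, so the entry code needs no change.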

So we can move it to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, so that they end up in the same
cache line.
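
A minimal sketch of the end state (the reader below already exists in
asm/processor.h and is unchanged by this patch; keeping the variables in
one cache line is expected from their placement, not enforced):

	/* arch/x86/kernel/cpu/common.c */
	DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;

	/* arch/x86/include/asm/processor.h */
	static inline unsigned long current_top_of_stack(void)
	{
		return this_cpu_read_stable(cpu_current_top_of_stack);
	}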

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  In fact, the result
changes from 93ns before the patch to 92ns after it with !KPTI, and the
test is very stable; although it doesn't offer a high degree of
precision, it is enough to show that the change causes no regression.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/include/asm/processor.h   | 10 ----------
 arch/x86/include/asm/switch_to.h   |  7 +------
 arch/x86/include/asm/thread_info.h |  6 ------
 arch/x86/kernel/cpu/common.c       |  3 +++
 arch/x86/kernel/process.c          |  8 ++------
 arch/x86/mm/pti.c                  |  7 +++----
 6 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
-
-	/*
-	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
-	 * Linux does not use ring 1, so sp1 is not otherwise needed.
-	 */
 	u64			sp1;
 
 	/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..4f0bc8533a54 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
 	else
 		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/*
-	 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-	 * doesn't work on x86-32 because sp1 and
-	 * cpu_current_top_of_stack have different values (because of
-	 * the non-zero stack-padding on 32bit).
-	 */
+	/* XENPV keeps its entry stack to be kernel stack.  */
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..7c4d0184a44a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,10 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-		/*
-		 * .sp1 is cpu_current_top_of_stack.  The init task never
-		 * runs user code, but cpu_current_top_of_stack should still
-		 * be well defined before the first context switch.
-		 */
+#ifdef CONFIG_X86_32
+		/* .sp1 is used via TSS_entry2task_stack when switching stacks */
 		.sp1 = TOP_OF_INIT_STACK,
 
-#ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 #endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..e101cd87d038 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
 
 	for_each_possible_cpu(cpu) {
 		/*
-		 * The SYSCALL64 entry code needs to be able to find the
-		 * thread stack and needs one word of scratch space in which
-		 * to spill a register.  All of this lives in the TSS, in
-		 * the sp1 and sp2 slots.
+		 * The SYSCALL64 entry code needs one word of scratch space
+		 * in which to spill a register.  It lives in the sp2 slot
+		 * of the CPU's TSS.
 		 *
 		 * This is done for all possible CPUs during boot to ensure
 		 * that it's propagated to all mms.
-- 
2.19.1.6.gb485710b



* Re: [PATCH] x86_64: move cpu_current_top_of_stack out of TSS
  2021-01-23  8:48 [PATCH] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
@ 2021-01-25  2:24 ` Andy Lutomirski
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
  0 siblings, 1 reply; 11+ messages in thread
From: Andy Lutomirski @ 2021-01-25  2:24 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: LKML, Lai Jiangshan, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, X86 ML, H. Peter Anvin, Dave Hansen,
	Andy Lutomirski, Peter Zijlstra, Al Viro, Sasha Levin,
	Vincenzo Frascino, Joerg Roedel, Ricardo Neri, Reinette Chatre,
	Chang S. Bae, Andrew Morton, Gabriel Krisman Bertazi, Kees Cook,
	Frederic Weisbecker, Jens Axboe, Andi Kleen, Mike Rapoport,
	Mike Hommey, Mark Gross, Fenghua Yu, Tony Luck,
	Anthony Steinhauser, Jay Lang

On Fri, Jan 22, 2021 at 11:48 PM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
> From: Lai Jiangshan <laijs@linux.alibaba.com>
>
> When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
> cpu_current_top_of_stack lives in the TSS, which is also mapped in the
> user CR3, and so it becomes a coveted fruit.  An attacker can fetch the
> kernel stack top from it and base the next steps of the attack on that
> knowledge of the kernel stack.
>
> The address might not be very useful to an attacker, but it does not need
> to be in the TSS either.  It is only accessed after CR3 is the kernel CR3
> and gs_base is the kernel gs_base, which means it can live in any percpu
> variable.
>
> The major reason it is in the TSS is likely performance: the TSS is hot
> in cache and TLB, since sp2 is already accessed as scratch space on
> syscall entry.
>
> So we can move it to a percpu variable near other hot percpu variables,
> such as current_task and __preempt_count, so that they end up in the same
> cache line.
>
> tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
> performance loss in the "getpid native" result.  In fact, the result
> changes from 93ns before the patch to 92ns after it with !KPTI, and the
> test is very stable; although it doesn't offer a high degree of
> precision, it is enough to show that the change causes no regression.

I'm okay with this concept, and it's a decent cleanup.  But the patch
is incomplete.  There are a whole bunch of other sp1 users in
arch/x86, and we should get rid of all of them at once.  Most notably,
the 32-bit code uses both the percpu variable and sp1 right now, and
that should be cleaned up.  (It's also quite obfuscated in the current
code.)

See below for minor additional comments.

>
> Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
> ---
>  arch/x86/include/asm/processor.h   | 10 ----------
>  arch/x86/include/asm/switch_to.h   |  7 +------
>  arch/x86/include/asm/thread_info.h |  6 ------
>  arch/x86/kernel/cpu/common.c       |  3 +++
>  arch/x86/kernel/process.c          |  8 ++------
>  arch/x86/mm/pti.c                  |  7 +++----
>  6 files changed, 9 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index c20a52b5534b..886d32da1318 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -314,11 +314,6 @@ struct x86_hw_tss {
>  struct x86_hw_tss {
>         u32                     reserved1;
>         u64                     sp0;
> -
> -       /*
> -        * We store cpu_current_top_of_stack in sp1 so it's always accessible.
> -        * Linux does not use ring 1, so sp1 is not otherwise needed.
> -        */
>         u64                     sp1;
>
>         /*
> @@ -428,12 +423,7 @@ struct irq_stack {
>
>  DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
>
> -#ifdef CONFIG_X86_32
>  DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
> -#else
> -/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
> -#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
> -#endif
>
>  #ifdef CONFIG_X86_64
>  struct fixed_percpu_data {
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index 9f69cc497f4b..4f0bc8533a54 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
>         else
>                 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
>  #else
> -       /*
> -        * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
> -        * doesn't work on x86-32 because sp1 and
> -        * cpu_current_top_of_stack have different values (because of
> -        * the non-zero stack-padding on 32bit).
> -        */
> +       /* XENPV keeps its entry stack to be kernel stack.  */

How about:

Xen PV enters the kernel on the thread stack.

>         if (static_cpu_has(X86_FEATURE_XENPV))
>                 load_sp0(task_top_of_stack(task));
>  #endif
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index 0d751d5da702..3dc93d8df425 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
>  #endif
>  }
>
> -#else /* !__ASSEMBLY__ */
> -
> -#ifdef CONFIG_X86_64
> -# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)

A complete patch would also remove TSS_sp1 from asm-offsets.


* [PATCH V2 0/6] x86: don't abuse tss.sp1
  2021-01-25  2:24 ` Andy Lutomirski
@ 2021-01-25 17:34   ` Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
                       ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

On x86_64, tss.sp1 is reused as cpu_current_top_of_stack.  But we can
use a plain percpu variable directly, since CR3 and gs_base are correct
whenever it is accessed.

On x86_32, tss.sp1 is reused to hold thread.sp0 in three places in the
entry code.  We have the correct CR3 and %fs at two of those places; the
last one is sysenter.  This patchset makes %fs available earlier so that
percpu can be used in sysenter too, and adds a percpu variable
cpu_current_thread_sp0 to hold thread.sp0 instead of tss.sp1.
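
In short, the end state of the series adds (declarations as introduced
by patches 1 and 6; see the individual patches for the definitions):

	DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);	/* patch 1: all of x86 */
	DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);	/* patch 6: x86_32 only */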

Changes from V1:
	As requested by Andy, also stop abusing sp1 on x86_32.
	Updated comments in the x86_64 patch as Andy suggested.

Lai Jiangshan (6):
  x86_64: move cpu_current_top_of_stack out of TSS
  x86_32: use percpu instead of offset-calculation to get thread.sp0
    when SWITCH_TO_KERNEL_STACK
  x86_32/sysenter: switch to the task stack without emptying the entry
    stack
  x86_32/sysenter: restore %fs before switching stack
  x86_32/sysenter: use percpu to get thread.sp0 when sysenter
  x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1

 arch/x86/entry/entry_32.S          | 38 +++++++++++++++++-------------
 arch/x86/include/asm/processor.h   | 12 ++--------
 arch/x86/include/asm/switch_to.h   |  9 ++-----
 arch/x86/include/asm/thread_info.h |  6 -----
 arch/x86/kernel/asm-offsets.c      |  1 -
 arch/x86/kernel/asm-offsets_32.c   | 10 --------
 arch/x86/kernel/cpu/common.c       | 12 +++++++++-
 arch/x86/kernel/process.c          |  7 ------
 arch/x86/mm/pti.c                  |  7 +++---
 9 files changed, 40 insertions(+), 62 deletions(-)

-- 
2.19.1.6.gb485710b



* [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  2021-03-28 20:46       ` [tip: x86/cleanups] x86/process/64: Move " tip-bot2 for Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK Lai Jiangshan
                       ` (4 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, and so it becomes a coveted fruit.  An attacker can fetch the
kernel stack top from it and base the next steps of the attack on that
knowledge of the kernel stack.

The address might not be very useful to an attacker, but it does not need
to be in the TSS either.  It is only accessed after CR3 is the kernel CR3
and gs_base is the kernel gs_base, which means it can live in any percpu
variable.

The major reason it is in the TSS is likely performance: the TSS is hot
in cache and TLB, since sp2 is already accessed as scratch space on
syscall entry.

So we can move it to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, so that they end up in the same
cache line.

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  In fact, the result
changes from 93ns before the patch to 92ns after it with !KPTI, and the
test is very stable; although it doesn't offer a high degree of
precision, it is enough to show that the change causes no regression.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/include/asm/processor.h   | 10 ----------
 arch/x86/include/asm/switch_to.h   |  7 +------
 arch/x86/include/asm/thread_info.h |  6 ------
 arch/x86/kernel/cpu/common.c       |  3 +++
 arch/x86/kernel/process.c          |  7 +------
 arch/x86/mm/pti.c                  |  7 +++----
 6 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
-
-	/*
-	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
-	 * Linux does not use ring 1, so sp1 is not otherwise needed.
-	 */
 	u64			sp1;
 
 	/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
 	else
 		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/*
-	 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-	 * doesn't work on x86-32 because sp1 and
-	 * cpu_current_top_of_stack have different values (because of
-	 * the non-zero stack-padding on 32bit).
-	 */
+	/* Xen PV enters the kernel on the thread stack. */
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-		/*
-		 * .sp1 is cpu_current_top_of_stack.  The init task never
-		 * runs user code, but cpu_current_top_of_stack should still
-		 * be well defined before the first context switch.
-		 */
+#ifdef CONFIG_X86_32
 		.sp1 = TOP_OF_INIT_STACK,
 
-#ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 #endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..e101cd87d038 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
 
 	for_each_possible_cpu(cpu) {
 		/*
-		 * The SYSCALL64 entry code needs to be able to find the
-		 * thread stack and needs one word of scratch space in which
-		 * to spill a register.  All of this lives in the TSS, in
-		 * the sp1 and sp2 slots.
+		 * The SYSCALL64 entry code needs one word of scratch space
+		 * in which to spill a register.  It lives in the sp2 slot
+		 * of the CPU's TSS.
 		 *
 		 * This is done for all possible CPUs during boot to ensure
 		 * that it's propagated to all mms.
-- 
2.19.1.6.gb485710b



* [PATCH V2 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack Lai Jiangshan
                       ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in SWITCH_TO_KERNEL_STACK,
CR3 is already the kernel CR3 and the kernel segments are loaded.

So we can directly use percpu access to get tss.sp1 (thread.sp0) instead
of the complex offset calculation.
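
For reference, the offset calculation being replaced first rounds %esp
up to the top of the per-cpu entry stack, then reads tss.sp1 at a fixed
distance from that top inside cpu_entry_area (annotated copy of the
removed instructions):

	andl	$(MASK_entry_stack), %edi	/* round down to entry-stack base */
	addl	$(SIZEOF_entry_stack), %edi	/* ... and up to its top */
	movl	TSS_entry2task_stack(%edi), %edi /* fixed offset to tss.sp1 */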

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/entry/entry_32.S | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..3b4d1a63d1f0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -465,16 +465,11 @@
 	cmpl	$SIZEOF_entry_stack, %ecx
 	jae	.Lend_\@
 
-	/* Load stack pointer into %esi and %edi */
+	/* Load stack pointer into %esi */
 	movl	%esp, %esi
-	movl	%esi, %edi
-
-	/* Move %edi to the top of the entry stack */
-	andl	$(MASK_entry_stack), %edi
-	addl	$(SIZEOF_entry_stack), %edi
 
 	/* Load top of task-stack into %edi */
-	movl	TSS_entry2task_stack(%edi), %edi
+	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
 
 	/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
-- 
2.19.1.6.gb485710b



* [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  2021-01-26  6:04       ` Brian Gerst
  2021-01-25 17:34     ` [PATCH V2 4/6] x86_32/sysenter: restore %fs before switching stack Lai Jiangshan
                       ` (2 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

Like the way x86_64 uses the "old" stack, we can save the entry stack
pointer in a register and switch to the task stack, so that we have
space on the "old" stack to save more things or scratch registers.
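
Concretely: with the popl/popfl removed, the entry stack still holds two
words when %esp is copied to %eax, which is why the task-stack load gains
a 2*4 displacement and the later pushes read through %eax (sketch):

	   (%eax)	saved user %eax		/* becomes pt_regs->orig_ax */
	  4(%eax)	saved user eflags	/* becomes pt_regs->flags */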

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/entry/entry_32.S | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3b4d1a63d1f0..4513702ba45d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
 	pushl	%eax
 	BUG_IF_WRONG_CR3 no_user_check=1
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
-	popl	%eax
-	popfl
 
-	/* Stack empty again, switch to task stack */
-	movl	TSS_entry2task_stack(%esp), %esp
+	/* Switch to task stack */
+	movl	%esp, %eax
+	movl	(2*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
 	pushl	$0			/* pt_regs->sp (placeholder) */
-	pushfl				/* pt_regs->flags (except IF = 0) */
+	pushl	4(%eax)			/* pt_regs->flags (except IF = 0) */
 	pushl	$__USER_CS		/* pt_regs->cs */
 	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
-	pushl	%eax			/* pt_regs->orig_ax */
+	pushl	(%eax)			/* pt_regs->orig_ax */
 	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */
 
 	/*
-- 
2.19.1.6.gb485710b



* [PATCH V2 4/6] x86_32/sysenter: restore %fs before switching stack
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
                       ` (2 preceding siblings ...)
  2021-01-25 17:34     ` [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1 Lai Jiangshan
  5 siblings, 0 replies; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

Prepare for using percpu in the sysenter path and for removing
TSS_entry2task_stack.
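
Background: on 32-bit, percpu accesses in assembly are %fs-relative, so
PER_CPU_VAR() only works once __KERNEL_PERCPU is loaded into %fs.
Roughly, from asm/percpu.h in the SMP case:

	#define __percpu_seg		fs	/* gs on x86_64 */
	#define PER_CPU_VAR(var)	%__percpu_seg:var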

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/entry/entry_32.S | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4513702ba45d..a8d2640394f9 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -279,11 +279,13 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 skip_fs=0 unwind_espfix=0
 	cld
 .if \skip_gs == 0
 	PUSH_GS
 .endif
+
+.if \skip_fs == 0
 	pushl	%fs
 
 	pushl	%eax
@@ -293,6 +295,7 @@
 	UNWIND_ESPFIX_STACK
 .endif
 	popl	%eax
+.endif
 
 	FIXUP_FRAME
 	pushl	%es
@@ -906,18 +909,27 @@ SYM_FUNC_START(entry_SYSENTER_32)
 	BUG_IF_WRONG_CR3 no_user_check=1
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 
+	/* Restore kernel %fs, so that we can use PERCPU */
+	pushl	%fs
+	movl	$(__KERNEL_PERCPU), %eax
+	movl	%eax, %fs
+
 	/* Switch to task stack */
 	movl	%esp, %eax
-	movl	(2*4+TSS_entry2task_stack)(%esp), %esp
+	movl	(3*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
 	pushl	$0			/* pt_regs->sp (placeholder) */
-	pushl	4(%eax)			/* pt_regs->flags (except IF = 0) */
+	pushl	8(%eax)			/* pt_regs->flags (except IF = 0) */
 	pushl	$__USER_CS		/* pt_regs->cs */
 	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
-	pushl	(%eax)			/* pt_regs->orig_ax */
-	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */
+	pushl	4(%eax)			/* pt_regs->orig_ax */
+	PUSH_GS				/* pt_regs->gs */
+	pushl	(%eax)			/* pt_regs->fs */
+	/* save rest, stack and %fs already switched */
+	SAVE_ALL pt_regs_ax=$-ENOSYS skip_gs=1 skip_fs=1
+	SET_KERNEL_GS %edx
 
 	/*
 	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
-- 
2.19.1.6.gb485710b



* [PATCH V2 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
                       ` (3 preceding siblings ...)
  2021-01-25 17:34     ` [PATCH V2 4/6] x86_32/sysenter: restore %fs before switching stack Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  2021-01-25 17:34     ` [PATCH V2 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1 Lai Jiangshan
  5 siblings, 0 replies; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in sysenter, CR3 is
already the kernel CR3 and the kernel segments are loaded.

So we can directly use percpu access for it instead of the offset
calculation.

And remove the now-unused TSS_entry2task_stack.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/entry/entry_32.S        |  2 +-
 arch/x86/kernel/asm-offsets_32.c | 10 ----------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a8d2640394f9..3cb42efb3c04 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
 	/* Switch to task stack */
 	movl	%esp, %eax
-	movl	(3*4+TSS_entry2task_stack)(%esp), %esp
+	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
 
 .Lsysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..6d4143cfbf03 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -43,16 +43,6 @@ void foo(void)
 	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
 	BLANK();
 
-	/*
-	 * Offset from the entry stack to task stack stored in TSS. Kernel entry
-	 * happens on the per-cpu entry-stack, and the asm code switches to the
-	 * task-stack pointer stored in x86_tss.sp1, which is a copy of
-	 * task->thread.sp0 where entry code can find it.
-	 */
-	DEFINE(TSS_entry2task_stack,
-	       offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
-	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-
 #ifdef CONFIG_STACKPROTECTOR
 	BLANK();
 	OFFSET(stack_canary_offset, stack_canary, canary);
-- 
2.19.1.6.gb485710b



* [PATCH V2 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1
  2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
                       ` (4 preceding siblings ...)
  2021-01-25 17:34     ` [PATCH V2 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter Lai Jiangshan
@ 2021-01-25 17:34     ` Lai Jiangshan
  5 siblings, 0 replies; 11+ messages in thread
From: Lai Jiangshan @ 2021-01-25 17:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Brian Gerst, Ard Biesheuvel,
	Andi Kleen, Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu,
	Tony Luck, Anthony Steinhauser, Jay Lang, Chang S. Bae

From: Lai Jiangshan <laijs@linux.alibaba.com>

On x86_32, sp1 is not used by hardware; it is merely reused to hold
thread.sp0.  We should just use a new percpu variable instead.

And remove the now-unneeded TSS_sp1.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/entry/entry_32.S        | 6 +++---
 arch/x86/include/asm/processor.h | 2 ++
 arch/x86/include/asm/switch_to.h | 2 +-
 arch/x86/kernel/asm-offsets.c    | 1 -
 arch/x86/kernel/cpu/common.c     | 9 ++++++++-
 arch/x86/kernel/process.c        | 2 --
 6 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3cb42efb3c04..22cd3d8fd23e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -472,7 +472,7 @@
 	movl	%esp, %esi
 
 	/* Load top of task-stack into %edi */
-	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
+	movl	PER_CPU_VAR(cpu_current_thread_sp0), %edi
 
 	/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
@@ -658,7 +658,7 @@
 	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 
 	/* Bytes on the task-stack to ecx */
-	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+	movl	PER_CPU_VAR(cpu_current_thread_sp0), %ecx
 	subl	%esi, %ecx
 
 	/* Allocate stack-frame on entry-stack */
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
 	/* Switch to task stack */
 	movl	%esp, %eax
-	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
+	movl	PER_CPU_VAR(cpu_current_thread_sp0), %esp
 
 .Lsysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 886d32da1318..4265884c33e7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -774,6 +774,8 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
+DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);
+
 #else
 #define INIT_THREAD { }
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index b5f0d2ff47e4..e27eb7974797 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -69,7 +69,7 @@ static inline void update_task_stack(struct task_struct *task)
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task->thread.sp0);
 	else
-		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+		this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
 #else
 	/* Xen PV enters the kernel on the thread stack. */
 	if (static_cpu_has(X86_FEATURE_XENPV))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..3b63b6062792 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,6 +98,5 @@ static void __used common(void)
 
 	/* Offset for fields in tss_struct */
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
-	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f3d7fd7e9684..b2c37d369137 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1789,12 +1789,19 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
  * the top of the kernel stack.  Use an extra percpu variable to track the
- * top of the kernel stack directly.
+ * top of the kernel stack directly and another percpu variable to track
+ * thread.sp0 for use in entry code.  cpu_current_top_of_stack and
+ * cpu_current_thread_sp0 have different values because of the non-zero
+ * stack-padding on 32bit.  See the comments at TOP_OF_KERNEL_STACK_PADDING
+ * and vm86.
  */
 DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
 	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_thread_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_thread_sp0);
+
 #ifdef CONFIG_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 296de77da4b2..e6d4b5399a81 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,8 +64,6 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
 #ifdef CONFIG_X86_32
-		.sp1 = TOP_OF_INIT_STACK,
-
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 #endif
-- 
2.19.1.6.gb485710b



* Re: [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack
  2021-01-25 17:34     ` [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack Lai Jiangshan
@ 2021-01-26  6:04       ` Brian Gerst
  0 siblings, 0 replies; 11+ messages in thread
From: Brian Gerst @ 2021-01-26  6:04 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Linux Kernel Mailing List, Lai Jiangshan, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	the arch/x86 maintainers, H. Peter Anvin, Dave Hansen,
	Peter Zijlstra, Al Viro, Vincenzo Frascino, Joerg Roedel,
	Ricardo Neri, Reinette Chatre, Andrew Morton,
	Gabriel Krisman Bertazi, Kees Cook, Frederic Weisbecker,
	Jens Axboe, Arvind Sankar, Ard Biesheuvel, Andi Kleen,
	Mike Rapoport, Mike Hommey, Mark Gross, Fenghua Yu, Tony Luck,
	Anthony Steinhauser, Jay Lang, Chang S. Bae

On Mon, Jan 25, 2021 at 11:35 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
> From: Lai Jiangshan <laijs@linux.alibaba.com>
>
> Like the way x86_64 uses the "old" stack, we can save the entry stack
> pointer in a register and switch to the task stack, so that we have
> space on the "old" stack to save more things or scratch registers.
>
> Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
> ---
>  arch/x86/entry/entry_32.S | 11 +++++------
>  1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 3b4d1a63d1f0..4513702ba45d 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
>         pushl   %eax
>         BUG_IF_WRONG_CR3 no_user_check=1
>         SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
> -       popl    %eax
> -       popfl
>
> -       /* Stack empty again, switch to task stack */
> -       movl    TSS_entry2task_stack(%esp), %esp
> +       /* Switch to task stack */
> +       movl    %esp, %eax
> +       movl    (2*4+TSS_entry2task_stack)(%esp), %esp
>
>  .Lsysenter_past_esp:
>         pushl   $__USER_DS              /* pt_regs->ss */
>         pushl   $0                      /* pt_regs->sp (placeholder) */
> -       pushfl                          /* pt_regs->flags (except IF = 0) */
> +       pushl   4(%eax)                 /* pt_regs->flags (except IF = 0) */

__KERNEL_DS isn't loaded at this point, so this needs an explicit %ss:
override.  You probably didn't catch this because the default
__USER_DS was still loaded.

>         pushl   $__USER_CS              /* pt_regs->cs */
>         pushl   $0                      /* pt_regs->ip = 0 (placeholder) */
> -       pushl   %eax                    /* pt_regs->orig_ax */
> +       pushl   (%eax)                  /* pt_regs->orig_ax */

Add an %ss: override here too.
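
That is, something like this (sketch of the suggested fix, untested):

	pushl	%ss:4(%eax)		/* pt_regs->flags (except IF = 0) */
	...
	pushl	%ss:(%eax)		/* pt_regs->orig_ax */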

>         SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest, stack already switched */
>
>         /*
> --
> 2.19.1.6.gb485710b
>

--
Brian Gerst


* [tip: x86/cleanups] x86/process/64: Move cpu_current_top_of_stack out of TSS
  2021-01-25 17:34     ` [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
@ 2021-03-28 20:46       ` tip-bot2 for Lai Jiangshan
  0 siblings, 0 replies; 11+ messages in thread
From: tip-bot2 for Lai Jiangshan @ 2021-03-28 20:46 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Lai Jiangshan, Thomas Gleixner, x86, linux-kernel

The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID:     1591584e2e762edecefde403c44d9c26c9ff72c9
Gitweb:        https://git.kernel.org/tip/1591584e2e762edecefde403c44d9c26c9ff72c9
Author:        Lai Jiangshan <laijs@linux.alibaba.com>
AuthorDate:    Tue, 26 Jan 2021 01:34:29 +08:00
Committer:     Thomas Gleixner <tglx@linutronix.de>
CommitterDate: Sun, 28 Mar 2021 22:40:10 +02:00

x86/process/64: Move cpu_current_top_of_stack out of TSS

cpu_current_top_of_stack is currently stored in TSS.sp1. TSS is exposed
through the cpu_entry_area which is visible with user CR3 when PTI is
enabled and active.

This makes it a coveted fruit for attackers.  An attacker can fetch the
kernel stack top from it and base the next steps of the attack on
knowledge of the kernel stack.

But it is actually not necessary for it to be stored in the TSS.  It is
only accessed after the entry code has switched to kernel CR3 and kernel
GS_BASE, which means it can be in any regular percpu variable.

The reason why it is in TSS is historical (pre PTI) because TSS is also
used as scratch space in SYSCALL_64 and therefore cache hot.

A syscall also needs the per CPU variable current_task and eventually
__preempt_count, so placing cpu_current_top_of_stack next to them makes it
likely that they end up in the same cache line which should avoid
performance regressions. This is not enforced as the compiler is free to
place these variables, so these entry relevant variables should move into
a data structure to make this enforceable.
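
A hypothetical way to make the grouping explicit (not part of this
change; the struct and field names here are invented for illustration):

	struct pcpu_hot {
		struct task_struct	*current_task;
		int			preempt_count;
		unsigned long		top_of_stack;
	} ____cacheline_aligned;

	DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);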

The seccomp_benchmark doesn't show any performance loss in the "getpid
native" test result.  Actually, the result changes from 93ns before to 92ns
with this change when KPTI is disabled. The test is very stable and
although the test doesn't show a higher degree of precision it gives enough
confidence that moving cpu_current_top_of_stack does not cause a
regression.

[ tglx: Removed unneeded export. Massaged changelog ]

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210125173444.22696-2-jiangshanlai@gmail.com
---
 arch/x86/include/asm/processor.h   | 10 ----------
 arch/x86/include/asm/switch_to.h   |  7 +------
 arch/x86/include/asm/thread_info.h |  8 +-------
 arch/x86/kernel/cpu/common.c       |  2 ++
 arch/x86/kernel/process.c          |  7 +------
 arch/x86/mm/pti.c                  |  7 +++----
 6 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 8b3ed21..185142b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
-
-	/*
-	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
-	 * Linux does not use ring 1, so sp1 is not otherwise needed.
-	 */
 	u64			sp1;
 
 	/*
@@ -426,12 +421,7 @@ struct irq_stack {
 	char		stack[IRQ_STACK_SIZE];
 } __aligned(IRQ_STACK_SIZE);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc4..b5f0d2f 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
 	else
 		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/*
-	 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-	 * doesn't work on x86-32 because sp1 and
-	 * cpu_current_top_of_stack have different values (because of
-	 * the non-zero stack-padding on 32bit).
-	 */
+	/* Xen PV enters the kernel on the thread stack. */
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 06b740b..de406d9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
-#endif
+#endif  /* !__ASSEMBLY__ */
 
 /*
  * Thread-synchronous status.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1aa5f0a..3401078 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1748,6 +1748,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cdfe5b4..43cbfc8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-		/*
-		 * .sp1 is cpu_current_top_of_stack.  The init task never
-		 * runs user code, but cpu_current_top_of_stack should still
-		 * be well defined before the first context switch.
-		 */
+#ifdef CONFIG_X86_32
 		.sp1 = TOP_OF_INIT_STACK,
 
-#ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 #endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index b377604..5d5c7bb 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
 
 	for_each_possible_cpu(cpu) {
 		/*
-		 * The SYSCALL64 entry code needs to be able to find the
-		 * thread stack and needs one word of scratch space in which
-		 * to spill a register.  All of this lives in the TSS, in
-		 * the sp1 and sp2 slots.
+		 * The SYSCALL64 entry code needs one word of scratch space
+		 * in which to spill a register.  It lives in the sp2 slot
+		 * of the CPU's TSS.
 		 *
 		 * This is done for all possible CPUs during boot to ensure
 		 * that it's propagated to all mms.

