linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Lai Jiangshan <jiangshanlai@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: Lai Jiangshan <laijs@linux.alibaba.com>,
	Andy Lutomirski <luto@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org, "H. Peter Anvin" <hpa@zytor.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Al Viro <viro@zeniv.linux.org.uk>,
	Vincenzo Frascino <vincenzo.frascino@arm.com>,
	Joerg Roedel <jroedel@suse.de>,
	Ricardo Neri <ricardo.neri-calderon@linux.intel.com>,
	Reinette Chatre <reinette.chatre@intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Gabriel Krisman Bertazi <krisman@collabora.com>,
	Kees Cook <keescook@chromium.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Jens Axboe <axboe@kernel.dk>,
	Arvind Sankar <nivedita@alum.mit.edu>,
	Brian Gerst <brgerst@gmail.com>, Ard Biesheuvel <ardb@kernel.org>,
	Andi Kleen <ak@linux.intel.com>, Mike Rapoport <rppt@kernel.org>,
	Mike Hommey <mh@glandium.org>,
	Mark Gross <mgross@linux.intel.com>,
	Fenghua Yu <fenghua.yu@intel.com>,
	Tony Luck <tony.luck@intel.com>,
	Anthony Steinhauser <asteinhauser@google.com>,
	Jay Lang <jaytlang@mit.edu>,
	"Chang S. Bae" <chang.seok.bae@intel.com>
Subject: [PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS
Date: Tue, 26 Jan 2021 01:34:29 +0800	[thread overview]
Message-ID: <20210125173444.22696-2-jiangshanlai@gmail.com> (raw)
In-Reply-To: <20210125173444.22696-1-jiangshanlai@gmail.com>

From: Lai Jiangshan <laijs@linux.alibaba.com>

When the CPU has X86_BUG_CPU_MELTDOWN and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, so it becomes a coveted fruit.  An attacker can fetch the kernel
stack top from it and base the next steps of an attack on the kernel stack.

The address might not be very useful to an attacker, but it does not need
to be in the TSS either.  It is only accessed when CR3 is the kernel CR3
and gs_base is the kernel gs_base, which means it can be in any percpu
variable.

The major reason it is in the TSS might be performance: it is hot in the
cache and TLB because the syscall entry code has just accessed sp2 as its
scratch space.

So we can move it to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, so that they are all in the
same cache line.

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  In fact, the result
changes from 93ns before the patch to 92ns after the patch when !KPTI.
The test is very stable; although it doesn't offer a high degree of
precision, it is enough to show that the patch doesn't cause a regression
for this test.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/include/asm/processor.h   | 10 ----------
 arch/x86/include/asm/switch_to.h   |  7 +------
 arch/x86/include/asm/thread_info.h |  6 ------
 arch/x86/kernel/cpu/common.c       |  3 +++
 arch/x86/kernel/process.c          |  7 +------
 arch/x86/mm/pti.c                  |  7 +++----
 6 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
 	u32			reserved1;
 	u64			sp0;
-
-	/*
-	 * We store cpu_current_top_of_stack in sp1 so it's always accessible.
-	 * Linux does not use ring 1, so sp1 is not otherwise needed.
-	 */
 	u64			sp1;
 
 	/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
 	else
 		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-	/*
-	 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-	 * doesn't work on x86-32 because sp1 and
-	 * cpu_current_top_of_stack have different values (because of
-	 * the non-zero stack-padding on 32bit).
-	 */
+	/* Xen PV enters the kernel on the thread stack. */
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-		/*
-		 * .sp1 is cpu_current_top_of_stack.  The init task never
-		 * runs user code, but cpu_current_top_of_stack should still
-		 * be well defined before the first context switch.
-		 */
+#ifdef CONFIG_X86_32
 		.sp1 = TOP_OF_INIT_STACK,
 
-#ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 #endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..e101cd87d038 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
 
 	for_each_possible_cpu(cpu) {
 		/*
-		 * The SYSCALL64 entry code needs to be able to find the
-		 * thread stack and needs one word of scratch space in which
-		 * to spill a register.  All of this lives in the TSS, in
-		 * the sp1 and sp2 slots.
+		 * The SYSCALL64 entry code needs one word of scratch space
+		 * in which to spill a register.  It lives in the sp2 slot
+		 * of the CPU's TSS.
 		 *
 		 * This is done for all possible CPUs during boot to ensure
 		 * that it's propagated to all mms.
-- 
2.19.1.6.gb485710b


  reply	other threads:[~2021-01-25 16:37 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-23  8:48 [PATCH] x86_64: move cpu_current_top_of_stack out of TSS Lai Jiangshan
2021-01-25  2:24 ` Andy Lutomirski
2021-01-25 17:34   ` [PATCH V2 0/6] x86: don't abuse tss.sp1 Lai Jiangshan
2021-01-25 17:34     ` Lai Jiangshan [this message]
2021-03-28 20:46       ` [tip: x86/cleanups] x86/process/64: Move cpu_current_top_of_stack out of TSS tip-bot2 for Lai Jiangshan
2021-01-25 17:34     ` [PATCH V2 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK Lai Jiangshan
2021-01-25 17:34     ` [PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack Lai Jiangshan
2021-01-26  6:04       ` Brian Gerst
2021-01-25 17:34     ` [PATCH V2 4/6] x86_32/sysenter: restore %fs before switching stack Lai Jiangshan
2021-01-25 17:34     ` [PATCH V2 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter Lai Jiangshan
2021-01-25 17:34     ` [PATCH V2 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1 Lai Jiangshan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210125173444.22696-2-jiangshanlai@gmail.com \
    --to=jiangshanlai@gmail.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=ardb@kernel.org \
    --cc=asteinhauser@google.com \
    --cc=axboe@kernel.dk \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=chang.seok.bae@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=fenghua.yu@intel.com \
    --cc=frederic@kernel.org \
    --cc=hpa@zytor.com \
    --cc=jaytlang@mit.edu \
    --cc=jroedel@suse.de \
    --cc=keescook@chromium.org \
    --cc=krisman@collabora.com \
    --cc=laijs@linux.alibaba.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mgross@linux.intel.com \
    --cc=mh@glandium.org \
    --cc=mingo@redhat.com \
    --cc=nivedita@alum.mit.edu \
    --cc=peterz@infradead.org \
    --cc=reinette.chatre@intel.com \
    --cc=ricardo.neri-calderon@linux.intel.com \
    --cc=rppt@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=tony.luck@intel.com \
    --cc=vincenzo.frascino@arm.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).