All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
@ 2010-01-28 10:26 Jiang, Yunhong
  2010-02-10 18:13 ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 7+ messages in thread
From: Jiang, Yunhong @ 2010-01-28 10:26 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser, Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 6200 bytes --]

Jeremy, this patch is a RFC for MCE injection issue as discussed in http://lists.xensource.com/archives/html/xen-devel/2009-12/msg00849.html .

Currently, when syscall into dom0, the stack is kernel stack. Dom0 kernel will firstly switch to user space stack, to create context same as physical syscall entry and then call kernel's common syscall entry. In kernel's common syscall entry, it will return back to kernel stack.
This give a small security windows for MCE. If a vMCE is injected into dom0 when dom0 is in the syscalll entry, but is still using the user space stack, it will cause great trouble (check above URL for detailed information).

I can think out two options for this issue:
a) The method this patch took. Dom0 didn't try to switch to user space stack. Instead, it will use kernel statck directly and jump to kenerl entry.
b) Add IST support to xen hypervisor, Considering the ptrace issue in http://lists.xensource.com/archives/html/xen-devel/2009-05/msg00200.html and also Ingo's comments in http://lkml.indiana.edu/hypermail/linux/kernel/0905.1/00087.html, maybe it still have value to add such support. But not sure if that is accepetable for xen hypervisor.

As RFC, this patch is just tested to make sure the system can still boot succesffully, I did't try ptrace or something else. If the idea is ok, I will do more testing on it.

Thanks
Yunhong Jiang

>From 6f8e8019b6cee92b23a3f02787376e0ab3a4244f Mon Sep 17 00:00:00 2001
From: Jiang, Yunhong <yunhong.jiang@intel.com>
Date: Thu, 28 Jan 2010 03:43:41 +0800
Subject: [PATCH] Change the syscall/sysexit entry to use the kernel stack directly

---
 arch/x86/ia32/ia32entry.S       |    2 +-
 arch/x86/include/asm/irqflags.h |    2 ++
 arch/x86/kernel/entry_64.S      |    2 +-
 arch/x86/xen/xen-asm_64.S       |   37 +++++++++++++++++++++++++------------
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b09502d..2f8e942 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -284,6 +284,7 @@ ENTRY(ia32_cstar_target)
 	movl	%esp,%r8d
 	CFI_REGISTER	rsp,r8
 	movq	PER_CPU_VAR(kernel_stack),%rsp
+ENTRY(ia32_cstart_after_switch_stack)
 	/*
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
@@ -337,7 +338,6 @@ sysretl_from_sys_call:
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
 	CFI_RESTORE rsp
 	USERGS_SYSRET32
 	
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c6ccbe7..7e62aed 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -133,9 +133,11 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define INTERRUPT_RETURN	iretq
 #define USERGS_SYSRET64				\
+	movq PER_CPU_VAR(old_rsp), %rsp	\
 	swapgs;					\
 	sysretq;
 #define USERGS_SYSRET32				\
+	movl RSP-ARGOFFSET(%rsp), %esp		\
 	swapgs;					\
 	sysretl
 #define ENABLE_INTERRUPTS_SYSEXIT32		\
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be7..5bc75fe 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -472,6 +472,7 @@ ENTRY(system_call_after_swapgs)
 	 * No need to follow this irqs off/on section - it's straight
 	 * and short:
 	 */
+ENTRY(system_call_after_switch_stack)
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_ARGS 8,1
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -510,7 +511,6 @@ sysret_check:
 	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
 	/*CFI_REGISTER	rflags,r11*/
-	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefd..c953d14 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -12,6 +12,7 @@
  */
 
 #include <asm/errno.h>
+#include <asm/calling.h>
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
@@ -68,9 +69,7 @@ ENTRY(xen_sysret64)
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
-	movq PER_CPU_VAR(kernel_stack), %rsp
-
+    movq PER_CPU_VAR(kernel_stack), %rsp
 	pushq $__USER_DS
 	pushq PER_CPU_VAR(old_rsp)
 	pushq %r11
@@ -84,12 +83,15 @@ RELOC(xen_sysret64, 1b+1)
 
 ENTRY(xen_sysret32)
 	/*
-	 * We're already on the usermode stack at this point, but
+	 * We're still on the kernel mode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
-	movq PER_CPU_VAR(kernel_stack), %rsp
-
+    /* The ARGS is restored, so don't clobber anything */
+    pushq %rax
+    movq RSP-ARGOFFSET(%rsp), %rax
+    movq %rax, PER_CPU_VAR(old_rsp)
+    popq %rax
+    movq PER_CPU_VAR(kernel_stack), %rsp
 	pushq $__USER32_DS
 	pushq PER_CPU_VAR(old_rsp)
 	pushq %r11
@@ -116,27 +118,38 @@ RELOC(xen_sysret32, 1b+1)
  * rsp->rcx
  *
  * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint,
+ * but we will not switch stack
  */
 
 .macro undo_xen_syscall
+    /* Clobber rcx is ok */
+	movq 5*8(%rsp), %rcx
+    movq %rcx, PER_CPU_VAR(old_rsp)
 	mov 0*8(%rsp), %rcx
 	mov 1*8(%rsp), %r11
-	mov 5*8(%rsp), %rsp
+	sub $56, %rsp
 .endm
 
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
 	undo_xen_syscall
-	jmp system_call_after_swapgs
+	jmp system_call_after_switch_stack
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
+.macro undo_xen_ia32_syscall
+	mov 0*8(%rsp), %rcx
+	mov 1*8(%rsp), %r11
+	mov	5*8(%esp), %r8d
+	sub $56, %rsp
+.endm
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-	undo_xen_syscall
-	jmp ia32_cstar_target
+	undo_xen_ia32_syscall
+	movl	%esp,%r8d
+	jmp ia32_cstart_after_switch_stack 
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
-- 
1.5.4.2




[-- Attachment #2: 0002-Change-the-syscall-sysexit-entry-to-use-the-kernel-s.patch --]
[-- Type: application/octet-stream, Size: 4643 bytes --]

From 6f8e8019b6cee92b23a3f02787376e0ab3a4244f Mon Sep 17 00:00:00 2001
From: Jiang, Yunhong <yunhong.jiang@intel.com>
Date: Thu, 28 Jan 2010 03:43:41 +0800
Subject: [PATCH] Change the syscall/sysexit entry to use the kernel stack directly

---
 arch/x86/ia32/ia32entry.S       |    2 +-
 arch/x86/include/asm/irqflags.h |    2 ++
 arch/x86/kernel/entry_64.S      |    2 +-
 arch/x86/xen/xen-asm_64.S       |   37 +++++++++++++++++++++++++------------
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b09502d..2f8e942 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -284,6 +284,7 @@ ENTRY(ia32_cstar_target)
 	movl	%esp,%r8d
 	CFI_REGISTER	rsp,r8
 	movq	PER_CPU_VAR(kernel_stack),%rsp
+ENTRY(ia32_cstart_after_switch_stack)
 	/*
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
@@ -337,7 +338,6 @@ sysretl_from_sys_call:
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
 	CFI_RESTORE rsp
 	USERGS_SYSRET32
 	
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c6ccbe7..7e62aed 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -133,9 +133,11 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define INTERRUPT_RETURN	iretq
 #define USERGS_SYSRET64				\
+	movq PER_CPU_VAR(old_rsp), %rsp	\
 	swapgs;					\
 	sysretq;
 #define USERGS_SYSRET32				\
+	movl RSP-ARGOFFSET(%rsp), %esp		\
 	swapgs;					\
 	sysretl
 #define ENABLE_INTERRUPTS_SYSEXIT32		\
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be7..5bc75fe 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -472,6 +472,7 @@ ENTRY(system_call_after_swapgs)
 	 * No need to follow this irqs off/on section - it's straight
 	 * and short:
 	 */
+ENTRY(system_call_after_switch_stack)
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_ARGS 8,1
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -510,7 +511,6 @@ sysret_check:
 	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
 	/*CFI_REGISTER	rflags,r11*/
-	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefd..c953d14 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -12,6 +12,7 @@
  */
 
 #include <asm/errno.h>
+#include <asm/calling.h>
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
@@ -68,9 +69,7 @@ ENTRY(xen_sysret64)
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
-	movq PER_CPU_VAR(kernel_stack), %rsp
-
+    movq PER_CPU_VAR(kernel_stack), %rsp
 	pushq $__USER_DS
 	pushq PER_CPU_VAR(old_rsp)
 	pushq %r11
@@ -84,12 +83,15 @@ RELOC(xen_sysret64, 1b+1)
 
 ENTRY(xen_sysret32)
 	/*
-	 * We're already on the usermode stack at this point, but
+	 * We're still on the kernel mode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
-	movq PER_CPU_VAR(kernel_stack), %rsp
-
+    /* The ARGS is restored, so don't clobber anything */
+    pushq %rax
+    movq RSP-ARGOFFSET(%rsp), %rax
+    movq %rax, PER_CPU_VAR(old_rsp)
+    popq %rax
+    movq PER_CPU_VAR(kernel_stack), %rsp
 	pushq $__USER32_DS
 	pushq PER_CPU_VAR(old_rsp)
 	pushq %r11
@@ -116,27 +118,38 @@ RELOC(xen_sysret32, 1b+1)
  * rsp->rcx
  *
  * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint,
+ * but we will not switch stack
  */
 
 .macro undo_xen_syscall
+    /* Clobber rcx is ok */
+	movq 5*8(%rsp), %rcx
+    movq %rcx, PER_CPU_VAR(old_rsp)
 	mov 0*8(%rsp), %rcx
 	mov 1*8(%rsp), %r11
-	mov 5*8(%rsp), %rsp
+	sub $56, %rsp
 .endm
 
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
 	undo_xen_syscall
-	jmp system_call_after_swapgs
+	jmp system_call_after_switch_stack
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
+.macro undo_xen_ia32_syscall
+	mov 0*8(%rsp), %rcx
+	mov 1*8(%rsp), %r11
+	mov	5*8(%esp), %r8d
+	sub $56, %rsp
+.endm
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-	undo_xen_syscall
-	jmp ia32_cstar_target
+	undo_xen_ia32_syscall
+	movl	%esp,%r8d
+	jmp ia32_cstart_after_switch_stack 
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
-- 
1.5.4.2


[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
  2010-01-28 10:26 [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry Jiang, Yunhong
@ 2010-02-10 18:13 ` Jeremy Fitzhardinge
  2010-02-11  7:42   ` Jiang, Yunhong
  2010-02-11  8:53   ` Jan Beulich
  0 siblings, 2 replies; 7+ messages in thread
From: Jeremy Fitzhardinge @ 2010-02-10 18:13 UTC (permalink / raw)
  To: Jiang, Yunhong; +Cc: xen-devel, Keir Fraser, Jan Beulich

On 01/28/2010 02:26 AM, Jiang, Yunhong wrote:
> Jeremy, this patch is a RFC for MCE injection issue as discussed in http://lists.xensource.com/archives/html/xen-devel/2009-12/msg00849.html .
>
> Currently, when syscall into dom0, the stack is kernel stack. Dom0 kernel will firstly switch to user space stack, to create context same as physical syscall entry and then call kernel's common syscall entry. In kernel's common syscall entry, it will return back to kernel stack.
>    

Does this just apply to dom0? Can a domU kernel ever get an NMI?  (Not 
that it makes any difference since a solution for one will be a solution 
for both, but I'm wondering if there's a reason you're specifically 
talking about dom0).

> This give a small security windows for MCE. If a vMCE is injected into dom0 when dom0 is in the syscalll entry, but is still using the user space stack, it will cause great trouble (check above URL for detailed information).
>
> I can think out two options for this issue:
> a) The method this patch took. Dom0 didn't try to switch to user space stack. Instead, it will use kernel statck directly and jump to kenerl entry.
>    

Seems simple enough.  The patch looks pretty sound in concept, but I 
have some comments inline.

> b) Add IST support to xen hypervisor, Considering the ptrace issue in http://lists.xensource.com/archives/html/xen-devel/2009-05/msg00200.html and also Ingo's comments in http://lkml.indiana.edu/hypermail/linux/kernel/0905.1/00087.html, maybe it still have value to add such support. But not sure if that is accepetable for xen hypervisor.
>    

That would depend on Keir's thoughts, which I would guess depend on what 
the patch and ABI would look like.  It's certainly the most general 
solution if it can be done in a reasonable way.

> As RFC, this patch is just tested to make sure the system can still boot succesffully, I did't try ptrace or something else. If the idea is ok, I will do more testing on it.
>    

Did you try booting it native as well?  And test 32-bit compat usermode?

> Thanks
> Yunhong Jiang
>
>  From 6f8e8019b6cee92b23a3f02787376e0ab3a4244f Mon Sep 17 00:00:00 2001
> From: Jiang, Yunhong<yunhong.jiang@intel.com>
> Date: Thu, 28 Jan 2010 03:43:41 +0800
> Subject: [PATCH] Change the syscall/sysexit entry to use the kernel stack directly
>
> ---
>   arch/x86/ia32/ia32entry.S       |    2 +-
>   arch/x86/include/asm/irqflags.h |    2 ++
>   arch/x86/kernel/entry_64.S      |    2 +-
>   arch/x86/xen/xen-asm_64.S       |   37 +++++++++++++++++++++++++------------
>   4 files changed, 29 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index b09502d..2f8e942 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -284,6 +284,7 @@ ENTRY(ia32_cstar_target)
>   	movl	%esp,%r8d
>   	CFI_REGISTER	rsp,r8
>   	movq	PER_CPU_VAR(kernel_stack),%rsp
> +ENTRY(ia32_cstart_after_switch_stack)
>    

"cstar" I think.

>   	/*
>   	 * No need to follow this irqs on/off section: the syscall
>   	 * disabled irqs and here we enable it straight after entry:
> @@ -337,7 +338,6 @@ sysretl_from_sys_call:
>   	xorq	%r9,%r9
>   	xorq	%r8,%r8
>   	TRACE_IRQS_ON
> -	movl RSP-ARGOFFSET(%rsp),%esp
>   	CFI_RESTORE rsp
>    

You need to update/move this CFI annotation.

>   	USERGS_SYSRET32
>   	
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index c6ccbe7..7e62aed 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -133,9 +133,11 @@ static inline unsigned long __raw_local_irq_save(void)
>
>   #define INTERRUPT_RETURN	iretq
>   #define USERGS_SYSRET64				\
> +	movq PER_CPU_VAR(old_rsp), %rsp	\
>   	swapgs;					\
>   	sysretq;
>   #define USERGS_SYSRET32				\
> +	movl RSP-ARGOFFSET(%rsp), %esp		\
>   	swapgs;					\
>   	sysretl
>    

OK, so you're redefining sysret32/64 to also include the stack switch.  
Makes sense.
>   #define ENABLE_INTERRUPTS_SYSEXIT32		\
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index c251be7..5bc75fe 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -472,6 +472,7 @@ ENTRY(system_call_after_swapgs)
>   	 * No need to follow this irqs off/on section - it's straight
>   	 * and short:
>   	 */
> +ENTRY(system_call_after_switch_stack)
>   	ENABLE_INTERRUPTS(CLBR_NONE)
>   	SAVE_ARGS 8,1
>   	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
> @@ -510,7 +511,6 @@ sysret_check:
>   	CFI_REGISTER	rip,rcx
>   	RESTORE_ARGS 0,-ARG_SKIP,1
>   	/*CFI_REGISTER	rflags,r11*/
> -	movq	PER_CPU_VAR(old_rsp), %rsp
>   	USERGS_SYSRET64
>
>   	CFI_RESTORE_STATE
> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
> index 53adefd..c953d14 100644
> --- a/arch/x86/xen/xen-asm_64.S
> +++ b/arch/x86/xen/xen-asm_64.S
> @@ -12,6 +12,7 @@
>    */
>
>   #include<asm/errno.h>
> +#include<asm/calling.h>
>   #include<asm/percpu.h>
>   #include<asm/processor-flags.h>
>   #include<asm/segment.h>
> @@ -68,9 +69,7 @@ ENTRY(xen_sysret64)
>   	 * We're already on the usermode stack at this point, but
>   	 * still with the kernel gs, so we can easily switch back
>   	 */
>    
This comment needs update/deletion.
> -	movq %rsp, PER_CPU_VAR(old_rsp)
> -	movq PER_CPU_VAR(kernel_stack), %rsp
> -
> +    movq PER_CPU_VAR(kernel_stack), %rsp
>    

Looks like your indentation is funny; this file uses 8-width tabs.

>   	pushq $__USER_DS
>   	pushq PER_CPU_VAR(old_rsp)
>   	pushq %r11
> @@ -84,12 +83,15 @@ RELOC(xen_sysret64, 1b+1)
>
>   ENTRY(xen_sysret32)
>   	/*
> -	 * We're already on the usermode stack at this point, but
> +	 * We're still on the kernel mode stack at this point, but
>   	 * still with the kernel gs, so we can easily switch back
>   	 */
>    
May as well delete the whole comment; it doesn't make any sense in this 
form.

> -	movq %rsp, PER_CPU_VAR(old_rsp)
> -	movq PER_CPU_VAR(kernel_stack), %rsp
> -
> +    /* The ARGS is restored, so don't clobber anything */
> +    pushq %rax
> +    movq RSP-ARGOFFSET(%rsp), %rax
> +    movq %rax, PER_CPU_VAR(old_rsp)
> +    popq %rax
>    
Rather than going via PER_CPU_VAR(old_rsp), maybe it would make more 
sense to restructure the iret frame construction to something like:

	sub $6*8, %rsp
	mov $0, 0*8(%rsp)
	mov %rcx, 1*8(%rsp)
	mov $__USER32_CS, 2*8(%rsp)
	mov %r11, 3*8(%rsp)
	mov PER_CPU_VAR(old_rsp), %rcx
	mov %rcx, 4*8(%rsp)
	mov $__USER32_DS, 5*8(%rsp)
	jmp hypercall_iret


> +    movq PER_CPU_VAR(kernel_stack), %rsp
>    

Why do you need to do this?  Isn't the point that rsp is already the 
kernel stack?

>   	pushq $__USER32_DS
>   	pushq PER_CPU_VAR(old_rsp)
>   	pushq %r11
> @@ -116,27 +118,38 @@ RELOC(xen_sysret32, 1b+1)
>    * rsp->rcx
>    *
>    * In all the entrypoints, we undo all that to make it look like a
> - * CPU-generated syscall/sysenter and jump to the normal entrypoint.
> + * CPU-generated syscall/sysenter and jump to the normal entrypoint,
> + * but we will not switch stack
>    */
>
>   .macro undo_xen_syscall
> +    /* Clobber rcx is ok */
> +	movq 5*8(%rsp), %rcx
> +    movq %rcx, PER_CPU_VAR(old_rsp)
>   	mov 0*8(%rsp), %rcx
>   	mov 1*8(%rsp), %r11
> -	mov 5*8(%rsp), %rsp
> +	sub $56, %rsp
>    

What's this for?  Where does "56" come from?

>   .endm
>
>   /* Normal 64-bit system call target */
>   ENTRY(xen_syscall_target)
>   	undo_xen_syscall
> -	jmp system_call_after_swapgs
> +	jmp system_call_after_switch_stack
>   ENDPROC(xen_syscall_target)
>
>   #ifdef CONFIG_IA32_EMULATION
>
> +.macro undo_xen_ia32_syscall
> +	mov 0*8(%rsp), %rcx
> +	mov 1*8(%rsp), %r11
> +	mov	5*8(%esp), %r8d
> +	sub $56, %rsp
> +.endm
>   /* 32-bit compat syscall target */
>   ENTRY(xen_syscall32_target)
> -	undo_xen_syscall
> -	jmp ia32_cstar_target
> +	undo_xen_ia32_syscall
> +	movl	%esp,%r8d
> +	jmp ia32_cstart_after_switch_stack
>   ENDPROC(xen_syscall32_target)
>
>   /* 32-bit compat sysenter target */
>    


Thanks,
     J

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
  2010-02-10 18:13 ` Jeremy Fitzhardinge
@ 2010-02-11  7:42   ` Jiang, Yunhong
  2010-02-11  8:55     ` Jan Beulich
  2010-02-11  8:53   ` Jan Beulich
  1 sibling, 1 reply; 7+ messages in thread
From: Jiang, Yunhong @ 2010-02-11  7:42 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser, Jan Beulich


>-----Original Message-----
>From: Jeremy Fitzhardinge [mailto:jeremy@goop.org]
>Sent: Thursday, February 11, 2010 2:13 AM
>To: Jiang, Yunhong
>Cc: Keir Fraser; xen-devel@lists.xensource.com; Jan Beulich
>Subject: Re: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall
>entry
>
>On 01/28/2010 02:26 AM, Jiang, Yunhong wrote:
>> Jeremy, this patch is a RFC for MCE injection issue as discussed in
>>http://lists.xensource.com/archives/html/xen-devel/2009-12/msg00849.html .
>>
>> Currently, when syscall into dom0, the stack is kernel stack. Dom0 kernel will firstly
>>switch to user space stack, to create context same as physical syscall entry and then
>>call kernel's common syscall entry. In kernel's common syscall entry, it will return
>>Lback to kernel stack.
>>
>
>Does this just apply to dom0? Can a domU kernel ever get an NMI?  (Not
>that it makes any difference since a solution for one will be a solution
>for both, but I'm wondering if there's a reason you're specifically
>talking about dom0).

Thanks for comments very much. 
Yes, this apply for both dom0 and domU. I didn't state domU because I have no idea how will NMI be applied. In fact, I'm told by Jan that NMI is needed by dom0.
For MCE, currently we hide MCA capability for domU in cpuid, from both libxc and domU kernel. In fact, I'm not sure if we do need enable the vMCE to PV guest. Comparing to HVM, which have statard hardware interface, PV guest interface may be different.

>
>> This give a small security windows for MCE. If a vMCE is injected into dom0 when
>dom0 is in the syscalll entry, but is still using the user space stack, it will cause great
>trouble (check above URL for detailed information).
>>
>> I can think out two options for this issue:
>> a) The method this patch took. Dom0 didn't try to switch to user space stack.
>>Instead, it will use kernel statck directly and jump to kenerl entry.
>>
>
>Seems simple enough.  The patch looks pretty sound in concept, but I
>have some comments inline.
>
>> b) Add IST support to xen hypervisor, Considering the ptrace issue in
>>http://lists.xensource.com/archives/html/xen-devel/2009-05/msg00200.html and
>>also Ingo's comments in
>>http://lkml.indiana.edu/hypermail/linux/kernel/0905.1/00087.html, maybe it still
>>have value to add such support. But not sure if that is accepetable for xen
>>hypervisor.
>>
>
>That would depend on Keir's thoughts, which I would guess depend on what
>the patch and ABI would look like.  It's certainly the most general
>solution if it can be done in a reasonable way.

Yes, that depends on Keir's suggestion, although he may not be fan for vNMI/vMCE :-)

>
>> As RFC, this patch is just tested to make sure the system can still boot succesffully,
>I did't try ptrace or something else. If the idea is ok, I will do more testing on it.
>>
>
>Did you try booting it native as well?  And test 32-bit compat usermode?

Yes, I tested it on native and works well, but didn't test 32-bit application.

>
>> Thanks
>> Yunhong Jiang
>>
>>  From 6f8e8019b6cee92b23a3f02787376e0ab3a4244f Mon Sep 17 00:00:00 2001
>> From: Jiang, Yunhong<yunhong.jiang@intel.com>
>> Date: Thu, 28 Jan 2010 03:43:41 +0800
>> Subject: [PATCH] Change the syscall/sysexit entry to use the kernel stack directly
>>
>> ---
>>   arch/x86/ia32/ia32entry.S       |    2 +-
>>   arch/x86/include/asm/irqflags.h |    2 ++
>>   arch/x86/kernel/entry_64.S      |    2 +-
>>   arch/x86/xen/xen-asm_64.S       |   37
>+++++++++++++++++++++++++------------
>>   4 files changed, 29 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
>> index b09502d..2f8e942 100644
>> --- a/arch/x86/ia32/ia32entry.S
>> +++ b/arch/x86/ia32/ia32entry.S
>> @@ -284,6 +284,7 @@ ENTRY(ia32_cstar_target)
>>   	movl	%esp,%r8d
>>   	CFI_REGISTER	rsp,r8
>>   	movq	PER_CPU_VAR(kernel_stack),%rsp
>> +ENTRY(ia32_cstart_after_switch_stack)
>>
>
>"cstar" I think.

Yes, typo :$

>
>>   	/*
>>   	 * No need to follow this irqs on/off section: the syscall
>>   	 * disabled irqs and here we enable it straight after entry:
>> @@ -337,7 +338,6 @@ sysretl_from_sys_call:
>>   	xorq	%r9,%r9
>>   	xorq	%r8,%r8
>>   	TRACE_IRQS_ON
>> -	movl RSP-ARGOFFSET(%rsp),%esp
>>   	CFI_RESTORE rsp
>>
>
>You need to update/move this CFI annotation.
I will spend time time to get the whole CFI clean. BTW, how can we test if the CFI is changed correctly?

>
>>   	USERGS_SYSRET32
>>
>> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>> index c6ccbe7..7e62aed 100644
>> --- a/arch/x86/include/asm/irqflags.h
>> +++ b/arch/x86/include/asm/irqflags.h
>> @@ -133,9 +133,11 @@ static inline unsigned long __raw_local_irq_save(void)
>>
>>   #define INTERRUPT_RETURN	iretq
>>   #define USERGS_SYSRET64				\
>> +	movq PER_CPU_VAR(old_rsp), %rsp	\
>>   	swapgs;					\
>>   	sysretq;
>>   #define USERGS_SYSRET32				\
>> +	movl RSP-ARGOFFSET(%rsp), %esp		\
>>   	swapgs;					\
>>   	sysretl
>>
>
>OK, so you're redefining sysret32/64 to also include the stack switch.
>Makes sense.

Yes, we need make sure the user stack will not be used while in kernel context.

>>   #define ENABLE_INTERRUPTS_SYSEXIT32		\
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index c251be7..5bc75fe 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -472,6 +472,7 @@ ENTRY(system_call_after_swapgs)
>>   	 * No need to follow this irqs off/on section - it's straight
>>   	 * and short:
>>   	 */
>> +ENTRY(system_call_after_switch_stack)
>>   	ENABLE_INTERRUPTS(CLBR_NONE)
>>   	SAVE_ARGS 8,1
>>   	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
>> @@ -510,7 +511,6 @@ sysret_check:
>>   	CFI_REGISTER	rip,rcx
>>   	RESTORE_ARGS 0,-ARG_SKIP,1
>>   	/*CFI_REGISTER	rflags,r11*/
>> -	movq	PER_CPU_VAR(old_rsp), %rsp
>>   	USERGS_SYSRET64
>>
>>   	CFI_RESTORE_STATE
>> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
>> index 53adefd..c953d14 100644
>> --- a/arch/x86/xen/xen-asm_64.S
>> +++ b/arch/x86/xen/xen-asm_64.S
>> @@ -12,6 +12,7 @@
>>    */
>>
>>   #include<asm/errno.h>
>> +#include<asm/calling.h>
>>   #include<asm/percpu.h>
>>   #include<asm/processor-flags.h>
>>   #include<asm/segment.h>
>> @@ -68,9 +69,7 @@ ENTRY(xen_sysret64)
>>   	 * We're already on the usermode stack at this point, but
>>   	 * still with the kernel gs, so we can easily switch back
>>   	 */
>>
>This comment needs update/deletion.

Yes, will change it.

>> -	movq %rsp, PER_CPU_VAR(old_rsp)
>> -	movq PER_CPU_VAR(kernel_stack), %rsp
>> -
>> +    movq PER_CPU_VAR(kernel_stack), %rsp
>>
>
>Looks like your indentation is funny; this file uses 8-width tabs.

I will check my .vimrc :(

>
>>   	pushq $__USER_DS
>>   	pushq PER_CPU_VAR(old_rsp)
>>   	pushq %r11
>> @@ -84,12 +83,15 @@ RELOC(xen_sysret64, 1b+1)
>>
>>   ENTRY(xen_sysret32)
>>   	/*
>> -	 * We're already on the usermode stack at this point, but
>> +	 * We're still on the kernel mode stack at this point, but
>>   	 * still with the kernel gs, so we can easily switch back
>>   	 */
>>
>May as well delete the whole comment; it doesn't make any sense in this
>form.

Sure.

>
>> -	movq %rsp, PER_CPU_VAR(old_rsp)
>> -	movq PER_CPU_VAR(kernel_stack), %rsp
>> -
>> +    /* The ARGS is restored, so don't clobber anything */
>> +    pushq %rax
>> +    movq RSP-ARGOFFSET(%rsp), %rax
>> +    movq %rax, PER_CPU_VAR(old_rsp)
>> +    popq %rax
>>
>Rather than going via PER_CPU_VAR(old_rsp), maybe it would make more
>sense to restructure the iret frame construction to something like:
>
>	sub $6*8, %rsp
>	mov $0, 0*8(%rsp)
>	mov %rcx, 1*8(%rsp)
>	mov $__USER32_CS, 2*8(%rsp)
>	mov %r11, 3*8(%rsp)
>	mov PER_CPU_VAR(old_rsp), %rcx
>	mov %rcx, 4*8(%rsp)
>	mov $__USER32_DS, 5*8(%rsp)
>	jmp hypercall_iret

Yes, that will be clear.

>
>
>> +    movq PER_CPU_VAR(kernel_stack), %rsp
>>
>
>Why do you need to do this?  Isn't the point that rsp is already the
>kernel stack?

My patch simply didn't change anything except the "movq %rsp, PER_CPU_VAR(old_rsp)", to avoid needless changes.
But yes, you are right, here we don't need to switch back. In fact, this " movq PER_CPU_VAR(kernel_stack), %rsp" is wrong if for vNMI happens when guest is running in kernel already, we may clober the stack context before the NMI. 

>
>>   	pushq $__USER32_DS
>>   	pushq PER_CPU_VAR(old_rsp)
>>   	pushq %r11
>> @@ -116,27 +118,38 @@ RELOC(xen_sysret32, 1b+1)
>>    * rsp->rcx
>>    *
>>    * In all the entrypoints, we undo all that to make it look like a
>> - * CPU-generated syscall/sysenter and jump to the normal entrypoint.
>> + * CPU-generated syscall/sysenter and jump to the normal entrypoint,
>> + * but we will not switch stack
>>    */
>>
>>   .macro undo_xen_syscall
>> +    /* Clobber rcx is ok */
>> +	movq 5*8(%rsp), %rcx
>> +    movq %rcx, PER_CPU_VAR(old_rsp)
>>   	mov 0*8(%rsp), %rcx
>>   	mov 1*8(%rsp), %r11
>> -	mov 5*8(%rsp), %rsp
>> +	sub $56, %rsp
>>
>
>What's this for?  Where does "56" come from?

Just to unwind the stack. We don't need the hypercall frame anymore now. the 56 is caculated for the stack frame size.

Thanks
Yunhong Jiang

>
>>   .endm
>>
>>   /* Normal 64-bit system call target */
>>   ENTRY(xen_syscall_target)
>>   	undo_xen_syscall
>> -	jmp system_call_after_swapgs
>> +	jmp system_call_after_switch_stack
>>   ENDPROC(xen_syscall_target)
>>
>>   #ifdef CONFIG_IA32_EMULATION
>>
>> +.macro undo_xen_ia32_syscall
>> +	mov 0*8(%rsp), %rcx
>> +	mov 1*8(%rsp), %r11
>> +	mov	5*8(%esp), %r8d
>> +	sub $56, %rsp
>> +.endm
>>   /* 32-bit compat syscall target */
>>   ENTRY(xen_syscall32_target)
>> -	undo_xen_syscall
>> -	jmp ia32_cstar_target
>> +	undo_xen_ia32_syscall
>> +	movl	%esp,%r8d
>> +	jmp ia32_cstart_after_switch_stack
>>   ENDPROC(xen_syscall32_target)
>>
>>   /* 32-bit compat sysenter target */
>>
>
>
>Thanks,
>     J

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
  2010-02-10 18:13 ` Jeremy Fitzhardinge
  2010-02-11  7:42   ` Jiang, Yunhong
@ 2010-02-11  8:53   ` Jan Beulich
  2010-02-12  4:30     ` Jiang, Yunhong
  1 sibling, 1 reply; 7+ messages in thread
From: Jan Beulich @ 2010-02-11  8:53 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Yunhong Jiang; +Cc: xen-devel, Keir Fraser

>>> Jeremy Fitzhardinge <jeremy@goop.org> 10.02.10 19:13 >>>
>> b) Add IST support to xen hypervisor, Considering the ptrace issue in http://lists.xensource.com/archives/html/xen-devel/2009-05/msg00200.html and also Ingo's comments in http://lkml.indiana.edu/hypermail/linux/kernel/0905.1/00087.html, maybe it still have value to add such support. But not sure if that is accepetable for xen hypervisor.
>>    
>
>That would depend on Keir's thoughts, which I would guess depend on what 
>the patch and ABI would look like.  It's certainly the most general 
>solution if it can be done in a reasonable way.

But if at all, I'd recommend considering a properly nesting solution
rather than just to mimic hardware behavior (since there's no IDT,
functionality needs to differ from native anyway).

Jan

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
  2010-02-11  7:42   ` Jiang, Yunhong
@ 2010-02-11  8:55     ` Jan Beulich
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2010-02-11  8:55 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Yunhong Jiang; +Cc: xen-devel, Keir Fraser

>>> "Jiang, Yunhong" <yunhong.jiang@intel.com> 11.02.10 08:42 >>>
>Yes, this apply for both dom0 and domU. I didn't state domU because I have no idea how will NMI be applied. In fact, I'm told by Jan that NMI is needed by dom0.

That's the current state. Xen has ways to inject NMIs to DomU-s, just
that currently there's no user of it in Linux or (as far as I'm aware) the
tools.

Jan

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] [PATCH] Dom0: Don't switch back to user space stack  in syscall entry
  2010-02-11  8:53   ` Jan Beulich
@ 2010-02-12  4:30     ` Jiang, Yunhong
  2010-02-12  8:16       ` Jan Beulich
  0 siblings, 1 reply; 7+ messages in thread
From: Jiang, Yunhong @ 2010-02-12  4:30 UTC (permalink / raw)
  To: Jan Beulich, Jeremy Fitzhardinge; +Cc: xen-devel, Keir Fraser



>-----Original Message-----
>From: Jan Beulich [mailto:JBeulich@novell.com]
>Sent: Thursday, February 11, 2010 4:54 PM
>To: Jeremy Fitzhardinge; Jiang, Yunhong
>Cc: Keir Fraser; xen-devel@lists.xensource.com
>Subject: Re: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall
>entry
>
>>>> Jeremy Fitzhardinge <jeremy@goop.org> 10.02.10 19:13 >>>
>>> b) Add IST support to xen hypervisor, Considering the ptrace issue in
>http://lists.xensource.com/archives/html/xen-devel/2009-05/msg00200.html and
>also Ingo's comments in
>http://lkml.indiana.edu/hypermail/linux/kernel/0905.1/00087.html, maybe it still
>have value to add such support. But not sure if that is accepetable for xen
>hypervisor.
>>>
>>
>>That would depend on Keir's thoughts, which I would guess depend on what
>>the patch and ABI would look like.  It's certainly the most general
>>solution if it can be done in a reasonable way.
>
>But if at all, I'd recommend considering a properly nesting solution
>rather than just to mimic hardware behavior (since there's no IDT,
>functionality needs to differ from native anyway).

Are there any issue currently for nesting interrupt? The issue here is because of the syscall's stack, which is not strictly related to nesting interrupt, I think.

--jyh

>
>Jan

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry
  2010-02-12  4:30     ` Jiang, Yunhong
@ 2010-02-12  8:16       ` Jan Beulich
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2010-02-12  8:16 UTC (permalink / raw)
  To: Yunhong Jiang; +Cc: Jeremy Fitzhardinge, xen-devel, Keir Fraser

>>> "Jiang, Yunhong" <yunhong.jiang@intel.com> 12.02.10 05:30 >>>
>Are there any issue currently for nesting interrupt? The issue here is
>because of the syscall's stack, which is not strictly related to nesting
>interrupt, I think.

For the specific syscall case there probably aren't (but I didn't fully
think this through). But for the general case there are, so if IST-like
support is to be added, it should account for nesting.

Jan

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2010-02-12  8:16 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-28 10:26 [RFC] [PATCH] Dom0: Don't switch back to user space stack in syscall entry Jiang, Yunhong
2010-02-10 18:13 ` Jeremy Fitzhardinge
2010-02-11  7:42   ` Jiang, Yunhong
2010-02-11  8:55     ` Jan Beulich
2010-02-11  8:53   ` Jan Beulich
2010-02-12  4:30     ` Jiang, Yunhong
2010-02-12  8:16       ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.