* [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
@ 2011-06-03 13:57 Li, Xin
  2011-06-03 17:23 ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-03 13:57 UTC
  To: 'Keir Fraser (keir.xen@gmail.com)'; +Cc: xen-devel

New Intel CPUs support SMEP (Supervisor Mode Execution Protection). SMEP
prevents software operating with CPL < 3 (supervisor mode) from fetching
instructions from any linear address with a valid translation for which the
U/S flag (bit 2) is 1 in every paging-structure entry controlling the
translation for the linear address.

This patch enables SMEP in Xen to protect the hypervisor from executing pv
guest instructions, i.e. instructions whose translation paging-structure
entries all have the U/S flag set.

A pv guest that triggers a SMEP fault is crashed.
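
As an illustrative sketch (not part of the patch itself), the fault
classification this adds boils down to:

    /* Illustrative sketch only: a fault is a SMEP fault iff it is an
     * instruction fetch from supervisor mode (error code 10001b: I/D set,
     * U/S clear, P set), CR4.SMEP is enabled, and the U/S flag is 1 in
     * every paging-structure entry of the walk. */
    static inline int is_smep_fault(int insn_fetch, int user_mode,
                                    int cr4_smep, int us_all_levels)
    {
        return insn_fetch && !user_mode && cr4_smep && us_all_levels;
    }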

Signed-off-by: Yang Wei <wei.y.yang@intel.com>
Signed-off-by: Shan Haitao <haitao.shan@intel.com>
Signed-off-by: Li Xin <xin.li@intel.com>

diff -r 0791661a32d8 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c	Thu Jun 02 18:46:35 2011 +0100
+++ b/xen/arch/x86/cpu/common.c	Fri Jun 03 19:59:43 2011 +0800
@@ -222,6 +222,22 @@ static void __init early_cpu_detect(void
 	c->x86_capability[4] = cap4;
 }
 
+/* nosmep: if true, Intel SMEP is disabled. */
+static bool_t disable_smep __cpuinitdata;
+boolean_param("nosmep", disable_smep);
+
+static void __cpuinit setup_smep(struct cpuinfo_x86 *c)
+{
+	if ( cpu_has(c, X86_FEATURE_SMEP) ) {
+		if ( unlikely(disable_smep) )
+			setup_clear_cpu_cap(X86_FEATURE_SMEP);
+		else {
+			set_in_cr4(X86_CR4_SMEP);
+			mmu_cr4_features |= X86_CR4_SMEP;
+		}
+	}
+}
+
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl, capability, excap, ebx;
@@ -268,6 +284,8 @@ void __cpuinit generic_identify(struct c
 		c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx;
 	}
 
+	setup_smep(c);
+
 	early_intel_workaround(c);
 
 #ifdef CONFIG_X86_HT
diff -r 0791661a32d8 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c	Thu Jun 02 18:46:35 2011 +0100
+++ b/xen/arch/x86/traps.c	Fri Jun 03 19:59:43 2011 +0800
@@ -1139,7 +1139,13 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
-static int __spurious_page_fault(
+enum pf_type {
+    real_page_fault,
+    smep_fault,
+    spurious_fault
+};
+
+static enum pf_type __page_fault_type(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
@@ -1151,7 +1157,7 @@ static int __spurious_page_fault(
 #endif
     l2_pgentry_t l2e, *l2t;
     l1_pgentry_t l1e, *l1t;
-    unsigned int required_flags, disallowed_flags;
+    unsigned int required_flags, disallowed_flags, u_flag;
 
     /*
      * We do not take spurious page faults in IRQ handlers as we do not
@@ -1159,11 +1165,11 @@ static int __spurious_page_fault(
      * map_domain_page() is not IRQ-safe.
      */
     if ( in_irq() )
-        return 0;
+        return real_page_fault;
 
     /* Reserved bit violations are never spurious faults. */
     if ( error_code & PFEC_reserved_bit )
-        return 0;
+        return real_page_fault;
 
     required_flags  = _PAGE_PRESENT;
     if ( error_code & PFEC_write_access )
@@ -1175,6 +1181,8 @@ static int __spurious_page_fault(
     if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX_BIT;
 
+    u_flag = _PAGE_USER;
+
     mfn = cr3 >> PAGE_SHIFT;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1184,7 +1192,9 @@ static int __spurious_page_fault(
     unmap_domain_page(l4t);
     if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
          (l4e_get_flags(l4e) & disallowed_flags) )
-        return 0;
+        return real_page_fault;
+
+    u_flag &= l4e_get_flags(l4e);
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 3
@@ -1197,13 +1207,23 @@ static int __spurious_page_fault(
     unmap_domain_page(l3t);
 #if CONFIG_PAGING_LEVELS == 3
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
+        return real_page_fault;
 #else
     if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
          (l3e_get_flags(l3e) & disallowed_flags) )
-        return 0;
-    if ( l3e_get_flags(l3e) & _PAGE_PSE )
-        return 1;
+        return real_page_fault;
+
+    u_flag &= l3e_get_flags(l3e);
+
+    if ( l3e_get_flags(l3e) & _PAGE_PSE ) {
+        /* SMEP fault error code 10001b */
+        if ( (error_code & PFEC_insn_fetch) &&
+             !(error_code & PFEC_user_mode) &&
+             cpu_has_smep && u_flag )
+            return smep_fault;
+
+        return spurious_fault;
+    }
 #endif
 #endif
 
@@ -1213,9 +1233,19 @@ static int __spurious_page_fault(
     unmap_domain_page(l2t);
     if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
          (l2e_get_flags(l2e) & disallowed_flags) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
+        return real_page_fault;
+
+    u_flag &= l2e_get_flags(l2e);
+
+    if ( l2e_get_flags(l2e) & _PAGE_PSE ) {
+        /* SMEP fault error code 10001b */
+        if ( (error_code & PFEC_insn_fetch) &&
+             !(error_code & PFEC_user_mode) &&
+             cpu_has_smep && u_flag )
+            return smep_fault;
+
+        return spurious_fault;
+    }
 
     l1t = map_domain_page(mfn);
     l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
@@ -1223,26 +1253,37 @@ static int __spurious_page_fault(
     unmap_domain_page(l1t);
     if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
          (l1e_get_flags(l1e) & disallowed_flags) )
-        return 0;
-
-    return 1;
+        return real_page_fault;
+
+    u_flag &= l1e_get_flags(l1e);
+
+    /* SMEP fault error code 10001b */
+    if ( (error_code & PFEC_insn_fetch) &&
+         !(error_code & PFEC_user_mode) &&
+         cpu_has_smep && u_flag )
+        return smep_fault;
+
+    return spurious_fault;
 }
 
 static int spurious_page_fault(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
-    int           is_spurious;
+    enum pf_type  pf_type;
 
     /*
      * Disabling interrupts prevents TLB flushing, and hence prevents
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, error_code);
+    pf_type = __page_fault_type(addr, error_code);
     local_irq_restore(flags);
 
-    return is_spurious;
+    if ( pf_type == smep_fault )
+        domain_crash_synchronous();
+
+    return (pf_type == spurious_fault);
 }
 
 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
diff -r 0791661a32d8 xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h	Thu Jun 02 18:46:35 2011 +0100
+++ b/xen/include/asm-x86/cpufeature.h	Fri Jun 03 19:59:43 2011 +0800
@@ -141,8 +141,9 @@
 #define X86_FEATURE_TBM         (6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT     (6*32+22) /* topology extensions CPUID leafs */
 
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */
 #define X86_FEATURE_FSGSBASE	(7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
+#define X86_FEATURE_SMEP	(7*32+ 7) /* Supervisor Mode Execution Protection */
 
 #define cpu_has(c, bit)		test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)	test_bit(bit, boot_cpu_data.x86_capability)
@@ -202,6 +203,8 @@
 #define cpu_has_fsgsbase	boot_cpu_has(X86_FEATURE_FSGSBASE)
 #endif
 
+#define cpu_has_smep            boot_cpu_has(X86_FEATURE_SMEP)
+
 #define cpu_has_ffxsr           ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 
diff -r 0791661a32d8 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Thu Jun 02 18:46:35 2011 +0100
+++ b/xen/include/asm-x86/domain.h	Fri Jun 03 19:59:43 2011 +0800
@@ -527,12 +527,14 @@ unsigned long pv_guest_cr4_fixup(const s
 /* Convert between guest-visible and real CR4 values. */
 #define pv_guest_cr4_to_real_cr4(v)                         \
     (((v)->arch.pv_vcpu.ctrlreg[4]                          \
-      | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))    \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)         \
-      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))              \
-      & ~X86_CR4_DE)
-#define real_cr4_to_pv_guest_cr4(c) \
-    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | X86_CR4_OSXSAVE))
+      | (mmu_cr4_features                                   \
+         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP))      \
+      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
+      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))          \
+     & ~X86_CR4_DE)
+#define real_cr4_to_pv_guest_cr4(c)                         \
+    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD        \
+             | X86_CR4_OSXSAVE | X86_CR4_SMEP))
 
 void domain_cpuid(struct domain *d,
                   unsigned int  input,
diff -r 0791661a32d8 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h	Thu Jun 02 18:46:35 2011 +0100
+++ b/xen/include/asm-x86/processor.h	Fri Jun 03 19:59:43 2011 +0800
@@ -85,6 +85,7 @@
 #define X86_CR4_SMXE		0x4000  /* enable SMX */
 #define X86_CR4_FSGSBASE	0x10000 /* enable {rd,wr}{fs,gs}base */
 #define X86_CR4_OSXSAVE	0x40000 /* enable XSAVE/XRSTOR */
+#define X86_CR4_SMEP		0x100000/* enable SMEP */
 
 /*
  * Trap/fault mnemonics.


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-03 13:57 [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor Li, Xin
@ 2011-06-03 17:23 ` Keir Fraser
  2011-06-03 18:49   ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2011-06-03 17:23 UTC
  To: Li, Xin; +Cc: xen-devel


On 03/06/2011 14:57, "Li, Xin" <xin.li@intel.com> wrote:

> New Intel CPUs support SMEP (Supervisor Mode Execution Protection). SMEP
> prevents software operating with CPL < 3 (supervisor mode) from fetching
> instructions from any linear address with a valid translation for which the
> U/S flag (bit 2) is 1 in every paging-structure entry controlling the
> translation for the linear address.
> 
> This patch enables SMEP in Xen to protect the hypervisor from executing pv
> guest instructions, i.e. instructions whose translation paging-structure
> entries all have the U/S flag set.

Xin,

I have attached a modified version of your patch which I believe deals with
a number of issues. I briefly outline the major ones here:

1. The initialisation in cpu/common.c is misguided. Features like
set_in_cr4(), setup_clear_cpu_cap(), and cpu_has_xxx, are designed to allow
the feature initialisation to happen close in code to where the feature is
used. Hence I have moved the initialisation into traps.c:trap_init().

2. I have renamed the command-line option to 'smep'. Boolean options can be
inverted by prepending 'no-', or appending '={0,false,off,no}'. We don't
generally encode the negative within the base option name string.
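
So, for example, all of the following spellings work on the Xen command
line (illustrative, per the boolean-option rules just stated):

    smep          # enabled (the default)
    no-smep       # disabled
    smep=off      # disabled (likewise =0, =false, =no)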

3. I have pushed interpretation of the pf_type enumeration out to the
callers of spurious_page_fault(). This is because a SMEP fault while Xen is
executing should *crash Xen*. So that's what I do. Further, when it is a
guest fault, we should use domain_crash(), not domain_crash_synchronous()
(which is generally a more dangerous function for Xen's own health).

More generally, I wonder whether SMEP should be enabled only for guests
(even PV guests) which detect it via CPUID and proactively enable it via
their virtualised CR4? I mean, it is off in real hardware by default for a
reason: backward compatibility. Furthermore, we only detect spurious page
faults for buggy old PV guests, the rest will get the SMEP fault punted up
to them, which seems odd if they don't understand SMEP. What do you think?

I have cc'ed Jan since he has commented on previous versions of this patch.

 -- Keir

> A pv guest that triggers a SMEP fault is crashed.
> 
> Signed-off-by: Yang Wei <wei.y.yang@intel.com>
> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
> Signed-off-by: Li Xin <xin.li@intel.com>
> 
> diff -r 0791661a32d8 xen/arch/x86/cpu/common.c
> --- a/xen/arch/x86/cpu/common.c Thu Jun 02 18:46:35 2011 +0100
> +++ b/xen/arch/x86/cpu/common.c Fri Jun 03 19:59:43 2011 +0800
> @@ -222,6 +222,22 @@ static void __init early_cpu_detect(void
> c->x86_capability[4] = cap4;
>  }
>  
> +/* nosmep: if true, Intel SMEP is disabled. */
> +static bool_t disable_smep __cpuinitdata;
> +boolean_param("nosmep", disable_smep);
> +
> +static void __cpuinit setup_smep(struct cpuinfo_x86 *c)
> +{
> + if ( cpu_has(c, X86_FEATURE_SMEP) ) {
> +  if ( unlikely(disable_smep) )
> +   setup_clear_cpu_cap(X86_FEATURE_SMEP);
> +  else {
> +   set_in_cr4(X86_CR4_SMEP);
> +   mmu_cr4_features |= X86_CR4_SMEP;
> +  }
> + }
> +}
> +
>  void __cpuinit generic_identify(struct cpuinfo_x86 * c)
>  {
> u32 tfms, xlvl, capability, excap, ebx;
> @@ -268,6 +284,8 @@ void __cpuinit generic_identify(struct c
> c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx;
> }
>  
> + setup_smep(c);
> +
> early_intel_workaround(c);
>  
>  #ifdef CONFIG_X86_HT
> diff -r 0791661a32d8 xen/arch/x86/traps.c
> --- a/xen/arch/x86/traps.c Thu Jun 02 18:46:35 2011 +0100
> +++ b/xen/arch/x86/traps.c Fri Jun 03 19:59:43 2011 +0800
> @@ -1139,7 +1139,13 @@ static int handle_gdt_ldt_mapping_fault(
>      (((va) >= HYPERVISOR_VIRT_START))
>  #endif
>  
> -static int __spurious_page_fault(
> +enum pf_type {
> +    real_page_fault,
> +    smep_fault,
> +    spurious_fault
> +};
> +
> +static enum pf_type __page_fault_type(
>      unsigned long addr, unsigned int error_code)
>  {
>      unsigned long mfn, cr3 = read_cr3();
> @@ -1151,7 +1157,7 @@ static int __spurious_page_fault(
>  #endif
>      l2_pgentry_t l2e, *l2t;
>      l1_pgentry_t l1e, *l1t;
> -    unsigned int required_flags, disallowed_flags;
> +    unsigned int required_flags, disallowed_flags, u_flag;
>  
>      /*
>       * We do not take spurious page faults in IRQ handlers as we do not
> @@ -1159,11 +1165,11 @@ static int __spurious_page_fault(
>       * map_domain_page() is not IRQ-safe.
>       */
>      if ( in_irq() )
> -        return 0;
> +        return real_page_fault;
>  
>      /* Reserved bit violations are never spurious faults. */
>      if ( error_code & PFEC_reserved_bit )
> -        return 0;
> +        return real_page_fault;
>  
>      required_flags  = _PAGE_PRESENT;
>      if ( error_code & PFEC_write_access )
> @@ -1175,6 +1181,8 @@ static int __spurious_page_fault(
>      if ( error_code & PFEC_insn_fetch )
>          disallowed_flags |= _PAGE_NX_BIT;
>  
> +    u_flag = _PAGE_USER;
> +
>      mfn = cr3 >> PAGE_SHIFT;
>  
>  #if CONFIG_PAGING_LEVELS >= 4
> @@ -1184,7 +1192,9 @@ static int __spurious_page_fault(
>      unmap_domain_page(l4t);
>      if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
>           (l4e_get_flags(l4e) & disallowed_flags) )
> -        return 0;
> +        return real_page_fault;
> +
> +    u_flag &= l4e_get_flags(l4e);
>  #endif
>  
>  #if CONFIG_PAGING_LEVELS >= 3
> @@ -1197,13 +1207,23 @@ static int __spurious_page_fault(
>      unmap_domain_page(l3t);
>  #if CONFIG_PAGING_LEVELS == 3
>      if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
> -        return 0;
> +        return real_page_fault;
>  #else
>      if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
>           (l3e_get_flags(l3e) & disallowed_flags) )
> -        return 0;
> -    if ( l3e_get_flags(l3e) & _PAGE_PSE )
> -        return 1;
> +        return real_page_fault;
> +
> +    u_flag &= l3e_get_flags(l3e);
> +
> +    if ( l3e_get_flags(l3e) & _PAGE_PSE ) {
> +        /* SMEP fault error code 10001b */
> +        if ( (error_code & PFEC_insn_fetch) &&
> +             !(error_code & PFEC_user_mode) &&
> +             cpu_has_smep && u_flag )
> +            return smep_fault;
> +
> +        return spurious_fault;
> +    }
>  #endif
>  #endif
>  
> @@ -1213,9 +1233,19 @@ static int __spurious_page_fault(
>      unmap_domain_page(l2t);
>      if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
>           (l2e_get_flags(l2e) & disallowed_flags) )
> -        return 0;
> -    if ( l2e_get_flags(l2e) & _PAGE_PSE )
> -        return 1;
> +        return real_page_fault;
> +
> +    u_flag &= l2e_get_flags(l2e);
> +
> +    if ( l2e_get_flags(l2e) & _PAGE_PSE ) {
> +        /* SMEP fault error code 10001b */
> +        if ( (error_code & PFEC_insn_fetch) &&
> +             !(error_code & PFEC_user_mode) &&
> +             cpu_has_smep && u_flag )
> +            return smep_fault;
> +
> +        return spurious_fault;
> +    }
>  
>      l1t = map_domain_page(mfn);
>      l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
> @@ -1223,26 +1253,37 @@ static int __spurious_page_fault(
>      unmap_domain_page(l1t);
>      if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
>           (l1e_get_flags(l1e) & disallowed_flags) )
> -        return 0;
> -
> -    return 1;
> +        return real_page_fault;
> +
> +    u_flag &= l1e_get_flags(l1e);
> +
> +    /* SMEP fault error code 10001b */
> +    if ( (error_code & PFEC_insn_fetch) &&
> +         !(error_code & PFEC_user_mode) &&
> +         cpu_has_smep && u_flag )
> +        return smep_fault;
> +
> +    return spurious_fault;
>  }
>  
>  static int spurious_page_fault(
>      unsigned long addr, unsigned int error_code)
>  {
>      unsigned long flags;
> -    int           is_spurious;
> +    enum pf_type  pf_type;
>  
>      /*
>       * Disabling interrupts prevents TLB flushing, and hence prevents
>       * page tables from becoming invalid under our feet during the walk.
>       */
>      local_irq_save(flags);
> -    is_spurious = __spurious_page_fault(addr, error_code);
> +    pf_type = __page_fault_type(addr, error_code);
>      local_irq_restore(flags);
>  
> -    return is_spurious;
> +    if ( pf_type == smep_fault )
> +        domain_crash_synchronous();
> +
> +    return (pf_type == spurious_fault);
>  }
>  
>  static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
> diff -r 0791661a32d8 xen/include/asm-x86/cpufeature.h
> --- a/xen/include/asm-x86/cpufeature.h Thu Jun 02 18:46:35 2011 +0100
> +++ b/xen/include/asm-x86/cpufeature.h Fri Jun 03 19:59:43 2011 +0800
> @@ -141,8 +141,9 @@
>  #define X86_FEATURE_TBM         (6*32+21) /* trailing bit manipulations */
>  #define X86_FEATURE_TOPOEXT     (6*32+22) /* topology extensions CPUID leafs */
>  
> -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
> +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */
>  #define X86_FEATURE_FSGSBASE (7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
> +#define X86_FEATURE_SMEP (7*32+ 7) /* Supervisor Mode Execution Protection */
>  
>  #define cpu_has(c, bit)  test_bit(bit, (c)->x86_capability)
>  #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
> @@ -202,6 +203,8 @@
>  #define cpu_has_fsgsbase boot_cpu_has(X86_FEATURE_FSGSBASE)
>  #endif
>  
> +#define cpu_has_smep            boot_cpu_has(X86_FEATURE_SMEP)
> +
>  #define cpu_has_ffxsr           ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \
>                                   && boot_cpu_has(X86_FEATURE_FFXSR))
>  
> diff -r 0791661a32d8 xen/include/asm-x86/domain.h
> --- a/xen/include/asm-x86/domain.h Thu Jun 02 18:46:35 2011 +0100
> +++ b/xen/include/asm-x86/domain.h Fri Jun 03 19:59:43 2011 +0800
> @@ -527,12 +527,14 @@ unsigned long pv_guest_cr4_fixup(const s
>  /* Convert between guest-visible and real CR4 values. */
>  #define pv_guest_cr4_to_real_cr4(v)                         \
>      (((v)->arch.pv_vcpu.ctrlreg[4]                          \
> -      | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))    \
> -      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)         \
> -      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))              \
> -      & ~X86_CR4_DE)
> -#define real_cr4_to_pv_guest_cr4(c) \
> -    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | X86_CR4_OSXSAVE))
> +      | (mmu_cr4_features                                   \
> +         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP))      \
> +      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
> +      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))          \
> +     & ~X86_CR4_DE)
> +#define real_cr4_to_pv_guest_cr4(c)                         \
> +    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD        \
> +             | X86_CR4_OSXSAVE | X86_CR4_SMEP))
>  
>  void domain_cpuid(struct domain *d,
>                    unsigned int  input,
> diff -r 0791661a32d8 xen/include/asm-x86/processor.h
> --- a/xen/include/asm-x86/processor.h Thu Jun 02 18:46:35 2011 +0100
> +++ b/xen/include/asm-x86/processor.h Fri Jun 03 19:59:43 2011 +0800
> @@ -85,6 +85,7 @@
>  #define X86_CR4_SMXE  0x4000  /* enable SMX */
>  #define X86_CR4_FSGSBASE 0x10000 /* enable {rd,wr}{fs,gs}base */
>  #define X86_CR4_OSXSAVE 0x40000 /* enable XSAVE/XRSTOR */
> +#define X86_CR4_SMEP  0x100000/* enable SMEP */
>  
>  /*
>   * Trap/fault mnemonics.


[-- Attachment #2: 00-xen-smep --]
[-- Type: application/octet-stream, Size: 8941 bytes --]

diff -r bcd2476c2e2d xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/arch/x86/traps.c	Fri Jun 03 18:17:44 2011 +0100
@@ -85,6 +85,10 @@ static char __read_mostly opt_nmi[10] = 
 #endif
 string_param("nmi", opt_nmi);
 
+/* smep: enable/disable Supervisor Mode Execution Protection (default on). */
+static bool_t __initdata disable_smep;
+invbool_param("smep", disable_smep);
+
 DEFINE_PER_CPU(u64, efer);
 
 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
@@ -1139,7 +1143,13 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
-static int __spurious_page_fault(
+enum pf_type {
+    real_fault,
+    smep_fault,
+    spurious_fault
+};
+
+static enum pf_type __page_fault_type(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
@@ -1151,7 +1161,7 @@ static int __spurious_page_fault(
 #endif
     l2_pgentry_t l2e, *l2t;
     l1_pgentry_t l1e, *l1t;
-    unsigned int required_flags, disallowed_flags;
+    unsigned int required_flags, disallowed_flags, page_user;
 
     /*
      * We do not take spurious page faults in IRQ handlers as we do not
@@ -1159,11 +1169,11 @@ static int __spurious_page_fault(
      * map_domain_page() is not IRQ-safe.
      */
     if ( in_irq() )
-        return 0;
+        return real_fault;
 
     /* Reserved bit violations are never spurious faults. */
     if ( error_code & PFEC_reserved_bit )
-        return 0;
+        return real_fault;
 
     required_flags  = _PAGE_PRESENT;
     if ( error_code & PFEC_write_access )
@@ -1175,6 +1185,8 @@ static int __spurious_page_fault(
     if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX_BIT;
 
+    page_user = _PAGE_USER;
+
     mfn = cr3 >> PAGE_SHIFT;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1184,7 +1196,8 @@ static int __spurious_page_fault(
     unmap_domain_page(l4t);
     if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
          (l4e_get_flags(l4e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l4e_get_flags(l4e);
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 3
@@ -1197,13 +1210,14 @@ static int __spurious_page_fault(
     unmap_domain_page(l3t);
 #if CONFIG_PAGING_LEVELS == 3
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
+        return real_fault;
 #else
     if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
          (l3e_get_flags(l3e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l3e_get_flags(l3e);
     if ( l3e_get_flags(l3e) & _PAGE_PSE )
-        return 1;
+        goto leaf;
 #endif
 #endif
 
@@ -1213,9 +1227,10 @@ static int __spurious_page_fault(
     unmap_domain_page(l2t);
     if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
          (l2e_get_flags(l2e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l2e_get_flags(l2e);
     if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
+        goto leaf;
 
     l1t = map_domain_page(mfn);
     l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
@@ -1223,26 +1238,36 @@ static int __spurious_page_fault(
     unmap_domain_page(l1t);
     if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
          (l1e_get_flags(l1e) & disallowed_flags) )
-        return 0;
-
-    return 1;
+        return real_fault;
+    page_user &= l1e_get_flags(l1e);
+
+leaf:
+    /*
+     * Supervisor Mode Execution Protection (SMEP):
+     * Disallow supervisor execution from user-accessible mappings
+     */
+    if ( (read_cr4() & X86_CR4_SMEP) && page_user &&
+         ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
+        return smep_fault;
+
+    return spurious_fault;
 }
 
-static int spurious_page_fault(
+static enum pf_type spurious_page_fault(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
-    int           is_spurious;
+    enum pf_type pf_type;
 
     /*
      * Disabling interrupts prevents TLB flushing, and hence prevents
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, error_code);
+    pf_type = __page_fault_type(addr, error_code);
     local_irq_restore(flags);
 
-    return is_spurious;
+    return pf_type;
 }
 
 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
@@ -1317,6 +1342,7 @@ asmlinkage void do_page_fault(struct cpu
 {
     unsigned long addr, fixup;
     unsigned int error_code;
+    enum pf_type pf_type;
 
     addr = read_cr2();
 
@@ -1332,7 +1358,9 @@ asmlinkage void do_page_fault(struct cpu
 
     if ( unlikely(!guest_mode(regs)) )
     {
-        if ( spurious_page_fault(addr, error_code) )
+        pf_type = spurious_page_fault(addr, error_code);
+        BUG_ON(pf_type == smep_fault);
+        if ( pf_type != real_fault )
             return;
 
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
@@ -1354,9 +1382,17 @@ asmlinkage void do_page_fault(struct cpu
               error_code, _p(addr));
     }
 
-    if ( unlikely(current->domain->arch.suppress_spurious_page_faults
-                  && spurious_page_fault(addr, error_code)) )
-        return;
+    if ( unlikely(current->domain->arch.suppress_spurious_page_faults) )
+    {
+        pf_type = spurious_page_fault(addr, error_code);
+        if ( pf_type == smep_fault )
+        {
+            gdprintk(XENLOG_ERR, "Fatal SMEP fault\n");
+            domain_crash(current->domain);
+        }
+        if ( pf_type != real_fault )
+            return;
+    }
 
     propagate_page_fault(addr, regs->error_code);
 }
@@ -3425,6 +3461,11 @@ void __init trap_init(void)
     cpu_init();
 
     open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
+
+    if ( disable_smep )
+        setup_clear_cpu_cap(X86_FEATURE_SMEP);
+    if ( cpu_has_smep )
+        set_in_cr4(X86_CR4_SMEP);
 }
 
 long register_guest_nmi_callback(unsigned long address)
diff -r bcd2476c2e2d xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/cpufeature.h	Fri Jun 03 18:17:44 2011 +0100
@@ -141,8 +141,9 @@
 #define X86_FEATURE_TBM         (6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT     (6*32+22) /* topology extensions CPUID leafs */
 
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */
 #define X86_FEATURE_FSGSBASE	(7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
+#define X86_FEATURE_SMEP	(7*32+ 7) /* Supervisor Mode Execution Protection */
 
 #define cpu_has(c, bit)		test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)	test_bit(bit, boot_cpu_data.x86_capability)
@@ -202,6 +203,8 @@
 #define cpu_has_fsgsbase	boot_cpu_has(X86_FEATURE_FSGSBASE)
 #endif
 
+#define cpu_has_smep            boot_cpu_has(X86_FEATURE_SMEP)
+
 #define cpu_has_ffxsr           ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 
diff -r bcd2476c2e2d xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/domain.h	Fri Jun 03 18:17:44 2011 +0100
@@ -527,12 +527,14 @@ unsigned long pv_guest_cr4_fixup(const s
 /* Convert between guest-visible and real CR4 values. */
 #define pv_guest_cr4_to_real_cr4(v)                         \
     (((v)->arch.pv_vcpu.ctrlreg[4]                          \
-      | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))    \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)         \
-      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))              \
-      & ~X86_CR4_DE)
-#define real_cr4_to_pv_guest_cr4(c) \
-    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | X86_CR4_OSXSAVE))
+      | (mmu_cr4_features                                   \
+         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP))      \
+      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
+      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))          \
+     & ~X86_CR4_DE)
+#define real_cr4_to_pv_guest_cr4(c)                         \
+    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD        \
+             | X86_CR4_OSXSAVE | X86_CR4_SMEP))
 
 void domain_cpuid(struct domain *d,
                   unsigned int  input,
diff -r bcd2476c2e2d xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/processor.h	Fri Jun 03 18:17:44 2011 +0100
@@ -85,6 +85,7 @@
 #define X86_CR4_SMXE		0x4000  /* enable SMX */
 #define X86_CR4_FSGSBASE	0x10000 /* enable {rd,wr}{fs,gs}base */
 #define X86_CR4_OSXSAVE	0x40000 /* enable XSAVE/XRSTOR */
+#define X86_CR4_SMEP		0x100000/* enable SMEP */
 
 /*
  * Trap/fault mnemonics.



* RE: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-03 17:23 ` Keir Fraser
@ 2011-06-03 18:49   ` Li, Xin
  2011-06-03 19:22     ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-03 18:49 UTC
  To: Keir Fraser; +Cc: xen-devel

> > New Intel CPUs support SMEP (Supervisor Mode Execution Protection). SMEP
> > prevents software operating with CPL < 3 (supervisor mode) from fetching
> > instructions from any linear address with a valid translation for which the
> > U/S flag (bit 2) is 1 in every paging-structure entry controlling the
> > translation for the linear address.
> >
> > This patch enables SMEP in Xen to protect the hypervisor from executing pv
> > guest instructions, i.e. instructions whose translation paging-structure
> > entries all have the U/S flag set.
> 
> Xin,
> 
> I have attached a modified version of your patch which I believe deals with
> a number of issues. I briefly outline the major ones here:
> 
> 1. The initialisation in cpu/common.c is misguided. Features like
> set_in_cr4(), setup_clear_cpu_cap(), and cpu_has_xxx, are designed to allow
> the feature initialisation to happen close in code to where the feature is
> used. Hence I have moved the initialisation into traps.c:trap_init().

It's the right way to move.

But trap_init() is called before Xen does leaf 7.0 detection, thus we also need to
add leaf 7.0 detection in early_cpu_detect to get boot_cpu_data.x86_capability[7]
initialized before trap_init().
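
Something along these lines in early_cpu_detect() should do (untested
sketch; assumes the usual cpuid_count() helper and the word-7 layout the
patch already uses):

    /* Untested sketch: populate capability word 7 early, so that
     * cpu_has_smep is usable by the time trap_init() runs. */
    if ( c->cpuid_level >= 7 )
    {
        u32 eax, ebx, ecx, edx;
        cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
        c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx;
    }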

> 2. I have renamed the command-line option to 'smep'. Boolean options can be
> inverted by prepending 'no-', or appending '={0,false,off,no}'. We don't
> generally encode the negative within the base option name string.
> 
Thanks for doing it in this better way.

> 3. I have pushed interpretation of the pf_type enumeration out to the
> callers of spurious_page_fault(). This is because a SMEP fault while Xen is
> executing should *crash Xen*. So that's what I do. Further, when it is a

I'm still wondering: isn't it overkill to kill Xen?  Can't an evil guest
then crash Xen?

> guest fault, we should use domain_crash(), not domain_crash_synchronous()
> (which is generally a more dangerous function for Xen's own health).
> 
> More generally, I wonder whether SMEP should be enabled only for guests
> (even PV guests) which detect it via CPUID and proactively enable it via
> their virtualised CR4? I mean, it is off in real hardware by default for a
> reason: backward compatibility. Furthermore, we only detect spurious page
> faults for buggy old PV guests, the rest will get the SMEP fault punted up
> to them, which seems odd if they don't understand SMEP. What do you think?

A 64-bit pv guest can't use SMEP, as its kernel code is user accessible.
Thus a SMEP fault won't trigger while the guest executes; it can only
trigger while Xen executes.  In that case it's okay to crash Xen (or just
the guest).

A 32-bit pv guest should be able to make use of SMEP.  When the fault comes
from Xen, we crash Xen.  When it comes from the guest kernel executing user
code, we can inject the fault into the guest and let it kill the current
process.  Of course, in such cases the guest should be able to do SMEP
handling itself.
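
Roughly like this (hypothetical sketch only; it reuses propagate_page_fault()
from the patch, and is_pv_32bit_domain() is assumed here):

    /* Hypothetical sketch: punt a guest-kernel SMEP fault to a 32-bit pv
     * guest that can handle it, instead of crashing it outright. */
    if ( pf_type == smep_fault && guest_mode(regs) &&
         is_pv_32bit_domain(current->domain) )
        propagate_page_fault(addr, regs->error_code);
    else if ( pf_type == smep_fault )
        domain_crash(current->domain);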

So we can't handle it consistently for 64-bit and 32-bit guests.

Thanks!
-Xin


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-03 18:49   ` Li, Xin
@ 2011-06-03 19:22     ` Keir Fraser
  2011-06-03 19:37       ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2011-06-03 19:22 UTC
  To: Li, Xin; +Cc: xen-devel


Hi Xin,

New patch attached and comments in-line.

On 03/06/2011 19:49, "Li, Xin" <xin.li@intel.com> wrote:

>> 1. The initialisation in cpu/common.c is misguided. Features like
>> set_in_cr4(), setup_clear_cpu_cap(), and cpu_has_xxx, are designed to allow
>> the feature initialisation to happen close in code to where the feature is
>> used. Hence I have moved the initialisation into traps.c:trap_init().
> 
> It's the right way to move.
> 
> But trap_init() is called before Xen does leaf 7.0 detection, thus we also
> need to add leaf 7.0 detection in early_cpu_detect to get
> boot_cpu_data.x86_capability[7] initialized before trap_init().

Oooo good point. Fixed in the attached patch by moving SMEP setup into
setup.c, explicitly and immediately after identify_cpu(). I was actually in
two minds whether to fix this by extending early_cpu_detect(). Overall I
decided I don't want to grow e_c_d() if not absolutely necessary. If the CPU
setup stuff in setup.c grows too big and ugly I'd rather refactor it another
way.

>> 3. I have pushed interpretation of the pf_type enumeration out to the
>> callers of spurious_page_fault(). This is because a SMEP fault while Xen is
>> executing should *crash Xen*. So that's what I do. Further, when it is a
> 
> I'm still wondering: isn't it overkill to kill Xen?  Can't an evil guest
> then crash Xen?

An evil guest has to penetrate Xen before it can crash it in this way. If
Xen has been subverted, and is lucky enough to notice, what should it do?
The only sane answer is to shoot itself in the head. This kind of issue
would require immediate developer attention to fix whatever Xen bug had been
exploited to trigger SMEP.

> A 32-bit pv guest should be able to make use of SMEP.  When the fault comes
> from Xen, we crash Xen.  When it comes from the guest kernel executing user
> code, we can inject the fault into the guest and let it kill the current
> process.  Of course, in such cases the guest should be able to do SMEP
> handling itself.

Haha, give over on this idea that unexplainable behaviour should make you
only crash the process/guest. If your behaviour is unexplainable, and you
have pretensions of security, you fail-stop.

> So we can't handle it consistently for 64-bit and 32-bit guests.

Well yeah, but that ignores my actual question, which was...
"""I wonder whether SMEP should be enabled only for guests (even PV guests)
which detect it via CPUID and proactively enable it via their virtualised
CR4? I mean, it is off in real hardware by default for a reason: backward
compatibility. Furthermore, we only detect spurious page faults for buggy
old PV guests, the rest will get the SMEP fault punted up to them, which
seems odd if they don't understand SMEP."""

I mean, I know we may as well just hide the feature from PV 64b guests
totally. That's obvious. Let's stop talking about PV 64b guests already! The
question is: what to do about PV 32b guests?

 -- Keir

> Thanks!
> -Xin


[-- Attachment #2: 00-xen-smep --]
[-- Type: application/octet-stream, Size: 9386 bytes --]

diff -r bcd2476c2e2d xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/arch/x86/setup.c	Fri Jun 03 20:09:02 2011 +0100
@@ -57,6 +57,10 @@ integer_param("maxcpus", max_cpus);
 static bool_t __initdata opt_watchdog;
 boolean_param("watchdog", opt_watchdog);
 
+/* smep: Enable/disable Supervisor Mode Execution Protection (default on). */
+static bool_t __initdata disable_smep;
+invbool_param("smep", disable_smep);
+
 /* **** Linux config option: propagated to domain0. */
 /* "acpi=off":    Sisables both ACPI table parsing and interpreter. */
 /* "acpi=force":  Override the disable blacklist.                   */
@@ -1200,11 +1204,17 @@ void __init __start_xen(unsigned long mb
     arch_init_memory();
 
     identify_cpu(&boot_cpu_data);
+
     if ( cpu_has_fxsr )
         set_in_cr4(X86_CR4_OSFXSR);
     if ( cpu_has_xmm )
         set_in_cr4(X86_CR4_OSXMMEXCPT);
 
+    if ( disable_smep )
+        setup_clear_cpu_cap(X86_FEATURE_SMEP);
+    if ( cpu_has_smep )
+        set_in_cr4(X86_CR4_SMEP);
+
     local_irq_enable();
 
 #ifdef CONFIG_X86_64
diff -r bcd2476c2e2d xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/arch/x86/traps.c	Fri Jun 03 20:09:02 2011 +0100
@@ -1139,7 +1139,13 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
-static int __spurious_page_fault(
+enum pf_type {
+    real_fault,
+    smep_fault,
+    spurious_fault
+};
+
+static enum pf_type __page_fault_type(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
@@ -1151,7 +1157,7 @@ static int __spurious_page_fault(
 #endif
     l2_pgentry_t l2e, *l2t;
     l1_pgentry_t l1e, *l1t;
-    unsigned int required_flags, disallowed_flags;
+    unsigned int required_flags, disallowed_flags, page_user;
 
     /*
      * We do not take spurious page faults in IRQ handlers as we do not
@@ -1159,11 +1165,11 @@ static int __spurious_page_fault(
      * map_domain_page() is not IRQ-safe.
      */
     if ( in_irq() )
-        return 0;
+        return real_fault;
 
     /* Reserved bit violations are never spurious faults. */
     if ( error_code & PFEC_reserved_bit )
-        return 0;
+        return real_fault;
 
     required_flags  = _PAGE_PRESENT;
     if ( error_code & PFEC_write_access )
@@ -1175,6 +1181,8 @@ static int __spurious_page_fault(
     if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX_BIT;
 
+    page_user = _PAGE_USER;
+
     mfn = cr3 >> PAGE_SHIFT;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1184,7 +1192,8 @@ static int __spurious_page_fault(
     unmap_domain_page(l4t);
     if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
          (l4e_get_flags(l4e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l4e_get_flags(l4e);
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 3
@@ -1197,13 +1206,14 @@ static int __spurious_page_fault(
     unmap_domain_page(l3t);
 #if CONFIG_PAGING_LEVELS == 3
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
+        return real_fault;
 #else
     if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
          (l3e_get_flags(l3e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l3e_get_flags(l3e);
     if ( l3e_get_flags(l3e) & _PAGE_PSE )
-        return 1;
+        goto leaf;
 #endif
 #endif
 
@@ -1213,9 +1223,10 @@ static int __spurious_page_fault(
     unmap_domain_page(l2t);
     if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
          (l2e_get_flags(l2e) & disallowed_flags) )
-        return 0;
+        return real_fault;
+    page_user &= l2e_get_flags(l2e);
     if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
+        goto leaf;
 
     l1t = map_domain_page(mfn);
     l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
@@ -1223,26 +1234,36 @@ static int __spurious_page_fault(
     unmap_domain_page(l1t);
     if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
          (l1e_get_flags(l1e) & disallowed_flags) )
-        return 0;
-
-    return 1;
+        return real_fault;
+    page_user &= l1e_get_flags(l1e);
+
+leaf:
+    /*
+     * Supervisor Mode Execution Protection (SMEP):
+     * Disallow supervisor execution from user-accessible mappings
+     */
+    if ( (read_cr4() & X86_CR4_SMEP) && page_user &&
+         ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
+        return smep_fault;
+
+    return spurious_fault;
 }
 
-static int spurious_page_fault(
+static enum pf_type spurious_page_fault(
     unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
-    int           is_spurious;
+    enum pf_type pf_type;
 
     /*
      * Disabling interrupts prevents TLB flushing, and hence prevents
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, error_code);
+    pf_type = __page_fault_type(addr, error_code);
     local_irq_restore(flags);
 
-    return is_spurious;
+    return pf_type;
 }
 
 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
@@ -1317,6 +1338,7 @@ asmlinkage void do_page_fault(struct cpu
 {
     unsigned long addr, fixup;
     unsigned int error_code;
+    enum pf_type pf_type;
 
     addr = read_cr2();
 
@@ -1332,7 +1354,9 @@ asmlinkage void do_page_fault(struct cpu
 
     if ( unlikely(!guest_mode(regs)) )
     {
-        if ( spurious_page_fault(addr, error_code) )
+        pf_type = spurious_page_fault(addr, error_code);
+        BUG_ON(pf_type == smep_fault);
+        if ( pf_type != real_fault )
             return;
 
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
@@ -1354,9 +1378,17 @@ asmlinkage void do_page_fault(struct cpu
               error_code, _p(addr));
     }
 
-    if ( unlikely(current->domain->arch.suppress_spurious_page_faults
-                  && spurious_page_fault(addr, error_code)) )
-        return;
+    if ( unlikely(current->domain->arch.suppress_spurious_page_faults) )
+    {
+        pf_type = spurious_page_fault(addr, error_code);
+        if ( pf_type == smep_fault )
+        {
+            gdprintk(XENLOG_ERR, "Fatal SMEP fault\n");
+            domain_crash(current->domain);
+        }
+        if ( pf_type != real_fault )
+            return;
+    }
 
     propagate_page_fault(addr, regs->error_code);
 }
diff -r bcd2476c2e2d xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/cpufeature.h	Fri Jun 03 20:09:02 2011 +0100
@@ -141,8 +141,9 @@
 #define X86_FEATURE_TBM         (6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT     (6*32+22) /* topology extensions CPUID leafs */
 
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */
 #define X86_FEATURE_FSGSBASE	(7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
+#define X86_FEATURE_SMEP	(7*32+ 7) /* Supervisor Mode Execution Protection */
 
 #define cpu_has(c, bit)		test_bit(bit, (c)->x86_capability)
 #define boot_cpu_has(bit)	test_bit(bit, boot_cpu_data.x86_capability)
@@ -202,6 +203,8 @@
 #define cpu_has_fsgsbase	boot_cpu_has(X86_FEATURE_FSGSBASE)
 #endif
 
+#define cpu_has_smep            boot_cpu_has(X86_FEATURE_SMEP)
+
 #define cpu_has_ffxsr           ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \
                                  && boot_cpu_has(X86_FEATURE_FFXSR))
 
diff -r bcd2476c2e2d xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/domain.h	Fri Jun 03 20:09:02 2011 +0100
@@ -527,12 +527,14 @@ unsigned long pv_guest_cr4_fixup(const s
 /* Convert between guest-visible and real CR4 values. */
 #define pv_guest_cr4_to_real_cr4(v)                         \
     (((v)->arch.pv_vcpu.ctrlreg[4]                          \
-      | (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE))    \
-      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)         \
-      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))              \
-      & ~X86_CR4_DE)
-#define real_cr4_to_pv_guest_cr4(c) \
-    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | X86_CR4_OSXSAVE))
+      | (mmu_cr4_features                                   \
+         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP))      \
+      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
+      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))          \
+     & ~X86_CR4_DE)
+#define real_cr4_to_pv_guest_cr4(c)                         \
+    ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD        \
+             | X86_CR4_OSXSAVE | X86_CR4_SMEP))
 
 void domain_cpuid(struct domain *d,
                   unsigned int  input,
diff -r bcd2476c2e2d xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h	Fri Jun 03 17:27:01 2011 +0100
+++ b/xen/include/asm-x86/processor.h	Fri Jun 03 20:09:02 2011 +0100
@@ -85,6 +85,7 @@
 #define X86_CR4_SMXE		0x4000  /* enable SMX */
 #define X86_CR4_FSGSBASE	0x10000 /* enable {rd,wr}{fs,gs}base */
 #define X86_CR4_OSXSAVE	0x40000 /* enable XSAVE/XRSTOR */
+#define X86_CR4_SMEP		0x100000/* enable SMEP */
 
 /*
  * Trap/fault mnemonics.



* RE: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-03 19:22     ` Keir Fraser
@ 2011-06-03 19:37       ` Li, Xin
  2011-06-03 20:15         ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-03 19:37 UTC
  To: Keir Fraser; +Cc: xen-devel

One last comment from Jan: he suggested using
    page_user = (read_cr4() & X86_CR4_SMEP) ? _PAGE_USER : 0;
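
With that, the CR4.SMEP test moves into the initialization, and the check
at the leaf of the walk reduces to testing page_user alone (illustrative
sketch):

    page_user = (read_cr4() & X86_CR4_SMEP) ? _PAGE_USER : 0;
    ...
    /* at the leaf: */
    if ( page_user &&
         ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
        return smep_fault;
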
thanks!
-Xin

> -----Original Message-----
> From: Keir Fraser [mailto:keir.xen@gmail.com] On Behalf Of Keir Fraser
> Sent: Saturday, June 04, 2011 3:22 AM
> To: Li, Xin
> Cc: xen-devel; Jan Beulich
> Subject: Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
> 
> Hi Xin,
> 
> New patch attached and comments in-line.
> 
> On 03/06/2011 19:49, "Li, Xin" <xin.li@intel.com> wrote:
> 
> >> 1. The initialisation in cpu/common.c is misguided. Features like
> >> set_in_cr4(), setup_clear_cpu_cap(), and cpu_has_xxx, are designed to allow
> >> the feature initialisation to happen close in code to where the feature is
> >> used. Hence I have moved the initialisation into traps.c:trap_init().
> >
> > It's the right way to move.
> >
> > But trap_init() is called before Xen does leaf 7.0 detection, thus we also
> > need to add leaf 7.0 detection in early_cpu_detect to get
> > boot_cpu_data.x86_capability[7] initialized before trap_init().
> 
> Oooo good point. Fixed in the attached patch by moving SMEP setup into
> setup.c, explicitly and immediately after identify_cpu(). I was actually in
> two minds whether to fix this by extending early_cpu_detect(). Overall I
> decided I don't want to grow e_c_d() if not absolutely necessary. If the CPU
> setup stuff in setup.c grows too big and ugly I'd rather refactor it another
> way.
> 
> >> 3. I have pushed interpretation of the pf_type enumeration out to the
> >> callers of spurious_page_fault(). This is because a SMEP fault while Xen is
> >> executing should *crash Xen*. So that's what I do. Further, when it is a
> >
> > I'm still wondering: isn't it overkill to kill Xen?  Can't an evil guest
> > then crash Xen?
> 
> An evil guest has to penetrate Xen before it can crash it in this way. If
> Xen has been subverted, and is lucky enough to notice, what should it do?
> The only sane answer is to shoot itself in the head. This kind of issue
> would require immediate developer attention to fix whatever Xen bug had been
> exploited to trigger SMEP.
> 
> > A 32-bit pv guest should be able to make use of SMEP.  When the fault
> > comes from Xen, we crash Xen.  When it comes from the guest kernel
> > executing user code, we can inject the fault into the guest and let it
> > kill the current process.  Of course, in such cases the guest should be
> > able to do SMEP handling itself.
> 
> Haha, give over on this idea that unexplainable behaviour should make you
> only crash the process/guest. If your behaviour is unexplainable, and you
> have pretensions of security, you fail-stop.
> 
> > So we can't handle it consistently for 64-bit and 32-bit guests.
> 
> Well yeah, but that ignores my actual question, which was...
> """I wonder whether SMEP should be enabled only for guests (even PV guests)
> which detect it via CPUID and proactively enable it via their virtualised
> CR4? I mean, it is off in real hardware by default for a reason: backward
> compatibility. Furthermore, we only detect spurious page faults for buggy
> old PV guests, the rest will get the SMEP fault punted up to them, which
> seems odd if they don't understand SMEP."""
> 
> I mean, I know we may as well just hide the feature from PV 64b guests
> totally. That's obvious. Let's stop talking about PV 64b guests already! The
> question is: what to do about PV 32b guests?
> 
>  -- Keir
> 
> > Thanks!
> > -Xin


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-03 19:37       ` Li, Xin
@ 2011-06-03 20:15         ` Keir Fraser
  0 siblings, 0 replies; 14+ messages in thread
From: Keir Fraser @ 2011-06-03 20:15 UTC
  To: Li, Xin; +Cc: xen-devel, Jan Beulich

On 03/06/2011 20:37, "Li, Xin" <xin.li@intel.com> wrote:

> One last comment from Jan: he suggested using
>     page_user = (read_cr4() & X86_CR4_SMEP) ? _PAGE_USER : 0;

I effectively have the same X86_CR4_SMEP check at the bottom of
__page_fault_type() (the renamed __spurious_page_fault()), replacing your
check of cpu_has_smep.

 -- Keir

> thanks!
> -Xin
> 
>> -----Original Message-----
>> From: Keir Fraser [mailto:keir.xen@gmail.com] On Behalf Of Keir Fraser
>> Sent: Saturday, June 04, 2011 3:22 AM
>> To: Li, Xin
>> Cc: xen-devel; Jan Beulich
>> Subject: Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
>> 
>> Hi Xin,
>> 
>> New patch attached and comments in-line.
>> 
>> On 03/06/2011 19:49, "Li, Xin" <xin.li@intel.com> wrote:
>> 
>>>> 1. The initialisation in cpu/common.c is misguided. Features like
>>>> set_in_cr4(), setup_clear_cpu_cap(), and cpu_has_xxx, are designed to allow
>>>> the feature initialisation to happen close in code to where the feature is
>>>> used. Hence I have moved the initialisation into traps.c:trap_init().
>>> 
>>> It's the right way to move.
>>> 
>>> But trap_init() is called before Xen does leaf 7.0 detection, thus we also
>>> need to add leaf 7.0 detection in early_cpu_detect to get
>>> boot_cpu_data.x86_capability[7] initialized before trap_init().
>> 
>> Oooo good point. Fixed in the attached patch by moving SMEP setup into
>> setup.c, explicitly and immediately after identify_cpu(). I was actually in
>> two minds whether to fix this by extending early_cpu_detect(). Overall I
>> decided I don't want to grow e_c_d() if not absolutely necessary. If the CPU
>> setup stuff in setup.c grows too big and ugly I'd rather refactor it another
>> way.
>> 
>>>> 3. I have pushed interpretation of the pf_type enumeration out to the
>>>> callers of spurious_page_fault(). This is because a SMEP fault while Xen is
>>>> executing should *crash Xen*. So that's what I do. Further, when it is a
>>> 
>>> I'm still wondering: isn't it overkill to kill Xen?  Can't an evil guest
>>> then crash Xen?
>> 
>> An evil guest has to penetrate Xen before it can crash it in this way. If
>> Xen has been subverted, and is lucky enough to notice, what should it do?
>> The only sane answer is to shoot itself in the head. This kind of issue
>> would require immediate developer attention to fix whatever Xen bug had been
>> exploited to trigger SMEP.
>> 
>>> A 32-bit pv guest should be able to make use of SMEP.  When the fault
>>> comes from Xen, we crash Xen.  When it comes from the guest kernel
>>> executing user code, we can inject the fault into the guest and let it
>>> kill the current process.  Of course, in such cases the guest should be
>>> able to do SMEP handling itself.
>> 
>> Haha, give over on this idea that unexplainable behaviour should make you
>> only crash the process/guest. If your behaviour is unexplainable, and you
>> have pretensions of security, you fail-stop.
>> 
>>> So we can't handle it consistently for 64-bit and 32-bit guests.
>> 
>> Well yeah, but that ignores my actual question, which was...
>> """I wonder whether SMEP should be enabled only for guests (even PV guests)
>> which detect it via CPUID and proactively enable it via their virtualised
>> CR4? I mean, it is off in real hardware by default for a reason: backward
>> compatibility. Furthermore, we only detect spurious page faults for buggy
>> old PV guests, the rest will get the SMEP fault punted up to them, which
>> seems odd if they don't understand SMEP."""
>> 
>> I mean, I know we may as well just hide the feature from PV 64b guests
>> totally. That's obvious. Let's stop talking about PV 64b guests already! The
>> question is: what to do about PV 32b guests?
>> 
>>  -- Keir
>> 
>>> Thanks!
>>> -Xin
> 


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-05 15:43     ` Li, Xin
@ 2011-06-05 17:04       ` Keir Fraser
  0 siblings, 0 replies; 14+ messages in thread
From: Keir Fraser @ 2011-06-05 17:04 UTC
  To: Li, Xin, Jan Beulich; +Cc: xen-devel

On 05/06/2011 16:43, "Li, Xin" <xin.li@intel.com> wrote:

>>> That needs:
>>> 1) injecting SMEP faults back into the 32-bit pv guest.
>>> 2) letting the guest see SMEP through CPUID and configure it in CR4 (it's
>>> actually already set; this is just to let the guest see it).
>>> 
>>> Anything else?
>> 
>> I thought about this myself and realised that we can't let PV guests control
>> this feature if we want Xen to benefit from it. There's little point in a
>> feature to protect Xen from guests, if an untrusted guest can turn it off!
>> 
>> Hence I think we probably have to leave the feature always on for PV guests.
>> Unless we find some guests are incompatible with that.
> 
> As we discussed, a 64-bit pv kernel can't trigger SMEP faults, so we decided
> not to let it see this feature.

Yes, we're talking about 32b guests here.

> Maybe it's okay to inject SMEP faults triggered from a 32-bit pv kernel back
> into it, even if the guest is not aware of SMEP.

We don't really have a choice about it, if SMEP is enabled. We don't usually
call spurious_page_fault() for guest faults. And as I said, we can't allow a
32b PV guest to disable SMEP, as SMEP is protecting Xen too.

> We can refer to the Linux SMEP patch, which just turns this feature on
> without touching any page table handling functions, as SMEP handling can
> actually reuse the NX handling code path.

Yeah, most likely we can turn this on for PV guests, without them really
knowing about it, and nothing will break.

> Ingo wanted to expose SMEP to KVM guests silently, but that is not
> architecturally right, as the TLB must be flushed when CR4.SMEP is changed
> or a kernel page is changed to a user page.  Luckily Linux has no such
> cases and thus doesn't need to flush the TLB.

HVM guests are a separate matter and we will want to make SMEP properly
configurable by the guest, as I believe your current patch proposes.

 -- Keir

> Thanks!
> -Xin


* RE: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-05 15:10   ` Keir Fraser
@ 2011-06-05 15:43     ` Li, Xin
  2011-06-05 17:04       ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-05 15:43 UTC (permalink / raw)
  To: Keir Fraser, Jan Beulich; +Cc: xen-devel

> > That needs:
> > 1) injecting SMEP faults back to the 32-bit pv guest;
> > 2) letting the guest see SMEP through CPUID and configure it in CR4
> > (it's actually already set; this just lets the guest see it).
> >
> > Anything else?
> 
> I thought about this myself and realised that we can't let PV guests control
> this feature if we want Xen to benefit from it. There's little point in a
> feature to protect Xen from guests, if an untrusted guest can turn it off!
> 
> Hence I think we probably have to leave the feature always on for PV guests.
> Unless we find some guests are incompatible with that.

As we discussed, a 64-bit pv kernel can't itself trigger SMEP faults (it
runs in ring 3), so we decided not to let it see this feature.

Maybe it's okay to inject SMEP faults which are triggered from a 32-bit pv
kernel back to it even if the guest is not aware of SMEP.
We can refer to the Linux SMEP patch, which just turns this feature on
without touching any page-table handling functions, as SMEP handling can
actually reuse the NX handling code path. Ingo wanted to expose SMEP to
KVM guests silently, but that is not architecturally correct, as the TLB
must be flushed when CR4.SMEP is changed or when a kernel page is changed
to a user page. Luckily, Linux has no such cases and thus doesn't need to
flush the TLB.
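
For reference, the architecturally clean sequence when changing CR4.SMEP
would be something like this sketch (read_cr4(), write_cr4() and
flush_tlb_local() as in the Xen tree):

    unsigned long cr4 = read_cr4();

    write_cr4(cr4 ^ X86_CR4_SMEP);  /* toggle CR4.SMEP ... */
    flush_tlb_local();              /* ... then flush: cached translations
                                       may still carry stale U/S
                                       permission information */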

Thanks!
-Xin


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
  2011-06-05  8:39 ` Li, Xin
@ 2011-06-05 15:10   ` Keir Fraser
  2011-06-05 15:43     ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2011-06-05 15:10 UTC (permalink / raw)
  To: Li, Xin, Jan Beulich; +Cc: xen-devel

On 05/06/2011 09:39, "Li, Xin" <xin.li@intel.com> wrote:

>> I mean, I know we may as well just hide the feature from PV 64b guests
>> totally. That's obvious. Let's stop talking about PV 64b guests already! The
>> question is: what to do about PV 32b guests?
> 
>> Quite obviously we ought to allow 32-bit pv guests to control this for
>> themselves (and hence see the feature).
> 
> That needs:
> 1) injecting SMEP faults back to the 32-bit pv guest;
> 2) letting the guest see SMEP through CPUID and configure it in CR4
> (it's actually already set; this just lets the guest see it).
> 
> Anything else?

I thought about this myself and realised that we can't let PV guests control
this feature if we want Xen to benefit from it. There's little point in a
feature to protect Xen from guests, if an untrusted guest can turn it off!

Hence I think we probably have to leave the feature always on for PV guests.
Unless we find some guests are incompatible with that.

 -- Keir

>> Besides that, assuming Xin verified it's working, your latest patch
>> looks great to me.
> 
> Yeah, verified: the system crashed on a SMEP fault from a 64-bit pv kernel.
> Thanks!
> -Xin


* RE: [PATCH v2] Enable SMEP CPU feature support for XEN  hypervisor
  2011-06-05  7:36 Jan Beulich
@ 2011-06-05  8:39 ` Li, Xin
  2011-06-05 15:10   ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-05  8:39 UTC (permalink / raw)
  To: Jan Beulich, keir; +Cc: xen-devel

>I mean, I know we may as well just hide the feature from PV 64b guests
>totally. That's obvious. Let's stop talking about PV 64b guests already! The
>question is: what to do about PV 32b guests?

>Quite obviously we ought to allow 32-bit pv guests to control this for
>themselves (and hence see the feature).

That needs:
1) injecting SMEP faults back to the 32-bit pv guest;
2) letting the guest see SMEP through CPUID and configure it in CR4 (it's
actually already set; this just lets the guest see it).

Anything else?
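
For (2), the CPUID side might look roughly like the sketch below
(hypothetical placement in Xen's pv CPUID filtering; the exact case label
and predicate depend on the tree):

    case 0x00000007:
        /* SMEP is CPUID.(EAX=7,ECX=0):EBX bit 7. */
        if ( !is_pv_32bit_domain(current->domain) )
            b &= ~cpufeat_mask(X86_FEATURE_SMEP);  /* hide from 64-bit pv */
        break;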

>Besides that, assuming Xin verified it's working, your latest patch
>looks great to me.

Yeah, verified: the system crashed on a SMEP fault from a 64-bit pv kernel.
Thanks!
-Xin


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
@ 2011-06-05  7:36 Jan Beulich
  2011-06-05  8:39 ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2011-06-05  7:36 UTC (permalink / raw)
  To: xin.li, keir; +Cc: xen-devel



>>> Keir Fraser  06/04/11 5:32 PM >>>
>I mean, I know we may as well just hide the feature from PV 64b guests
>totally. That's obvious. Let's stop talking about PV 64b guests already! The
>question is: what to do about PV 32b guests?

Quite obviously we ought to allow 32-bit pv guests to control this for
themselves (and hence see the feature).

Besides that, assuming Xin verified it's working, your latest patch
looks great to me.

Jan



* RE: [PATCH v2] Enable SMEP CPU feature support for  XEN hypervisor
  2011-06-03 16:18 ` Li, Xin
@ 2011-06-03 17:09   ` Li, Xin
  0 siblings, 0 replies; 14+ messages in thread
From: Li, Xin @ 2011-06-03 17:09 UTC (permalink / raw)
  To: Li, Xin, Jan Beulich, keir.xen; +Cc: xen-devel

> >+    if ( pf_type == smep_fault )
> >+        domain_crash_synchronous();
> 
> >I don't think you can domain_crash_synchronous() here, domain_crash()
> >is perhaps the only possibility (and even if it was possible, I think there
> >was agreement to not add new calls to the former function and always
> >use the latter instead).
> >
> 
> So change to domain_crash()?
> 
> I'm curious about the background: how was that agreement reached?
> 

We can't use domain_crash(): it returns to the caller, and falling through
there causes a system reboot.  Actually I chose domain_crash_synchronous()
because it doesn't return.

Alternatively, we could change pf_type to spurious_fault just before
calling crash, to avoid crashing Xen, but that looks ugly.
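
If the call sites were restructured, domain_crash() could still work; the
key is to return immediately after it so we never fall through into the
fatal-fault path.  An untested sketch, assuming spurious_page_fault() were
changed to return the enum pf_type:

    enum pf_type pf_type = spurious_page_fault(addr, regs->error_code);

    if ( pf_type == smep_fault )
    {
        domain_crash(current->domain);
        return;  /* do not continue as if the fault had been handled */
    }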

Thanks!
-Xin


* RE: [PATCH v2] Enable SMEP CPU feature support for  XEN hypervisor
  2011-06-03 14:36 Jan Beulich
@ 2011-06-03 16:18 ` Li, Xin
  2011-06-03 17:09   ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin @ 2011-06-03 16:18 UTC (permalink / raw)
  To: Jan Beulich, keir.xen; +Cc: xen-devel

>>+/* nosmep: if true, Intel SMEP is disabled. */
>>+static bool_t disable_smep __cpuinitdata;
>
>__initdata
>
>>+boolean_param("nosmep", disable_smep);
>>+
>>+static void __cpuinit setup_smep(struct cpuinfo_x86 *c)
>
>__init

I did want to change the code as you suggested, so that it is only called
on the BP code path, but I ran into an abnormal system response issue.  To
fix it, I would need to change some of the AP boot code path as well,
while this patch is just about SMEP.

Actually your suggestion applies to some other system initialization
variables and functions too.  I prefer to clean up all such cases in a
separate patchset.

I also mentioned this in another reply to Keir.
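
For reference, the boot-processor-only shape being suggested would look
something like this (untested sketch; APs would then pick the bit up from
mmu_cr4_features):

    static bool_t disable_smep __initdata;
    boolean_param("nosmep", disable_smep);

    static void __init setup_smep(struct cpuinfo_x86 *c)
    {
        if ( !cpu_has(c, X86_FEATURE_SMEP) )
            return;

        if ( disable_smep )
            setup_clear_cpu_cap(X86_FEATURE_SMEP);
        else
            set_in_cr4(X86_CR4_SMEP);
    }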

>>+{
>>+    if ( cpu_has(c, X86_FEATURE_SMEP) ) {
>>+        if ( unlikely(disable_smep) )
>>+            setup_clear_cpu_cap(X86_FEATURE_SMEP);
>>+        else {
>>+            set_in_cr4(X86_CR4_SMEP);
>>+            mmu_cr4_features |= X86_CR4_SMEP;

>Why? We told you on the previous patch version that
>set_in_cr4() already does this.

I misunderstood the code even though you reminded me...
 
>>@@ -268,6 +284,8 @@ void __cpuinit generic_identify(struct c
>>        c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx;
>>    }
>>
>>+    setup_smep(c);
>
>Still misplaced.
>

As said above, it should be ok then.

>>+    u_flag = _PAGE_USER;

>How about
>
>u_flag = cpu_has_smep ? _PAGE_USER : 0;
>
>avoiding the repeated cpu_has_smep checks below.

Will do in the next version.

>     * Disabling interrupts prevents TLB flushing, and hence prevents
>     * page tables from becoming invalid under our feet during the walk.
>     */
>    local_irq_save(flags);
>-    is_spurious = __spurious_page_fault(addr, error_code);
>+    pf_type = __page_fault_type(addr, error_code);
>    local_irq_restore(flags);
>
>-    return is_spurious;
>+    if ( pf_type == smep_fault )
>+        domain_crash_synchronous();

>I don't think you can domain_crash_synchronous() here, domain_crash()
>is perhaps the only possibility (and even if it was possible, I think there
>was agreement to not add new calls to the former function and always
>use the latter instead).
>

So change to domain_crash()?

I'm curious about the background: how was that agreement reached?

>That said I'm not certain it's a good choice at all, as Keir had already
>pointed out on the first patch version.

Do you mean to crash the pv guest?

One case it can prevent: an evil pv guest plants malicious code at
virtual address 0 and then exploits a NULL-pointer dereference in the
hypervisor to gain control of the system, although such cases are rare.
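
In outline, the bug class SMEP closes off looks like this (illustrative
code, not from Xen):

    struct ops {
        void (*handler)(void);
    };

    void dispatch(struct ops *o)
    {
        /*
         * If o is NULL and the guest has mapped virtual address 0, the
         * load below reads a guest-chosen function pointer, and the call
         * then executes guest user-mode code in ring 0 -- unless SMEP
         * faults the instruction fetch.
         */
        o->handler();
    }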

>Finally, spurious_page_fault() has two call sites, only one of which is
>for faults that occurred in hypervisor context (which you seem to
>assume is always the case). For the one where it's getting called
>for a fault in guest context, using domain_crash() is probably correct,
>but a (possibly guest-chosen) alternative would be to forward the
>(non-spurious) fault to the guest. Again I think Keir hinted in that
>direction already in response to the first patch version.

For a 64-bit pv guest, a SMEP fault can only happen in hypervisor mode
(ring 0).

For a 32-bit pv guest, a SMEP fault can happen in both hypervisor mode and
guest kernel mode (rings 0 and 1).  When it happens in ring 1, we could
probably inject it into the guest kernel and let it kill the currently
running process.  But we have decided not to let the guest see SMEP, as
Keir said the guest must not be able to influence whether SMEP is
enabled/disabled.

Thanks!
-Xin


* Re: [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor
@ 2011-06-03 14:36 Jan Beulich
  2011-06-03 16:18 ` Li, Xin
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2011-06-03 14:36 UTC (permalink / raw)
  To: keir.xen, xin.li; +Cc: xen-devel



>>> "Li, Xin"  06/03/11 3:59 PM >>>
>--- a/xen/arch/x86/cpu/common.c    Thu Jun 02 18:46:35 2011 +0100
>+++ b/xen/arch/x86/cpu/common.c    Fri Jun 03 19:59:43 2011 +0800
>@@ -222,6 +222,22 @@ static void __init early_cpu_detect(void
>    c->x86_capability[4] = cap4;
>}
 >
>+/* nosmep: if true, Intel SMEP is disabled. */
>+static bool_t disable_smep __cpuinitdata;

__initdata

>+boolean_param("nosmep", disable_smep);
>+
>+static void __cpuinit setup_smep(struct cpuinfo_x86 *c)

__init

>+{
>+    if ( cpu_has(c, X86_FEATURE_SMEP) ) {
>+        if ( unlikely(disable_smep) )
>+            setup_clear_cpu_cap(X86_FEATURE_SMEP);
>+        else {
>+            set_in_cr4(X86_CR4_SMEP);
>+            mmu_cr4_features |= X86_CR4_SMEP;

Why? We told you on the previous patch version that
set_in_cr4() already does this.
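
For reference, set_in_cr4() is already, in outline:

    void set_in_cr4(unsigned long mask)
    {
        mmu_cr4_features |= mask;      /* recorded for later CPUs */
        write_cr4(read_cr4() | mask);  /* applied to this CPU */
    }

so the explicit mmu_cr4_features update above is redundant.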

>+        }
>+    }
>+}
>+
>void __cpuinit generic_identify(struct cpuinfo_x86 * c)
>{
>    u32 tfms, xlvl, capability, excap, ebx;
>@@ -268,6 +284,8 @@ void __cpuinit generic_identify(struct c
>        c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx;
>    }
 >
>+    setup_smep(c);

Still misplaced.

>+
>    early_intel_workaround(c);
 >
>#ifdef CONFIG_X86_HT
>diff -r 0791661a32d8 xen/arch/x86/traps.c
>--- a/xen/arch/x86/traps.c    Thu Jun 02 18:46:35 2011 +0100
>+++ b/xen/arch/x86/traps.c    Fri Jun 03 19:59:43 2011 +0800
>@@ -1139,7 +1139,13 @@ static int handle_gdt_ldt_mapping_fault(
>    (((va) >= HYPERVISOR_VIRT_START))
>#endif
>
>-static int __spurious_page_fault(
>+enum pf_type {
>+    real_page_fault,
>+    smep_fault,
>+    spurious_fault
>+};
>+
>+static enum pf_type __page_fault_type(
>    unsigned long addr, unsigned int error_code)
>{
>    unsigned long mfn, cr3 = read_cr3();
>@@ -1151,7 +1157,7 @@ static int __spurious_page_fault(
>#endif
>    l2_pgentry_t l2e, *l2t;
>    l1_pgentry_t l1e, *l1t;
>-    unsigned int required_flags, disallowed_flags;
>+    unsigned int required_flags, disallowed_flags, u_flag;
>
>    /*
>     * We do not take spurious page faults in IRQ handlers as we do not
>@@ -1159,11 +1165,11 @@ static int __spurious_page_fault(
>     * map_domain_page() is not IRQ-safe.
>     */
>    if ( in_irq() )
>-        return 0;
>+        return real_page_fault;
>
>    /* Reserved bit violations are never spurious faults. */
>    if ( error_code & PFEC_reserved_bit )
>-        return 0;
>+        return real_page_fault;
>
>    required_flags  = _PAGE_PRESENT;
>    if ( error_code & PFEC_write_access )
>@@ -1175,6 +1181,8 @@ static int __spurious_page_fault(
>    if ( error_code & PFEC_insn_fetch )
>        disallowed_flags |= _PAGE_NX_BIT;
>
>+    u_flag = _PAGE_USER;

How about

u_flag = cpu_has_smep ? _PAGE_USER : 0;

avoiding the repeated cpu_has_smep checks below.
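
With that initialisation the per-level masking stays as it is, and the
final check reduces to (sketch):

    u_flag &= l1e_get_flags(l1e);

    /* SMEP: instruction fetch from supervisor mode, walk all-user. */
    if ( (error_code & (PFEC_insn_fetch | PFEC_user_mode)) ==
         PFEC_insn_fetch && u_flag )
        return smep_fault;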

>+
>    mfn = cr3 >> PAGE_SHIFT;
>
>#if CONFIG_PAGING_LEVELS >= 4
>@@ -1184,7 +1192,9 @@ static int __spurious_page_fault(
>    unmap_domain_page(l4t);
>    if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
>         (l4e_get_flags(l4e) & disallowed_flags) )
>-        return 0;
>+        return real_page_fault;
>+
>+    u_flag &= l4e_get_flags(l4e);
>#endif
>
>#if CONFIG_PAGING_LEVELS >= 3
>@@ -1197,13 +1207,23 @@ static int __spurious_page_fault(
>    unmap_domain_page(l3t);
>#if CONFIG_PAGING_LEVELS == 3
>    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
>-        return 0;
>+        return real_page_fault;
>#else
>    if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
>         (l3e_get_flags(l3e) & disallowed_flags) )
>-        return 0;
>-    if ( l3e_get_flags(l3e) & _PAGE_PSE )
>-        return 1;
>+        return real_page_fault;
>+
>+    u_flag &= l3e_get_flags(l3e);
>+
>+    if ( l3e_get_flags(l3e) & _PAGE_PSE ) {
>+        /* SMEP fault error code 10001b */
>+        if ( (error_code & PFEC_insn_fetch) &&
>+             !(error_code & PFEC_user_mode) &&
>+             cpu_has_smep && u_flag )
>+            return smep_fault;
>+
>+        return spurious_fault;
>+    }
>#endif
>#endif
>
>@@ -1213,9 +1233,19 @@ static int __spurious_page_fault(
>    unmap_domain_page(l2t);
>    if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
>         (l2e_get_flags(l2e) & disallowed_flags) )
>-        return 0;
>-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
>-        return 1;
>+        return real_page_fault;
>+
>+    u_flag &= l2e_get_flags(l2e);
>+
>+    if ( l2e_get_flags(l2e) & _PAGE_PSE ) {
>+        /* SMEP fault error code 10001b */
>+        if ( (error_code & PFEC_insn_fetch) &&
>+             !(error_code & PFEC_user_mode) &&
>+             cpu_has_smep && u_flag )
>+            return smep_fault;
>+
>+        return spurious_fault;
>+    }
>
>    l1t = map_domain_page(mfn);
>    l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
>@@ -1223,26 +1253,37 @@ static int __spurious_page_fault(
>    unmap_domain_page(l1t);
>    if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
>         (l1e_get_flags(l1e) & disallowed_flags) )
>-        return 0;
>-
>-    return 1;
>+        return real_page_fault;
>+
>+    u_flag &= l1e_get_flags(l1e);
>+
>+    /* SMEP fault error code 10001b */
>+    if ( (error_code & PFEC_insn_fetch) &&
>+         !(error_code & PFEC_user_mode) &&
>+         cpu_has_smep && u_flag )
>+        return smep_fault;
>+
>+    return spurious_fault;
>}
>
>static int spurious_page_fault(
>    unsigned long addr, unsigned int error_code)
>{
>    unsigned long flags;
>-    int           is_spurious;
>+    enum pf_type  pf_type;
>
>    /*
>     * Disabling interrupts prevents TLB flushing, and hence prevents
>     * page tables from becoming invalid under our feet during the walk.
>     */
>    local_irq_save(flags);
>-    is_spurious = __spurious_page_fault(addr, error_code);
>+    pf_type = __page_fault_type(addr, error_code);
>    local_irq_restore(flags);
>
>-    return is_spurious;
>+    if ( pf_type == smep_fault )
>+        domain_crash_synchronous();

I don't think you can domain_crash_synchronous() here, domain_crash()
is perhaps the only possibility (and even if it was possible, I think there
was agreement to not add new calls to the former function and always
use the latter instead).

That said I'm not certain it's a good choice at all, as Keir had already
pointed out on the first patch version.

Finally, spurious_page_fault() has two call sites, only one of which is
for faults that occurred in hypervisor context (which you seem to
assume is always the case). For the one where it's getting called
for a fault in guest context, using domain_crash() is probably correct,
but a (possibly guest-chosen) alternative would be to forward the
(non-spurious) fault to the guest. Again I think Keir hinted in that
direction already in response to the first patch version.
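
The forwarding variant would be along these lines (sketch; guest_mode()
and propagate_page_fault() as in the existing tree):

    if ( pf_type == smep_fault )
    {
        if ( guest_mode(regs) )
            propagate_page_fault(addr, regs->error_code);
        else
            domain_crash(current->domain);
    }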

>+
>+    return (pf_type == spurious_fault);
>}
 >
>static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)

Jan



end of thread

Thread overview: 14+ messages
2011-06-03 13:57 [PATCH v2] Enable SMEP CPU feature support for XEN hypervisor Li, Xin
2011-06-03 17:23 ` Keir Fraser
2011-06-03 18:49   ` Li, Xin
2011-06-03 19:22     ` Keir Fraser
2011-06-03 19:37       ` Li, Xin
2011-06-03 20:15         ` Keir Fraser
2011-06-03 14:36 Jan Beulich
2011-06-03 16:18 ` Li, Xin
2011-06-03 17:09   ` Li, Xin
2011-06-05  7:36 Jan Beulich
2011-06-05  8:39 ` Li, Xin
2011-06-05 15:10   ` Keir Fraser
2011-06-05 15:43     ` Li, Xin
2011-06-05 17:04       ` Keir Fraser
