* [PATCH 0/4] x86: don't default to executable mappings
@ 2015-05-18 10:28 Jan Beulich
  2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-18 10:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

In particular for the 1:1 mapping it was pointed out that, in order to
limit the damage from security issues, we should avoid mapping things
executable when they don't need to be.

1: move syscall trampolines off the stack
2: emul: move stubs off the stack
3: move I/O emulation stubs off the stack
4: switch default mapping attributes to non-executable

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
@ 2015-05-18 12:46 ` Jan Beulich
  2015-05-18 18:39   ` Andrew Cooper
  2015-05-19 16:59   ` Andrew Cooper
  2015-05-18 12:46 ` [PATCH 2/4] x86emul: move stubs " Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-18 12:46 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser


This is needed as stacks are going to become non-executable. Use
separate stub pages (shared among suitable CPUs on the same node)
instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
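
[Editorial reading aid, not part of the patch: each CPU gets a
STUB_BUF_SIZE slot within a stub page shared with the other CPUs of its
STUBS_PER_PAGE group on the same node, mapped executable just below
XEN_VIRT_END. A minimal sketch of the slot arithmetic, using only names
introduced by the hunks below; stub_page_va stands for whatever
alloc_stub_page() returned.]

    /* Sketch only -- mirrors the expressions used in cpu_smpboot_alloc()
     * and subarch_percpu_traps_init() below. */
    static inline unsigned long stub_slot_va(unsigned long stub_page_va,
                                             unsigned int cpu)
    {
        /* Each CPU owns one STUB_BUF_SIZE slot within its group's page. */
        return stub_page_va + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
    }

    /*
     * Slot layout (cf. the ASSERT() in subarch_percpu_traps_init()):
     *   [0, STUB_BUF_SIZE/2)             LSTAR and CSTAR trampolines
     *   [STUB_BUF_SIZE/2, STUB_BUF_SIZE) kept free for the emulation
     *                                    stubs of the later patches
     */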

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1270,6 +1270,10 @@ void __init noreturn __start_xen(unsigne
 
     init_idle_domain();
 
+    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+                                           &this_cpu(stubs).mfn);
+    BUG_ON(!this_cpu(stubs.addr));
+
     trap_init();
 
     rcu_init();
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -25,6 +25,7 @@
 #include <xen/kernel.h>
 #include <xen/mm.h>
 #include <xen/domain.h>
+#include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <xen/sched-if.h>
 #include <xen/irq.h>
@@ -603,6 +604,42 @@ static int do_boot_cpu(int apicid, int c
     return rc;
 }
 
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+    unsigned long stub_va;
+    struct page_info *pg;
+
+    if ( *mfn )
+        pg = mfn_to_page(*mfn);
+    else
+    {
+        nodeid_t node = cpu_to_node(cpu);
+        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+        pg = alloc_domheap_page(NULL, memflags);
+        if ( !pg )
+            return 0;
+
+        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+    }
+
+    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+    {
+        if ( !*mfn )
+            free_domheap_page(pg);
+        stub_va = 0;
+    }
+    else
+    {
+        stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
+        *mfn = page_to_mfn(pg);
+    }
+
+    return stub_va;
+}
+
 void cpu_exit_clear(unsigned int cpu)
 {
     cpu_uninit(cpu);
@@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
 
+    if ( per_cpu(stubs.addr, cpu) )
+    {
+        unsigned long mfn = per_cpu(stubs.mfn, cpu);
+        unsigned char *stub_page = map_domain_page(mfn);
+        unsigned int i;
+
+        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
+               0xcc, STUB_BUF_SIZE);
+        for ( i = 0; i < STUBS_PER_PAGE; ++i )
+            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
+                break;
+        unmap_domain_page(stub_page);
+        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+        if ( i == STUBS_PER_PAGE )
+            free_domheap_page(mfn_to_page(mfn));
+    }
+
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     free_xenheap_pages(per_cpu(gdt_table, cpu), order);
 
@@ -635,9 +690,10 @@ static void cpu_smpboot_free(unsigned in
 
 static int cpu_smpboot_alloc(unsigned int cpu)
 {
-    unsigned int order, memflags = 0;
+    unsigned int i, order, memflags = 0;
     nodeid_t node = cpu_to_node(cpu);
     struct desc_struct *gdt;
+    unsigned long stub_page;
 
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
@@ -667,6 +723,20 @@ static int cpu_smpboot_alloc(unsigned in
         goto oom;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
 
+    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+        if ( cpu_online(i) && cpu_to_node(i) == node )
+        {
+            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+            break;
+        }
+    BUG_ON(i == cpu);
+    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+    if ( !stub_page )
+        goto oom;
+    per_cpu(stubs.addr, cpu) = stub_page +
+                               (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
+
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
          zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
         return 0;
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -219,7 +219,19 @@ ENTRY(compat_post_handle_exception)
         movb  $0,TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
-ENTRY(compat_syscall)
+ENTRY(cstar_enter)
+        sti
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_USER_CS32
+        pushq %rcx
+        pushq $0
+        SAVE_VOLATILE TRAP_syscall
+        GET_CURRENT(%rbx)
+        movq  VCPU_domain(%rbx),%rcx
+        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
+        je    switch_to_kernel
         cmpb  $0,VCPU_syscall32_disables_events(%rbx)
         movzwl VCPU_syscall32_sel(%rbx),%esi
         movq  VCPU_syscall32_addr(%rbx),%rax
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -13,9 +13,8 @@
 #include <public/xen.h>
 #include <irq_vectors.h>
 
-        ALIGN
 /* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
         leaq  VCPU_trap_bounce(%rbx),%rdx
         /* TB_eip = (32-bit syscall && syscall32_addr) ?
          *          syscall32_addr : syscall_addr */
@@ -114,22 +113,21 @@ restore_all_xen:
  *  Vector directly to the registered arch.syscall_addr.
  *
  * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * has been initialised to point at the correct Xen stack, %rsp has been
+ * saved, and %rax needs to be restored from the %ss save slot. All other
+ * registers are still to be saved onto the stack, starting with RFLAGS, and
+ * an appropriate %ss must be saved into the space left by the trampoline.
  */
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
         sti
-        movl  $FLAT_KERNEL_SS,24(%rsp)
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_KERNEL_CS64
         pushq %rcx
         pushq $0
-        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
         SAVE_VOLATILE TRAP_syscall
         GET_CURRENT(%rbx)
-        movq  VCPU_domain(%rbx),%rcx
-        testb $1,DOMAIN_is_32bit_pv(%rcx)
-        jnz   compat_syscall
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -337,70 +337,78 @@ unsigned long do_iret(void)
     return 0;
 }
 
-static int write_stack_trampoline(
-    char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+    unsigned char *stub, unsigned long stub_va,
+    unsigned long stack_bottom, unsigned long target_va)
 {
-    /* movq %rsp, saversp(%rip) */
-    stack[0] = 0x48;
-    stack[1] = 0x89;
-    stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
-    /* leaq saversp(%rip), %rsp */
-    stack[7] = 0x48;
-    stack[8] = 0x8d;
-    stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
-    /* pushq %r11 */
-    stack[14] = 0x41;
-    stack[15] = 0x53;
-
-    /* pushq $<cs_seg> */
-    stack[16] = 0x68;
-    *(u32 *)&stack[17] = cs_seg;
-
-    /* movq $syscall_enter,%r11 */
-    stack[21] = 0x49;
-    stack[22] = 0xbb;
-    *(void **)&stack[23] = (void *)syscall_enter;
-
-    /* jmpq *%r11 */
-    stack[31] = 0x41;
-    stack[32] = 0xff;
-    stack[33] = 0xe3;
+    /* movabsq %rax, stack_bottom - 8 */
+    stub[0] = 0x48;
+    stub[1] = 0xa3;
+    *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+    /* movq %rsp, %rax */
+    stub[10] = 0x48;
+    stub[11] = 0x89;
+    stub[12] = 0xe0;
+
+    /* movabsq $stack_bottom - 8, %rsp */
+    stub[13] = 0x48;
+    stub[14] = 0xbc;
+    *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+    /* pushq %rax */
+    stub[23] = 0x50;
+
+    /* jmp target_va */
+    stub[24] = 0xe9;
+    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
 
-    return 34;
+    /* Round up to a multiple of 16 bytes. */
+    return 32;
 }
 
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
 void __devinit subarch_percpu_traps_init(void)
 {
-    char *stack_bottom, *stack;
-
-    stack_bottom = (char *)get_stack_bottom();
-    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned char *stub_page;
+    unsigned int offset;
 
     /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
     BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
 
-    /* Trampoline for SYSCALL entry from long mode. */
-    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
-    wrmsrl(MSR_LSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
+    stub_page = map_domain_page(this_cpu(stubs.mfn));
+
+    /* Trampoline for SYSCALL entry from 64-bit mode. */
+    wrmsrl(MSR_LSTAR, stub_va);
+    offset = write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
 
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
          boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
     {
         /* SYSENTER entry. */
-        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
         wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
         wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
     }
 
     /* Trampoline for SYSCALL entry from compatibility mode. */
-    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
-    wrmsrl(MSR_CSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+    wrmsrl(MSR_CSTAR, stub_va);
+    offset += write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
 
     /* Common SYSCALL parameters. */
     wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -217,4 +217,7 @@ SECTIONS
   .comment 0 : { *(.comment) }
 }
 
+ASSERT(__image_base__ > XEN_VIRT_START ||
+       _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+       "Xen image overlaps stubs area")
 ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -94,6 +94,10 @@
 /* Primary stack is restricted to 8kB by guard pages. */
 #define PRIMARY_STACK_SIZE 8192
 
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT max(L1_CACHE_SHIFT, 7)
+#define STUB_BUF_SIZE  (1 << STUB_BUF_SHIFT)
+
 /* Return value for zero-size _xmalloc(), distinguished from NULL. */
 #define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
 
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -320,10 +320,10 @@ void paging_init(void);
 #define _PAGE_GNTTAB   0
 #endif
 
-#define __PAGE_HYPERVISOR \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
+                                   _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
 
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -532,12 +532,24 @@ void trap_nop(void);
 void enable_nmis(void);
 void do_reserved_trap(struct cpu_user_regs *regs);
 
-void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
 void compat_hypercall(void);
 void int80_direct_trap(void);
 
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
+struct stubs {
+    union {
+        void(*func)(void);
+        unsigned long addr;
+    };
+    unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
 extern int hypercall(void);
 
 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -148,6 +148,7 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
 
 #endif /* __X86_64_PAGE_H__ */




* [PATCH 2/4] x86emul: move stubs off the stack
  2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
  2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
@ 2015-05-18 12:46 ` Jan Beulich
  2015-05-19 17:33   ` Andrew Cooper
  2015-05-18 12:47 ` [PATCH 3/4] x86: move I/O emulation " Jan Beulich
  2015-05-18 12:47 ` [PATCH 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
  3 siblings, 1 reply; 19+ messages in thread
From: Jan Beulich @ 2015-05-18 12:46 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser


This is needed as stacks are going to become non-executable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
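
[Editorial reading aid, not part of the patch: inside the hypervisor the
new get_stub()/put_stub() pair maps the per-CPU stub page writable, while
execution happens through the fixed, executable stubs.addr mapping. A
minimal sketch of the usage pattern; "insn"/"len" are placeholders for
the instruction bytes the emulator wants to run.]

    /* Sketch only; get_stub()/put_stub() and struct x86_emulate_stub are
     * as introduced by this patch. */
    static void run_insn_stub(const uint8_t *insn, unsigned int len)
    {
        struct x86_emulate_stub stub = {};
        uint8_t *buf = get_stub(stub); /* writable view of this CPU's slot */

        memcpy(buf, insn, len);
        buf[len] = 0xc3;               /* ret back to the caller */
        stub.func();                   /* executes via the RX stubs mapping */
        put_stub(stub);                /* unmap the writable view again */
    }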

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -17,4 +17,8 @@ typedef bool bool_t;
 #define __packed __attribute__((packed))
 
 #include "x86_emulate/x86_emulate.h"
+
+#define get_stub(stb) ((void *)((stb).addr = (uintptr_t)(stb).buf))
+#define put_stub(stb)
+
 #include "x86_emulate/x86_emulate.c"
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -9,6 +9,7 @@
  *    Keir Fraser <keir@xen.org>
  */
 
+#include <xen/domain_page.h>
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */
@@ -17,8 +18,22 @@
 /* Avoid namespace pollution. */
 #undef cmpxchg
 #undef cpuid
+#undef wbinvd
 
 #define cpu_has_amd_erratum(nr) \
         cpu_has_amd_erratum(&current_cpu_data, AMD_ERRATUM_##nr)
 
+#define get_stub(stb) ({                                   \
+    (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2; \
+    ((stb).ptr = map_domain_page(this_cpu(stubs.mfn))) +   \
+        ((stb).addr & (PAGE_SIZE - 1));                    \
+})
+#define put_stub(stb) ({                                   \
+    if ( (stb).ptr )                                       \
+    {                                                      \
+        unmap_domain_page((stb).ptr);                      \
+        (stb).ptr = NULL;                                  \
+    }                                                      \
+})
+
 #include "x86_emulate/x86_emulate.c"
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -717,11 +717,14 @@ do{ struct fpu_insn_ctxt fic;           
 } while (0)
 
 #define emulate_fpu_insn_stub(_bytes...)                                \
-do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
-    struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
+do{ uint8_t *buf = get_stub(stub);                                      \
+    unsigned int _nr = sizeof((uint8_t[]){ _bytes });                   \
+    struct fpu_insn_ctxt fic = { .insn_bytes = _nr };                    \
+    memcpy(buf, ((uint8_t[]){ _bytes, 0xc3 }), _nr + 1);                \
     get_fpu(X86EMUL_FPU_fpu, &fic);                                     \
-    (*(void(*)(void))stub)();                                           \
+    stub.func();                                                        \
     put_fpu(&fic);                                                      \
+    put_stub(stub);                                                     \
 } while (0)
 
 static unsigned long _get_rep_prefix(
@@ -1458,6 +1461,7 @@ x86_emulate(
     struct operand src = { .reg = REG_POISON };
     struct operand dst = { .reg = REG_POISON };
     enum x86_swint_type swint_type;
+    struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -3792,6 +3796,7 @@ x86_emulate(
 
  done:
     _put_fpu();
+    put_stub(stub);
     return rc;
 
  twobyte_insn:
@@ -4007,9 +4012,15 @@ x86_emulate(
                /* {,v}movss xmm,xmm/m32 */
                /* {,v}movsd xmm,xmm/m64 */
     {
-        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
-        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
+        uint8_t *buf = get_stub(stub);
+        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
 
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        buf[5] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -4017,7 +4028,7 @@ x86_emulate(
             else
                 vcpu_must_have_sse();
             ea.bytes = 16;
-            SET_SSE_PREFIX(stub[0], vex.pfx);
+            SET_SSE_PREFIX(buf[0], vex.pfx);
             get_fpu(X86EMUL_FPU_xmm, &fic);
         }
         else
@@ -4044,15 +4055,16 @@ x86_emulate(
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
-            stub[4] &= 0x38;
+            buf[4] &= 0x38;
         }
         if ( !rc )
         {
-           copy_REX_VEX(stub, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+           copy_REX_VEX(buf, rex_prefix, vex);
+           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
                                      : "memory" );
         }
         put_fpu(&fic);
+        put_stub(stub);
         if ( !rc && (b & 1) && (ea.type == OP_MEM) )
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
@@ -4242,9 +4254,15 @@ x86_emulate(
                /* {,v}movdq{a,u} xmm,xmm/m128 */
                /* vmovdq{a,u} ymm,ymm/m256 */
     {
-        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
-        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
+        uint8_t *buf = get_stub(stub);
+        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
 
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        buf[5] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             switch ( vex.pfx )
@@ -4252,7 +4270,7 @@ x86_emulate(
             case vex_66:
             case vex_f3:
                 vcpu_must_have_sse2();
-                stub[0] = 0x66; /* movdqa */
+                buf[0] = 0x66; /* movdqa */
                 get_fpu(X86EMUL_FPU_xmm, &fic);
                 ea.bytes = 16;
                 break;
@@ -4288,15 +4306,16 @@ x86_emulate(
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
-            stub[4] &= 0x38;
+            buf[4] &= 0x38;
         }
         if ( !rc )
         {
-           copy_REX_VEX(stub, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+           copy_REX_VEX(buf, rex_prefix, vex);
+           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
                                      : "memory" );
         }
         put_fpu(&fic);
+        put_stub(stub);
         if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
@@ -4638,5 +4657,6 @@ x86_emulate(
 
  cannot_emulate:
     _put_fpu();
+    put_stub(stub);
     return X86EMUL_UNHANDLEABLE;
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -429,6 +429,18 @@ struct x86_emulate_ctxt
     } retire;
 };
 
+struct x86_emulate_stub {
+    union {
+        void (*func)(void);
+        uintptr_t addr;
+    };
+#ifdef __XEN__
+    void *ptr;
+#else
+    uint8_t buf[32];
+#endif
+};
+
 /*
  * x86_emulate: Emulate an instruction.
  * Returns -1 on failure, 0 on success.




* [PATCH 3/4] x86: move I/O emulation stubs off the stack
  2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
  2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
  2015-05-18 12:46 ` [PATCH 2/4] x86emul: move stubs " Jan Beulich
@ 2015-05-18 12:47 ` Jan Beulich
  2015-05-19 17:48   ` Andrew Cooper
  2015-05-18 12:47 ` [PATCH 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
  3 siblings, 1 reply; 19+ messages in thread
From: Jan Beulich @ 2015-05-18 12:47 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser


This is needed as stacks are going to become non-executable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
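
[Editorial reading aid, not part of the patch: as in the previous patch,
the stub bytes are written through a map_domain_page() mapping of the
per-CPU stub page but executed through the fixed, executable stubs.addr
mapping; both refer to the upper half of this CPU's STUB_BUF_SIZE slot.
A minimal sketch of that split; write_io_stub_bytes() is a placeholder
for the byte emission done inline in emulate_privileged_op().]

    /* Sketch only -- the two views of the same stub bytes used below. */
    static void run_io_emul_stub(struct cpu_user_regs *regs)
    {
        /* Writable view: upper half of this CPU's stub slot. */
        char *stub = map_domain_page(this_cpu(stubs.mfn)) +
                     (this_cpu(stubs.addr) & (PAGE_SIZE - 1)) +
                     STUB_BUF_SIZE / 2;
        /* Executable view of the same bytes. */
        void (*io_emul)(struct cpu_user_regs *)
            __attribute__((__regparm__(1))) =
            (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);

        write_io_stub_bytes(stub, regs); /* placeholder */
        io_emul(regs);
        unmap_domain_page(stub);
    }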

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2006,7 +2006,7 @@ static int emulate_privileged_op(struct 
                            ? (*(u32 *)&regs->reg = (val)) \
                            : (*(u16 *)&regs->reg = (val)))
     unsigned long code_base, code_limit;
-    char io_emul_stub[32];
+    char *io_emul_stub = NULL;
     void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
     uint64_t val;
 
@@ -2195,6 +2195,9 @@ static int emulate_privileged_op(struct 
      * GPR context. This is needed for some systems which (ab)use IN/OUT
      * to communicate with BIOS code in system-management mode.
      */
+    io_emul_stub = map_domain_page(this_cpu(stubs.mfn)) +
+                   (this_cpu(stubs.addr) & (PAGE_SIZE - 1)) +
+                   STUB_BUF_SIZE / 2;
     /* movq $host_to_guest_gpr_switch,%rcx */
     io_emul_stub[0] = 0x48;
     io_emul_stub[1] = 0xb9;
@@ -2212,7 +2215,7 @@ static int emulate_privileged_op(struct 
     io_emul_stub[15] = 0xc3;
 
     /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)io_emul_stub;
+    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
 
     if ( ioemul_handle_quirk )
         ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
@@ -2777,9 +2780,13 @@ static int emulate_privileged_op(struct 
  done:
     instruction_done(regs, eip, bpmatch);
  skip:
+    if ( io_emul_stub )
+        unmap_domain_page(io_emul_stub);
     return EXCRET_fault_fixed;
 
  fail:
+    if ( io_emul_stub )
+        unmap_domain_page(io_emul_stub);
     return 0;
 }
 





* [PATCH 4/4] x86: switch default mapping attributes to non-executable
  2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
                   ` (2 preceding siblings ...)
  2015-05-18 12:47 ` [PATCH 3/4] x86: move I/O emulation " Jan Beulich
@ 2015-05-18 12:47 ` Jan Beulich
  2015-05-19 18:53   ` Andrew Cooper
  3 siblings, 1 reply; 19+ messages in thread
From: Jan Beulich @ 2015-05-18 12:47 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser


Only a very limited subset of mappings needs to be executable; in
particular, the direct mapping should not be executable, so as to limit
the damage attackers can cause by exploiting security-relevant bugs.

The EFI change also includes an adjustment to set NX only when the
hardware supports it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
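
[Editorial illustration, not part of the patch: how callers pick mapping
attributes once this change is in place. The attribute names are those
defined by the hunks below; the va/mfn/nr arguments are placeholders.]

    /* Sketch only; in C code PAGE_HYPERVISOR now implies NX, and
     * executable mappings have to be requested explicitly. */
    static void __init attribute_examples(unsigned long va, unsigned long mfn,
                                          unsigned long nr)
    {
        map_pages_to_xen(va, mfn, nr, PAGE_HYPERVISOR);     /* data: RW, NX */
        map_pages_to_xen(va, mfn, nr, PAGE_HYPERVISOR_RO);  /* read-only, NX */
        map_pages_to_xen(va, mfn, nr, PAGE_HYPERVISOR_RWX); /* explicitly
                                                               executable */
    }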

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -293,7 +293,7 @@ struct vcpu_guest_context *alloc_vcpu_gu
             free_vcpu_guest_context(NULL);
             return NULL;
         }
-        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR);
+        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR_RW);
         per_cpu(vgc_pages[i], cpu) = pg;
     }
     return (void *)fix_to_virt(idx);
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -160,7 +160,7 @@ void *map_domain_page(unsigned long mfn)
 
     spin_unlock(&dcache->lock);
 
-    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR_RW));
 
  out:
     local_irq_restore(flags);
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4416,7 +4416,7 @@ long set_gdt(struct vcpu *v, 
     for ( i = 0; i < nr_pages; i++ )
     {
         v->arch.pv_vcpu.gdt_frames[i] = frames[i];
-        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
+        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR_RW));
     }
 
     xfree(pfns);
@@ -6004,7 +6004,7 @@ int create_perdomain_mapping(struct doma
                 if ( !IS_NIL(ppg) )
                     *ppg++ = pg;
                 l1tab[l1_table_offset(va)] =
-                    l1e_from_page(pg, __PAGE_HYPERVISOR | _PAGE_AVAIL0);
+                    l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
                 l2e_add_flags(*pl2e, _PAGE_AVAIL0);
             }
             else
@@ -6133,7 +6133,7 @@ void memguard_init(void)
         (unsigned long)__va(start),
         start >> PAGE_SHIFT,
         (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
-        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
+        __PAGE_HYPERVISOR_RW|MAP_SMALL_PAGES);
     BUG_ON(start != xen_phys_start);
     map_pages_to_xen(
         XEN_VIRT_START,
@@ -6146,7 +6146,7 @@ static void __memguard_change_range(void
 {
     unsigned long _p = (unsigned long)p;
     unsigned long _l = (unsigned long)l;
-    unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
+    unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
 
     /* Ensure we are dealing with a page-aligned whole number of pages. */
     ASSERT((_p&~PAGE_MASK) == 0);
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -900,7 +900,7 @@ void __init noreturn __start_xen(unsigne
             /* The only data mappings to be relocated are in the Xen area. */
             pl2e = __va(__pa(l2_xenmap));
             *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
-                                   PAGE_HYPERVISOR | _PAGE_PSE);
+                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
             for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
             {
                 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
@@ -1087,7 +1087,7 @@ void __init noreturn __start_xen(unsigne
             /* This range must not be passed to the boot allocator and
              * must also not be mapped with _PAGE_GLOBAL. */
             map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
-                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR);
+                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
         }
         if ( s < map_s )
         {
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -895,6 +895,33 @@ void __init subarch_init_memory(void)
             share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
         }
     }
+
+    /* Mark low 16Mb of direct map NX if hardware supports it. */
+    if ( !cpu_has_nx )
+        return;
+
+    v = DIRECTMAP_VIRT_START + (1UL << 20);
+    l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)];
+    ASSERT(l3e_get_flags(l3e) & _PAGE_PRESENT);
+    do {
+        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
+        ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
+        if ( l2e_get_flags(l2e) & _PAGE_PSE )
+        {
+            l2e_add_flags(l2e, _PAGE_NX_BIT);
+            l3e_to_l2e(l3e)[l2_table_offset(v)] = l2e;
+            v += 1 << L2_PAGETABLE_SHIFT;
+        }
+        else
+        {
+            l1_pgentry_t l1e = l2e_to_l1e(l2e)[l1_table_offset(v)];
+
+            ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
+            l1e_add_flags(l1e, _PAGE_NX_BIT);
+            l2e_to_l1e(l2e)[l1_table_offset(v)] = l1e;
+            v += 1 << L1_PAGETABLE_SHIFT;
+        }
+    } while ( v < DIRECTMAP_VIRT_START + (16UL << 20) );
 }
 
 long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -1359,7 +1386,7 @@ int memory_add(unsigned long spfn, unsig
         if ( i < spfn )
             i = spfn;
         ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
-                               epfn - i, __PAGE_HYPERVISOR);
+                               epfn - i, __PAGE_HYPERVISOR_RW);
         if ( ret )
             return ret;
     }
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1162,7 +1162,7 @@ void __init efi_init_memory(void)
         EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;
         u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT;
         unsigned long smfn, emfn;
-        unsigned int prot = PAGE_HYPERVISOR;
+        unsigned int prot = PAGE_HYPERVISOR_RWX;
 
         printk(XENLOG_INFO " %013" PRIx64 "-%013" PRIx64
                            " type=%u attr=%016" PRIx64 "\n",
@@ -1195,7 +1195,7 @@ void __init efi_init_memory(void)
         if ( desc->Attribute & EFI_MEMORY_WP )
             prot &= _PAGE_RW;
         if ( desc->Attribute & EFI_MEMORY_XP )
-            prot |= _PAGE_NX_BIT;
+            prot |= _PAGE_NX;
 
         if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
              !(smfn & pfn_hole_mask) &&
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -303,7 +303,8 @@ void paging_init(void);
 #define _PAGE_AVAIL1   _AC(0x400,U)
 #define _PAGE_AVAIL2   _AC(0x800,U)
 #define _PAGE_AVAIL    _AC(0xE00,U)
-#define _PAGE_PSE_PAT _AC(0x1000,U)
+#define _PAGE_PSE_PAT  _AC(0x1000,U)
+#define _PAGE_NX       (cpu_has_nx ? _PAGE_NX_BIT : 0)
 /* non-architectural flags */
 #define _PAGE_PAGED   0x2000U
 #define _PAGE_SHARED  0x4000U
@@ -320,6 +321,9 @@ void paging_init(void);
 #define _PAGE_GNTTAB   0
 #endif
 
+#define __PAGE_HYPERVISOR_RO      (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
+#define __PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RO | \
+                                   _PAGE_DIRTY | _PAGE_RW)
 #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
 #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
                                    _PAGE_DIRTY | _PAGE_RW)
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -147,9 +147,20 @@ typedef l4_pgentry_t root_pgentry_t;
  */
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
-#define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RO      (__PAGE_HYPERVISOR_RO      | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RW      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
-#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RWX     (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+
+#ifdef __ASSEMBLY__
+/* Dependency on NX being available can't be expressed. */
+# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RWX
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#else
+# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RW
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | \
+                                  _PAGE_GLOBAL | _PAGE_NX)
+#endif
 
 #endif /* __X86_64_PAGE_H__ */
 



[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
@ 2015-05-18 18:39   ` Andrew Cooper
  2015-05-19  6:41     ` Jan Beulich
  2015-05-19 16:59   ` Andrew Cooper
  1 sibling, 1 reply; 19+ messages in thread
From: Andrew Cooper @ 2015-05-18 18:39 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 18/05/15 13:46, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable. Use
> separate stub pages (shared among suitable CPUs on the same node)
> instead.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Can you please include a description of how you intend the stubs to
function, and how they are layed out?  Parts of the code look like a
single page per stub, while other bits look like several stubs per page.

(Personally, I would split the stub allocation/mapping/freeing into a
patch separately to moving the syscall trampolines, as each are
moderately complicated changes.)

~Andrew

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-18 18:39   ` Andrew Cooper
@ 2015-05-19  6:41     ` Jan Beulich
  2015-05-19  9:24       ` Andrew Cooper
  0 siblings, 1 reply; 19+ messages in thread
From: Jan Beulich @ 2015-05-19  6:41 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 18.05.15 at 20:39, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:46, Jan Beulich wrote:
>> This is needed as stacks are going to become non-executable. Use
>> separate stub pages (shared among suitable CPUs on the same node)
>> instead.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Can you please include a description of how you intend the stubs to
> function, and how they are layed out?  Parts of the code look like a
> single page per stub, while other bits look like several stubs per page.

I'm adding this to the already present description:

Stub areas (currently 128 bytes each) are being split into two parts -
a fixed usage one (the syscall ones) and dynamically usable space,
which will be used by subsequent changes to hold dynamically generated
code during instruction emulation.

While sharing physical pages among certain CPUs on the same node, for
now the virtual mappings get established in distinct pages for each
CPU. This isn't a strict requirement, but simplifies VA space
management for this initial implementation: Sharing VA space would
require additional tracking of which areas are currently in use. If
the VA and/or TLB overhead turned out to be a problem, such extra code
could easily be added.
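
Purely as an illustration of that split (a sketch only - the helper
names below are made up here, and STUBS_PER_PAGE is assumed to be
PAGE_SIZE / STUB_BUF_SIZE):

#define STUB_BUF_SIZE   128                         /* current area size */
#define STUBS_PER_PAGE  (PAGE_SIZE / STUB_BUF_SIZE)

/* Offset of a CPU's area within the page shared with its neighbours. */
#define STUB_AREA_OFFSET(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

/* First half of each area: fixed syscall trampolines; second half is
 * left for dynamically generated emulation stubs. */
#define STUB_DYN_OFFSET(cpu)  (STUB_AREA_OFFSET(cpu) + STUB_BUF_SIZE / 2)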

> (Personally, I would split the stub allocation/mapping/freeing into a
> patch separately to moving the syscall trampolines, as each are
> moderately complicated changes.)

I'm afraid this wouldn't work: Whether the stub page gets freed
depends on the first byte of each stub area - the page is kept as long
as any of them is other than 0xCC. Yet only the setting up of the
syscall stubs guarantees this (and I'm not really looking forward to
adding - however little - code to store a placeholder instead).

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-19  6:41     ` Jan Beulich
@ 2015-05-19  9:24       ` Andrew Cooper
  0 siblings, 0 replies; 19+ messages in thread
From: Andrew Cooper @ 2015-05-19  9:24 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 19/05/15 07:41, Jan Beulich wrote:
>>>> On 18.05.15 at 20:39, <andrew.cooper3@citrix.com> wrote:
>> On 18/05/15 13:46, Jan Beulich wrote:
>>> This is needed as stacks are going to become non-executable. Use
>>> separate stub pages (shared among suitable CPUs on the same node)
>>> instead.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Can you please include a description of how you intend the stubs to
>> function, and how they are layed out?  Parts of the code look like a
>> single page per stub, while other bits look like several stubs per page.
> I'm adding this to the already present description:
>
> Stub areas (currently 128 bytes each) are being split into two parts -
> a fixed usage one (the syscall ones) and dynamically usable space,
> which will be used by subsequent changes to hold dynamically generated
> code during instruction emulation.
>
> While sharing physical pages among certain CPUs on the same node, for
> now the virtual mappings get established in distinct pages for each
> CPU. This isn't a strict requirement, but simplifies VA space
> management for this initial implementation: Sharing VA space would
> require additional tracking of which areas are currently in use. If
> the VA and/or TLB overhead turned out to be a problem, such extra code
> could easily be added.

Thanks - this clarifies things a lot.

>
>> (Personally, I would split the stub allocation/mapping/freeing into a
>> patch separately to moving the syscall trampolines, as each are
>> moderately complicated changes.)
> I'm afraid this wouldn't work: Whether the stub page gets freed
> depends on the first byte of each stub area - the page is kept as long
> as any of them is other than 0xCC. Yet only the setting up of the
> syscall stubs guarantees this (and I'm not really looking forward to
> adding - however little - code to store a placeholder instead).

Ah ok.

~Andrew

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
  2015-05-18 18:39   ` Andrew Cooper
@ 2015-05-19 16:59   ` Andrew Cooper
  2015-05-20  9:16     ` Jan Beulich
                       ` (2 more replies)
  1 sibling, 3 replies; 19+ messages in thread
From: Andrew Cooper @ 2015-05-19 16:59 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 18/05/15 13:46, Jan Beulich wrote:
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -25,6 +25,7 @@
>  #include <xen/kernel.h>
>  #include <xen/mm.h>
>  #include <xen/domain.h>
> +#include <xen/domain_page.h>
>  #include <xen/sched.h>
>  #include <xen/sched-if.h>
>  #include <xen/irq.h>
> @@ -603,6 +604,42 @@ static int do_boot_cpu(int apicid, int c
>      return rc;
>  }
>  
> +unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
> +{
> +    unsigned long stub_va;
> +    struct page_info *pg;
> +
> +    if ( *mfn )
> +        pg = mfn_to_page(*mfn);
> +    else
> +    {
> +        nodeid_t node = cpu_to_node(cpu);
> +        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
> +
> +        pg = alloc_domheap_page(NULL, memflags);
> +        if ( !pg )
> +            return 0;
> +
> +        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));

Seems like memset_page(pg, int val) might be a nice piece of cleanup.

> +    }
> +
> +    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
> +    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
> +                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
> +    {
> +        if ( !*mfn )
> +            free_domheap_page(pg);
> +        stub_va = 0;
> +    }
> +    else
> +    {
> +        stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;

"(cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE" comes up very frequently
through this patch.

I think the logic would be easier to read given:

#define STUB_OFFSET(cpu) (((cpu) & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

> +        *mfn = page_to_mfn(pg);
> +    }
> +
> +    return stub_va;
> +}
> +
>  void cpu_exit_clear(unsigned int cpu)
>  {
>      cpu_uninit(cpu);
> @@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
>      free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>      free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>  
> +    if ( per_cpu(stubs.addr, cpu) )
> +    {
> +        unsigned long mfn = per_cpu(stubs.mfn, cpu);
> +        unsigned char *stub_page = map_domain_page(mfn);
> +        unsigned int i;
> +
> +        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
> +               0xcc, STUB_BUF_SIZE);
> +        for ( i = 0; i < STUBS_PER_PAGE; ++i )
> +            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
> +                break;

There is a race condition between allocate and free, as
write_stub_trampoline() is written by the cpu itself.  Just finding 0xcc
here doesn't mean there isn't a different cpu in the process of coming
up and intending to use the stub page.

I suspect the race can be fixed by having the alloc side clobber the
first instruction, perhaps to ud2 ?
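
Something along these lines perhaps (a sketch only - the offset
expression is borrowed from the freeing side, and where exactly this
would live is left open):

    /* On the allocating side, before the AP writes its trampoline: */
    stub_page[(cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE]     = 0x0f; /* ud2 */
    stub_page[(cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE + 1] = 0x0b;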

(Also, style at the end of the if)

> +        unmap_domain_page(stub_page);
> +        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
> +                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
> +        if ( i == STUBS_PER_PAGE )
> +            free_domheap_page(mfn_to_page(mfn));
> +    }
> +
>      order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
>      free_xenheap_pages(per_cpu(gdt_table, cpu), order);
>  
> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -219,7 +219,19 @@ ENTRY(compat_post_handle_exception)
>          movb  $0,TRAPBOUNCE_flags(%rdx)
>          jmp   compat_test_all_events
>  

I would be tempted to leave a comment here referring to 'lstar_enter'
for stack/register details.

> -ENTRY(compat_syscall)
> +ENTRY(cstar_enter)
> +        sti
> +        movq  8(%rsp),%rax /* Restore %rax. */
> +        movq  $FLAT_KERNEL_SS,8(%rsp)
> +        pushq %r11
> +        pushq $FLAT_USER_CS32
> +        pushq %rcx
> +        pushq $0
> +        SAVE_VOLATILE TRAP_syscall
> +        GET_CURRENT(%rbx)
> +        movq  VCPU_domain(%rbx),%rcx
> +        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
> +        je    switch_to_kernel
>          cmpb  $0,VCPU_syscall32_disables_events(%rbx)
>          movzwl VCPU_syscall32_sel(%rbx),%esi
>          movq  VCPU_syscall32_addr(%rbx),%rax
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -13,9 +13,8 @@
>  #include <public/xen.h>
>  #include <irq_vectors.h>
>  
> -        ALIGN
>  /* %rbx: struct vcpu */
> -switch_to_kernel:
> +ENTRY(switch_to_kernel)
>          leaq  VCPU_trap_bounce(%rbx),%rdx
>          /* TB_eip = (32-bit syscall && syscall32_addr) ?
>           *          syscall32_addr : syscall_addr */
> @@ -114,22 +113,21 @@ restore_all_xen:
>   *  Vector directly to the registered arch.syscall_addr.
>   *
>   * Initial work is done by per-CPU stack trampolines. At this point %rsp

The trampolines are no longer on the stack, so I think this line needs
editing as well.

> - * has been initialised to point at the correct Xen stack, and %rsp, %rflags
> - * and %cs have been saved. All other registers are still to be saved onto
> - * the stack, starting with %rip, and an appropriate %ss must be saved into
> - * the space left by the trampoline.
> + * has been initialised to point at the correct Xen stack, %rsp has been
> + * saved, and %rax needs to be restored from the %ss save slot. All other
> + * registers are still to be saved onto the stack, starting with RFLAGS, and
> + * an appropriate %ss must be saved into the space left by the trampoline.
>   */
> -ENTRY(syscall_enter)
> +ENTRY(lstar_enter)
>          sti
> -        movl  $FLAT_KERNEL_SS,24(%rsp)
> +        movq  8(%rsp),%rax /* Restore %rax. */
> +        movq  $FLAT_KERNEL_SS,8(%rsp)
> +        pushq %r11
> +        pushq $FLAT_KERNEL_CS64
>          pushq %rcx
>          pushq $0
> -        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
>          SAVE_VOLATILE TRAP_syscall
>          GET_CURRENT(%rbx)
> -        movq  VCPU_domain(%rbx),%rcx
> -        testb $1,DOMAIN_is_32bit_pv(%rcx)
> -        jnz   compat_syscall
>          testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
>          jz    switch_to_kernel
>  
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
>
> +DEFINE_PER_CPU(struct stubs, stubs);
> +void lstar_enter(void);
> +void cstar_enter(void);
> +
>  void __devinit subarch_percpu_traps_init(void)
>  {
> -    char *stack_bottom, *stack;
> -
> -    stack_bottom = (char *)get_stack_bottom();
> -    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
> +    unsigned long stack_bottom = get_stack_bottom();
> +    unsigned long stub_va = this_cpu(stubs.addr);
> +    unsigned char *stub_page;
> +    unsigned int offset;
>  
>      /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
>      BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
>  
> -    /* Trampoline for SYSCALL entry from long mode. */
> -    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
> -    wrmsrl(MSR_LSTAR, (unsigned long)stack);
> -    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
> +    stub_page = map_domain_page(this_cpu(stubs.mfn));
> +
> +    /* Trampoline for SYSCALL entry from 64-bit mode. */
> +    wrmsrl(MSR_LSTAR, stub_va);
> +    offset = write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),

I would personally opt for "stub_va & ~PAGE_MASK"

> +                                   stub_va, stack_bottom,
> +                                   (unsigned long)lstar_enter);
> +    stub_va += offset;
>  
>      if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
>           boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
>      {
>          /* SYSENTER entry. */
> -        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
> +        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
>          wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
>          wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
>      }
>  
>      /* Trampoline for SYSCALL entry from compatibility mode. */
> -    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
> -    wrmsrl(MSR_CSTAR, (unsigned long)stack);
> -    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
> +    wrmsrl(MSR_CSTAR, stub_va);
> +    offset += write_stub_trampoline(stub_page + (stub_va & (PAGE_SIZE - 1)),
> +                                    stub_va, stack_bottom,
> +                                    (unsigned long)cstar_enter);
> +
> +    /* Don't consume more than half of the stub space here. */
> +    ASSERT(offset <= STUB_BUF_SIZE / 2);
> +
> +    unmap_domain_page(stub_page);
>  
>      /* Common SYSCALL parameters. */
>      wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);

As for the rest of the patch, I have checked the asm changes carefully
and can't see any issues.

~Andrew

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] x86emul: move stubs off the stack
  2015-05-18 12:46 ` [PATCH 2/4] x86emul: move stubs " Jan Beulich
@ 2015-05-19 17:33   ` Andrew Cooper
  2015-05-20  9:25     ` Jan Beulich
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Cooper @ 2015-05-19 17:33 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 18/05/15 13:46, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/tools/tests/x86_emulator/x86_emulate.c
> +++ b/tools/tests/x86_emulator/x86_emulate.c
> @@ -17,4 +17,8 @@ typedef bool bool_t;
>  #define __packed __attribute__((packed))
>  
>  #include "x86_emulate/x86_emulate.h"
> +
> +#define get_stub(stb) ((void *)((stb).addr = (uintptr_t)(stb).buf))
> +#define put_stub(stb)
> +
>  #include "x86_emulate/x86_emulate.c"
> --- a/xen/arch/x86/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate.c
> @@ -9,6 +9,7 @@
>   *    Keir Fraser <keir@xen.org>
>   */
>  
> +#include <xen/domain_page.h>
>  #include <asm/x86_emulate.h>
>  #include <asm/asm_defns.h> /* mark_regs_dirty() */
>  #include <asm/processor.h> /* current_cpu_info */
> @@ -17,8 +18,22 @@
>  /* Avoid namespace pollution. */
>  #undef cmpxchg
>  #undef cpuid
> +#undef wbinvd
>  
>  #define cpu_has_amd_erratum(nr) \
>          cpu_has_amd_erratum(&current_cpu_data, AMD_ERRATUM_##nr)
>  
> +#define get_stub(stb) ({                                   \
> +    (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2; \
> +    ((stb).ptr = map_domain_page(this_cpu(stubs.mfn))) +   \
> +        ((stb).addr & (PAGE_SIZE - 1));                    \
> +})
> +#define put_stub(stb) ({                                   \
> +    if ( (stb).ptr )                                       \
> +    {                                                      \
> +        unmap_domain_page((stb).ptr);                      \
> +        (stb).ptr = NULL;                                  \
> +    }                                                      \
> +})
> +

These don't need to be macros, and I suspect a compiler would choose to
out-of-line get_stub() if it could.
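
E.g. (a sketch only, reusing the names from the hunk above; callers
would then pass &stub rather than stub):

static inline void *get_stub(struct x86_emulate_stub *stb)
{
    stb->addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2;
    stb->ptr = map_domain_page(this_cpu(stubs.mfn));
    return (char *)stb->ptr + (stb->addr & (PAGE_SIZE - 1));
}

static inline void put_stub(struct x86_emulate_stub *stb)
{
    if ( stb->ptr )
    {
        unmap_domain_page(stb->ptr);
        stb->ptr = NULL;
    }
}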

>  #include "x86_emulate/x86_emulate.c"
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -717,11 +717,14 @@ do{ struct fpu_insn_ctxt fic;           
>  } while (0)
>  
>  #define emulate_fpu_insn_stub(_bytes...)                                \
> -do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
> -    struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
> +do{ uint8_t *buf = get_stub(stub);                                      \
> +    unsigned int _nr = sizeof((uint8_t[]){ _bytes });                   \
> +    struct fpu_insn_ctxt fic = { .insn_bytes = _nr };                    \

Alignment of backslashes.

> +    memcpy(buf, ((uint8_t[]){ _bytes, 0xc3 }), _nr + 1);                \
>      get_fpu(X86EMUL_FPU_fpu, &fic);                                     \
> -    (*(void(*)(void))stub)();                                           \
> +    stub.func();                                                        \
>      put_fpu(&fic);                                                      \
> +    put_stub(stub);                                                     \
>  } while (0)
>  
>  static unsigned long _get_rep_prefix(
> @@ -1458,6 +1461,7 @@ x86_emulate(
>      struct operand src = { .reg = REG_POISON };
>      struct operand dst = { .reg = REG_POISON };
>      enum x86_swint_type swint_type;
> +    struct x86_emulate_stub stub = {};
>      DECLARE_ALIGNED(mmval_t, mmval);
>      /*
>       * Data operand effective address (usually computed from ModRM).
> @@ -3792,6 +3796,7 @@ x86_emulate(
>  
>   done:
>      _put_fpu();
> +    put_stub(stub);
>      return rc;
>  
>   twobyte_insn:
> @@ -4007,9 +4012,15 @@ x86_emulate(
>                 /* {,v}movss xmm,xmm/m32 */
>                 /* {,v}movsd xmm,xmm/m64 */
>      {
> -        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
> -        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
> +        uint8_t *buf = get_stub(stub);
> +        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
>  
> +        buf[0] = 0x3e;
> +        buf[1] = 0x3e;
> +        buf[2] = 0x0f;
> +        buf[3] = b;
> +        buf[4] = modrm;
> +        buf[5] = 0xc3;
>          if ( vex.opcx == vex_none )
>          {
>              if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
> @@ -4017,7 +4028,7 @@ x86_emulate(
>              else
>                  vcpu_must_have_sse();
>              ea.bytes = 16;
> -            SET_SSE_PREFIX(stub[0], vex.pfx);
> +            SET_SSE_PREFIX(buf[0], vex.pfx);
>              get_fpu(X86EMUL_FPU_xmm, &fic);
>          }
>          else
> @@ -4044,15 +4055,16 @@ x86_emulate(
>              /* convert memory operand to (%rAX) */
>              rex_prefix &= ~REX_B;
>              vex.b = 1;
> -            stub[4] &= 0x38;
> +            buf[4] &= 0x38;
>          }
>          if ( !rc )
>          {
> -           copy_REX_VEX(stub, rex_prefix, vex);
> -           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
> +           copy_REX_VEX(buf, rex_prefix, vex);
> +           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
>                                       : "memory" );
>          }
>          put_fpu(&fic);
> +        put_stub(stub);
>          if ( !rc && (b & 1) && (ea.type == OP_MEM) )
>              rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
>                              ea.bytes, ctxt);
> @@ -4242,9 +4254,15 @@ x86_emulate(
>                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>                 /* vmovdq{a,u} ymm,ymm/m256 */
>      {
> -        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
> -        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
> +        uint8_t *buf = get_stub(stub);
> +        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
>  
> +        buf[0] = 0x3e;
> +        buf[1] = 0x3e;
> +        buf[2] = 0x0f;
> +        buf[3] = b;
> +        buf[4] = modrm;
> +        buf[5] = 0xc3;
>          if ( vex.opcx == vex_none )
>          {
>              switch ( vex.pfx )
> @@ -4252,7 +4270,7 @@ x86_emulate(
>              case vex_66:
>              case vex_f3:
>                  vcpu_must_have_sse2();
> -                stub[0] = 0x66; /* movdqa */
> +                buf[0] = 0x66; /* movdqa */
>                  get_fpu(X86EMUL_FPU_xmm, &fic);
>                  ea.bytes = 16;
>                  break;
> @@ -4288,15 +4306,16 @@ x86_emulate(
>              /* convert memory operand to (%rAX) */
>              rex_prefix &= ~REX_B;
>              vex.b = 1;
> -            stub[4] &= 0x38;
> +            buf[4] &= 0x38;
>          }
>          if ( !rc )
>          {
> -           copy_REX_VEX(stub, rex_prefix, vex);
> -           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
> +           copy_REX_VEX(buf, rex_prefix, vex);
> +           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
>                                       : "memory" );
>          }
>          put_fpu(&fic);
> +        put_stub(stub);
>          if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
>              rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
>                              ea.bytes, ctxt);
> @@ -4638,5 +4657,6 @@ x86_emulate(
>  
>   cannot_emulate:
>      _put_fpu();
> +    put_stub(stub);
>      return X86EMUL_UNHANDLEABLE;
>  }
> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -429,6 +429,18 @@ struct x86_emulate_ctxt
>      } retire;
>  };
>  
> +struct x86_emulate_stub {
> +    union {
> +        void (*func)(void);
> +        uintptr_t addr;
> +    };
> +#ifdef __XEN__
> +    void *ptr;
> +#else
> +    uint8_t buf[32];

16 is surely enough? The emulator will #GP if it attempts to fetch more
than 15 bytes, and only a 'ret' is needed after that.

~Andrew

> +#endif
> +};
> +
>  /*
>   * x86_emulate: Emulate an instruction.
>   * Returns -1 on failure, 0 on success.
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] x86: move I/O emulation stubs off the stack
  2015-05-18 12:47 ` [PATCH 3/4] x86: move I/O emulation " Jan Beulich
@ 2015-05-19 17:48   ` Andrew Cooper
  2015-05-20 13:57     ` Jan Beulich
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Cooper @ 2015-05-19 17:48 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 18/05/15 13:47, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -2006,7 +2006,7 @@ static int emulate_privileged_op(struct 
>                             ? (*(u32 *)&regs->reg = (val)) \
>                             : (*(u16 *)&regs->reg = (val)))
>      unsigned long code_base, code_limit;
> -    char io_emul_stub[32];
> +    char *io_emul_stub = NULL;
>      void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
>      uint64_t val;
>  
> @@ -2195,6 +2195,9 @@ static int emulate_privileged_op(struct 
>       * GPR context. This is needed for some systems which (ab)use IN/OUT
>       * to communicate with BIOS code in system-management mode.
>       */

The comment (just out of context) needs tweaking.

> +    io_emul_stub = map_domain_page(this_cpu(stubs.mfn)) +
> +                   (this_cpu(stubs.addr) & (PAGE_SIZE - 1)) +
> +                   STUB_BUF_SIZE / 2;
>      /* movq $host_to_guest_gpr_switch,%rcx */
>      io_emul_stub[0] = 0x48;
>      io_emul_stub[1] = 0xb9;
> @@ -2212,7 +2215,7 @@ static int emulate_privileged_op(struct 
>      io_emul_stub[15] = 0xc3;
>  
>      /* Handy function-typed pointer to the stub. */
> -    io_emul = (void *)io_emul_stub;
> +    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);

As an unrelated observation during review, the two gpr switch functions
should probably gain some knowledge of TRAP_regs_partial

~Andrew

>  
>      if ( ioemul_handle_quirk )
>          ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
> @@ -2777,9 +2780,13 @@ static int emulate_privileged_op(struct 
>   done:
>      instruction_done(regs, eip, bpmatch);
>   skip:
> +    if ( io_emul_stub )
> +        unmap_domain_page(io_emul_stub);
>      return EXCRET_fault_fixed;
>  
>   fail:
> +    if ( io_emul_stub )
> +        unmap_domain_page(io_emul_stub);
>      return 0;
>  }
>  
>
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] x86: switch default mapping attributes to non-executable
  2015-05-18 12:47 ` [PATCH 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
@ 2015-05-19 18:53   ` Andrew Cooper
  2015-05-20  9:32     ` Jan Beulich
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Cooper @ 2015-05-19 18:53 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 18/05/15 13:47, Jan Beulich wrote:
> Only a very limited subset of mappings need to be done as executable
> ones; in particular the direct mapping should not be executable to
> limit the damage attackers can cause by exploiting security relevant
> bugs.
>
> The EFI change at once includes an adjustment to set NX only when
> supported by the hardware.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -293,7 +293,7 @@ struct vcpu_guest_context *alloc_vcpu_gu
>              free_vcpu_guest_context(NULL);
>              return NULL;
>          }
> -        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR);
> +        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR_RW);
>          per_cpu(vgc_pages[i], cpu) = pg;
>      }
>      return (void *)fix_to_virt(idx);
> --- a/xen/arch/x86/domain_page.c
> +++ b/xen/arch/x86/domain_page.c
> @@ -160,7 +160,7 @@ void *map_domain_page(unsigned long mfn)
>  
>      spin_unlock(&dcache->lock);
>  
> -    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
> +    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR_RW));
>  
>   out:
>      local_irq_restore(flags);
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -4416,7 +4416,7 @@ long set_gdt(struct vcpu *v, 
>      for ( i = 0; i < nr_pages; i++ )
>      {
>          v->arch.pv_vcpu.gdt_frames[i] = frames[i];
> -        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
> +        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR_RW));
>      }
>  
>      xfree(pfns);
> @@ -6004,7 +6004,7 @@ int create_perdomain_mapping(struct doma
>                  if ( !IS_NIL(ppg) )
>                      *ppg++ = pg;
>                  l1tab[l1_table_offset(va)] =
> -                    l1e_from_page(pg, __PAGE_HYPERVISOR | _PAGE_AVAIL0);
> +                    l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
>                  l2e_add_flags(*pl2e, _PAGE_AVAIL0);
>              }
>              else
> @@ -6133,7 +6133,7 @@ void memguard_init(void)
>          (unsigned long)__va(start),
>          start >> PAGE_SHIFT,
>          (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
> -        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
> +        __PAGE_HYPERVISOR_RW|MAP_SMALL_PAGES);
>      BUG_ON(start != xen_phys_start);
>      map_pages_to_xen(
>          XEN_VIRT_START,
> @@ -6146,7 +6146,7 @@ static void __memguard_change_range(void
>  {
>      unsigned long _p = (unsigned long)p;
>      unsigned long _l = (unsigned long)l;
> -    unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
> +    unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
>  
>      /* Ensure we are dealing with a page-aligned whole number of pages. */
>      ASSERT((_p&~PAGE_MASK) == 0);
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -900,7 +900,7 @@ void __init noreturn __start_xen(unsigne
>              /* The only data mappings to be relocated are in the Xen area. */
>              pl2e = __va(__pa(l2_xenmap));
>              *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
> -                                   PAGE_HYPERVISOR | _PAGE_PSE);
> +                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
>              for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
>              {
>                  if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
> @@ -1087,7 +1087,7 @@ void __init noreturn __start_xen(unsigne
>              /* This range must not be passed to the boot allocator and
>               * must also not be mapped with _PAGE_GLOBAL. */
>              map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
> -                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR);
> +                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
>          }
>          if ( s < map_s )
>          {
> --- a/xen/arch/x86/x86_64/mm.c
> +++ b/xen/arch/x86/x86_64/mm.c
> @@ -895,6 +895,33 @@ void __init subarch_init_memory(void)
>              share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
>          }
>      }
> +
> +    /* Mark low 16Mb of direct map NX if hardware supports it. */
> +    if ( !cpu_has_nx )
> +        return;
> +
> +    v = DIRECTMAP_VIRT_START + (1UL << 20);

What about 0-1MB ?

> +    l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)];
> +    ASSERT(l3e_get_flags(l3e) & _PAGE_PRESENT);
> +    do {
> +        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
> +        ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
> +        if ( l2e_get_flags(l2e) & _PAGE_PSE )
> +        {
> +            l2e_add_flags(l2e, _PAGE_NX_BIT);
> +            l3e_to_l2e(l3e)[l2_table_offset(v)] = l2e;
> +            v += 1 << L2_PAGETABLE_SHIFT;
> +        }
> +        else
> +        {
> +            l1_pgentry_t l1e = l2e_to_l1e(l2e)[l1_table_offset(v)];
> +
> +            ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
> +            l1e_add_flags(l1e, _PAGE_NX_BIT);
> +            l2e_to_l1e(l2e)[l1_table_offset(v)] = l1e;
> +            v += 1 << L1_PAGETABLE_SHIFT;
> +        }
> +    } while ( v < DIRECTMAP_VIRT_START + (16UL << 20) );

How about just setting l3e.nx and leaving all lower tables alone?
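
I.e. roughly (a sketch of the suggestion, not of the patch):

    l3e_add_flags(l3e, _PAGE_NX_BIT);
    l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)] = l3e;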

>  }
>  
>  long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
> @@ -1359,7 +1386,7 @@ int memory_add(unsigned long spfn, unsig
>          if ( i < spfn )
>              i = spfn;
>          ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
> -                               epfn - i, __PAGE_HYPERVISOR);
> +                               epfn - i, __PAGE_HYPERVISOR_RW);
>          if ( ret )
>              return ret;
>      }
> --- a/xen/common/efi/boot.c
> +++ b/xen/common/efi/boot.c
> @@ -1162,7 +1162,7 @@ void __init efi_init_memory(void)
>          EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;
>          u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT;
>          unsigned long smfn, emfn;
> -        unsigned int prot = PAGE_HYPERVISOR;
> +        unsigned int prot = PAGE_HYPERVISOR_RWX;
>  
>          printk(XENLOG_INFO " %013" PRIx64 "-%013" PRIx64
>                             " type=%u attr=%016" PRIx64 "\n",
> @@ -1195,7 +1195,7 @@ void __init efi_init_memory(void)
>          if ( desc->Attribute & EFI_MEMORY_WP )
>              prot &= _PAGE_RW;
>          if ( desc->Attribute & EFI_MEMORY_XP )
> -            prot |= _PAGE_NX_BIT;
> +            prot |= _PAGE_NX;
>  
>          if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
>               !(smfn & pfn_hole_mask) &&
> --- a/xen/include/asm-x86/page.h
> +++ b/xen/include/asm-x86/page.h
> @@ -303,7 +303,8 @@ void paging_init(void);
>  #define _PAGE_AVAIL1   _AC(0x400,U)
>  #define _PAGE_AVAIL2   _AC(0x800,U)
>  #define _PAGE_AVAIL    _AC(0xE00,U)
> -#define _PAGE_PSE_PAT _AC(0x1000,U)
> +#define _PAGE_PSE_PAT  _AC(0x1000,U)
> +#define _PAGE_NX       (cpu_has_nx ? _PAGE_NX_BIT : 0)
>  /* non-architectural flags */
>  #define _PAGE_PAGED   0x2000U
>  #define _PAGE_SHARED  0x4000U
> @@ -320,6 +321,9 @@ void paging_init(void);
>  #define _PAGE_GNTTAB   0
>  #endif
>  
> +#define __PAGE_HYPERVISOR_RO      (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
> +#define __PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RO | \
> +                                   _PAGE_DIRTY | _PAGE_RW)
>  #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
>  #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
>                                     _PAGE_DIRTY | _PAGE_RW)
> --- a/xen/include/asm-x86/x86_64/page.h
> +++ b/xen/include/asm-x86/x86_64/page.h
> @@ -147,9 +147,20 @@ typedef l4_pgentry_t root_pgentry_t;
>   */
>  #define _PAGE_GUEST_KERNEL (1U<<12)
>  
> -#define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
> +#define PAGE_HYPERVISOR_RO      (__PAGE_HYPERVISOR_RO      | _PAGE_GLOBAL)
> +#define PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RW      | _PAGE_GLOBAL)
>  #define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
> -#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
> +#define PAGE_HYPERVISOR_RWX     (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
> +
> +#ifdef __ASSEMBLY__
> +/* Dependency on NX being available can't be expressed. */
> +# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RWX
> +# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
> +#else
> +# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RW
> +# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | \
> +                                  _PAGE_GLOBAL | _PAGE_NX)
> +#endif

This patch is clearly an improvement, so my comments can possibly be
deferred to subsequent patches.

After boot, I can't think of a legitimate reason for Xen to have a RWX
mapping.  As such, leaving __PAGE_HYPERVISOR around constitutes a risk
that RWX mappings will be used.  It would be nice if we could remove all
constants (which aren't BOOT_*) which could lead to accidental use of
RWX mappings on NX-enabled hardware.

I think we also want a warning on boot if we find NX unavailable.  The
only 64bit capable CPUs which do not support NX are now 11 years old,
and I don't expect many of those to be running Xen these days.  However,
given that "disable NX" is a common BIOS option for compatibility
reasons, we should press people to alter their settings if possible.
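
E.g. something like (a sketch only; exact wording and placement to be
decided):

    if ( !cpu_has_nx )
        printk(XENLOG_WARNING
               "Warning: NX (Execute Disable) not enabled - "
               "check the BIOS/firmware settings\n");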

~Andrew

>  
>  #endif /* __X86_64_PAGE_H__ */
>  
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-19 16:59   ` Andrew Cooper
@ 2015-05-20  9:16     ` Jan Beulich
  2015-05-20 13:37     ` Jan Beulich
  2015-05-20 15:54     ` Jan Beulich
  2 siblings, 0 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-20  9:16 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 18:59, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:46, Jan Beulich wrote:
>> +unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
>> +{
>> +    unsigned long stub_va;
>> +    struct page_info *pg;
>> +
>> +    if ( *mfn )
>> +        pg = mfn_to_page(*mfn);
>> +    else
>> +    {
>> +        nodeid_t node = cpu_to_node(cpu);
>> +        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
>> +
>> +        pg = alloc_domheap_page(NULL, memflags);
>> +        if ( !pg )
>> +            return 0;
>> +
>> +        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
> 
> Seems like memset_page(pg, int val) might be a nice piece of cleanup.

Except for this one (which, if at all, should be a follow-up patch) I'll
see to addressing all of your comments for v2.

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] x86emul: move stubs off the stack
  2015-05-19 17:33   ` Andrew Cooper
@ 2015-05-20  9:25     ` Jan Beulich
  0 siblings, 0 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-20  9:25 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 19:33, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:46, Jan Beulich wrote:
>> @@ -17,8 +18,22 @@
>>  /* Avoid namespace pollution. */
>>  #undef cmpxchg
>>  #undef cpuid
>> +#undef wbinvd
>>  
>>  #define cpu_has_amd_erratum(nr) \
>>          cpu_has_amd_erratum(&current_cpu_data, AMD_ERRATUM_##nr)
>>  
>> +#define get_stub(stb) ({                                   \
>> +    (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2; \
>> +    ((stb).ptr = map_domain_page(this_cpu(stubs.mfn))) +   \
>> +        ((stb).addr & (PAGE_SIZE - 1));                    \
>> +})
>> +#define put_stub(stb) ({                                   \
>> +    if ( (stb).ptr )                                       \
>> +    {                                                      \
>> +        unmap_domain_page((stb).ptr);                      \
>> +        (stb).ptr = NULL;                                  \
>> +    }                                                      \
>> +})
>> +
> 
> These don't need to be macros, and I suspect a compiler would choose to
> out-of-line get_stub() if it could.

But afaict there's also nothing wrong with them being macros.

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>> @@ -429,6 +429,18 @@ struct x86_emulate_ctxt
>>      } retire;
>>  };
>>  
>> +struct x86_emulate_stub {
>> +    union {
>> +        void (*func)(void);
>> +        uintptr_t addr;
>> +    };
>> +#ifdef __XEN__
>> +    void *ptr;
>> +#else
>> +    uint8_t buf[32];
> 
> 16 is surely enough? The emulator will #GP if it attempts to fetch more
> than 15 bytes, and only a 'ret' is needed after that.

Maybe, right now. I wanted to leave room for there perhaps being
a case where two instructions could be generated. But of course we
can grow the buffer later. I'll use MAX_INST_LEN + 1 for now.
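
I.e. (a sketch, assuming MAX_INST_LEN is the emulator's 15 byte fetch
limit):

    uint8_t buf[MAX_INST_LEN + 1 /* ret */];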

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/4] x86: switch default mapping attributes to non-executable
  2015-05-19 18:53   ` Andrew Cooper
@ 2015-05-20  9:32     ` Jan Beulich
  0 siblings, 0 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-20  9:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 20:53, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:47, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_64/mm.c
>> +++ b/xen/arch/x86/x86_64/mm.c
>> @@ -895,6 +895,33 @@ void __init subarch_init_memory(void)
>>              share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
>>          }
>>      }
>> +
>> +    /* Mark low 16Mb of direct map NX if hardware supports it. */
>> +    if ( !cpu_has_nx )
>> +        return;
>> +
>> +    v = DIRECTMAP_VIRT_START + (1UL << 20);
> 
> What about 0-1MB ?

I'd like to avoid fiddling with that range, as the trampoline living there
will want to continue to be RWX anyway.

>> +    l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)];
>> +    ASSERT(l3e_get_flags(l3e) & _PAGE_PRESENT);
>> +    do {
>> +        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
>> +        ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
>> +        if ( l2e_get_flags(l2e) & _PAGE_PSE )
>> +        {
>> +            l2e_add_flags(l2e, _PAGE_NX_BIT);
>> +            l3e_to_l2e(l3e)[l2_table_offset(v)] = l2e;
>> +            v += 1 << L2_PAGETABLE_SHIFT;
>> +        }
>> +        else
>> +        {
>> +            l1_pgentry_t l1e = l2e_to_l1e(l2e)[l1_table_offset(v)];
>> +
>> +            ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
>> +            l1e_add_flags(l1e, _PAGE_NX_BIT);
>> +            l2e_to_l1e(l2e)[l1_table_offset(v)] = l1e;
>> +            v += 1 << L1_PAGETABLE_SHIFT;
>> +        }
>> +    } while ( v < DIRECTMAP_VIRT_START + (16UL << 20) );
> 
> How about just setting l3e.nx and leaving all lower tables alone?

Apart from the trampoline aspect (see above) I don't think it's a
good idea to restrict permissions in non-leaf entries - that just
calls for more intrusive patches if later an individual page needs
them lifted.

>> --- a/xen/include/asm-x86/x86_64/page.h
>> +++ b/xen/include/asm-x86/x86_64/page.h
>> @@ -147,9 +147,20 @@ typedef l4_pgentry_t root_pgentry_t;
>>   */
>>  #define _PAGE_GUEST_KERNEL (1U<<12)
>>  
>> -#define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
>> +#define PAGE_HYPERVISOR_RO      (__PAGE_HYPERVISOR_RO      | _PAGE_GLOBAL)
>> +#define PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RW      | _PAGE_GLOBAL)
>>  #define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
>> -#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
>> +#define PAGE_HYPERVISOR_RWX     (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
>> +
>> +#ifdef __ASSEMBLY__
>> +/* Dependency on NX being available can't be expressed. */
>> +# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RWX
>> +# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
>> +#else
>> +# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RW
>> +# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | \
>> +                                  _PAGE_GLOBAL | _PAGE_NX)
>> +#endif
> 
> This patch is clearly an improvement, so my comments can possibly be
> deferred to subsequent patches.
> 
> After boot, I can't think of a legitimate reason for Xen to have a RWX
> mapping.  As such, leaving __PAGE_HYPERVISOR around constitutes a risk
> that RWX mappings will be used.  It would be nice if we could remove all
> constants (which aren't BOOT_*) which could lead to accidental use of
> RWX mappings on NX-enabled hardware.

But you realize that __PAGE_HYPERVISOR is particularly used for
intermediate page table entry permissions?

> I think we also want a warning on boot if we find NX unavailable.  The
> only 64bit capable CPUs which do not support NX are now 11 years old,
> and I don't expect many of those to be running Xen these days.  However,
> given that "disable NX" is a common BIOS option for compatibility
> reasons, we should press people to alter their settings if possible.

Sure, easily done.

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-19 16:59   ` Andrew Cooper
  2015-05-20  9:16     ` Jan Beulich
@ 2015-05-20 13:37     ` Jan Beulich
  2015-05-20 13:58       ` Andrew Cooper
  2015-05-20 15:54     ` Jan Beulich
  2 siblings, 1 reply; 19+ messages in thread
From: Jan Beulich @ 2015-05-20 13:37 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 18:59, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:46, Jan Beulich wrote:
>> @@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
>>      free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>>      free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>>  
>> +    if ( per_cpu(stubs.addr, cpu) )
>> +    {
>> +        unsigned long mfn = per_cpu(stubs.mfn, cpu);
>> +        unsigned char *stub_page = map_domain_page(mfn);
>> +        unsigned int i;
>> +
>> +        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
>> +               0xcc, STUB_BUF_SIZE);
>> +        for ( i = 0; i < STUBS_PER_PAGE; ++i )
>> +            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
>> +                break;
> 
> There is a race condition between allocate and free, as
> write_stub_trampoline() is written by the cpu itself.  Just finding 0xcc
> here doesn't mean there isn't a different cpu in the process of coming
> up and intending to use the stub page.

No, there is no race afaict: percpu_traps_init() gets called before
a CPU is marked online and hence before __cpu_up() returns, and
thus before the CPU hotplug lock gets dropped, so there can't be
any CPUs going down concurrently.

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] x86: move I/O emulation stubs off the stack
  2015-05-19 17:48   ` Andrew Cooper
@ 2015-05-20 13:57     ` Jan Beulich
  0 siblings, 0 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-20 13:57 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 19:48, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:47, Jan Beulich wrote:
>> @@ -2212,7 +2215,7 @@ static int emulate_privileged_op(struct 
>>      io_emul_stub[15] = 0xc3;
>>  
>>      /* Handy function-typed pointer to the stub. */
>> -    io_emul = (void *)io_emul_stub;
>> +    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
> 
> As an unrelated observation during review, the two gpr switch functions
> should probably gain some knowledge of TRAP_regs_partial

You mean adding a respective assertion? This code shouldn't
otherwise need to know of that flag, as execution must not make
it here with a partial frame.

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-20 13:37     ` Jan Beulich
@ 2015-05-20 13:58       ` Andrew Cooper
  0 siblings, 0 replies; 19+ messages in thread
From: Andrew Cooper @ 2015-05-20 13:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 20/05/15 14:37, Jan Beulich wrote:
>>>> On 19.05.15 at 18:59, <andrew.cooper3@citrix.com> wrote:
>> On 18/05/15 13:46, Jan Beulich wrote:
>>> @@ -616,6 +653,24 @@ static void cpu_smpboot_free(unsigned in
>>>      free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
>>>      free_cpumask_var(per_cpu(cpu_core_mask, cpu));
>>>  
>>> +    if ( per_cpu(stubs.addr, cpu) )
>>> +    {
>>> +        unsigned long mfn = per_cpu(stubs.mfn, cpu);
>>> +        unsigned char *stub_page = map_domain_page(mfn);
>>> +        unsigned int i;
>>> +
>>> +        memset(stub_page + (cpu & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE,
>>> +               0xcc, STUB_BUF_SIZE);
>>> +        for ( i = 0; i < STUBS_PER_PAGE; ++i )
>>> +            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc)
>>> +                break;
>> There is a race condition between allocate and free, as
>> write_stub_trampoline() is written by the cpu itself.  Just finding 0xcc
>> here doesn't mean there isn't a different cpu in the process of coming
>> up and intending to use the stub page.
> No, there is no race afaict: percpu_traps_init() gets called before
> a CPU is marked online and hence before __cpu_up() returns, and
> thus before the CPU hotplug lock gets dropped, so there can't be
> any CPUs going down concurrently.

Thinking out loud...

cpu_up/down are mutually exclusive due to the recursive spinlock used
by cpu_hotplug_begin/end.

The allocate is done by the CPU_UP_PREPARE notifier chain, and
__cpu_up() does wait until the AP has marked itself as online.

Therefore, so long as no CPU_UP_PREPARE action triggers a
CPU_UP_CANCELED or CPU_DEAD chain against the same cpu, we are safe.

~Andrew

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/4] x86: move syscall trampolines off the stack
  2015-05-19 16:59   ` Andrew Cooper
  2015-05-20  9:16     ` Jan Beulich
  2015-05-20 13:37     ` Jan Beulich
@ 2015-05-20 15:54     ` Jan Beulich
  2 siblings, 0 replies; 19+ messages in thread
From: Jan Beulich @ 2015-05-20 15:54 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 19.05.15 at 18:59, <andrew.cooper3@citrix.com> wrote:
> On 18/05/15 13:46, Jan Beulich wrote:
>> +    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
>> +    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
>> +                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
>> +    {
>> +        if ( !*mfn )
>> +            free_domheap_page(pg);
>> +        stub_va = 0;
>> +    }
>> +    else
>> +    {
>> +        stub_va += (cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE;
> 
> "(cpu & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE" comes up very frequently
> through this patch.
> 
> I think the logic would be easier to read given:
> 
> #define STUB_OFFSET(cpu) (((cpu) & ~(STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

Except that (a) the ~ was wrong here (figured the hard way)
and (b) the stub_va adjustment was a leftover from when VA
management was still different (found out an even harder way,
and the ~ back then was correct here).
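
So if such a helper gets introduced, it would presumably need to be
(sketch):

    #define STUB_OFFSET(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)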

Jan

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2015-05-20 15:54 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-18 10:28 [PATCH 0/4] x86: don't default to executable mappings Jan Beulich
2015-05-18 12:46 ` [PATCH 1/4] x86: move syscall trampolines off the stack Jan Beulich
2015-05-18 18:39   ` Andrew Cooper
2015-05-19  6:41     ` Jan Beulich
2015-05-19  9:24       ` Andrew Cooper
2015-05-19 16:59   ` Andrew Cooper
2015-05-20  9:16     ` Jan Beulich
2015-05-20 13:37     ` Jan Beulich
2015-05-20 13:58       ` Andrew Cooper
2015-05-20 15:54     ` Jan Beulich
2015-05-18 12:46 ` [PATCH 2/4] x86emul: move stubs " Jan Beulich
2015-05-19 17:33   ` Andrew Cooper
2015-05-20  9:25     ` Jan Beulich
2015-05-18 12:47 ` [PATCH 3/4] x86: move I/O emulation " Jan Beulich
2015-05-19 17:48   ` Andrew Cooper
2015-05-20 13:57     ` Jan Beulich
2015-05-18 12:47 ` [PATCH 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
2015-05-19 18:53   ` Andrew Cooper
2015-05-20  9:32     ` Jan Beulich
