* [PATCH v2 0/4] x86: don't default to executable mappings
@ 2015-05-21 10:07 Jan Beulich
  2015-05-21 10:15 ` [PATCH v2 1/4] x86: move syscall trampolines off the stack Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 10:07 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

Particularly for the 1:1 (direct) mapping it was pointed out that, in
order to limit the damage from security issues, we should avoid
mapping things executable when they don't need to be.

1: move syscall trampolines off the stack
2: emul: move stubs off the stack
3: move I/O emulation stubs off the stack
4: switch default mapping attributes to non-executable

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-21 10:07 [PATCH v2 0/4] x86: don't default to executable mappings Jan Beulich
@ 2015-05-21 10:15 ` Jan Beulich
  2015-05-21 11:08   ` Andrew Cooper
  2015-05-21 10:16 ` [PATCH v2 2/4] x86emul: move stubs " Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 10:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

This is needed as stacks are going to become non-executable. Use
separate stub pages (shared among suitable CPUs on the same node)
instead.

Stub areas (currently 128 bytes each) are split into two parts - a
fixed-usage one (for the syscall trampolines) and dynamically usable
space, which subsequent changes will use to hold dynamically generated
code during instruction emulation.

While physical pages are shared among suitable CPUs on the same node,
for now the virtual mappings get established in distinct pages for
each CPU. This isn't a strict requirement, but it simplifies VA space
management for this initial implementation: sharing VA space would
require additional tracking of which areas are currently in use. If
the VA and/or TLB overhead turns out to be a problem, such extra code
could easily be added.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Get rid of bogus VA adjustment in alloc_stub_page(). Introduce
    STUB_BUF_CPU_OFFS(). Use ~PAGE_MASK instead of PAGE_SIZE - 1 for
    page offset masking. Comment adjustments. Extend description.
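
For illustration only (not part of the patch): a standalone sketch of
the per-CPU stub layout described above, with the constants from this
series hard-coded and XEN_VIRT_END replaced by a placeholder value:

#include <stdio.h>

/* Constants as this series defines them (asm-x86/config.h, processor.h). */
#define PAGE_SIZE      4096UL
#define STUB_BUF_SIZE  128UL                        /* 1 << max(L1_CACHE_SHIFT, 7) */
#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)  /* 32 stub areas per page */

/* Placeholder only -- the real value comes from Xen's config.h. */
#define XEN_VIRT_END   0xffff82d0c0000000UL

/* Per-CPU offset within a (possibly shared) stub page, as in smpboot.c. */
#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

int main(void)
{
    for ( unsigned int cpu = 0; cpu < 4; ++cpu )
    {
        /* Each CPU's stub VA lives in its own page just below XEN_VIRT_END... */
        unsigned long page_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
        /* ...but only the CPU's own 128-byte slot within that page is used. */
        unsigned long stub_va = page_va + STUB_BUF_CPU_OFFS(cpu);
        /* First half: syscall trampolines; second half: emulation stubs. */
        unsigned long emul_va = stub_va + STUB_BUF_SIZE / 2;

        printf("cpu%u: page %#lx syscall-stub %#lx emul-stub %#lx\n",
               cpu, page_va, stub_va, emul_va);
    }
    return 0;
}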

--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1270,6 +1270,10 @@ void __init noreturn __start_xen(unsigne
 
     init_idle_domain();
 
+    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
+                                           &this_cpu(stubs).mfn);
+    BUG_ON(!this_cpu(stubs.addr));
+
     trap_init();
 
     rcu_init();
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -25,6 +25,7 @@
 #include <xen/kernel.h>
 #include <xen/mm.h>
 #include <xen/domain.h>
+#include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <xen/sched-if.h>
 #include <xen/irq.h>
@@ -603,6 +604,41 @@ static int do_boot_cpu(int apicid, int c
     return rc;
 }
 
+#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)
+
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
+{
+    unsigned long stub_va;
+    struct page_info *pg;
+
+    if ( *mfn )
+        pg = mfn_to_page(*mfn);
+    else
+    {
+        nodeid_t node = cpu_to_node(cpu);
+        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;
+
+        pg = alloc_domheap_page(NULL, memflags);
+        if ( !pg )
+            return 0;
+
+        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
+    }
+
+    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
+    if ( map_pages_to_xen(stub_va, page_to_mfn(pg), 1,
+                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
+    {
+        if ( !*mfn )
+            free_domheap_page(pg);
+        stub_va = 0;
+    }
+    else if ( !*mfn )
+        *mfn = page_to_mfn(pg);
+
+    return stub_va;
+}
+
 void cpu_exit_clear(unsigned int cpu)
 {
     cpu_uninit(cpu);
@@ -616,6 +652,23 @@ static void cpu_smpboot_free(unsigned in
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
 
+    if ( per_cpu(stubs.addr, cpu) )
+    {
+        unsigned long mfn = per_cpu(stubs.mfn, cpu);
+        unsigned char *stub_page = map_domain_page(mfn);
+        unsigned int i;
+
+        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
+        for ( i = 0; i < STUBS_PER_PAGE; ++i )
+            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
+                break;
+        unmap_domain_page(stub_page);
+        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
+                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
+        if ( i == STUBS_PER_PAGE )
+            free_domheap_page(mfn_to_page(mfn));
+    }
+
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
     free_xenheap_pages(per_cpu(gdt_table, cpu), order);
 
@@ -635,9 +688,10 @@ static void cpu_smpboot_free(unsigned in
 
 static int cpu_smpboot_alloc(unsigned int cpu)
 {
-    unsigned int order, memflags = 0;
+    unsigned int i, order, memflags = 0;
     nodeid_t node = cpu_to_node(cpu);
     struct desc_struct *gdt;
+    unsigned long stub_page;
 
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
@@ -667,6 +721,19 @@ static int cpu_smpboot_alloc(unsigned in
         goto oom;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
 
+    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+        if ( cpu_online(i) && cpu_to_node(i) == node )
+        {
+            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
+            break;
+        }
+    BUG_ON(i == cpu);
+    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
+    if ( !stub_page )
+        goto oom;
+    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
+
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
          zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
         return 0;
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -219,7 +219,20 @@ ENTRY(compat_post_handle_exception)
         movb  $0,TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
-ENTRY(compat_syscall)
+/* See lstar_enter for entry register state. */
+ENTRY(cstar_enter)
+        sti
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_USER_CS32
+        pushq %rcx
+        pushq $0
+        SAVE_VOLATILE TRAP_syscall
+        GET_CURRENT(%rbx)
+        movq  VCPU_domain(%rbx),%rcx
+        cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
+        je    switch_to_kernel
         cmpb  $0,VCPU_syscall32_disables_events(%rbx)
         movzwl VCPU_syscall32_sel(%rbx),%esi
         movq  VCPU_syscall32_addr(%rbx),%rax
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -13,9 +13,8 @@
 #include <public/xen.h>
 #include <irq_vectors.h>
 
-        ALIGN
 /* %rbx: struct vcpu */
-switch_to_kernel:
+ENTRY(switch_to_kernel)
         leaq  VCPU_trap_bounce(%rbx),%rdx
         /* TB_eip = (32-bit syscall && syscall32_addr) ?
          *          syscall32_addr : syscall_addr */
@@ -113,23 +112,22 @@ restore_all_xen:
  * When entering SYSCALL from user mode:
  *  Vector directly to the registered arch.syscall_addr.
  *
- * Initial work is done by per-CPU stack trampolines. At this point %rsp
- * has been initialised to point at the correct Xen stack, and %rsp, %rflags
- * and %cs have been saved. All other registers are still to be saved onto
- * the stack, starting with %rip, and an appropriate %ss must be saved into
- * the space left by the trampoline.
+ * Initial work is done by per-CPU trampolines. At this point %rsp has been
+ * initialised to point at the correct Xen stack, %rsp has been saved, and
+ * %rax needs to be restored from the %ss save slot. All other registers are
+ * still to be saved onto the stack, starting with RFLAGS, and an appropriate
+ * %ss must be saved into the space left by the trampoline.
  */
-ENTRY(syscall_enter)
+ENTRY(lstar_enter)
         sti
-        movl  $FLAT_KERNEL_SS,24(%rsp)
+        movq  8(%rsp),%rax /* Restore %rax. */
+        movq  $FLAT_KERNEL_SS,8(%rsp)
+        pushq %r11
+        pushq $FLAT_KERNEL_CS64
         pushq %rcx
         pushq $0
-        movq  24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */
         SAVE_VOLATILE TRAP_syscall
         GET_CURRENT(%rbx)
-        movq  VCPU_domain(%rbx),%rcx
-        testb $1,DOMAIN_is_32bit_pv(%rcx)
-        jnz   compat_syscall
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -337,70 +337,78 @@ unsigned long do_iret(void)
     return 0;
 }
 
-static int write_stack_trampoline(
-    char *stack, char *stack_bottom, uint16_t cs_seg)
+static unsigned int write_stub_trampoline(
+    unsigned char *stub, unsigned long stub_va,
+    unsigned long stack_bottom, unsigned long target_va)
 {
-    /* movq %rsp, saversp(%rip) */
-    stack[0] = 0x48;
-    stack[1] = 0x89;
-    stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
-
-    /* leaq saversp(%rip), %rsp */
-    stack[7] = 0x48;
-    stack[8] = 0x8d;
-    stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
-
-    /* pushq %r11 */
-    stack[14] = 0x41;
-    stack[15] = 0x53;
-
-    /* pushq $<cs_seg> */
-    stack[16] = 0x68;
-    *(u32 *)&stack[17] = cs_seg;
-
-    /* movq $syscall_enter,%r11 */
-    stack[21] = 0x49;
-    stack[22] = 0xbb;
-    *(void **)&stack[23] = (void *)syscall_enter;
-
-    /* jmpq *%r11 */
-    stack[31] = 0x41;
-    stack[32] = 0xff;
-    stack[33] = 0xe3;
+    /* movabsq %rax, stack_bottom - 8 */
+    stub[0] = 0x48;
+    stub[1] = 0xa3;
+    *(uint64_t *)&stub[2] = stack_bottom - 8;
+
+    /* movq %rsp, %rax */
+    stub[10] = 0x48;
+    stub[11] = 0x89;
+    stub[12] = 0xe0;
+
+    /* movabsq $stack_bottom - 8, %rsp */
+    stub[13] = 0x48;
+    stub[14] = 0xbc;
+    *(uint64_t *)&stub[15] = stack_bottom - 8;
+
+    /* pushq %rax */
+    stub[23] = 0x50;
+
+    /* jmp target_va */
+    stub[24] = 0xe9;
+    *(int32_t *)&stub[25] = target_va - (stub_va + 29);
 
-    return 34;
+    /* Round up to a multiple of 16 bytes. */
+    return 32;
 }
 
+DEFINE_PER_CPU(struct stubs, stubs);
+void lstar_enter(void);
+void cstar_enter(void);
+
 void __devinit subarch_percpu_traps_init(void)
 {
-    char *stack_bottom, *stack;
-
-    stack_bottom = (char *)get_stack_bottom();
-    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
+    unsigned long stack_bottom = get_stack_bottom();
+    unsigned long stub_va = this_cpu(stubs.addr);
+    unsigned char *stub_page;
+    unsigned int offset;
 
     /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */
     BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE);
 
-    /* Trampoline for SYSCALL entry from long mode. */
-    stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
-    wrmsrl(MSR_LSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
+    stub_page = map_domain_page(this_cpu(stubs.mfn));
+
+    /* Trampoline for SYSCALL entry from 64-bit mode. */
+    wrmsrl(MSR_LSTAR, stub_va);
+    offset = write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                   stub_va, stack_bottom,
+                                   (unsigned long)lstar_enter);
+    stub_va += offset;
 
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
          boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR )
     {
         /* SYSENTER entry. */
-        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+        wrmsrl(MSR_IA32_SYSENTER_ESP, stack_bottom);
         wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
         wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
     }
 
     /* Trampoline for SYSCALL entry from compatibility mode. */
-    stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
-    wrmsrl(MSR_CSTAR, (unsigned long)stack);
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+    wrmsrl(MSR_CSTAR, stub_va);
+    offset += write_stub_trampoline(stub_page + (stub_va & ~PAGE_MASK),
+                                    stub_va, stack_bottom,
+                                    (unsigned long)cstar_enter);
+
+    /* Don't consume more than half of the stub space here. */
+    ASSERT(offset <= STUB_BUF_SIZE / 2);
+
+    unmap_domain_page(stub_page);
 
     /* Common SYSCALL parameters. */
     wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -217,4 +217,7 @@ SECTIONS
   .comment 0 : { *(.comment) }
 }
 
+ASSERT(__image_base__ > XEN_VIRT_START ||
+       _end <= XEN_VIRT_END - NR_CPUS * PAGE_SIZE,
+       "Xen image overlaps stubs area")
 ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -94,6 +94,10 @@
 /* Primary stack is restricted to 8kB by guard pages. */
 #define PRIMARY_STACK_SIZE 8192
 
+/* Total size of syscall and emulation stubs. */
+#define STUB_BUF_SHIFT max(L1_CACHE_SHIFT, 7)
+#define STUB_BUF_SIZE  (1 << STUB_BUF_SHIFT)
+
 /* Return value for zero-size _xmalloc(), distinguished from NULL. */
 #define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL)
 
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -321,10 +321,10 @@ void efi_update_l4_pgtable(unsigned int 
 #define _PAGE_GNTTAB   0
 #endif
 
-#define __PAGE_HYPERVISOR \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_HYPERVISOR_NOCACHE \
-    (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
+                                   _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR | _PAGE_PCD)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
 
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -532,12 +532,24 @@ void trap_nop(void);
 void enable_nmis(void);
 void do_reserved_trap(struct cpu_user_regs *regs);
 
-void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
 void compat_hypercall(void);
 void int80_direct_trap(void);
 
+#define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)
+
+struct stubs {
+    union {
+        void(*func)(void);
+        unsigned long addr;
+    };
+    unsigned long mfn;
+};
+
+DECLARE_PER_CPU(struct stubs, stubs);
+unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn);
+
 extern int hypercall(void);
 
 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -148,6 +148,7 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
 
 #endif /* __X86_64_PAGE_H__ */




* [PATCH v2 2/4] x86emul: move stubs off the stack
  2015-05-21 10:07 [PATCH v2 0/4] x86: don't default to executable mappings Jan Beulich
  2015-05-21 10:15 ` [PATCH v2 1/4] x86: move syscall trampolines off the stack Jan Beulich
@ 2015-05-21 10:16 ` Jan Beulich
  2015-05-21 11:10   ` Andrew Cooper
  2015-05-21 10:17 ` [PATCH v2 3/4] x86: move I/O emulation " Jan Beulich
  2015-05-21 10:17 ` [PATCH v2 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 10:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

This is needed as stacks are going to become non-executable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Limit the testing tool's stub buffer size to MAX_INST_LEN + 1. Fix
    formatting.
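
As background for the emulate_fpu_insn_stub() rework below: the number
of instruction bytes is obtained via sizeof() on a compound literal,
and a one-byte RET (0xc3) is appended so the copied instruction can be
called through the stub. A user-space sketch of just that step
(stub_buf and load_stub are illustrative names; the real code writes
into the mapped per-CPU stub page rather than a plain array):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative fixed-size buffer standing in for the per-CPU stub area. */
static uint8_t stub_buf[32];

/*
 * Copy the instruction bytes plus a trailing RET (0xc3) into the stub
 * buffer and report how many instruction bytes there were, mirroring
 * the sizeof((uint8_t[]){ _bytes }) trick used in the patch.
 */
#define load_stub(_bytes...) ({                                   \
    unsigned int nr_ = sizeof((uint8_t[]){ _bytes });             \
    memcpy(stub_buf, ((uint8_t[]){ _bytes, 0xc3 }), nr_ + 1);     \
    nr_;                                                          \
})

int main(void)
{
    /* "fwait" is a single byte; the stub then holds 0x9b 0xc3. */
    unsigned int nr = load_stub(0x9b);

    printf("insn bytes: %u, stub: %02x %02x\n", nr, stub_buf[0], stub_buf[1]);
    return 0;
}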

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -17,4 +17,8 @@ typedef bool bool_t;
 #define __packed __attribute__((packed))
 
 #include "x86_emulate/x86_emulate.h"
+
+#define get_stub(stb) ((void *)((stb).addr = (uintptr_t)(stb).buf))
+#define put_stub(stb)
+
 #include "x86_emulate/x86_emulate.c"
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -9,6 +9,7 @@
  *    Keir Fraser <keir@xen.org>
  */
 
+#include <xen/domain_page.h>
 #include <asm/x86_emulate.h>
 #include <asm/asm_defns.h> /* mark_regs_dirty() */
 #include <asm/processor.h> /* current_cpu_info */
@@ -17,8 +18,22 @@
 /* Avoid namespace pollution. */
 #undef cmpxchg
 #undef cpuid
+#undef wbinvd
 
 #define cpu_has_amd_erratum(nr) \
         cpu_has_amd_erratum(&current_cpu_data, AMD_ERRATUM_##nr)
 
+#define get_stub(stb) ({                                   \
+    (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2; \
+    ((stb).ptr = map_domain_page(this_cpu(stubs.mfn))) +   \
+        ((stb).addr & ~PAGE_MASK);                         \
+})
+#define put_stub(stb) ({                                   \
+    if ( (stb).ptr )                                       \
+    {                                                      \
+        unmap_domain_page((stb).ptr);                      \
+        (stb).ptr = NULL;                                  \
+    }                                                      \
+})
+
 #include "x86_emulate/x86_emulate.c"
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -717,11 +717,15 @@ do{ struct fpu_insn_ctxt fic;           
 } while (0)
 
 #define emulate_fpu_insn_stub(_bytes...)                                \
-do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
-    struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
+do {                                                                    \
+    uint8_t *buf = get_stub(stub);                                      \
+    unsigned int _nr = sizeof((uint8_t[]){ _bytes });                   \
+    struct fpu_insn_ctxt fic = { .insn_bytes = _nr };                   \
+    memcpy(buf, ((uint8_t[]){ _bytes, 0xc3 }), _nr + 1);                \
     get_fpu(X86EMUL_FPU_fpu, &fic);                                     \
-    (*(void(*)(void))stub)();                                           \
+    stub.func();                                                        \
     put_fpu(&fic);                                                      \
+    put_stub(stub);                                                     \
 } while (0)
 
 static unsigned long _get_rep_prefix(
@@ -1458,6 +1462,7 @@ x86_emulate(
     struct operand src = { .reg = REG_POISON };
     struct operand dst = { .reg = REG_POISON };
     enum x86_swint_type swint_type;
+    struct x86_emulate_stub stub = {};
     DECLARE_ALIGNED(mmval_t, mmval);
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -3792,6 +3797,7 @@ x86_emulate(
 
  done:
     _put_fpu();
+    put_stub(stub);
     return rc;
 
  twobyte_insn:
@@ -4007,9 +4013,15 @@ x86_emulate(
                /* {,v}movss xmm,xmm/m32 */
                /* {,v}movsd xmm,xmm/m64 */
     {
-        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
-        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
+        uint8_t *buf = get_stub(stub);
+        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
 
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        buf[5] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -4017,7 +4029,7 @@ x86_emulate(
             else
                 vcpu_must_have_sse();
             ea.bytes = 16;
-            SET_SSE_PREFIX(stub[0], vex.pfx);
+            SET_SSE_PREFIX(buf[0], vex.pfx);
             get_fpu(X86EMUL_FPU_xmm, &fic);
         }
         else
@@ -4044,15 +4056,16 @@ x86_emulate(
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
-            stub[4] &= 0x38;
+            buf[4] &= 0x38;
         }
         if ( !rc )
         {
-           copy_REX_VEX(stub, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+           copy_REX_VEX(buf, rex_prefix, vex);
+           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
                                      : "memory" );
         }
         put_fpu(&fic);
+        put_stub(stub);
         if ( !rc && (b & 1) && (ea.type == OP_MEM) )
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
@@ -4242,9 +4255,15 @@ x86_emulate(
                /* {,v}movdq{a,u} xmm,xmm/m128 */
                /* vmovdq{a,u} ymm,ymm/m256 */
     {
-        uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 };
-        struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
+        uint8_t *buf = get_stub(stub);
+        struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
 
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        buf[5] = 0xc3;
         if ( vex.opcx == vex_none )
         {
             switch ( vex.pfx )
@@ -4252,7 +4271,7 @@ x86_emulate(
             case vex_66:
             case vex_f3:
                 vcpu_must_have_sse2();
-                stub[0] = 0x66; /* movdqa */
+                buf[0] = 0x66; /* movdqa */
                 get_fpu(X86EMUL_FPU_xmm, &fic);
                 ea.bytes = 16;
                 break;
@@ -4288,15 +4307,16 @@ x86_emulate(
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
-            stub[4] &= 0x38;
+            buf[4] &= 0x38;
         }
         if ( !rc )
         {
-           copy_REX_VEX(stub, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp)
+           copy_REX_VEX(buf, rex_prefix, vex);
+           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
                                      : "memory" );
         }
         put_fpu(&fic);
+        put_stub(stub);
         if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
             rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
                             ea.bytes, ctxt);
@@ -4638,5 +4658,6 @@ x86_emulate(
 
  cannot_emulate:
     _put_fpu();
+    put_stub(stub);
     return X86EMUL_UNHANDLEABLE;
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -429,6 +429,19 @@ struct x86_emulate_ctxt
     } retire;
 };
 
+struct x86_emulate_stub {
+    union {
+        void (*func)(void);
+        uintptr_t addr;
+    };
+#ifdef __XEN__
+    void *ptr;
+#else
+    /* Room for one insn and a (single byte) RET. */
+    uint8_t buf[MAX_INST_LEN + 1];
+#endif
+};
+
 /*
  * x86_emulate: Emulate an instruction.
  * Returns -1 on failure, 0 on success.




* [PATCH v2 3/4] x86: move I/O emulation stubs off the stack
  2015-05-21 10:07 [PATCH v2 0/4] x86: don't default to executable mappings Jan Beulich
  2015-05-21 10:15 ` [PATCH v2 1/4] x86: move syscall trampolines off the stack Jan Beulich
  2015-05-21 10:16 ` [PATCH v2 2/4] x86emul: move stubs " Jan Beulich
@ 2015-05-21 10:17 ` Jan Beulich
  2015-05-21 11:12   ` Andrew Cooper
  2015-05-21 10:17 ` [PATCH v2 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 10:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

This is needed as stacks are going to become non-executable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Use ~PAGE_MASK instead of PAGE_SIZE - 1 for page offset masking.
    Comment adjustments.
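
The essential point below is the split between where the stub bytes
are written and where they are executed: writes go through the
writable map_domain_page() mapping of the stub MFN, while io_emul
points at the VA in this_cpu(stubs.addr). A rough Linux/x86-64
user-space analogue of that write-alias / execute-alias pattern, using
memfd_create() and mmap() as stand-ins for the Xen mappings (not Xen
code):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    int fd = memfd_create("stub-page", 0);

    if ( fd < 0 || ftruncate(fd, page) )
        return 1;

    /* Writable alias: stands in for map_domain_page(this_cpu(stubs.mfn)). */
    unsigned char *wr = mmap(NULL, page, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
    /* Executable alias: stands in for the mapping at this_cpu(stubs.addr). */
    unsigned char *rx = mmap(NULL, page, PROT_READ | PROT_EXEC,
                             MAP_SHARED, fd, 0);

    if ( wr == MAP_FAILED || rx == MAP_FAILED )
        return 1;

    /* movl $42, %eax ; ret -- written through the writable alias only... */
    static const unsigned char code[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
    memcpy(wr, code, sizeof(code));

    /* ...and called through the executable alias only. */
    int (*stub)(void) = (int (*)(void))rx;
    printf("stub returned %d\n", stub());

    munmap(wr, page);
    munmap(rx, page);
    close(fd);
    return 0;
}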

--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2006,7 +2006,7 @@ static int emulate_privileged_op(struct 
                            ? (*(u32 *)&regs->reg = (val)) \
                            : (*(u16 *)&regs->reg = (val)))
     unsigned long code_base, code_limit;
-    char io_emul_stub[32];
+    char *io_emul_stub = NULL;
     void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1)));
     uint64_t val;
 
@@ -2191,10 +2191,13 @@ static int emulate_privileged_op(struct 
 
     /*
      * Very likely to be an I/O instruction (IN/OUT).
-     * Build an on-stack stub to execute the instruction with full guest
-     * GPR context. This is needed for some systems which (ab)use IN/OUT
+     * Build a stub to execute the instruction with full guest GPR
+     * context. This is needed for some systems which (ab)use IN/OUT
      * to communicate with BIOS code in system-management mode.
      */
+    io_emul_stub = map_domain_page(this_cpu(stubs.mfn)) +
+                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
+                   STUB_BUF_SIZE / 2;
     /* movq $host_to_guest_gpr_switch,%rcx */
     io_emul_stub[0] = 0x48;
     io_emul_stub[1] = 0xb9;
@@ -2212,7 +2215,7 @@ static int emulate_privileged_op(struct 
     io_emul_stub[15] = 0xc3;
 
     /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)io_emul_stub;
+    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
 
     if ( ioemul_handle_quirk )
         ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
@@ -2777,9 +2780,13 @@ static int emulate_privileged_op(struct 
  done:
     instruction_done(regs, eip, bpmatch);
  skip:
+    if ( io_emul_stub )
+        unmap_domain_page(io_emul_stub);
     return EXCRET_fault_fixed;
 
  fail:
+    if ( io_emul_stub )
+        unmap_domain_page(io_emul_stub);
     return 0;
 }
 





* [PATCH v2 4/4] x86: switch default mapping attributes to non-executable
  2015-05-21 10:07 [PATCH v2 0/4] x86: don't default to executable mappings Jan Beulich
                   ` (2 preceding siblings ...)
  2015-05-21 10:17 ` [PATCH v2 3/4] x86: move I/O emulation " Jan Beulich
@ 2015-05-21 10:17 ` Jan Beulich
  2015-05-21 11:14   ` Andrew Cooper
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 10:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

Only a very limited subset of mappings needs to be executable; in
particular the direct mapping should not be executable, so as to
limit the damage attackers can cause by exploiting security-relevant
bugs.

The EFI change also includes an adjustment to set NX only when it is
supported by the hardware.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Log a message indicating NX protection state.
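
A compile-and-run sketch of how the reworked attribute sets compose,
with the architectural PTE bit values hard-coded and cpu_has_nx
reduced to a plain variable (in Xen it is a CPUID-derived feature
flag); this mirrors the page.h changes below but is not the actual
header:

#include <stdio.h>

/* Architectural x86 PTE flag bits. */
#define _PAGE_PRESENT  0x001UL
#define _PAGE_RW       0x002UL
#define _PAGE_ACCESSED 0x020UL
#define _PAGE_DIRTY    0x040UL
#define _PAGE_NX_BIT   (1UL << 63)

/* Stand-in for Xen's cpu_has_nx (a CPUID-derived feature flag there). */
static int cpu_has_nx = 1;

/* NX can only be set when the hardware supports it. */
#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0)

/* Attribute sets as (re)defined by this patch, sans _PAGE_GLOBAL variants. */
#define __PAGE_HYPERVISOR_RX  (_PAGE_PRESENT | _PAGE_ACCESSED)
#define __PAGE_HYPERVISOR     (__PAGE_HYPERVISOR_RX | _PAGE_DIRTY | _PAGE_RW)
#define __PAGE_HYPERVISOR_RO  (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
#define __PAGE_HYPERVISOR_RW  (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY | _PAGE_RW)

int main(void)
{
    printf("RX  (exec)        = %#lx\n", (unsigned long)__PAGE_HYPERVISOR_RX);
    printf("RWX (old default) = %#lx\n", (unsigned long)__PAGE_HYPERVISOR);
    printf("RO                = %#lx\n", (unsigned long)__PAGE_HYPERVISOR_RO);
    printf("RW  (new default) = %#lx\n", (unsigned long)__PAGE_HYPERVISOR_RW);

    /* Without NX support the RO/RW sets silently lose the NX bit. */
    cpu_has_nx = 0;
    printf("RW, no NX         = %#lx\n", (unsigned long)__PAGE_HYPERVISOR_RW);
    return 0;
}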

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -293,7 +293,7 @@ struct vcpu_guest_context *alloc_vcpu_gu
             free_vcpu_guest_context(NULL);
             return NULL;
         }
-        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR);
+        __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR_RW);
         per_cpu(vgc_pages[i], cpu) = pg;
     }
     return (void *)fix_to_virt(idx);
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -160,7 +160,7 @@ void *map_domain_page(unsigned long mfn)
 
     spin_unlock(&dcache->lock);
 
-    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR_RW));
 
  out:
     local_irq_restore(flags);
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4416,7 +4416,7 @@ long set_gdt(struct vcpu *v, 
     for ( i = 0; i < nr_pages; i++ )
     {
         v->arch.pv_vcpu.gdt_frames[i] = frames[i];
-        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR));
+        l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR_RW));
     }
 
     xfree(pfns);
@@ -6007,7 +6007,7 @@ int create_perdomain_mapping(struct doma
                 if ( !IS_NIL(ppg) )
                     *ppg++ = pg;
                 l1tab[l1_table_offset(va)] =
-                    l1e_from_page(pg, __PAGE_HYPERVISOR | _PAGE_AVAIL0);
+                    l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
                 l2e_add_flags(*pl2e, _PAGE_AVAIL0);
             }
             else
@@ -6136,7 +6136,7 @@ void memguard_init(void)
         (unsigned long)__va(start),
         start >> PAGE_SHIFT,
         (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
-        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
+        __PAGE_HYPERVISOR_RW|MAP_SMALL_PAGES);
     BUG_ON(start != xen_phys_start);
     map_pages_to_xen(
         XEN_VIRT_START,
@@ -6149,7 +6149,7 @@ static void __memguard_change_range(void
 {
     unsigned long _p = (unsigned long)p;
     unsigned long _l = (unsigned long)l;
-    unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES;
+    unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
 
     /* Ensure we are dealing with a page-aligned whole number of pages. */
     ASSERT((_p&~PAGE_MASK) == 0);
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -900,7 +900,7 @@ void __init noreturn __start_xen(unsigne
             /* The only data mappings to be relocated are in the Xen area. */
             pl2e = __va(__pa(l2_xenmap));
             *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
-                                   PAGE_HYPERVISOR | _PAGE_PSE);
+                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
             for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
             {
                 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
@@ -1087,7 +1087,7 @@ void __init noreturn __start_xen(unsigne
             /* This range must not be passed to the boot allocator and
              * must also not be mapped with _PAGE_GLOBAL. */
             map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
-                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR);
+                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
         }
         if ( s < map_s )
         {
@@ -1423,6 +1423,10 @@ void __init noreturn __start_xen(unsigne
     if ( cpu_has_smap )
         write_cr4(read_cr4() & ~X86_CR4_SMAP);
 
+    printk("%sNX (Execute Disable) protection %sactive\n",
+           cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
+           cpu_has_nx ? "" : "not ");
+
     /*
      * We're going to setup domain0 using the module(s) that we stashed safely
      * above our heap. The second module, if present, is an initrd ramdisk.
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -895,6 +895,33 @@ void __init subarch_init_memory(void)
             share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
         }
     }
+
+    /* Mark low 16Mb of direct map NX if hardware supports it. */
+    if ( !cpu_has_nx )
+        return;
+
+    v = DIRECTMAP_VIRT_START + (1UL << 20);
+    l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[l3_table_offset(v)];
+    ASSERT(l3e_get_flags(l3e) & _PAGE_PRESENT);
+    do {
+        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
+        ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
+        if ( l2e_get_flags(l2e) & _PAGE_PSE )
+        {
+            l2e_add_flags(l2e, _PAGE_NX_BIT);
+            l3e_to_l2e(l3e)[l2_table_offset(v)] = l2e;
+            v += 1 << L2_PAGETABLE_SHIFT;
+        }
+        else
+        {
+            l1_pgentry_t l1e = l2e_to_l1e(l2e)[l1_table_offset(v)];
+
+            ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
+            l1e_add_flags(l1e, _PAGE_NX_BIT);
+            l2e_to_l1e(l2e)[l1_table_offset(v)] = l1e;
+            v += 1 << L1_PAGETABLE_SHIFT;
+        }
+    } while ( v < DIRECTMAP_VIRT_START + (16UL << 20) );
 }
 
 long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -1359,7 +1386,7 @@ int memory_add(unsigned long spfn, unsig
         if ( i < spfn )
             i = spfn;
         ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
-                               epfn - i, __PAGE_HYPERVISOR);
+                               epfn - i, __PAGE_HYPERVISOR_RW);
         if ( ret )
             return ret;
     }
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -1162,7 +1162,7 @@ void __init efi_init_memory(void)
         EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i;
         u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT;
         unsigned long smfn, emfn;
-        unsigned int prot = PAGE_HYPERVISOR;
+        unsigned int prot = PAGE_HYPERVISOR_RWX;
 
         printk(XENLOG_INFO " %013" PRIx64 "-%013" PRIx64
                            " type=%u attr=%016" PRIx64 "\n",
@@ -1195,7 +1195,7 @@ void __init efi_init_memory(void)
         if ( desc->Attribute & EFI_MEMORY_WP )
             prot &= _PAGE_RW;
         if ( desc->Attribute & EFI_MEMORY_XP )
-            prot |= _PAGE_NX_BIT;
+            prot |= _PAGE_NX;
 
         if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
              !(smfn & pfn_hole_mask) &&
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -304,7 +304,8 @@ void efi_update_l4_pgtable(unsigned int 
 #define _PAGE_AVAIL1   _AC(0x400,U)
 #define _PAGE_AVAIL2   _AC(0x800,U)
 #define _PAGE_AVAIL    _AC(0xE00,U)
-#define _PAGE_PSE_PAT _AC(0x1000,U)
+#define _PAGE_PSE_PAT  _AC(0x1000,U)
+#define _PAGE_NX       (cpu_has_nx ? _PAGE_NX_BIT : 0)
 /* non-architectural flags */
 #define _PAGE_PAGED   0x2000U
 #define _PAGE_SHARED  0x4000U
@@ -321,6 +322,9 @@ void efi_update_l4_pgtable(unsigned int 
 #define _PAGE_GNTTAB   0
 #endif
 
+#define __PAGE_HYPERVISOR_RO      (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_NX)
+#define __PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RO | \
+                                   _PAGE_DIRTY | _PAGE_RW)
 #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
 #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
                                    _PAGE_DIRTY | _PAGE_RW)
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -147,9 +147,20 @@ typedef l4_pgentry_t root_pgentry_t;
  */
 #define _PAGE_GUEST_KERNEL (1U<<12)
 
-#define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RO      (__PAGE_HYPERVISOR_RO      | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RW      (__PAGE_HYPERVISOR_RW      | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_RX      (__PAGE_HYPERVISOR_RX      | _PAGE_GLOBAL)
-#define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#define PAGE_HYPERVISOR_RWX     (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
+
+#ifdef __ASSEMBLY__
+/* Dependency on NX being available can't be expressed. */
+# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RWX
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
+#else
+# define PAGE_HYPERVISOR         PAGE_HYPERVISOR_RW
+# define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | \
+                                  _PAGE_GLOBAL | _PAGE_NX)
+#endif
 
 #endif /* __X86_64_PAGE_H__ */
 




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-21 10:15 ` [PATCH v2 1/4] x86: move syscall trampolines off the stack Jan Beulich
@ 2015-05-21 11:08   ` Andrew Cooper
  2015-05-21 11:48     ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Andrew Cooper @ 2015-05-21 11:08 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 21/05/15 11:15, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable. Use
> separate stub pages (shared among suitable CPUs on the same node)
> instead.
>
> Stub areas (currently 128 bytes each) are being split into two parts -
> a fixed usage one (the syscall ones) and dynamically usable space,
> which will be used by subsequent changes to hold dynamically generated
> code during instruction emulation.
>
> While sharing physical pages among certain CPUs on the same node, for
> now the virtual mappings get established in distinct pages for each
> CPU. This isn't a strict requirement, but simplifies VA space
> management for this initial implementation: Sharing VA space would
> require additional tracking of which areas are currently in use. If
> the VA and/or TLB overhead turned out to be a problem, such extra code
> could easily be added.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

It might be wise to have a BUILD_BUG_ON() which confirms that
STUBS_PER_PAGE is a power of two, which is a requirement given the way
it is used.
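
For illustration, such a check can be a one-liner (a sketch only; whether it
can stay a BUILD_BUG_ON() or has to degrade to a runtime ASSERT() hinges on
STUBS_PER_PAGE being a constant expression, which is what the follow-ups
below end up discussing):

    /* Rejects zero and any non-power-of-two value at build time. */
    BUILD_BUG_ON(STUBS_PER_PAGE == 0 ||
                 (STUBS_PER_PAGE & (STUBS_PER_PAGE - 1)) != 0);

    /* Runtime fallback if the value isn't a compile-time constant. */
    ASSERT(STUBS_PER_PAGE && !(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1)));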

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 2/4] x86emul: move stubs off the stack
  2015-05-21 10:16 ` [PATCH v2 2/4] x86emul: move stubs " Jan Beulich
@ 2015-05-21 11:10   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2015-05-21 11:10 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 21/05/15 11:16, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 3/4] x86: move I/O emulation stubs off the stack
  2015-05-21 10:17 ` [PATCH v2 3/4] x86: move I/O emulation " Jan Beulich
@ 2015-05-21 11:12   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2015-05-21 11:12 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 21/05/15 11:17, Jan Beulich wrote:
> This is needed as stacks are going to become non-executable.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 4/4] x86: switch default mapping attributes to non-executable
  2015-05-21 10:17 ` [PATCH v2 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
@ 2015-05-21 11:14   ` Andrew Cooper
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2015-05-21 11:14 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 21/05/15 11:17, Jan Beulich wrote:
> Only a very limited subset of mappings need to be done as executable
> ones; in particular the direct mapping should not be executable to
> limit the damage attackers can cause by exploiting security relevant
> bugs.
>
> The EFI change at once includes an adjustment to set NX only when
> supported by the hardware.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
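
The "only when supported by the hardware" aspect comes from the redefined
_PAGE_NX (see the page.h hunk), which evaluates to the real NX bit only when
cpu_has_nx is set and to 0 otherwise - on CPUs without NX, bit 63 of a PTE is
reserved and setting it would fault. A minimal, purely illustrative use, with
va, mfn and nr as placeholders rather than anything taken from the patch:

    /* RW data mapping; the NX bit is applied only on NX-capable hardware. */
    int rc = map_pages_to_xen(va, mfn, nr, __PAGE_HYPERVISOR_RW);

    if ( rc )
        return rc;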

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-21 11:08   ` Andrew Cooper
@ 2015-05-21 11:48     ` Jan Beulich
  2015-05-21 11:50       ` Andrew Cooper
  2015-05-22  8:25       ` Jan Beulich
  0 siblings, 2 replies; 14+ messages in thread
From: Jan Beulich @ 2015-05-21 11:48 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 21.05.15 at 13:08, <andrew.cooper3@citrix.com> wrote:
> On 21/05/15 11:15, Jan Beulich wrote:
>> This is needed as stacks are going to become non-executable. Use
>> separate stub pages (shared among suitable CPUs on the same node)
>> instead.
>>
>> Stub areas (currently 128 bytes each) are being split into two parts -
>> a fixed usage one (the syscall ones) and dynamically usable space,
>> which will be used by subsequent changes to hold dynamically generated
>> code during instruction emulation.
>>
>> While sharing physical pages among certain CPUs on the same node, for
>> now the virtual mappings get established in distinct pages for each
>> CPU. This isn't a strict requirement, but simplifies VA space
>> management for this initial implementation: Sharing VA space would
>> require additional tracking of which areas are currently in use. If
>> the VA and/or TLB overhead turned out to be a problem, such extra code
>> could easily be added.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
> 
> It might be wise to have a BUILD_BUG_ON() which confirms that
> STUBS_PER_PAGE is a power of two, which is a requirement given the way
> it is used.

Good idea. Along with that one I'm also adding

    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < MAX_INST_LEN + 1);

to patch 2 and

    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);

to patch 3, in the hope that this won't invalidate your R-b.
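
As a quick sanity check of the margins involved (using the 128 byte stub
area size from patch 1, and assuming MAX_INST_LEN is the usual x86 limit
of 15):

    STUB_BUF_SIZE / 2 = 128 / 2 = 64
    MAX_INST_LEN + 1  =  15 + 1 = 16

so both of the added checks have plenty of headroom with the current layout.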

Jan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-21 11:48     ` Jan Beulich
@ 2015-05-21 11:50       ` Andrew Cooper
  2015-05-22  8:25       ` Jan Beulich
  1 sibling, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2015-05-21 11:50 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 21/05/15 12:48, Jan Beulich wrote:
>>>> On 21.05.15 at 13:08, <andrew.cooper3@citrix.com> wrote:
>> On 21/05/15 11:15, Jan Beulich wrote:
>>> This is needed as stacks are going to become non-executable. Use
>>> separate stub pages (shared among suitable CPUs on the same node)
>>> instead.
>>>
>>> Stub areas (currently 128 bytes each) are being split into two parts -
>>> a fixed usage one (the syscall ones) and dynamically usable space,
>>> which will be used by subsequent changes to hold dynamically generated
>>> code during instruction emulation.
>>>
>>> While sharing physical pages among certain CPUs on the same node, for
>>> now the virtual mappings get established in distinct pages for each
>>> CPU. This isn't a strict requirement, but simplifies VA space
>>> management for this initial implementation: Sharing VA space would
>>> require additional tracking of which areas are currently in use. If
>>> the VA and/or TLB overhead turned out to be a problem, such extra code
>>> could easily be added.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>
>> It might be wise to have a BUILD_BUG_ON() which confirms that
>> STUBS_PER_PAGE is a power of two, which is a requirement given the way
>> it is used.
> Good idea. Along with that one I'm also adding
>
>     BUILD_BUG_ON(STUB_BUF_SIZE / 2 < MAX_INST_LEN + 1);
>
> to patch 2 and
>
>     BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
>
> to patch 3, in the hope that this won't invalidate your R-b.

Both fine by me

~Andrew

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-21 11:48     ` Jan Beulich
  2015-05-21 11:50       ` Andrew Cooper
@ 2015-05-22  8:25       ` Jan Beulich
  2015-05-22  8:34         ` Jan Beulich
  2015-05-22  8:35         ` Andrew Cooper
  1 sibling, 2 replies; 14+ messages in thread
From: Jan Beulich @ 2015-05-22  8:25 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 21.05.15 at 13:48, <JBeulich@suse.com> wrote:
>>>> On 21.05.15 at 13:08, <andrew.cooper3@citrix.com> wrote:
>> On 21/05/15 11:15, Jan Beulich wrote:
>>> This is needed as stacks are going to become non-executable. Use
>>> separate stub pages (shared among suitable CPUs on the same node)
>>> instead.
>>>
>>> Stub areas (currently 128 bytes each) are being split into two parts -
>>> a fixed usage one (the syscall ones) and dynamically usable space,
>>> which will be used by subsequent changes to hold dynamically generated
>>> code during instruction emulation.
>>>
>>> While sharing physical pages among certain CPUs on the same node, for
>>> now the virtual mappings get established in distinct pages for each
>>> CPU. This isn't a strict requirement, but simplifies VA space
>>> management for this initial implementation: Sharing VA space would
>>> require additional tracking of which areas are currently in use. If
>>> the VA and/or TLB overhead turned out to be a problem, such extra code
>>> could easily be added.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> 
>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> 
>> It might be wise to have a BUILD_BUG_ON() which confirms that
>> STUBS_PER_PAGE is a power of two, which is a requirement given the way
>> it is used.
> 
> Good idea.

Sadly this can't be a BUILD_BUG_ON(), as STUBS_PER_PAGE isn't a
compile time constant (due to the use of max()). I made it an
ASSERT() for the time being.
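
(For context, the max() in question is the usual statement-expression kind,
roughly - not quoting the exact definition:

    #define max(x, y)                 \
    ({                                \
        const typeof(x) _x = (x);     \
        const typeof(y) _y = (y);     \
        _x > _y ? _x : _y;            \
    })

and a GNU statement expression is never an integer constant expression, so
nothing derived from it can be fed to BUILD_BUG_ON().)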

Jan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-22  8:25       ` Jan Beulich
@ 2015-05-22  8:34         ` Jan Beulich
  2015-05-22  8:35         ` Andrew Cooper
  1 sibling, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2015-05-22  8:34 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 22.05.15 at 10:25, <JBeulich@suse.com> wrote:
>>>> On 21.05.15 at 13:48, <JBeulich@suse.com> wrote:
>>>>> On 21.05.15 at 13:08, <andrew.cooper3@citrix.com> wrote:
>>> It might be wise to have a BUILD_BUG_ON() which confirms that
>>> STUBS_PER_PAGE is a power of two, which is a requirement given the way
>>> it is used.
>> 
>> Good idea.
> 
> Sadly this can't be a BUILD_BUG_ON(), as STUBS_PER_PAGE isn't a
> compile time constant (due to the use of max()). I made it an
> ASSERT() for the time being.

Or actually no: since the other two would need to become ASSERT()s too,
I'd rather open-code the max() instead.
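
Open-coded, that would be something along these lines - an illustrative
guess at the shape, not the committed change, with STUB_BUF_SHIFT and
L1_CACHE_SHIFT as stand-ins for whatever the sizing is actually derived
from:

    /* Open-coded max(): a plain ternary stays a constant expression. */
    #define STUB_BUF_SHIFT (L1_CACHE_SHIFT > 7 ? L1_CACHE_SHIFT : 7)
    #define STUB_BUF_SIZE  (1 << STUB_BUF_SHIFT)
    #define STUBS_PER_PAGE (PAGE_SIZE / STUB_BUF_SIZE)

after which all three checks discussed above can stay build-time ones.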

Jan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 1/4] x86: move syscall trampolines off the stack
  2015-05-22  8:25       ` Jan Beulich
  2015-05-22  8:34         ` Jan Beulich
@ 2015-05-22  8:35         ` Andrew Cooper
  1 sibling, 0 replies; 14+ messages in thread
From: Andrew Cooper @ 2015-05-22  8:35 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Keir Fraser

On 22/05/2015 09:25, Jan Beulich wrote:
>>>> On 21.05.15 at 13:48, <JBeulich@suse.com> wrote:
>>>>> On 21.05.15 at 13:08, <andrew.cooper3@citrix.com> wrote:
>>> On 21/05/15 11:15, Jan Beulich wrote:
>>>> This is needed as stacks are going to become non-executable. Use
>>>> separate stub pages (shared among suitable CPUs on the same node)
>>>> instead.
>>>>
>>>> Stub areas (currently 128 bytes each) are being split into two parts -
>>>> a fixed usage one (the syscall ones) and dynamically usable space,
>>>> which will be used by subsequent changes to hold dynamically generated
>>>> code during instruction emulation.
>>>>
>>>> While sharing physical pages among certain CPUs on the same node, for
>>>> now the virtual mappings get established in distinct pages for each
>>>> CPU. This isn't a strict requirement, but simplifies VA space
>>>> management for this initial implementation: Sharing VA space would
>>>> require additional tracking of which areas are currently in use. If
>>>> the VA and/or TLB overhead turned out to be a problem, such extra code
>>>> could easily be added.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>>>
>>> It might be wise to have a BUILD_BUG_ON() which confirms that
>>> STUBS_PER_PAGE is a power of two, which is a requirement given the way
>>> it is used.
>> Good idea.
> Sadly this can't be a BUILD_BUG_ON(), as STUBS_PER_PAGE isn't a
> compile time constant (due to the use of max()). I made it an
> ASSERT() for the time being.

In some copious free time, I shall see about borrowing some of the Linux
constants infrastructure.

We have quite a few examples of calculations which the compiler should be
able to evaluate, but can't given our current setup.

~Andrew

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-05-22  8:35 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-21 10:07 [PATCH v2 0/4] x86: don't default to executable mappings Jan Beulich
2015-05-21 10:15 ` [PATCH v2 1/4] x86: move syscall trampolines off the stack Jan Beulich
2015-05-21 11:08   ` Andrew Cooper
2015-05-21 11:48     ` Jan Beulich
2015-05-21 11:50       ` Andrew Cooper
2015-05-22  8:25       ` Jan Beulich
2015-05-22  8:34         ` Jan Beulich
2015-05-22  8:35         ` Andrew Cooper
2015-05-21 10:16 ` [PATCH v2 2/4] x86emul: move stubs " Jan Beulich
2015-05-21 11:10   ` Andrew Cooper
2015-05-21 10:17 ` [PATCH v2 3/4] x86: move I/O emulation " Jan Beulich
2015-05-21 11:12   ` Andrew Cooper
2015-05-21 10:17 ` [PATCH v2 4/4] x86: switch default mapping attributes to non-executable Jan Beulich
2015-05-21 11:14   ` Andrew Cooper
