* [PATCH 0/4] x86: reduce Meltdown band-aid overhead a little further
@ 2018-01-23 10:33 Jan Beulich
  2018-01-23 10:36 ` [PATCH 1/4] x86: remove CR reads from exit-to-guest path Jan Beulich
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Jan Beulich @ 2018-01-23 10:33 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Prereq: x86: slightly reduce Meltdown band-aid overhead
(https://lists.xenproject.org/archives/html/xen-devel/2018-01/msg01609.html)

The overall effect of this series is smaller than that of the single
patch above (up to 1% in my measurements; note, though, that instead of
the almost 40% performance loss George has seen, that system showed
"only" a 10-13% decline), but it is perhaps still worth considering.

1: remove CR reads from exit-to-guest path
2: eliminate most XPTI entry/exit code when it's not in use
3: re-organize toggle_guest_*()
4: avoid double CR3 reload when switching to guest user mode

Signed-off-by: Jan Beulich <jbeulich@suse.com>



* [PATCH 1/4] x86: remove CR reads from exit-to-guest path
  2018-01-23 10:33 [PATCH 0/4] x86: reduce Meltdown band-aid overhead a little further Jan Beulich
@ 2018-01-23 10:36 ` Jan Beulich
  2018-01-30 11:01   ` Andrew Cooper
  2018-01-23 10:37 ` [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use Jan Beulich
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-23 10:36 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

CR3 is - during normal operation - only ever loaded from v->arch.cr3,
so there's no need to read the actual control register. For CR4 we can
generally use the cached value on all synchronous entry and exit paths.
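
A minimal C sketch of the idea (the function names are illustrative and
not part of this patch; it merely relies on a per-CPU cr4 slot in
struct cpu_info, i.e. what STACK_CPUINFO_FIELD(cr4) below refers to, and
on the get_cpu_info() accessor):

/* Sketch only: read CR4 from the per-CPU cache rather than from the
 * control register, and keep that cache current on every write. */
static inline unsigned long cached_cr4(void)
{
    return get_cpu_info()->cr4;                      /* no "mov %cr4, <reg>" */
}

static inline void cached_write_cr4(unsigned long val)
{
    get_cpu_info()->cr4 = val;                       /* update the cache ... */
    asm volatile ( "mov %0, %%cr4" : : "r" (val) );  /* ... and the register */
}

With such a cache in place, the exit path below can pull the value via
STACK_CPUINFO_FIELD(cr4) and hand it to write_cr3 instead of reading %cr4.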

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -88,6 +88,7 @@ void __dummy__(void)
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.pv_vcpu.kernel_ss);
     OFFSET(VCPU_iopl, struct vcpu, arch.pv_vcpu.iopl);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.vgc_flags);
+    OFFSET(VCPU_cr3, struct vcpu, arch.cr3);
     OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
     OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
     OFFSET(VCPU_nmi_old_mask, struct vcpu, nmi_state.old_mask);
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -39,7 +39,7 @@ restore_all_guest:
         ASSERT_INTERRUPTS_DISABLED
 
         /* Copy guest mappings and switch to per-CPU root page table. */
-        mov   %cr3, %r9
+        mov   VCPU_cr3(%rbx), %r9
         GET_STACK_END(dx)
         mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
         movabs $PADDR_MASK & PAGE_MASK, %rsi
@@ -61,6 +61,7 @@ restore_all_guest:
         sub   $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
                 ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
         rep movsq
+        mov   STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
         mov   %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
         write_cr3 rax, rdi, rsi
 .Lrag_keep_cr3:
@@ -112,6 +113,7 @@ restore_all_xen:
          * so "g" will have to do.
          */
 UNLIKELY_START(g, exit_cr3)
+        mov   %cr4, %rdi
         write_cr3 rax, rdi, rsi
 UNLIKELY_END(exit_cr3)
 
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -206,13 +206,12 @@ void ret_from_intr(void);
 #define ASM_STAC ASM_AC(STAC)
 #define ASM_CLAC ASM_AC(CLAC)
 
-.macro write_cr3 val:req, tmp1:req, tmp2:req
-        mov   %cr4, %\tmp1
-        mov   %\tmp1, %\tmp2
-        and   $~X86_CR4_PGE, %\tmp1
-        mov   %\tmp1, %cr4
+.macro write_cr3 val:req, cr4:req, tmp:req
+        mov   %\cr4, %\tmp
+        and   $~X86_CR4_PGE, %\cr4
+        mov   %\cr4, %cr4
         mov   %\val, %cr3
-        mov   %\tmp2, %cr4
+        mov   %\tmp, %cr4
 .endm
 
 #define CR4_PV32_RESTORE                                           \





* [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use
  2018-01-23 10:33 [PATCH 0/4] x86: reduce Meltdown band-aid overhead a little further Jan Beulich
  2018-01-23 10:36 ` [PATCH 1/4] x86: remove CR reads from exit-to-guest path Jan Beulich
@ 2018-01-23 10:37 ` Jan Beulich
  2018-01-30 12:02   ` Andrew Cooper
  2018-01-23 10:38 ` [PATCH 3/4] x86: re-organize toggle_guest_*() Jan Beulich
  2018-01-23 10:38 ` [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode Jan Beulich
  3 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-23 10:37 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Introduce a synthetic feature flag to use alternative instruction
patching to NOP out all code on entry/exit paths other than those
involved in NMI/#MC handling (the patching logic can't properly handle
those paths yet). Having NOPs here is generally better than using
conditional branches.

Also change the limit on the number of bytes we can patch in one go to
that resulting from the encoding in struct alt_instr - there's no point
reducing it below that limit, and without a check being in place that
the limit isn't actually exceeded, such an artificial boundary is a
latent risk.
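
For illustration, a rough sketch of where that limit comes from and of how
a zero-length replacement NOPs a patch site out (the struct layout and the
names are illustrative, not copied from the tree; it assumes Xen-style
boot_cpu_has()/add_nops() helpers):

/* Each patch site is described by a small record; the 8-bit length
 * fields are what bound a single site to 255 bytes. */
struct alt_instr_sketch {
    int32_t  orig_offset;   /* original instructions, self-relative offset */
    int32_t  repl_offset;   /* replacement instructions, self-relative offset */
    uint16_t cpuid;         /* feature bit (possibly synthetic) to test */
    uint8_t  orig_len;      /* length of the original block - at most 255 */
    uint8_t  repl_len;      /* length of the replacement; 0 = pure NOP-out */
};

/* Simplified patching loop: when the feature bit is set, the replacement
 * (possibly nothing at all) is copied over the original code and whatever
 * is left of the original range is filled with NOPs. */
static void apply_alternatives_sketch(struct alt_instr_sketch *a,
                                      struct alt_instr_sketch *end)
{
    for ( ; a < end; a++ )
    {
        uint8_t *orig = (uint8_t *)&a->orig_offset + a->orig_offset;
        const uint8_t *repl = (const uint8_t *)&a->repl_offset + a->repl_offset;

        if ( !boot_cpu_has(a->cpuid) )
            continue;                  /* leave the original code in place */

        memcpy(orig, repl, a->repl_len);
        add_nops(orig + a->repl_len, a->orig_len - a->repl_len);
    }
}

The altinstruction_entry uses added below, with a replacement length of 0,
are exactly the repl_len == 0 case: once X86_FEATURE_NO_XPTI is set, the
whole bracketed range turns into NOPs.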

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -24,7 +24,7 @@
 #include <asm/nmi.h>
 #include <xen/livepatch.h>
 
-#define MAX_PATCH_LEN (255-1)
+#define MAX_PATCH_LEN 255
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3654,7 +3654,7 @@ long do_mmu_update(
                     rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
                     if ( !rc )
-                        sync_guest = this_cpu(root_pgt);
+                        sync_guest = !cpu_has_no_xpti;
                     break;
 
                 case PGT_writable_page:
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -169,6 +169,9 @@ static int __init parse_smap_param(const
 }
 custom_param("smap", parse_smap_param);
 
+static int8_t __initdata opt_xpti = -1;
+boolean_param("xpti", opt_xpti);
+
 bool __read_mostly acpi_disabled;
 bool __initdata acpi_force;
 static char __initdata acpi_param[10] = "";
@@ -1540,6 +1543,13 @@ void __init noreturn __start_xen(unsigne
 
     cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
 
+    if ( opt_xpti < 0 )
+        opt_xpti = boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
+    if ( opt_xpti )
+        setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
+    else
+        setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
+
     if ( cpu_has_fsgsbase )
         set_in_cr4(X86_CR4_FSGSBASE);
 
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -735,8 +735,6 @@ static int clone_mapping(const void *ptr
     return 0;
 }
 
-static __read_mostly int8_t opt_xpti = -1;
-boolean_param("xpti", opt_xpti);
 DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
 
 static int setup_cpu_root_pgt(unsigned int cpu)
@@ -745,7 +743,7 @@ static int setup_cpu_root_pgt(unsigned i
     unsigned int off;
     int rc;
 
-    if ( !opt_xpti )
+    if ( cpu_has_no_xpti )
         return 0;
 
     rpt = alloc_xen_pagetable();
@@ -999,9 +997,6 @@ void __init smp_prepare_cpus(unsigned in
 
     stack_base[0] = stack_start;
 
-    if ( opt_xpti < 0 )
-        opt_xpti = boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
-
     rc = setup_cpu_root_pgt(0);
     if ( rc )
         panic("Error %d setting up PV root page table\n", rc);
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
 
 /* See lstar_enter for entry register state. */
 ENTRY(cstar_enter)
-        /* sti could live here when we don't switch page tables below. */
+        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
         CR4_PV32_RESTORE
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
@@ -201,6 +201,7 @@ ENTRY(cstar_enter)
         SAVE_ALL
 
         GET_STACK_END(bx)
+.Lcstar_cr3_start:
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
         neg   %rcx
         jz    .Lcstar_cr3_okay
@@ -210,6 +211,12 @@ ENTRY(cstar_enter)
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lcstar_cr3_okay:
         sti
+.Lcstar_cr3_end:
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lcstar_cr3_start, .Lcstar_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lcstar_cr3_end - .Lcstar_cr3_start), 0
+        .popsection
 
         __GET_CURRENT(bx)
         movq  VCPU_domain(%rbx),%rcx
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -39,6 +39,7 @@ restore_all_guest:
         ASSERT_INTERRUPTS_DISABLED
 
         /* Copy guest mappings and switch to per-CPU root page table. */
+.Lrag_cr3_start:
         mov   VCPU_cr3(%rbx), %r9
         GET_STACK_END(dx)
         mov   STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
@@ -46,7 +47,6 @@ restore_all_guest:
         movabs $DIRECTMAP_VIRT_START, %rcx
         mov   %rdi, %rax
         and   %rsi, %rdi
-        jz    .Lrag_keep_cr3
         and   %r9, %rsi
         add   %rcx, %rdi
         add   %rcx, %rsi
@@ -64,7 +64,12 @@ restore_all_guest:
         mov   STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
         mov   %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
         write_cr3 rax, rdi, rsi
-.Lrag_keep_cr3:
+.Lrag_cr3_end:
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lrag_cr3_start, .Lrag_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lrag_cr3_end - .Lrag_cr3_start), 0
+        .popsection
 
         RESTORE_ALL
         testw $TRAP_syscall,4(%rsp)
@@ -137,7 +142,7 @@ UNLIKELY_END(exit_cr3)
  * %ss must be saved into the space left by the trampoline.
  */
 ENTRY(lstar_enter)
-        /* sti could live here when we don't switch page tables below. */
+        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
         pushq %r11
@@ -148,6 +153,7 @@ ENTRY(lstar_enter)
         SAVE_ALL
 
         GET_STACK_END(bx)
+.Llstar_cr3_start:
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
         neg   %rcx
         jz    .Llstar_cr3_okay
@@ -157,6 +163,12 @@ ENTRY(lstar_enter)
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Llstar_cr3_okay:
         sti
+.Llstar_cr3_end:
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Llstar_cr3_start, .Llstar_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Llstar_cr3_end - .Llstar_cr3_start), 0
+        .popsection
 
         __GET_CURRENT(bx)
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
@@ -239,7 +251,7 @@ process_trap:
         jmp  test_all_events
 
 ENTRY(sysenter_entry)
-        /* sti could live here when we don't switch page tables below. */
+        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
         pushq $FLAT_USER_SS
         pushq $0
         pushfq
@@ -252,6 +264,7 @@ GLOBAL(sysenter_eflags_saved)
         SAVE_ALL
 
         GET_STACK_END(bx)
+.Lsyse_cr3_start:
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
         neg   %rcx
         jz    .Lsyse_cr3_okay
@@ -261,6 +274,12 @@ GLOBAL(sysenter_eflags_saved)
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lsyse_cr3_okay:
         sti
+.Lsyse_cr3_end:
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lsyse_cr3_start, .Lsyse_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lsyse_cr3_end - .Lsyse_cr3_start), 0
+        .popsection
 
         __GET_CURRENT(bx)
         cmpb  $0,VCPU_sysenter_disables_events(%rbx)
@@ -299,6 +318,7 @@ ENTRY(int80_direct_trap)
         SAVE_ALL
 
         GET_STACK_END(bx)
+.Lint80_cr3_start:
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
         neg   %rcx
         jz    .Lint80_cr3_okay
@@ -308,6 +328,12 @@ ENTRY(int80_direct_trap)
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lint80_cr3_okay:
         sti
+.Lint80_cr3_end:
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lint80_cr3_start, .Lint80_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lint80_cr3_end - .Lint80_cr3_start), 0
+        .popsection
 
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
@@ -473,6 +499,7 @@ ENTRY(dom_crash_sync_extable)
 ENTRY(common_interrupt)
         SAVE_ALL CLAC
 
+.Lintr_cr3_start:
         GET_STACK_END(14)
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
         mov   %rcx, %r15
@@ -492,9 +519,20 @@ ENTRY(common_interrupt)
         CR4_PV32_RESTORE
         movq %rsp,%rdi
         callq do_IRQ
+.Lintr_cr3_restore:
         mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+.Lintr_cr3_end:
         jmp ret_from_intr
 
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lintr_cr3_restore, .Lintr_cr3_restore, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lintr_cr3_end - .Lintr_cr3_restore), 0
+        altinstruction_entry .Lintr_cr3_start, .Lintr_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lintr_cr3_okay - .Lintr_cr3_start), 0
+        .popsection
+
 /* No special register assumptions. */
 ENTRY(ret_from_intr)
         GET_CURRENT(bx)
@@ -511,6 +549,7 @@ ENTRY(page_fault)
 GLOBAL(handle_exception)
         SAVE_ALL CLAC
 
+.Lxcpt_cr3_start:
         GET_STACK_END(14)
         mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
         mov   %rcx, %r15
@@ -592,7 +631,9 @@ handle_exception_saved:
         PERFC_INCR(exceptions, %rax, %rbx)
         mov   (%rdx, %rax, 8), %rdx
         INDIRECT_CALL %rdx
+.Lxcpt_cr3_restore1:
         mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+.Lxcpt_cr3_end1:
         testb $3,UREGS_cs(%rsp)
         jz    restore_all_xen
         leaq  VCPU_trap_bounce(%rbx),%rdx
@@ -625,9 +666,23 @@ exception_with_ints_disabled:
         rep;  movsq                     # make room for ec/ev
 1:      movq  UREGS_error_code(%rsp),%rax # ec/ev
         movq  %rax,UREGS_kernel_sizeof(%rsp)
+.Lxcpt_cr3_restore2:
         mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+.Lxcpt_cr3_end2:
         jmp   restore_all_xen           # return to fixup code
 
+        .pushsection .altinstructions, "a", @progbits
+        altinstruction_entry .Lxcpt_cr3_restore1, .Lxcpt_cr3_restore1, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lxcpt_cr3_end1 - .Lxcpt_cr3_restore1), 0
+        altinstruction_entry .Lxcpt_cr3_restore2, .Lxcpt_cr3_restore2, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lxcpt_cr3_end2 - .Lxcpt_cr3_restore2), 0
+        altinstruction_entry .Lxcpt_cr3_start, .Lxcpt_cr3_start, \
+                             X86_FEATURE_NO_XPTI, \
+                             (.Lxcpt_cr3_okay - .Lxcpt_cr3_start), 0
+        .popsection
+
 /* No special register assumptions. */
 FATAL_exception_with_ints_disabled:
         xorl  %esi,%esi
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -105,6 +105,7 @@
 #define cpu_has_cpuid_faulting  boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
 #define cpu_has_aperfmperf      boot_cpu_has(X86_FEATURE_APERFMPERF)
 #define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
+#define cpu_has_no_xpti         boot_cpu_has(X86_FEATURE_NO_XPTI)
 
 enum _cache_type {
     CACHE_TYPE_NULL = 0,
--- a/xen/include/asm-x86/cpufeatures.h
+++ b/xen/include/asm-x86/cpufeatures.h
@@ -28,3 +28,4 @@ XEN_CPUFEATURE(IND_THUNK_JMP,   (FSCAPIN
 XEN_CPUFEATURE(XEN_IBPB,        (FSCAPINTS+0)*32+15) /* IBRSB || IBPB */
 XEN_CPUFEATURE(XEN_IBRS_SET,    (FSCAPINTS+0)*32+16) /* IBRSB && IRBS set in Xen */
 XEN_CPUFEATURE(XEN_IBRS_CLEAR,  (FSCAPINTS+0)*32+17) /* IBRSB && IBRS clear in Xen */
+XEN_CPUFEATURE(NO_XPTI,         (FSCAPINTS+0)*32+18) /* XPTI mitigation not in use */




* [PATCH 3/4] x86: re-organize toggle_guest_*()
  2018-01-23 10:33 [PATCH 0/4] x86: reduce Meltdown band-aid overhead a little further Jan Beulich
  2018-01-23 10:36 ` [PATCH 1/4] x86: remove CR reads from exit-to-guest path Jan Beulich
  2018-01-23 10:37 ` [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use Jan Beulich
@ 2018-01-23 10:38 ` Jan Beulich
  2018-01-30 13:45   ` Andrew Cooper
  2018-01-23 10:38 ` [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode Jan Beulich
  3 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-23 10:38 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

toggle_guest_mode() is only ever called for 64-bit PV vCPU-s - replace
the 32-bit PV conditional with an ASSERT().

Introduce a local helper without 32-bit PV conditional, to be used by
both pre-existing functions.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -220,28 +220,8 @@ int pv_domain_initialise(struct domain *
     return rc;
 }
 
-void toggle_guest_mode(struct vcpu *v)
-{
-    if ( is_pv_32bit_vcpu(v) )
-        return;
-
-    if ( cpu_has_fsgsbase )
-    {
-        if ( v->arch.flags & TF_kernel_mode )
-            v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
-        else
-            v->arch.pv_vcpu.gs_base_user = __rdgsbase();
-    }
-    asm volatile ( "swapgs" );
-
-    toggle_guest_pt(v);
-}
-
-void toggle_guest_pt(struct vcpu *v)
+static void _toggle_guest_pt(struct vcpu *v)
 {
-    if ( is_pv_32bit_vcpu(v) )
-        return;
-
     v->arch.flags ^= TF_kernel_mode;
     update_cr3(v);
     /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
@@ -260,6 +240,28 @@ void toggle_guest_pt(struct vcpu *v)
         v->arch.pv_vcpu.pending_system_time.version = 0;
 }
 
+void toggle_guest_mode(struct vcpu *v)
+{
+    ASSERT(!is_pv_32bit_vcpu(v));
+
+    if ( cpu_has_fsgsbase )
+    {
+        if ( v->arch.flags & TF_kernel_mode )
+            v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
+        else
+            v->arch.pv_vcpu.gs_base_user = __rdgsbase();
+    }
+    asm volatile ( "swapgs" );
+
+    _toggle_guest_pt(v);
+}
+
+void toggle_guest_pt(struct vcpu *v)
+{
+    if ( !is_pv_32bit_vcpu(v) )
+        _toggle_guest_pt(v);
+}
+
 /*
  * Local variables:
  * mode: C





* [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode
  2018-01-23 10:33 [PATCH 0/4] x86: reduce Meltdown band-aid overhead a little further Jan Beulich
                   ` (2 preceding siblings ...)
  2018-01-23 10:38 ` [PATCH 3/4] x86: re-organize toggle_guest_*() Jan Beulich
@ 2018-01-23 10:38 ` Jan Beulich
  2018-01-30 14:29   ` Andrew Cooper
  3 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-23 10:38 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

When XPTI is active, the CR3 load in restore_all_guest is sufficient
when switching to user mode, improving in particular system call and
page fault exit paths for the guest.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -220,10 +220,20 @@ int pv_domain_initialise(struct domain *
     return rc;
 }
 
-static void _toggle_guest_pt(struct vcpu *v)
+static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
 {
     v->arch.flags ^= TF_kernel_mode;
     update_cr3(v);
+
+    /*
+     * There's no need to load CR3 here when it is going to be loaded on the
+     * way out to guest mode again anyway, and when the page tables we're
+     * currently on are the kernel ones (whereas when switching to kernel
+     * mode we need to be able to write a bounce frame onto the kernel stack).
+     */
+    if ( !force_cr3 && !(v->arch.flags & TF_kernel_mode) )
+        return;
+
     /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
     asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
 
@@ -253,13 +263,13 @@ void toggle_guest_mode(struct vcpu *v)
     }
     asm volatile ( "swapgs" );
 
-    _toggle_guest_pt(v);
+    _toggle_guest_pt(v, cpu_has_no_xpti);
 }
 
 void toggle_guest_pt(struct vcpu *v)
 {
     if ( !is_pv_32bit_vcpu(v) )
-        _toggle_guest_pt(v);
+        _toggle_guest_pt(v, true);
 }
 
 /*





* Re: [PATCH 1/4] x86: remove CR reads from exit-to-guest path
  2018-01-23 10:36 ` [PATCH 1/4] x86: remove CR reads from exit-to-guest path Jan Beulich
@ 2018-01-30 11:01   ` Andrew Cooper
  2018-01-30 11:10     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2018-01-30 11:01 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 23/01/18 10:36, Jan Beulich wrote:
> --- a/xen/include/asm-x86/asm_defns.h
> +++ b/xen/include/asm-x86/asm_defns.h
> @@ -206,13 +206,12 @@ void ret_from_intr(void);
>  #define ASM_STAC ASM_AC(STAC)
>  #define ASM_CLAC ASM_AC(CLAC)
>  
> -.macro write_cr3 val:req, tmp1:req, tmp2:req
> -        mov   %cr4, %\tmp1
> -        mov   %\tmp1, %\tmp2
> -        and   $~X86_CR4_PGE, %\tmp1
> -        mov   %\tmp1, %cr4
> +.macro write_cr3 val:req, cr4:req, tmp:req
> +        mov   %\cr4, %\tmp
> +        and   $~X86_CR4_PGE, %\cr4
> +        mov   %\cr4, %cr4

This is confusing to read; it took me a while to work out why it
assembled in the first place.  Given that there are only two instances
of write_cr3 now, I'd suggest expanding this at the two call sites.  That
will also make it clear which registers have real values and which are
temporary, which isn't clear from the current call sites.

Otherwise, Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH 1/4] x86: remove CR reads from exit-to-guest path
  2018-01-30 11:01   ` Andrew Cooper
@ 2018-01-30 11:10     ` Jan Beulich
  2018-02-05 16:47       ` Andrew Cooper
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-30 11:10 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 30.01.18 at 12:01, <andrew.cooper3@citrix.com> wrote:
> On 23/01/18 10:36, Jan Beulich wrote:
>> --- a/xen/include/asm-x86/asm_defns.h
>> +++ b/xen/include/asm-x86/asm_defns.h
>> @@ -206,13 +206,12 @@ void ret_from_intr(void);
>>  #define ASM_STAC ASM_AC(STAC)
>>  #define ASM_CLAC ASM_AC(CLAC)
>>  
>> -.macro write_cr3 val:req, tmp1:req, tmp2:req
>> -        mov   %cr4, %\tmp1
>> -        mov   %\tmp1, %\tmp2
>> -        and   $~X86_CR4_PGE, %\tmp1
>> -        mov   %\tmp1, %cr4
>> +.macro write_cr3 val:req, cr4:req, tmp:req
>> +        mov   %\cr4, %\tmp
>> +        and   $~X86_CR4_PGE, %\cr4
>> +        mov   %\cr4, %cr4
> 
> This is confusing to read; It took me a while to work out why it
> assembled in the first place.  Given that there are only two instances
> of write_cr3 now, I'd suggest expanding this in the two sites.  It will
> also make it clear which registers have real values and which are
> temporary, which isn't clear from the current callsites.

Hmm, part of the reason I didn't want to drop the macro altogether
is its similarity with write_cr3(), so it would turn up in grep-s for that
one. How about switching the two use sites to specify named
arguments to the macro?

Jan



* Re: [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use
  2018-01-23 10:37 ` [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use Jan Beulich
@ 2018-01-30 12:02   ` Andrew Cooper
  2018-01-30 13:51     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2018-01-30 12:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 23/01/18 10:37, Jan Beulich wrote:
> Introduce a synthetic feature flag to use alternative instruction
> patching to NOP out all code on entry/exit paths other than those
> involved in NMI/#MC handling (the patching logic can't properly handle
> those paths yet). Having NOPs here is generally better than using
> conditional branches.

Given my other series, I'd prefer to fix the IST paths rather than
introduce yet-more workarounds.

> --- a/xen/arch/x86/x86_64/compat/entry.S
> +++ b/xen/arch/x86/x86_64/compat/entry.S
> @@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
>  
>  /* See lstar_enter for entry register state. */
>  ENTRY(cstar_enter)
> -        /* sti could live here when we don't switch page tables below. */
> +        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI

I do not think the complexity of altering the position of sti
outweighs the fractional extra delay which would result from
unilaterally having the sti later.  Furthermore, if you really are
concerned about microoptimising this, you don't want a single-byte nop here.

>          CR4_PV32_RESTORE
>          movq  8(%rsp),%rax /* Restore %rax. */
>          movq  $FLAT_KERNEL_SS,8(%rsp)
> @@ -201,6 +201,7 @@ ENTRY(cstar_enter)
>          SAVE_ALL
>  
>          GET_STACK_END(bx)
> +.Lcstar_cr3_start:
>          mov   STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
>          neg   %rcx
>          jz    .Lcstar_cr3_okay
> @@ -210,6 +211,12 @@ ENTRY(cstar_enter)
>          movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
>  .Lcstar_cr3_okay:
>          sti
> +.Lcstar_cr3_end:
> +        .pushsection .altinstructions, "a", @progbits
> +        altinstruction_entry .Lcstar_cr3_start, .Lcstar_cr3_start, \
> +                             X86_FEATURE_NO_XPTI, \
> +                             (.Lcstar_cr3_end - .Lcstar_cr3_start), 0
> +        .popsection

It occurs to me that this would be far more legible if we had an alt_nop
wrapper.  Reusing .Lcstar_cr3_start and a length of 0 isn't obvious.

> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -46,7 +47,6 @@ restore_all_guest:
>          movabs $DIRECTMAP_VIRT_START, %rcx
>          mov   %rdi, %rax
>          and   %rsi, %rdi
> -        jz    .Lrag_keep_cr3

This looks like a functional change?

>          and   %r9, %rsi
>          add   %rcx, %rdi
>          add   %rcx, %rsi
> @@ -473,6 +499,7 @@ ENTRY(dom_crash_sync_extable)
>  ENTRY(common_interrupt)
>          SAVE_ALL CLAC
>  
> +.Lintr_cr3_start:
>          GET_STACK_END(14)
>          mov   STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
>          mov   %rcx, %r15
> @@ -492,9 +519,20 @@ ENTRY(common_interrupt)
>          CR4_PV32_RESTORE
>          movq %rsp,%rdi
>          callq do_IRQ
> +.Lintr_cr3_restore:
>          mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
> +.Lintr_cr3_end:
>          jmp ret_from_intr
>  
> +        .pushsection .altinstructions, "a", @progbits
> +        altinstruction_entry .Lintr_cr3_restore, .Lintr_cr3_restore, \
> +                             X86_FEATURE_NO_XPTI, \
> +                             (.Lintr_cr3_end - .Lintr_cr3_restore), 0
> +        altinstruction_entry .Lintr_cr3_start, .Lintr_cr3_start, \
> +                             X86_FEATURE_NO_XPTI, \
> +                             (.Lintr_cr3_okay - .Lintr_cr3_start), 0

This is now getting very complicated to follow.  Is it just for IST
safety and liable to disappear?  If not, I think we need a different
way, as this is now saying "sporadic instructions inside this block, but
not all of them, turn into nops".

~Andrew


* Re: [PATCH 3/4] x86: re-organize toggle_guest_*()
  2018-01-23 10:38 ` [PATCH 3/4] x86: re-organize toggle_guest_*() Jan Beulich
@ 2018-01-30 13:45   ` Andrew Cooper
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Cooper @ 2018-01-30 13:45 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 23/01/18 10:38, Jan Beulich wrote:
> toggle_guest_mode() is only ever being called for 64-bit PV vCPU-s -
> replace the 32-bit PV conditional by an ASSERT().
>
> Introduce a local helper without 32-bit PV conditional, to be used by
> both pre-existing functions.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use
  2018-01-30 12:02   ` Andrew Cooper
@ 2018-01-30 13:51     ` Jan Beulich
  2018-02-05 17:28       ` Andrew Cooper
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-30 13:51 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 30.01.18 at 13:02, <andrew.cooper3@citrix.com> wrote:
> On 23/01/18 10:37, Jan Beulich wrote:
>> Introduce a synthetic feature flag to use alternative instruction
>> patching to NOP out all code on entry/exit paths other than those
>> involved in NMI/#MC handling (the patching logic can't properly handle
>> those paths yet). Having NOPs here is generally better than using
>> conditional branches.
> 
> Given my other series, I'd prefer to fix the IST paths rather than
> introduce yet-more workarounds.

More workarounds? The patch simply avoids touching the IST path.

>> --- a/xen/arch/x86/x86_64/compat/entry.S
>> +++ b/xen/arch/x86/x86_64/compat/entry.S
>> @@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
>>  
>>  /* See lstar_enter for entry register state. */
>>  ENTRY(cstar_enter)
>> -        /* sti could live here when we don't switch page tables below. */
>> +        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
> 
> I do not think the complexity of altering the position of sti
> outweighs the fractional extra delay which would result from
> unilaterally having the sti later.  Furthermore, if you really are
> concerned about microoptimising this, you don't want a single-byte nop here.
> 
>>          CR4_PV32_RESTORE

There is, not least, this CR4_PV32_RESTORE, which I'm afraid adds quite
a bit of delay. While we're not real-time ready, I don't think we should
needlessly delay the enabling of interrupts.

As to the single byte NOP - are you suggesting it is worse than
the single byte STI?

>> @@ -210,6 +211,12 @@ ENTRY(cstar_enter)
>>          movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
>>  .Lcstar_cr3_okay:
>>          sti
>> +.Lcstar_cr3_end:
>> +        .pushsection .altinstructions, "a", @progbits
>> +        altinstruction_entry .Lcstar_cr3_start, .Lcstar_cr3_start, \
>> +                             X86_FEATURE_NO_XPTI, \
>> +                             (.Lcstar_cr3_end - .Lcstar_cr3_start), 0
>> +        .popsection
> 
> It occurs to me that this would be far more legible if we had an alt_nop
> wrapper.  Reusing .Lcstar_cr3_start and a length of 0 isn't obvious.

Could certainly do that, but one thing at a time.

>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -46,7 +47,6 @@ restore_all_guest:
>>          movabs $DIRECTMAP_VIRT_START, %rcx
>>          mov   %rdi, %rax
>>          and   %rsi, %rdi
>> -        jz    .Lrag_keep_cr3
> 
> This looks like a functional change?

Definitely not - the conditional branch simply becomes unnecessary
when the entire piece of code gets NOP-ed out.

>> @@ -492,9 +519,20 @@ ENTRY(common_interrupt)
>>          CR4_PV32_RESTORE
>>          movq %rsp,%rdi
>>          callq do_IRQ
>> +.Lintr_cr3_restore:
>>          mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
>> +.Lintr_cr3_end:
>>          jmp ret_from_intr
>>  
>> +        .pushsection .altinstructions, "a", @progbits
>> +        altinstruction_entry .Lintr_cr3_restore, .Lintr_cr3_restore, \
>> +                             X86_FEATURE_NO_XPTI, \
>> +                             (.Lintr_cr3_end - .Lintr_cr3_restore), 0
>> +        altinstruction_entry .Lintr_cr3_start, .Lintr_cr3_start, \
>> +                             X86_FEATURE_NO_XPTI, \
>> +                             (.Lintr_cr3_okay - .Lintr_cr3_start), 0
> 
> This is now getting very complicated to follow.  Is it just for IST
> safety and liable to disappear?  If not, I think we need a different
> way,as this is now saying "sporadic instructions inside this block, but
> not all of them, turn into nops".

This is not an IST path, and it is also not NOP-ing out sporadic
instructions - we can't drop the first piece of code without also
dropping the second, as %r14 won't be set up if the first block
is gone. They're clearly framed by .Lintr_cr3_* labels - I'm not
sure how to make it even more obvious what's going on.

Jan



* Re: [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode
  2018-01-23 10:38 ` [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode Jan Beulich
@ 2018-01-30 14:29   ` Andrew Cooper
  2018-01-31 10:12     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2018-01-30 14:29 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 23/01/18 10:38, Jan Beulich wrote:
> When XPTI is active, the CR3 load in restore_all_guest is sufficient
> when switching to user mode, improving in particular system call and
> page fault exit paths for the guest.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

While I can see the utility of this, we are starting to get into
complicated territory as to which cr3 is loaded.  Also, the name
"toggle" is no longer strictly accurate.

That being said, all of these helpers are only used in synchronous
contexts as far as I can tell, so some ASSERT(!in_irq()) would probably
go a long way.
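
A minimal sketch of where such an assertion could sit (an illustration of
the suggestion only, not a posted patch; it assumes the usual
ASSERT()/in_irq() helpers):

static void _toggle_guest_pt(struct vcpu *v)
{
    ASSERT(!in_irq());             /* these helpers are synchronous-only */

    v->arch.flags ^= TF_kernel_mode;
    update_cr3(v);

    /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
    asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );

    /* ... remainder as in patch 3 ... */
}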

>
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -220,10 +220,20 @@ int pv_domain_initialise(struct domain *
>      return rc;
>  }
>  
> -static void _toggle_guest_pt(struct vcpu *v)
> +static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
>  {
>      v->arch.flags ^= TF_kernel_mode;
>      update_cr3(v);
> +
> +    /*
> +     * There's no need to load CR3 here when it is going to be loaded on the
> +     * way out to guest mode again anyway, and when the page tables we're
> +     * currently on are the kernel ones (whereas when switching to kernel
> +     * mode we need to be able to write a bounce frame onto the kernel stack).
> +     */
> +    if ( !force_cr3 && !(v->arch.flags & TF_kernel_mode) )
> +        return;
> +
>      /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
>      asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>  
> @@ -253,13 +263,13 @@ void toggle_guest_mode(struct vcpu *v)
>      }
>      asm volatile ( "swapgs" );
>  
> -    _toggle_guest_pt(v);
> +    _toggle_guest_pt(v, cpu_has_no_xpti);
>  }
>  
>  void toggle_guest_pt(struct vcpu *v)
>  {
>      if ( !is_pv_32bit_vcpu(v) )
> -        _toggle_guest_pt(v);
> +        _toggle_guest_pt(v, true);

This can be converted as well.  The only callers are the LDT-fault and
I/O perm check, both when we are currently on user pagetables, needing
to switch to kernel briefly, then back to user.

However, it would be better to drop the parameter and feed
cpu_has_no_xpti into the conditional above, which explains why it is safe
to do so.

~Andrew


* Re: [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode
  2018-01-30 14:29   ` Andrew Cooper
@ 2018-01-31 10:12     ` Jan Beulich
  2018-02-05 17:37       ` Andrew Cooper
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2018-01-31 10:12 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 30.01.18 at 15:29, <andrew.cooper3@citrix.com> wrote:
> On 23/01/18 10:38, Jan Beulich wrote:
>> When XPTI is active, the CR3 load in restore_all_guest is sufficient
>> when switching to user mode, improving in particular system call and
>> page fault exit paths for the guest.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> While I can see the utility of this, we are starting to get into
> complicated territory as to which cr3 is loaded.  Also, the name
> "toggle" is no longer strictly accurate.

Yes, the "which CR3 is loaded" is not very nice a situation. I
wouldn't mind this being dropped unless it can be proven by
someone else to have more than just a pretty marginal effect.
The one patch from this series I certainly want to see go in is
patch 2 (with whatever adjustments are necessary); all the
others are more or less optional, but I still wanted to post
them.

As to "toggle" in the name - the function still toggles page
tables in all cases (in the sense of what's stored in v->arch.cr3);
it just doesn't necessarily also load that value into CR3.

> That being said, all of these helpers are only used in synchronous
> contexts as far as I can tell, so some ASSERT(!in_irq()) would probably
> go a long way.

I can certainly do that.

>> --- a/xen/arch/x86/pv/domain.c
>> +++ b/xen/arch/x86/pv/domain.c
>> @@ -220,10 +220,20 @@ int pv_domain_initialise(struct domain *
>>      return rc;
>>  }
>>  
>> -static void _toggle_guest_pt(struct vcpu *v)
>> +static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
>>  {
>>      v->arch.flags ^= TF_kernel_mode;
>>      update_cr3(v);
>> +
>> +    /*
>> +     * There's no need to load CR3 here when it is going to be loaded on the
>> +     * way out to guest mode again anyway, and when the page tables we're
>> +     * currently on are the kernel ones (whereas when switching to kernel
>> +     * mode we need to be able to write a bounce frame onto the kernel stack).
>> +     */
>> +    if ( !force_cr3 && !(v->arch.flags & TF_kernel_mode) )
>> +        return;
>> +
>>      /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
>>      asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>>  
>> @@ -253,13 +263,13 @@ void toggle_guest_mode(struct vcpu *v)
>>      }
>>      asm volatile ( "swapgs" );
>>  
>> -    _toggle_guest_pt(v);
>> +    _toggle_guest_pt(v, cpu_has_no_xpti);
>>  }
>>  
>>  void toggle_guest_pt(struct vcpu *v)
>>  {
>>      if ( !is_pv_32bit_vcpu(v) )
>> -        _toggle_guest_pt(v);
>> +        _toggle_guest_pt(v, true);
> 
> This can be converted as well.  The only callers are the LDT-fault and
> I/O perm check, both when we are currently on user pagetables, needing
> to switch to kernel briefly, then back to user.

Converted to what? Those code paths certainly need CR3 to be
written, or else the actual memory accesses will fail.

> However, it would be better to drop the parameter and feed
> cpu_has_no_xpti into the conditional above which explains why it is safe
> to do.

As a result, I also don't understand this part.

Jan



* Re: [PATCH 1/4] x86: remove CR reads from exit-to-guest path
  2018-01-30 11:10     ` Jan Beulich
@ 2018-02-05 16:47       ` Andrew Cooper
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Cooper @ 2018-02-05 16:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 30/01/18 11:10, Jan Beulich wrote:
>>>> On 30.01.18 at 12:01, <andrew.cooper3@citrix.com> wrote:
>> On 23/01/18 10:36, Jan Beulich wrote:
>>> --- a/xen/include/asm-x86/asm_defns.h
>>> +++ b/xen/include/asm-x86/asm_defns.h
>>> @@ -206,13 +206,12 @@ void ret_from_intr(void);
>>>  #define ASM_STAC ASM_AC(STAC)
>>>  #define ASM_CLAC ASM_AC(CLAC)
>>>  
>>> -.macro write_cr3 val:req, tmp1:req, tmp2:req
>>> -        mov   %cr4, %\tmp1
>>> -        mov   %\tmp1, %\tmp2
>>> -        and   $~X86_CR4_PGE, %\tmp1
>>> -        mov   %\tmp1, %cr4
>>> +.macro write_cr3 val:req, cr4:req, tmp:req
>>> +        mov   %\cr4, %\tmp
>>> +        and   $~X86_CR4_PGE, %\cr4
>>> +        mov   %\cr4, %cr4
>> This is confusing to read; It took me a while to work out why it
>> assembled in the first place.  Given that there are only two instances
>> of write_cr3 now, I'd suggest expanding this in the two sites.  It will
>> also make it clear which registers have real values and which are
>> temporary, which isn't clear from the current callsites.
> Hmm, part of the reason I didn't want to drop the macro altogether
> is its similarity with write_cr3(), so it would turn up in grep-s for that
> one. How about switching the two use sites to specify named
> arguments to the macro?

Frankly, neither use is suitably named.  They are "switch to new cr3
and flush global pages", except that the asm version doesn't tick the
TLB clock.

Using explicitly named parameters would be an improvement over what is
here at the moment, but removing the macro would be better still.

~Andrew


* Re: [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use
  2018-01-30 13:51     ` Jan Beulich
@ 2018-02-05 17:28       ` Andrew Cooper
  2018-02-06  9:52         ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2018-02-05 17:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 30/01/18 13:51, Jan Beulich wrote:
>
>>> --- a/xen/arch/x86/x86_64/compat/entry.S
>>> +++ b/xen/arch/x86/x86_64/compat/entry.S
>>> @@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
>>>  
>>>  /* See lstar_enter for entry register state. */
>>>  ENTRY(cstar_enter)
>>> -        /* sti could live here when we don't switch page tables below. */
>>> +        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
>> I do not think the complexity of altering the position of sti
>> outweighs the fractional extra delay which would result from
>> unilaterally having the sti later.  Furthermore, if you really are
>> concerned about microoptimising this, you don't want a single-byte nop here.
>>
>>>          CR4_PV32_RESTORE
> There is, not the least, this, which I'm afraid is adding quite a bit
> of a delay. While we're not real-time ready, I don't think we should
> needlessly delay the enabling of interrupts.

The lion's share of the delay is in the serialising effects of the write
to cr4, which also blocks interrupts.

i.e. I don't agree with this argument.

>>> @@ -210,6 +211,12 @@ ENTRY(cstar_enter)
>>>          movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
>>>  .Lcstar_cr3_okay:
>>>          sti
>>> +.Lcstar_cr3_end:
>>> +        .pushsection .altinstructions, "a", @progbits
>>> +        altinstruction_entry .Lcstar_cr3_start, .Lcstar_cr3_start, \
>>> +                             X86_FEATURE_NO_XPTI, \
>>> +                             (.Lcstar_cr3_end - .Lcstar_cr3_start), 0
>>> +        .popsection
>> It occurs to me that this would be far more legible if we had an alt_nop
>> wrapper.  Reusing .Lcstar_cr3_start and a length of 0 isn't obvious.
> Could certainly do that, but one thing at a time.

The problem is that this logic is borderline unfollowable.

>>> --- a/xen/arch/x86/x86_64/entry.S
>>> +++ b/xen/arch/x86/x86_64/entry.S
>>> @@ -46,7 +47,6 @@ restore_all_guest:
>>>          movabs $DIRECTMAP_VIRT_START, %rcx
>>>          mov   %rdi, %rax
>>>          and   %rsi, %rdi
>>> -        jz    .Lrag_keep_cr3
>> This looks like a functional change?
> Definitely not - the conditional branch simply becomes unnecessary
> when the entire piece of code gets NOP-ed out.

Hmm.  That is not at all obvious.  What about cases where we'd want
to conditionally disable xpti on a per-domain basis?

>
>>> @@ -492,9 +519,20 @@ ENTRY(common_interrupt)
>>>          CR4_PV32_RESTORE
>>>          movq %rsp,%rdi
>>>          callq do_IRQ
>>> +.Lintr_cr3_restore:
>>>          mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
>>> +.Lintr_cr3_end:
>>>          jmp ret_from_intr
>>>  
>>> +        .pushsection .altinstructions, "a", @progbits
>>> +        altinstruction_entry .Lintr_cr3_restore, .Lintr_cr3_restore, \
>>> +                             X86_FEATURE_NO_XPTI, \
>>> +                             (.Lintr_cr3_end - .Lintr_cr3_restore), 0
>>> +        altinstruction_entry .Lintr_cr3_start, .Lintr_cr3_start, \
>>> +                             X86_FEATURE_NO_XPTI, \
>>> +                             (.Lintr_cr3_okay - .Lintr_cr3_start), 0
>> This is now getting very complicated to follow.  Is it just for IST
>> safety and liable to disappear?  If not, I think we need a different
>> way, as this is now saying "sporadic instructions inside this block, but
>> not all of them, turn into nops".
> This is not an IST path, and it is also not NOP-ing out sporadic
> instructions - we can't drop the first piece of code without also
> dropping the second, as %r14 won't be set up if the first block
> is gone. They're clearly framed by .Lintr_cr3_* labels - I'm not
> sure how to make even more obvious what's going on.

I know you're not a fan of my SPEC_CTRL macros, but they do make it very
clear what is going on in each configuration.

They are certainly clearer than this.

~Andrew


* Re: [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode
  2018-01-31 10:12     ` Jan Beulich
@ 2018-02-05 17:37       ` Andrew Cooper
  2018-02-06 10:01         ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2018-02-05 17:37 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On 31/01/18 10:12, Jan Beulich wrote:
>
>>> --- a/xen/arch/x86/pv/domain.c
>>> +++ b/xen/arch/x86/pv/domain.c
>>> @@ -220,10 +220,20 @@ int pv_domain_initialise(struct domain *
>>>      return rc;
>>>  }
>>>  
>>> -static void _toggle_guest_pt(struct vcpu *v)
>>> +static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
>>>  {
>>>      v->arch.flags ^= TF_kernel_mode;
>>>      update_cr3(v);
>>> +
>>> +    /*
>>> +     * There's no need to load CR3 here when it is going to be loaded on the
>>> +     * way out to guest mode again anyway, and when the page tables we're
>>> +     * currently on are the kernel ones (whereas when switching to kernel
>>> +     * mode we need to be able to write a bounce frame onto the kernel stack).
>>> +     */
>>> +    if ( !force_cr3 && !(v->arch.flags & TF_kernel_mode) )
>>> +        return;
>>> +
>>>      /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
>>>      asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>>>  
>>> @@ -253,13 +263,13 @@ void toggle_guest_mode(struct vcpu *v)
>>>      }
>>>      asm volatile ( "swapgs" );
>>>  
>>> -    _toggle_guest_pt(v);
>>> +    _toggle_guest_pt(v, cpu_has_no_xpti);
>>>  }
>>>  
>>>  void toggle_guest_pt(struct vcpu *v)
>>>  {
>>>      if ( !is_pv_32bit_vcpu(v) )
>>> -        _toggle_guest_pt(v);
>>> +        _toggle_guest_pt(v, true);
>> This can be converted as well.  The only callers are the LDT-fault and
>> I/O perm check, both when we are currently on user pagetables, needing
>> to switch to kernel briefly, then back to user.
> Converted to what?

_toggle_guest_pt(v, cpu_has_no_xpti);

> Those code paths certainly need CR3 to be
> written, or else the actual memory accesses will fail.

These are called in pairs, the first of which switches from the user to
the kernel pagetables, which fails your TF_kernel_mode fastpath
(because of the xor at the top), and the second of which moves from the
kernel back to user, which can in principle use the same fastpath.

>
>> However, it would be better to drop the parameter and feed
>> cpu_has_no_xpti into the conditional above which explains why it is safe
>> to do.
> As a result, I also don't understand this part.

The force_cr3 parameter is unnecessary, at which point it would be far
clearer to explicitly check against cpu_has_no_xpti for the fastpath.

~Andrew


* Re: [PATCH 2/4] x86: eliminate most XPTI entry/exit code when it's not in use
  2018-02-05 17:28       ` Andrew Cooper
@ 2018-02-06  9:52         ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2018-02-06  9:52 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 05.02.18 at 18:28, <andrew.cooper3@citrix.com> wrote:
> On 30/01/18 13:51, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/x86_64/compat/entry.S
>>>> +++ b/xen/arch/x86/x86_64/compat/entry.S
>>>> @@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
>>>>  
>>>>  /* See lstar_enter for entry register state. */
>>>>  ENTRY(cstar_enter)
>>>> -        /* sti could live here when we don't switch page tables below. */
>>>> +        ALTERNATIVE nop, sti, X86_FEATURE_NO_XPTI
>>> I do not think the complexity of altering the position of sti
>>> outweighs the fractional extra delay which would result from
>>> unilaterally having the sti later.  Furthermore, if you really are
>>> concerned about microoptimising this, you don't want a single-byte nop here.
>>>
>>>>          CR4_PV32_RESTORE
>> There is, not the least, this, which I'm afraid is adding quite a bit
>> of a delay. While we're not real-time ready, I don't think we should
>> needlessly delay the enabling of interrupts.
> 
> The lion's share of the delay is in the serialising effects of the write
> to cr4, which also blocks interrupts.

What the main cause of the delay is doesn't matter - interrupts
necessarily will be held off for the duration of the execution of
this one instruction. However, with the STI ahead of it,
interrupts which became pending _before_ the CR4 write would
have been serviced already. IOW by moving the STI ahead
we're splitting an interrupts-delayed window into two.

>>>> --- a/xen/arch/x86/x86_64/entry.S
>>>> +++ b/xen/arch/x86/x86_64/entry.S
>>>> @@ -46,7 +47,6 @@ restore_all_guest:
>>>>          movabs $DIRECTMAP_VIRT_START, %rcx
>>>>          mov   %rdi, %rax
>>>>          and   %rsi, %rdi
>>>> -        jz    .Lrag_keep_cr3
>>> This looks like a functional change?
>> Definitely not - the conditional branch simply becomes unnecessary
>> when the entire piece of code gets NOP-ed out.
> 
> Hmm.  That is not at all obvious.  What about cases where we'd want
> to conditionally disable xpti on a per-domain basis?

It would be at that time that the branch would need to be
re-introduced. But I hope we'll never make it there, and have a
better solution in place before we introduce a per-domain control
for this.

>>>> @@ -492,9 +519,20 @@ ENTRY(common_interrupt)
>>>>          CR4_PV32_RESTORE
>>>>          movq %rsp,%rdi
>>>>          callq do_IRQ
>>>> +.Lintr_cr3_restore:
>>>>          mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
>>>> +.Lintr_cr3_end:
>>>>          jmp ret_from_intr
>>>>  
>>>> +        .pushsection .altinstructions, "a", @progbits
>>>> +        altinstruction_entry .Lintr_cr3_restore, .Lintr_cr3_restore, \
>>>> +                             X86_FEATURE_NO_XPTI, \
>>>> +                             (.Lintr_cr3_end - .Lintr_cr3_restore), 0
>>>> +        altinstruction_entry .Lintr_cr3_start, .Lintr_cr3_start, \
>>>> +                             X86_FEATURE_NO_XPTI, \
>>>> +                             (.Lintr_cr3_okay - .Lintr_cr3_start), 0
>>> This is now getting very complicated to follow.  Is it just for IST
>>> safety and liable to disappear?  If not, I think we need a different
>>> way, as this is now saying "sporadic instructions inside this block, but
>>> not all of them, turn into nops".
>> This is not an IST path, and it is also not NOP-ing out sporadic
>> instructions - we can't drop the first piece of code without also
>> dropping the second, as %r14 won't be set up if the first block
>> is gone. They're clearly framed by .Lintr_cr3_* labels - I'm not
>> sure how to make even more obvious what's going on.
> 
> I know you're not a fan of my SPEC_CTRL macros, but they do make it very
> clear what is going on in each configuration.
> 
> They are certainly clearer than this.

A matter of personal preference, I'm afraid.

Jan



* Re: [PATCH 4/4] x86: avoid double CR3 reload when switching to guest user mode
  2018-02-05 17:37       ` Andrew Cooper
@ 2018-02-06 10:01         ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2018-02-06 10:01 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 05.02.18 at 18:37, <andrew.cooper3@citrix.com> wrote:
> On 31/01/18 10:12, Jan Beulich wrote:
>>
>>>> --- a/xen/arch/x86/pv/domain.c
>>>> +++ b/xen/arch/x86/pv/domain.c
>>>> @@ -220,10 +220,20 @@ int pv_domain_initialise(struct domain *
>>>>      return rc;
>>>>  }
>>>>  
>>>> -static void _toggle_guest_pt(struct vcpu *v)
>>>> +static void _toggle_guest_pt(struct vcpu *v, bool force_cr3)
>>>>  {
>>>>      v->arch.flags ^= TF_kernel_mode;
>>>>      update_cr3(v);
>>>> +
>>>> +    /*
>>>> +     * There's no need to load CR3 here when it is going to be loaded on 
> the
>>>> +     * way out to guest mode again anyway, and when the page tables we're
>>>> +     * currently on are the kernel ones (whereas when switching to kernel
>>>> +     * mode we need to be able to write a bounce frame onto the kernel 
> stack).
>>>> +     */
>>>> +    if ( !force_cr3 && !(v->arch.flags & TF_kernel_mode) )
>>>> +        return;
>>>> +
>>>>      /* Don't flush user global mappings from the TLB. Don't tick TLB clock. 
> */
>>>>      asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>>>>  
>>>> @@ -253,13 +263,13 @@ void toggle_guest_mode(struct vcpu *v)
>>>>      }
>>>>      asm volatile ( "swapgs" );
>>>>  
>>>> -    _toggle_guest_pt(v);
>>>> +    _toggle_guest_pt(v, cpu_has_no_xpti);
>>>>  }
>>>>  
>>>>  void toggle_guest_pt(struct vcpu *v)
>>>>  {
>>>>      if ( !is_pv_32bit_vcpu(v) )
>>>> -        _toggle_guest_pt(v);
>>>> +        _toggle_guest_pt(v, true);
>>> This can be converted as well.  The only callers are the LDT-fault and
>>> I/O perm check, both when we are currently on user pagetables, needing
>>> to switch to kernel briefly, then back to user.
>> Converted to what?
> 
> _toggle_guest_pt(v, cpu_has_no_xpti);

I don't understand: We can't avoid the page table switch when
!cpu_has_no_xpti.

>> Those code paths certainly need CR3 to be
>> written, or else the actual memory accesses will fail.
> 
> These are called in pairs, the first of which switches from the user to
> the kernel pagetables, which fails your TF_kernel_mode fastpath
> (because of the xor at the top),

That TF_kernel_mode check happens only when force_cr3 is false.

> and the second call to move from the
> kernel back to user, which can in principle use the same fastpath.

Just like for the first of the paired calls, the second one also
mustn't skip the CR3 write. Just to summarize: When we come
here for one of the two cases where the paired calls are used,
we're running on user mode page tables but want to access a
kernel mode data structure. Hence we need to switch to the
kernel page tables (including the CR3 load), do the access, and
switch back to the user mode page tables.
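
As an illustration of that pattern (the callsite shape and the
access_kernel_structure()/do_the_access() names are hypothetical, not
quoted from the tree):

/* Both known callers (the LDT fault and the I/O permission check) look
 * roughly like this while the guest is running on its user page tables. */
static bool access_kernel_structure(struct vcpu *curr)
{
    bool ok;

    toggle_guest_pt(curr);  /* user -> kernel page tables: CR3 must be loaded */

    ok = do_the_access();   /* hypothetical: read LDT descriptor / IO bitmap */

    toggle_guest_pt(curr);  /* kernel -> user page tables: CR3 loaded again */

    return ok;
}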

Jan

>>> However, it would be better to drop the parameter and feed
>>> cpu_has_no_xpti into the conditional above which explains why it is safe
>>> to do.
>> As a result, I also don't understand this part.
> 
> The force_cr3 parameter is unnecessary, at which point it would be far
> clearer to explicitly check against cpu_has_no_xpti for the fastpath.
> 
> ~Andrew





