* [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements
@ 2023-01-25 15:24 Jan Beulich
  2023-01-25 15:25 ` [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 17+ messages in thread
From: Jan Beulich @ 2023-01-25 15:24 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Versions of the two final patches were submitted standalone earlier
on. The series here tries to carry out a suggestion from Andrew,
which the two of us have been discussing. Then said previously posted
patches are re-based on top, utilizing the new functionality.

1: spec-ctrl: add logic to issue IBPB on exit to guest
2: spec-ctrl: defer context-switch IBPB until guest entry
3: limit issuing of IBPB during context switch
4: PV: issue branch prediction barrier when switching 64-bit guest to kernel mode

Jan



* [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
@ 2023-01-25 15:25 ` Jan Beulich
  2023-01-25 21:10   ` Andrew Cooper
  2023-01-25 15:26 ` [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2023-01-25 15:25 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, Kevin Tian, Jun Nakajima

In order to be able to defer the context switch IBPB to the last
possible point, add logic to the exit-to-guest paths to issue the
barrier there, including the "IBPB doesn't flush the RSB/RAS"
workaround. Since alternatives, for now at least, can't nest, emit JMP
to skip past both constructs where both are needed. This may be more
efficient anyway, as the sequence of NOPs is pretty long.

LFENCEs are omitted - for HVM a VM entry is imminent, an event which we
already deem sufficiently serializing elsewhere. For 32-bit PV
we're going through IRET, which ought to be good enough as well. While
64-bit PV may use SYSRET, there are several more conditional branches
there which are all unprotected.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I have to admit that I'm not really certain about the placement of the
IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
entry".

Since we're going to run out of SCF_* bits soon and since the new flag
is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
to widen that field to 16 bits right away and then use bit 8 (or higher)
for the purpose here.
---
v3: New.

--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -75,6 +75,12 @@ __UNLIKELY_END(nsvm_hap)
         .endm
         ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM
 
+        ALTERNATIVE "jmp 2f", __stringify(DO_SPEC_CTRL_EXIT_IBPB disp=(2f-1f)), \
+                    X86_FEATURE_IBPB_EXIT_HVM
+1:
+        ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET
+2:
+
         pop  %r15
         pop  %r14
         pop  %r13
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -86,7 +86,8 @@ UNLIKELY_END(realmode)
         jz .Lvmx_vmentry_restart
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob:    */
+        /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob: acd */
+        ALTERNATIVE "", DO_SPEC_CTRL_EXIT_IBPB, X86_FEATURE_IBPB_EXIT_HVM
         DO_SPEC_CTRL_COND_VERW
 
         mov  VCPU_hvm_guest_cr2(%rbx),%rax
--- a/xen/arch/x86/include/asm/cpufeatures.h
+++ b/xen/arch/x86/include/asm/cpufeatures.h
@@ -39,8 +39,10 @@ XEN_CPUFEATURE(XEN_LBR,           X86_SY
 XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
 XEN_CPUFEATURE(XEN_SHSTK,         X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
 XEN_CPUFEATURE(XEN_IBT,           X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
-XEN_CPUFEATURE(IBPB_ENTRY_PV,     X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */
-XEN_CPUFEATURE(IBPB_ENTRY_HVM,    X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
+XEN_CPUFEATURE(IBPB_ENTRY_PV,     X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen when entered from PV */
+XEN_CPUFEATURE(IBPB_ENTRY_HVM,    X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen when entered from HVM */
+XEN_CPUFEATURE(IBPB_EXIT_PV,      X86_SYNTH(30)) /* MSR_PRED_CMD used by Xen when exiting to PV */
+XEN_CPUFEATURE(IBPB_EXIT_HVM,     X86_SYNTH(31)) /* MSR_PRED_CMD used by Xen when exiting to HVM */
 
 /* Bug words follow the synthetic words. */
 #define X86_NR_BUG 1
--- a/xen/arch/x86/include/asm/current.h
+++ b/xen/arch/x86/include/asm/current.h
@@ -55,9 +55,13 @@ struct cpu_info {
 
     /* See asm/spec_ctrl_asm.h for usage. */
     unsigned int shadow_spec_ctrl;
+    /*
+     * spec_ctrl_flags can be accessed as a 32-bit entity and hence needs
+     * placing suitably.
+     */
+    uint8_t      spec_ctrl_flags;
     uint8_t      xen_spec_ctrl;
     uint8_t      last_spec_ctrl;
-    uint8_t      spec_ctrl_flags;
 
     /*
      * The following field controls copying of the L4 page table of 64-bit
--- a/xen/arch/x86/include/asm/spec_ctrl.h
+++ b/xen/arch/x86/include/asm/spec_ctrl.h
@@ -36,6 +36,8 @@
 #define SCF_verw       (1 << 3)
 #define SCF_ist_ibpb   (1 << 4)
 #define SCF_entry_ibpb (1 << 5)
+#define SCF_exit_ibpb_bit 6
+#define SCF_exit_ibpb  (1 << SCF_exit_ibpb_bit)
 
 /*
  * The IST paths (NMI/#MC) can interrupt any arbitrary context.  Some
--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h
+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h
@@ -117,6 +117,27 @@
 .L\@_done:
 .endm
 
+.macro DO_SPEC_CTRL_EXIT_IBPB disp=0
+/*
+ * Requires %rsp=regs
+ * Clobbers %rax, %rcx, %rdx
+ *
+ * Conditionally issue IBPB if SCF_exit_ibpb is active.  The macro invocation
+ * may be followed by X86_BUG_IBPB_NO_RET workaround code.  The "disp" argument
+ * is to allow invocation sites to pass in the extra amount of code which needs
+ * skipping in case no action is necessary.
+ *
+ * The flag is a "one-shot" indicator, so it is being cleared at the same time.
+ */
+    btrl    $SCF_exit_ibpb_bit, CPUINFO_spec_ctrl_flags(%rsp)
+    jnc     .L\@_skip + (\disp)
+    mov     $MSR_PRED_CMD, %ecx
+    mov     $PRED_CMD_IBPB, %eax
+    xor     %edx, %edx
+    wrmsr
+.L\@_skip:
+.endm
+
 .macro DO_OVERWRITE_RSB tmp=rax
 /*
  * Requires nothing
@@ -272,6 +293,14 @@
 #define SPEC_CTRL_EXIT_TO_PV                                            \
     ALTERNATIVE "",                                                     \
         DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
+    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
+        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
+                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
+                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
+        X86_FEATURE_IBPB_EXIT_PV;                                       \
+PASTE(.Lscexitpv_rsb, __LINE__):                                        \
+    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
+PASTE(.Lscexitpv_done, __LINE__):                                       \
     DO_SPEC_CTRL_COND_VERW
 
 /*
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -8,6 +8,7 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <xen/lib.h>
 #include <public/xen.h>
 #include <irq_vectors.h>
 
@@ -156,7 +157,7 @@ ENTRY(compat_restore_all_guest)
         mov VCPUMSR_spec_ctrl_raw(%rax), %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: acd */
 
         RESTORE_ALL adj=8 compat=1
 .Lft0:  iretq
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -9,6 +9,7 @@
 #include <asm/asm_defns.h>
 #include <asm/page.h>
 #include <asm/processor.h>
+#include <xen/lib.h>
 #include <public/xen.h>
 #include <irq_vectors.h>
 
@@ -187,7 +188,7 @@ restore_all_guest:
         mov   %r15d, %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: acd */
 
         RESTORE_ALL
         testw $TRAP_syscall,4(%rsp)




* [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
  2023-01-25 15:25 ` [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
@ 2023-01-25 15:26 ` Jan Beulich
  2023-01-26 20:43   ` Andrew Cooper
  2023-01-25 15:26 ` [PATCH v3 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2023-01-25 15:26 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

In order to avoid clobbering Xen's own predictions, defer the barrier as
much as possible. Merely mark the CPU as needing a barrier issued the
next time we're exiting to guest context.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I couldn't find any sensible (central/unique) place to move the
comment which is being deleted alongside spec_ctrl_new_guest_context().
---
v3: New.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2038,7 +2038,7 @@ void context_switch(struct vcpu *prev, s
              */
             if ( *last_id != next_id )
             {
-                spec_ctrl_new_guest_context();
+                info->spec_ctrl_flags |= SCF_exit_ibpb;
                 *last_id = next_id;
             }
         }
--- a/xen/arch/x86/include/asm/spec_ctrl.h
+++ b/xen/arch/x86/include/asm/spec_ctrl.h
@@ -67,28 +67,6 @@
 void init_speculation_mitigations(void);
 void spec_ctrl_init_domain(struct domain *d);
 
-/*
- * Switch to a new guest prediction context.
- *
- * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code
- * which has previously run on this CPU can't attack subsequent guest code.
- *
- * As this flushes the RSB/RAS, it destroys the predictions of the calling
- * context.  For best performace, arrange for this to be used when we're going
- * to jump out of the current context, e.g. with reset_stack_and_jump().
- *
- * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS
- * manually.
- */
-static always_inline void spec_ctrl_new_guest_context(void)
-{
-    wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
-
-    /* (ab)use alternative_input() to specify clobbers. */
-    alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET,
-                      : "rax", "rcx");
-}
-
 extern int8_t opt_ibpb_ctxt_switch;
 extern bool opt_ssbd;
 extern int8_t opt_eager_fpu;
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -854,6 +854,11 @@ static void __init ibpb_calculations(voi
      */
     if ( opt_ibpb_ctxt_switch == -1 )
         opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv);
+    if ( opt_ibpb_ctxt_switch )
+    {
+        setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_PV);
+        setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_HVM);
+    }
 }
 
 /* Calculate whether this CPU is vulnerable to L1TF. */




* [PATCH v3 3/4] x86: limit issuing of IBPB during context switch
  2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
  2023-01-25 15:25 ` [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
  2023-01-25 15:26 ` [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
@ 2023-01-25 15:26 ` Jan Beulich
  2023-01-26 20:49   ` Andrew Cooper
  2023-01-25 15:27 ` [PATCH v3 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode Jan Beulich
  2023-01-25 17:49 ` [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Andrew Cooper
  4 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2023-01-25 15:26 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

When the outgoing vCPU had IBPB issued upon entering Xen there's no
need for a 2nd barrier during context switch.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Fold into series.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2015,7 +2015,8 @@ void context_switch(struct vcpu *prev, s
 
         ctxt_switch_levelling(next);
 
-        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
+        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
+             !(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) )
         {
             static DEFINE_PER_CPU(unsigned int, last);
             unsigned int *last_id = &this_cpu(last);




* [PATCH v3 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
                   ` (2 preceding siblings ...)
  2023-01-25 15:26 ` [PATCH v3 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
@ 2023-01-25 15:27 ` Jan Beulich
  2023-01-25 17:49 ` [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Andrew Cooper
  4 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2023-01-25 15:27 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Since both kernel and user mode run in ring 3, they run in the same
"predictor mode". While the kernel could take care of this itself, doing
so would be yet another item distinguishing PV from native. Additionally
we're in a much better position to issue the barrier command, and we can
save a #GP (for privileged instruction emulation) this way.

To allow performance to be recovered, introduce a new VM assist allowing the
guest kernel to suppress this barrier. Make availability of the assist
dependent upon the command line control, such that kernels have a way to
know whether their request actually took any effect.

Note that because of its use in PV64_VM_ASSIST_MASK, the declaration of
opt_ibpb_mode_switch can't live in asm/spec_ctrl.h.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Is the placement of the clearing of opt_ibpb_ctxt_switch correct in
parse_spec_ctrl()? Shouldn't it live ahead of the "disable_common"
label, as being about guest protection, not Xen's?

Adding setting of the variable to the "pv" sub-case in parse_spec_ctrl()
didn't seem quite right to me, considering that we default it to the
opposite of opt_ibpb_entry_pv.
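
Purely for illustration (no Linux patch exists yet, and the function name is
made up), a 64-bit PV Linux guest could request the assist roughly like the
sketch below, assuming the existing HYPERVISOR_vm_assist() wrapper and
VMASST_CMD_enable, with VMASST_TYPE_mode_switch_no_ibpb being the value this
patch adds to the public headers:

#include <linux/printk.h>
#include <xen/interface/xen.h>
#include <asm/xen/hypercall.h>

static void xen_suppress_mode_switch_ibpb(void)
{
    /*
     * Fails (non-zero) when Xen doesn't offer the assist, e.g. because
     * "ibpb-mode-switch" was disabled on the Xen command line.
     */
    if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
                             VMASST_TYPE_mode_switch_no_ibpb))
        pr_info("Xen: mode-switch IBPB suppression unavailable\n");
}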
---
v3: Leverage exit-IBPB. Introduce separate command line control.
v2: Leverage entry-IBPB. Add VM assist. Re-base.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2315,8 +2315,8 @@ By default SSBD will be mitigated at run
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
 >              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
->              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
->              eager-fpu,l1d-flush,branch-harden,srb-lock,
+>              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ibpb-mode-switch,
+>              ssbd,psfd,eager-fpu,l1d-flush,branch-harden,srb-lock,
 >              unpriv-mmio}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
@@ -2398,7 +2398,10 @@ default.
 
 On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
 option can be used to force (the default) or prevent Xen from issuing branch
-prediction barriers on vcpu context switches.
+prediction barriers on vcpu context switches.  On such hardware the
+`ibpb-mode-switch` option can be used to control whether, by default, Xen
+would issue branch prediction barriers when 64-bit PV guests switch from
+user to kernel mode.  If enabled, guest kernels can opt out of this behavior.
 
 On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
 from using fully eager FPU context switches.  This is currently implemented as
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -742,6 +742,8 @@ static inline void pv_inject_sw_interrup
     pv_inject_event(&event);
 }
 
+extern int8_t opt_ibpb_mode_switch;
+
 #define PV32_VM_ASSIST_MASK ((1UL << VMASST_TYPE_4gb_segments)        | \
                              (1UL << VMASST_TYPE_4gb_segments_notify) | \
                              (1UL << VMASST_TYPE_writable_pagetables) | \
@@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
  * but we can't make such requests fail all of the sudden.
  */
 #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
-                             (1UL << VMASST_TYPE_m2p_strict))
+                             (1UL << VMASST_TYPE_m2p_strict)          | \
+                             ((opt_ibpb_mode_switch + 0UL) <<           \
+                              VMASST_TYPE_mode_switch_no_ibpb))
 #define HVM_VM_ASSIST_MASK  (1UL << VMASST_TYPE_runstate_update_flag)
 
 #define arch_vm_assist_valid_mask(d) \
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
 void toggle_guest_mode(struct vcpu *v)
 {
     const struct domain *d = v->domain;
+    struct cpu_info *cpu_info = get_cpu_info();
     unsigned long gs_base;
 
     ASSERT(!is_pv_32bit_vcpu(v));
@@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
     if ( v->arch.flags & TF_kernel_mode )
         v->arch.pv.gs_base_kernel = gs_base;
     else
+    {
         v->arch.pv.gs_base_user = gs_base;
+
+        if ( opt_ibpb_mode_switch &&
+             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
+             !VM_ASSIST(d, mode_switch_no_ibpb) )
+            cpu_info->spec_ctrl_flags |= SCF_exit_ibpb;
+    }
+
     asm volatile ( "swapgs" );
 
     _toggle_guest_pt(v);
 
     if ( d->arch.pv.xpti )
     {
-        struct cpu_info *cpu_info = get_cpu_info();
-
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
                            (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -60,6 +60,7 @@ bool __ro_after_init opt_ssbd;
 int8_t __initdata opt_psfd = -1;
 
 int8_t __ro_after_init opt_ibpb_ctxt_switch = -1;
+int8_t __ro_after_init opt_ibpb_mode_switch = -1;
 int8_t __read_mostly opt_eager_fpu = -1;
 int8_t __read_mostly opt_l1d_flush = -1;
 static bool __initdata opt_branch_harden = true;
@@ -111,6 +112,8 @@ static int __init cf_check parse_spec_ct
             if ( opt_pv_l1tf_domu < 0 )
                 opt_pv_l1tf_domu = 0;
 
+            opt_ibpb_mode_switch = 0;
+
             if ( opt_tsx == -1 )
                 opt_tsx = -3;
 
@@ -271,6 +274,8 @@ static int __init cf_check parse_spec_ct
         /* Misc settings. */
         else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
             opt_ibpb_ctxt_switch = val;
+        else if ( (val = parse_boolean("ibpb-mode-switch", s, ss)) >= 0 )
+            opt_ibpb_mode_switch = val;
         else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
             opt_eager_fpu = val;
         else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
@@ -527,7 +532,7 @@ static void __init print_details(enum in
 
 #endif
 #ifdef CONFIG_PV
-    printk("  Support for PV VMs:%s%s%s%s%s%s\n",
+    printk("  Support for PV VMs:%s%s%s%s%s%s%s\n",
            (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
             boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
@@ -536,7 +541,8 @@ static void __init print_details(enum in
            boot_cpu_has(X86_FEATURE_SC_RSB_PV)       ? " RSB"           : "",
            opt_eager_fpu                             ? " EAGER_FPU"     : "",
            opt_md_clear_pv                           ? " MD_CLEAR"      : "",
-           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "");
+           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "",
+           opt_ibpb_mode_switch                      ? " IBPB-mode-switch" : "");
 
     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
            opt_xpti_hwdom ? "enabled" : "disabled",
@@ -804,7 +810,8 @@ static void __init ibpb_calculations(voi
     /* Check we have hardware IBPB support before using it... */
     if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
     {
-        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0;
+        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = 0;
+        opt_ibpb_mode_switch = opt_ibpb_ctxt_switch = 0;
         opt_ibpb_entry_dom0 = false;
         return;
     }
@@ -859,6 +866,18 @@ static void __init ibpb_calculations(voi
         setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_PV);
         setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_HVM);
     }
+
+#ifdef CONFIG_PV
+    /*
+     * If we're using IBPB-on-entry to protect against PV guests, then
+     * there's no need to also issue IBPB on a guest user->kernel switch.
+     */
+    if ( opt_ibpb_mode_switch == -1 )
+        opt_ibpb_mode_switch = !opt_ibpb_entry_pv ||
+                               (!opt_ibpb_entry_dom0 && !opt_dom0_pvh);
+    if ( opt_ibpb_mode_switch )
+        setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_PV);
+#endif
 }
 
 /* Calculate whether this CPU is vulnerable to L1TF. */
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -554,6 +554,16 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  */
 #define VMASST_TYPE_m2p_strict           32
 
+/*
+ * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
+ *
+ * By default, as a safety measure on affected and capable hardware, Xen
+ * issues a barrier on user->kernel mode switches of PV guests, to cover
+ * for the fact that guest-kernel and guest-user modes both run in ring 3
+ * (and hence share prediction context).
+ */
+#define VMASST_TYPE_mode_switch_no_ibpb  33
+
 #if __XEN_INTERFACE_VERSION__ < 0x00040600
 #define MAX_VMASST_TYPE                  3
 #endif




* Re: [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements
  2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
                   ` (3 preceding siblings ...)
  2023-01-25 15:27 ` [PATCH v3 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode Jan Beulich
@ 2023-01-25 17:49 ` Andrew Cooper
  2023-01-26  7:32   ` Jan Beulich
  4 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-25 17:49 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

On 25/01/2023 3:24 pm, Jan Beulich wrote:
> Versions of the two final patches were submitted standalone earlier
> on. The series here tries to carry out a suggestion from Andrew,
> which the two of us have been discussing. Then said previously posted
> patches are re-based on top, utilizing the new functionality.
>
> 1: spec-ctrl: add logic to issue IBPB on exit to guest
> 2: spec-ctrl: defer context-switch IBPB until guest entry
> 3: limit issuing of IBPB during context switch
> 4: PV: issue branch prediction barrier when switching 64-bit guest to kernel mode

In the subject, you mean IBPB.  I think all the individual patches are fine.

Do you have an implementation of VMASST_TYPE_mode_switch_no_ibpb for
Linux yet?  The thing I'd like to avoid is that we commit this perf hit
to Xen, without lining Linux up to be able to skip it.

~Andrew



* Re: [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-01-25 15:25 ` [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
@ 2023-01-25 21:10   ` Andrew Cooper
  2023-01-26  8:02     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-25 21:10 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, Kevin Tian, Jun Nakajima

On 25/01/2023 3:25 pm, Jan Beulich wrote:
> In order to be able to defer the context switch IBPB to the last
> possible point, add logic to the exit-to-guest paths to issue the
> barrier there, including the "IBPB doesn't flush the RSB/RAS"
> workaround. Since alternatives, for now at least, can't nest, emit JMP
> to skip past both constructs where both are needed. This may be more
> efficient anyway, as the sequence of NOPs is pretty long.

It is very uarch specific as to when a jump is less overhead than a line
of nops.

In all CPUs liable to be running Xen, even unconditional jumps take up
branch prediction resource, because all branch prediction is pre-decode
these days, so branch locations/types/destinations all need deriving
from %rip and "history" alone.

So whether a branch or a line of nops is better is a tradeoff between
how much competition there is for branch prediction resource, and how
efficiently the CPU can brute-force its way through a long line of nops.

But a very interesting datapoint.  It turns out that AMD Zen4 CPUs
macrofuse adjacent nops, including longnops, because it reduces the
amount of execute/retire resources required.  And a lot of
kernel/hypervisor fastpaths have a lot of nops these days.


For us, the "can't nest" is singularly more important than any worry
about uarch behaviour.  We've frankly got much lower hanging fruit than
worrying about one branch vs a few nops.

> LFENCEs are omitted - for HVM a VM entry is imminent, an event which we
> already deem sufficiently serializing elsewhere. For 32-bit PV
> we're going through IRET, which ought to be good enough as well. While
> 64-bit PV may use SYSRET, there are several more conditional branches
> there which are all unprotected.

Privilege changes are serialising-ish, and this behaviour has been
guaranteed moving forwards, although not documented coherently.

CPL (well - privilege, which includes SMM, root/non-root, etc) is not
written speculatively.  So any logic which needs to modify privilege has
to block until it is known to be an architectural execution path.

This gets us "lfence-like" or "dispatch serialising" behaviour, which is
also the reason why INT3 is our go-to speculation halting instruction. 
Microcode has to be entirely certain we are going to deliver an
interrupt/exception/etc before it can start reading the IDT/etc.

Either way, we've been promised that all instructions like IRET,
SYS{CALL,RET,ENTER,EXIT}, VM{RUN,LAUNCH,RESUME} (and ERET{U,S} in the
future FRED world) do, and shall continue to not execute speculatively.

Which in practice means we don't need to worry about Spectre-v1 attack
against codepaths which hit an exit-from-xen path, in terms of skipping
protections.

We do need to be careful about memory accesses and potential double
dereferences, but all the data is on the top of the stack for XPTI
reasons.  About the only concern is v->arch.msrs->* in the HVM path, and
we're fine with the current layout (AFAICT).

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I have to admit that I'm not really certain about the placement of the
> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
> entry".

It really doesn't matter.  They're independent operations that both need
doing, and are fully serialising so can't parallelise.

But on this note, WRMSRNS and WRMSRLIST are on the horizon.  The CPUs
which implement these instructions are the ones which also ought not to
need any adjustments in the exit paths.  So I think it is specifically
not worth trying to make any effort to turn *these* WRMSR's into more
optimised forms.

But WRMSRLIST was designed specifically for this kind of usecase
(actually, more for the main context switch path) where you can prepare
the list of MSRs in memory, including the ability to conditionally skip
certain entries by adjusting the index field.


It occurs to me, having written this out, that what we actually want
to do is have slightly custom not-quite-alternative blocks.  We have a
sequence of independent code blocks, and a small block at the end that
happens to contain an IRET.

We could remove the nops at boot time if we treated it as one large
region, with the IRET at the end also able to have a variable position,
and assembled the "active" blocks tightly from the start.  Complications
would include adjusting the IRET extable entry, but this isn't
insurmountable.  Entrypoints are a bit more tricky but could be done by
packing from the back forward, and adjusting the entry position.

Either way, something to ponder.  (It's also possible that it doesn't
make a measurable difference until we get to FRED, at which point we
have a set of fresh entry-points to write anyway, and a slight glimmer
of hope of not needing to pollute them with speculation workarounds...)

> Since we're going to run out of SCF_* bits soon and since the new flag
> is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
> to widen that field to 16 bits right away and then use bit 8 (or higher)
> for the purpose here.

I really don't think it matters.  We've got plenty of room, and the
flexibility to shuffle, in both structures.  It's absolutely not worth
trying to introduce asymmetries to save 1 bit.

> --- a/xen/arch/x86/include/asm/current.h
> +++ b/xen/arch/x86/include/asm/current.h
> @@ -55,9 +55,13 @@ struct cpu_info {
>  
>      /* See asm/spec_ctrl_asm.h for usage. */
>      unsigned int shadow_spec_ctrl;
> +    /*
> +     * spec_ctrl_flags can be accessed as a 32-bit entity and hence needs
> +     * placing suitably.

I'd suggest "is accessed as a 32-bit entity, and wants aligning suitably" ?

If I've followed the logic correctly.  (I can't say I was specifically
aware that the bit test instructions didn't have byte forms, but I
suspect such instruction forms would be very very niche.)

> +     */
> +    uint8_t      spec_ctrl_flags;
>      uint8_t      xen_spec_ctrl;
>      uint8_t      last_spec_ctrl;
> -    uint8_t      spec_ctrl_flags;
>  
>      /*
>       * The following field controls copying of the L4 page table of 64-bit
> --- a/xen/arch/x86/include/asm/spec_ctrl.h
> +++ b/xen/arch/x86/include/asm/spec_ctrl.h
> @@ -36,6 +36,8 @@
>  #define SCF_verw       (1 << 3)
>  #define SCF_ist_ibpb   (1 << 4)
>  #define SCF_entry_ibpb (1 << 5)
> +#define SCF_exit_ibpb_bit 6
> +#define SCF_exit_ibpb  (1 << SCF_exit_ibpb_bit)

One option to avoid the second define is to use ILOG2() with btrl.

Of all the common forms of doing this, its the only one I'm aware of
which avoids needing the second define.

>  
>  /*
>   * The IST paths (NMI/#MC) can interrupt any arbitrary context.  Some
> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h
> +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h
> @@ -117,6 +117,27 @@
>  .L\@_done:
>  .endm
>  
> +.macro DO_SPEC_CTRL_EXIT_IBPB disp=0
> +/*
> + * Requires %rsp=regs
> + * Clobbers %rax, %rcx, %rdx
> + *
> + * Conditionally issue IBPB if SCF_exit_ibpb is active.  The macro invocation
> + * may be followed by X86_BUG_IBPB_NO_RET workaround code.  The "disp" argument
> + * is to allow invocation sites to pass in the extra amount of code which needs
> + * skipping in case no action is necessary.
> + *
> + * The flag is a "one-shot" indicator, so it is being cleared at the same time.
> + */
> +    btrl    $SCF_exit_ibpb_bit, CPUINFO_spec_ctrl_flags(%rsp)
> +    jnc     .L\@_skip + (\disp)
> +    mov     $MSR_PRED_CMD, %ecx
> +    mov     $PRED_CMD_IBPB, %eax
> +    xor     %edx, %edx
> +    wrmsr
> +.L\@_skip:
> +.endm
> +
>  .macro DO_OVERWRITE_RSB tmp=rax
>  /*
>   * Requires nothing
> @@ -272,6 +293,14 @@
>  #define SPEC_CTRL_EXIT_TO_PV                                            \
>      ALTERNATIVE "",                                                     \
>          DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
> +    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
> +        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
> +                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
> +                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
> +        X86_FEATURE_IBPB_EXIT_PV;                                       \
> +PASTE(.Lscexitpv_rsb, __LINE__):                                        \
> +    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
> +PASTE(.Lscexitpv_done, __LINE__):                                       \
>      DO_SPEC_CTRL_COND_VERW

What's wrong with the normal %= trick?  The use of __LINE__ makes this
hard to subsequently livepatch, so I'd prefer to avoid it if possible.

~Andrew



* Re: [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements
  2023-01-25 17:49 ` [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Andrew Cooper
@ 2023-01-26  7:32   ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2023-01-26  7:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, xen-devel

On 25.01.2023 18:49, Andrew Cooper wrote:
> On 25/01/2023 3:24 pm, Jan Beulich wrote:
>> Versions of the two final patches were submitted standalone earlier
>> on. The series here tries to carry out a suggestion from Andrew,
>> which the two of us have been discussing. Then said previously posted
>> patches are re-based on top, utilizing the new functionality.
>>
>> 1: spec-ctrl: add logic to issue IBPB on exit to guest
>> 2: spec-ctrl: defer context-switch IBPB until guest entry
>> 3: limit issuing of IBPB during context switch
>> 4: PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
> 
> In the subject, you mean IBPB.  I think all the individual patches are fine.

Yes, I did notice the typo immediately after sending.

> Do you have an implementation of VMASST_TYPE_mode_switch_no_ibpb for
> Linux yet?  The thing I'd like to avoid is that we commit this perf hit
> to Xen, without lining Linux up to be able to skip it.

No, I don't. I haven't even looked at where invoking this might be best placed.
Also I have to admit that it's not really clear to me what the criteria are
going to be for Linux to disable this, and whether perhaps finer grained
control might be needed (i.e. to turn it on/off dynamically under certain
conditions).

In any event this concern is only related to patch 4; I'd appreciate if at
least the earlier three patches wouldn't be blocked on there being something
on the Linux side. (In fact patch 3 ends up [still] being entirely independent
of the rest of the rework, unlike what I think you were expecting it to be.)

Jan



* Re: [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-01-25 21:10   ` Andrew Cooper
@ 2023-01-26  8:02     ` Jan Beulich
  2023-01-26 20:27       ` Andrew Cooper
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2023-01-26  8:02 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	Kevin Tian, Jun Nakajima, xen-devel

On 25.01.2023 22:10, Andrew Cooper wrote:
> On 25/01/2023 3:25 pm, Jan Beulich wrote:
>> In order to be able to defer the context switch IBPB to the last
>> possible point, add logic to the exit-to-guest paths to issue the
>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>> to skip past both constructs where both are needed. This may be more
>> efficient anyway, as the sequence of NOPs is pretty long.
> 
> It is very uarch specific as to when a jump is less overhead than a line
> of nops.
> 
> In all CPUs liable to be running Xen, even unconditional jumps take up
> branch prediction resource, because all branch prediction is pre-decode
> these days, so branch locations/types/destinations all need deriving
> from %rip and "history" alone.
> 
> So whether a branch or a line of nops is better is a tradeoff between
> how much competition there is for branch prediction resource, and how
> efficiently the CPU can brute-force its way through a long line of nops.
> 
> But a very interesting datapoint.  It turns out that AMD Zen4 CPUs
> macrofuse adjacent nops, including longnops, because it reduces the
> amount of execute/retire resources required.  And a lot of
> kernel/hypervisor fastpaths have a lot of nops these days.
> 
> 
> For us, the "can't nest" is singularly more important than any worry
> about uarch behaviour.  We've frankly got much lower hanging fruit than
> worrying about one branch vs a few nops.
> 
>> LFENCEs are omitted - for HVM a VM entry is imminent, an event which we
>> already deem sufficiently serializing elsewhere. For 32-bit PV
>> we're going through IRET, which ought to be good enough as well. While
>> 64-bit PV may use SYSRET, there are several more conditional branches
>> there which are all unprotected.
> 
> Privilege changes are serialsing-ish, and this behaviour has been
> guaranteed moving forwards, although not documented coherently.
> 
> CPL (well - privilege, which includes SMM, root/non-root, etc) is not
> written speculatively.  So any logic which needs to modify privilege has
> to block until it is known to be an architectural execution path.
> 
> This gets us "lfence-like" or "dispatch serialising" behaviour, which is
> also the reason why INT3 is our go-to speculation halting instruction. 
> Microcode has to be entirely certain we are going to deliver an
> interrupt/exception/etc before it can start reading the IDT/etc.
> 
> Either way, we've been promised that all instructions like IRET,
> SYS{CALL,RET,ENTER,EXIT}, VM{RUN,LAUNCH,RESUME} (and ERET{U,S} in the
> future FRED world) do, and shall continue to not execute speculatively.
> 
> Which in practice means we don't need to worry about Spectre-v1 attack
> against codepaths which hit an exit-from-xen path, in terms of skipping
> protections.
> 
> We do need to be careful about memory accesses and potential double
> dereferences, but all the data is on the top of the stack for XPTI
> reasons.  About the only concern is v->arch.msrs->* in the HVM path, and
> we're fine with the current layout (AFAICT).
> 
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I have to admit that I'm not really certain about the placement of the
>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>> entry".
> 
> It really doesn't matter.  They're independent operations that both need
> doing, and are fully serialising so can't parallelise.
> 
> But on this note, WRMSRNS and WRMSRLIST are on the horizon.  The CPUs
> which implement these instructions are the ones which also ought not to
> need any adjustments in the exit paths.  So I think it is specifically
> not worth trying to make any effort to turn *these* WRMSR's into more
> optimised forms.
> 
> But WRMSRLIST was designed specifically for this kind of usecase
> (actually, more for the main context switch path) where you can prepare
> the list of MSRs in memory, including the ability to conditionally skip
> certain entries by adjusting the index field.
> 
> 
>> It occurs to me, having written this out, that what we actually want
> to do is have slightly custom not-quite-alternative blocks.  We have a
> sequence of independent code blocks, and a small block at the end that
> happens to contain an IRET.
> 
> We could remove the nops at boot time if we treated it as one large
> region, with the IRET at the end also able to have a variable position,
>> and assembled the "active" blocks tightly from the start.  Complications
> would include adjusting the IRET extable entry, but this isn't
> insurmountable.  Entrypoints are a bit more tricky but could be done by
> packing from the back forward, and adjusting the entry position.
> 
> Either way, something to ponder.  (It's also possible that it doesn't
> make a measurable difference until we get to FRED, at which point we
> have a set of fresh entry-points to write anyway, and a slight glimmer
> of hope of not needing to pollute them with speculation workarounds...)
> 
>> Since we're going to run out of SCF_* bits soon and since the new flag
>> is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
>> to widen that field to 16 bits right away and then use bit 8 (or higher)
>> for the purpose here.
> 
> I really don't think it matters.  We've got plenty of room, and the
> flexibility to shuffle, in both structures.  It's absolutely not worth
> trying to introduce asymmetries to save 1 bit.

Thanks for all the comments up to here. Just to clarify - I've not spotted
anything there that would result in me being expected to take any action.

>> --- a/xen/arch/x86/include/asm/current.h
>> +++ b/xen/arch/x86/include/asm/current.h
>> @@ -55,9 +55,13 @@ struct cpu_info {
>>  
>>      /* See asm/spec_ctrl_asm.h for usage. */
>>      unsigned int shadow_spec_ctrl;
>> +    /*
>> +     * spec_ctrl_flags can be accessed as a 32-bit entity and hence needs
>> +     * placing suitably.
> 
> I'd suggest "is accessed as a 32-bit entity, and wants aligning suitably" ?

I've tried to choose the wording carefully: The 32-bit access is in an
alternative block, so doesn't always come into play. Hence the "may",
not "is". Alignment alone also isn't sufficient here (and mis-aligning
isn't merely a performance problem) - the following three bytes also
need to be valid to access in the first place. Hence "needs" and
"placing", not "wants" and "aligning".

> If I've followed the logic correctly.  (I can't say I was specifically
> aware that the bit test instructions didn't have byte forms, but I
> suspect such instruction forms would be very very niche.)

Yes, there not being byte forms of BT* is the sole reason here.

>> --- a/xen/arch/x86/include/asm/spec_ctrl.h
>> +++ b/xen/arch/x86/include/asm/spec_ctrl.h
>> @@ -36,6 +36,8 @@
>>  #define SCF_verw       (1 << 3)
>>  #define SCF_ist_ibpb   (1 << 4)
>>  #define SCF_entry_ibpb (1 << 5)
>> +#define SCF_exit_ibpb_bit 6
>> +#define SCF_exit_ibpb  (1 << SCF_exit_ibpb_bit)
> 
> One option to avoid the second define is to use ILOG2() with btrl.

Specifically not. The assembler doesn't know the conditional operator,
and the pre-processor won't collapse the expression resulting from
expanding ilog2().
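
Just for illustration (a generic sketch, not necessarily Xen's exact ILOG2()
definition), a constant-folding helper built from the conditional operator
looks like this; the C compiler collapses it to a constant, but once the
pre-processor pastes it into a .S file the assembler is left with "?:"
operators it can't evaluate:

#define GENERIC_ILOG2(x)          \
    ((x) & (1u << 7) ? 7 :        \
     (x) & (1u << 6) ? 6 :        \
     (x) & (1u << 5) ? 5 :        \
     (x) & (1u << 4) ? 4 :        \
     (x) & (1u << 3) ? 3 :        \
     (x) & (1u << 2) ? 2 :        \
     (x) & (1u << 1) ? 1 : 0)

/*
 * Fine in C, but
 *     btrl $GENERIC_ILOG2(SCF_exit_ibpb), CPUINFO_spec_ctrl_flags(%rsp)
 * merely substitutes the text above, and gas has no "?:" operator.
 */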

>> @@ -272,6 +293,14 @@
>>  #define SPEC_CTRL_EXIT_TO_PV                                            \
>>      ALTERNATIVE "",                                                     \
>>          DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
>> +    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
>> +        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
>> +                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
>> +                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
>> +        X86_FEATURE_IBPB_EXIT_PV;                                       \
>> +PASTE(.Lscexitpv_rsb, __LINE__):                                        \
>> +    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
>> +PASTE(.Lscexitpv_done, __LINE__):                                       \
>>      DO_SPEC_CTRL_COND_VERW
> 
> What's wrong with the normal %= trick?

We're in a C macro here which is then used in assembly code. %= only
works in asm(), though. If we were in an assembler macro, I'd have
used \@. Yet wrapping the whole thing in an assembler macro would, for
my taste, hide too much information from the use sites (in particular
the X86_{FEATURE,BUG}_* which are imo relevant to be visible there).
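
For reference, a minimal sketch of the %= trick in inline asm (the label name
is made up): gcc/clang replace %= with a number unique to each asm() instance,
which is exactly what a C macro expanded in a .S file doesn't get:

static inline void demo(void)
{
    asm volatile ( "jmp  .Ldemo_skip%=\n\t"
                   "nop\n\t"                /* code which may be skipped */
                   ".Ldemo_skip%=:"
                   ::: "memory" );
}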

>  The use of __LINE__ makes this
> hard to subsequently livepatch, so I'd prefer to avoid it if possible.

Yes, I was certainly aware this would be a concern. I couldn't think of
a (forward looking) clean solution, though: Right now we have only one
use per source file (the native and compat PV entry.S), so we could use
a context-independent label name. But as you say above, for FRED we're
likely to get new entry points, and they're likely better placed in the
same files.

Jan



* Re: [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-01-26  8:02     ` Jan Beulich
@ 2023-01-26 20:27       ` Andrew Cooper
  2023-02-06 13:58         ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-26 20:27 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	Kevin Tian, Jun Nakajima, xen-devel

On 26/01/2023 8:02 am, Jan Beulich wrote:
> On 25.01.2023 22:10, Andrew Cooper wrote:
>> On 25/01/2023 3:25 pm, Jan Beulich wrote:
>>> In order to be able to defer the context switch IBPB to the last
>>> possible point, add logic to the exit-to-guest paths to issue the
>>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>>> to skip past both constructs where both are needed. This may be more
>>> efficient anyway, as the sequence of NOPs is pretty long.
>> It is very uarch specific as to when a jump is less overhead than a line
>> of nops.
>>
>> In all CPUs liable to be running Xen, even unconditional jumps take up
>> branch prediction resource, because all branch prediction is pre-decode
>> these days, so branch locations/types/destinations all need deriving
>> from %rip and "history" alone.
>>
>> So whether a branch or a line of nops is better is a tradeoff between
>> how much competition there is for branch prediction resource, and how
>> efficiently the CPU can brute-force its way through a long line of nops.
>>
>> But a very interesting datapoint.  It turns out that AMD Zen4 CPUs
>> macrofuse adjacent nops, including longnops, because it reduces the
>> amount of execute/retire resources required.  And a lot of
>> kernel/hypervisor fastpaths have a lot of nops these days.
>>
>>
>> For us, the "can't nest" is singularly more important than any worry
>> about uarch behaviour.  We've frankly got much lower hanging fruit than
>> worrying about one branch vs a few nops.
>>
>>> LFENCEs are omitted - for HVM a VM entry is imminent, an event which we
>>> already deem sufficiently serializing elsewhere. For 32-bit PV
>>> we're going through IRET, which ought to be good enough as well. While
>>> 64-bit PV may use SYSRET, there are several more conditional branches
>>> there which are all unprotected.
>> Privilege changes are serialising-ish, and this behaviour has been
>> guaranteed moving forwards, although not documented coherently.
>>
>> CPL (well - privilege, which includes SMM, root/non-root, etc) is not
>> written speculatively.  So any logic which needs to modify privilege has
>> to block until it is known to be an architectural execution path.
>>
>> This gets us "lfence-like" or "dispatch serialising" behaviour, which is
>> also the reason why INT3 is our go-to speculation halting instruction. 
>> Microcode has to be entirely certain we are going to deliver an
>> interrupt/exception/etc before it can start reading the IDT/etc.
>>
>> Either way, we've been promised that all instructions like IRET,
>> SYS{CALL,RET,ENTER,EXIT}, VM{RUN,LAUNCH,RESUME} (and ERET{U,S} in the
>> future FRED world) do, and shall continue to not execute speculatively.
>>
>> Which in practice means we don't need to worry about Spectre-v1 attack
>> against codepaths which hit an exit-from-xen path, in terms of skipping
>> protections.
>>
>> We do need to be careful about memory accesses and potential double
>> dereferences, but all the data is on the top of the stack for XPTI
>> reasons.  About the only concern is v->arch.msrs->* in the HVM path, and
>> we're fine with the current layout (AFAICT).
>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> I have to admit that I'm not really certain about the placement of the
>>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>>> entry".
>> It really doesn't matter.  They're independent operations that both need
>> doing, and are fully serialising so can't parallelise.
>>
>> But on this note, WRMSRNS and WRMSRLIST are on the horizon.  The CPUs
>> which implement these instructions are the ones which also ought not to
>> need any adjustments in the exit paths.  So I think it is specifically
>> not worth trying to make any effort to turn *these* WRMSR's into more
>> optimised forms.
>>
>> But WRMSRLIST was designed specifically for this kind of usecase
>> (actually, more for the main context switch path) where you can prepare
>> the list of MSRs in memory, including the ability to conditionally skip
>> certain entries by adjusting the index field.
>>
>>
>> It occurs to me, having written this out, that what we actually want
>> to do is have slightly custom not-quite-alternative blocks.  We have a
>> sequence of independent code blocks, and a small block at the end that
>> happens to contain an IRET.
>>
>> We could remove the nops at boot time if we treated it as one large
>> region, with the IRET at the end also able to have a variable position,
>> and assembled the "active" blocks tightly from the start.  Complications
>> would include adjusting the IRET extable entry, but this isn't
>> insurmountable.  Entrypoints are a bit more tricky but could be done by
>> packing from the back forward, and adjusting the entry position.
>>
>> Either way, something to ponder.  (It's also possible that it doesn't
>> make a measurable difference until we get to FRED, at which point we
>> have a set of fresh entry-points to write anyway, and a slight glimmer
>> of hope of not needing to pollute them with speculation workarounds...)
>>
>>> Since we're going to run out of SCF_* bits soon and since the new flag
>>> is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
>>> to widen that field to 16 bits right away and then use bit 8 (or higher)
>>> for the purpose here.
>> I really don't think it matters.  We've got plenty of room, and the
>> flexibility to shuffle, in both structures.  It's absolutely not worth
>> trying to introduce asymmetries to save 1 bit.
> Thanks for all the comments up to here. Just to clarify - I've not spotted
> anything there that would result in me being expected to take any action.

I'm tempted to suggest dropping the sentence about "might be more
efficient".  It's not necessary at all IMO, and it's probably not
correct if you were to compare an atom microserver vs a big Xeon.

For the lfences paragraph, I'd suggest rephrasing to something like this:

"As with all other conditional blocks on exit-to-guest paths, no
Spectre-v1 protections are necessary as execution will imminently be
hitting a serialising event."

The commentary on IRET/SYSRET and other jumps is adding confusion to
matter which is known to be safe.

>
>>> --- a/xen/arch/x86/include/asm/current.h
>>> +++ b/xen/arch/x86/include/asm/current.h
>>> @@ -55,9 +55,13 @@ struct cpu_info {
>>>  
>>>      /* See asm/spec_ctrl_asm.h for usage. */
>>>      unsigned int shadow_spec_ctrl;
>>> +    /*
>>> +     * spec_ctrl_flags can be accessed as a 32-bit entity and hence needs
>>> +     * placing suitably.
>> I'd suggest "is accessed as a 32-bit entity, and wants aligning suitably" ?
> I've tried to choose the wording carefully: The 32-bit access is in an
> alternative block, so doesn't always come into play. Hence the "may",
> not "is". Alignment alone also isn't sufficient here (and mis-aligning
> isn't merely a performance problem) - the following three bytes also
> need to be valid to access in the first place. Hence "needs" and
> "placing", not "wants" and "aligning".

"can" is actually a very ambiguous, and one valid interpretation of the
sentence is "spec_ctrl_fags is permitted to be accessed as a 32bit
quantity, but isn't actually accessed in that way".

In this case, we mean "it *is* accessed as a 32bit entity, in certain
cases".

So what about:

"spec_ctrl_flags is accessed as a 32-bit entity in some cases.  Place it
so it can be used as if it were an int."

?

>
>> If I've followed the logic correctly.  (I can't say I was specifically
>> aware that the bit test instructions didn't have byte forms, but I
>> suspect such instruction forms would be very very niche.)
> Yes, there not being byte forms of BT* is the sole reason here.
>
>>> --- a/xen/arch/x86/include/asm/spec_ctrl.h
>>> +++ b/xen/arch/x86/include/asm/spec_ctrl.h
>>> @@ -36,6 +36,8 @@
>>>  #define SCF_verw       (1 << 3)
>>>  #define SCF_ist_ibpb   (1 << 4)
>>>  #define SCF_entry_ibpb (1 << 5)
>>> +#define SCF_exit_ibpb_bit 6
>>> +#define SCF_exit_ibpb  (1 << SCF_exit_ibpb_bit)
>> One option to avoid the second define is to use ILOG2() with btrl.
> Specifically not. The assembler doesn't know the conditional operator,
> and the pre-processor won't collapse the expression resulting from
> expanding ilog2().

Oh that's dull.

I suspect we could construct equivalent logic with an .if/.else chain,
but I have no idea if the order of evaluation would be conducive to
doing so as part of evaluating an immediate operand.  We would
specifically not want something that ended up looking like `ilog2 const
"btrl $" ",%eax"`, even though I could see how to make that work.

It would be nice if we could make something suitable here, but if not we
can live with the second _bit constant.

>
>>> @@ -272,6 +293,14 @@
>>>  #define SPEC_CTRL_EXIT_TO_PV                                            \
>>>      ALTERNATIVE "",                                                     \
>>>          DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
>>> +    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
>>> +        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
>>> +                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
>>> +                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
>>> +        X86_FEATURE_IBPB_EXIT_PV;                                       \
>>> +PASTE(.Lscexitpv_rsb, __LINE__):                                        \
>>> +    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
>>> +PASTE(.Lscexitpv_done, __LINE__):                                       \
>>>      DO_SPEC_CTRL_COND_VERW
>> What's wrong with the normal %= trick?
> We're in a C macro here which is then used in assembly code. %= only
> works in asm(), though. If we were in an assembler macro, I'd have
> used \@. Yet wrapping the whole thing in an assembler macro would, for
> my taste, hide too much information from the use sites (in particular
> the X86_{FEATURE,BUG}_* which are imo relevant to be visible there).
>
>>   The use of __LINE__ makes this
>> hard to subsequently livepatch, so I'd prefer to avoid it if possible.
> Yes, I was certainly aware this would be a concern. I couldn't think of
> a (forward looking) clean solution, though: Right now we have only one
> use per source file (the native and compat PV entry.S), so we could use
> a context-independent label name. But as you say above, for FRED we're
> likely to get new entry points, and they're likely better placed in the
> same files.

I was going to suggest using __COUNTER__ but I've just realised why that
won't work.

But on further consideration, this might be ok for livepatching, so long
as __LINE__ is only ever used with a local label name.  By the time it
has been compiled to a binary, the label name won't survive; only the
resulting displacement will.

I think we probably want to merge this with TEMP_NAME() (perhaps as
UNIQ_NAME(), as it will have to move elsewhere to become common with
this) to avoid proliferating our livepatching reasoning.
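
A minimal sketch of what such a common helper might look like (names purely
illustrative, not an existing interface); the double expansion makes sure
__LINE__ is expanded before pasting:

#define UNIQ_NAME_(stem, n)  stem ## n
#define UNIQ_NAME2_(stem, n) UNIQ_NAME_(stem, n)
#define UNIQ_NAME(stem)      UNIQ_NAME2_(stem, __LINE__)

/* e.g. UNIQ_NAME(.Lscexitpv_done_) at line 293 yields .Lscexitpv_done_293 */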

~Andrew



* Re: [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-01-25 15:26 ` [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
@ 2023-01-26 20:43   ` Andrew Cooper
  2023-02-06 14:24     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-26 20:43 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

On 25/01/2023 3:26 pm, Jan Beulich wrote:
> In order to avoid clobbering Xen's own predictions, defer the barrier as
> much as possible.

I can't actually think of a case where this matters.  We've done a
reasonable amount of work to get rid of indirect branches, and rets were
already immaterial because of the reset_stack_and_jump().

What I'm trying to figure out is whether this ends up hurting us.  If
there was an indirect call we used reliably pre and post context switch,
that changed target based on the guest type, then we'd now retain and
use the bad prediction.
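
To make that concrete, a contrived and entirely made-up sketch of the
kind of call site I mean (not pointing at any real code):

    /* Contrived illustration, all names invented: the same indirect
     * call site executed once for the outgoing vCPU and once for the
     * incoming one, with the target depending on the guest type. */
    struct vcpu;
    struct guest_ops { void (*sync)(struct vcpu *v); };
    struct vcpu { const struct guest_ops *ops; };

    static void sync_state(struct vcpu *v)
    {
        v->ops->sync(v);      /* indirect: PV vs HVM implementation */
    }

    void example_switch(struct vcpu *prev, struct vcpu *next)
    {
        sync_state(prev);     /* trains the BTB on, say, the PV target */
        /* __context_switch() would go here; IBPB now deferred to exit */
        sync_state(next);     /* liable to be predicted with that target */
    }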

>  Merely mark the CPU as needing a barrier issued the
> next time we're exiting to guest context.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I couldn't find any sensible (central/unique) place where to move the
> comment which is being deleted alongside spec_ctrl_new_guest_context().

Given this, I'm wondering whether in patch 1, it might be better to name
the new bit SCF_new_guest_context.  Or new_pred_context context (which
on consideration, I think is better than the current name)?

This would have a knock-on effect on the feature names, which I think is
important because I think you've got a subtle bug in patch 3.

The comment really wants to stay, and it could simply move into
spec_ctrl_asm.h at that point.

~Andrew


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 3/4] x86: limit issuing of IBPB during context switch
  2023-01-25 15:26 ` [PATCH v3 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
@ 2023-01-26 20:49   ` Andrew Cooper
  2023-01-27  7:51     ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-26 20:49 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

On 25/01/2023 3:26 pm, Jan Beulich wrote:
> When the outgoing vCPU had IBPB issued upon entering Xen there's no
> need for a 2nd barrier during context switch.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: Fold into series.
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2015,7 +2015,8 @@ void context_switch(struct vcpu *prev, s
>  
>          ctxt_switch_levelling(next);
>  
> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
> +             !(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) )
>          {
>              static DEFINE_PER_CPU(unsigned int, last);
>              unsigned int *last_id = &this_cpu(last);
>
>

The aforementioned naming change makes the (marginal) security hole here
more obvious.

When we use entry-IBPB to protect Xen, we only care about the branch
types in the BTB.  We don't flush the RSB when using the SMEP optimisation.

Therefore, entry-IBPB is not something which lets us safely skip
exit-new-pred-context.

~Andrew


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 3/4] x86: limit issuing of IBPB during context switch
  2023-01-26 20:49   ` Andrew Cooper
@ 2023-01-27  7:51     ` Jan Beulich
  2023-01-27 17:47       ` Andrew Cooper
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2023-01-27  7:51 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, xen-devel

On 26.01.2023 21:49, Andrew Cooper wrote:
> On 25/01/2023 3:26 pm, Jan Beulich wrote:
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -2015,7 +2015,8 @@ void context_switch(struct vcpu *prev, s
>>  
>>          ctxt_switch_levelling(next);
>>  
>> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
>> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
>> +             !(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) )
>>          {
>>              static DEFINE_PER_CPU(unsigned int, last);
>>              unsigned int *last_id = &this_cpu(last);
>>
>>
> 
> The aforementioned naming change makes the (marginal) security hole here
> more obvious.
> 
> When we use entry-IBPB to protect Xen, we only care about the branch
> types in the BTB.  We don't flush the RSB when using the SMEP optimisation.
> 
> Therefore, entry-IBPB is not something which lets us safely skip
> exit-new-pred-context.

Yet what's to be my takeaway? You may be suggesting to drop the patch,
or you may be suggesting to tighten the condition. (My guess would be
the former.)

Jan


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 3/4] x86: limit issuing of IBPB during context switch
  2023-01-27  7:51     ` Jan Beulich
@ 2023-01-27 17:47       ` Andrew Cooper
  2023-02-06 14:58         ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Cooper @ 2023-01-27 17:47 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, xen-devel

On 27/01/2023 7:51 am, Jan Beulich wrote:
> On 26.01.2023 21:49, Andrew Cooper wrote:
>> On 25/01/2023 3:26 pm, Jan Beulich wrote:
>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -2015,7 +2015,8 @@ void context_switch(struct vcpu *prev, s
>>>  
>>>          ctxt_switch_levelling(next);
>>>  
>>> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
>>> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
>>> +             !(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) )
>>>          {
>>>              static DEFINE_PER_CPU(unsigned int, last);
>>>              unsigned int *last_id = &this_cpu(last);
>>>
>>>
>> The aforementioned naming change makes the (marginal) security hole here
>> more obvious.
>>
>> When we use entry-IBPB to protect Xen, we only care about the branch
>> types in the BTB.  We don't flush the RSB when using the SMEP optimisation.
>>
>> Therefore, entry-IBPB is not something which lets us safely skip
>> exit-new-pred-context.
> Yet what's to be my takeaway? You may be suggesting to drop the patch,
> or you may be suggesting to tighten the condition. (My guess would be
> the former.)

Well - the patch can't be committed in this form.

I haven't figured out if there is a nice way to solve this, so I'm
afraid I don't have a helpful answer to your question.  I think it is
reasonable to wait a bit and see if a solution comes to mind.

I'm not averse in principle to having this optimisation (well - a
working version of it), but honestly, it's marginal right now and it has
to be weighed against whatever extra complexity is required to not open
the security hole.


And FYI, this (general issue) was precisely why I ended up not trying to
do this rearranging in XSA-407/422.  I almost missed this security hole
and acked the patch.

~Andrew


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-01-26 20:27       ` Andrew Cooper
@ 2023-02-06 13:58         ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2023-02-06 13:58 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Andrew Cooper, Wei Liu, Roger Pau Monné,
	Kevin Tian, Jun Nakajima, xen-devel

On 26.01.2023 21:27, Andrew Cooper wrote:
> On 26/01/2023 8:02 am, Jan Beulich wrote:
>> On 25.01.2023 22:10, Andrew Cooper wrote:
>>> On 25/01/2023 3:25 pm, Jan Beulich wrote:
>>>> In order to be able to defer the context switch IBPB to the last
>>>> possible point, add logic to the exit-to-guest paths to issue the
>>>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>>>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>>>> to skip past both constructs where both are needed. This may be more
>>>> efficient anyway, as the sequence of NOPs is pretty long.
>>> It is very uarch specific as to when a jump is less overhead than a line
>>> of nops.
>>>
>>> In all CPUs liable to be running Xen, even unconditional jumps take up
>>> branch prediction resource, because all branch prediction is pre-decode
>>> these days, so branch locations/types/destinations all need deriving
>>> from %rip and "history" alone.
>>>
>>> So whether a branch or a line of nops is better is a tradeoff between
>>> how much competition there is for branch prediction resource, and how
>>> efficiently the CPU can brute-force its way through a long line of nops.
>>>
>>> But a very interesting datapoint.  It turns out that AMD Zen4 CPUs
>>> macrofuse adjacent nops, including longnops, because it reduces the
>>> amount of execute/retire resources required.  And a lot of
>>> kernel/hypervisor fastpaths have a lot of nops these days.
>>>
>>>
>>> For us, the "can't nest" is singularly more important than any worry
>>> about uarch behaviour.  We've frankly got much lower hanging fruit than
>>> worring about one branch vs a few nops.
>>>
>>>> LFENCEs are omitted - for HVM a VM entry is imminent, which already
>>>> elsewhere we deem sufficiently serializing an event. For 32-bit PV
>>>> we're going through IRET, which ought to be good enough as well. While
>>>> 64-bit PV may use SYSRET, there are several more conditional branches
>>>> there which are all unprotected.
>>> Privilege changes are serialising-ish, and this behaviour has been
>>> guaranteed moving forwards, although not documented coherently.
>>>
>>> CPL (well - privilege, which includes SMM, root/non-root, etc) is not
>>> written speculatively.  So any logic which needs to modify privilege has
>>> to block until it is known to be an architectural execution path.
>>>
>>> This gets us "lfence-like" or "dispatch serialising" behaviour, which is
>>> also the reason why INT3 is our go-to speculation halting instruction. 
>>> Microcode has to be entirely certain we are going to deliver an
>>> interrupt/exception/etc before it can start reading the IDT/etc.
>>>
>>> Either way, we've been promised that all instructions like IRET,
>>> SYS{CALL,RET,ENTER,EXIT}, VM{RUN,LAUNCH,RESUME} (and ERET{U,S} in the
>>> future FRED world) do, and shall continue to not execute speculatively.
>>>
>>> Which in practice means we don't need to worry about Spectre-v1 attack
>>> against codepaths which hit an exit-from-xen path, in terms of skipping
>>> protections.
>>>
>>> We do need to be careful about memory accesses and potential double
>>> dereferences, but all the data is on the top of the stack for XPTI
>>> reasons.  About the only concern is v->arch.msrs->* in the HVM path, and
>>> we're fine with the current layout (AFAICT).
>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> I have to admit that I'm not really certain about the placement of the
>>>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>>>> entry".
>>> It really doesn't matter.  They're independent operations that both need
>>> doing, and are fully serialising so can't parallelise.
>>>
>>> But on this note, WRMSRNS and WRMSRLIST are on the horizon.  The CPUs
>>> which implement these instructions are the ones which also ought not to
>>> need any adjustments in the exit paths.  So I think it is specifically
>>> not worth trying to make any effort to turn *these* WRMSR's into more
>>> optimised forms.
>>>
>>> But WRMSRLIST was designed specifically for this kind of usecase
>>> (actually, more for the main context switch path) where you can prepare
>>> the list of MSRs in memory, including the ability to conditionally skip
>>> certain entries by adjusting the index field.
>>>
>>>
>>> It occurs to me, having written this out, is that what we actually want
>>> to do is have slightly custom not-quite-alternative blocks.  We have a
>>> sequence of independent code blocks, and a small block at the end that
>>> happens to contain an IRET.
>>>
>>> We could remove the nops at boot time if we treated it as one large
>>> region, with the IRET at the end also able to have a variable position,
>>> and assembles the "active" blocks tightly from the start.  Complications
>>> would include adjusting the IRET extable entry, but this isn't
>>> insurmountable.  Entrypoints are a bit more tricky but could be done by
>>> packing from the back forward, and adjusting the entry position.
>>>
>>> Either way, something to ponder.  (It's also possible that it doesn't
>>> make a measurable difference until we get to FRED, at which point we
>>> have a set of fresh entry-points to write anyway, and a slight glimmer
>>> of hope of not needing to pollute them with speculation workarounds...)
>>>
>>>> Since we're going to run out of SCF_* bits soon and since the new flag
>>>> is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
>>>> to widen that field to 16 bits right away and then use bit 8 (or higher)
>>>> for the purpose here.
>>> I really don't think it matters.  We've got plenty of room, and the
>>> flexibility to shuffle, in both structures.  It's absolutely not worth
>>> trying to introduce asymmetries to save 1 bit.
>> Thanks for all the comments up to here. Just to clarify - I've not spotted
>> anything there that would result in me being expected to take any action.
> 
> I'm tempted to suggest dropping the sentence about "might be more
> efficient".  It's not necessary at all IMO, and it's probably not
> correct if you were to compare an atom microserver vs a big Xeon.

Hmm - "might" still isn't "will". ISTR us actually discussing to limit
how long a sequence of NOPs we'd tolerate before switching to JMP.

>>>> --- a/xen/arch/x86/include/asm/spec_ctrl.h
>>>> +++ b/xen/arch/x86/include/asm/spec_ctrl.h
>>>> @@ -36,6 +36,8 @@
>>>>  #define SCF_verw       (1 << 3)
>>>>  #define SCF_ist_ibpb   (1 << 4)
>>>>  #define SCF_entry_ibpb (1 << 5)
>>>> +#define SCF_exit_ibpb_bit 6
>>>> +#define SCF_exit_ibpb  (1 << SCF_exit_ibpb_bit)
>>> One option to avoid the second define is to use ILOG2() with btrl.
>> Specifically not. The assembler doesn't know the conditional operator,
>> and the pre-processor won't collapse the expression resulting from
>> expanding ilog2().
> 
> Oh that's dull.
> 
> I suspect we could construct equivalent logic with an .if/.else chain,
> but I have no idea if the order of evaluation would be conducive to
> doing so as part of evaluating an immediate operand.  We would
> specifically not want something that ended looking like `ilog2 const
> "btrl $" ",%eax"`, even though I could see how to make that work.
> 
> It would be nice if we could make something suitable here, but if not we
> can live with the second _bit constant.

How would .if/.else be able to go inside an expression? You can't even
put this in a .macro, as that can't be invoked as part of an expression
either.
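
Just to spell the constraint out (illustration only, not proposed
code): the use site needs an assembly-time constant inside an operand,
while .if/.else and .macro can only ever emit whole statements:

    /*
     * What would be needed is an expression yielding the bit number,
     * i.e. something along the lines of
     *
     *     btrl $ILOG2(SCF_exit_ibpb), %eax
     *
     * which isn't expressible - the C ilog2() is a ?: chain the
     * assembler can't evaluate.  The best .if/.else can do is emit a
     * complete statement with the immediate still hard-coded, which
     * defeats the point:
     */
    .if SCF_exit_ibpb == (1 << 6)
    btrl $6, %eax
    .endif

Hence the separate SCF_exit_ibpb_bit constant.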

>>>> @@ -272,6 +293,14 @@
>>>>  #define SPEC_CTRL_EXIT_TO_PV                                            \
>>>>      ALTERNATIVE "",                                                     \
>>>>          DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
>>>> +    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
>>>> +        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
>>>> +                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
>>>> +                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
>>>> +        X86_FEATURE_IBPB_EXIT_PV;                                       \
>>>> +PASTE(.Lscexitpv_rsb, __LINE__):                                        \
>>>> +    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
>>>> +PASTE(.Lscexitpv_done, __LINE__):                                       \
>>>>      DO_SPEC_CTRL_COND_VERW
>>> What's wrong with the normal %= trick?
>> We're in a C macro here which is then used in assembly code. %= only
>> works in asm(), though. If we were in an assembler macro, I'd have
>> used \@. Yet wrapping the whole thing in an assembler macro would, for
>> my taste, hide too much information from the use sites (in particular
>> the X86_{FEATURE,BUG}_* which are imo relevant to be visible there).
>>
>>>   The use of __LINE__ makes this
>>> hard to subsequently livepatch, so I'd prefer to avoid it if possible.
>> Yes, I was certainly aware this would be a concern. I couldn't think of
>> a (forward looking) clean solution, though: Right now we have only one
>> use per source file (the native and compat PV entry.S), so we could use
>> a context-independent label name. But as you say above, for FRED we're
>> likely to get new entry points, and they're likely better placed in the
>> same files.
> 
> I was going to suggest using __COUNTER__ but I've just realised why that
> won't work.
> 
> But on further consideration, this might be ok for livepatching, so long
> as __LINE__ is only ever used with a local label name.  By the time it
> has been compiled to a binary, the label name won't survive; only the
> resulting displacement will.
> 
> I think we probably want to merge this with TEMP_NAME() (perhaps as
> UNIQ_NAME(), as it will have to move elsewhere to become common with
> this) to avoid proliferating our livepatching reasoning.

Even if I had recalled that we have TEMP_NAME() somewhere, I'd be very
hesitant to make anything like that into more generally accessible
infrastructure. I consider TEMP_NAME() itself already a too widely
exposed. The problem with it is that you can easily end up with two uses
as the result of expanding something that's all contained on a single
source line. Hence I very specifically want to have uses of __LINE__
only in places where either it is the top level source line, or where
- as is the case here - it is clear that no two instances of the same or
a similar macro will ever sensibly be put on one line. (Even then there's
still the risk of using the C macro inside an assembler macro, where two
instances would cause problems. But if such appeared, making the
assembler macro suitably use \@ instead shouldn't be overly difficult.)
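
To illustrate both halves of that with made-up fragments:

    /*
     * Two expansions reached from a single source line collide, as
     * both see the same __LINE__:
     *
     *     SPEC_CTRL_EXIT_TO_PV; SPEC_CTRL_EXIT_TO_PV
     *
     * -> .Lscexitpv_done<N> (and .Lscexitpv_rsb<N>) defined twice.
     */

    /* Whereas inside an assembler macro, gas's \@ (a per-invocation
     * counter, usable only within .macro bodies) would be the right
     * disambiguator (hypothetical wrapper, for illustration only): */
    .macro spec_ctrl_exit_pv_wrap
        jmp  .Lscexitpv_done_\@
        /* ... */
    .Lscexitpv_done_\@:
    .endm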

Jan


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-01-26 20:43   ` Andrew Cooper
@ 2023-02-06 14:24     ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2023-02-06 14:24 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, xen-devel

On 26.01.2023 21:43, Andrew Cooper wrote:
> On 25/01/2023 3:26 pm, Jan Beulich wrote:
>> In order to avoid clobbering Xen's own predictions, defer the barrier as
>> much as possible.
> 
> I can't actually think of a case where this matters.  We've done a
> reasonable amount of work to get rid of indirect branches, and rets were
> already immaterial because of the reset_stack_and_jump().
> 
> What I'm trying to figure out is whether this ends up hurting us.  If
> there was an indirect call we used reliably pre and post context switch,
> that changed target based on the guest type, then we'd now retain and
> use the bad prediction.

Hmm, so far my understanding was that the context-switch IBPB is
solely for the guests' benefit; this looks to be supported by the comments
there. If we were worried about Xen itself here (which of course is a
valid concern), then I think that together with the issue you've
spotted in patch 3 all I can do is drop the two middle patches (and
the HVM part of patch 1). At which point ...

>>  Merely mark the CPU as needing a barrier issued the
>> next time we're exiting to guest context.
>>
>> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I couldn't find any sensible (central/unique) place where to move the
>> comment which is being deleted alongside spec_ctrl_new_guest_context().
> 
> Given this, I'm wondering whether in patch 1, it might be better to name
> the new bit SCF_new_guest_context.  Or new_pred_context (which, on
> consideration, I think is better than the current name)?
> 
> This would have a knock-on effect on the feature names, which I think is
> important because I think you've got a subtle bug in patch 3.

... this renaming, which I've done already, may have been in vain.

Jan


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3 3/4] x86: limit issuing of IBPB during context switch
  2023-01-27 17:47       ` Andrew Cooper
@ 2023-02-06 14:58         ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2023-02-06 14:58 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné, xen-devel

On 27.01.2023 18:47, Andrew Cooper wrote:
> On 27/01/2023 7:51 am, Jan Beulich wrote:
>> On 26.01.2023 21:49, Andrew Cooper wrote:
>>> On 25/01/2023 3:26 pm, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/domain.c
>>>> +++ b/xen/arch/x86/domain.c
>>>> @@ -2015,7 +2015,8 @@ void context_switch(struct vcpu *prev, s
>>>>  
>>>>          ctxt_switch_levelling(next);
>>>>  
>>>> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
>>>> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
>>>> +             !(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) )
>>>>          {
>>>>              static DEFINE_PER_CPU(unsigned int, last);
>>>>              unsigned int *last_id = &this_cpu(last);
>>>>
>>>>
>>> The aforementioned naming change makes the (marginal) security hole here
>>> more obvious.
>>>
>>> When we use entry-IBPB to protect Xen, we only care about the branch
>>> types in the BTB.  We don't flush the RSB when using the SMEP optimisation.
>>>
>>> Therefore, entry-IBPB is not something which lets us safely skip
>>> exit-new-pred-context.
>> Yet what's to be my takeaway? You may be suggesting to drop the patch,
>> or you may be suggesting to tighten the condition. (My guess would be
>> the former.)
> 
> Well - the patch can't be committed in this form.
> 
> I haven't figured out if there is a nice way to solve this, so I'm
> afraid I don't have a helpful answer to your question.  I think it is
> reasonable to wait a bit and see if a solution comes to mind.
> 
> I'm not averse in principle to having this optimisation (well - a
> working version of it), but honestly, it's marginal right now and it has
> to be weighed against whatever extra complexity is required to not open
> the security hole.

Well, the condition can be extended accordingly (see patch fragment at
the end), but the question is going to be whether the more complex
logic is going to be worth the savings (especially in case it needs
further extending, i.e. if I still have overlooked something).

> And FYI, this (general issue) was precisely why I ended up not trying to
> do this rearranging in XSA-407/422.  I almost missed this security hole
> and acked the patch.

I appreciate you having forced us to be cautious at that time.

Jan

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2067,17 +2067,26 @@ void context_switch(struct vcpu *prev, s
     }
     else
     {
+        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
+
         __context_switch();
 
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
+        {
             load_segments(next);
 
+            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
+        }
+
         ctxt_switch_levelling(next);
 
-        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
+        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
+             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
+              /* is_idle_domain(prevd) || */
+              !boot_cpu_has(feat_sc_rsb)) )
         {
             static DEFINE_PER_CPU(unsigned int, last);
             unsigned int *last_id = &this_cpu(last);



^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2023-02-06 14:58 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-25 15:24 [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Jan Beulich
2023-01-25 15:25 ` [PATCH v3 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
2023-01-25 21:10   ` Andrew Cooper
2023-01-26  8:02     ` Jan Beulich
2023-01-26 20:27       ` Andrew Cooper
2023-02-06 13:58         ` Jan Beulich
2023-01-25 15:26 ` [PATCH v3 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
2023-01-26 20:43   ` Andrew Cooper
2023-02-06 14:24     ` Jan Beulich
2023-01-25 15:26 ` [PATCH v3 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
2023-01-26 20:49   ` Andrew Cooper
2023-01-27  7:51     ` Jan Beulich
2023-01-27 17:47       ` Andrew Cooper
2023-02-06 14:58         ` Jan Beulich
2023-01-25 15:27 ` [PATCH v3 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode Jan Beulich
2023-01-25 17:49 ` [PATCH v3 0/4] x86/spec-ctrl: IPBP improvements Andrew Cooper
2023-01-26  7:32   ` Jan Beulich
