* [PATCH v4 0/4 + v1 0/1] x86/spec-ctrl: IBPB improvements
From: Jan Beulich @ 2023-02-14 16:09 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Versions of the two final patches were submitted standalone earlier.
The series here carries out a suggestion from Andrew, which the two of
us have been discussing. The previously posted patches are then
re-based on top, utilizing the new functionality.

Xen:

1: spec-ctrl: add logic to issue IBPB on exit to guest
2: spec-ctrl: defer context-switch IBPB until guest entry
3: limit issuing of IBPB during context switch
4: PV: issue branch prediction barrier when switching 64-bit guest to kernel mode

Linux (new):

1: x86/Xen: make use of IBPB controlling VM assist

Jan



* [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
From: Jan Beulich @ 2023-02-14 16:10 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

In order to be able to defer the context switch IBPB to the last
possible point, add logic to the exit-to-guest paths to issue the
barrier there, including the "IBPB doesn't flush the RSB/RAS"
workaround. Since alternatives, for now at least, can't nest, emit JMP
to skip past both constructs where both are needed. This may be more
efficient anyway, as the sequence of NOPs is pretty long.

As with all other conditional blocks on exit-to-guest paths, no
Spectre-v1 protections are necessary as execution will imminently be
hitting a serialising event.
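
For illustration, the one-shot flag handling which DO_SPEC_CTRL_EXIT_IBPB
below implements with btrl/jnc can be modelled in C roughly as follows (a
minimal sketch with stand-in helpers, not Xen code; the real macro operates
on cpu_info's spec_ctrl_flags via %rsp):

#include <stdio.h>

enum { SCF_new_pred_ctxt_bit = 6 };

static unsigned int spec_ctrl_flags = 1u << SCF_new_pred_ctxt_bit;

/* Stand-in for btrl: test-and-clear, reporting the bit's prior value. */
static int test_and_clear(unsigned int *flags, int bit)
{
    int was_set = (*flags >> bit) & 1;

    *flags &= ~(1u << bit);
    return was_set;
}

int main(void)
{
    /* First exit to guest: flag set, so the barrier would be issued. */
    if (test_and_clear(&spec_ctrl_flags, SCF_new_pred_ctxt_bit))
        puts("issue IBPB (write PRED_CMD_IBPB to MSR_PRED_CMD)");

    /* Subsequent exits: flag already consumed, no further barrier. */
    if (!test_and_clear(&spec_ctrl_flags, SCF_new_pred_ctxt_bit))
        puts("flag already clear - no barrier");

    return 0;
}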

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I have to admit that I'm not really certain about the placement of the
IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
entry".

Since we're going to run out of SCF_* bits soon and since the new flag
is meaningful only in struct cpu_info's spec_ctrl_flags, we could choose
to widen that field to 16 bits right away and then use bit 8 (or higher)
for the purpose here.
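
As a side note on the asm/current.h re-ordering below: the new btrl in the
exit path performs a 32-bit read-modify-write at the field's address, which
is presumably what the in-diff comment about 32-bit accesses refers to (an
assumption on my part). A layout sketch, field subset only:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Field subset of struct cpu_info, re-ordered as in the patch. */
struct cpu_info_sketch {
    unsigned int shadow_spec_ctrl;
    uint8_t      spec_ctrl_flags;   /* now first of the three byte fields */
    uint8_t      xen_spec_ctrl;
    uint8_t      last_spec_ctrl;
};

int main(void)
{
    /* A 32-bit access at spec_ctrl_flags covers the remaining byte fields
     * plus tail padding, i.e. it stays within the object. */
    printf("offset %zu, struct size %zu\n",
           offsetof(struct cpu_info_sketch, spec_ctrl_flags),
           sizeof(struct cpu_info_sketch));
    return 0;
}
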
---
v4: Alter parts of the description. Re-word a comment. Rename flag and
    feature identifiers.
v3: New.

--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -75,6 +75,12 @@ __UNLIKELY_END(nsvm_hap)
         .endm
         ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM
 
+        ALTERNATIVE "jmp 2f", __stringify(DO_SPEC_CTRL_EXIT_IBPB disp=(2f-1f)), \
+                    X86_FEATURE_NEW_PRED_CTXT_HVM
+1:
+        ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET
+2:
+
         pop  %r15
         pop  %r14
         pop  %r13
--- a/xen/arch/x86/hvm/vmx/entry.S
+++ b/xen/arch/x86/hvm/vmx/entry.S
@@ -86,7 +86,8 @@ UNLIKELY_END(realmode)
         jz .Lvmx_vmentry_restart
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob:    */
+        /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob: acd */
+        ALTERNATIVE "", DO_SPEC_CTRL_EXIT_IBPB, X86_FEATURE_NEW_PRED_CTXT_HVM
         DO_SPEC_CTRL_COND_VERW
 
         mov  VCPU_hvm_guest_cr2(%rbx),%rax
--- a/xen/arch/x86/include/asm/cpufeatures.h
+++ b/xen/arch/x86/include/asm/cpufeatures.h
@@ -39,8 +39,10 @@ XEN_CPUFEATURE(XEN_LBR,           X86_SY
 XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
 XEN_CPUFEATURE(XEN_SHSTK,         X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
 XEN_CPUFEATURE(XEN_IBT,           X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
-XEN_CPUFEATURE(IBPB_ENTRY_PV,     X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */
-XEN_CPUFEATURE(IBPB_ENTRY_HVM,    X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
+XEN_CPUFEATURE(IBPB_ENTRY_PV,     X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen when entered from PV */
+XEN_CPUFEATURE(IBPB_ENTRY_HVM,    X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen when entered from HVM */
+XEN_CPUFEATURE(NEW_PRED_CTXT_PV,  X86_SYNTH(30)) /* issue prediction barrier when exiting to PV */
+XEN_CPUFEATURE(NEW_PRED_CTXT_HVM, X86_SYNTH(31)) /* issue prediction barrier when exiting to HVM */
 
 /* Bug words follow the synthetic words. */
 #define X86_NR_BUG 1
--- a/xen/arch/x86/include/asm/current.h
+++ b/xen/arch/x86/include/asm/current.h
@@ -55,9 +55,13 @@ struct cpu_info {
 
     /* See asm/spec_ctrl_asm.h for usage. */
     unsigned int shadow_spec_ctrl;
+    /*
+     * spec_ctrl_flags is accessed as a 32-bit entity in certain cases. Place
+     * it accordingly.
+     */
+    uint8_t      spec_ctrl_flags;
     uint8_t      xen_spec_ctrl;
     uint8_t      last_spec_ctrl;
-    uint8_t      spec_ctrl_flags;
 
     /*
      * The following field controls copying of the L4 page table of 64-bit
--- a/xen/arch/x86/include/asm/spec_ctrl.h
+++ b/xen/arch/x86/include/asm/spec_ctrl.h
@@ -36,6 +36,8 @@
 #define SCF_verw       (1 << 3)
 #define SCF_ist_ibpb   (1 << 4)
 #define SCF_entry_ibpb (1 << 5)
+#define SCF_new_pred_ctxt_bit 6
+#define SCF_new_pred_ctxt (1 << SCF_new_pred_ctxt_bit)
 
 /*
  * The IST paths (NMI/#MC) can interrupt any arbitrary context.  Some
--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h
+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h
@@ -117,6 +117,27 @@
 .L\@_done:
 .endm
 
+.macro DO_SPEC_CTRL_EXIT_IBPB disp=0
+/*
+ * Requires %rsp=regs
+ * Clobbers %rax, %rcx, %rdx
+ *
+ * Conditionally issue IBPB if SCF_new_pred_ctxt is active.  The macro
+ * invocation may be followed by X86_BUG_IBPB_NO_RET workaround code.  The
+ * "disp" argument is to allow invocation sites to pass in the extra amount
+ * of code which needs skipping in case no action is necessary.
+ *
+ * The flag is a "one-shot" indicator, so it is being cleared at the same time.
+ */
+    btrl    $SCF_new_pred_ctxt_bit, CPUINFO_spec_ctrl_flags(%rsp)
+    jnc     .L\@_skip + (\disp)
+    mov     $MSR_PRED_CMD, %ecx
+    mov     $PRED_CMD_IBPB, %eax
+    xor     %edx, %edx
+    wrmsr
+.L\@_skip:
+.endm
+
 .macro DO_OVERWRITE_RSB tmp=rax
 /*
  * Requires nothing
@@ -272,6 +293,14 @@
 #define SPEC_CTRL_EXIT_TO_PV                                            \
     ALTERNATIVE "",                                                     \
         DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
+    ALTERNATIVE __stringify(jmp PASTE(.Lscexitpv_done, __LINE__)),      \
+        __stringify(DO_SPEC_CTRL_EXIT_IBPB                              \
+                    disp=(PASTE(.Lscexitpv_done, __LINE__) -            \
+                          PASTE(.Lscexitpv_rsb, __LINE__))),            \
+        X86_FEATURE_NEW_PRED_CTXT_PV;                                   \
+PASTE(.Lscexitpv_rsb, __LINE__):                                        \
+    ALTERNATIVE "", DO_OVERWRITE_RSB, X86_BUG_IBPB_NO_RET;              \
+PASTE(.Lscexitpv_done, __LINE__):                                       \
     DO_SPEC_CTRL_COND_VERW
 
 /*
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -8,6 +8,7 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <xen/lib.h>
 #include <public/xen.h>
 #include <irq_vectors.h>
 
@@ -156,7 +157,7 @@ ENTRY(compat_restore_all_guest)
         mov VCPUMSR_spec_ctrl_raw(%rax), %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: acd */
 
         RESTORE_ALL adj=8 compat=1
 .Lft0:  iretq
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -9,6 +9,7 @@
 #include <asm/asm_defns.h>
 #include <asm/page.h>
 #include <asm/processor.h>
+#include <xen/lib.h>
 #include <public/xen.h>
 #include <irq_vectors.h>
 
@@ -187,7 +188,7 @@ restore_all_guest:
         mov   %r15d, %eax
 
         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
-        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+        SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: acd */
 
         RESTORE_ALL
         testw $TRAP_syscall,4(%rsp)




* [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
From: Jan Beulich @ 2023-02-14 16:11 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

In order to avoid clobbering Xen's own predictions, defer the barrier as
much as possible. Merely mark the CPU as needing a barrier issued the
next time we're exiting to guest context.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I couldn't find any sensible (central/unique) place to move the
comment which is being deleted alongside spec_ctrl_new_guest_context().
(If this patch is to survive in the first place, it was suggested to
move it to spec_ctrl_asm.h, next to the #define of the controlling bit.)
---
v4: Re-base in particular over changes earlier in the series.
v3: New.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2038,7 +2038,7 @@ void context_switch(struct vcpu *prev, s
              */
             if ( *last_id != next_id )
             {
-                spec_ctrl_new_guest_context();
+                info->spec_ctrl_flags |= SCF_new_pred_ctxt;
                 *last_id = next_id;
             }
         }
--- a/xen/arch/x86/include/asm/spec_ctrl.h
+++ b/xen/arch/x86/include/asm/spec_ctrl.h
@@ -67,28 +67,6 @@
 void init_speculation_mitigations(void);
 void spec_ctrl_init_domain(struct domain *d);
 
-/*
- * Switch to a new guest prediction context.
- *
- * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code
- * which has previously run on this CPU can't attack subsequent guest code.
- *
- * As this flushes the RSB/RAS, it destroys the predictions of the calling
- * context.  For best performace, arrange for this to be used when we're going
- * to jump out of the current context, e.g. with reset_stack_and_jump().
- *
- * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS
- * manually.
- */
-static always_inline void spec_ctrl_new_guest_context(void)
-{
-    wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
-
-    /* (ab)use alternative_input() to specify clobbers. */
-    alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET,
-                      : "rax", "rcx");
-}
-
 extern int8_t opt_ibpb_ctxt_switch;
 extern bool opt_ssbd;
 extern int8_t opt_eager_fpu;
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -854,6 +854,11 @@ static void __init ibpb_calculations(voi
      */
     if ( opt_ibpb_ctxt_switch == -1 )
         opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv);
+    if ( opt_ibpb_ctxt_switch )
+    {
+        setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_PV);
+        setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_HVM);
+    }
 }
 
 /* Calculate whether this CPU is vulnerable to L1TF. */




* [PATCH v4 3/4] x86: limit issuing of IBPB during context switch
From: Jan Beulich @ 2023-02-14 16:11 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

When the outgoing vCPU had IBPB issued and RSB overwritten upon entering
Xen, then there's no need for a 2nd barrier during context switch.

Note that SCF_entry_ibpb is always clear for the idle domain, so no
explicit idle domain check is needed to augment the feature check
(which is simply inapplicable to "idle").
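
The tightened condition can be read as a small predicate (a sketch mirroring
the hunk below, with illustrative parameter names, not a drop-in):

#include <stdbool.h>
#include <stdio.h>

/*
 * Skip the context-switch IBPB only if the outgoing domain already had
 * IBPB issued on entry to Xen AND Xen overwrites the RSB for the relevant
 * guest type (selected in the hunk below), i.e. both predictor structures
 * were flushed already.
 */
static bool want_ctxt_switch_ibpb(bool opt_ibpb, bool next_is_idle,
                                  bool prev_entry_ibpb, bool rsb_active)
{
    return opt_ibpb && !next_is_idle &&
           (!prev_entry_ibpb || !rsb_active);
}

int main(void)
{
    printf("entry IBPB + RSB overwrite: %d (skip)\n",
           want_ctxt_switch_ibpb(true, false, true, true));
    printf("entry IBPB only:            %d (issue)\n",
           want_ctxt_switch_ibpb(true, false, true, false));
    return 0;
}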

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Tighten the condition.
v3: Fold into series.
---
I think in principle we could limit the impact from finding the idle
domain as "prevd", by having __context_switch() tell us what kind of
domain's vCPU was switched out (it could still be "idle", but in fewer
cases).

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2005,17 +2005,26 @@ void context_switch(struct vcpu *prev, s
     }
     else
     {
+        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
+
         __context_switch();
 
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
+        {
             load_segments(next);
 
+            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
+        }
+
         ctxt_switch_levelling(next);
 
-        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
+        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
+             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
+              /* is_idle_domain(prevd) || */
+              !boot_cpu_has(feat_sc_rsb)) )
         {
             static DEFINE_PER_CPU(unsigned int, last);
             unsigned int *last_id = &this_cpu(last);




* [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
From: Jan Beulich @ 2023-02-14 16:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monné

Since both kernel and user mode run in ring 3, they run in the same
"predictor mode". While the kernel could take care of this itself, doing
so would be yet another item distinguishing PV from native. Additionally
we're in a much better position to issue the barrier command, and we can
save a #GP (for privileged instruction emulation) this way.

To allow performance to be recovered, introduce a new VM assist allowing the
guest kernel to suppress this barrier. Make availability of the assist
dependent upon the command line control, such that kernels have a way to
know whether their request actually took any effect.

Note that because of its use in PV64_VM_ASSIST_MASK, the declaration of
opt_ibpb_mode_switch can't live in asm/spec_ctrl.h.
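
As an aside on the PV64_VM_ASSIST_MASK change: the "+ 0UL" promotes the
int8_t option to unsigned long before the shift, keeping a shift count of
33 well defined. A minimal standalone sketch of the idiom (names mirror
the patch; the option value is a stand-in, being 0 or 1 by the time the
mask is evaluated):

#include <stdint.h>
#include <stdio.h>

#define VMASST_TYPE_mode_switch_no_ibpb 33 /* value from the public header */

int8_t opt_ibpb_mode_switch = 1; /* stand-in; set from the command line */

int main(void)
{
    /* Without "+ 0UL" the shifted operand would be a 32-bit int, and a
     * shift count of 33 would be undefined behaviour. */
    unsigned long mask = (opt_ibpb_mode_switch + 0UL)
                         << VMASST_TYPE_mode_switch_no_ibpb;

    printf("mask = %#lx\n", mask);
    return 0;
}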

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Is the placement of the clearing of opt_ibpb_ctxt_switch correct in
parse_spec_ctrl()? Shouldn't it live ahead of the "disable_common"
label, as being about guest protection, not Xen's?

Adding setting of the variable to the "pv" sub-case in parse_spec_ctrl()
didn't seem quite right to me, considering that we default it to the
opposite of opt_ibpb_entry_pv.
---
v4: Correct the print_details() change. Re-base in particular over
    changes earlier in the series.
v3: Leverage exit-IBPB. Introduce separate command line control.
v2: Leverage entry-IBPB. Add VM assist. Re-base.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2320,8 +2320,8 @@ By default SSBD will be mitigated at run
 ### spec-ctrl (x86)
 > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
 >              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
->              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
->              eager-fpu,l1d-flush,branch-harden,srb-lock,
+>              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ibpb-mode-switch,
+>              ssbd,psfd,eager-fpu,l1d-flush,branch-harden,srb-lock,
 >              unpriv-mmio}=<bool> ]`
 
 Controls for speculative execution sidechannel mitigations.  By default, Xen
@@ -2403,7 +2403,10 @@ default.
 
 On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
 option can be used to force (the default) or prevent Xen from issuing branch
-prediction barriers on vcpu context switches.
+prediction barriers on vcpu context switches.  On such hardware the
+`ibpb-mode-switch` option can be used to control whether, by default, Xen
+would issue branch prediction barriers when 64-bit PV guests switch from
+user to kernel mode.  If enabled, guest kernels can opt out of this behavior.
 
 On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
 from using fully eager FPU context switches.  This is currently implemented as
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -742,6 +742,8 @@ static inline void pv_inject_sw_interrup
     pv_inject_event(&event);
 }
 
+extern int8_t opt_ibpb_mode_switch;
+
 #define PV32_VM_ASSIST_MASK ((1UL << VMASST_TYPE_4gb_segments)        | \
                              (1UL << VMASST_TYPE_4gb_segments_notify) | \
                              (1UL << VMASST_TYPE_writable_pagetables) | \
@@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
  * but we can't make such requests fail all of the sudden.
  */
 #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
-                             (1UL << VMASST_TYPE_m2p_strict))
+                             (1UL << VMASST_TYPE_m2p_strict)          | \
+                             ((opt_ibpb_mode_switch + 0UL) <<           \
+                              VMASST_TYPE_mode_switch_no_ibpb))
 #define HVM_VM_ASSIST_MASK  (1UL << VMASST_TYPE_runstate_update_flag)
 
 #define arch_vm_assist_valid_mask(d) \
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
 void toggle_guest_mode(struct vcpu *v)
 {
     const struct domain *d = v->domain;
+    struct cpu_info *cpu_info = get_cpu_info();
     unsigned long gs_base;
 
     ASSERT(!is_pv_32bit_vcpu(v));
@@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
     if ( v->arch.flags & TF_kernel_mode )
         v->arch.pv.gs_base_kernel = gs_base;
     else
+    {
         v->arch.pv.gs_base_user = gs_base;
+
+        if ( opt_ibpb_mode_switch &&
+             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
+             !VM_ASSIST(d, mode_switch_no_ibpb) )
+            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;
+    }
+
     asm volatile ( "swapgs" );
 
     _toggle_guest_pt(v);
 
     if ( d->arch.pv.xpti )
     {
-        struct cpu_info *cpu_info = get_cpu_info();
-
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
                            (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -60,6 +60,7 @@ bool __ro_after_init opt_ssbd;
 int8_t __initdata opt_psfd = -1;
 
 int8_t __ro_after_init opt_ibpb_ctxt_switch = -1;
+int8_t __ro_after_init opt_ibpb_mode_switch = -1;
 int8_t __read_mostly opt_eager_fpu = -1;
 int8_t __read_mostly opt_l1d_flush = -1;
 static bool __initdata opt_branch_harden = true;
@@ -111,6 +112,8 @@ static int __init cf_check parse_spec_ct
             if ( opt_pv_l1tf_domu < 0 )
                 opt_pv_l1tf_domu = 0;
 
+            opt_ibpb_mode_switch = 0;
+
             if ( opt_tsx == -1 )
                 opt_tsx = -3;
 
@@ -271,6 +274,8 @@ static int __init cf_check parse_spec_ct
         /* Misc settings. */
         else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
             opt_ibpb_ctxt_switch = val;
+        else if ( (val = parse_boolean("ibpb-mode-switch", s, ss)) >= 0 )
+            opt_ibpb_mode_switch = val;
         else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
             opt_eager_fpu = val;
         else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
@@ -527,16 +532,18 @@ static void __init print_details(enum in
 
 #endif
 #ifdef CONFIG_PV
-    printk("  Support for PV VMs:%s%s%s%s%s%s\n",
+    printk("  Support for PV VMs:%s%s%s%s%s%s%s\n",
            (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
             boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
-            opt_eager_fpu || opt_md_clear_pv)        ? ""               : " None",
+            opt_eager_fpu || opt_md_clear_pv ||
+            opt_ibpb_mode_switch)                    ? ""               : " None",
            boot_cpu_has(X86_FEATURE_SC_MSR_PV)       ? " MSR_SPEC_CTRL" : "",
            boot_cpu_has(X86_FEATURE_SC_RSB_PV)       ? " RSB"           : "",
            opt_eager_fpu                             ? " EAGER_FPU"     : "",
            opt_md_clear_pv                           ? " MD_CLEAR"      : "",
-           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "");
+           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "",
+           opt_ibpb_mode_switch                      ? " IBPB-mode-switch" : "");
 
     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
            opt_xpti_hwdom ? "enabled" : "disabled",
@@ -804,7 +811,8 @@ static void __init ibpb_calculations(voi
     /* Check we have hardware IBPB support before using it... */
     if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
     {
-        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0;
+        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = 0;
+        opt_ibpb_mode_switch = opt_ibpb_ctxt_switch = 0;
         opt_ibpb_entry_dom0 = false;
         return;
     }
@@ -859,6 +867,18 @@ static void __init ibpb_calculations(voi
         setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_PV);
         setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_HVM);
     }
+
+#ifdef CONFIG_PV
+    /*
+     * If we're using IBPB-on-entry to protect against PV guests, then
+     * there's no need to also issue IBPB on a guest user->kernel switch.
+     */
+    if ( opt_ibpb_mode_switch == -1 )
+        opt_ibpb_mode_switch = !opt_ibpb_entry_pv ||
+                               (!opt_ibpb_entry_dom0 && !opt_dom0_pvh);
+    if ( opt_ibpb_mode_switch )
+        setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_PV);
+#endif
 }
 
 /* Calculate whether this CPU is vulnerable to L1TF. */
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -554,6 +554,16 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  */
 #define VMASST_TYPE_m2p_strict           32
 
+/*
+ * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
+ *
+ * By default (on affected and capable hardware) as a safety measure Xen,
+ * to cover for the fact that guest-kernel and guest-user modes are both
+ * running in ring 3 (and hence share prediction context), would issue a
+ * barrier for user->kernel mode switches of PV guests.
+ */
+#define VMASST_TYPE_mode_switch_no_ibpb  33
+
 #if __XEN_INTERFACE_VERSION__ < 0x00040600
 #define MAX_VMASST_TYPE                  3
 #endif




* [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Jan Beulich @ 2023-02-14 16:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Juergen Gross, Boris Ostrovsky

If this VM assist is available (to PV guests only), use it to
- avoid issuing an IBPB ourselves upon entry from user mode (which the
  hypervisor would then have to emulate, as the MSR write traps),
- suppress the IBPB in the hypervisor if we don't mean to have one
  issued.

As there's no good place to have xen_vm_assist_ibpb() as an inline
function, make it an init-only out-of-line one.

While adjusting the Xen public header, drop the unused and no longer
applicable MAX_VMASST_TYPE (instead of modifying its value).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -43,6 +43,8 @@ static inline uint32_t xen_cpuid_base(vo
 	return hypervisor_cpuid_base("XenVMMXenVMM", 2);
 }
 
+int xen_vm_assist_ibpb(bool enable);
+
 struct pci_dev;
 
 #ifdef CONFIG_XEN_PV_DOM0
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -18,6 +18,8 @@
 #include <linux/pgtable.h>
 #include <linux/bpf.h>
 
+#include <xen/xen.h>
+
 #include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
 #include <asm/bugs.h>
@@ -32,6 +34,7 @@
 #include <asm/intel-family.h>
 #include <asm/e820/api.h>
 #include <asm/hypervisor.h>
+#include <asm/xen/hypervisor.h>
 #include <asm/tlbflush.h>
 
 #include "cpu.h"
@@ -934,7 +937,8 @@ do_cmd_auto:
 		break;
 
 	case RETBLEED_MITIGATION_IBPB:
-		setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+		if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
+			setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
 		mitigate_smt = true;
 		break;
 
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -294,6 +294,17 @@ int xen_panic_handler_init(void)
 	return 0;
 }
 
+int __init xen_vm_assist_ibpb(bool enable)
+{
+	/*
+	 * Note that the VM-assist is a disable, so a request to enable IBPB
+	 * on our behalf needs to turn the functionality off (and vice versa).
+	 */
+	return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
+					   : VMASST_CMD_enable,
+				    VMASST_TYPE_mode_switch_no_ibpb);
+}
+
 void xen_pin_vcpu(int cpu)
 {
 	static bool disable_pinning;
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -940,6 +940,13 @@ static void __init xen_pvmmu_arch_setup(
 	HYPERVISOR_vm_assist(VMASST_CMD_enable,
 			     VMASST_TYPE_pae_extended_cr3);
 
+	/*
+	 * By default suppress the hypervisor issuing IBPB on our behalf.  In
+	 * the RETBLEED_MITIGATION_IBPB case the VM assist will be disengaged
+	 * again in retbleed_select_mitigation().
+	 */
+	xen_vm_assist_ibpb(false);
+
 	if (register_callback(CALLBACKTYPE_event,
 			      xen_asm_exc_xen_hypervisor_callback) ||
 	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -413,7 +413,15 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
  */
 #define VMASST_TYPE_runstate_update_flag 5
 
-#define MAX_VMASST_TYPE 5
+/*
+ * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
+ *
+ * By default (on affected and capable hardware) as a safety measure Xen,
+ * to cover for the fact that guest-kernel and guest-user modes are both
+ * running in ring 3 (and hence share prediction context), would issue a
+ * barrier for user->kernel mode switches of PV guests.
+ */
+#define VMASST_TYPE_mode_switch_no_ibpb  33
 
 #ifndef __ASSEMBLY__
 




* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Boris Ostrovsky @ 2023-02-14 23:53 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Juergen Gross


On 2/14/23 11:13 AM, Jan Beulich wrote:

> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -18,6 +18,8 @@
>   #include <linux/pgtable.h>
>   #include <linux/bpf.h>
>   
> +#include <xen/xen.h>
> +
>   #include <asm/spec-ctrl.h>
>   #include <asm/cmdline.h>
>   #include <asm/bugs.h>
> @@ -32,6 +34,7 @@
>   #include <asm/intel-family.h>
>   #include <asm/e820/api.h>
>   #include <asm/hypervisor.h>
> +#include <asm/xen/hypervisor.h>
>   #include <asm/tlbflush.h>
>   
>   #include "cpu.h"
> @@ -934,7 +937,8 @@ do_cmd_auto:
>   		break;
>   
>   	case RETBLEED_MITIGATION_IBPB:
> -		setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
> +		if (!xen_pv_domain() || xen_vm_assist_ibpb(true))


Is this going to compile without CONFIG_XEN?


I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.


-boris


> +			setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>   		mitigate_smt = true;
>   		break;



* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Boris Ostrovsky @ 2023-02-15  0:07 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Juergen Gross


On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>
> On 2/14/23 11:13 AM, Jan Beulich wrote:
>
>> --- a/arch/x86/kernel/cpu/bugs.c
>> +++ b/arch/x86/kernel/cpu/bugs.c
>> @@ -18,6 +18,8 @@
>>   #include <linux/pgtable.h>
>>   #include <linux/bpf.h>
>>   +#include <xen/xen.h>
>> +
>>   #include <asm/spec-ctrl.h>
>>   #include <asm/cmdline.h>
>>   #include <asm/bugs.h>
>> @@ -32,6 +34,7 @@
>>   #include <asm/intel-family.h>
>>   #include <asm/e820/api.h>
>>   #include <asm/hypervisor.h>
>> +#include <asm/xen/hypervisor.h>
>>   #include <asm/tlbflush.h>
>>     #include "cpu.h"
>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>           break;
>>         case RETBLEED_MITIGATION_IBPB:
>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>
>
> Is this going to compile without CONFIG_XEN?
>
>
> I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.
>

Oh, and this needs x86 maintainers.


-boris




* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Jan Beulich @ 2023-02-15  8:31 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: Andrew Cooper, Juergen Gross, xen-devel

On 15.02.2023 01:07, Boris Ostrovsky wrote:
> 
> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>
>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>
>>> --- a/arch/x86/kernel/cpu/bugs.c
>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>> @@ -18,6 +18,8 @@
>>>   #include <linux/pgtable.h>
>>>   #include <linux/bpf.h>
>>>   +#include <xen/xen.h>
>>> +
>>>   #include <asm/spec-ctrl.h>
>>>   #include <asm/cmdline.h>
>>>   #include <asm/bugs.h>
>>> @@ -32,6 +34,7 @@
>>>   #include <asm/intel-family.h>
>>>   #include <asm/e820/api.h>
>>>   #include <asm/hypervisor.h>
>>> +#include <asm/xen/hypervisor.h>
>>>   #include <asm/tlbflush.h>
>>>     #include "cpu.h"
>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>           break;
>>>         case RETBLEED_MITIGATION_IBPB:
>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>
>>
>> Is this going to compile without CONFIG_XEN?

Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
the compiler) and DCE will eliminate the call to the function due to
xen_pv_domain() being constant "false" in that case, avoiding any
linking issues. The interesting case here really is building with
XEN but without XEN_PV: That's why I needed to put the function in
enlighten.c. This wouldn't be needed if xen_pv_domain() was also
constant "false" in that case (just like xen_pvh_domain() is when
!XEN_PVH).
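
The pattern can be reproduced standalone (a sketch with stand-in
definitions, built with optimization, e.g. -O2, so the constant-false
inline actually makes the call dead):

#include <stdbool.h>
#include <stdio.h>

/* Declaration is visible; no definition exists in this build. */
int xen_vm_assist_ibpb(bool enable);

/* Stand-in for xen_pv_domain() with CONFIG_XEN off: constant false. */
static inline bool xen_pv_domain(void) { return false; }

int main(void)
{
    /*
     * The short-circuit makes the call unreachable; dead-code
     * elimination drops the reference, so the missing definition of
     * xen_vm_assist_ibpb() causes no link error.
     */
    if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
        puts("setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB) (stand-in)");
    return 0;
}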

>> I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.

I would have done so, if I had any halfway sensible idea on how to
go about doing so in this particular case. In the absence of that it
looked okay-ish to me to reference Xen functions directly here.

> Oh, and this needs x86 maintainers.

Eventually yes. But I would prefer to sort the above question first
(which I'm sure would have been raised by them, in perhaps more
harsh a way), hence the initially limited exposure.

Jan



* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Boris Ostrovsky @ 2023-02-15 23:22 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Juergen Gross, xen-devel


On 2/15/23 3:31 AM, Jan Beulich wrote:
> On 15.02.2023 01:07, Boris Ostrovsky wrote:
>> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>>
>>>> --- a/arch/x86/kernel/cpu/bugs.c
>>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>>> @@ -18,6 +18,8 @@
>>>>    #include <linux/pgtable.h>
>>>>    #include <linux/bpf.h>
>>>>    +#include <xen/xen.h>
>>>> +
>>>>    #include <asm/spec-ctrl.h>
>>>>    #include <asm/cmdline.h>
>>>>    #include <asm/bugs.h>
>>>> @@ -32,6 +34,7 @@
>>>>    #include <asm/intel-family.h>
>>>>    #include <asm/e820/api.h>
>>>>    #include <asm/hypervisor.h>
>>>> +#include <asm/xen/hypervisor.h>
>>>>    #include <asm/tlbflush.h>
>>>>      #include "cpu.h"
>>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>>            break;
>>>>          case RETBLEED_MITIGATION_IBPB:
>>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>>
>>> Is this going to compile without CONFIG_XEN?
> Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
> the compiler) and DCE will eliminate the call to the function due to
> xen_pv_domain() being constant "false" in that case, avoiding any
> linking issues. The interesting case here really is building with
> XEN but without XEN_PV: That's why I needed to put the function in
> enlighten.c. This wouldn't be needed if xen_pv_domain() was also
> constant "false" in that case (just like xen_pvh_domain() is when
> !XEN_PVH).
>
>>> I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.
> I would have done so, if I had any halfway sensible idea on how to
> go about doing so in this particular case. In the absence of that it
> looked okay-ish to me to reference Xen functions directly here.
>
>> Oh, and this needs x86 maintainers.
> Eventually yes. But I would prefer to sort the above question first
> (which I'm sure would have been raised by them, in perhaps more
> harsh a way), hence the initially limited exposure.
>

I also think there is a bit of a disconnect between how the mitigation is reported in the log/sysfs (retbleed_mitigation is RETBLEED_MITIGATION_IBPB, so "Mitigation: IBPB") and, for example, lscpu (since X86_FEATURE_ENTRY_IBPB is not set anymore).


-boris




* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Jan Beulich @ 2023-02-16  7:33 UTC (permalink / raw)
  To: Boris Ostrovsky; +Cc: Andrew Cooper, Juergen Gross, xen-devel

On 16.02.2023 00:22, Boris Ostrovsky wrote:
> 
> On 2/15/23 3:31 AM, Jan Beulich wrote:
>> On 15.02.2023 01:07, Boris Ostrovsky wrote:
>>> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>>>
>>>>> --- a/arch/x86/kernel/cpu/bugs.c
>>>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>>>> @@ -18,6 +18,8 @@
>>>>>    #include <linux/pgtable.h>
>>>>>    #include <linux/bpf.h>
>>>>>    +#include <xen/xen.h>
>>>>> +
>>>>>    #include <asm/spec-ctrl.h>
>>>>>    #include <asm/cmdline.h>
>>>>>    #include <asm/bugs.h>
>>>>> @@ -32,6 +34,7 @@
>>>>>    #include <asm/intel-family.h>
>>>>>    #include <asm/e820/api.h>
>>>>>    #include <asm/hypervisor.h>
>>>>> +#include <asm/xen/hypervisor.h>
>>>>>    #include <asm/tlbflush.h>
>>>>>      #include "cpu.h"
>>>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>>>            break;
>>>>>          case RETBLEED_MITIGATION_IBPB:
>>>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>>>
>>>> Is this going to compile without CONFIG_XEN?
>> Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
>> the compiler) and DCE will eliminate the call to the function due to
>> xen_pv_domain() being constant "false" in that case, avoiding any
>> linking issues. The interesting case here really is building with
>> XEN but without XEN_PV: That's why I needed to put the function in
>> enlighten.c. This wouldn't be needed if xen_pv_domain() was also
>> constant "false" in that case (just like xen_pvh_domain() is when
>> !XEN_PVH).
>>
>>>> I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.
>> I would have done so, if I had any halfway sensible idea on how to
>> go about doing so in this particular case. In the absence of that it
>> looked okay-ish to me to reference Xen functions directly here.
>>
>>> Oh, and this needs x86 maintainers.
>> Eventually yes. But I would prefer to sort the above question first
>> (which I'm sure would have been raised by them, in perhaps more
>> harsh a way), hence the initially limited exposure.
>>
> 
> I also think there is a bit of a disconnect between how the mitigation is reported in the log/sysfs (retbleed_mitigation is RETBLEED_MITIGATION_IBPB, so "Mitigation: IBPB") and, for example, lscpu (since X86_FEATURE_ENTRY_IBPB is not set anymore).

Initially I too was worried about this, but ENTRY_IBPB is not exposed,
as per the empty double quotes in

#define X86_FEATURE_ENTRY_IBPB		(11*32+10) /* "" Issue an IBPB on kernel entry */

Jan



* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Juergen Gross @ 2023-03-17 13:56 UTC (permalink / raw)
  To: Jan Beulich, Boris Ostrovsky; +Cc: Andrew Cooper, xen-devel



On 15.02.23 09:31, Jan Beulich wrote:
> On 15.02.2023 01:07, Boris Ostrovsky wrote:
>>
>> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>>
>>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>>
>>>> --- a/arch/x86/kernel/cpu/bugs.c
>>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>>> @@ -18,6 +18,8 @@
>>>>    #include <linux/pgtable.h>
>>>>    #include <linux/bpf.h>
>>>>    +#include <xen/xen.h>
>>>> +
>>>>    #include <asm/spec-ctrl.h>
>>>>    #include <asm/cmdline.h>
>>>>    #include <asm/bugs.h>
>>>> @@ -32,6 +34,7 @@
>>>>    #include <asm/intel-family.h>
>>>>    #include <asm/e820/api.h>
>>>>    #include <asm/hypervisor.h>
>>>> +#include <asm/xen/hypervisor.h>
>>>>    #include <asm/tlbflush.h>
>>>>      #include "cpu.h"
>>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>>            break;
>>>>          case RETBLEED_MITIGATION_IBPB:
>>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>>
>>>
>>> Is this going to compile without CONFIG_XEN?
> 
> Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
> the compiler) and DCE will eliminate the call to the function due to
> xen_pv_domain() being constant "false" in that case, avoiding any
> linking issues. The interesting case here really is building with
> XEN but without XEN_PV: That's why I needed to put the function in
> enlighten.c. This wouldn't be needed if xen_pv_domain() was also
> constant "false" in that case (just like xen_pvh_domain() is when
> !XEN_PVH).
> 
>>> I also think these two conditions should be wrapped into something to limit exposure of non-Xen code to Xen-specific primitives.
> 
> I would have done so, if I had any halfway sensible idea on how to
> go about doing so in this particular case. In the absence of that it
> looked okay-ish to me to reference Xen functions directly here.
> 
>> Oh, and this needs x86 maintainers.
> 
> Eventually yes. But I would prefer to sort the above question first
> (which I'm sure would have been raised by them, in perhaps more
> harsh a way), hence the initially limited exposure.

I'd rather add _one_ hook for Xen-PV in check_bugs() just before the call
of arch_smt_update(). This can then correct any needed mitigation settings.

So something like (note that due to using cpu_feature_enabled(X86_FEATURE_XENPV)
DCE is happening in case CONFIG_XEN_PV isn't defined):

--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -63,4 +63,7 @@ void __init xen_pvh_init(struct boot_params *boot_params);
  void __init mem_map_via_hcall(struct boot_params *boot_params_p);
  #endif

+int __init xen_vm_assist_ibpb(bool enable);
+void __init xen_pv_fix_mitigations(void);
+
  #endif /* _ASM_X86_XEN_HYPERVISOR_H */
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -18,6 +18,8 @@
  #include <linux/pgtable.h>
  #include <linux/bpf.h>

+#include <xen/xen.h>
+
  #include <asm/spec-ctrl.h>
  #include <asm/cmdline.h>
  #include <asm/bugs.h>
@@ -177,6 +179,9 @@ void __init check_bugs(void)
         srbds_select_mitigation();
         l1d_flush_select_mitigation();

+       if (cpu_feature_enabled(X86_FEATURE_XENPV))
+               xen_pv_fix_mitigations();
+
         arch_smt_update();

  #ifdef CONFIG_X86_32
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
         return 0;
  }

+int __init xen_vm_assist_ibpb(bool enable)
+{
+       /*
+        * Note that the VM-assist is a disable, so a request to enable IBPB
+        * on our behalf needs to turn the functionality off (and vice versa).
+        */
+       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
+                                          : VMASST_CMD_enable,
+                                   VMASST_TYPE_mode_switch_no_ibpb);
+}
+
+void __init xen_pv_fix_mitigations(void)
+{
+       if (!xen_vm_assist_ibpb(true))
+               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+}
+
  const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
         .name                   = "Xen PV",
         .detect                 = xen_platform_pv,
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -413,7 +413,15 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
   */
  #define VMASST_TYPE_runstate_update_flag 5

-#define MAX_VMASST_TYPE 5
+/*
+ * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
+ *
+ * By default (on affected and capable hardware) as a safety measure Xen,
+ * to cover for the fact that guest-kernel and guest-user modes are both
+ * running in ring 3 (and hence share prediction context), would issue a
+ * barrier for user->kernel mode switches of PV guests.
+ */
+#define VMASST_TYPE_mode_switch_no_ibpb  33

  #ifndef __ASSEMBLY__


Juergen




* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Andrew Cooper @ 2023-03-17 14:21 UTC (permalink / raw)
  To: Juergen Gross, Jan Beulich, Boris Ostrovsky; +Cc: xen-devel

On 17/03/2023 1:56 pm, Juergen Gross wrote:
> On 15.02.23 09:31, Jan Beulich wrote:
>> On 15.02.2023 01:07, Boris Ostrovsky wrote:
>>>
>>> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>>>
>>>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>>>
>>>>> --- a/arch/x86/kernel/cpu/bugs.c
>>>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>>>> @@ -18,6 +18,8 @@
>>>>>    #include <linux/pgtable.h>
>>>>>    #include <linux/bpf.h>
>>>>>    +#include <xen/xen.h>
>>>>> +
>>>>>    #include <asm/spec-ctrl.h>
>>>>>    #include <asm/cmdline.h>
>>>>>    #include <asm/bugs.h>
>>>>> @@ -32,6 +34,7 @@
>>>>>    #include <asm/intel-family.h>
>>>>>    #include <asm/e820/api.h>
>>>>>    #include <asm/hypervisor.h>
>>>>> +#include <asm/xen/hypervisor.h>
>>>>>    #include <asm/tlbflush.h>
>>>>>      #include "cpu.h"
>>>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>>>            break;
>>>>>          case RETBLEED_MITIGATION_IBPB:
>>>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>>>
>>>>
>>>> Is this going to compile without CONFIG_XEN?
>>
>> Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
>> the compiler) and DCE will eliminate the call to the function due to
>> xen_pv_domain() being constant "false" in that case, avoiding any
>> linking issues. The interesting case here really is building with
>> XEN but without XEN_PV: That's why I needed to put the function in
>> enlighten.c. This wouldn't be needed if xen_pv_domain() was also
>> constant "false" in that case (just like xen_pvh_domain() is when
>> !XEN_PVH).
>>
>>>> I also think these two conditions should be wrapped into something
>>>> to limit exposure of non-Xen code to Xen-specific primitives.
>>
>> I would have done so, if I had any halfway sensible idea on how to
>> go about doing so in this particular case. In the absence of that it
>> looked okay-ish to me to reference Xen functions directly here.
>>
>>> Oh, and this needs x86 maintainers.
>>
>> Eventually yes. But I would prefer to sort the above question first
>> (which I'm sure would have been raised by them, in perhaps more
>> harsh a way), hence the initially limited exposure.
>
> I'd rather add _one_ hook for Xen-PV in check_bugs() just before the call
> of arch_smt_update(). This can then correct any needed mitigation
> settings.
>
> So something like (note that due to using
> cpu_feature_enabled(X86_FEATURE_XENPV)
> DCE is happening in case CONFIG_XEN_PV isn't defined):
>
> --- a/arch/x86/include/asm/xen/hypervisor.h
> +++ b/arch/x86/include/asm/xen/hypervisor.h
> @@ -63,4 +63,7 @@ void __init xen_pvh_init(struct boot_params
> *boot_params);
>  void __init mem_map_via_hcall(struct boot_params *boot_params_p);
>  #endif
>
> +int __init xen_vm_assist_ibpb(bool enable);
> +void __init xen_pv_fix_mitigations(void);

I'd suggest 'adjust' in preference to 'fix', but this could equally be
xen_pv_mitigations().

XenPV has very legitimate reasons to adjust things owing to it running
in Ring3, but "fix" comes with other implications too.

> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
>         return 0;
>  }
>
> +int __init xen_vm_assist_ibpb(bool enable)
> +{
> +       /*
> +        * Note that the VM-assist is a disable, so a request to
> enable IBPB
> +        * on our behalf needs to turn the functionality off (and vice
> versa).
> +        */
> +       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
> +                                          : VMASST_CMD_enable,
> +                                   VMASST_TYPE_mode_switch_no_ibpb);
> +}
> +
> +void __init xen_pv_fix_mitigations(void)
> +{
> +       if (!xen_vm_assist_ibpb(true))
> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);

If nothing else, this needs a comment explaining what's going on.

"Xen PV guest kernels run in ring3, so share the same prediction domain
as userspace.  Xen (since version $X) defaults to issuing an IBPB on
guest user -> guest kernel transitions on behalf of the guest kernel. 
If Linux isn't depending on mode based prediction separation, turn this
behaviour off".

But this does open the next question.  Yes, unilaterally turning this
off restores the prior behaviour, but is this really the best thing to
do?

~Andrew



* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Juergen Gross @ 2023-03-17 14:28 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich, Boris Ostrovsky; +Cc: xen-devel



On 17.03.23 15:21, Andrew Cooper wrote:
> On 17/03/2023 1:56 pm, Juergen Gross wrote:
>> On 15.02.23 09:31, Jan Beulich wrote:
>>> On 15.02.2023 01:07, Boris Ostrovsky wrote:
>>>>
>>>> On 2/14/23 6:53 PM, Boris Ostrovsky wrote:
>>>>>
>>>>> On 2/14/23 11:13 AM, Jan Beulich wrote:
>>>>>
>>>>>> --- a/arch/x86/kernel/cpu/bugs.c
>>>>>> +++ b/arch/x86/kernel/cpu/bugs.c
>>>>>> @@ -18,6 +18,8 @@
>>>>>>     #include <linux/pgtable.h>
>>>>>>     #include <linux/bpf.h>
>>>>>>     +#include <xen/xen.h>
>>>>>> +
>>>>>>     #include <asm/spec-ctrl.h>
>>>>>>     #include <asm/cmdline.h>
>>>>>>     #include <asm/bugs.h>
>>>>>> @@ -32,6 +34,7 @@
>>>>>>     #include <asm/intel-family.h>
>>>>>>     #include <asm/e820/api.h>
>>>>>>     #include <asm/hypervisor.h>
>>>>>> +#include <asm/xen/hypervisor.h>
>>>>>>     #include <asm/tlbflush.h>
>>>>>>       #include "cpu.h"
>>>>>> @@ -934,7 +937,8 @@ do_cmd_auto:
>>>>>>             break;
>>>>>>           case RETBLEED_MITIGATION_IBPB:
>>>>>> -        setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>>>> +        if (!xen_pv_domain() || xen_vm_assist_ibpb(true))
>>>>>
>>>>>
>>>>> Is this going to compile without CONFIG_XEN?
>>>
>>> Yes. The declaration of xen_vm_assist_ibpb() is visible (satisfying
>>> the compiler) and DCE will eliminate the call to the function due to
>>> xen_pv_domain() being constant "false" in that case, avoiding any
>>> linking issues. The interesting case here really is building with
>>> XEN but without XEN_PV: That's why I needed to put the function in
>>> enlighten.c. This wouldn't be needed if xen_pv_domain() was also
>>> constant "false" in that case (just like xen_pvh_domain() is when
>>> !XEN_PVH).
>>>
>>>>> I also think these two conditions should be wrapped into something
>>>>> to limit exposure of non-Xen code to Xen-specific primitives.
>>>
>>> I would have done so, if I had any halfway sensible idea on how to
>>> go about doing so in this particular case. In the absence of that it
>>> looked okay-ish to me to reference Xen functions directly here.
>>>
>>>> Oh, and this needs x86 maintainers.
>>>
>>> Eventually yes. But I would prefer to sort the above question first
>>> (which I'm sure would have been raised by them, in perhaps more
>>> harsh a way), hence the initially limited exposure.
>>
>> I'd rather add _one_ hook for Xen-PV in check_bugs() just before the call
>> of arch_smt_update(). This can then correct any needed mitigation
>> settings.
>>
>> So something like (note that due to using
>> cpu_feature_enabled(X86_FEATURE_XENPV)
>> DCE is happening in case CONFIG_XEN_PV isn't defined):
>>
>> --- a/arch/x86/include/asm/xen/hypervisor.h
>> +++ b/arch/x86/include/asm/xen/hypervisor.h
>> @@ -63,4 +63,7 @@ void __init xen_pvh_init(struct boot_params
>> *boot_params);
>>   void __init mem_map_via_hcall(struct boot_params *boot_params_p);
>>   #endif
>>
>> +int __init xen_vm_assist_ibpb(bool enable);
>> +void __init xen_pv_fix_mitigations(void);
> 
> I'd suggest 'adjust' in preference to 'fix', but this could equally be
> xen_pv_mitigations().
> 
> XenPV has very legitimate reasons to adjust things owing to it running
> in Ring3, but "fix" comes with other implications too.
> 
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
>>          return 0;
>>   }
>>
>> +int __init xen_vm_assist_ibpb(bool enable)
>> +{
>> +       /*
>> +        * Note that the VM-assist is a disable, so a request to
>> enable IBPB
>> +        * on our behalf needs to turn the functionality off (and vice
>> versa).
>> +        */
>> +       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
>> +                                          : VMASST_CMD_enable,
>> +                                   VMASST_TYPE_mode_switch_no_ibpb);
>> +}
>> +
>> +void __init xen_pv_fix_mitigations(void)
>> +{
>> +       if (!xen_vm_assist_ibpb(true))
>> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
> 
> If nothing else, this needs a comment explaining what's going on.
> 
> "Xen PV guest kernels run in ring3, so share the same prediction domain
> as userspace.  Xen (since version $X) default to issuing an IBPB on
> guest user -> guest kernel transitions on behalf of the guest kernel.
> If Linux isn't depending on mode based prediction separation, turn this
> behaviour off".
> 
> But this does open the next question.  Yes, unilaterally turning this
> off restores the prior behaviour, but is this really the best thing to
> do?

I believe this question is primarily for Jan, as he is the initial author of
the patch.

I was just suggesting a variant which is IMHO more likely to be accepted by
the x86 maintainers. :-)


Juergen



* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
From: Jan Beulich @ 2023-03-20 10:19 UTC (permalink / raw)
  To: Juergen Gross; +Cc: Andrew Cooper, xen-devel, Boris Ostrovsky

On 17.03.2023 14:56, Juergen Gross wrote:
> On 15.02.23 09:31, Jan Beulich wrote:
>> Eventually yes. But I would prefer to sort the above question first
>> (which I'm sure would have been raised by them, in perhaps more
>> harsh a way), hence the initially limited exposure.
> 
> I'd rather add _one_ hook for Xen-PV in check_bugs() just before the call
> of arch_smt_update(). This can then correct any needed mitigation settings.

Doing this in a single central place is what I was originally hoping I
could do. But that simply doesn't work (afaict): It is for a reason
that I apply the adjustment in the RETBLEED_MITIGATION_IBPB case, by
suppressing the setting of the feature bit. Not least because ...

> So something like (note that due to using cpu_feature_enabled(X86_FEATURE_XENPV)
> DCE is happening in case CONFIG_XEN_PV isn't defined)":
> 
> --- a/arch/x86/include/asm/xen/hypervisor.h
> +++ b/arch/x86/include/asm/xen/hypervisor.h
> @@ -63,4 +63,7 @@ void __init xen_pvh_init(struct boot_params *boot_params);
>   void __init mem_map_via_hcall(struct boot_params *boot_params_p);
>   #endif
> 
> +int __init xen_vm_assist_ibpb(bool enable);
> +void __init xen_pv_fix_mitigations(void);
> +
>   #endif /* _ASM_X86_XEN_HYPERVISOR_H */
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -18,6 +18,8 @@
>   #include <linux/pgtable.h>
>   #include <linux/bpf.h>
> 
> +#include <xen/xen.h>
> +
>   #include <asm/spec-ctrl.h>
>   #include <asm/cmdline.h>
>   #include <asm/bugs.h>
> @@ -177,6 +179,9 @@ void __init check_bugs(void)
>          srbds_select_mitigation();
>          l1d_flush_select_mitigation();
> 
> +       if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +               xen_pv_fix_mitigations();
> +
>          arch_smt_update();
> 
>   #ifdef CONFIG_X86_32
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
>          return 0;
>   }
> 
> +int __init xen_vm_assist_ibpb(bool enable)
> +{
> +       /*
> +        * Note that the VM-assist is a disable, so a request to enable IBPB
> +        * on our behalf needs to turn the functionality off (and vice versa).
> +        */
> +       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
> +                                          : VMASST_CMD_enable,
> +                                   VMASST_TYPE_mode_switch_no_ibpb);
> +}
> +
> +void __init xen_pv_fix_mitigations(void)
> +{
> +       if (!xen_vm_assist_ibpb(true))
> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);

... using both setup_clear_cpu_cap() (here) and setup_force_cpu_cap()
(in retbleed_select_mitigation()) won't work: The latter wins, due to
how apply_forced_caps() works.
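
For reference, the resolution in apply_forced_caps() amounts to roughly
the following (simplified from my reading of the kernel's
arch/x86/kernel/cpu/common.c; details may differ between versions):

    /* Cleared bits are applied first; forced bits are OR-ed back in
     * afterwards, so a forced capability survives any clearing. */
    c->x86_capability[i] &= ~cpu_caps_cleared[i];
    c->x86_capability[i] |= cpu_caps_set[i];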

But of course calling both functions for the same feature is bogus
anyway. In fact I think it is for a good reason that in Xen we log a
message in such an event.

A new helper could be introduced (and used in
retbleed_select_mitigation()) to check whether a feature was
previously cleared, but I concluded that it's likely for a good
reason that no such helper exists.

As to your use of cpu_feature_enabled(X86_FEATURE_XENPV) and DCE -
I can certainly switch to using that, which then ought to allow moving
xen_vm_assist_ibpb() back to enlighten_pv.c (as you have it, and as
I first had it until noticing the build breakage with PVH=y and
PV=n).

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
  2023-03-17 14:21           ` Andrew Cooper
  2023-03-17 14:28             ` Juergen Gross
@ 2023-03-20 10:21             ` Jan Beulich
  1 sibling, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-03-20 10:21 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Juergen Gross, Boris Ostrovsky

On 17.03.2023 15:21, Andrew Cooper wrote:
> On 17/03/2023 1:56 pm, Juergen Gross wrote:
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
>>         return 0;
>>  }
>>
>> +int __init xen_vm_assist_ibpb(bool enable)
>> +{
>> +       /*
>> +        * Note that the VM-assist is a disable, so a request to enable
>> +        * IBPB on our behalf needs to turn the functionality off (and
>> +        * vice versa).
>> +        */
>> +       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
>> +                                          : VMASST_CMD_enable,
>> +                                   VMASST_TYPE_mode_switch_no_ibpb);
>> +}
>> +
>> +void __init xen_pv_fix_mitigations(void)
>> +{
>> +       if (!xen_vm_assist_ibpb(true))
>> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
> 
> If nothing else, this needs a comment explaining what's going on.
> 
> "Xen PV guest kernels run in ring3, so share the same prediction domain
> as userspace.  Xen (since version $X) defaults to issuing an IBPB on
> guest user -> guest kernel transitions on behalf of the guest kernel.
> If Linux isn't depending on mode-based prediction separation, turn this
> behaviour off".

I would have thought the comment in the public header - saying exactly
that - is sufficient.

> But this does open the next question.  Yes, unilaterally turning
> this off restores the prior behaviour, but is this really the best thing
> to do?

Unless this is purely a question on Jürgen's suggested version (in which
case I'd let him answer) - what alternative do you suggest, within the
present policy used in the kernel?

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
  2023-03-20 10:19           ` Jan Beulich
@ 2023-03-20 13:02             ` Juergen Gross
  2023-03-20 13:17               ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Juergen Gross @ 2023-03-20 13:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel, Boris Ostrovsky


[-- Attachment #1.1.1: Type: text/plain, Size: 3408 bytes --]

On 20.03.23 11:19, Jan Beulich wrote:
> On 17.03.2023 14:56, Juergen Gross wrote:
>> On 15.02.23 09:31, Jan Beulich wrote:
>>> Eventually yes. But I would prefer to sort the above question first
>>> (which I'm sure would have been raised by them, in perhaps more
>>> harsh a way), hence the initially limited exposure.
>>
>> I'd rather add _one_ hook for Xen-PV in check_bugs() just before the call
>> of arch_smt_update(). This can then correct any needed mitigation settings.
> 
> Doing this in a single central place is what I was originally hoping I
> could do. But that simply doesn't work (afaict): It is for a reason
> that I apply the adjustment in the RETBLEED_MITIGATION_IBPB case, by
> suppressing the setting of the feature bit. Not the least because ...
> 
>> So something like (note that due to using cpu_feature_enabled(X86_FEATURE_XENPV)
>> DCE is happening in case CONFIG_XEN_PV isn't defined):
>>
>> --- a/arch/x86/include/asm/xen/hypervisor.h
>> +++ b/arch/x86/include/asm/xen/hypervisor.h
>> @@ -63,4 +63,7 @@ void __init xen_pvh_init(struct boot_params *boot_params);
>>    void __init mem_map_via_hcall(struct boot_params *boot_params_p);
>>    #endif
>>
>> +int __init xen_vm_assist_ibpb(bool enable);
>> +void __init xen_pv_fix_mitigations(void);
>> +
>>    #endif /* _ASM_X86_XEN_HYPERVISOR_H */
>> --- a/arch/x86/kernel/cpu/bugs.c
>> +++ b/arch/x86/kernel/cpu/bugs.c
>> @@ -18,6 +18,8 @@
>>    #include <linux/pgtable.h>
>>    #include <linux/bpf.h>
>>
>> +#include <xen/xen.h>
>> +
>>    #include <asm/spec-ctrl.h>
>>    #include <asm/cmdline.h>
>>    #include <asm/bugs.h>
>> @@ -177,6 +179,9 @@ void __init check_bugs(void)
>>           srbds_select_mitigation();
>>           l1d_flush_select_mitigation();
>>
>> +       if (cpu_feature_enabled(X86_FEATURE_XENPV))
>> +               xen_pv_fix_mitigations();
>> +
>>           arch_smt_update();
>>
>>    #ifdef CONFIG_X86_32
>> --- a/arch/x86/xen/enlighten_pv.c
>> +++ b/arch/x86/xen/enlighten_pv.c
>> @@ -1476,6 +1476,23 @@ static uint32_t __init xen_platform_pv(void)
>>           return 0;
>>    }
>>
>> +int __init xen_vm_assist_ibpb(bool enable)
>> +{
>> +       /*
>> +        * Note that the VM-assist is a disable, so a request to enable IBPB
>> +        * on our behalf needs to turn the functionality off (and vice versa).
>> +        */
>> +       return HYPERVISOR_vm_assist(enable ? VMASST_CMD_disable
>> +                                          : VMASST_CMD_enable,
>> +                                   VMASST_TYPE_mode_switch_no_ibpb);
>> +}
>> +
>> +void __init xen_pv_fix_mitigations(void)
>> +{
>> +       if (!xen_vm_assist_ibpb(true))
>> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
> 
> ... using both setup_clear_cpu_cap() (here) and setup_force_cpu_cap()
> (in retbleed_select_mitigation()) won't work: The latter wins, due to
> how apply_forced_caps() works.

Oh, right.

Just a wild guess of mine: probably the x86 maintainers would still prefer
a single Xen hook plus something like a setup_unforce_cpu_cap() addition.
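
Such a helper would presumably just be the inverse of
setup_force_cpu_cap(). A rough sketch (untested, and the exact
semantics would of course be up to the x86 maintainers):

    static inline void setup_unforce_cpu_cap(unsigned int bit)
    {
        /* Drop the bit both from the boot CPU and from the forced set. */
        clear_cpu_cap(&boot_cpu_data, bit);
        clear_bit(bit, (unsigned long *)cpu_caps_set);
    }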

> But of course calling both functions for the same feature is bogus
> anyway. In fact I think it is for a good reason that in Xen we log a
> message in such an event.

Depends. For Xen we do so in the kernel for multiple features, see
xen_init_capabilities().


Juergen


[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
  2023-03-20 13:02             ` Juergen Gross
@ 2023-03-20 13:17               ` Jan Beulich
  2023-03-20 13:35                 ` Juergen Gross
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-03-20 13:17 UTC (permalink / raw)
  To: Juergen Gross; +Cc: Andrew Cooper, xen-devel, Boris Ostrovsky

On 20.03.2023 14:02, Juergen Gross wrote:
> On 20.03.23 11:19, Jan Beulich wrote:
>> On 17.03.2023 14:56, Juergen Gross wrote:
>>> +void __init xen_pv_fix_mitigations(void)
>>> +{
>>> +       if (!xen_vm_assist_ibpb(true))
>>> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>
>> ... using both setup_clear_cpu_cap() (here) and setup_force_cpu_cap()
>> (in retbleed_select_mitigation()) won't work: The latter wins, due to
>> how apply_forced_caps() works.
> 
> Oh, right.
> 
> Just a wild guess of mine: probably the x86 maintainers would still prefer
> a single Xen hook plus something like a setup_unforce_cpu_cap() addition.

If so, I'm not willing to make such a patch. That's clearly more fragile
than the approach chosen. I guess once I've made the one adjustment you
have pointed out, I'll resubmit otherwise unchanged and include x86 folks.
We'll see what the responses are going to be, if any at all.

>> But of course calling both functions for the same feature is bogus
>> anyway. In fact I think it is for a good reason that in Xen we log a
>> message in such an event.
> 
> Depends. For Xen we do so in the kernel for multiple features, see
> xen_init_capabilities().

I don't see anything there which looks like it might be both "force"d
and "clear"ed in a single session.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] x86/Xen: make use of IBPB controlling VM assist
  2023-03-20 13:17               ` Jan Beulich
@ 2023-03-20 13:35                 ` Juergen Gross
  0 siblings, 0 replies; 41+ messages in thread
From: Juergen Gross @ 2023-03-20 13:35 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel, Boris Ostrovsky


[-- Attachment #1.1.1: Type: text/plain, Size: 1476 bytes --]

On 20.03.23 14:17, Jan Beulich wrote:
> On 20.03.2023 14:02, Juergen Gross wrote:
>> On 20.03.23 11:19, Jan Beulich wrote:
>>> On 17.03.2023 14:56, Juergen Gross wrote:
>>>> +void __init xen_pv_fix_mitigations(void)
>>>> +{
>>>> +       if (!xen_vm_assist_ibpb(true))
>>>> +               setup_clear_cpu_cap(X86_FEATURE_ENTRY_IBPB);
>>>
>>> ... using both setup_clear_cpu_cap() (here) and setup_force_cpu_cap()
>>> (in retbleed_select_mitigation()) won't work: The latter wins, due to
>>> how apply_forced_caps() works.
>>
>> Oh, right.
>>
>> Just a wild guess of mine: probably the x86 maintainers would still prefer
>> a single Xen hook plus something like a setup_unforce_cpu_cap() addition.
> 
> If so, I'm not willing to make such a patch. That's clearly more fragile
> than the approach chosen. I guess once I've made the one adjustment you
> have pointed out, I'll resubmit otherwise unchanged and include x86 folks.
> We'll see what the responses are going to be, if any at all.

Fine with me.

> 
>>> But of course calling both functions for the same feature is bogus
>>> anyway. In fact I think it is for a good reason that in Xen we log a
>>> message in such an event.
>>
>> Depends. For Xen we do so in the kernel for multiple features, see
>> xen_init_capabilities().
> 
> I don't see anything there which looks like it might be both "force"d
> and "clear"ed in a single session.

Oh, I misunderstood you then.


Juergen


[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 3149 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 495 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-02-14 16:10 ` [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
@ 2023-12-18 12:11   ` Roger Pau Monné
  2023-12-18 13:46     ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 12:11 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

Hello,

I'm not as expert as Andrew in all the speculation stuff, but I will
try to provide some feedback.

On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
> In order to be able to defer the context switch IBPB to the last
> possible point, add logic to the exit-to-guest paths to issue the
> barrier there, including the "IBPB doesn't flush the RSB/RAS"
> workaround. Since alternatives, for now at least, can't nest, emit JMP
> to skip past both constructs where both are needed. This may be more
> efficient anyway, as the sequence of NOPs is pretty long.

Could you elaborate on the reason why deferring the IBPB to the exit
to guest path is helpful?  So far it just seems to make the logic more
complex without any justification (at least in the changelog).

> 
> As with all other conditional blocks on exit-to-guest paths, no
> Spectre-v1 protections are necessary as execution will imminently be
> hitting a serialising event.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I have to admit that I'm not really certain about the placement of the
> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
> entry".

Maybe it would be easier to just add the MSR_PRED_CMD PRED_CMD_IBPB write
to the vmcs MSR load list?

It's a one-time operation AFAICT, as you would only want to do this for
context switches.
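
Something along these lines, maybe (a sketch based on my recollection
of the vmx_add_msr() interface, so the exact list type constant may
differ):

    /* Queue a one-shot IBPB to be performed by hardware on VM entry. */
    int rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB,
                         VMX_MSR_GUEST_LOADONLY);

The load list is processed by the CPU as part of VM entry itself, which
would keep the barrier out of the common exit-to-guest assembly.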

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-02-14 16:11 ` [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
@ 2023-12-18 12:39   ` Roger Pau Monné
  2023-12-18 13:58     ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 12:39 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Feb 14, 2023 at 05:11:05PM +0100, Jan Beulich wrote:
> In order to avoid clobbering Xen's own predictions, defer the barrier as
> much as possible. Merely mark the CPU as needing a barrier issued the
> next time we're exiting to guest context.

While I understand that doing the flush in the middle of the guest
context might not be ideal, as it's my understanding we also
needlessly flush Xen predictions, I'm unsure whether this makes any
difference in practice, and IMO just makes the exit to guest paths
more complex.
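
For context, my understanding is that the deferral in this series
boils down to roughly the following C-level sketch (the exit-path half
really lives in the assembly exit-to-guest code, so this is
illustrative only):

    /* context_switch(): don't issue the barrier now, just mark the CPU. */
    get_cpu_info()->spec_ctrl_flags |= SCF_new_pred_ctxt;

    /* Exit-to-guest path: issue the deferred barrier at the last moment. */
    if ( get_cpu_info()->spec_ctrl_flags & SCF_new_pred_ctxt )
    {
        get_cpu_info()->spec_ctrl_flags &= ~SCF_new_pred_ctxt;
        wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
    }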

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 12:11   ` Roger Pau Monné
@ 2023-12-18 13:46     ` Jan Beulich
  2023-12-18 13:50       ` Jan Beulich
                         ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 13:46 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 13:11, Roger Pau Monné wrote:
> Hello,
> 
> I'm not as expert as Andrew in all the speculation stuff, but I will
> try to provide some feedback.
> 
> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
>> In order to be able to defer the context switch IBPB to the last
>> possible point, add logic to the exit-to-guest paths to issue the
>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>> to skip past both constructs where both are needed. This may be more
>> efficient anyway, as the sequence of NOPs is pretty long.
> 
> Could you elaborate on the reason why deferring the IBPB to the exit
> to guest path is helpful?  So far it just seems to make the logic more
> complex without any justification (at least in the changelog).

I've added "(to leave behind as little as possible)" ahead of the 1st
comma - is that sufficient, do you think?

>> ---
>> I have to admit that I'm not really certain about the placement of the
>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>> entry".
> 
> Maybe it would be easier to just add the MSR_PRED_CMD PRED_CMD_IBPB write
> to the vmcs MSR load list?
> 
> It's a one-time operation AFAICT, as you would only want to do this for
> context switches.

That would be a back and forth of adding and removing the MSR to/from
that list then, which I'm not convinced is helpful. With these special
MSRs I would further be uncertain as to their effect when used via one
of these lists.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 13:46     ` Jan Beulich
@ 2023-12-18 13:50       ` Jan Beulich
  2023-12-18 15:43         ` Roger Pau Monné
  2023-12-18 13:54       ` Jan Beulich
  2023-12-18 15:40       ` Roger Pau Monné
  2 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 13:50 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 14:46, Jan Beulich wrote:
> On 18.12.2023 13:11, Roger Pau Monné wrote:
>> Hello,
>>
>> I'm not as expert as Andrew in all the speculation stuff, but I will
>> try to provide some feedback.
>>
>> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
>>> In order to be able to defer the context switch IBPB to the last
>>> possible point, add logic to the exit-to-guest paths to issue the
>>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>>> to skip past both constructs where both are needed. This may be more
>>> efficient anyway, as the sequence of NOPs is pretty long.
>>
>> Could you elaborate on the reason why deferring the IBPB to the exit
>> to guest path is helpful?  So far it just seems to make the logic more
>> complex without any justification (at least in the changelog).
> 
> I've added "(to leave behind as little as possible)" ahead of the 1st
> comma - is that sufficient, do you think?

Actually, the next patch supplies better context, i.e. is more / also
about avoiding clobbering Xen's own predictions.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 13:46     ` Jan Beulich
  2023-12-18 13:50       ` Jan Beulich
@ 2023-12-18 13:54       ` Jan Beulich
  2023-12-18 15:40       ` Roger Pau Monné
  2 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 13:54 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 14:46, Jan Beulich wrote:
> On 18.12.2023 13:11, Roger Pau Monné wrote:
>> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
>>> ---
>>> I have to admit that I'm not really certain about the placement of the
>>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>>> entry".
>>
>> Maybe it would be easier to just add the MSR_PRED_CMD PRED_CMD_IBPB write
>> to the vmcs MSR load list?
>>
>> It's a one-time operation AFAICT, as you would only want to do this for
>> context switches.
> 
> That would be a back and forth of adding and removing the MSR to/from
> that list then, which I'm not convinced is helpful. With these special
> MSRs I would further be uncertain as to their effect when used via one
> of these lists.

Plus (as only a secondary argument, but still) it would make PV and HVM
mechanisms entirely different.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-12-18 12:39   ` Roger Pau Monné
@ 2023-12-18 13:58     ` Jan Beulich
  2023-12-18 17:27       ` Roger Pau Monné
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 13:58 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 13:39, Roger Pau Monné wrote:
> On Tue, Feb 14, 2023 at 05:11:05PM +0100, Jan Beulich wrote:
>> In order to avoid clobbering Xen's own predictions, defer the barrier as
>> much as possible. Merely mark the CPU as needing a barrier issued the
>> next time we're exiting to guest context.
> 
> While I understand that doing the flush in the middle of the guest
> context might not be ideal, as it's my understanding we also

s/guest context/context switch/?

> needlessly flush Xen predictions, I'm unsure whether this makes any
> difference in practice, and IMO just makes the exit to guest paths
> more complex.

I need to redirect this question to Andrew, who suggested that doing so
can be expected to make a difference. When we were discussing this, I
could easily see it might make a difference, but I cannot provide hard
proof.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 3/4] x86: limit issuing of IBPB during context switch
  2023-02-14 16:11 ` [PATCH v4 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
@ 2023-12-18 15:19   ` Roger Pau Monné
  2023-12-18 16:09     ` Jan Beulich
  2023-12-18 16:11     ` Jan Beulich
  0 siblings, 2 replies; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 15:19 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Feb 14, 2023 at 05:11:40PM +0100, Jan Beulich wrote:
> When the outgoing vCPU had IBPB issued and RSB overwritten upon entering
> Xen, then there's no need for a 2nd barrier during context switch.
> 
> Note that SCF_entry_ibpb is always clear for the idle domain, so no
> explicit idle domain check is needed to augment the feature check
> (which is simply inapplicable to "idle").
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Roger Pau Monné <roger.pau@citrix.com>

> ---
> v4: Tighten the condition.
> v3: Fold into series.
> ---
> I think in principle we could limit the impact from finding the idle
> domain as "prevd", by having __context_switch() tell us what kind
> domain's vCPU was switched out (it could still be "idle", but in fewer
> cases).
> 
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -2005,17 +2005,26 @@ void context_switch(struct vcpu *prev, s
>      }
>      else
>      {
> +        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
> +
>          __context_switch();
>  
>          /* Re-enable interrupts before restoring state which may fault. */
>          local_irq_enable();
>  
>          if ( is_pv_domain(nextd) )
> +        {
>              load_segments(next);
>  
> +            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
> +        }
> +
>          ctxt_switch_levelling(next);
>  
> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
> +             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
> +              /* is_idle_domain(prevd) || */

I would rather add a comment to note that the idle domain always has
SCF_entry_ibpb clear, rather than leaving this commented check in the
condition.

> +              !boot_cpu_has(feat_sc_rsb)) )

I do wonder if it would be more fail-safe (and easier to expand going
forward) if we introduced a new cpu_info field to track the CPU state:
relevant here would be whether the RSB has been overwritten and an IBPB
executed.  Such state would be cleared on each return-to-guest path.
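
Hypothetically (field and bit names invented here purely for
illustration), that could look like:

    /* New per-CPU field in struct cpu_info, set by the entry paths. */
    uint8_t entry_state;
    #define ENTRY_STATE_IBPB (1 << 0)    /* IBPB was issued on entry */
    #define ENTRY_STATE_RSB  (1 << 1)    /* RSB was overwritten on entry */

    /* context_switch() could then test the recorded state directly: */
    if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
         !(get_cpu_info()->entry_state & ENTRY_STATE_IBPB) )
        wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);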

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 13:46     ` Jan Beulich
  2023-12-18 13:50       ` Jan Beulich
  2023-12-18 13:54       ` Jan Beulich
@ 2023-12-18 15:40       ` Roger Pau Monné
  2023-12-18 16:00         ` Jan Beulich
  2 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 15:40 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Mon, Dec 18, 2023 at 02:46:37PM +0100, Jan Beulich wrote:
> On 18.12.2023 13:11, Roger Pau Monné wrote:
> > Hello,
> > 
> > I'm not as expert as Andrew in all the speculation stuff, but I will
> > try to provide some feedback.
> > 
> > On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
> >> In order to be able to defer the context switch IBPB to the last
> >> possible point, add logic to the exit-to-guest paths to issue the
> >> barrier there, including the "IBPB doesn't flush the RSB/RAS"
> >> workaround. Since alternatives, for now at least, can't nest, emit JMP
> >> to skip past both constructs where both are needed. This may be more
> >> efficient anyway, as the sequence of NOPs is pretty long.
> > 
> > Could you elaborate on the reason why deferring the IBPB to the exit
> > to guest path is helpful?  So far it just seems to make the logic more
> > complex without any justification (at least in the changelog).
> 
> I've added "(to leave behind as little as possible)" ahead of the 1st
> comma - is that sufficient, do you think?

Please bear with me, but I'm still uncertain.

Even if IBRS is not enabled, and such indirect branch predictions are
at the same predictor mode, how would that be of any use to a guest?
My understanding was that the attacker is the one that has to control
the indirect branch predictor contents in order to attack the
hypervisor or other guests.

> >> ---
> >> I have to admit that I'm not really certain about the placement of the
> >> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
> >> entry".
> > 
> > Maybe it would be easier to just add the MSR_PRED_CMD PRED_CMD_IBPB write
> > to the vmcs MSR load list?
> > 
> > It's a one-time operation AFAICT, as you would only want to do this for
> > context switches.
> 
> That would be a back and forth of adding and removing the MSR to/from
> that list then, which I'm not convinced is helpful. With these special
> MSRs I would further be uncertain as to their effect when used via one
> of these lists.

Hm, we do seem to already use MSR_PRED_CMD with such lists, so I would
assume they work just fine.

Anyway, it was mostly a recommendation towards clarity, because I think
the return-to-guest-context assembly is getting rather convoluted, and
it's IMO critical for it to be easy to follow.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 13:50       ` Jan Beulich
@ 2023-12-18 15:43         ` Roger Pau Monné
  2023-12-18 16:02           ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 15:43 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Mon, Dec 18, 2023 at 02:50:27PM +0100, Jan Beulich wrote:
> On 18.12.2023 14:46, Jan Beulich wrote:
> > On 18.12.2023 13:11, Roger Pau Monné wrote:
> >> Hello,
> >>
> >> I'm not as expert as Andrew in all the speculation stuff, but I will
> >> try to provide some feedback.
> >>
> >> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
> >>> In order to be able to defer the context switch IBPB to the last
> >>> possible point, add logic to the exit-to-guest paths to issue the
> >>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
> >>> workaround. Since alternatives, for now at least, can't nest, emit JMP
> >>> to skip past both constructs where both are needed. This may be more
> >>> efficient anyway, as the sequence of NOPs is pretty long.
> >>
> >> Could you elaborate on the reason why deferring the IBPB to the exit
> >> to guest path is helpful?  So far it just seems to make the logic more
> >> complex without any justification (at least in the changelog).
> > 
> > I've added "(to leave behind as little as possible)" ahead of the 1st
> > comma - is that sufficient, do you think?
> 
> Actually, the next patch supplies better context, i.e. is more / also
> about avoiding clobbering Xen's own predictions.

Right, that I got from the next patch, but given that a context switch
is already quite a heavy operation, does avoiding the cleaning of the
branch predictor make that much of a difference?

IMO it needs good justification given it's a change that makes the
logic harder to follow, so if it turns out there's no difference I
would rather leave the IBPB at context switch.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 15:40       ` Roger Pau Monné
@ 2023-12-18 16:00         ` Jan Beulich
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 16:00 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 16:40, Roger Pau Monné wrote:
> On Mon, Dec 18, 2023 at 02:46:37PM +0100, Jan Beulich wrote:
>> On 18.12.2023 13:11, Roger Pau Monné wrote:
>>> Hello,
>>>
>>> I'm not as expert as Andrew in all the speculation stuff, but I will
>>> try to provide some feedback.
>>>
>>> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
>>>> In order to be able to defer the context switch IBPB to the last
>>>> possible point, add logic to the exit-to-guest paths to issue the
>>>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>>>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>>>> to skip past both constructs where both are needed. This may be more
>>>> efficient anyway, as the sequence of NOPs is pretty long.
>>>
>>> Could you elaborate on the reason why deferring the IBPB to the exit
>>> to guest path is helpful?  So far it just seems to make the logic more
>>> complex without any justification (at least in the changelog).
>>
>> I've added "(to leave behind as little as possible)" ahead of the 1st
>> comma - is that sufficient, do you think?
> 
> Please bear with me, but I'm still uncertain.
> 
> Even if IBRS is not enabled, and such indirect branch predictions are
> at the same predictor mode, how would that be of any use to a guest?
> My understanding was that the attacker is the one that has to control
> the indirect branch predictor contents in order to attack the
> hypervisor or other guests.

Right; see my later reply.

>>>> ---
>>>> I have to admit that I'm not really certain about the placement of the
>>>> IBPB wrt the MSR_SPEC_CTRL writes. For now I've simply used "opposite of
>>>> entry".
>>>
>>> Maybe it would be easier to just add the MSR_PRED_CMD PRED_CMD_IBPB write
>>> to the vmcs MSR load list?
>>>
>>> It's a one-time operation AFAICT, as you would only want to do this for
>>> context switches.
>>
>> That would be a back and forth of adding and removing the MSR to/from
>> that list then, which I'm not convinced is helpful. With these special
>> MSRs I would further be uncertain as to their effect when used via one
>> of these lists.
> 
> Hm, we do seem to already use MSR_PRED_CMD with such lists, so I would
> assume they work just fine.

Ah, yes, I forgot about that. Still it would be a back and forth if we
wanted the MSR on the list only after a context switch, but no longer
for later VM entries. Also iirc these lists are VMX-only?

Jan

> Anyway, it was mostly a recommendation towards clarity, because I think
> the return-to-guest-context assembly is getting rather convoluted, and
> it's IMO critical for it to be easy to follow.
> 
> Thanks, Roger.



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest
  2023-12-18 15:43         ` Roger Pau Monné
@ 2023-12-18 16:02           ` Jan Beulich
  0 siblings, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 16:02 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 16:43, Roger Pau Monné wrote:
> On Mon, Dec 18, 2023 at 02:50:27PM +0100, Jan Beulich wrote:
>> On 18.12.2023 14:46, Jan Beulich wrote:
>>> On 18.12.2023 13:11, Roger Pau Monné wrote:
>>>> Hello,
>>>>
>>>> I'm not as expert as Andrew in all the speculation stuff, but I will
>>>> try to provide some feedback.
>>>>
>>>> On Tue, Feb 14, 2023 at 05:10:42PM +0100, Jan Beulich wrote:
>>>>> In order to be able to defer the context switch IBPB to the last
>>>>> possible point, add logic to the exit-to-guest paths to issue the
>>>>> barrier there, including the "IBPB doesn't flush the RSB/RAS"
>>>>> workaround. Since alternatives, for now at least, can't nest, emit JMP
>>>>> to skip past both constructs where both are needed. This may be more
>>>>> efficient anyway, as the sequence of NOPs is pretty long.
>>>>
>>>> Could you elaborate on the reason why deferring the IBPB to the exit
>>>> to guest path is helpful?  So far it just seems to make the logic more
>>>> complex without any justification (at least in the changelog).
>>>
>>> I've added "(to leave behind as little as possible)" ahead of the 1st
>>> comma - is that sufficient, do you think?
>>
>> Actually, the next patch supplies better context, i.e. is more / also
>> about avoiding clobbering Xen's own predictions.
> 
> Right, that I got from the next patch, but given that a context switch
> is already quite a heavy operation, does avoiding the cleaning of the
> branch predictor make that much of a difference?
> 
> IMO it needs good justification given it's a change that makes the
> logic harder to follow, so if it turns out there's no difference I
> would rather leave the IBPB at context switch.

As per another reply, I guess we want to discuss this with Andrew, so
it's maybe a good thing to try to remember to bring up at the next x86
meeting we're going to have.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 3/4] x86: limit issuing of IBPB during context switch
  2023-12-18 15:19   ` Roger Pau Monné
@ 2023-12-18 16:09     ` Jan Beulich
  2023-12-18 16:11     ` Jan Beulich
  1 sibling, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 16:09 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 16:19, Roger Pau Monné wrote:
> On Tue, Feb 14, 2023 at 05:11:40PM +0100, Jan Beulich wrote:
>> When the outgoing vCPU had IBPB issued and RSB overwritten upon entering
>> Xen, then there's no need for a 2nd barrier during context switch.
>>
>> Note that SCF_entry_ibpb is always clear for the idle domain, so no
>> explicit idle domain check is needed to augment the feature check
>> (which is simply inapplicable to "idle").
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks. However, aiui the plan still is for Andrew to pick up this series
and integrate it with other work he has in progress (or is planning to
do).

>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -2005,17 +2005,26 @@ void context_switch(struct vcpu *prev, s
>>      }
>>      else
>>      {
>> +        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
>> +
>>          __context_switch();
>>  
>>          /* Re-enable interrupts before restoring state which may fault. */
>>          local_irq_enable();
>>  
>>          if ( is_pv_domain(nextd) )
>> +        {
>>              load_segments(next);
>>  
>> +            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
>> +        }
>> +
>>          ctxt_switch_levelling(next);
>>  
>> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
>> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
>> +             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
>> +              /* is_idle_domain(prevd) || */
> 
> I would rather add a comment to note that the idle domain always has
> SCF_entry_ibpb clear, rather than leaving this commented check in the
> condition.

While I think I can see your point, I like it this way to match the
other !is_idle_domain() that's here.

>> +              !boot_cpu_has(feat_sc_rsb)) )
> 
> I do wonder if it would be more fail-safe (and easier to expand going
> forward) if we introduced a new cpu_info field to track the CPU state:
> relevant here would be whether the RSB has been overwritten and an IBPB
> executed.  Such state would be cleared on each return-to-guest path.

To be honest - I'm not sure whether that would help or make things more
fragile. More state also means more things which can become incorrect /
inconsistent.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 3/4] x86: limit issuing of IBPB during context switch
  2023-12-18 15:19   ` Roger Pau Monné
  2023-12-18 16:09     ` Jan Beulich
@ 2023-12-18 16:11     ` Jan Beulich
  1 sibling, 0 replies; 41+ messages in thread
From: Jan Beulich @ 2023-12-18 16:11 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 16:19, Roger Pau Monné wrote:
> On Tue, Feb 14, 2023 at 05:11:40PM +0100, Jan Beulich wrote:
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -2005,17 +2005,26 @@ void context_switch(struct vcpu *prev, s
>>      }
>>      else
>>      {
>> +        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
>> +
>>          __context_switch();
>>  
>>          /* Re-enable interrupts before restoring state which may fault. */
>>          local_irq_enable();
>>  
>>          if ( is_pv_domain(nextd) )
>> +        {
>>              load_segments(next);
>>  
>> +            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
>> +        }
>> +
>>          ctxt_switch_levelling(next);
>>  
>> -        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
>> +        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
>> +             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
>> +              /* is_idle_domain(prevd) || */
> 
> I would rather add a comment to note that the idle domain always has
> SCF_entry_ibpb clear, rather than leaving this commented check in the
> condition.
> 
>> +              !boot_cpu_has(feat_sc_rsb)) )

Oh, for completeness: For v5 I have this

@@ -2092,17 +2092,26 @@ void context_switch(struct vcpu *prev, s
     }
     else
     {
+        unsigned int feat_sc_rsb = X86_FEATURE_SC_RSB_HVM;
+
         __context_switch();
 
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
+        {
             load_segments(next);
 
+            feat_sc_rsb = X86_FEATURE_SC_RSB_PV;
+        }
+
         ctxt_switch_levelling(next);
 
-        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
+        if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) &&
+             (!(prevd->arch.spec_ctrl_flags & SCF_entry_ibpb) ||
+              /* is_idle_domain(prevd) || */
+              (!cpu_has_auto_ibrs && !boot_cpu_has(feat_sc_rsb))) )
         {
             static DEFINE_PER_CPU(unsigned int, last);
             unsigned int *last_id = &this_cpu(last);

i.e. with the cpu_has_auto_ibrs check added.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-02-14 16:12 ` [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode Jan Beulich
@ 2023-12-18 17:24   ` Roger Pau Monné
  2023-12-19  9:56     ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 17:24 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
> Since both kernel and user mode run in ring 3, they run in the same
> "predictor mode".

That's only true when IBRS is enabled; otherwise all CPU modes share the
same predictor mode?

> While the kernel could take care of this itself, doing
> so would be yet another item distinguishing PV from native. Additionally
> we're in a much better position to issue the barrier command, and we can
> save a #GP (for privileged instruction emulation) this way.
> 
> To allow to recover performance, introduce a new VM assist allowing the
> guest kernel to suppress this barrier. Make availability of the assist
> dependent upon the command line control, such that kernels have a way to
> know whether their request actually took any effect.
> 
> Note that because of its use in PV64_VM_ASSIST_MASK, the declaration of
> opt_ibpb_mode_switch can't live in asm/spec_ctrl.h.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Is the placement of the clearing of opt_ibpb_ctxt_switch correct in
> parse_spec_ctrl()? Shouldn't it live ahead of the "disable_common"
> label, as being about guest protection, not Xen's?
> 
> Adding setting of the variable to the "pv" sub-case in parse_spec_ctrl()
> didn't seem quite right to me, considering that we default it to the
> opposite of opt_ibpb_entry_pv.
> ---
> v4: Correct the print_details() change. Re-base in particular over
>     changes earlier in the series.
> v3: Leverage exit-IBPB. Introduce separate command line control.
> v2: Leverage entry-IBPB. Add VM assist. Re-base.
> 
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -2320,8 +2320,8 @@ By default SSBD will be mitigated at run
>  ### spec-ctrl (x86)
>  > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
>  >              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
> ->              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
> ->              eager-fpu,l1d-flush,branch-harden,srb-lock,
> +>              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ibpb-mode-switch,
> +>              ssbd,psfd,eager-fpu,l1d-flush,branch-harden,srb-lock,
>  >              unpriv-mmio}=<bool> ]`
>  
>  Controls for speculative execution sidechannel mitigations.  By default, Xen
> @@ -2403,7 +2403,10 @@ default.
>  
>  On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
>  option can be used to force (the default) or prevent Xen from issuing branch
> -prediction barriers on vcpu context switches.
> +prediction barriers on vcpu context switches.  On such hardware the
> +`ibpb-mode-switch` option can be used to control whether, by default, Xen
> +would issue branch prediction barriers when 64-bit PV guests switch from
> +user to kernel mode.  If enabled, guest kernels can opt out of this behavior.
>  
>  On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
>  from using fully eager FPU context switches.  This is currently implemented as
> --- a/xen/arch/x86/include/asm/domain.h
> +++ b/xen/arch/x86/include/asm/domain.h
> @@ -742,6 +742,8 @@ static inline void pv_inject_sw_interrup
>      pv_inject_event(&event);
>  }
>  
> +extern int8_t opt_ibpb_mode_switch;
> +
>  #define PV32_VM_ASSIST_MASK ((1UL << VMASST_TYPE_4gb_segments)        | \
>                               (1UL << VMASST_TYPE_4gb_segments_notify) | \
>                               (1UL << VMASST_TYPE_writable_pagetables) | \
> @@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
>   * but we can't make such requests fail all of the sudden.
>   */
>  #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
> -                             (1UL << VMASST_TYPE_m2p_strict))
> +                             (1UL << VMASST_TYPE_m2p_strict)          | \
> +                             ((opt_ibpb_mode_switch + 0UL) <<           \
> +                              VMASST_TYPE_mode_switch_no_ibpb))

I'm wondering whether it's kind of weird to offer the option to PV domUs
if opt_ibpb_entry_pv is set, as then the guest mode switch will always
(implicitly) do an IBPB, as it requires a hypercall and thus takes an
entry point into Xen.

I guess it's worth having it just as a way to signal to Xen that the
hypervisor does perform an IBPB, even if the guest cannot disable it.

>  #define HVM_VM_ASSIST_MASK  (1UL << VMASST_TYPE_runstate_update_flag)
>  
>  #define arch_vm_assist_valid_mask(d) \
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
>  void toggle_guest_mode(struct vcpu *v)
>  {
>      const struct domain *d = v->domain;
> +    struct cpu_info *cpu_info = get_cpu_info();
>      unsigned long gs_base;
>  
>      ASSERT(!is_pv_32bit_vcpu(v));
> @@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
>      if ( v->arch.flags & TF_kernel_mode )
>          v->arch.pv.gs_base_kernel = gs_base;
>      else
> +    {
>          v->arch.pv.gs_base_user = gs_base;
> +
> +        if ( opt_ibpb_mode_switch &&
> +             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
> +             !VM_ASSIST(d, mode_switch_no_ibpb) )
> +            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;

Likewise, similar to the remarks I've made before, if doing an IBPB on
entry is enough to cover for the case here, it must also be fine to
issue the IBPB right here, instead of deferring it to the return to
guest context?

The only concern would be (as you mentioned before) to avoid clearing
valid Xen predictions, but I would rather see some figures about what
effect deferring to the return to guest has vs issuing it right here.

> +    }
> +
>      asm volatile ( "swapgs" );
>  
>      _toggle_guest_pt(v);
>  
>      if ( d->arch.pv.xpti )
>      {
> -        struct cpu_info *cpu_info = get_cpu_info();
> -
>          cpu_info->root_pgt_changed = true;
>          cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) |
>                             (d->arch.pv.pcid ? get_pcid_bits(v, true) : 0);
> --- a/xen/arch/x86/spec_ctrl.c
> +++ b/xen/arch/x86/spec_ctrl.c
> @@ -60,6 +60,7 @@ bool __ro_after_init opt_ssbd;
>  int8_t __initdata opt_psfd = -1;
>  
>  int8_t __ro_after_init opt_ibpb_ctxt_switch = -1;
> +int8_t __ro_after_init opt_ibpb_mode_switch = -1;
>  int8_t __read_mostly opt_eager_fpu = -1;
>  int8_t __read_mostly opt_l1d_flush = -1;
>  static bool __initdata opt_branch_harden = true;
> @@ -111,6 +112,8 @@ static int __init cf_check parse_spec_ct
>              if ( opt_pv_l1tf_domu < 0 )
>                  opt_pv_l1tf_domu = 0;
>  
> +            opt_ibpb_mode_switch = 0;
> +
>              if ( opt_tsx == -1 )
>                  opt_tsx = -3;
>  
> @@ -271,6 +274,8 @@ static int __init cf_check parse_spec_ct
>          /* Misc settings. */
>          else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
>              opt_ibpb_ctxt_switch = val;
> +        else if ( (val = parse_boolean("ibpb-mode-switch", s, ss)) >= 0 )
> +            opt_ibpb_mode_switch = val;
>          else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
>              opt_eager_fpu = val;
>          else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
> @@ -527,16 +532,18 @@ static void __init print_details(enum in
>  
>  #endif
>  #ifdef CONFIG_PV
> -    printk("  Support for PV VMs:%s%s%s%s%s%s\n",
> +    printk("  Support for PV VMs:%s%s%s%s%s%s%s\n",
>             (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
>              boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
>              boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
> -            opt_eager_fpu || opt_md_clear_pv)        ? ""               : " None",
> +            opt_eager_fpu || opt_md_clear_pv ||
> +            opt_ibpb_mode_switch)                    ? ""               : " None",
>             boot_cpu_has(X86_FEATURE_SC_MSR_PV)       ? " MSR_SPEC_CTRL" : "",
>             boot_cpu_has(X86_FEATURE_SC_RSB_PV)       ? " RSB"           : "",
>             opt_eager_fpu                             ? " EAGER_FPU"     : "",
>             opt_md_clear_pv                           ? " MD_CLEAR"      : "",
> -           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "");
> +           boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "",
> +           opt_ibpb_mode_switch                      ? " IBPB-mode-switch" : "");
>  
>      printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
>             opt_xpti_hwdom ? "enabled" : "disabled",
> @@ -804,7 +811,8 @@ static void __init ibpb_calculations(voi
>      /* Check we have hardware IBPB support before using it... */
>      if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
>      {
> -        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0;
> +        opt_ibpb_entry_hvm = opt_ibpb_entry_pv = 0;
> +        opt_ibpb_mode_switch = opt_ibpb_ctxt_switch = 0;
>          opt_ibpb_entry_dom0 = false;
>          return;
>      }
> @@ -859,6 +867,18 @@ static void __init ibpb_calculations(voi
>          setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_PV);
>          setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_HVM);
>      }
> +
> +#ifdef CONFIG_PV
> +    /*
> +     * If we're using IBPB-on-entry to protect against PV guests, then
> +     * there's no need to also issue IBPB on a guest user->kernel switch.
> +     */
> +    if ( opt_ibpb_mode_switch == -1 )
> +        opt_ibpb_mode_switch = !opt_ibpb_entry_pv ||
> +                               (!opt_ibpb_entry_dom0 && !opt_dom0_pvh);
> +    if ( opt_ibpb_mode_switch )
> +        setup_force_cpu_cap(X86_FEATURE_NEW_PRED_CTXT_PV);
> +#endif
>  }
>  
>  /* Calculate whether this CPU is vulnerable to L1TF. */
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,6 +554,16 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
>   */
>  #define VMASST_TYPE_m2p_strict           32
>  
> +/*
> + * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.

I think this needs to be more vague, as it's not true that the IBPB
will be suppressed if Xen is unconditionally issuing one on all guest
entry points.

Maybe adding:

"Setting the assist signals Xen that the IBPB can be avoided from a
guest perspective; however, Xen might still issue one for other
reasons."

> + *
> + * By default (on affected and capable hardware) as a safety measure Xen,
> + * to cover for the fact that guest-kernel and guest-user modes are both
> + * running in ring 3 (and hence share prediction context), would issue a
> + * barrier for user->kernel mode switches of PV guests.
> + */
> +#define VMASST_TYPE_mode_switch_no_ibpb  33

Would it be possible to define the assist as
VMASST_TYPE_mode_switch_ibpb and have it on when enabled?  So that the
guest would disable it if unneeded?  IMO negated options are in
general harder to understand.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry
  2023-12-18 13:58     ` Jan Beulich
@ 2023-12-18 17:27       ` Roger Pau Monné
  0 siblings, 0 replies; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-18 17:27 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper; +Cc: xen-devel, Wei Liu

On Mon, Dec 18, 2023 at 02:58:01PM +0100, Jan Beulich wrote:
> On 18.12.2023 13:39, Roger Pau Monné wrote:
> > On Tue, Feb 14, 2023 at 05:11:05PM +0100, Jan Beulich wrote:
> >> In order to avoid clobbering Xen's own predictions, defer the barrier as
> >> much as possible. Merely mark the CPU as needing a barrier issued the
> >> next time we're exiting to guest context.
> > 
> > While I understand that doing the flush in the middle of the guest
> > context might not be ideal, as it's my understanding we also
> 
> s/guest context/context switch/?

Indeed, sorry.

> > needlessly flush Xen predictions, I'm unsure whether this makes any
> > difference in practice, and IMO just makes the exit to guest paths
> > more complex.
> 
> I need to redirect this question to Andrew, who suggested that doing so
> can be expected to make a difference. When we were discussing this, I
> could easily see it might make a difference, but I cannot provide hard
> proof.

That's fine, but with the added complexity in the return-to-guest
paths I think we need some kind of justification for such a change.
If it was the other way around I could easily see it as a benefit, if
just for code clarity reasons.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-18 17:24   ` Roger Pau Monné
@ 2023-12-19  9:56     ` Jan Beulich
  2023-12-19 11:48       ` Roger Pau Monné
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-12-19  9:56 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 18.12.2023 18:24, Roger Pau Monné wrote:
> On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
>> Since both kernel and user mode run in ring 3, they run in the same
>> "predictor mode".
> 
> That's only true when IBRS is enabled; otherwise all CPU modes share the
> same predictor mode?

But here we only care about ring 3 anyway?

>> @@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
>>   * but we can't make such requests fail all of the sudden.
>>   */
>>  #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
>> -                             (1UL << VMASST_TYPE_m2p_strict))
>> +                             (1UL << VMASST_TYPE_m2p_strict)          | \
>> +                             ((opt_ibpb_mode_switch + 0UL) <<           \
>> +                              VMASST_TYPE_mode_switch_no_ibpb))
> 
> I'm wondering whether it's kind of weird to offer the option to PV domUs
> if opt_ibpb_entry_pv is set, as then the guest mode switch will always
> (implicitly) do an IBPB, as it requires a hypercall and thus takes an
> entry point into Xen.
> 
> I guess it's worth having it just as a way to signal to Xen that the
> hypervisor does perform an IBPB, even if the guest cannot disable it.

I'm afraid I'm confused by your reply, not least because the
latter sentence looks partly backwards / non-logical to me.

>> --- a/xen/arch/x86/pv/domain.c
>> +++ b/xen/arch/x86/pv/domain.c
>> @@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
>>  void toggle_guest_mode(struct vcpu *v)
>>  {
>>      const struct domain *d = v->domain;
>> +    struct cpu_info *cpu_info = get_cpu_info();
>>      unsigned long gs_base;
>>  
>>      ASSERT(!is_pv_32bit_vcpu(v));
>> @@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
>>      if ( v->arch.flags & TF_kernel_mode )
>>          v->arch.pv.gs_base_kernel = gs_base;
>>      else
>> +    {
>>          v->arch.pv.gs_base_user = gs_base;
>> +
>> +        if ( opt_ibpb_mode_switch &&
>> +             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
>> +             !VM_ASSIST(d, mode_switch_no_ibpb) )
>> +            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;
> 
> Likewise, similar to the remarks I've made before, if doing an IBPB on
> entry is enough to cover for the case here, it must also be fine to
> issue the IBPB right here, instead of deferring it to the return to
> guest context?
> 
> The only concern would be (as you mentioned before) to avoid clearing
> valid Xen predictions, but I would rather see some figures about what
> effect deferring to the return to guest has vs issuing it right here.

Part of the reason (aiui) to do things on the exit path was to
consolidate the context-switch-induced barrier and the user->kernel
switch one into the same place and mechanism.

>> --- a/xen/include/public/xen.h
>> +++ b/xen/include/public/xen.h
>> @@ -554,6 +554,16 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
>>   */
>>  #define VMASST_TYPE_m2p_strict           32
>>  
>> +/*
>> + * x86-64 guests: Suppress IBPB on guest-user to guest-kernel mode switch.
> 
> I think this needs to be more vague, as it's not true that the IBPB
> will be suppressed if Xen is unconditionally issuing one on all guest
> entry points.
> 
> Maybe adding:
> 
> "Setting the assist signals Xen that the IBPB can be avoided from a
> guest perspective; however, Xen might still issue one for other
> reasons."

I've done s/Suppress/Permit skipping/. I wouldn't want to go further,
as that then becomes related to implementation details imo. IOW of
course Xen may issue IBPB whenever it thinks there's a possible need.

>> + *
>> + * By default (on affected and capable hardware) as a safety measure Xen,
>> + * to cover for the fact that guest-kernel and guest-user modes are both
>> + * running in ring 3 (and hence share prediction context), would issue a
>> + * barrier for user->kernel mode switches of PV guests.
>> + */
>> +#define VMASST_TYPE_mode_switch_no_ibpb  33
> 
> Would it be possible to define the assist as
> VMASST_TYPE_mode_switch_ibpb and have it default to on when the
> feature is enabled, so that the guest could disable it if unneeded?
> IMO negated options are in general harder to understand.

Negative options aren't nice, yes, but VM assists start out as all
clear. The guest needs to change a "false" to a "true", and thus it
cannot be a positive option here, as we want the default (off) to be
safe/secure.
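
For illustration, a guest kernel confident in its own IBPB handling
would flip the bit roughly like this (a minimal sketch against the
interface proposed in this series; feature detection and error
handling mostly elided):

    /*
     * Guest side: opt out of Xen's mode-switch IBPB.  Assumes the
     * VMASST_TYPE_mode_switch_no_ibpb definition proposed here;
     * HYPERVISOR_vm_assist() and VMASST_CMD_enable are the
     * pre-existing VM assist hypercall interface.
     */
    if ( HYPERVISOR_vm_assist(VMASST_CMD_enable,
                              VMASST_TYPE_mode_switch_no_ibpb) )
        printk("mode_switch_no_ibpb assist not available\n");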

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-19  9:56     ` Jan Beulich
@ 2023-12-19 11:48       ` Roger Pau Monné
  2023-12-19 14:06         ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-19 11:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Dec 19, 2023 at 10:56:16AM +0100, Jan Beulich wrote:
> On 18.12.2023 18:24, Roger Pau Monné wrote:
> > On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
> >> Since both kernel and user mode run in ring 3, they run in the same
> >> "predictor mode".
> > 
> > That's only true when IBRS is enabled; otherwise all CPU modes share
> > the same predictor mode?
> 
> But here we only care about ring 3 anyway?

Hm, yes, what I wanted to say is that this is not exclusive to 64-bit
PV, as with IBRS disabled ring 3 and ring 0 share the same predictor
mode.  Anyway, not relevant.

> 
> >> @@ -753,7 +755,9 @@ static inline void pv_inject_sw_interrup
> >>   * but we can't make such requests fail all of the sudden.
> >>   */
> >>  #define PV64_VM_ASSIST_MASK (PV32_VM_ASSIST_MASK                      | \
> >> -                             (1UL << VMASST_TYPE_m2p_strict))
> >> +                             (1UL << VMASST_TYPE_m2p_strict)          | \
> >> +                             ((opt_ibpb_mode_switch + 0UL) <<           \
> >> +                              VMASST_TYPE_mode_switch_no_ibpb))
> > 
> > I'm wondering: isn't it kind of weird to offer the option to PV domUs
> > if opt_ibpb_entry_pv is set?  The guest mode switch will then always
> > (implicitly) do an IBPB, since it requires a hypercall and thus takes
> > an entry point into Xen.
> > 
> > I guess it's worth having it just as a way to signal to Xen that the
> > hypervisor does perform an IBPB, even if the guest cannot disable it.
> 
> I'm afraid I'm confused by your reply, not least because the latter
> sentence looks partly backwards / non-logical to me.

Sorry, I think I didn't word that very well.  The remark is tied to
the one below about the vmassist 'possibly' allowing the guest to
disable IBPB on guest user -> kernel context switches, but Xen might
unconditionally do additional IBPBs that the guest cannot disable.

> >> --- a/xen/arch/x86/pv/domain.c
> >> +++ b/xen/arch/x86/pv/domain.c
> >> @@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
> >>  void toggle_guest_mode(struct vcpu *v)
> >>  {
> >>      const struct domain *d = v->domain;
> >> +    struct cpu_info *cpu_info = get_cpu_info();
> >>      unsigned long gs_base;
> >>  
> >>      ASSERT(!is_pv_32bit_vcpu(v));
> >> @@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
> >>      if ( v->arch.flags & TF_kernel_mode )
> >>          v->arch.pv.gs_base_kernel = gs_base;
> >>      else
> >> +    {
> >>          v->arch.pv.gs_base_user = gs_base;
> >> +
> >> +        if ( opt_ibpb_mode_switch &&
> >> +             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
> >> +             !VM_ASSIST(d, mode_switch_no_ibpb) )
> >> +            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;
> > 
> > Similar to the remarks I've made before: if doing an IBPB on entry
> > is enough to cover the case here, it must also be fine to issue the
> > IBPB right here, instead of deferring it to the return-to-guest
> > context?
> > 
> > The only concern would be (as you mentioned before) to avoid
> > clearing valid Xen predictions, but I would rather see some figures
> > about what effect deferring it to the return-to-guest path has vs
> > issuing it right here.
> 
> Part of the reason (aiui) to do things on the exit path was to
> consolidate the context switch induced one and the user->kernel switch
> one into the same place and mechanism.

Isn't it kind of a very specific case that we end up doing a
user->kernel switch as part of a context switch?  IOW: would require
the vCPU to be scheduled out at that very specific point.

> >> + *
> >> + * By default (on affected and capable hardware) as a safety measure Xen,
> >> + * to cover for the fact that guest-kernel and guest-user modes are both
> >> + * running in ring 3 (and hence share prediction context), would issue a
> >> + * barrier for user->kernel mode switches of PV guests.
> >> + */
> >> +#define VMASST_TYPE_mode_switch_no_ibpb  33
> > 
> > Would it be possible to define the assist as
> > VMASST_TYPE_mode_switch_ibpb and have it default to on when the
> > feature is enabled, so that the guest could disable it if unneeded?
> > IMO negated options are in general harder to understand.
> 
> Negative options aren't nice, yes, but VM assists start out as all
> clear.

Are you sure?  I see VMASST_TYPE_pae_extended_cr3 getting set in
dom0_construct_pv(), and that makes me wonder whether other bits
couldn't start out set as well.

Maybe there's some restriction I'm missing, but I don't see any
wording in the description of the interface that states that all
assists are supposed to start disabled.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-19 11:48       ` Roger Pau Monné
@ 2023-12-19 14:06         ` Jan Beulich
  2023-12-19 15:11           ` Roger Pau Monné
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-12-19 14:06 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 19.12.2023 12:48, Roger Pau Monné wrote:
> On Tue, Dec 19, 2023 at 10:56:16AM +0100, Jan Beulich wrote:
>> On 18.12.2023 18:24, Roger Pau Monné wrote:
>>> On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/pv/domain.c
>>>> +++ b/xen/arch/x86/pv/domain.c
>>>> @@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
>>>>  void toggle_guest_mode(struct vcpu *v)
>>>>  {
>>>>      const struct domain *d = v->domain;
>>>> +    struct cpu_info *cpu_info = get_cpu_info();
>>>>      unsigned long gs_base;
>>>>  
>>>>      ASSERT(!is_pv_32bit_vcpu(v));
>>>> @@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
>>>>      if ( v->arch.flags & TF_kernel_mode )
>>>>          v->arch.pv.gs_base_kernel = gs_base;
>>>>      else
>>>> +    {
>>>>          v->arch.pv.gs_base_user = gs_base;
>>>> +
>>>> +        if ( opt_ibpb_mode_switch &&
>>>> +             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
>>>> +             !VM_ASSIST(d, mode_switch_no_ibpb) )
>>>> +            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;
>>>
>>> Similar to the remarks I've made before: if doing an IBPB on entry is
>>> enough to cover the case here, it must also be fine to issue the IBPB
>>> right here, instead of deferring it to the return-to-guest context?
>>>
>>> The only concern would be (as you mentioned before) to avoid clearing
>>> valid Xen predictions, but I would rather see some figures about what
>>> effect deferring it to the return-to-guest path has vs issuing it
>>> right here.
>>
>> Part of the reason (aiui) to do things on the exit path was to
>> consolidate the context switch induced one and the user->kernel switch
>> one into the same place and mechanism.
> 
> Isn't it kind of a very specific case that we end up doing a
> user->kernel switch as part of a context switch?  IOW: would require
> the vCPU to be scheduled out at that very specific point.

No, there's no user->kernel switch at the same time as a context
switch. What I was trying to explain is that with the actual IBPB being
issued on exit to guest, both the context switch path and the
user->kernel mode switch path set the same indicator, for the exit path
to consume.
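
In (pseudo-)C, the shape is roughly the following. This is a sketch
only: the real consumer lives on the assembly exit path, and
note_new_pred_ctxt() is a made-up name for the flag-setting step:

    /* Producers: both paths merely mark the local CPU. */
    static void note_new_pred_ctxt(void)    /* hypothetical helper */
    {
        get_cpu_info()->spec_ctrl_flags |= SCF_new_pred_ctxt;
    }
    /* Called from the context switch path and toggle_guest_mode(). */

    /* Consumer, once, on exit to guest (in reality an ALTERNATIVE
     * block in assembly rather than C): */
    static void spec_ctrl_exit_to_guest(struct cpu_info *info)
    {
        if ( info->spec_ctrl_flags & SCF_new_pred_ctxt )
        {
            info->spec_ctrl_flags &= ~SCF_new_pred_ctxt;
            wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);
        }
    }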

>>>> + *
>>>> + * By default (on affected and capable hardware) as a safety measure Xen,
>>>> + * to cover for the fact that guest-kernel and guest-user modes are both
>>>> + * running in ring 3 (and hence share prediction context), would issue a
>>>> + * barrier for user->kernel mode switches of PV guests.
>>>> + */
>>>> +#define VMASST_TYPE_mode_switch_no_ibpb  33
>>>
>>> Would it be possible to define the assist as
>>> VMASST_TYPE_mode_switch_ibpb and have it default to on when the
>>> feature is enabled, so that the guest could disable it if unneeded?
>>> IMO negated options are in general harder to understand.
>>
>> Negative options aren't nice, yes, but VM assists start out as all
>> clear.
> 
> Are you sure?  I see VMASST_TYPE_pae_extended_cr3 getting set in
> dom0_construct_pv(), and that makes me wonder whether other bits
> couldn't start out set as well.
> 
> Maybe there's some restriction I'm missing, but I don't see any
> wording in the description of the interface that states that all
> assists are supposed to start disabled.

Well, that setting of pae_extended_cr3 is in response to the kernel's
notes section having a respective indicator. So we still only set the
bit in response to what the kernel's asking us to do, just that here
we carry out the request ahead of launching the kernel.

Also consider what would happen during migration if there was a
default-on assist: At the destination we can't know whether the
source simply didn't know of the bit, or whether the guest elected to
clear it.

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-19 14:06         ` Jan Beulich
@ 2023-12-19 15:11           ` Roger Pau Monné
  2023-12-19 17:07             ` Roger Pau Monné
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-19 15:11 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Dec 19, 2023 at 03:06:50PM +0100, Jan Beulich wrote:
> On 19.12.2023 12:48, Roger Pau Monné wrote:
> > On Tue, Dec 19, 2023 at 10:56:16AM +0100, Jan Beulich wrote:
> >> On 18.12.2023 18:24, Roger Pau Monné wrote:
> >>> On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
> >>>> --- a/xen/arch/x86/pv/domain.c
> >>>> +++ b/xen/arch/x86/pv/domain.c
> >>>> @@ -455,6 +455,7 @@ static void _toggle_guest_pt(struct vcpu
> >>>>  void toggle_guest_mode(struct vcpu *v)
> >>>>  {
> >>>>      const struct domain *d = v->domain;
> >>>> +    struct cpu_info *cpu_info = get_cpu_info();
> >>>>      unsigned long gs_base;
> >>>>  
> >>>>      ASSERT(!is_pv_32bit_vcpu(v));
> >>>> @@ -467,15 +468,21 @@ void toggle_guest_mode(struct vcpu *v)
> >>>>      if ( v->arch.flags & TF_kernel_mode )
> >>>>          v->arch.pv.gs_base_kernel = gs_base;
> >>>>      else
> >>>> +    {
> >>>>          v->arch.pv.gs_base_user = gs_base;
> >>>> +
> >>>> +        if ( opt_ibpb_mode_switch &&
> >>>> +             !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
> >>>> +             !VM_ASSIST(d, mode_switch_no_ibpb) )
> >>>> +            cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;
> >>>
> >>> Similar to the remarks I've made before: if doing an IBPB on entry
> >>> is enough to cover the case here, it must also be fine to issue the
> >>> IBPB right here, instead of deferring it to the return-to-guest
> >>> context?
> >>>
> >>> The only concern would be (as you mentioned before) to avoid
> >>> clearing valid Xen predictions, but I would rather see some figures
> >>> about what effect deferring it to the return-to-guest path has vs
> >>> issuing it right here.
> >>
> >> Part of the reason (aiui) to do things on the exit path was to
> >> consolidate the context switch induced one and the user->kernel switch
> >> one into the same place and mechanism.
> > 
> > Isn't it kind of a very specific case that we end up doing a
> > user->kernel switch as part of a context switch?  IOW: would require
> > the vCPU to be scheduled out at that very specific point.
> 
> No, there's no user->kernel switch at the same time as a context
> switch. What I was trying to explain is that with the actual IBPB
> being issued on exit to guest, both the context switch path and the
> user->kernel mode switch path set the same indicator, for the exit
> path to consume.

Deferring to the exit-to-guest path could be OK, but unless it's
strictly needed, which I don't think is the case, I would ask for the
IBPB to be executed in C context rather than in assembly.
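
Concretely, "in C context" would mean issuing the barrier right in
toggle_guest_mode() instead of flagging for the exit path. A sketch,
reusing the conditions from the hunk quoted above:

    /* Hypothetical alternative: immediate barrier, no deferral. */
    if ( opt_ibpb_mode_switch &&
         !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
         !VM_ASSIST(d, mode_switch_no_ibpb) )
        wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB);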

> >>>> + *
> >>>> + * By default (on affected and capable hardware) as a safety measure Xen,
> >>>> + * to cover for the fact that guest-kernel and guest-user modes are both
> >>>> + * running in ring 3 (and hence share prediction context), would issue a
> >>>> + * barrier for user->kernel mode switches of PV guests.
> >>>> + */
> >>>> +#define VMASST_TYPE_mode_switch_no_ibpb  33
> >>>
> >>> Would it be possible to define the assist as
> >>> VMASST_TYPE_mode_switch_ibpb and have it default to on when the
> >>> feature is enabled, so that the guest could disable it if unneeded?
> >>> IMO negated options are in general harder to understand.
> >>
> >> Negative options aren't nice, yes, but VM assists start out as all
> >> clear.
> > 
> > Are you sure?  I see VMASST_TYPE_pae_extended_cr3 getting set in
> > dom0_construct_pv(), and that makes me wonder whether other bits
> > couldn't start out set as well.
> > 
> > Maybe there's some restriction I'm missing, but I don't see any
> > wording in the description of the interface that states that all
> > assists are supposed to start disabled.
> 
> Well, that setting of pae_extended_cr3 is in response to the kernel's
> notes section having a respective indicator. So we still only set the
> bit in response to what the kernel's asking us to do, just that here
> we carry out the request ahead of launching the kernel.
> 
> Also consider what would happen during migration if there was a
> default-on assist: At the destination we can't know whether the
> source simply didn't know of the bit, or whether the guest elected to
> clear it.

Hm, I see, so I was indeed missing that aspect.  VM assist is passed
as a plain bitmap, and there's no indication of which assists the VM
had available on the source side if they weren't enabled.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-19 15:11           ` Roger Pau Monné
@ 2023-12-19 17:07             ` Roger Pau Monné
  2023-12-20  9:25               ` Jan Beulich
  0 siblings, 1 reply; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-19 17:07 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Tue, Dec 19, 2023 at 04:11:09PM +0100, Roger Pau Monné wrote:
> On Tue, Dec 19, 2023 at 03:06:50PM +0100, Jan Beulich wrote:
> > On 19.12.2023 12:48, Roger Pau Monné wrote:
> > > On Tue, Dec 19, 2023 at 10:56:16AM +0100, Jan Beulich wrote:
> > >> On 18.12.2023 18:24, Roger Pau Monné wrote:
> > >>> On Tue, Feb 14, 2023 at 05:12:08PM +0100, Jan Beulich wrote:
> [...]
> 
> > >>>> + *
> > >>>> + * By default (on affected and capable hardware) as a safety measure Xen,
> > >>>> + * to cover for the fact that guest-kernel and guest-user modes are both
> > >>>> + * running in ring 3 (and hence share prediction context), would issue a
> > >>>> + * barrier for user->kernel mode switches of PV guests.
> > >>>> + */
> > >>>> +#define VMASST_TYPE_mode_switch_no_ibpb  33
> > >>>
> > >>> Would it be possible to define the assist as
> > >>> VMASST_TYPE_mode_switch_ibpb and have it default to on when the
> > >>> feature is enabled, so that the guest could disable it if
> > >>> unneeded?  IMO negated options are in general harder to understand.
> > >>
> > >> Negative options aren't nice, yes, but VM assists start out as all
> > >> clear.
> > > 
> > > Are you sure?  I see VMASST_TYPE_pae_extended_cr3 getting set in
> > > dom0_construct_pv(), and that makes me wonder whether other bits
> > > couldn't start out set as well.
> > > 
> > > Maybe there's some restriction I'm missing, but I don't see any
> > > wording in the description of the interface that states that all
> > > assists are supposed to start disabled.
> > 
> > Well, that setting of pae_extended_cr3 is in response to the kernel's
> > notes section having a respective indicator. So we still only set the
> > bit in response to what the kernel's asking us to do, just that here
> > we carry out the request ahead of launching the kernel.
> > 
> > Also consider what would happen during migration if there was a
> > default-on assist: At the destination we can't know whether the
> > source simply didn't know of the bit, or whether the guest elected to
> > clear it.
> 
> Hm, I see, so I was indeed missing that aspect.  VM assist is passed
> as a plain bitmap, and there's no indication of which assists the VM
> had available on the source side if they weren't enabled.

Sorry, please bear with me, as I've been further thinking about this.

Why does the assist need to be default-on?  It's my understanding
that the guest can execute the IBPB itself by writing to the MSR.
That's suboptimal in the user -> kernel context switch, as it then
involves two traps into Xen, but the guest is not left insecure; it
just needs to write the MSR itself, like on native.
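
For reference, "writing the MSR itself" is the same sequence as on
native, just trapped and emulated by Xen under PV. A minimal
guest-kernel sketch, using Linux's constant names:

    /* Guest user->kernel switch: explicit barrier, as on native.
     * Under PV this write traps to Xen, i.e. one extra round trip. */
    wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);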

In fact, if we add an IBPB by default as part of the amd64 PV user ->
kernel guest context switch, we are likely doing a double IBPB on
guests not aware of the assist.

IOW: I don't see why making the assist a guest opt-in would be
insecure; in fact I think it's the best approach (again, I might be
missing something).

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-19 17:07             ` Roger Pau Monné
@ 2023-12-20  9:25               ` Jan Beulich
  2023-12-20  9:59                 ` Roger Pau Monné
  0 siblings, 1 reply; 41+ messages in thread
From: Jan Beulich @ 2023-12-20  9:25 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Andrew Cooper, Wei Liu

On 19.12.2023 18:07, Roger Pau Monné wrote:
> [...]
> 
> Sorry, please bear with me, as I've been further thinking about this.
> 
> Why does the assist need to be default-on?  It's my understanding
> that the guest can execute the IBPB itself by writing to the MSR.
> That's suboptimal in the user -> kernel context switch, as it then
> involves two traps into Xen, but the guest is not left insecure; it
> just needs to write the MSR itself, like on native.
> 
> In fact, if we add an IBPB by default as part of the amd64 PV user ->
> kernel guest context switch, we are likely doing a double IBPB on
> guests not aware of the assist.
> 
> IOW: I don't see why making the assist a guest opt-in would be
> insecure; in fact I think it's the best approach (again, I might be
> missing something).

By issuing IBPB by default we can make guests safe (in this regard)
irrespective of their awareness of IBPB, and in particular their
awareness of IBPB being needed explicitly on the user->kernel mode
transition (where on native, with IBRS enabled, sufficient separation
exists iirc). IOW we're trying to cater for a 64-bit-PV special aspect
by default. (Andrew, please correct me if there's anything wrong in
here.)

Jan


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode
  2023-12-20  9:25               ` Jan Beulich
@ 2023-12-20  9:59                 ` Roger Pau Monné
  0 siblings, 0 replies; 41+ messages in thread
From: Roger Pau Monné @ 2023-12-20  9:59 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Andrew Cooper, Wei Liu

On Wed, Dec 20, 2023 at 10:25:57AM +0100, Jan Beulich wrote:
> On 19.12.2023 18:07, Roger Pau Monné wrote:
> > [...]
> > 
> > Sorry, please bear with me, as I've been further thinking about this.
> > 
> > Why does the assist need to be default-on?  It's my understanding
> > that the guest can execute the IBPB itself by writing to the MSR.
> > That's suboptimal in the user -> kernel context switch, as it then
> > involves two traps into Xen, but the guest is not left insecure; it
> > just needs to write the MSR itself, like on native.
> > 
> > In fact, if we add an IBPB by default as part of the amd64 PV user ->
> > kernel guest context switch, we are likely doing a double IBPB on
> > guests not aware of the assist.
> > 
> > IOW: I don't see why making the assist a guest opt-in would be
> > insecure; in fact I think it's the best approach (again, I might be
> > missing something).
> 
> By issuing IBPB by default we can make guests safe (in this regard)
> irrespective of their awareness of IBPB, and in particular their
> awareness of IBPB being needed explicitly on the user->kernel mode
> transition (where on native, with IBRS enabled, sufficient separation
> exists iirc). IOW we're trying to cater for a 64-bit-PV special aspect
> by default. (Andrew, please correct me if there's anything wrong in
> here.)

Hm, maybe.  My point would be that PV is already specific enough that
OSes shouldn't expect things like IBRS to work as on native, and hence
should be aware of user and kernel running in the same privilege mode
and issue the IBPB themselves.

Setting that aside, would it make sense to tie the IBPB on guest user
-> kernel switches to the guest having enabled IBRS?  AFAICT IBRS on
64-bit PV is useless, as from the predictor PoV both user and kernel
space share the same mode.  Hence a PV guest enabling IBRS could be
used as a signal for Xen to execute IBPB on user -> kernel guest
context switches?
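
If that route were taken, the check in toggle_guest_mode() might look
roughly like this; purely hypothetical, reusing the PV MSR_SPEC_CTRL
state Xen already tracks for the guest:

    /* Hypothetical: treat a PV guest's IBRS enable as the opt-in. */
    if ( opt_ibpb_mode_switch &&
         !(d->arch.spec_ctrl_flags & SCF_entry_ibpb) &&
         (v->arch.msrs->spec_ctrl.raw & SPEC_CTRL_IBRS) )
        cpu_info->spec_ctrl_flags |= SCF_new_pred_ctxt;

That would avoid adding a new public interface, at the cost of
inferring intent from a bit the guest sets for other purposes.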

Thanks, Roger.


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2023-12-20 10:00 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-14 16:09 [PATCH v4 0/4 + v1 0/1] x86/spec-ctrl: IBPB improvements Jan Beulich
2023-02-14 16:10 ` [PATCH v4 1/4] x86/spec-ctrl: add logic to issue IBPB on exit to guest Jan Beulich
2023-12-18 12:11   ` Roger Pau Monné
2023-12-18 13:46     ` Jan Beulich
2023-12-18 13:50       ` Jan Beulich
2023-12-18 15:43         ` Roger Pau Monné
2023-12-18 16:02           ` Jan Beulich
2023-12-18 13:54       ` Jan Beulich
2023-12-18 15:40       ` Roger Pau Monné
2023-12-18 16:00         ` Jan Beulich
2023-02-14 16:11 ` [PATCH v4 2/4] x86/spec-ctrl: defer context-switch IBPB until guest entry Jan Beulich
2023-12-18 12:39   ` Roger Pau Monné
2023-12-18 13:58     ` Jan Beulich
2023-12-18 17:27       ` Roger Pau Monné
2023-02-14 16:11 ` [PATCH v4 3/4] x86: limit issuing of IBPB during context switch Jan Beulich
2023-12-18 15:19   ` Roger Pau Monné
2023-12-18 16:09     ` Jan Beulich
2023-12-18 16:11     ` Jan Beulich
2023-02-14 16:12 ` [PATCH v4 4/4] x86/PV: issue branch prediction barrier when switching 64-bit guest to kernel mode Jan Beulich
2023-12-18 17:24   ` Roger Pau Monné
2023-12-19  9:56     ` Jan Beulich
2023-12-19 11:48       ` Roger Pau Monné
2023-12-19 14:06         ` Jan Beulich
2023-12-19 15:11           ` Roger Pau Monné
2023-12-19 17:07             ` Roger Pau Monné
2023-12-20  9:25               ` Jan Beulich
2023-12-20  9:59                 ` Roger Pau Monné
2023-02-14 16:13 ` [PATCH] x86/Xen: make use of IBPB controlling VM assist Jan Beulich
2023-02-14 23:53   ` Boris Ostrovsky
2023-02-15  0:07     ` Boris Ostrovsky
2023-02-15  8:31       ` Jan Beulich
2023-02-15 23:22         ` Boris Ostrovsky
2023-02-16  7:33           ` Jan Beulich
2023-03-17 13:56         ` Juergen Gross
2023-03-17 14:21           ` Andrew Cooper
2023-03-17 14:28             ` Juergen Gross
2023-03-20 10:21             ` Jan Beulich
2023-03-20 10:19           ` Jan Beulich
2023-03-20 13:02             ` Juergen Gross
2023-03-20 13:17               ` Jan Beulich
2023-03-20 13:35                 ` Juergen Gross
