All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] x86/vmx: implement Bus Lock and VM Notify
@ 2022-05-17 13:21 Roger Pau Monne
  2022-05-17 13:21 ` [PATCH 1/2] x86/vmx: implement Bus Lock detection Roger Pau Monne
  2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
  0 siblings, 2 replies; 15+ messages in thread
From: Roger Pau Monne @ 2022-05-17 13:21 UTC (permalink / raw)
  To: xen-devel
  Cc: Roger Pau Monne, Jun Nakajima, Kevin Tian, Jan Beulich,
	Andrew Cooper, Wei Liu, George Dunlap, Julien Grall,
	Stefano Stabellini

Hello,

Following series implements support for bus lock and VM notify.

Patches are not really dependent, but I've developed them together by
virtue of both features being in Intel Instructions Set Extensions PR
Chapter 9.

Thanks, Roger.

Roger Pau Monne (2):
  x86/vmx: implement Bus Lock detection
  x86/vmx: implement Notify VM Exit

 docs/misc/xen-command-line.pandoc       | 11 +++++++
 xen/arch/x86/hvm/vmx/vmcs.c             | 22 ++++++++++++-
 xen/arch/x86/hvm/vmx/vmx.c              | 42 +++++++++++++++++++++++++
 xen/arch/x86/include/asm/hvm/vmx/vmcs.h |  7 +++++
 xen/arch/x86/include/asm/hvm/vmx/vmx.h  |  8 +++++
 xen/arch/x86/include/asm/perfc_defn.h   |  5 ++-
 6 files changed, 93 insertions(+), 2 deletions(-)

-- 
2.36.0



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/2] x86/vmx: implement Bus Lock detection
  2022-05-17 13:21 [PATCH 0/2] x86/vmx: implement Bus Lock and VM Notify Roger Pau Monne
@ 2022-05-17 13:21 ` Roger Pau Monne
  2022-05-18 22:50   ` Andrew Cooper
  2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
  1 sibling, 1 reply; 15+ messages in thread
From: Roger Pau Monne @ 2022-05-17 13:21 UTC (permalink / raw)
  To: xen-devel
  Cc: Roger Pau Monne, Jun Nakajima, Kevin Tian, Jan Beulich,
	Andrew Cooper, Wei Liu

Add support for enabling Bus Lock Detection on Intel systems.  Such
detection works by triggering a vmexit, which is enough of a pause to
prevent a guest from abusing of the Bus Lock.

Add an extra perf counter to track the number of Bus Locks detected.
This is done because Bus Locks can also be reported by setting the bit
26 in the exit reason field, so also account for those.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/hvm/vmx/vmcs.c             |  4 +++-
 xen/arch/x86/hvm/vmx/vmx.c              | 18 ++++++++++++++++++
 xen/arch/x86/include/asm/hvm/vmx/vmcs.h |  3 +++
 xen/arch/x86/include/asm/hvm/vmx/vmx.h  |  2 ++
 xen/arch/x86/include/asm/perfc_defn.h   |  4 +++-
 5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 56fed2db03..d388e6729c 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -209,6 +209,7 @@ static void __init vmx_display_features(void)
     P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
     P(cpu_has_vmx_pml, "Page Modification Logging");
     P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
+    P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
 #undef P
 
     if ( !printed )
@@ -318,7 +319,8 @@ static int vmx_init_vmcs_config(bool bsp)
                SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
                SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
                SECONDARY_EXEC_XSAVES |
-               SECONDARY_EXEC_TSC_SCALING);
+               SECONDARY_EXEC_TSC_SCALING |
+               SECONDARY_EXEC_BUS_LOCK_DETECTION);
         if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
             opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
         if ( opt_vpid_enabled )
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index d03e78bf0d..02cc7a2023 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4053,6 +4053,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
+    if ( unlikely(exit_reason & VMX_EXIT_REASONS_BUS_LOCK) )
+    {
+        /*
+         * Delivery of Bus Lock VM exit was pre-empted by a higher priority VM
+         * exit.
+         */
+        exit_reason &= ~VMX_EXIT_REASONS_BUS_LOCK;
+        if ( exit_reason != EXIT_REASON_BUS_LOCK )
+            perfc_incr(buslock);
+    }
 
     if ( v->arch.hvm.vmx.vmx_realmode )
     {
@@ -4549,6 +4559,14 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
         vmx_handle_descriptor_access(exit_reason);
         break;
 
+    case EXIT_REASON_BUS_LOCK:
+        perfc_incr(buslock);
+        /*
+         * Nothing to do: just taking a vmexit should be enough of a pause to
+         * prevent a VM from crippling the host with bus locks.
+         */
+        break;
+
     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
     case EXIT_REASON_INVPCID:
     /* fall through */
diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
index 9119aa8536..5d3edc1642 100644
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -266,6 +266,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS   0x00040000
 #define SECONDARY_EXEC_XSAVES                   0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION       0x40000000
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED                         0x00000001
@@ -345,6 +346,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_secondary_exec_control & SECONDARY_EXEC_XSAVES)
 #define cpu_has_vmx_tsc_scaling \
     (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+#define cpu_has_vmx_bus_lock_detection \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_BUS_LOCK_DETECTION)
 
 #define VMCS_RID_TYPE_MASK              0x80000000
 
diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmx.h b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
index 8eedf59155..03995701a1 100644
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -159,6 +159,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
  * Exit Reasons
  */
 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_BUS_LOCK       (1u << 26)
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
@@ -219,6 +220,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_BUS_LOCK            74
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
diff --git a/xen/arch/x86/include/asm/perfc_defn.h b/xen/arch/x86/include/asm/perfc_defn.h
index b07063b7d8..d6eb661940 100644
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,           "exceptions", 32)
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 65
+#define VMX_PERF_EXIT_REASON_SIZE 75
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
@@ -125,4 +125,6 @@ PERFCOUNTER(realmode_exits,      "vmexits from realmode")
 
 PERFCOUNTER(pauseloop_exits, "vmexits from Pause-Loop Detection")
 
+PERFCOUNTER(buslock, "Bus Locks Detected")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
-- 
2.36.0



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-17 13:21 [PATCH 0/2] x86/vmx: implement Bus Lock and VM Notify Roger Pau Monne
  2022-05-17 13:21 ` [PATCH 1/2] x86/vmx: implement Bus Lock detection Roger Pau Monne
@ 2022-05-17 13:21 ` Roger Pau Monne
  2022-05-17 14:55   ` Roger Pau Monné
                     ` (2 more replies)
  1 sibling, 3 replies; 15+ messages in thread
From: Roger Pau Monne @ 2022-05-17 13:21 UTC (permalink / raw)
  To: xen-devel
  Cc: Roger Pau Monne, Andrew Cooper, George Dunlap, Jan Beulich,
	Julien Grall, Stefano Stabellini, Wei Liu, Jun Nakajima,
	Kevin Tian

Under certain conditions guests can get the CPU stuck in an infinite
loop without the possibility of an interrupt window to occur.  This
was the case with the scenarios described in XSA-156.

Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
if no interrupt window occurs for a specified amount of time.  Note
that using the Notify VM Exit avoids having to trap #AC and #DB
exceptions, as Xen is guaranteed to get a VM Exit even if the guest
puts the CPU in a loop without an interrupt window, as such disable
the intercepts if the feature is available and enabled.

Setting the notify VM exit window to 0 is safe because there's a
threshold added by the hardware in order to have a sane window value.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
This change enables the notify VM exit by default, KVM however doesn't
seem to enable it by default, and there's the following note in the
commit message:

"- There's a possibility, however small, that a notify VM exit happens
   with VM_CONTEXT_INVALID set in exit qualification. In this case, the
   vcpu can no longer run. To avoid killing a well-behaved guest, set
   notify window as -1 to disable this feature by default."

It's not obviously clear to me whether the comment was meant to be:
"There's a possibility, however small, that a notify VM exit _wrongly_
happens with VM_CONTEXT_INVALID".

It's also not clear whether such wrong hardware behavior only affects
a specific set of hardware, in a way that we could avoid enabling
notify VM exit there.

There's a discussion in one of the Linux patches that 128K might be
the safer value in order to prevent false positives, but I have no
formal confirmation about this.  Maybe our Intel maintainers can
provide some more feedback on a suitable notify VM exit window
value.

I've tested with 0 (the proposed default in the patch) and I don't
seem to be able to trigger notify VM exits under normal guest
operation.  Note that even in that case the guest won't be destroyed
unless the context is corrupt.
---
 docs/misc/xen-command-line.pandoc       | 11 +++++++++++
 xen/arch/x86/hvm/vmx/vmcs.c             | 20 +++++++++++++++++++-
 xen/arch/x86/hvm/vmx/vmx.c              | 24 ++++++++++++++++++++++++
 xen/arch/x86/include/asm/hvm/vmx/vmcs.h |  4 ++++
 xen/arch/x86/include/asm/hvm/vmx/vmx.h  |  6 ++++++
 xen/arch/x86/include/asm/perfc_defn.h   |  3 ++-
 6 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 1dc7e1ca07..ccf8bf5806 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -2544,6 +2544,17 @@ guest will notify Xen that it has failed to acquire a spinlock.
 <major>, <minor> and <build> must be integers. The values will be
 encoded in guest CPUID 0x40000002 if viridian enlightenments are enabled.
 
+### vm-notify-window (Intel)
+> `= <integer>`
+
+> Default: `0`
+
+Specify the value of the VM Notify window used to detect locked VMs. Set to -1
+to disable the feature.  Value is in units of crystal clock cycles.
+
+Note the hardware might add a threshold to the provided value in order to make
+it safe, and hence using 0 is fine.
+
 ### vpid (Intel)
 > `= <boolean>`
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index d388e6729c..5685a5523e 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
 static unsigned int __read_mostly ple_window = 4096;
 integer_param("ple_window", ple_window);
 
+static int __read_mostly vm_notify_window;
+integer_param("vm-notify-window", vm_notify_window);
+
 static bool __read_mostly opt_ept_pml = true;
 static s8 __read_mostly opt_ept_ad = -1;
 int8_t __read_mostly opt_ept_exec_sp = -1;
@@ -210,6 +213,7 @@ static void __init vmx_display_features(void)
     P(cpu_has_vmx_pml, "Page Modification Logging");
     P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
     P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
+    P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit");
 #undef P
 
     if ( !printed )
@@ -320,7 +324,8 @@ static int vmx_init_vmcs_config(bool bsp)
                SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
                SECONDARY_EXEC_XSAVES |
                SECONDARY_EXEC_TSC_SCALING |
-               SECONDARY_EXEC_BUS_LOCK_DETECTION);
+               SECONDARY_EXEC_BUS_LOCK_DETECTION |
+               SECONDARY_EXEC_NOTIFY_VM_EXITING);
         if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
             opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
         if ( opt_vpid_enabled )
@@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
         rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
                          VMX_MSR_GUEST_LOADONLY);
 
+    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )
+    {
+        __vmwrite(NOTIFY_WINDOW, vm_notify_window);
+        /*
+         * Disable #AC and #DB interception: by using VM Notify Xen is
+         * guaranteed to get a VM exit even if the guest manages to lock the
+         * CPU.
+         */
+        v->arch.hvm.vmx.exception_bitmap &= ~((1U << TRAP_debug) |
+                                              (1U << TRAP_alignment_check));
+        vmx_update_exception_bitmap(v);
+    }
+
  out:
     vmx_vmcs_exit(v);
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 02cc7a2023..9c37790c36 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
          */
         break;
 
+    case EXIT_REASON_NOTIFY:
+        __vmread(EXIT_QUALIFICATION, &exit_qualification);
+
+        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
+        {
+            perfc_incr(vmnotify_crash);
+            gprintk(XENLOG_ERR, "invalid VM context after notify vmexit\n");
+            domain_crash(v->domain);
+            break;
+        }
+
+        if ( cpu_has_vmx_vnmi &&
+             (exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) )
+        {
+            unsigned long guest_info;
+
+            /* Exit was incident to an execution of IRET that unblocked NMIs. */
+            __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
+            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
+                      guest_info | VMX_INTR_SHADOW_NMI);
+        }
+
+        break;
+
     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
     case EXIT_REASON_INVPCID:
     /* fall through */
diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
index 5d3edc1642..0961eabf3f 100644
--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
@@ -267,6 +267,7 @@ extern u32 vmx_vmentry_control;
 #define SECONDARY_EXEC_XSAVES                   0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
 #define SECONDARY_EXEC_BUS_LOCK_DETECTION       0x40000000
+#define SECONDARY_EXEC_NOTIFY_VM_EXITING        0x80000000
 extern u32 vmx_secondary_exec_control;
 
 #define VMX_EPT_EXEC_ONLY_SUPPORTED                         0x00000001
@@ -348,6 +349,8 @@ extern u64 vmx_ept_vpid_cap;
     (vmx_secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
 #define cpu_has_vmx_bus_lock_detection \
     (vmx_secondary_exec_control & SECONDARY_EXEC_BUS_LOCK_DETECTION)
+#define cpu_has_vmx_notify_vm_exiting \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_NOTIFY_VM_EXITING)
 
 #define VMCS_RID_TYPE_MASK              0x80000000
 
@@ -455,6 +458,7 @@ enum vmcs_field {
     SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
     PLE_GAP                         = 0x00004020,
     PLE_WINDOW                      = 0x00004022,
+    NOTIFY_WINDOW                   = 0x00004024,
     VM_INSTRUCTION_ERROR            = 0x00004400,
     VM_EXIT_REASON                  = 0x00004402,
     VM_EXIT_INTR_INFO               = 0x00004404,
diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmx.h b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
index 03995701a1..a16055643b 100644
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -221,6 +221,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_BUS_LOCK            74
+#define EXIT_REASON_NOTIFY              75
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
@@ -233,6 +234,11 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
 #define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
 
+/*
+ * Exit Qualifications for NOTIFY VM EXIT
+ */
+#define NOTIFY_VM_CONTEXT_INVALID       1u
+
 /*
  * Exit Qualifications for MOV for Control Register Access
  */
diff --git a/xen/arch/x86/include/asm/perfc_defn.h b/xen/arch/x86/include/asm/perfc_defn.h
index d6eb661940..c6b601b729 100644
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,           "exceptions", 32)
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 75
+#define VMX_PERF_EXIT_REASON_SIZE 76
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
@@ -126,5 +126,6 @@ PERFCOUNTER(realmode_exits,      "vmexits from realmode")
 PERFCOUNTER(pauseloop_exits, "vmexits from Pause-Loop Detection")
 
 PERFCOUNTER(buslock, "Bus Locks Detected")
+PERFCOUNTER(vmnotify_crash, "domains crashed by Notify VM Exit")
 
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
-- 
2.36.0



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
@ 2022-05-17 14:55   ` Roger Pau Monné
  2022-05-19  0:10   ` Andrew Cooper
  2022-05-19  6:50   ` Jan Beulich
  2 siblings, 0 replies; 15+ messages in thread
From: Roger Pau Monné @ 2022-05-17 14:55 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, George Dunlap, Jan Beulich, Julien Grall,
	Stefano Stabellini, Wei Liu, Jun Nakajima, Kevin Tian

On Tue, May 17, 2022 at 03:21:30PM +0200, Roger Pau Monne wrote:
> Under certain conditions guests can get the CPU stuck in an infinite
> loop without the possibility of an interrupt window to occur.  This
> was the case with the scenarios described in XSA-156.
> 
> Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
> if no interrupt window occurs for a specified amount of time.  Note
> that using the Notify VM Exit avoids having to trap #AC and #DB
> exceptions, as Xen is guaranteed to get a VM Exit even if the guest
> puts the CPU in a loop without an interrupt window, as such disable
> the intercepts if the feature is available and enabled.
> 
> Setting the notify VM exit window to 0 is safe because there's a
> threshold added by the hardware in order to have a sane window value.
> 
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> This change enables the notify VM exit by default, KVM however doesn't
> seem to enable it by default, and there's the following note in the
> commit message:
> 
> "- There's a possibility, however small, that a notify VM exit happens
>    with VM_CONTEXT_INVALID set in exit qualification. In this case, the
>    vcpu can no longer run. To avoid killing a well-behaved guest, set
>    notify window as -1 to disable this feature by default."
> 
> It's not obviously clear to me whether the comment was meant to be:
> "There's a possibility, however small, that a notify VM exit _wrongly_
> happens with VM_CONTEXT_INVALID".
> 
> It's also not clear whether such wrong hardware behavior only affects
> a specific set of hardware, in a way that we could avoid enabling
> notify VM exit there.
> 
> There's a discussion in one of the Linux patches that 128K might be
> the safer value in order to prevent false positives, but I have no
> formal confirmation about this.  Maybe our Intel maintainers can
> provide some more feedback on a suitable notify VM exit window
> value.
> 
> I've tested with 0 (the proposed default in the patch) and I don't
> seem to be able to trigger notify VM exits under normal guest
> operation.  Note that even in that case the guest won't be destroyed
> unless the context is corrupt.
> ---
>  docs/misc/xen-command-line.pandoc       | 11 +++++++++++
>  xen/arch/x86/hvm/vmx/vmcs.c             | 20 +++++++++++++++++++-
>  xen/arch/x86/hvm/vmx/vmx.c              | 24 ++++++++++++++++++++++++
>  xen/arch/x86/include/asm/hvm/vmx/vmcs.h |  4 ++++
>  xen/arch/x86/include/asm/hvm/vmx/vmx.h  |  6 ++++++
>  xen/arch/x86/include/asm/perfc_defn.h   |  3 ++-
>  6 files changed, 66 insertions(+), 2 deletions(-)
> 
> diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
> index 1dc7e1ca07..ccf8bf5806 100644
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -2544,6 +2544,17 @@ guest will notify Xen that it has failed to acquire a spinlock.
>  <major>, <minor> and <build> must be integers. The values will be
>  encoded in guest CPUID 0x40000002 if viridian enlightenments are enabled.
>  
> +### vm-notify-window (Intel)
> +> `= <integer>`
> +
> +> Default: `0`
> +
> +Specify the value of the VM Notify window used to detect locked VMs. Set to -1
> +to disable the feature.  Value is in units of crystal clock cycles.

The following chunk is missing in order for -1 to disable the feature:

diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 5685a5523e..817e644d09 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1110,6 +1110,10 @@ static int construct_vmcs(struct vcpu *v)
           SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
           SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);

+    if ( vm_notify_window < 0 )
+        v->arch.hvm.vmx.secondary_exec_control &=
+            ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
     if ( paging_mode_hap(d) )
     {
         v->arch.hvm.vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |

Thanks, Roger.


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] x86/vmx: implement Bus Lock detection
  2022-05-17 13:21 ` [PATCH 1/2] x86/vmx: implement Bus Lock detection Roger Pau Monne
@ 2022-05-18 22:50   ` Andrew Cooper
  2022-05-19 12:21     ` Roger Pau Monné
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2022-05-18 22:50 UTC (permalink / raw)
  To: Roger Pau Monne, xen-devel; +Cc: Jun Nakajima, Kevin Tian, Jan Beulich, Wei Liu

On 17/05/2022 14:21, Roger Pau Monne wrote:
> Add support for enabling Bus Lock Detection on Intel systems.  Such
> detection works by triggering a vmexit, which is enough of a pause to
> prevent a guest from abusing of the Bus Lock.

"which is enough of a" is a bit firmer than ideal.  "which Andy says
will be ok" is perhaps more accurate.

Perhaps "which ought to be enough" ?

A buslock here or there is no problem, and non-malicious software
appears to be devoid of buslocks (hardly surprising - it would be a hard
error on other architectures), but a malicious piece of userspace can
trivially cripple the system.

Forcing a vmexit on every buslock limits an attacker to one buslock per
however long a vmentry/exit cycle takes.

> Add an extra perf counter to track the number of Bus Locks detected.

extra Xen perf counter.

Because other hypervisors use actual perf counters to emulate this
ability on current hardware.  Maybe something we should consider...

> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index d03e78bf0d..02cc7a2023 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -4053,6 +4053,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>  
>      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
>          return vmx_failed_vmentry(exit_reason, regs);
> +    if ( unlikely(exit_reason & VMX_EXIT_REASONS_BUS_LOCK) )
> +    {
> +        /*
> +         * Delivery of Bus Lock VM exit was pre-empted by a higher priority VM
> +         * exit.
> +         */
> +        exit_reason &= ~VMX_EXIT_REASONS_BUS_LOCK;
> +        if ( exit_reason != EXIT_REASON_BUS_LOCK )
> +            perfc_incr(buslock);

I'm pretty sure you can drop the if, and do the perfc_incr()
unconditionally.  You won't get EXIT_REASON_BUS_LOCK |
VMX_EXIT_REASONS_BUS_LOCK given that wording in the ISE.

To test, Intel has PENDING_DBG which interferes with most easy attempts
to create the condition, but how about this.

Load an LDT, misaligned across a cacheline boundary, and set #DB's %cs
to LDT[0] with a clear access bit, then execute an `icebp` instruction. 
The atomic write to set the access bit is a 4-byte access typically.

This should cause the #DB intercept to trigger on the same instantaneous
boundary that generated the buslock.

Otherwise, LGTM.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
  2022-05-17 14:55   ` Roger Pau Monné
@ 2022-05-19  0:10   ` Andrew Cooper
  2022-05-19  6:59     ` Jan Beulich
  2022-05-19 14:45     ` Roger Pau Monné
  2022-05-19  6:50   ` Jan Beulich
  2 siblings, 2 replies; 15+ messages in thread
From: Andrew Cooper @ 2022-05-19  0:10 UTC (permalink / raw)
  To: Roger Pau Monne, xen-devel
  Cc: George Dunlap, Jan Beulich, Julien Grall, Stefano Stabellini,
	Wei Liu, Jun Nakajima, Kevin Tian

On 17/05/2022 14:21, Roger Pau Monne wrote:
> Under certain conditions guests can get the CPU stuck in an infinite
> loop without the possibility of an interrupt window to occur.

instruction boundary.

It's trivial to create an infinite loop without an interrupt window :)

Also, I'd probably phrase that as an unbounded loop, because not all
problem cases are truly infinite.

>   This
> was the case with the scenarios described in XSA-156.

Case in point, both of these can be broken by something else (another
vCPU, or coherent DMA write) editing the IDT and e.g. making the #AC/#DB
vectors not present, which will yield #NP instead.

>
> Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
> if no interrupt window occurs for a specified amount of time.  Note
> that using the Notify VM Exit avoids having to trap #AC and #DB
> exceptions, as Xen is guaranteed to get a VM Exit even if the guest
> puts the CPU in a loop without an interrupt window, as such disable
> the intercepts if the feature is available and enabled.
>
> Setting the notify VM exit window to 0 is safe because there's a
> threshold added by the hardware in order to have a sane window value.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> This change enables the notify VM exit by default, KVM however doesn't
> seem to enable it by default, and there's the following note in the
> commit message:
>
> "- There's a possibility, however small, that a notify VM exit happens
>    with VM_CONTEXT_INVALID set in exit qualification. In this case, the
>    vcpu can no longer run. To avoid killing a well-behaved guest, set
>    notify window as -1 to disable this feature by default."
>
> It's not obviously clear to me whether the comment was meant to be:
> "There's a possibility, however small, that a notify VM exit _wrongly_
> happens with VM_CONTEXT_INVALID".

TBH, I read that as a get-out clause for "we have no idea what to set
for a default window", and it's not a decision reasonable to defer to
users, because they have even less of an idea than us.

All CPUs with Notify VM Exit have the TSC crystal information in CPUID,
so I'd suggest that we trust CPUID to be accurate, and program for maybe
10us?  That's 1/3 of a default timeslice.



> It's also not clear whether such wrong hardware behavior only affects
> a specific set of hardware, in a way that we could avoid enabling
> notify VM exit there.
>
> There's a discussion in one of the Linux patches that 128K might be
> the safer value in order to prevent false positives, but I have no
> formal confirmation about this.  Maybe our Intel maintainers can
> provide some more feedback on a suitable notify VM exit window
> value.
>
> I've tested with 0 (the proposed default in the patch) and I don't
> seem to be able to trigger notify VM exits under normal guest
> operation.  Note that even in that case the guest won't be destroyed
> unless the context is corrupt.

Huh... There's nothing in the manual about that, but obviously hardware
has some minimum safe value if 0 appears to work in practice.

> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> index d388e6729c..5685a5523e 100644
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
>  static unsigned int __read_mostly ple_window = 4096;
>  integer_param("ple_window", ple_window);
>  
> +static int __read_mostly vm_notify_window;
> +integer_param("vm-notify-window", vm_notify_window);

Part of me is loath to keep on adding new top-level options for this.

I was about to suggest having a vmx= option, but I've just noticed that
ple_{window,gap} are wired up to cmdline options on Intel, and fixed
constants on AMD.

Thoughts on a suitable name?

> @@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
>          rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
>                           VMX_MSR_GUEST_LOADONLY);
>  
> +    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )
> +    {
> +        __vmwrite(NOTIFY_WINDOW, vm_notify_window);
> +        /*
> +         * Disable #AC and #DB interception: by using VM Notify Xen is
> +         * guaranteed to get a VM exit even if the guest manages to lock the
> +         * CPU.
> +         */
> +        v->arch.hvm.vmx.exception_bitmap &= ~((1U << TRAP_debug) |
> +                                              (1U << TRAP_alignment_check));
> +        vmx_update_exception_bitmap(v);

IIRC, it's not quite this easy.  There are conditions, e.g. attaching
gdbsx, where #DB interception wants turning on/off dynamically, and the
logic got simplified to nothing following XSA-156, so will need
reintroducing.

AMD Milan (Zen3) actually has NoNestedDataBp in CPUID.80000021.eax[0]
which allows us to not intercept #DB, so perhaps that might offer an
easier way of adjusting the interception logic.  (Or maybe not.  I can't
remember).

> +    }
> +
>   out:
>      vmx_vmcs_exit(v);
>  
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 02cc7a2023..9c37790c36 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>           */
>          break;
>  
> +    case EXIT_REASON_NOTIFY:
> +        __vmread(EXIT_QUALIFICATION, &exit_qualification);
> +
> +        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
> +        {
> +            perfc_incr(vmnotify_crash);
> +            gprintk(XENLOG_ERR, "invalid VM context after notify vmexit\n");
> +            domain_crash(v->domain);
> +            break;
> +        }
> +
> +        if ( cpu_has_vmx_vnmi &&
> +             (exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) )
> +        {
> +            unsigned long guest_info;
> +
> +            /* Exit was incident to an execution of IRET that unblocked NMIs. */
> +            __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
> +            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
> +                      guest_info | VMX_INTR_SHADOW_NMI);

I am saddened by how irritating it is having the UNBLOCKED_BY_IRET (in
the first place...) but moving between the exit qualification and the
vmexit intr info fields.  The constant probably ought to be renamed to
lose the INTR_INFO prefix.

I'd suggest a prereq patch to also break

static void undo_nmis_unblocked_by_iret(void)
{
    ...
}

out to avoid opencoding it in several places.  There's one other
instance in our code (general fault intercept), but we're buggy on
PML-full, APIC-access and EPT violation all of which Xen handles.

I don't think you need the vnmi check, because the bit is 0 otherwise.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
  2022-05-17 14:55   ` Roger Pau Monné
  2022-05-19  0:10   ` Andrew Cooper
@ 2022-05-19  6:50   ` Jan Beulich
  2022-05-19 12:44     ` Roger Pau Monné
  2 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2022-05-19  6:50 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Jun Nakajima, Kevin Tian, xen-devel

On 17.05.2022 15:21, Roger Pau Monne wrote:
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
>  static unsigned int __read_mostly ple_window = 4096;
>  integer_param("ple_window", ple_window);
>  
> +static int __read_mostly vm_notify_window;

__ro_after_init?

> @@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
>          rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
>                           VMX_MSR_GUEST_LOADONLY);
>  
> +    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )

The assumption then is that values >= 2^^31 are nonsense? Generally
I'd think we want to special case merely ~0u, giving the variable
unsigned type. However, I also don't see where you disable use of
the feature in that case: Merely skipping the VMCS update here isn't
enough, is it? The field itself doesn't know any special case
values (like ~0) as per the doc I'm looking at. So I guess the OR-ing
in of SECONDARY_EXEC_NOTIFY_VM_EXITING in vmx_init_vmcs_config()
wants to be conditional.

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>           */
>          break;
>  
> +    case EXIT_REASON_NOTIFY:
> +        __vmread(EXIT_QUALIFICATION, &exit_qualification);
> +
> +        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
> +        {
> +            perfc_incr(vmnotify_crash);

Is this a useful event to count? We don't count other crash causes,
iirc.

Jan



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19  0:10   ` Andrew Cooper
@ 2022-05-19  6:59     ` Jan Beulich
  2022-05-19  9:20       ` Andrew Cooper
  2022-05-19 14:45     ` Roger Pau Monné
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2022-05-19  6:59 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Jun Nakajima, Kevin Tian, Roger Pau Monne, xen-devel

On 19.05.2022 02:10, Andrew Cooper wrote:
> On 17/05/2022 14:21, Roger Pau Monne wrote:
>> Under certain conditions guests can get the CPU stuck in an infinite
>> loop without the possibility of an interrupt window to occur.
> 
> instruction boundary.
> 
> It's trivial to create an infinite loop without an interrupt window :)
> 
> Also, I'd probably phrase that as an unbounded loop, because not all
> problem cases are truly infinite.
> 
>>   This
>> was the case with the scenarios described in XSA-156.
> 
> Case in point, both of these can be broken by something else (another
> vCPU, or coherent DMA write) editing the IDT and e.g. making the #AC/#DB
> vectors not present, which will yield #NP instead.

"Can be broken" as in "the loop can be forced to be exited"? If so, how
would a remote CPU / agent become aware of the situation, and know what
the cause is (and hence know which IDT entry to clobber)? After all it's
guest state, which we wouldn't want to alter for no reason. Nor should
we put a guest in a state where #DF might eventually result.

Jan



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19  6:59     ` Jan Beulich
@ 2022-05-19  9:20       ` Andrew Cooper
  0 siblings, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2022-05-19  9:20 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, Julien Grall, Stefano Stabellini, Wei Liu,
	Jun Nakajima, Kevin Tian, Roger Pau Monne, xen-devel

On 19/05/2022 07:59, Jan Beulich wrote:
> On 19.05.2022 02:10, Andrew Cooper wrote:
>> On 17/05/2022 14:21, Roger Pau Monne wrote:
>>> Under certain conditions guests can get the CPU stuck in an infinite
>>> loop without the possibility of an interrupt window to occur.
>> instruction boundary.
>>
>> It's trivial to create an infinite loop without an interrupt window :)
>>
>> Also, I'd probably phrase that as an unbounded loop, because not all
>> problem cases are truly infinite.
>>
>>>   This
>>> was the case with the scenarios described in XSA-156.
>> Case in point, both of these can be broken by something else (another
>> vCPU, or coherent DMA write) editing the IDT and e.g. making the #AC/#DB
>> vectors not present, which will yield #NP instead.
> "Can be broken" as in "the loop can be forced to be exited"? If so, how
> would a remote CPU / agent become aware of the situation, and know what
> the cause is (and hence know which IDT entry to clobber)? After all it's
> guest state, which we wouldn't want to alter for no reason. Nor should
> we put a guest in a state where #DF might eventually result.

Well quite... "Can be broken" does not mean that this approach is a
viable security defence.

It does highlight that the loop only continues while there is no
perturbation to the memory accesses involved.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] x86/vmx: implement Bus Lock detection
  2022-05-18 22:50   ` Andrew Cooper
@ 2022-05-19 12:21     ` Roger Pau Monné
  2022-05-19 13:05       ` Andrew Cooper
  0 siblings, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2022-05-19 12:21 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Jun Nakajima, Kevin Tian, Jan Beulich, Wei Liu

On Wed, May 18, 2022 at 10:50:02PM +0000, Andrew Cooper wrote:
> On 17/05/2022 14:21, Roger Pau Monne wrote:
> > Add support for enabling Bus Lock Detection on Intel systems.  Such
> > detection works by triggering a vmexit, which is enough of a pause to
> > prevent a guest from abusing of the Bus Lock.
> 
> "which is enough of a" is a bit firmer than ideal.  "which Andy says
> will be ok" is perhaps more accurate.
> 
> Perhaps "which ought to be enough" ?
> 
> A buslock here or there is no problem, and non-malicious software
> appears to be devoid of buslocks (hardly surprising - it would be a hard
> error on other architectures), but a malicious piece of userspace can
> trivially cripple the system.
> 
> Forcing a vmexit on every buslock limits an attacker to one buslock per
> however long a vmentry/exit cycle takes.
> 
> > Add an extra perf counter to track the number of Bus Locks detected.
> 
> extra Xen perf counter.
> 
> Because other hypervisors use actual perf counters to emulate this
> ability on current hardware.  Maybe something we should consider...
> 
> > diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> > index d03e78bf0d..02cc7a2023 100644
> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> > @@ -4053,6 +4053,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
> >  
> >      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
> >          return vmx_failed_vmentry(exit_reason, regs);
> > +    if ( unlikely(exit_reason & VMX_EXIT_REASONS_BUS_LOCK) )
> > +    {
> > +        /*
> > +         * Delivery of Bus Lock VM exit was pre-empted by a higher priority VM
> > +         * exit.
> > +         */
> > +        exit_reason &= ~VMX_EXIT_REASONS_BUS_LOCK;
> > +        if ( exit_reason != EXIT_REASON_BUS_LOCK )
> > +            perfc_incr(buslock);
> 
> I'm pretty sure you can drop the if, and do the perfc_incr()
> unconditionally.  You won't get EXIT_REASON_BUS_LOCK |
> VMX_EXIT_REASONS_BUS_LOCK given that wording in the ISE.

I though the same, but got a EXIT_REASON_BUS_LOCK |
VMX_EXIT_REASONS_BUS_LOCK fairly easy by simply doing a xchg over a
cache line boundary.

I think at least on the model I'm testing it looks like
VMX_EXIT_REASONS_BUS_LOCK is added unconditionally, regardless of
whether the vmexit itself is already EXIT_REASON_BUS_LOCK.

> To test, Intel has PENDING_DBG which interferes with most easy attempts
> to create the condition, but how about this.
> 
> Load an LDT, misaligned across a cacheline boundary, and set #DB's %cs
> to LDT[0] with a clear access bit, then execute an `icebp` instruction. 
> The atomic write to set the access bit is a 4-byte access typically.
> 
> This should cause the #DB intercept to trigger on the same instantaneous
> boundary that generated the buslock.
> 
> Otherwise, LGTM.

If you agree with the above I will modify the commit message and
resend.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19  6:50   ` Jan Beulich
@ 2022-05-19 12:44     ` Roger Pau Monné
  2022-05-19 14:14       ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2022-05-19 12:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Jun Nakajima, Kevin Tian, xen-devel

On Thu, May 19, 2022 at 08:50:55AM +0200, Jan Beulich wrote:
> On 17.05.2022 15:21, Roger Pau Monne wrote:
> > --- a/xen/arch/x86/hvm/vmx/vmcs.c
> > +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> > @@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
> >  static unsigned int __read_mostly ple_window = 4096;
> >  integer_param("ple_window", ple_window);
> >  
> > +static int __read_mostly vm_notify_window;
> 
> __ro_after_init?

Yes, I tend to forget we have this now.

> > @@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
> >          rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
> >                           VMX_MSR_GUEST_LOADONLY);
> >  
> > +    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )
> 
> The assumption then is that values >= 2^^31 are nonsense? Generally
> I'd think we want to special case merely ~0u, giving the variable
> unsigned type.

OK, I really don't know whether >= 2^31 makes sense or not, I would
think that using such values the window would be too big to be
helpful.  In any case I don't see a point in preventing >= 2^31 so
will adjust the type and check.

> However, I also don't see where you disable use of
> the feature in that case: Merely skipping the VMCS update here isn't
> enough, is it? The field itself doesn't know any special case
> values (like ~0) as per the doc I'm looking at. So I guess the OR-ing
> in of SECONDARY_EXEC_NOTIFY_VM_EXITING in vmx_init_vmcs_config()
> wants to be conditional.

I've sent the disabling chunk as a followup, forgot to add it here,
sorry.

> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> > @@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
> >           */
> >          break;
> >  
> > +    case EXIT_REASON_NOTIFY:
> > +        __vmread(EXIT_QUALIFICATION, &exit_qualification);
> > +
> > +        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
> > +        {
> > +            perfc_incr(vmnotify_crash);
> 
> Is this a useful event to count? We don't count other crash causes,
> iirc.

I thought it was helpful information from an admin PoV, but maybe I'm
mistaken.  I know we don't count other crash reasons, but that doesn't
mean it won't be helpful to do so.  Given that users have to opt-in to
enable counters I suggest to leave the counter there.

Thanks, Roger.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] x86/vmx: implement Bus Lock detection
  2022-05-19 12:21     ` Roger Pau Monné
@ 2022-05-19 13:05       ` Andrew Cooper
  0 siblings, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2022-05-19 13:05 UTC (permalink / raw)
  To: Roger Pau Monné, Andrew Cooper
  Cc: xen-devel, Jun Nakajima, Kevin Tian, Jan Beulich, Wei Liu

On 19/05/2022 13:21, Roger Pau Monné wrote:
> On Wed, May 18, 2022 at 10:50:02PM +0000, Andrew Cooper wrote:
>> On 17/05/2022 14:21, Roger Pau Monne wrote:
>>> Add support for enabling Bus Lock Detection on Intel systems.  Such
>>> detection works by triggering a vmexit, which is enough of a pause to
>>> prevent a guest from abusing of the Bus Lock.
>> "which is enough of a" is a bit firmer than ideal.  "which Andy says
>> will be ok" is perhaps more accurate.
>>
>> Perhaps "which ought to be enough" ?
>>
>> A buslock here or there is no problem, and non-malicious software
>> appears to be devoid of buslocks (hardly surprising - it would be a hard
>> error on other architectures), but a malicious piece of userspace can
>> trivially cripple the system.
>>
>> Forcing a vmexit on every buslock limits an attacker to one buslock per
>> however long a vmentry/exit cycle takes.
>>
>>> Add an extra perf counter to track the number of Bus Locks detected.
>> extra Xen perf counter.
>>
>> Because other hypervisors use actual perf counters to emulate this
>> ability on current hardware.  Maybe something we should consider...
>>
>>> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
>>> index d03e78bf0d..02cc7a2023 100644
>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -4053,6 +4053,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>>>  
>>>      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
>>>          return vmx_failed_vmentry(exit_reason, regs);
>>> +    if ( unlikely(exit_reason & VMX_EXIT_REASONS_BUS_LOCK) )
>>> +    {
>>> +        /*
>>> +         * Delivery of Bus Lock VM exit was pre-empted by a higher priority VM
>>> +         * exit.
>>> +         */
>>> +        exit_reason &= ~VMX_EXIT_REASONS_BUS_LOCK;
>>> +        if ( exit_reason != EXIT_REASON_BUS_LOCK )
>>> +            perfc_incr(buslock);
>> I'm pretty sure you can drop the if, and do the perfc_incr()
>> unconditionally.  You won't get EXIT_REASON_BUS_LOCK |
>> VMX_EXIT_REASONS_BUS_LOCK given that wording in the ISE.
> I though the same, but got a EXIT_REASON_BUS_LOCK |
> VMX_EXIT_REASONS_BUS_LOCK fairly easy by simply doing a xchg over a
> cache line boundary.
>
> I think at least on the model I'm testing it looks like
> VMX_EXIT_REASONS_BUS_LOCK is added unconditionally, regardless of
> whether the vmexit itself is already EXIT_REASON_BUS_LOCK.

Hmm, in which case you've found either an SDP bug, or a documentation bug.

Lets follow up with Intel and try to identify which it is.

~Andrew


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19 12:44     ` Roger Pau Monné
@ 2022-05-19 14:14       ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2022-05-19 14:14 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Andrew Cooper, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Jun Nakajima, Kevin Tian, xen-devel

On 19.05.2022 14:44, Roger Pau Monné wrote:
> On Thu, May 19, 2022 at 08:50:55AM +0200, Jan Beulich wrote:
>> On 17.05.2022 15:21, Roger Pau Monne wrote:
>>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>>> @@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
>>>           */
>>>          break;
>>>  
>>> +    case EXIT_REASON_NOTIFY:
>>> +        __vmread(EXIT_QUALIFICATION, &exit_qualification);
>>> +
>>> +        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
>>> +        {
>>> +            perfc_incr(vmnotify_crash);
>>
>> Is this a useful event to count? We don't count other crash causes,
>> iirc.
> 
> I thought it was helpful information from an admin PoV, but maybe I'm
> mistaken.  I know we don't count other crash reasons, but that doesn't
> mean it won't be helpful to do so.  Given that users have to opt-in to
> enable counters I suggest to leave the counter there.

Just to be explicit: I don't mind the counter, I merely find its addition
inconsistent with what we've got.

Jan



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19  0:10   ` Andrew Cooper
  2022-05-19  6:59     ` Jan Beulich
@ 2022-05-19 14:45     ` Roger Pau Monné
  2022-05-20 10:08       ` Roger Pau Monné
  1 sibling, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2022-05-19 14:45 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, George Dunlap, Jan Beulich, Julien Grall,
	Stefano Stabellini, Wei Liu, Jun Nakajima, Kevin Tian

On Thu, May 19, 2022 at 12:10:24AM +0000, Andrew Cooper wrote:
> On 17/05/2022 14:21, Roger Pau Monne wrote:
> > Under certain conditions guests can get the CPU stuck in an infinite
> > loop without the possibility of an interrupt window to occur.
> 
> instruction boundary.
> 
> It's trivial to create an infinite loop without an interrupt window :)
> 
> Also, I'd probably phrase that as an unbounded loop, because not all
> problem cases are truly infinite.
> 
> >   This
> > was the case with the scenarios described in XSA-156.
> 
> Case in point, both of these can be broken by something else (another
> vCPU, or coherent DMA write) editing the IDT and e.g. making the #AC/#DB
> vectors not present, which will yield #NP instead.
> 
> >
> > Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
> > if no interrupt window occurs for a specified amount of time.  Note
> > that using the Notify VM Exit avoids having to trap #AC and #DB
> > exceptions, as Xen is guaranteed to get a VM Exit even if the guest
> > puts the CPU in a loop without an interrupt window, as such disable
> > the intercepts if the feature is available and enabled.
> >
> > Setting the notify VM exit window to 0 is safe because there's a
> > threshold added by the hardware in order to have a sane window value.
> >
> > Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > This change enables the notify VM exit by default, KVM however doesn't
> > seem to enable it by default, and there's the following note in the
> > commit message:
> >
> > "- There's a possibility, however small, that a notify VM exit happens
> >    with VM_CONTEXT_INVALID set in exit qualification. In this case, the
> >    vcpu can no longer run. To avoid killing a well-behaved guest, set
> >    notify window as -1 to disable this feature by default."
> >
> > It's not obviously clear to me whether the comment was meant to be:
> > "There's a possibility, however small, that a notify VM exit _wrongly_
> > happens with VM_CONTEXT_INVALID".
> 
> TBH, I read that as a get-out clause for "we have no idea what to set
> for a default window", and it's not a decision reasonable to defer to
> users, because they have even less of an idea than us.
> 
> All CPUs with Notify VM Exit have the TSC crystal information in CPUID,
> so I'd suggest that we trust CPUID to be accurate, and program for maybe
> 10us?  That's 1/3 of a default timeslice.
> 
> 
> 
> > It's also not clear whether such wrong hardware behavior only affects
> > a specific set of hardware, in a way that we could avoid enabling
> > notify VM exit there.
> >
> > There's a discussion in one of the Linux patches that 128K might be
> > the safer value in order to prevent false positives, but I have no
> > formal confirmation about this.  Maybe our Intel maintainers can
> > provide some more feedback on a suitable notify VM exit window
> > value.
> >
> > I've tested with 0 (the proposed default in the patch) and I don't
> > seem to be able to trigger notify VM exits under normal guest
> > operation.  Note that even in that case the guest won't be destroyed
> > unless the context is corrupt.
> 
> Huh... There's nothing in the manual about that, but obviously hardware
> has some minimum safe value if 0 appears to work in practice.

Got the tip from looking at the KVM patch submission for notify VM
exit implementation.

Seeing your suggestion above to use 10us and your reply here, I'm
unsure whether you are fine with using 0.

> > diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> > index d388e6729c..5685a5523e 100644
> > --- a/xen/arch/x86/hvm/vmx/vmcs.c
> > +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> > @@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
> >  static unsigned int __read_mostly ple_window = 4096;
> >  integer_param("ple_window", ple_window);
> >  
> > +static int __read_mostly vm_notify_window;
> > +integer_param("vm-notify-window", vm_notify_window);
> 
> Part of me is loath to keep on adding new top-level options for this.
> 
> I was about to suggest having a vmx= option, but I've just noticed that
> ple_{window,gap} are wired up to cmdline options on Intel, and fixed
> constants on AMD.

Do we want to make this VMX specific?  The problem affects both Intel
and AMD, so I would think it's possible for AMD to introduce a similar
solution in the future and hence we might want to share
"vm-notify-window".  That however raises questions as to whether AMD
will also allow specifying a window, and whether it will be in crystal
clock units.

> Thoughts on a suitable name?

vmx and svm would seem fine to me.

> > @@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
> >          rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
> >                           VMX_MSR_GUEST_LOADONLY);
> >  
> > +    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )
> > +    {
> > +        __vmwrite(NOTIFY_WINDOW, vm_notify_window);
> > +        /*
> > +         * Disable #AC and #DB interception: by using VM Notify Xen is
> > +         * guaranteed to get a VM exit even if the guest manages to lock the
> > +         * CPU.
> > +         */
> > +        v->arch.hvm.vmx.exception_bitmap &= ~((1U << TRAP_debug) |
> > +                                              (1U << TRAP_alignment_check));
> > +        vmx_update_exception_bitmap(v);
> 
> IIRC, it's not quite this easy.  There are conditions, e.g. attaching
> gdbsx, where #DB interception wants turning on/off dynamically, and the
> logic got simplified to nothing following XSA-156, so will need
> reintroducing.
> 
> AMD Milan (Zen3) actually has NoNestedDataBp in CPUID.80000021.eax[0]
> which allows us to not intercept #DB, so perhaps that might offer an
> easier way of adjusting the interception logic.  (Or maybe not.  I can't
> remember).

OK, will look into it.

> > +    }
> > +
> >   out:
> >      vmx_vmcs_exit(v);
> >  
> > diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> > index 02cc7a2023..9c37790c36 100644
> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> > @@ -4567,6 +4567,30 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
> >           */
> >          break;
> >  
> > +    case EXIT_REASON_NOTIFY:
> > +        __vmread(EXIT_QUALIFICATION, &exit_qualification);
> > +
> > +        if ( exit_qualification & NOTIFY_VM_CONTEXT_INVALID )
> > +        {
> > +            perfc_incr(vmnotify_crash);
> > +            gprintk(XENLOG_ERR, "invalid VM context after notify vmexit\n");
> > +            domain_crash(v->domain);
> > +            break;
> > +        }
> > +
> > +        if ( cpu_has_vmx_vnmi &&
> > +             (exit_qualification & INTR_INFO_NMI_UNBLOCKED_BY_IRET) )
> > +        {
> > +            unsigned long guest_info;
> > +
> > +            /* Exit was incident to an execution of IRET that unblocked NMIs. */
> > +            __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info);
> > +            __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
> > +                      guest_info | VMX_INTR_SHADOW_NMI);
> 
> I am saddened by how irritating it is having the UNBLOCKED_BY_IRET (in
> the first place...) but moving between the exit qualification and the
> vmexit intr info fields.  The constant probably ought to be renamed to
> lose the INTR_INFO prefix.
> 
> I'd suggest a prereq patch to also break
> 
> static void undo_nmis_unblocked_by_iret(void)
> {
>     ...
> }
> 
> out to avoid opencoding it in several places.  There's one other
> instance in our code (general fault intercept), but we're buggy on
> PML-full, APIC-access and EPT violation all of which Xen handles.

I don't think we are buggy on APIC-accesses, the SDM says:

"Executions of IRET may also incur VM exits due to APIC accesses and
EPT misconfigurations. These VM exits do not report information about
NMI unblocking due to IRET."

> I don't think you need the vnmi check, because the bit is 0 otherwise.

Right, the bit is only defined in the Xen case if vnmi is enabled,
because Xen requires PIN_BASED_NMI_EXITING to be set (NMI Exiting set
to 0 could also allow the bit to be 1).

Thanks, Roger.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] x86/vmx: implement Notify VM Exit
  2022-05-19 14:45     ` Roger Pau Monné
@ 2022-05-20 10:08       ` Roger Pau Monné
  0 siblings, 0 replies; 15+ messages in thread
From: Roger Pau Monné @ 2022-05-20 10:08 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: xen-devel, George Dunlap, Julien Grall, Stefano Stabellini,
	Wei Liu, Jun Nakajima, Kevin Tian

On Thu, May 19, 2022 at 04:45:20PM +0200, Roger Pau Monné wrote:
> On Thu, May 19, 2022 at 12:10:24AM +0000, Andrew Cooper wrote:
> > On 17/05/2022 14:21, Roger Pau Monne wrote:
> > > @@ -1333,6 +1338,19 @@ static int construct_vmcs(struct vcpu *v)
> > >          rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
> > >                           VMX_MSR_GUEST_LOADONLY);
> > >  
> > > +    if ( cpu_has_vmx_notify_vm_exiting && vm_notify_window >= 0 )
> > > +    {
> > > +        __vmwrite(NOTIFY_WINDOW, vm_notify_window);
> > > +        /*
> > > +         * Disable #AC and #DB interception: by using VM Notify Xen is
> > > +         * guaranteed to get a VM exit even if the guest manages to lock the
> > > +         * CPU.
> > > +         */
> > > +        v->arch.hvm.vmx.exception_bitmap &= ~((1U << TRAP_debug) |
> > > +                                              (1U << TRAP_alignment_check));
> > > +        vmx_update_exception_bitmap(v);
> > 
> > IIRC, it's not quite this easy.  There are conditions, e.g. attaching
> > gdbsx, where #DB interception wants turning on/off dynamically, and the
> > logic got simplified to nothing following XSA-156, so will need
> > reintroducing.
> > 
> > AMD Milan (Zen3) actually has NoNestedDataBp in CPUID.80000021.eax[0]
> > which allows us to not intercept #DB, so perhaps that might offer an
> > easier way of adjusting the interception logic.  (Or maybe not.  I can't
> > remember).
> 
> OK, will look into it.

So after taking a look, I think we need to modify vmx_update_debug_state() so it's:

void vmx_update_debug_state(struct vcpu *v)
{
    unsigned int mask = 1u << TRAP_int3;

    if ( v->arch.hvm.vmx.secondary_exec_control &
         SECONDARY_EXEC_NOTIFY_VM_EXITING )
        /*
         * Only allow toggling TRAP_debug if notify VM exit is enabled, as
         * unconditionally setting TRAP_debug is part of the XSA-156 fix.
         */
        mask |= 1u << TRAP_debug;

    if ( v->arch.hvm.debug_state_latch )
        v->arch.hvm.vmx.exception_bitmap |= mask;
    else
        v->arch.hvm.vmx.exception_bitmap &= ~mask;

[...]

I'm however confused by the usage of cpu_has_monitor_trap_flag
previous to XSA-156, which was:

void vmx_update_debug_state(struct vcpu *v)
{
    unsigned long mask;

    mask = 1u << TRAP_int3;
    if ( !cpu_has_monitor_trap_flag )
        mask |= 1u << TRAP_debug;

Was it fine to not set TRAP_debug only if cpu_has_monitor_trap_flag
is supported by the CPU? (even if not currently set on
secondary_exec_control)?

Thanks, Roger.


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-05-20 10:09 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-17 13:21 [PATCH 0/2] x86/vmx: implement Bus Lock and VM Notify Roger Pau Monne
2022-05-17 13:21 ` [PATCH 1/2] x86/vmx: implement Bus Lock detection Roger Pau Monne
2022-05-18 22:50   ` Andrew Cooper
2022-05-19 12:21     ` Roger Pau Monné
2022-05-19 13:05       ` Andrew Cooper
2022-05-17 13:21 ` [PATCH 2/2] x86/vmx: implement Notify VM Exit Roger Pau Monne
2022-05-17 14:55   ` Roger Pau Monné
2022-05-19  0:10   ` Andrew Cooper
2022-05-19  6:59     ` Jan Beulich
2022-05-19  9:20       ` Andrew Cooper
2022-05-19 14:45     ` Roger Pau Monné
2022-05-20 10:08       ` Roger Pau Monné
2022-05-19  6:50   ` Jan Beulich
2022-05-19 12:44     ` Roger Pau Monné
2022-05-19 14:14       ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.