* [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus
@ 2012-06-13  8:05 Liu, Jinsong
  2012-06-13  8:53 ` Jan Beulich
  0 siblings, 1 reply; 5+ messages in thread
From: Liu, Jinsong @ 2012-06-13  8:05 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Shan, Haitao, Jiang, Yunhong, Keir Fraser,
	'xen-devel@lists.xensource.com',
	Zhang, Xiantao

Xen vMCE bugfix: inject vMCE# to all vcpus

In our testing of Win8 guest MCE, we found a bug: no matter what SRAO/SRAR
error Xen injects into the Win8 guest, the guest always reboots.

The root cause is that the current Xen vMCE logic injects vMCE# only to vcpu0,
which is not correct for Intel MCE (under the Intel architecture, hardware
generates MCE# to all CPUs).

This patch fixes the vMCE injection bug by injecting vMCE# to all vcpus.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>

diff -r 19c15f3dfe1f xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Tue Jun 05 03:18:00 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Wed Jun 13 23:40:45 2012 +0800
@@ -167,7 +167,6 @@
 
 int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
         uint64_t gstatus);
-int inject_vmce(struct domain *d);
 int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global);
 
 extern int vmce_init(struct cpuinfo_x86 *c);
diff -r 19c15f3dfe1f xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Tue Jun 05 03:18:00 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Wed Jun 13 23:40:45 2012 +0800
@@ -638,6 +638,32 @@
     return rec;
 }
 
+static int inject_vmce(struct domain *d)
+{
+    struct vcpu *v;
+
+    /* inject vMCE to all vcpus */
+    for_each_vcpu(d, v)
+    {
+        if ( !test_and_set_bool(v->mce_pending) &&
+            ((d->is_hvm) ? 1 :
+            guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) )
+        {
+            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d vcpu%d\n",
+                       d->domain_id, v->vcpu_id);
+            vcpu_kick(v);
+        }
+        else
+        {
+            mce_printk(MCE_QUIET, "Fail to inject vMCE to dom%d vcpu%d\n",
+                       d->domain_id, v->vcpu_id);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
              enum mce_result *result,
@@ -718,11 +744,8 @@
 
                 /* We will inject vMCE to DOMU*/
                 if ( inject_vmce(d) < 0 )
-                {
-                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                      " failed\n", d->domain_id);
                     goto vmce_failed;
-                }
+
                 /* Impacted domain go on with domain's recovery job
                  * if the domain has its own MCA handler.
                  * For xen, it has contained the error and finished
diff -r 19c15f3dfe1f xen/arch/x86/cpu/mcheck/vmce.c
--- a/xen/arch/x86/cpu/mcheck/vmce.c	Tue Jun 05 03:18:00 2012 +0800
+++ b/xen/arch/x86/cpu/mcheck/vmce.c	Wed Jun 13 23:40:45 2012 +0800
@@ -389,53 +389,6 @@
 HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
                           vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
 
-int inject_vmce(struct domain *d)
-{
-    int cpu = smp_processor_id();
-
-    /* PV guest and HVM guest have different vMCE# injection methods. */
-    if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
-    {
-        if ( d->is_hvm )
-        {
-            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
-                       d->domain_id);
-            vcpu_kick(d->vcpu[0]);
-        }
-        else
-        {
-            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
-                       d->domain_id);
-            if ( guest_has_trap_callback(d, 0, TRAP_machine_check) )
-            {
-                cpumask_copy(d->vcpu[0]->cpu_affinity_tmp,
-                             d->vcpu[0]->cpu_affinity);
-                mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n",
-                           cpu, d->vcpu[0]->processor);
-                vcpu_set_affinity(d->vcpu[0], cpumask_of(cpu));
-                vcpu_kick(d->vcpu[0]);
-            }
-            else
-            {
-                mce_printk(MCE_VERBOSE,
-                           "MCE: Kill PV guest with No MCE handler\n");
-                domain_crash(d);
-            }
-        }
-    }
-    else
-    {
-        /* new vMCE comes while first one has not been injected yet,
-         * in this case, inject fail. [We can't lose this vMCE for
-         * the mce node's consistency].
-         */
-        mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
-                   " to this DOM%d!\n", d->domain_id);
-        return -1;
-    }
-    return 0;
-}
-
 /* This node list records errors impacting a domain. when one
  * MCE# happens, one error bank impacts a domain. This error node
  * will be inserted to the tail of the per_dom data for vMCE# MSR


* Re: [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus
  2012-06-13  8:05 [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus Liu, Jinsong
@ 2012-06-13  8:53 ` Jan Beulich
  2012-06-13 10:49   ` Liu, Jinsong
  0 siblings, 1 reply; 5+ messages in thread
From: Jan Beulich @ 2012-06-13  8:53 UTC (permalink / raw)
  To: Jinsong Liu
  Cc: Keir Fraser, Yunhong Jiang, Haitao Shan, Xiantao Zhang, xen-devel

>>> On 13.06.12 at 10:05, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Xen vMCE bugfix: inject vMCE# to all vcpus
> 
> In our test for win8 guest mce, we find a bug in that no matter what 
> SRAO/SRAR
> error xen inject to win8 guest, it always reboot.
> 
> The root cause is, current Xen vMCE logic inject vMCE# only to vcpu0, this 
> is
> not correct for Intel MCE (Under Intel arch, h/w generate MCE# to all CPUs).
> 
> This patch fix vMCE injection bug, injecting vMCE# to all vcpus.

I see no correlation between the fix (and its description) and the
problem at hand: Why would Win8 reboot if it doesn't receive a
particular MCE on all CPUs? Isn't that model specific behavior?

Furthermore I doubt that an MCE on one socket indeed causes
MCE-s on all other sockets, not to speak of distinct NUMA nodes
(it would already surprise me if MCE-s got broadcast across cores
within a socket, unless they are caused by a resource shared
across cores).

> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Tue Jun 05 03:18:00 2012 +0800
> +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Wed Jun 13 23:40:45 2012 +0800
> @@ -638,6 +638,32 @@
>      return rec;
>  }
>  
> +static int inject_vmce(struct domain *d)

Is it really necessary to move this vendor independent function
into a vendor specific source file?

> +{
> +    struct vcpu *v;
> +
> +    /* inject vMCE to all vcpus */
> +    for_each_vcpu(d, v)
> +    {
> +        if ( !test_and_set_bool(v->mce_pending) &&
> +            ((d->is_hvm) ? 1 :
> +            guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) )

Quite strange a way to say

            (d->is_hvm || guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check))
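
With that, the check inside the loop would read roughly as follows (a sketch
only, assuming the rest of the patch stays as posted):

            if ( !test_and_set_bool(v->mce_pending) &&
                 (d->is_hvm ||
                  guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) )
            {
                mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d vcpu%d\n",
                           d->domain_id, v->vcpu_id);
                vcpu_kick(v);
            }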

> +        {
> +            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d vcpu%d\n",
> +                       d->domain_id, v->vcpu_id);
> +            vcpu_kick(v);
> +        }
> +        else
> +        {
> +            mce_printk(MCE_QUIET, "Fail to inject vMCE to dom%d vcpu%d\n",
> +                       d->domain_id, v->vcpu_id);
> +            return -1;

Why do you bail here? This is particularly bad if v->mce_pending
was already set on some vCPU (as that could simply mean the guest
just didn't get around to handling the vMCE yet).

> +        }
> +    }
> +
> +    return 0;
> +}
> +
>  static void intel_memerr_dhandler(
>               struct mca_binfo *binfo,
>               enum mce_result *result,

Also, how does this whole change interact with vmce_{rd,wr}msr()?
The struct bank_entry instances live on a per-domain list, so the
vMCE being delivered to all vCPU-s means they will all race for the
single entry (and might erroneously access others, particularly in
vmce_wrmsr()'s MCG_STATUS handling).
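
To illustrate the concern with hypothetical, simplified structures (not the
actual vmce.c ones): with a single per-domain list, every vcpu that handles
the broadcast vMCE effectively ends up consuming the same head entry, e.g.:

    /* hypothetical, simplified -- not the real vmce.c data structures */
    struct bank_entry {
        struct list_head list;
        uint64_t mci_status;
    };

    /* roughly what each vcpu would do when the guest clears MCG_STATUS;
     * with the vMCE now pending on N vcpus, N callers race here for the
     * single entry queued by the error handler */
    static void consume_head_entry(struct list_head *impact_header)
    {
        struct bank_entry *entry;

        entry = list_entry(impact_header->next, struct bank_entry, list);
        list_del(&entry->list);
        xfree(entry);
    }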

Jan


* Re: [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus
  2012-06-13  8:53 ` Jan Beulich
@ 2012-06-13 10:49   ` Liu, Jinsong
  2012-06-13 11:41     ` Jan Beulich
  0 siblings, 1 reply; 5+ messages in thread
From: Liu, Jinsong @ 2012-06-13 10:49 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, Jiang, Yunhong, Shan, Haitao, Zhang, Xiantao, xen-devel

Jan Beulich wrote:
>>>> On 13.06.12 at 10:05, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> Xen vMCE bugfix: inject vMCE# to all vcpus
>> 
>> In our test for win8 guest mce, we find a bug in that no matter what
>> SRAO/SRAR error xen inject to win8 guest, it always reboot.
>> 
>> The root cause is, current Xen vMCE logic inject vMCE# only to
>> vcpu0, this is not correct for Intel MCE (Under Intel arch, h/w
>> generate MCE# to all CPUs). 
>> 
>> This patch fix vMCE injection bug, injecting vMCE# to all vcpus.
> 
> I see no correlation between the fix (and its description) and the
> problem at hand: Why would Win8 reboot if it doesn't receive a
> particular MCE on all CPUs? Isn't that model specific behavior?

It's not model specific. For Intel processors, MCE# is broadcast to all logical processors on systems that support UCR errors (refer to Intel SDM 15.9.3.1 & 15.9.3.2). So for vMCE injection under the Intel architecture, it's a bug to inject vMCE# only to vcpu0.

As for why Win8 reboots, I guess it needs to wait for all CPUs to enter the MCE handler before going ahead, and if that times out it reboots. I have no Win8 code, so I can only infer this, but our test results support it.
(It's reasonable for Win8 to do so: per Intel's SDM, because the machine check MSR resources are potentially shared among the logical processors on the same package/core, the MCE handler may be required to synchronize with the other processors that received a machine check error and to serialize access to the machine check registers when analyzing, logging and clearing the information in them.)

> 
> Furthermore I doubt that an MCE on one socket indeed causes
> MCE-s on all other sockets, not to speak of distinct NUMA nodes
> (it would already surprise me if MCE-s got broadcast across cores
> within a socket, unless they are caused by a resource shared
> across cores).

I partly agree. But at least as the Intel SDM currently stands, architecturally it is broadcast.

> 
>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Tue Jun 05 03:18:00 2012
>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Wed Jun 13 23:40:45
>>      2012 +0800 @@ -638,6 +638,32 @@ return rec;
>>  }
>> 
>> +static int inject_vmce(struct domain *d)
> 
> Is it really necessary to move this vendor independent function
> into a vendor specific source file?

For AMD, I'm not quite sure whether injecting vMCE to all vcpus is proper for its architecture. I recall AMD uses NMI / a single MCE# / broadcast MCE# on its different platforms.

BTW, in the current Xen MCE logic inject_vmce() is only used by Intel MCE, so I put it into the Intel MCE code to avoid potential issues. Intel MCE and AMD MCE differ in some points (including MCE# injection), so we'd better not lump them together. Only if we are totally sure some logic is safe to share between Intel and AMD should we put it into common MCE code.

> 
>> +{
>> +    struct vcpu *v;
>> +
>> +    /* inject vMCE to all vcpus */
>> +    for_each_vcpu(d, v)
>> +    {
>> +        if ( !test_and_set_bool(v->mce_pending) &&
>> +            ((d->is_hvm) ? 1 :
>> +            guest_has_trap_callback(d, v->vcpu_id,
>> TRAP_machine_check)) ) 
> 
> Quite strange a way to say
> 
>             (d->is_hvm || guest_has_trap_callback(d, v->vcpu_id,
> TRAP_machine_check)) 

For HVM it's OK to just set v->mce_pending, while for PV it also needs to check whether the guest callback has been registered.

> 
>> +        {
>> +            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d
>> vcpu%d\n", +                       d->domain_id, v->vcpu_id);
>> +            vcpu_kick(v);
>> +        }
>> +        else
>> +        {
>> +            mce_printk(MCE_QUIET, "Fail to inject vMCE to dom%d
>> vcpu%d\n", +                       d->domain_id, v->vcpu_id);
>> +            return -1;
> 
> Why do you bail here? This is particularly bad if v->mce_pending
> was already set on some vCPU (as that could simply mean the guest
> just didn't get around to handle the vMCE yet).

The case where v->mce_pending is already set when a new vMCE arrives will result in a domain crash.
I don't think the guest can still handle this case; e.g. on bare metal, if a 2nd MCE occurs while the 1st has not been handled yet, the OS reboots directly.

> 
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static void intel_memerr_dhandler(
>>               struct mca_binfo *binfo,
>>               enum mce_result *result,
> 
> Also, how does this whole change interact with vmce_{rd,wr}msr()?
> The struct bank_entry instances live on a per-domain list, so the
> vMCE being delivered to all vCPU-s means they will all race for the
> single entry (and might erroneously access others, particularly in
> vmce_wrmsr()'s MCG_STATUS handling).

Yes, I agree.
However, we have recently done a vMCE redesign (currently under internal review) and will present it soon. In the new approach the MSRs are per-vcpu instead of per-domain, so the MSR race issue (and the effort to clean it up) becomes moot. For this bugfix patch, I only touch vMCE# injection itself.

Thanks,
Jinsong


* Re: [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus
  2012-06-13 10:49   ` Liu, Jinsong
@ 2012-06-13 11:41     ` Jan Beulich
  2012-06-13 12:37       ` Liu, Jinsong
  0 siblings, 1 reply; 5+ messages in thread
From: Jan Beulich @ 2012-06-13 11:41 UTC (permalink / raw)
  To: Jinsong Liu
  Cc: Keir Fraser, Yunhong Jiang, Haitao Shan, Xiantao Zhang, xen-devel

>>> On 13.06.12 at 12:49, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> Jan Beulich wrote:
>>>>> On 13.06.12 at 10:05, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>>> Xen vMCE bugfix: inject vMCE# to all vcpus
>>> 
>>> In our test for win8 guest mce, we find a bug in that no matter what
>>> SRAO/SRAR error xen inject to win8 guest, it always reboot.
>>> 
>>> The root cause is, current Xen vMCE logic inject vMCE# only to
>>> vcpu0, this is not correct for Intel MCE (Under Intel arch, h/w
>>> generate MCE# to all CPUs). 
>>> 
>>> This patch fix vMCE injection bug, injecting vMCE# to all vcpus.
>> 
>> I see no correlation between the fix (and its description) and the
>> problem at hand: Why would Win8 reboot if it doesn't receive a
>> particular MCE on all CPUs? Isn't that model specific behavior?
> 
> It's not model specific. For Intel processor, MCE# is broadcast to all 
> logical processors on the system on which the UCR errors are supported (refer 
> Intel SDM 15.9.3.1 & 15.9.3.2). So for vMCE injection under Intel arch, it's a 
> bug if inject vMCE# only to vcpu0.

This is for a certain group of errors, but I can't find any statement
that this is general architectural behavior.

>> Furthermore I doubt that an MCE on one socket indeed causes
>> MCE-s on all other sockets, not to speak of distinct NUMA nodes
>> (it would already surprise me if MCE-s got broadcast across cores
>> within a socket, unless they are caused by a resource shared
>> across cores).
> 
> Somehow I agree. But at least currently from Intel SDM, architecturally it 
> would broadcast.

I can't even see how this would work reliably across packages or
nodes: Suppose two entities each encounter a UCR - they would
each signal the other one, and the handling of this signal would
need to be synchronized with the handling of the local UCR
(irrespective of the order in which the events arrive).

>>> +        if ( !test_and_set_bool(v->mce_pending) &&
>>> +            ((d->is_hvm) ? 1 :
>>> +            guest_has_trap_callback(d, v->vcpu_id,
>>> TRAP_machine_check)) ) 
>> 
>> Quite strange a way to say
>> 
>>             (d->is_hvm || guest_has_trap_callback(d, v->vcpu_id,
>> TRAP_machine_check)) 
> 
> For hvm it's OK to just set v->mce_pending. While for pv it need also check if 
> guest callback has been registered.

Sure. I was just asking why you use a conditional operator when
the simpler || would do.

>>> +        {
>>> +            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d
>>> vcpu%d\n", +                       d->domain_id, v->vcpu_id);
>>> +            vcpu_kick(v);
>>> +        }
>>> +        else
>>> +        {
>>> +            mce_printk(MCE_QUIET, "Fail to inject vMCE to dom%d
>>> vcpu%d\n", +                       d->domain_id, v->vcpu_id);
>>> +            return -1;
>> 
>> Why do you bail here? This is particularly bad if v->mce_pending
>> was already set on some vCPU (as that could simply mean the guest
>> just didn't get around to handle the vMCE yet).
> 
> the case v->mce_pending already set while new vmce come will result in domain 
> crash.
> I don't think guest can still handle this case, e.g. under baremetal if 2nd 
> mce occur while 1st mce have not been handled os will reboot directly.

Sorry, no. This goes back to the questionable nature of the
broadcasting above - if a CPU encounters a UCR itself and gets
signaled of a remote CPU having encountered one, it should be
able to cope. Otherwise the risk of (pointless!) system death
would increase with the number of CPUs.
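
One possible shape of the loop that treats an already-pending vMCE as
non-fatal (an illustration only, not a concrete proposal) would be:

    for_each_vcpu(d, v)
    {
        if ( !(d->is_hvm ||
               guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) )
            return -1;                 /* guest can't take a vMCE at all */

        if ( !test_and_set_bool(v->mce_pending) )
            vcpu_kick(v);              /* newly pending: deliver it */
        /* else: a vMCE is already pending on this vcpu -- nothing to do */
    }

    return 0;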

>> Also, how does this whole change interact with vmce_{rd,wr}msr()?
>> The struct bank_entry instances live on a per-domain list, so the
>> vMCE being delivered to all vCPU-s means they will all race for the
>> single entry (and might erroneously access others, particularly in
>> vmce_wrmsr()'s MCG_STATUS handling).
> 
> Yes, I agree.
> However, recently we have done vMCE re-design work (ongoing some internal 
> review) and would present sooner. In new approach, MSRs are per-vcpu instead 
> of per-domain.  MSRs race issue (and the effort to make it clean) would be 
> pointless then. So at this butfix patch, I only touch vMCE# injection itself.

I understand that you want the simpler version as bug fix, but
you didn't answer whether it was at all verified that the single
per-domain entry logic would actually cope with the broadcast
delivery logic you're adding here. I'm afraid the per-vCPU entry
logic is a prerequisite to this change (in whatever form it may end
up going in).

Jan


* Re: [PATCH] Xen vMCE bugfix: inject vMCE# to all vcpus
  2012-06-13 11:41     ` Jan Beulich
@ 2012-06-13 12:37       ` Liu, Jinsong
  0 siblings, 0 replies; 5+ messages in thread
From: Liu, Jinsong @ 2012-06-13 12:37 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, Jiang, Yunhong, Shan, Haitao, Zhang, Xiantao, xen-devel

Jan Beulich wrote:
>>>> On 13.06.12 at 12:49, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> Jan Beulich wrote:
>>>>>> On 13.06.12 at 10:05, "Liu, Jinsong" <jinsong.liu@intel.com>
>>>>>> wrote: 
>>>> Xen vMCE bugfix: inject vMCE# to all vcpus
>>>> 
>>>> In our test for win8 guest mce, we find a bug in that no matter
>>>> what SRAO/SRAR error xen inject to win8 guest, it always reboot.
>>>> 
>>>> The root cause is, current Xen vMCE logic inject vMCE# only to
>>>> vcpu0, this is not correct for Intel MCE (Under Intel arch, h/w
>>>> generate MCE# to all CPUs). 
>>>> 
>>>> This patch fix vMCE injection bug, injecting vMCE# to all vcpus.
>>> 
>>> I see no correlation between the fix (and its description) and the
>>> problem at hand: Why would Win8 reboot if it doesn't receive a
>>> particular MCE on all CPUs? Isn't that model specific behavior?
>> 
>> It's not model specific. For Intel processor, MCE# is broadcast to
>> all logical processors on the system on which the UCR errors are
>> supported (refer Intel SDM 15.9.3.1 & 15.9.3.2). So for vMCE
>> injection under Intel arch, it's a bug if inject vMCE# only to vcpu0.
> 
> This is for a certain group of errors, but I can't find any statement
> that this is general architectural behavior.

Xen MCE only injects SRAO/SRAR errors into the guest, and both belong to the UCR error class.
For other errors like CE and UCNA, the hypervisor just raises a virq to dom0's mcelog for logging.

> 
>>> Furthermore I doubt that an MCE on one socket indeed causes
>>> MCE-s on all other sockets, not to speak of distinct NUMA nodes
>>> (it would already surprise me if MCE-s got broadcast across cores
>>> within a socket, unless they are caused by a resource shared
>>> across cores).
>> 
>> Somehow I agree. But at least currently from Intel SDM,
>> architecturally it would broadcast.
> 
> I can't even see how this would work reliably across packages or
> nodes: Suppose two entities each encounter a UCR - they would
> each signal the other one, and the handling of this signal would
> need to be synchronized with the handling of the local UCR
> (irrespective of the order in which the events arrive).

I don't think that is how the hardware actually works, though I don't have all the details.

Basically, a processor is just a 'report point' (via a bank) indicating what error occurred. For example, an error is (mostly) generated at memory and travels along a path, e.g. memory --> memory controller --> QPI --> cache controller --> ... --> ... --> CPU core. It's not one processor signaling the other CPUs. An MCA mechanism monitors the different units, decides when and which errors get reported, and notifies all processors (that's what broadcast means).

> 
>>>> +        if ( !test_and_set_bool(v->mce_pending) &&
>>>> +            ((d->is_hvm) ? 1 :
>>>> +            guest_has_trap_callback(d, v->vcpu_id,
>>>> TRAP_machine_check)) )
>>> 
>>> Quite strange a way to say
>>> 
>>>             (d->is_hvm || guest_has_trap_callback(d, v->vcpu_id,
>>> TRAP_machine_check))
>> 
>> For hvm it's OK to just set v->mce_pending. While for pv it need
>> also check if guest callback has been registered.
> 
> Sure. I was just asking why you use a conditional operator when
> the simpler || would do.

Ah, I see. That's better.

> 
>>>> +        {
>>>> +            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to dom%d
>>>> vcpu%d\n", +                       d->domain_id, v->vcpu_id); +   
>>>> vcpu_kick(v); +        }
>>>> +        else
>>>> +        {
>>>> +            mce_printk(MCE_QUIET, "Fail to inject vMCE to dom%d
>>>> vcpu%d\n", +                       d->domain_id, v->vcpu_id);
>>>> +            return -1;
>>> 
>>> Why do you bail here? This is particularly bad if v->mce_pending
>>> was already set on some vCPU (as that could simply mean the guest
>>> just didn't get around to handle the vMCE yet).
>> 
>> the case v->mce_pending already set while new vmce come will result
>> in domain crash. I don't think guest can still handle this case,
>> e.g. under baremetal if 2nd mce occur while 1st mce have not been
>> handled os will reboot directly. 
> 
> Sorry, no. This goes back to the questionable nature of the
> broadcasting above - if a CPU encounters a UCR itself and gets
> signaled of a remote CPU having encountered one, it should be
> able to cope. Otherwise the risk of (pointless!) system death
> would increase with the number of CPUs.
> 
>>> Also, how does this whole change interact with vmce_{rd,wr}msr()?
>>> The struct bank_entry instances live on a per-domain list, so the
>>> vMCE being delivered to all vCPU-s means they will all race for the
>>> single entry (and might erroneously access others, particularly in
>>> vmce_wrmsr()'s MCG_STATUS handling).
>> 
>> Yes, I agree.
>> However, recently we have done vMCE re-design work (ongoing some
>> internal review) and would present sooner. In new approach, MSRs are
>> per-vcpu instead of per-domain.  MSRs race issue (and the effort to
>> make it clean) would be pointless then. So at this butfix patch, I
>> only touch vMCE# injection itself. 
> 
> I understand that you want the simpler version as bug fix, but
> you didn't answer whether it was at all verified that the single
> per-domain entry logic would actually cope with the broadcast
> delivery logic you're adding here. I'm afraid the per-vCPU entry
> logic is a prerequisite to this change (in whatever form it may end
> up going in).
> 
> Jan

Agreed, let's hold it. After we present the new vMCE approach, we can do it then, with no race issue.

Thanks,
Jinsong

