From: "Jan Beulich" <JBeulich@suse.com>
To: "xen-devel" <xen-devel@lists.xenproject.org>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>, Wei Liu <wl@xen.org>,
	Roger Pau Monne <roger.pau@citrix.com>
Subject: [Xen-devel] [PATCH 1/4] x86/mcheck: allow varying bank counts per CPU
Date: Fri, 14 Jun 2019 09:35:58 -0600	[thread overview]
Message-ID: <5D03BEDE0200007800238613@prv1-mh.provo.novell.com> (raw)
In-Reply-To: <5D03BE5102000078002385FE@prv1-mh.provo.novell.com>

Up to now we've been assuming that all CPUs would have the same number
of reporting banks. However, on upcoming AMD CPUs this isn't the case,
and one can observe

(XEN) mce.c:666: Different bank number on cpu <N>

indicating that Machine Check support would not be enabled on the
affected CPUs. Convert the count variable to a per-CPU one, and adjust
code where needed to cope with the values not being the same. In
particular, the mcabanks_alloc() invocations during AP bringup now need
to allocate maximum-size bitmaps, because the truly needed size can't
be known until we actually execute on that CPU, yet mcheck_init() gets
called too early to do any allocations itself.
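
As an illustrative sketch (not part of the patch) of the per-CPU pattern
being converted to, using Xen's per-CPU primitives; the helper name
example_bank_count() is made up for the example:

    /* One bank-count instance per CPU, written once and mostly read. */
    DEFINE_PER_CPU_READ_MOSTLY(unsigned int, nr_mce_banks);

    static unsigned int example_bank_count(unsigned int cpu)
    {
        /* Access the executing CPU's own instance ... */
        if ( cpu == smp_processor_id() )
            return this_cpu(nr_mce_banks);

        /* ... or that of an explicitly named remote CPU. */
        return per_cpu(nr_mce_banks, cpu);
    }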

Take the liberty and also
- make mca_cap_init() static,
- replace several __get_cpu_var() uses where a local variable suitable
  for use with per_cpu() appears (see the sketch after this list),
- correct which CPU's cpu_data[] entry x86_mc_msrinject_verify() uses,
- replace a BUG() by panic().
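
The sketch referred to above, drawn from the clear_cmci() change further
down (fragments shown out of context, for illustration only):

    /* Before: implicit reference to the executing CPU's instance. */
    mcabanks_clear(i, __get_cpu_var(mce_banks_owned));

    /*
     * After: explicit reference through a local CPU index, which can
     * then also replace further smp_processor_id() invocations.
     */
    unsigned int cpu = smp_processor_id();

    mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));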

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -34,7 +34,7 @@ bool __read_mostly opt_mce = true;
 boolean_param("mce", opt_mce);
 bool __read_mostly mce_broadcast;
 bool is_mc_panic;
-unsigned int __read_mostly nr_mce_banks;
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, nr_mce_banks);
 unsigned int __read_mostly firstbank;
 uint8_t __read_mostly cmci_apic_vector;
 
@@ -120,7 +120,7 @@ void mce_recoverable_register(mce_recove
     mc_recoverable_scan = cbfunc;
 }
 
-struct mca_banks *mcabanks_alloc(void)
+struct mca_banks *mcabanks_alloc(unsigned int nr_mce_banks)
 {
     struct mca_banks *mb;
 
@@ -128,6 +128,13 @@ struct mca_banks *mcabanks_alloc(void)
     if ( !mb )
         return NULL;
 
+    /*
+     * For APs, allocations get done by the BSP, i.e. when the bank count
+     * may not be known yet. A zero bank count is a clear indication of this.
+     */
+    if ( !nr_mce_banks )
+        nr_mce_banks = MCG_CAP_COUNT;
+
     mb->bank_map = xzalloc_array(unsigned long,
                                  BITS_TO_LONGS(nr_mce_banks));
     if ( !mb->bank_map )
@@ -319,7 +326,7 @@ mcheck_mca_logout(enum mca_source who, s
      */
     recover = mc_recoverable_scan ? 1 : 0;
 
-    for ( i = 0; i < nr_mce_banks; i++ )
+    for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
     {
         /* Skip bank if corresponding bit in bankmask is clear */
         if ( !mcabanks_test(i, bankmask) )
@@ -565,7 +572,7 @@ void mcheck_mca_clearbanks(struct mca_ba
 {
     int i;
 
-    for ( i = 0; i < nr_mce_banks; i++ )
+    for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
     {
         if ( !mcabanks_test(i, bankmask) )
             continue;
@@ -638,54 +645,56 @@ static void set_poll_bankmask(struct cpu
 
     if ( cmci_support && opt_mce )
     {
-        mb->num = per_cpu(no_cmci_banks, cpu)->num;
-        bitmap_copy(mb->bank_map, per_cpu(no_cmci_banks, cpu)->bank_map,
-                    nr_mce_banks);
+        const struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
+
+        if ( unlikely(cmci->num < mb->num) )
+            bitmap_fill(mb->bank_map, mb->num);
+        bitmap_copy(mb->bank_map, cmci->bank_map, min(mb->num, cmci->num));
     }
     else
     {
-        bitmap_copy(mb->bank_map, mca_allbanks->bank_map, nr_mce_banks);
+        bitmap_copy(mb->bank_map, mca_allbanks->bank_map,
+                    per_cpu(nr_mce_banks, cpu));
         if ( mce_firstbank(c) )
             mcabanks_clear(0, mb);
     }
 }
 
 /* The perbank ctl/status init is platform specific because of AMD's quirk */
-int mca_cap_init(void)
+static int mca_cap_init(void)
 {
     uint64_t msr_content;
+    unsigned int nr, cpu = smp_processor_id();
 
     rdmsrl(MSR_IA32_MCG_CAP, msr_content);
 
     if ( msr_content & MCG_CTL_P ) /* Control register present ? */
         wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
 
-    if ( nr_mce_banks && (msr_content & MCG_CAP_COUNT) != nr_mce_banks )
-    {
-        dprintk(XENLOG_WARNING, "Different bank number on cpu %x\n",
-                smp_processor_id());
-        return -ENODEV;
-    }
-    nr_mce_banks = msr_content & MCG_CAP_COUNT;
+    per_cpu(nr_mce_banks, cpu) = nr = MASK_EXTR(msr_content, MCG_CAP_COUNT);
 
-    if ( !nr_mce_banks )
+    if ( !nr )
     {
-        printk(XENLOG_INFO "CPU%u: No MCE banks present. "
-               "Machine check support disabled\n", smp_processor_id());
+        printk(XENLOG_INFO
+               "CPU%u: No MCE banks present. Machine check support disabled\n",
+               cpu);
         return -ENODEV;
     }
 
     /* mcabanks_alloc depends on nr_mce_banks */
-    if ( !mca_allbanks )
+    if ( !mca_allbanks || nr > mca_allbanks->num )
     {
-        int i;
+        unsigned int i;
+        struct mca_banks *all = mcabanks_alloc(nr);
 
-        mca_allbanks = mcabanks_alloc();
-        for ( i = 0; i < nr_mce_banks; i++ )
+        if ( !all )
+            return -ENOMEM;
+        for ( i = 0; i < nr; i++ )
             mcabanks_set(i, mca_allbanks);
+        mcabanks_free(xchg(&mca_allbanks, all));
     }
 
-    return mca_allbanks ? 0 : -ENOMEM;
+    return 0;
 }
 
 static void cpu_bank_free(unsigned int cpu)
@@ -702,8 +711,9 @@ static void cpu_bank_free(unsigned int c
 
 static int cpu_bank_alloc(unsigned int cpu)
 {
-    struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
-    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
+    unsigned int nr = per_cpu(nr_mce_banks, cpu);
+    struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(nr);
+    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(nr);
 
     if ( !poll || !clr )
     {
@@ -752,6 +762,7 @@ static struct notifier_block cpu_nfb = {
 void mcheck_init(struct cpuinfo_x86 *c, bool bsp)
 {
     enum mcheck_type inited = mcheck_none;
+    unsigned int cpu = smp_processor_id();
 
     if ( !opt_mce )
     {
@@ -762,8 +773,7 @@ void mcheck_init(struct cpuinfo_x86 *c,
 
     if ( !mce_available(c) )
     {
-        printk(XENLOG_INFO "CPU%i: No machine check support available\n",
-               smp_processor_id());
+        printk(XENLOG_INFO "CPU%i: No machine check support available\n", cpu);
         return;
     }
 
@@ -771,9 +781,13 @@ void mcheck_init(struct cpuinfo_x86 *c,
     if ( mca_cap_init() )
         return;
 
-    /* Early MCE initialisation for BSP. */
-    if ( bsp && cpu_bank_alloc(smp_processor_id()) )
-        BUG();
+    if ( !bsp )
+    {
+        per_cpu(poll_bankmask, cpu)->num = per_cpu(nr_mce_banks, cpu);
+        per_cpu(mce_clear_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
+    }
+    else if ( cpu_bank_alloc(cpu) )
+        panic("Insufficient memory for MCE bank allocations\n");
 
     switch ( c->x86_vendor )
     {
@@ -1111,24 +1125,22 @@ bool intpose_inval(unsigned int cpu_nr,
     return true;
 }
 
-#define IS_MCA_BANKREG(r) \
+#define IS_MCA_BANKREG(r, cpu) \
     ((r) >= MSR_IA32_MC0_CTL && \
-    (r) <= MSR_IA32_MCx_MISC(nr_mce_banks - 1) && \
-    ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
+     (r) <= MSR_IA32_MCx_MISC(per_cpu(nr_mce_banks, cpu) - 1) && \
+     ((r) - MSR_IA32_MC0_CTL) % 4) /* excludes MCi_CTL */
 
 static bool x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
 {
-    struct cpuinfo_x86 *c;
+    const struct cpuinfo_x86 *c = &cpu_data[mci->mcinj_cpunr];
     int i, errs = 0;
 
-    c = &cpu_data[smp_processor_id()];
-
     for ( i = 0; i < mci->mcinj_count; i++ )
     {
         uint64_t reg = mci->mcinj_msr[i].reg;
         const char *reason = NULL;
 
-        if ( IS_MCA_BANKREG(reg) )
+        if ( IS_MCA_BANKREG(reg, mci->mcinj_cpunr) )
         {
             if ( c->x86_vendor == X86_VENDOR_AMD )
             {
@@ -1448,7 +1460,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
         break;
 
     case XEN_MC_msrinject:
-        if ( nr_mce_banks == 0 )
+        if ( !mca_allbanks || !mca_allbanks->num )
             return x86_mcerr("do_mca inject", -ENODEV);
 
         mc_msrinject = &op->u.mc_msrinject;
@@ -1461,6 +1473,9 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
             return x86_mcerr("do_mca inject: target offline",
                              -EINVAL);
 
+        if ( !per_cpu(nr_mce_banks, target) )
+            return x86_mcerr("do_mca inject: no banks", -ENOENT);
+
         if ( mc_msrinject->mcinj_count == 0 )
             return 0;
 
@@ -1521,7 +1536,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
         break;
 
     case XEN_MC_mceinject:
-        if ( nr_mce_banks == 0 )
+        if ( !mca_allbanks || !mca_allbanks->num )
             return x86_mcerr("do_mca #MC", -ENODEV);
 
         mc_mceinject = &op->u.mc_mceinject;
@@ -1533,6 +1548,9 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
         if ( !cpu_online(target) )
             return x86_mcerr("do_mca #MC: target offline", -EINVAL);
 
+        if ( !per_cpu(nr_mce_banks, target) )
+            return x86_mcerr("do_mca #MC: no banks", -ENOENT);
+
         add_taint(TAINT_ERROR_INJECT);
 
         if ( mce_broadcast )
@@ -1548,7 +1566,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
         cpumask_var_t cmv;
         bool broadcast = op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST;
 
-        if ( nr_mce_banks == 0 )
+        if ( !mca_allbanks || !mca_allbanks->num )
             return x86_mcerr("do_mca #MC", -ENODEV);
 
         if ( broadcast )
@@ -1570,6 +1588,16 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
                         "Not all required CPUs are online\n");
         }
 
+        for_each_cpu(target, cpumap)
+            if ( cpu_online(target) && !per_cpu(nr_mce_banks, target) )
+            {
+                ret = x86_mcerr("do_mca #MC: CPU%u has no banks",
+                                -ENOENT, target);
+                break;
+            }
+        if ( ret )
+            break;
+
         switch ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK )
         {
         case XEN_MC_INJECT_TYPE_MCE:
--- a/xen/arch/x86/cpu/mcheck/mce_amd.c
+++ b/xen/arch/x86/cpu/mcheck/mce_amd.c
@@ -297,7 +297,7 @@ amd_mcheck_init(struct cpuinfo_x86 *ci)
     x86_mce_vector_register(mcheck_cmn_handler);
     mce_need_clearbank_register(amd_need_clearbank_scan);
 
-    for ( i = 0; i < nr_mce_banks; i++ )
+    for ( i = 0; i < this_cpu(nr_mce_banks); i++ )
     {
         if ( quirkflag == MCEQUIRK_K8_GART && i == 4 )
             mcequirk_amd_apply(quirkflag);
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -535,16 +535,16 @@ out:
 static void cmci_discover(void)
 {
     unsigned long flags;
-    int i;
+    unsigned int i, cpu = smp_processor_id();
     mctelem_cookie_t mctc;
     struct mca_summary bs;
 
-    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());
+    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%u\n", cpu);
 
     spin_lock_irqsave(&cmci_discover_lock, flags);
 
-    for ( i = 0; i < nr_mce_banks; i++ )
-        if ( !mcabanks_test(i, __get_cpu_var(mce_banks_owned)) )
+    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
+        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
             do_cmci_discover(i);
 
     spin_unlock_irqrestore(&cmci_discover_lock, flags);
@@ -557,7 +557,7 @@ static void cmci_discover(void)
      */
 
     mctc = mcheck_mca_logout(
-        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
+        MCA_CMCI_HANDLER, per_cpu(mce_banks_owned, cpu), &bs, NULL);
 
     if ( bs.errcnt && mctc != NULL )
     {
@@ -576,9 +576,9 @@ static void cmci_discover(void)
         mctelem_dismiss(mctc);
 
     mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
-               smp_processor_id(),
-               *((unsigned long *)__get_cpu_var(mce_banks_owned)->bank_map),
-               *((unsigned long *)__get_cpu_var(no_cmci_banks)->bank_map));
+               cpu,
+               per_cpu(mce_banks_owned, cpu)->bank_map[0],
+               per_cpu(no_cmci_banks, cpu)->bank_map[0]);
 }
 
 /*
@@ -613,24 +613,24 @@ static void cpu_mcheck_distribute_cmci(v
 
 static void clear_cmci(void)
 {
-    int i;
+    unsigned int i, cpu = smp_processor_id();
 
     if ( !cmci_support || !opt_mce )
         return;
 
-    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
-               smp_processor_id());
+    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%u\n", cpu);
 
-    for ( i = 0; i < nr_mce_banks; i++ )
+    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
     {
         unsigned msr = MSR_IA32_MCx_CTL2(i);
         u64 val;
-        if ( !mcabanks_test(i, __get_cpu_var(mce_banks_owned)) )
+
+        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
             continue;
         rdmsrl(msr, val);
         if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
             wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
-        mcabanks_clear(i, __get_cpu_var(mce_banks_owned));
+        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
     }
 }
 
@@ -826,7 +826,7 @@ static void intel_init_mce(void)
     intel_mce_post_reset();
 
     /* clear all banks */
-    for ( i = firstbank; i < nr_mce_banks; i++ )
+    for ( i = firstbank; i < this_cpu(nr_mce_banks); i++ )
     {
         /*
          * Some banks are shared across cores, use MCi_CTRL to judge whether
@@ -866,8 +866,9 @@ static void cpu_mcabank_free(unsigned in
 
 static int cpu_mcabank_alloc(unsigned int cpu)
 {
-    struct mca_banks *cmci = mcabanks_alloc();
-    struct mca_banks *owned = mcabanks_alloc();
+    unsigned int nr = per_cpu(nr_mce_banks, cpu);
+    struct mca_banks *cmci = mcabanks_alloc(nr);
+    struct mca_banks *owned = mcabanks_alloc(nr);
 
     if ( !cmci || !owned )
         goto out;
@@ -924,6 +925,13 @@ enum mcheck_type intel_mcheck_init(struc
         register_cpu_notifier(&cpu_nfb);
         mcheck_intel_therm_init();
     }
+    else
+    {
+        unsigned int cpu = smp_processor_id();
+
+        per_cpu(no_cmci_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
+        per_cpu(mce_banks_owned, cpu)->num = per_cpu(nr_mce_banks, cpu);
+    }
 
     intel_init_mca(c);
 
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -125,7 +125,7 @@ static inline int mcabanks_test(int bit,
     return test_bit(bit, banks->bank_map);
 }
 
-struct mca_banks *mcabanks_alloc(void);
+struct mca_banks *mcabanks_alloc(unsigned int nr);
 void mcabanks_free(struct mca_banks *banks);
 extern struct mca_banks *mca_allbanks;
 
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2374,7 +2374,7 @@ static int svm_is_erratum_383(struct cpu
         return 0;
     
     /* Clear MCi_STATUS registers */
-    for (i = 0; i < nr_mce_banks; i++)
+    for (i = 0; i < this_cpu(nr_mce_banks); i++)
         wrmsrl(MSR_IA32_MCx_STATUS(i), 0ULL);
     
     rdmsrl(MSR_IA32_MCG_STATUS, msr_content);
--- a/xen/include/asm-x86/mce.h
+++ b/xen/include/asm-x86/mce.h
@@ -40,6 +40,6 @@ extern int vmce_rdmsr(uint32_t msr, uint
 extern bool vmce_has_lmce(const struct vcpu *v);
 extern int vmce_enable_mca_cap(struct domain *d, uint64_t cap);
 
-extern unsigned int nr_mce_banks;
+DECLARE_PER_CPU(unsigned int, nr_mce_banks);
 
 #endif



