* [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity
@ 2020-04-03 16:19 Borislav Petkov
2020-04-03 16:19 ` [PATCH 1/7] x86/mce/amd: Do proper cleanup on error paths Borislav Petkov
` (6 more replies)
0 siblings, 7 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Borislav Petkov <bp@suse.de>
Hi,
so tglx found a couple of issues while we were talking about something
else and threw a couple of fixes over the wall, my way. Here's the
productized and tested result.
Thx.
Thomas Gleixner (7):
x86/mce/amd: Do proper cleanup on error paths
x86/mce/amd: Init thresholding machinery only on relevant vendors
x86/mce/amd: Protect a not-fully initialized bank from the
thresholding interrupt
x86/mce/amd: Sanitize thresholding device creation hotplug path
x86/mce/amd: Straighten CPU hotplug path
x86/mce/amd: Cleanup threshold device remove path
x86/mce/amd: Make threshold bank setting hotplug robust
arch/x86/include/asm/amd_nb.h | 1 +
arch/x86/kernel/cpu/mce/amd.c | 224 ++++++++++++++---------------
arch/x86/kernel/cpu/mce/core.c | 12 ++
arch/x86/kernel/cpu/mce/internal.h | 9 +-
4 files changed, 131 insertions(+), 115 deletions(-)
--
2.21.0
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/7] x86/mce/amd: Do proper cleanup on error paths
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 2/7] x86/mce/amd: Init thresholding machinery only on relevant vendors Borislav Petkov
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Drop kobject reference counts properly on error in the banks and blocks
allocation functions.
[ bp: Write commit message. ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 52de616a8065..477cf773cf1c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1267,13 +1267,12 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
- return err;
+ return 0;
out_free:
if (b) {
- kobject_put(&b->kobj);
list_del(&b->miscj);
- kfree(b);
+ kobject_put(&b->kobj);
}
return err;
}
@@ -1339,6 +1338,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
+ /* Associate the bank with the per-CPU MCE device */
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
@@ -1357,16 +1357,17 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
if (err)
- goto out_free;
+ goto out_kobj;
per_cpu(threshold_banks, cpu)[bank] = b;
return 0;
- out_free:
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
kfree(b);
-
- out:
+out:
return err;
}
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/7] x86/mce/amd: Init thresholding machinery only on relevant vendors
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
2020-04-03 16:19 ` [PATCH 1/7] x86/mce/amd: Do proper cleanup on error paths Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 3/7] x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt Borislav Petkov
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Initialize the thresholding machinery only on relevant vendors, and not
unconditionally.
[ bp: Add a new vendor_flags bit for that. ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 12 ++++++++++--
arch/x86/kernel/cpu/mce/core.c | 1 +
arch/x86/kernel/cpu/mce/internal.h | 9 ++++++---
3 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 477cf773cf1c..c3b3326ad4ac 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1442,15 +1442,20 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
int mce_threshold_remove_device(unsigned int cpu)
{
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank;
+ if (!bp)
+ return 0;
+
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
- kfree(per_cpu(threshold_banks, cpu));
- per_cpu(threshold_banks, cpu) = NULL;
+ /* Clear the pointer before freeing the memory */
+ this_cpu_write(threshold_banks, NULL);
+ kfree(bp);
return 0;
}
@@ -1461,6 +1466,9 @@ int mce_threshold_create_device(unsigned int cpu)
struct threshold_bank **bp;
int err = 0;
+ if (!mce_flags.amd_threshold)
+ return 0;
+
bp = per_cpu(threshold_banks, cpu);
if (bp)
return 0;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 54165f3569e8..43ca91e14a77 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1756,6 +1756,7 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
+ mce_flags.amd_threshold = 1;
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 3b008172ad73..74a01829c4f4 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -148,7 +148,7 @@ struct mce_vendor_flags {
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
- succor : 1,
+ succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
@@ -156,9 +156,12 @@ struct mce_vendor_flags {
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
- smca : 1,
+ smca : 1,
- __reserved_0 : 61;
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ __reserved_0 : 60;
};
extern struct mce_vendor_flags mce_flags;
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/7] x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
2020-04-03 16:19 ` [PATCH 1/7] x86/mce/amd: Do proper cleanup on error paths Borislav Petkov
2020-04-03 16:19 ` [PATCH 2/7] x86/mce/amd: Init thresholding machinery only on relevant vendors Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 4/7] x86/mce/amd: Sanitize thresholding device creation hotplug path Borislav Petkov
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Make sure the thresholding bank descriptor is fully initialized when the
thresholding interrupt fires after a hotplug event.
[ bp: Write commit message and document long-forgotten bank_map. ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c3b3326ad4ac..563942157758 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -192,7 +192,12 @@ EXPORT_SYMBOL_GPL(smca_banks);
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+/*
+ * A bitmap of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(unsigned int, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
@@ -1016,13 +1021,22 @@ static void log_and_reset_block(struct threshold_block *block)
static void amd_threshold_interrupt(void)
{
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank, cpu = smp_processor_id();
+ /*
+ * Validate that the threshold bank has been initialized already. The
+ * handler is installed at boot time, but on a hotplug event the
+ * interrupt might fire before the data has been initialized.
+ */
+ if (!bp)
+ return;
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
- first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ first_block = bp[bank]->blocks;
if (!first_block)
continue;
@@ -1247,6 +1261,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
INIT_LIST_HEAD(&b->miscj);
+ /* This is safe as @tb is not visible yet */
if (tb->blocks)
list_add(&b->miscj, &tb->blocks->miscj);
else
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/7] x86/mce/amd: Sanitize thresholding device creation hotplug path
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
` (2 preceding siblings ...)
2020-04-03 16:19 ` [PATCH 3/7] x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 5/7] x86/mce/amd: Straighten CPU " Borislav Petkov
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Drop the stupid threshold_init_device() initcall iterating over all
online CPUs in favor of properly setting up everything on the CPU
hotplug path, when each CPU's callback is invoked.
[ bp: Write commit message. ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 57 ++++++++++------------------------
arch/x86/kernel/cpu/mce/core.c | 11 +++++++
2 files changed, 27 insertions(+), 41 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 563942157758..d3c416b6052a 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1474,12 +1474,22 @@ int mce_threshold_remove_device(unsigned int cpu)
return 0;
}
-/* create dir/files for all valid threshold banks */
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
int mce_threshold_create_device(unsigned int cpu)
{
unsigned int bank;
struct threshold_bank **bp;
- int err = 0;
+ int err;
if (!mce_flags.amd_threshold)
return 0;
@@ -1500,49 +1510,14 @@ int mce_threshold_create_device(unsigned int cpu)
continue;
err = threshold_create_bank(cpu, bank);
if (err)
- goto err;
- }
- return err;
-err:
- mce_threshold_remove_device(cpu);
- return err;
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = mce_threshold_create_device(lcpu);
-
- if (err)
- return err;
+ goto out_err;
}
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
return 0;
+out_err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
-/*
- * there are 3 funcs which need to be _initcalled in a logic sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before
- * native mce_chrdev_device registration if running under xen platform;
- *
- * mcheck_init_device should be inited before threshold_init_device to
- * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
- *
- * so we use following _initcalls
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * when running under xen, the initcall order is 1,2,3;
- * on baremetal, we skip 1 and we do only 2 and 3.
- */
-late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 43ca91e14a77..a6009efdfe2b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2481,6 +2481,13 @@ static __init void mce_init_banks(void)
}
}
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
static __init int mcheck_init_device(void)
{
int err;
@@ -2512,6 +2519,10 @@ static __init int mcheck_init_device(void)
if (err)
goto err_out_mem;
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 5/7] x86/mce/amd: Straighten CPU hotplug path
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
` (3 preceding siblings ...)
2020-04-03 16:19 ` [PATCH 4/7] x86/mce/amd: Sanitize thresholding device creation hotplug path Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 6/7] x86/mce/amd: Cleanup threshold device remove path Borislav Petkov
2020-04-03 16:19 ` [PATCH 7/7] x86/mce/amd: Make threshold bank setting hotplug robust Borislav Petkov
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
mce_threshold_create_device() hotplug callback runs on the plugged in
CPU so:
- use this_cpu_read() which is faster
- pass in struct threshold_bank **bp to threshold_create_bank() instead
of doing per-CPU accesses
- Use rdmsr_safe() instead of rdmsr_safe_on_cpu() which avoids an IPI.
No functional changes.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 32 +++++++++++++++-----------------
1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d3c416b6052a..a33d9a1caf36 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1223,10 +1223,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
u32 low, high;
int err;
- if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ if (rdmsr_safe(address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -1316,9 +1316,10 @@ static int __threshold_add_blocks(struct threshold_bank *b)
return err;
}
-static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
{
- struct device *dev = per_cpu(mce_device, cpu);
+ struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(bank, NULL);
@@ -1338,7 +1339,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out;
- per_cpu(threshold_banks, cpu)[bank] = b;
+ bp[bank] = b;
refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1374,8 +1375,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (err)
goto out_kobj;
- per_cpu(threshold_banks, cpu)[bank] = b;
-
+ bp[bank] = b;
return 0;
out_kobj:
@@ -1487,35 +1487,33 @@ int mce_threshold_remove_device(unsigned int cpu)
*/
int mce_threshold_create_device(unsigned int cpu)
{
- unsigned int bank;
+ unsigned int numbanks, bank;
struct threshold_bank **bp;
int err;
if (!mce_flags.amd_threshold)
return 0;
- bp = per_cpu(threshold_banks, cpu);
+ bp = this_cpu_read(threshold_banks);
if (bp)
return 0;
- bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
- GFP_KERNEL);
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
if (!bp)
return -ENOMEM;
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
- err = threshold_create_bank(cpu, bank);
+ err = threshold_create_bank(bp, cpu, bank);
if (err)
goto out_err;
}
+ this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
-
return 0;
out_err:
mce_threshold_remove_device(cpu);
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 6/7] x86/mce/amd: Cleanup threshold device remove path
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
` (4 preceding siblings ...)
2020-04-03 16:19 ` [PATCH 5/7] x86/mce/amd: Straighten CPU " Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
2020-04-03 16:19 ` [PATCH 7/7] x86/mce/amd: Make threshold bank setting hotplug robust Borislav Petkov
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Pass in the bank pointer directly to the cleaning up functions,
obviating the need for per-CPU accesses. Make the cleanup path
interrupt-safe by clearing the bank pointer first so that the rest of
the teardown is safe from the thresholding interrupt.
No functional changes.
[ bp: Write commit message and reverse bank->shared test to save an
indentation level in threshold_remove_bank(). ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/include/asm/amd_nb.h | 1 +
arch/x86/kernel/cpu/mce/amd.c | 79 ++++++++++++++++-------------------
2 files changed, 38 insertions(+), 42 deletions(-)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c7df20e78b09..455066a06f60 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -57,6 +57,7 @@ struct threshold_bank {
/* initialized to the number of CPUs on the node sharing this bank */
refcount_t cpus;
+ unsigned int shared;
};
struct amd_northbridge {
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index a33d9a1caf36..16e7aea86ab1 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1362,6 +1362,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
}
if (is_shared_bank(bank)) {
+ b->shared = 1;
refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
@@ -1391,21 +1392,16 @@ static void threshold_block_release(struct kobject *kobj)
kfree(to_block(kobj));
}
-static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
+static void deallocate_threshold_blocks(struct threshold_bank *bank)
{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
- if (!head)
- return;
+ struct threshold_block *pos, *tmp;
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) {
list_del(&pos->miscj);
kobject_put(&pos->kobj);
}
- kobject_put(&head->blocks->kobj);
+ kobject_put(&bank->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
@@ -1419,57 +1415,56 @@ static void __threshold_remove_blocks(struct threshold_bank *b)
kobject_del(&pos->kobj);
}
-static void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;
- struct threshold_bank *b;
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
- return;
+ if (!bank->blocks)
+ goto out_free;
- if (!b->blocks)
- goto free_out;
+ if (!bank->shared)
+ goto out_dealloc;
- if (is_shared_bank(bank)) {
- if (!refcount_dec_and_test(&b->cpus)) {
- __threshold_remove_blocks(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- return;
- } else {
- /*
- * the last CPU on this node using the shared bank is
- * going away, remove that bank now.
- */
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
- nb->bank4 = NULL;
- }
+ if (!refcount_dec_and_test(&bank->cpus)) {
+ __threshold_remove_blocks(bank);
+ return;
+ } else {
+ /*
+ * The last CPU on this node using the shared bank is going
+ * away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb->bank4 = NULL;
}
- deallocate_threshold_block(cpu, bank);
+out_dealloc:
+ deallocate_threshold_blocks(bank);
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
+out_free:
+ kobject_put(bank->kobj);
+ kfree(bank);
}
int mce_threshold_remove_device(unsigned int cpu)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks);
- unsigned int bank;
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
if (!bp)
return 0;
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
- }
- /* Clear the pointer before freeing the memory */
+ /*
+ * Clear the pointer before cleaning up, so that the interrupt won't
+ * touch anything of this.
+ */
this_cpu_write(threshold_banks, NULL);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (bp[bank]) {
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ }
kfree(bp);
return 0;
}
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 7/7] x86/mce/amd: Make threshold bank setting hotplug robust
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
` (5 preceding siblings ...)
2020-04-03 16:19 ` [PATCH 6/7] x86/mce/amd: Cleanup threshold device remove path Borislav Petkov
@ 2020-04-03 16:19 ` Borislav Petkov
6 siblings, 0 replies; 8+ messages in thread
From: Borislav Petkov @ 2020-04-03 16:19 UTC (permalink / raw)
To: X86 ML; +Cc: Yazen Ghannam, linux-edac, LKML
From: Thomas Gleixner <tglx@linutronix.de>
Handle the cases when the CPU goes offline before the bank
setting/reading happens.
[ bp: Write commit message. ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
arch/x86/kernel/cpu/mce/amd.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 16e7aea86ab1..15c87b87b901 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -386,6 +386,10 @@ static void threshold_restart_bank(void *_tr)
struct thresh_restart *tr = _tr;
u32 hi, lo;
+ /* sysfs write might race against an offline operation */
+ if (this_cpu_read(threshold_banks))
+ return;
+
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
@@ -1085,7 +1089,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
memset(&tr, 0, sizeof(tr));
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1109,7 +1114,8 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
b->threshold_limit = new;
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1118,7 +1124,9 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 lo, hi;
- rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
--
2.21.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2020-04-03 16:20 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-03 16:19 [PATCH 0/7] x86/mce/amd: Fix some CPU hotplug insanity Borislav Petkov
2020-04-03 16:19 ` [PATCH 1/7] x86/mce/amd: Do proper cleanup on error paths Borislav Petkov
2020-04-03 16:19 ` [PATCH 2/7] x86/mce/amd: Init thresholding machinery only on relevant vendors Borislav Petkov
2020-04-03 16:19 ` [PATCH 3/7] x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt Borislav Petkov
2020-04-03 16:19 ` [PATCH 4/7] x86/mce/amd: Sanitize thresholding device creation hotplug path Borislav Petkov
2020-04-03 16:19 ` [PATCH 5/7] x86/mce/amd: Straighten CPU " Borislav Petkov
2020-04-03 16:19 ` [PATCH 6/7] x86/mce/amd: Cleanup threshold device remove path Borislav Petkov
2020-04-03 16:19 ` [PATCH 7/7] x86/mce/amd: Make threshold bank setting hotplug robust Borislav Petkov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).